From c2a1bd2f7432ed7ea01b538c72c09a84441dab45 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 9 Nov 2023 14:05:01 -0800 Subject: [PATCH 01/16] Add SHA-1 and SHA-2 hash functions. --- cpp/CMakeLists.txt | 5 + cpp/benchmarks/hashing/hash.cpp | 40 +- cpp/include/cudf/hashing.hpp | 77 ++- .../cudf/hashing/detail/hash_functions.cuh | 41 ++ cpp/include/cudf/hashing/detail/hashing.hpp | 20 + cpp/src/hash/hashing.cu | 5 + cpp/src/hash/md5_hash.cu | 23 - cpp/src/hash/sha1_hash.cu | 100 ++++ cpp/src/hash/sha224_hash.cu | 101 ++++ cpp/src/hash/sha256_hash.cu | 102 ++++ cpp/src/hash/sha384_hash.cu | 110 ++++ cpp/src/hash/sha512_hash.cu | 110 ++++ cpp/src/hash/sha_hash.cuh | 537 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 5 + cpp/tests/hashing/sha1_test.cpp | 204 +++++++ cpp/tests/hashing/sha224_test.cpp | 204 +++++++ cpp/tests/hashing/sha256_test.cpp | 204 +++++++ cpp/tests/hashing/sha384_test.cpp | 220 +++++++ cpp/tests/hashing/sha512_test.cpp | 220 +++++++ .../main/java/ai/rapids/cudf/HashType.java | 5 + python/cudf/cudf/_lib/cpp/hash.pxd | 7 +- python/cudf/cudf/_lib/hash.pyx | 12 +- python/cudf/cudf/tests/test_dataframe.py | 5 +- python/cudf/cudf/tests/test_series.py | 4 +- 24 files changed, 2332 insertions(+), 29 deletions(-) create mode 100644 cpp/src/hash/sha1_hash.cu create mode 100644 cpp/src/hash/sha224_hash.cu create mode 100644 cpp/src/hash/sha256_hash.cu create mode 100644 cpp/src/hash/sha384_hash.cu create mode 100644 cpp/src/hash/sha512_hash.cu create mode 100644 cpp/src/hash/sha_hash.cuh create mode 100644 cpp/tests/hashing/sha1_test.cpp create mode 100644 cpp/tests/hashing/sha224_test.cpp create mode 100644 cpp/tests/hashing/sha256_test.cpp create mode 100644 cpp/tests/hashing/sha384_test.cpp create mode 100644 cpp/tests/hashing/sha512_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bd9c936626a..46856a23fe9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -347,6 +347,11 @@ add_library( src/hash/md5_hash.cu 
src/hash/murmurhash3_x86_32.cu src/hash/murmurhash3_x64_128.cu + src/hash/sha1_hash.cu + src/hash/sha224_hash.cu + src/hash/sha256_hash.cu + src/hash/sha384_hash.cu + src/hash/sha512_hash.cu src/hash/spark_murmurhash3_x86_32.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index e679b4b62d2..18a95d2f46e 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -67,6 +67,36 @@ static void bench_hash(nvbench::state& state) state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); }); + } else if (hash_name == "sha1") { + // sha1 creates a 40-byte string + state.add_global_memory_writes(40 * num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::hashing::sha1(data->view()); }); + } else if (hash_name == "sha224") { + // sha224 creates a 56-byte string + state.add_global_memory_writes(56 * num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::hashing::sha224(data->view()); }); + } else if (hash_name == "sha256") { + // sha256 creates a 64-byte string + state.add_global_memory_writes(64 * num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::hashing::sha256(data->view()); }); + } else if (hash_name == "sha384") { + // sha384 creates a 96-byte string + state.add_global_memory_writes(96 * num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::hashing::sha384(data->view()); }); + } else if (hash_name == "sha512") { + // sha512 creates a 128-byte string + state.add_global_memory_writes(128 * num_rows); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::hashing::sha512(data->view()); }); } else if (hash_name == "spark_murmurhash3_x86_32") { 
state.add_global_memory_writes(num_rows); @@ -82,4 +112,12 @@ NVBENCH_BENCH(bench_hash) .set_name("hashing") .add_int64_axis("num_rows", {65536, 16777216}) .add_float64_axis("nulls", {0.0, 0.1}) - .add_string_axis("hash_name", {"murmurhash3_x86_32", "md5", "spark_murmurhash3_x86_32"}); + .add_string_axis("hash_name", + {"murmurhash3_x86_32", + "md5", + "sha1", + "sha224", + "sha256", + "sha384", + "sha512", + "spark_murmurhash3_x86_32"}); diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 72e32715ed4..1a128089760 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -42,7 +42,12 @@ enum class hash_id { HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed HASH_MURMUR3, ///< Murmur3 hash function HASH_SPARK_MURMUR3, ///< Spark Murmur3 hash function - HASH_MD5 ///< MD5 hash function + HASH_MD5, ///< MD5 hash function + HASH_SHA1, ///< SHA-1 hash function + HASH_SHA224, ///< SHA-224 hash function + HASH_SHA256, ///< SHA-256 hash function + HASH_SHA384, ///< SHA-384 hash function + HASH_SHA512 ///< SHA-512 hash function }; /** @@ -145,6 +150,76 @@ std::unique_ptr md5( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Computes the SHA-1 hash value of each row in the given table + * + * @param input The table of columns to hash + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr sha1( + table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the SHA-224 hash value of each row in the given table + * + * @param input The table of 
columns to hash + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr sha224( + table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the SHA-256 hash value of each row in the given table + * + * @param input The table of columns to hash + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr sha256( + table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the SHA-384 hash value of each row in the given table + * + * @param input The table of columns to hash + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr sha384( + table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the SHA-512 hash value of each row in the given table + * + * @param input The table of columns to hash + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from 
the input + */ +std::unique_ptr sha512( + table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes the XXHash_64 hash value of each row in the given table * diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh index 7a3d1990791..4b65a5e6b37 100644 --- a/cpp/include/cudf/hashing/detail/hash_functions.cuh +++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh @@ -68,4 +68,45 @@ __device__ inline uint64_t rotate_bits_right(uint64_t x, uint32_t r) return (x >> r) | (x << (64 - r)); } +// Swap the endianness of a 32 bit value +__device__ inline uint32_t swap_endian(uint32_t x) +{ + // The selector 0x0123 reverses the byte order + return __byte_perm(x, 0, 0x0123); +} + +// Swap the endianness of a 64 bit value +// There is no CUDA intrinsic for permuting bytes in 64 bit integers +__device__ inline uint64_t swap_endian(uint64_t x) +{ + // Reverse the endianness of each 32 bit section + uint32_t low_bits = swap_endian(static_cast(x)); + uint32_t high_bits = swap_endian(static_cast(x >> 32)); + // Reassemble a 64 bit result, swapping the low bits and high bits + return (static_cast(low_bits) << 32) | (static_cast(high_bits)); +}; + +/** + * Modified GPU implementation of + * https://johnnylee-sde.github.io/Fast-unsigned-integer-to-hex-string/ + * Copyright (c) 2015 Barry Clark + * Licensed under the MIT license. 
+ * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT + */ +void __device__ inline uint32ToLowercaseHexString(uint32_t num, char* destination) +{ + // Transform 0xABCD'1234 => 0x0000'ABCD'0000'1234 => 0x0B0A'0D0C'0201'0403 + uint64_t x = num; + x = ((x & 0xFFFF'0000u) << 16) | ((x & 0xFFFF)); + x = ((x & 0x000F'0000'000Fu) << 8) | ((x & 0x00F0'0000'00F0u) >> 4) | + ((x & 0x0F00'0000'0F00u) << 16) | ((x & 0xF000'0000'F000) << 4); + + // Calculate a mask of ascii value offsets for bytes that contain alphabetical hex digits + uint64_t offsets = (((x + 0x0606'0606'0606'0606) >> 4) & 0x0101'0101'0101'0101) * 0x27; + + x |= 0x3030'3030'3030'3030; + x += offsets; + std::memcpy(destination, reinterpret_cast(&x), 8); +} + } // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index f08d0fbb849..f5bc7000604 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -46,6 +46,26 @@ std::unique_ptr md5(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +std::unique_ptr sha1(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +std::unique_ptr sha224(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +std::unique_ptr sha256(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +std::unique_ptr sha384(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +std::unique_ptr sha512(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + std::unique_ptr xxhash_64(table_view const& input, uint64_t seed, rmm::cuda_stream_view, diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 68e02ef3cf4..9986f7f27d3 100644 --- a/cpp/src/hash/hashing.cu +++ 
b/cpp/src/hash/hashing.cu @@ -33,6 +33,11 @@ std::unique_ptr hash(table_view const& input, case (hash_id::HASH_MURMUR3): return murmurhash3_x86_32(input, seed, stream, mr); case (hash_id::HASH_SPARK_MURMUR3): return spark_murmurhash3_x86_32(input, seed, stream, mr); case (hash_id::HASH_MD5): return md5(input, stream, mr); + case (hash_id::HASH_SHA1): return sha1(input, stream, mr); + case (hash_id::HASH_SHA224): return sha224(input, stream, mr); + case (hash_id::HASH_SHA256): return sha256(input, stream, mr); + case (hash_id::HASH_SHA384): return sha384(input, stream, mr); + case (hash_id::HASH_SHA512): return sha512(input, stream, mr); default: CUDF_FAIL("Unsupported hash function."); } } diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index 4e25f9f8c23..868a0ad34de 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -108,29 +108,6 @@ auto __device__ inline get_element_pointer_and_size(string_view const& element) return thrust::make_pair(reinterpret_cast(element.data()), element.size_bytes()); } -/** - * Modified GPU implementation of - * https://johnnylee-sde.github.io/Fast-unsigned-integer-to-hex-string/ - * Copyright (c) 2015 Barry Clark - * Licensed under the MIT license. 
- * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT - */ -void __device__ inline uint32ToLowercaseHexString(uint32_t num, char* destination) -{ - // Transform 0xABCD'1234 => 0x0000'ABCD'0000'1234 => 0x0B0A'0D0C'0201'0403 - uint64_t x = num; - x = ((x & 0xFFFF'0000u) << 16) | ((x & 0xFFFF)); - x = ((x & 0x000F'0000'000Fu) << 8) | ((x & 0x00F0'0000'00F0u) >> 4) | - ((x & 0x0F00'0000'0F00u) << 16) | ((x & 0xF000'0000'F000) << 4); - - // Calculate a mask of ascii value offsets for bytes that contain alphabetical hex digits - uint64_t offsets = (((x + 0x0606'0606'0606'0606) >> 4) & 0x0101'0101'0101'0101) * 0x27; - - x |= 0x3030'3030'3030'3030; - x += offsets; - std::memcpy(destination, reinterpret_cast(&x), 8); -} - // The MD5 algorithm and its hash/shift constants are officially specified in // RFC 1321. For convenience, these values can also be found on Wikipedia: // https://en.wikipedia.org/wiki/MD5 diff --git a/cpp/src/hash/sha1_hash.cu b/cpp/src/hash/sha1_hash.cu new file mode 100644 index 00000000000..c0477343ca1 --- /dev/null +++ b/cpp/src/hash/sha1_hash.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "sha_hash.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf { +namespace hashing { +namespace detail { + +namespace { + +struct sha1_hash_state { + uint64_t message_length = 0; + uint32_t buffer_length = 0; + uint32_t hash_value[5] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0}; + uint8_t buffer[64]; +}; + +struct SHA1Hash : HashBase { + __device__ inline SHA1Hash(char* result_location) : HashBase(result_location) {} + + // Intermediate data type storing the hash state + using hash_state = sha1_hash_state; + // The word type used by this hash function + using sha_word_type = uint32_t; + // Number of bytes processed in each hash step + static constexpr uint32_t message_chunk_size = 64; + // Size in bytes of the hexadecimal digest string (two characters per hash byte) + static constexpr uint32_t digest_size = 40; + // Number of bytes used for the message length + static constexpr uint32_t message_length_size = 8; + + void __device__ inline hash_step(hash_state& state) { sha1_hash_step(state); } + + hash_state state; +}; + +} // namespace + +std::unique_ptr sha1(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + string_scalar const empty_result("da39a3ee5e6b4b0d3255bfef95601890afd80709"); + return sha_hash(input, empty_result, stream, mr); +} + +} // namespace detail + +std::unique_ptr sha1(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sha1(input, stream, mr); +} + +} // namespace hashing +} // namespace cudf diff --git a/cpp/src/hash/sha224_hash.cu b/cpp/src/hash/sha224_hash.cu new file mode 100644 index 00000000000..af37bde07ab --- /dev/null +++ b/cpp/src/hash/sha224_hash.cu @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2022-2023, NVIDIA 
CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sha_hash.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf { +namespace hashing { +namespace detail { + +namespace { + +struct sha224_hash_state { + uint64_t message_length = 0; + uint32_t buffer_length = 0; + uint32_t hash_value[8] = { + 0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4}; + uint8_t buffer[64]; +}; + +struct SHA224Hash : HashBase { + __device__ inline SHA224Hash(char* result_location) : HashBase(result_location) {} + + // Intermediate data type storing the hash state + using hash_state = sha224_hash_state; + // The word type used by this hash function + using sha_word_type = uint32_t; + // Number of bytes processed in each hash step + static constexpr uint32_t message_chunk_size = 64; + // Size in bytes of the hex digest string (2 chars per hash byte). Truncated from SHA-256. 
+ static constexpr uint32_t digest_size = 56; + // Number of bytes used for the message length + static constexpr uint32_t message_length_size = 8; + + void __device__ inline hash_step(hash_state& state) { sha256_hash_step(state); } + + hash_state state; +}; + +} // namespace + +std::unique_ptr sha224(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + string_scalar const empty_result("d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f"); + return sha_hash(input, empty_result, stream, mr); +} + +} // namespace detail + +std::unique_ptr sha224(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sha224(input, stream, mr); +} + +} // namespace hashing +} // namespace cudf diff --git a/cpp/src/hash/sha256_hash.cu b/cpp/src/hash/sha256_hash.cu new file mode 100644 index 00000000000..d7a8502337e --- /dev/null +++ b/cpp/src/hash/sha256_hash.cu @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "sha_hash.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf { +namespace hashing { +namespace detail { + +namespace { + +struct sha256_hash_state { + uint64_t message_length = 0; + uint32_t buffer_length = 0; + uint32_t hash_value[8] = { + 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19}; + uint8_t buffer[64]; +}; + +struct SHA256Hash : HashBase { + __device__ inline SHA256Hash(char* result_location) : HashBase(result_location) {} + + // Intermediate data type storing the hash state + using hash_state = sha256_hash_state; + // The word type used by this hash function + using sha_word_type = uint32_t; + // Number of bytes processed in each hash step + static constexpr uint32_t message_chunk_size = 64; + // Size in bytes of the hexadecimal digest string (two characters per hash byte) + static constexpr uint32_t digest_size = 64; + // Number of bytes used for the message length + static constexpr uint32_t message_length_size = 8; + + void __device__ inline hash_step(hash_state& state) { sha256_hash_step(state); } + + hash_state state; +}; + +} // namespace + +std::unique_ptr sha256(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + string_scalar const empty_result( + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); + return sha_hash(input, empty_result, stream, mr); +} + +} // namespace detail + +std::unique_ptr sha256(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sha256(input, stream, mr); +} + +} // namespace hashing +} // namespace cudf diff --git a/cpp/src/hash/sha384_hash.cu b/cpp/src/hash/sha384_hash.cu new file mode 100644 index 00000000000..83c7ceb4703 --- /dev/null +++ 
b/cpp/src/hash/sha384_hash.cu @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "sha_hash.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf { +namespace hashing { +namespace detail { + +namespace { + +// Need alignas(16) to avoid compiler bug. +struct alignas(16) sha384_hash_state { + uint64_t message_length = 0; + uint32_t buffer_length = 0; + uint64_t hash_value[8] = {0xcbbb9d5dc1059ed8, + 0x629a292a367cd507, + 0x9159015a3070dd17, + 0x152fecd8f70e5939, + 0x67332667ffc00b31, + 0x8eb44a8768581511, + 0xdb0c2e0d64f98fa7, + 0x47b5481dbefa4fa4}; + uint8_t buffer[128]; +}; + +struct SHA384Hash : HashBase { + __device__ inline SHA384Hash(char* result_location) : HashBase(result_location) {} + + // Intermediate data type storing the hash state + using hash_state = sha384_hash_state; + // The word type used by this hash function + using sha_word_type = uint64_t; + // Number of bytes processed in each hash step + static constexpr uint32_t message_chunk_size = 128; + // Size in bytes of the hex digest string (2 chars per hash byte). Truncated from SHA-512. 
+ static constexpr uint32_t digest_size = 96; + // Number of bytes used for the message length + static constexpr uint32_t message_length_size = 16; + + void __device__ inline hash_step(hash_state& state) { sha512_hash_step(state); } + + hash_state state; +}; + +} // namespace + +std::unique_ptr sha384(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + string_scalar const empty_result( + "38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b9" + "5b"); + return sha_hash(input, empty_result, stream, mr); +} + +} // namespace detail + +std::unique_ptr sha384(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sha384(input, stream, mr); +} + +} // namespace hashing +} // namespace cudf diff --git a/cpp/src/hash/sha512_hash.cu b/cpp/src/hash/sha512_hash.cu new file mode 100644 index 00000000000..f705b25a45f --- /dev/null +++ b/cpp/src/hash/sha512_hash.cu @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "sha_hash.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf { +namespace hashing { +namespace detail { + +namespace { + +// Need alignas(16) to avoid compiler bug. +struct alignas(16) sha512_hash_state { + uint64_t message_length = 0; + uint32_t buffer_length = 0; + uint64_t hash_value[8] = {0x6a09e667f3bcc908, + 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, + 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, + 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, + 0x5be0cd19137e2179}; + uint8_t buffer[128]; +}; + +struct SHA512Hash : HashBase { + __device__ inline SHA512Hash(char* result_location) : HashBase(result_location) {} + + // Intermediate data type storing the hash state + using hash_state = sha512_hash_state; + // The word type used by this hash function + using sha_word_type = uint64_t; + // Number of bytes processed in each hash step + static constexpr uint32_t message_chunk_size = 128; + // Size in bytes of the hexadecimal digest string (two characters per hash byte) + static constexpr uint32_t digest_size = 128; + // Number of bytes used for the message length + static constexpr uint32_t message_length_size = 16; + + void __device__ inline hash_step(hash_state& state) { sha512_hash_step(state); } + + hash_state state; +}; + +} // namespace + +std::unique_ptr sha512(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + string_scalar const empty_result( + "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec" + "2f63b931bd47417a81a538327af927da3e"); + return sha_hash(input, empty_result, stream, mr); +} + +} // namespace detail + +std::unique_ptr sha512(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::sha512(input, stream, 
mr); +} + +} // namespace hashing +} // namespace cudf diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh new file mode 100644 index 00000000000..eeca14f6b3b --- /dev/null +++ b/cpp/src/hash/sha_hash.cuh @@ -0,0 +1,537 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf { +namespace hashing { +namespace detail { + +const __constant__ uint32_t sha256_hash_constants[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 
0xbef9a3f7, 0xc67178f2, +}; + +const __constant__ uint64_t sha512_hash_constants[80] = { + 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, + 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, + 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, + 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694, + 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, + 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, + 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4, + 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70, + 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, + 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b, + 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30, + 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8, + 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, + 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, + 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec, + 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b, + 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, + 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b, + 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, + 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817, +}; + +/** + * @brief A CRTP helper function + * + * https://www.fluentcpp.com/2017/05/19/crtp-helper/ + * + * Does two things: + * 1. Makes "crtp" explicit in the inheritance structure of a CRTP base class. + * 2. 
Avoids having to `static_cast` in a lot of places + * + * @tparam T The derived class in a CRTP hierarchy + */ +template +struct crtp { + __device__ inline T& underlying() { return static_cast(*this); } + __device__ inline T const& underlying() const { return static_cast(*this); } +}; + +template +struct HashBase : public crtp { + char* result_location; + + __device__ inline HashBase(char* result_location) : result_location(result_location) {} + + /** + * @brief Execute SHA on input data chunks. + * + * This accepts arbitrary data, handles it as bytes, and calls the hash step + * each time the buffer is filled with message_chunk_size bytes. + */ + void __device__ inline process(uint8_t const* data, uint32_t len) + { + auto& state = this->underlying().state; + state.message_length += len; + + if (state.buffer_length + len < Hasher::message_chunk_size) { + // The buffer will not be filled by this data. We copy the new data into + // the buffer but do not trigger a hash step yet. + memcpy(state.buffer + state.buffer_length, data, len); + state.buffer_length += len; + } else { + // The buffer will be filled by this data. Copy a chunk of the data to fill + // the buffer and trigger a hash step. + uint32_t copylen = Hasher::message_chunk_size - state.buffer_length; + memcpy(state.buffer + state.buffer_length, data, copylen); + this->underlying().hash_step(state); + + // Hash every remaining full chunk so that buffer_length stays below message_chunk_size. + while (len - copylen >= Hasher::message_chunk_size) { + memcpy(state.buffer, data + copylen, Hasher::message_chunk_size); + this->underlying().hash_step(state); + copylen += Hasher::message_chunk_size; + } + + // The remaining data chunk does not fill the buffer. We copy the data into + // the buffer but do not trigger a hash step yet. 
+ memcpy(state.buffer, data + copylen, len - copylen); + state.buffer_length = len - copylen; + } + } + + template + void __device__ inline process_fixed_width(T const& key) + { + uint8_t const* data = reinterpret_cast(&key); + uint32_t constexpr len = sizeof(T); + process(data, len); + } + + /** + * @brief Finalize SHA element processing. + * + * This method fills the remainder of the message buffer with zeros, appends + * the message length (in another step of the hash, if needed), and performs + * the final hash step. + */ + void __device__ inline finalize() + { + auto& state = this->underlying().state; + // Message length in bits. + uint64_t const message_length_in_bits = (static_cast(state.message_length)) << 3; + // Add a one bit flag (10000000) to signal the end of the message + uint8_t constexpr end_of_message = 0x80; + // 1 byte for the end of the message flag + uint32_t constexpr end_of_message_size = 1; + + thrust::fill_n( + thrust::seq, state.buffer + state.buffer_length, end_of_message_size, end_of_message); + + // SHA-512 uses a 128-bit message length instead of a 64-bit message length + // but this code does not support messages with lengths exceeding UINT64_MAX + // bits. We always pad the upper 64 bits with zeros. + uint32_t constexpr message_length_supported_size = sizeof(message_length_in_bits); + + if (state.buffer_length + Hasher::message_length_size + end_of_message_size <= + Hasher::message_chunk_size) { + // Fill the remainder of the buffer with zeros up to the space reserved + // for the message length. The message length fits in this hash step. + thrust::fill(thrust::seq, + state.buffer + state.buffer_length + end_of_message_size, + state.buffer + Hasher::message_chunk_size - message_length_supported_size, + 0x00); + } else { + // Fill the remainder of the buffer with zeros. The message length doesn't + // fit and will be processed in a subsequent hash step comprised of only + // zeros followed by the message length. 
+ thrust::fill(thrust::seq, + state.buffer + state.buffer_length + end_of_message_size, + state.buffer + Hasher::message_chunk_size, + 0x00); + this->underlying().hash_step(state); + + // Fill the entire message with zeros up to the final bytes reserved for + // the message length. + thrust::fill_n(thrust::seq, + state.buffer, + Hasher::message_chunk_size - message_length_supported_size, + 0x00); + } + + // Convert the 64-bit message length from little-endian to big-endian. + uint64_t const full_length_flipped = swap_endian(message_length_in_bits); + memcpy(state.buffer + Hasher::message_chunk_size - message_length_supported_size, + reinterpret_cast(&full_length_flipped), + message_length_supported_size); + this->underlying().hash_step(state); + + // Each byte in the word generates two bytes in the hexadecimal string digest. + // SHA-224 and SHA-384 digests are truncated because their digest does not + // include all of the hash values. + auto constexpr num_words_to_copy = + Hasher::digest_size / (2 * sizeof(typename Hasher::sha_word_type)); + for (int i = 0; i < num_words_to_copy; i++) { + // Convert word representation from big-endian to little-endian. 
+ typename Hasher::sha_word_type flipped = swap_endian(state.hash_value[i]); + if constexpr (std::is_same_v) { + uint32ToLowercaseHexString(flipped, result_location + (8 * i)); + } else if constexpr (std::is_same_v) { + uint32_t low_bits = static_cast(flipped); + uint32ToLowercaseHexString(low_bits, result_location + (16 * i)); + uint32_t high_bits = static_cast(flipped >> 32); + uint32ToLowercaseHexString(high_bits, result_location + (16 * i) + 8); + } else { + cudf_assert(false && "Unsupported SHA word type."); + } + } + }; +}; + +template +struct HasherDispatcher { + Hasher* hasher; + column_device_view input_col; + + __device__ inline HasherDispatcher(Hasher* hasher, column_device_view const& input_col) + : hasher{hasher}, input_col{input_col} + { + } + + template + void __device__ inline operator()(size_type row_index) + { + if constexpr (is_fixed_width() && !is_chrono()) { + Element const& key = input_col.element(row_index); + if constexpr (is_floating_point()) { + if (isnan(key)) { + Element nan = std::numeric_limits::quiet_NaN(); + hasher->process_fixed_width(nan); + } else if (key == Element{0.0}) { + hasher->process_fixed_width(Element{0.0}); + } else { + hasher->process_fixed_width(key); + } + } else { + hasher->process_fixed_width(key); + } + } else if constexpr (std::is_same_v) { + string_view key = input_col.element(row_index); + uint8_t const* data = reinterpret_cast(key.data()); + uint32_t const len = static_cast(key.size_bytes()); + hasher->process(data, len); + } else { + (void)row_index; + cudf_assert(false && "Unsupported type for hash function."); + } + } +}; + +/** + * @brief Core SHA-1 algorithm implementation. Processes a single 512-bit chunk, + * updating the hash value so far. Does not zero out the buffer contents. + */ +template +void __device__ inline sha1_hash_step(hash_state& state) +{ + uint32_t words[80]; + + // The 512-bit message buffer fills the first 16 words. 
+ memcpy(&words[0], state.buffer, sizeof(words[0]) * 16); + for (int i = 0; i < 16; i++) { + // Convert word representation from little-endian to big-endian. + words[i] = swap_endian(words[i]); + } + + // The rest of the 80 words are generated from the first 16 words. + for (int i = 16; i < 80; i++) { + uint32_t const temp = words[i - 3] ^ words[i - 8] ^ words[i - 14] ^ words[i - 16]; + words[i] = rotate_bits_left(temp, 1); + } + + uint32_t A = state.hash_value[0]; + uint32_t B = state.hash_value[1]; + uint32_t C = state.hash_value[2]; + uint32_t D = state.hash_value[3]; + uint32_t E = state.hash_value[4]; + + for (int i = 0; i < 80; i++) { + uint32_t F; + uint32_t k; + uint32_t temp; + switch (i / 20) { + case 0: + F = D ^ (B & (C ^ D)); + k = 0x5a827999; + break; + case 1: + F = B ^ C ^ D; + k = 0x6ed9eba1; + break; + case 2: + F = (B & C) | (B & D) | (C & D); + k = 0x8f1bbcdc; + break; + case 3: + F = B ^ C ^ D; + k = 0xca62c1d6; + break; + } + temp = rotate_bits_left(A, 5) + F + E + k + words[i]; + E = D; + D = C; + C = rotate_bits_left(B, 30); + B = A; + A = temp; + } + + state.hash_value[0] += A; + state.hash_value[1] += B; + state.hash_value[2] += C; + state.hash_value[3] += D; + state.hash_value[4] += E; + + state.buffer_length = 0; +} + +/** + * @brief Core SHA-256 algorithm implementation. Processes a single 512-bit chunk, + * updating the hash value so far. Does not zero out the buffer contents. + */ +template +void __device__ inline sha256_hash_step(hash_state& state) +{ + uint32_t words[64]; + + // The 512-bit message buffer fills the first 16 words. + memcpy(&words[0], state.buffer, sizeof(words[0]) * 16); + for (int i = 0; i < 16; i++) { + // Convert word representation from little-endian to big-endian. + words[i] = swap_endian(words[i]); + } + + // The rest of the 64 words are generated from the first 16 words. 
+ for (int i = 16; i < 64; i++) { + uint32_t const s0 = rotate_bits_right(words[i - 15], 7) ^ rotate_bits_right(words[i - 15], 18) ^ + (words[i - 15] >> 3); + uint32_t const s1 = rotate_bits_right(words[i - 2], 17) ^ rotate_bits_right(words[i - 2], 19) ^ + (words[i - 2] >> 10); + words[i] = words[i - 16] + s0 + words[i - 7] + s1; + } + + uint32_t A = state.hash_value[0]; + uint32_t B = state.hash_value[1]; + uint32_t C = state.hash_value[2]; + uint32_t D = state.hash_value[3]; + uint32_t E = state.hash_value[4]; + uint32_t F = state.hash_value[5]; + uint32_t G = state.hash_value[6]; + uint32_t H = state.hash_value[7]; + + for (int i = 0; i < 64; i++) { + uint32_t const s1 = + rotate_bits_right(E, 6) ^ rotate_bits_right(E, 11) ^ rotate_bits_right(E, 25); + uint32_t const ch = (E & F) ^ ((~E) & G); + uint32_t const temp1 = H + s1 + ch + sha256_hash_constants[i] + words[i]; + uint32_t const s0 = + rotate_bits_right(A, 2) ^ rotate_bits_right(A, 13) ^ rotate_bits_right(A, 22); + uint32_t const maj = (A & B) ^ (A & C) ^ (B & C); + uint32_t const temp2 = s0 + maj; + + H = G; + G = F; + F = E; + E = D + temp1; + D = C; + C = B; + B = A; + A = temp1 + temp2; + } + + state.hash_value[0] += A; + state.hash_value[1] += B; + state.hash_value[2] += C; + state.hash_value[3] += D; + state.hash_value[4] += E; + state.hash_value[5] += F; + state.hash_value[6] += G; + state.hash_value[7] += H; + + state.buffer_length = 0; +} + +/** + * @brief Core SHA-512 algorithm implementation. Processes a single 1024-bit chunk, + * updating the hash value so far. Does not zero out the buffer contents. + */ +template +void __device__ inline sha512_hash_step(hash_state& state) +{ + uint64_t words[80]; + + // The 1024-bit message buffer fills the first 16 words. + memcpy(&words[0], state.buffer, sizeof(words[0]) * 16); + for (int i = 0; i < 16; i++) { + // Convert word representation from little-endian to big-endian. 
+ words[i] = swap_endian(words[i]); + } + + // The rest of the 80 words are generated from the first 16 words. + for (int i = 16; i < 80; i++) { + uint64_t const s0 = rotate_bits_right(words[i - 15], 1) ^ rotate_bits_right(words[i - 15], 8) ^ + (words[i - 15] >> 7); + uint64_t const s1 = rotate_bits_right(words[i - 2], 19) ^ rotate_bits_right(words[i - 2], 61) ^ + (words[i - 2] >> 6); + words[i] = words[i - 16] + s0 + words[i - 7] + s1; + } + + uint64_t A = state.hash_value[0]; + uint64_t B = state.hash_value[1]; + uint64_t C = state.hash_value[2]; + uint64_t D = state.hash_value[3]; + uint64_t E = state.hash_value[4]; + uint64_t F = state.hash_value[5]; + uint64_t G = state.hash_value[6]; + uint64_t H = state.hash_value[7]; + + for (int i = 0; i < 80; i++) { + uint64_t const s1 = + rotate_bits_right(E, 14) ^ rotate_bits_right(E, 18) ^ rotate_bits_right(E, 41); + uint64_t const ch = (E & F) ^ ((~E) & G); + uint64_t const temp1 = H + s1 + ch + sha512_hash_constants[i] + words[i]; + uint64_t const s0 = + rotate_bits_right(A, 28) ^ rotate_bits_right(A, 34) ^ rotate_bits_right(A, 39); + uint64_t const maj = (A & B) ^ (A & C) ^ (B & C); + uint64_t const temp2 = s0 + maj; + + H = G; + G = F; + F = E; + E = D + temp1; + D = C; + C = B; + B = A; + A = temp1 + temp2; + } + + state.hash_value[0] += A; + state.hash_value[1] += B; + state.hash_value[2] += C; + state.hash_value[3] += D; + state.hash_value[4] += E; + state.hash_value[5] += F; + state.hash_value[6] += G; + state.hash_value[7] += H; + + state.buffer_length = 0; +} + +// SHA supported leaf data type check +bool inline sha_leaf_type_check(data_type dt) +{ + return (is_fixed_width(dt) && !is_chrono(dt)) || (dt.id() == type_id::STRING); +} + +/** + * @brief Call a SHA-1 or SHA-2 hash function on a table view. + * + * @tparam Hasher The struct used for computing SHA hashes. + * + * @param input The input table. + * results. + * @param empty_result A string representing the expected result for empty inputs. 
+ * @param stream CUDA stream on which memory may be allocated if the memory + * resource supports streams. + * @param mr Memory resource to use for the device memory allocation + * @return A new column with the computed hash function result. + */ +template +std::unique_ptr sha_hash(table_view const& input, + string_scalar const& empty_result, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.num_columns() == 0 || input.num_rows() == 0) { + // Return the hash of a zero-length input. + return make_column_from_scalar(empty_result, input.num_rows(), stream, mr); + } + + // Accepts string and fixed width columns. + // TODO: Accept single layer list columns holding those types. + CUDF_EXPECTS( + std::all_of( + input.begin(), input.end(), [](auto const& col) { return sha_leaf_type_check(col.type()); }), + "Unsupported column type for hash function."); + + // Result column allocation and creation + auto begin = thrust::make_constant_iterator(Hasher::digest_size); + auto [offsets_column, bytes] = + cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); + + auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); + + auto const device_input = table_device_view::create(input, stream); + + // Hash each row, hashing each element sequentially left to right + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.num_rows()), + [d_chars, device_input = *device_input] __device__(auto row_index) { + Hasher hasher(d_chars + (row_index * Hasher::digest_size)); + for (auto const& col : device_input) { + if (col.is_valid(row_index)) { + cudf::type_dispatcher( + col.type(), HasherDispatcher(&hasher, col), row_index); + } + } + hasher.finalize(); + }); + + return make_strings_column( + input.num_rows(), std::move(offsets_column), 
std::move(chars_column), 0, {}); +} + +} // namespace detail +} // namespace hashing +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b0382d15807..9b86c397ea3 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -169,6 +169,11 @@ ConfigureTest( hashing/md5_test.cpp hashing/murmurhash3_x86_32_test.cpp hashing/murmurhash3_x64_128_test.cpp + hashing/sha1_test.cpp + hashing/sha224_test.cpp + hashing/sha256_test.cpp + hashing/sha384_test.cpp + hashing/sha512_test.cpp hashing/spark_murmurhash3_x86_32_test.cpp hashing/xxhash_64_test.cpp ) diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp new file mode 100644 index 00000000000..6168fb06909 --- /dev/null +++ b/cpp/tests/hashing/sha1_test.cpp @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; + +class SHA1HashTest : public cudf::test::BaseFixture {}; + +TEST_F(SHA1HashTest, EmptyTable) +{ + auto const empty_table = cudf::table_view{}; + auto const empty_column = cudf::make_empty_column(cudf::data_type(cudf::type_id::STRING)); + auto const output_empty_table = cudf::hashing::sha1(empty_table); + EXPECT_EQ(empty_column->size(), output_empty_table->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_empty_table->view()); + + auto const table_one_empty_column = cudf::table_view{{empty_column->view()}}; + auto const output_one_empty_column = cudf::hashing::sha1(table_one_empty_column); + EXPECT_EQ(empty_column->size(), output_one_empty_column->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_one_empty_column->view()); +} + +TEST_F(SHA1HashTest, MultiValue) +{ + cudf::test::strings_column_wrapper const strings_col( + {"", + "0", + "A 56 character string to test message padding algorithm.", + "A 63 character string to test message padding algorithm, again.", + "A 64 character string to test message padding algorithm, again!!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested.
This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + + cudf::test::strings_column_wrapper const sha1_string_results1( + {"da39a3ee5e6b4b0d3255bfef95601890afd80709", + "b6589fc6ab0dc82cf12099d1c2d40ab994e8410c", + "cb73203438ab46ea54491c53e288a2703c440c4a", + "c595ebd13a785c1c2659e010a42e2ff9987ef51f", + "4ffaf61804c55b8c2171be548bef2e1d0baca17a", + "595965dd18f38087186162c788485fe249242131", + "a62ca720fbab830c8890044eacbeac216f1ca2e4", + "11e16c52273b5669a41d17ec7c187475193f88b3"}); + + cudf::test::strings_column_wrapper const sha1_string_results2( + {"da39a3ee5e6b4b0d3255bfef95601890afd80709", + "fb96549631c835eb239cd614cc6b5cb7d295121a", + "e3977ee0ea7f238134ec93c79988fa84b7c5d79e", + "f6f75b6fa3c3d8d86b44fcb2c98c9ad4b37dcdd0", + "c7abd431a775c604edf41a62f7f215e7258dc16a", + "153fdf20d2bd8ae76241197314d6e0be7fe10f50", + "8c3656f7cb37898f9296c1965000d6da13fed64e", + "b4a848399375ec842c2cb445d98b5f80a4dce94f"}); + + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col( + {0, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + + // Different truth values should be equal + cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0, 1, 2, 255}); + + auto const string_input1 = cudf::table_view({strings_col}); + auto const string_input2 = cudf::table_view({strings_col, strings_col}); + auto const sha1_string_output1 = cudf::hashing::sha1(string_input1); + auto const sha1_string_output2 = cudf::hashing::sha1(string_input2); + EXPECT_EQ(string_input1.num_rows(), sha1_string_output1->size()); + EXPECT_EQ(string_input2.num_rows(), sha1_string_output2->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha1_string_output1->view(), sha1_string_results1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha1_string_output2->view(), sha1_string_results2); + + auto const input1 = 
cudf::table_view({strings_col, ints_col, bools_col1}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + auto const sha1_output1 = cudf::hashing::sha1(input1); + auto const sha1_output2 = cudf::hashing::sha1(input2); + EXPECT_EQ(input1.num_rows(), sha1_output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha1_output1->view(), sha1_output2->view()); +} + +TEST_F(SHA1HashTest, MultiValueNulls) +{ + // Nulls with different values should be equal + cudf::test::strings_column_wrapper const strings_col1( + {"", + "Different but null!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested. This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + {1, 0, 0, 1, 0}); + cudf::test::strings_column_wrapper const strings_col2( + {"", + "Another string that is null.", + "Very different... but null", + "All work and no play makes Jack a dull boy", + ""}, + {1, 0, 0, 1, 1}); // empty string is equivalent to null + + // Nulls with different values should be equal + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col1( + {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + cudf::test::fixed_width_column_wrapper const ints_col2( + {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + + // Nulls with different values should be equal + // Different truth values should be equal + cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); + cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); + + auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); + auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + + auto const output1 = cudf::hashing::sha1(input1); + auto const output2 = cudf::hashing::sha1(input2); + + 
EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA1HashTestTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA1HashTestTyped, cudf::test::NumericTypes); + +TYPED_TEST(SHA1HashTestTyped, Equality) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha1(input); + auto const output2 = cudf::hashing::sha1(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TYPED_TEST(SHA1HashTestTyped, EqualityNulls) +{ + using T = TypeParam; + + // Nulls with different values should be equal + cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha1(input1); + auto const output2 = cudf::hashing::sha1(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA1HashTestFloatTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA1HashTestFloatTyped, cudf::test::FloatingPointTypes); + +TYPED_TEST(SHA1HashTestFloatTyped, TestExtremes) +{ + using T = TypeParam; + T min = std::numeric_limits::min(); + T max = std::numeric_limits::max(); + T nan = std::numeric_limits::quiet_NaN(); + T inf = std::numeric_limits::infinity(); + + cudf::test::fixed_width_column_wrapper const col1( + {T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + cudf::test::fixed_width_column_wrapper const col2( + {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + + auto const input1 = cudf::table_view({col1}); + auto 
const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha1(input1); + auto const output2 = cudf::hashing::sha1(input2); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp new file mode 100644 index 00000000000..3192107e8ec --- /dev/null +++ b/cpp/tests/hashing/sha224_test.cpp @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; + +class SHA224HashTest : public cudf::test::BaseFixture {}; + +TEST_F(SHA224HashTest, EmptyTable) +{ + auto const empty_table = cudf::table_view{}; + auto const empty_column = cudf::make_empty_column(cudf::data_type(cudf::type_id::STRING)); + auto const output_empty_table = cudf::hashing::sha224(empty_table); + EXPECT_EQ(empty_column->size(), output_empty_table->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_empty_table->view()); + + auto const table_one_empty_column = cudf::table_view{{empty_column->view()}}; + auto const output_one_empty_column = cudf::hashing::sha224(table_one_empty_column); + EXPECT_EQ(empty_column->size(), output_one_empty_column->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_one_empty_column->view()); +} + +TEST_F(SHA224HashTest, MultiValue) +{ + cudf::test::strings_column_wrapper const strings_col( + {"", + "0", + "A 56 character string to test message padding algorithm.", + "A 63 character string to test message padding algorithm, again.", + "A 64 character string to test message padding algorithm, again!!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested.
This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + + cudf::test::strings_column_wrapper const sha224_string_results1( + {"d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f", + "dfd5f9139a820075df69d7895015360b76d0360f3d4b77a845689614", + "5d1ed8373987e403482cefe1662a63fa3076c0a5331d141f41654bbe", + "0662c91000b99de7a20c89097dd62f59120398d52499497489ccff95", + "f9ea303770699483f3e53263b32a3b3c876d1b8808ce84df4b8ca1c4", + "2da6cd4bdaa0a99fd7236cd5507c52e12328e71192e83b32d2f110f9", + "e7d0adb165079efc6c6343112f8b154aa3644ca6326f658aaa0f8e4a", + "309cc09eaa051beea7d0b0159daca9b4e8a533cb554e8f382c82709e"}); + + cudf::test::strings_column_wrapper const sha224_string_results2( + {"d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f", + "5538ae2b02d4ae0b7090dc908ca69cd11a2ffad43c7435f1dbad5e6a", + "8e1955a473a149368dc0a931f99379b44b0bb752f206dbdf68629232", + "8581001e08295b7884428c022378cfdd643c977aefe4512f0252dc30", + "d5854dfe3c32996345b103a6a16c7bdfa924723d620b150737e77370", + "dd56deac5f2caa579a440ee814fc04a3afaf805d567087ac3317beb3", + "14fb559f6309604bedd89183f585f3b433932b5b0e675848feebf8ec", + "d219eefea538491efcb69bc5bbef4177ad991d1b6e1367b5981b8c31"}); + + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col( + {0, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + + // Different truth values should be equal + cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0, 1, 2, 255}); + + auto const string_input1 = cudf::table_view({strings_col}); + auto const string_input2 = cudf::table_view({strings_col, strings_col}); + auto const sha224_string_output1 = cudf::hashing::sha224(string_input1); + auto const sha224_string_output2 = cudf::hashing::sha224(string_input2); + EXPECT_EQ(string_input1.num_rows(), 
sha224_string_output1->size()); + EXPECT_EQ(string_input2.num_rows(), sha224_string_output2->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha224_string_output1->view(), sha224_string_results1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha224_string_output2->view(), sha224_string_results2); + + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + auto const sha224_output1 = cudf::hashing::sha224(input1); + auto const sha224_output2 = cudf::hashing::sha224(input2); + EXPECT_EQ(input1.num_rows(), sha224_output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha224_output1->view(), sha224_output2->view()); +} + +TEST_F(SHA224HashTest, MultiValueNulls) +{ + // Nulls with different values should be equal + cudf::test::strings_column_wrapper const strings_col1( + {"", + "Different but null!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested. This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + {1, 0, 0, 1, 0}); + cudf::test::strings_column_wrapper const strings_col2( + {"", + "Another string that is null.", + "Very different... 
but null", + "All work and no play makes Jack a dull boy", + ""}, + {1, 0, 0, 1, 1}); // empty string is equivalent to null + + // Nulls with different values should be equal + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col1( + {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + cudf::test::fixed_width_column_wrapper const ints_col2( + {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + + // Nulls with different values should be equal + // Different truth values should be equal + cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); + cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); + + auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); + auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + + auto const output1 = cudf::hashing::sha224(input1); + auto const output2 = cudf::hashing::sha224(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA224HashTestTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA224HashTestTyped, cudf::test::NumericTypes); + +TYPED_TEST(SHA224HashTestTyped, Equality) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha224(input); + auto const output2 = cudf::hashing::sha224(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TYPED_TEST(SHA224HashTestTyped, EqualityNulls) +{ + using T = TypeParam; + + // Nulls with different values should be equal + cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, 
{0, 1, 1, 1, 1}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha224(input1); + auto const output2 = cudf::hashing::sha224(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA224HashTestFloatTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA224HashTestFloatTyped, cudf::test::FloatingPointTypes); + +TYPED_TEST(SHA224HashTestFloatTyped, TestExtremes) +{ + using T = TypeParam; + T min = std::numeric_limits::min(); + T max = std::numeric_limits::max(); + T nan = std::numeric_limits::quiet_NaN(); + T inf = std::numeric_limits::infinity(); + + cudf::test::fixed_width_column_wrapper const col1( + {T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + cudf::test::fixed_width_column_wrapper const col2( + {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha224(input1); + auto const output2 = cudf::hashing::sha224(input2); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp new file mode 100644 index 00000000000..df35755831c --- /dev/null +++ b/cpp/tests/hashing/sha256_test.cpp @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; + +class SHA256HashTest : public cudf::test::BaseFixture {}; + +TEST_F(SHA256HashTest, EmptyTable) +{ + auto const empty_table = cudf::table_view{}; + auto const empty_column = cudf::make_empty_column(cudf::data_type(cudf::type_id::STRING)); + auto const output_empty_table = cudf::hashing::sha256(empty_table); + EXPECT_EQ(empty_column->size(), output_empty_table->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_empty_table->view()); + + auto const table_one_empty_column = cudf::table_view{{empty_column->view()}}; + auto const output_one_empty_column = cudf::hashing::sha256(table_one_empty_column); + EXPECT_EQ(empty_column->size(), output_one_empty_column->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_one_empty_column->view()); +} + +TEST_F(SHA256HashTest, MultiValue) +{ + cudf::test::strings_column_wrapper const strings_col( + {"", + "0", + "A 56 character string to test message padding algorithm.", + "A 63 character string to test message padding algorithm, again.", + "A 64 character string to test message padding algorithm, again!!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested.
This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + + cudf::test::strings_column_wrapper const sha256_string_results1( + {"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91b46729d73a27fb57e9", + "d16883c666112142c1d72c9080b41161be7563250539e3f6ab6e2fdf2210074b", + "11174fa180460f5d683c2e63fcdd897dcbf10c28a9225d3ced9a8bbc3774415d", + "10a7d211e692c6f71bb9f7524ba1437588c2797356f05fc585340f002fe7015e", + "339d610dcb030bb4222bcf18c8ab82d911bfe7fb95b2cd9f6785fd4562b02401", + "2ce9936a4a2234bf8a76c37d92e01d549d03949792242e7f8a1ad68575e4e4a8", + "255fdd4d80a72f67921eb36f3e1157ea3e995068cee80e430c034e0d3692f614"}); + + cudf::test::strings_column_wrapper const sha256_string_results2( + {"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + "f1534392279bddbf9d43dde8701cb5be14b82f76ec6607bf8d6ad557f60f304e", + "96c204fa5d44b2487abfec105a05f8ae634551604f6596202ca99e3724e3953a", + "2e7be264f3ecbb2930e7c54bf6c5fc1f310a8c63c50916bb713f34699ed11719", + "224e4dce71d5dbd5e79ba65aaced7ad9c4f45dda146278087b2b61d164f056f0", + "91f3108d4e9c696fdb37ae49fdc6a2237f1d1f977b7216406cc8a6365355f43b", + "490be480afe271685e9c1fdf46daac0b9bf7f25602e153ca92a0ddb0e4b662ef", + "4ddc45855d7ce3ab09efacff1fbafb33502f7dd468dc5a62826689c1c658dbce"}); + + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col( + {0, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + + // Different truth values should be equal + cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0, 1, 2, 255}); + + auto const string_input1 = cudf::table_view({strings_col}); + auto const string_input2 = cudf::table_view({strings_col, strings_col}); + auto const sha256_string_output1 = cudf::hashing::sha256(string_input1); + auto 
const sha256_string_output2 = cudf::hashing::sha256(string_input2); + EXPECT_EQ(string_input1.num_rows(), sha256_string_output1->size()); + EXPECT_EQ(string_input2.num_rows(), sha256_string_output2->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha256_string_output1->view(), sha256_string_results1, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha256_string_output2->view(), sha256_string_results2, verbosity); + + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + auto const sha256_output1 = cudf::hashing::sha256(input1); + auto const sha256_output2 = cudf::hashing::sha256(input2); + EXPECT_EQ(input1.num_rows(), sha256_output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha256_output1->view(), sha256_output2->view(), verbosity); +} + +TEST_F(SHA256HashTest, MultiValueNulls) +{ + // Nulls with different values should be equal + cudf::test::strings_column_wrapper const strings_col1( + {"", + "Different but null!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested. This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + {1, 0, 0, 1, 0}); + cudf::test::strings_column_wrapper const strings_col2( + {"", + "Another string that is null.", + "Very different... 
but null", + "All work and no play makes Jack a dull boy", + ""}, + {1, 0, 0, 1, 1}); // empty string is equivalent to null + + // Nulls with different values should be equal + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col1( + {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + cudf::test::fixed_width_column_wrapper const ints_col2( + {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + + // Nulls with different values should be equal + // Different truth values should be equal + cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); + cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); + + auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); + auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + + auto const output1 = cudf::hashing::sha256(input1); + auto const output2 = cudf::hashing::sha256(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA256HashTestTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA256HashTestTyped, cudf::test::NumericTypes); + +TYPED_TEST(SHA256HashTestTyped, Equality) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha256(input); + auto const output2 = cudf::hashing::sha256(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TYPED_TEST(SHA256HashTestTyped, EqualityNulls) +{ + using T = TypeParam; + + // Nulls with different values should be equal + cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, 
{0, 1, 1, 1, 1}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha256(input1); + auto const output2 = cudf::hashing::sha256(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA256HashTestFloatTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA256HashTestFloatTyped, cudf::test::FloatingPointTypes); + +TYPED_TEST(SHA256HashTestFloatTyped, TestExtremes) +{ + using T = TypeParam; + T min = std::numeric_limits::min(); + T max = std::numeric_limits::max(); + T nan = std::numeric_limits::quiet_NaN(); + T inf = std::numeric_limits::infinity(); + + cudf::test::fixed_width_column_wrapper const col1( + {T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + cudf::test::fixed_width_column_wrapper const col2( + {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha256(input1); + auto const output2 = cudf::hashing::sha256(input2); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp new file mode 100644 index 00000000000..435a95bb4a8 --- /dev/null +++ b/cpp/tests/hashing/sha384_test.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; + +class SHA384HashTest : public cudf::test::BaseFixture {}; + +TEST_F(SHA384HashTest, EmptyTable) +{ + auto const empty_table = cudf::table_view{}; + auto const empty_column = cudf::make_empty_column(cudf::data_type(cudf::type_id::STRING)); + auto const output_empty_table = cudf::hashing::sha384(empty_table); + EXPECT_EQ(empty_column->size(), output_empty_table->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_empty_table->view()); + + auto const table_one_empty_column = cudf::table_view{{empty_column->view()}}; + auto const output_one_empty_column = cudf::hashing::sha384(table_one_empty_column); + EXPECT_EQ(empty_column->size(), output_one_empty_column->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_one_empty_column->view()); +} + +TEST_F(SHA384HashTest, MultiValue) +{ + cudf::test::strings_column_wrapper const strings_col( + {"", + "0", + "A 56 character string to test message padding algorithm.", + "A 63 character string to test message padding algorithm, again.", + "A 64 character string to test message padding algorithm, again!!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested. 
This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + + cudf::test::strings_column_wrapper const sha384_string_results1( + {"38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b" + "95b", + "5f91550edb03f0bb8917da57f0f8818976f5da971307b7ee4886bb951c4891a1f16f840dae8f655aa5df718884ebc" + "15b", + "982000cce895dc439edbcb7ba5b908cb5b7e939fe913d58506a486735a914b0dfbcebb02c33c428287baa0bfc7fe0" + "948", + "c3ea54e4d6d97c2a84dac9ac48ed9dd1a49118be880d8466044720cfdcd23427bf556f12204bb34ede29dbf207033" + "78c", + "5d7a853a18138fa90feac07c896dfca65a0f1eb2ed40f1fd7be6238dd7ef429bb1aeb0236735500eb954c9b4ba923" + "254", + "c72bcaf3a4b01986711cd5d2614aa8f9d7fad61455613eac4561b1468f9a25dd26566c8ad1190dec7567be4f6fc1d" + "b29", + "281826f23bebb3f835d2f15edcb0cdb3078ae2d7dc516f3a366af172dff4db6dd5833bc1e5ee411d52c598773e939" + "7b6", + "3a9d1a870a5f6a4c04df1daf1808163d33852897ebc757a5b028a1214fbc758485a392159b11bc360cfadc79f9512" + "822"}); + + cudf::test::strings_column_wrapper const sha384_string_results2( + {"38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b" + "95b", + "34ae2cd40efabf896d8d4173e500278d10671b2d914efb5480e8349190bc7e8e1d532ad568d00a8295ea536a9b42b" + "bc6", + "e80c25efd8032ea94dad1509a68f9bf745ce1184b8a148714c28c7e0fae1100ab14057417394f83118eaa151e014d" + "917", + "69eaddc4ef2ed967fc6a86d3ed3777b2c2015df4cf8bbbf65681556f451a4a0ae805a89c2d56641b4422b5f248c56" + "77d", + "112a6f9c74741d490747db90f5e901a88b7a32f637c030d6d96e5f89a70a5f1ee209e018648842c0e1d32002f95fd" + "d07", + "dc6f24bb0eb2c96fb53c52c402f073de089f3aeae9594be0c4f4cb31b13bd48769b80aa97d83a25ece1edf0c83373" + "f56", + "781a33adfdcdcbb514318728c074fbb59d44002995825642e0c9bfef8a2ccf3fb637b39ff3dd265df8cd93c86e945" + "ce9", + "d2efb1591c4503f23c34ddb4da6bb1017d3d4d7c9f23ee6aa52e71c98d41060bc35eb22f41b6130d5c42a6e717fb3" + "edf"}); + + 
using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col( + {0, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + + // Different truth values should be equal + cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0, 1, 2, 255}); + + auto const string_input1 = cudf::table_view({strings_col}); + auto const string_input2 = cudf::table_view({strings_col, strings_col}); + auto const sha384_string_output1 = cudf::hashing::sha384(string_input1); + auto const sha384_string_output2 = cudf::hashing::sha384(string_input2); + EXPECT_EQ(string_input1.num_rows(), sha384_string_output1->size()); + EXPECT_EQ(string_input2.num_rows(), sha384_string_output2->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_string_output1->view(), sha384_string_results1, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_string_output2->view(), sha384_string_results2, verbosity); + + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + auto const sha384_output1 = cudf::hashing::sha384(input1); + auto const sha384_output2 = cudf::hashing::sha384(input2); + EXPECT_EQ(input1.num_rows(), sha384_output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_output1->view(), sha384_output2->view(), verbosity); +} + +TEST_F(SHA384HashTest, MultiValueNulls) +{ + // Nulls with different values should be equal + cudf::test::strings_column_wrapper const strings_col1( + {"", + "Different but null!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested. 
This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + {1, 0, 0, 1, 0}); + cudf::test::strings_column_wrapper const strings_col2( + {"", + "Another string that is null.", + "Very different... but null", + "All work and no play makes Jack a dull boy", + ""}, + {1, 0, 0, 1, 1}); // empty string is equivalent to null + + // Nulls with different values should be equal + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col1( + {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + cudf::test::fixed_width_column_wrapper const ints_col2( + {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + + // Nulls with different values should be equal + // Different truth values should be equal + cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); + cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); + + auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); + auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + + auto const output1 = cudf::hashing::sha384(input1); + auto const output2 = cudf::hashing::sha384(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA384HashTestTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA384HashTestTyped, cudf::test::NumericTypes); + +TYPED_TEST(SHA384HashTestTyped, Equality) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); + auto const input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha384(input); + auto const output2 = cudf::hashing::sha384(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + 
+TYPED_TEST(SHA384HashTestTyped, EqualityNulls) +{ + using T = TypeParam; + + // Nulls with different values should be equal + cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha384(input1); + auto const output2 = cudf::hashing::sha384(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA384HashTestFloatTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA384HashTestFloatTyped, cudf::test::FloatingPointTypes); + +TYPED_TEST(SHA384HashTestFloatTyped, TestExtremes) +{ + using T = TypeParam; + T min = std::numeric_limits::min(); + T max = std::numeric_limits::max(); + T nan = std::numeric_limits::quiet_NaN(); + T inf = std::numeric_limits::infinity(); + + cudf::test::fixed_width_column_wrapper const col1( + {T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + cudf::test::fixed_width_column_wrapper const col2( + {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha384(input1); + auto const output2 = cudf::hashing::sha384(input2); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp new file mode 100644 index 00000000000..4e0b76999b7 --- /dev/null +++ b/cpp/tests/hashing/sha512_test.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; + +class SHA512HashTest : public cudf::test::BaseFixture {}; + +TEST_F(SHA512HashTest, EmptyTable) +{ + auto const empty_table = cudf::table_view{}; + auto const empty_column = cudf::make_empty_column(cudf::data_type(cudf::type_id::STRING)); + auto const output_empty_table = cudf::hashing::sha512(empty_table); + EXPECT_EQ(empty_column->size(), output_empty_table->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_empty_table->view()); + + auto const table_one_empty_column = cudf::table_view{{empty_column->view()}}; + auto const output_one_empty_column = cudf::hashing::sha512(table_one_empty_column); + EXPECT_EQ(empty_column->size(), output_one_empty_column->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_column->view(), output_one_empty_column->view()); +} + +TEST_F(SHA512HashTest, MultiValue) +{ + cudf::test::strings_column_wrapper const strings_col( + {"", + "0", + "A 56 character string to test message padding algorithm.", + "A 63 character string to test message padding algorithm, again.", + "A 64 character string to test message padding algorithm, again!!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested. 
This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); + + cudf::test::strings_column_wrapper const sha512_string_results1( + {"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877ee" + "c2f63b931bd47417a81a538327af927da3e", + "31bca02094eb78126a517b206a88c73cfa9ec6f704c7030d18212cace820f025f00bf0ea68dbf3f3a5436ca63b53b" + "f7bf80ad8d5de7d8359d0b7fed9dbc3ab99", + "1d8b355dbe0c4ad81c9815a1490f0b6a6fa710e42ca60767ffd6d845acd116defe307c9496a80c4a67653873af6ed" + "83e2e04c2102f55f9cd402677b246832e4c", + "8ac8ae9de5597aa630f071f81fcb94dc93b6a8f92d8f2cdd5a469764a5daf6ef387b6465ae097dcd6e0c64286260d" + "cc3d2c789d2cf5960df648c78a765e6c27c", + "9c436e24be60e17425a1a829642d97e7180b57485cf95db007cf5b32bbae1f2325b6874b3377e37806b15b739bffa" + "412ea6d095b726487d70e7b50e92d56c750", + "6a25ca1f20f6e79faea2a0770075e4262beb66b40f59c22d3e8abdb6188ef8d8914faf5dbf6df76165bb61b81dfda" + "46643f0d6366a39f7bd3d270312f9d3cf87", + "bae9eb4b5c05a4c5f85750b70b2f0ce78e387f992f0927a017eb40bd180a13004f6252a6bbf9816f195fb7d86668c" + "393dc0985aaf7168f48e8b905f3b9b02df2", + "05a4ca1c523dcab32edb7d8793934a4cdf41a9062b229d711f5326e297bda83fa965118b9d7636172b43688e8e149" + "008b3f967f1a969962b7e959af894a8a315"}); + + cudf::test::strings_column_wrapper const sha512_string_results2( + {"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877ee" + "c2f63b931bd47417a81a538327af927da3e", + "8ab3361c051a97ddc3c665d29f2762f8ac4240d08995f8724b6d07d8cbedd32c28f589ccdae514f20a6c8eea6f755" + "408dd3dd6837d66932ca2352eaeab594427", + "338b22eb841420affff9904f903ed14c91bf8f4d1b10f25c145a31018367607a2cf562121ba7eaa2d08db3382cc82" + "149805198c1fa3e7dc714fc2782e0f6ebd8", + "d3045ecde16ea036d2f2ff3fa685beb46d5fcb73de71f0aee653265f18b22e4c131255e6eb5ad3be2f32914408ec6" + "67911b49d951714decbdbfca1957be8ba10", + 
"da7706221f8861ef522ab9555f57306382fb18c337536545d839e431dede4ff9f9affafb82ab5588734a8fc6631e6" + "a0cd864634b62e24a42755c863c5d5c5848", + "04dadc8fdf205fe535c8eb38f20882fc2a0e308081052d7588e74f6620aa207749039468c126db7407050def80415" + "1d037cb188d5d4d459015032972a9e9f001", + "aae2e742074847889a029a8d3170f9e17177d48ec0b9dabe572aa68dd3001af0c512f164ba84aa75b13950948170a" + "0912912d16c98d2f05cb633c0d5b6a9105e", + "77f46e99a7a51ac04b4380ebca70c0782381629f711169a3b9dad3fc9aa6221a9c0cdaa9b9ea4329773e773e2987c" + "d1eebe0661386909684927d67819a2cf736"}); + + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col( + {0, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + + // Different truth values should be equal + cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0, 1, 2, 255}); + + auto const string_input1 = cudf::table_view({strings_col}); + auto const string_input2 = cudf::table_view({strings_col, strings_col}); + auto const sha512_string_output1 = cudf::hashing::sha512(string_input1); + auto const sha512_string_output2 = cudf::hashing::sha512(string_input2); + EXPECT_EQ(string_input1.num_rows(), sha512_string_output1->size()); + EXPECT_EQ(string_input2.num_rows(), sha512_string_output2->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_string_output1->view(), sha512_string_results1, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_string_output2->view(), sha512_string_results2, verbosity); + + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + auto const sha512_output1 = cudf::hashing::sha512(input1); + auto const sha512_output2 = cudf::hashing::sha512(input2); + EXPECT_EQ(input1.num_rows(), sha512_output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_output1->view(), sha512_output2->view(), verbosity); +} + 
+TEST_F(SHA512HashTest, MultiValueNulls) +{ + // Nulls with different values should be equal + cudf::test::strings_column_wrapper const strings_col1( + {"", + "Different but null!", + "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " + "the hash function being tested. This string needed to be longer.", + "All work and no play makes Jack a dull boy", + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, + {1, 0, 0, 1, 0}); + cudf::test::strings_column_wrapper const strings_col2( + {"", + "Another string that is null.", + "Very different... but null", + "All work and no play makes Jack a dull boy", + ""}, + {1, 0, 0, 1, 1}); // empty string is equivalent to null + + // Nulls with different values should be equal + using limits = std::numeric_limits; + cudf::test::fixed_width_column_wrapper const ints_col1( + {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + cudf::test::fixed_width_column_wrapper const ints_col2( + {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + + // Nulls with different values should be equal + // Different truth values should be equal + cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); + cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); + + auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); + auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + + auto const output1 = cudf::hashing::sha512(input1); + auto const output2 = cudf::hashing::sha512(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA512HashTestTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA512HashTestTyped, cudf::test::NumericTypes); + +TYPED_TEST(SHA512HashTestTyped, Equality) +{ + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); + auto const 
input = cudf::table_view({col}); + + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha512(input); + auto const output2 = cudf::hashing::sha512(input); + + EXPECT_EQ(input.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +TYPED_TEST(SHA512HashTestTyped, EqualityNulls) +{ + using T = TypeParam; + + // Nulls with different values should be equal + cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha512(input1); + auto const output2 = cudf::hashing::sha512(input2); + + EXPECT_EQ(input1.num_rows(), output1->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} + +template +class SHA512HashTestFloatTyped : public cudf::test::BaseFixture {}; + +TYPED_TEST_CASE(SHA512HashTestFloatTyped, cudf::test::FloatingPointTypes); + +TYPED_TEST(SHA512HashTestFloatTyped, TestExtremes) +{ + using T = TypeParam; + T min = std::numeric_limits::min(); + T max = std::numeric_limits::max(); + T nan = std::numeric_limits::quiet_NaN(); + T inf = std::numeric_limits::infinity(); + + cudf::test::fixed_width_column_wrapper const col1( + {T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf}); + cudf::test::fixed_width_column_wrapper const col2( + {T(-0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf}); + + auto const input1 = cudf::table_view({col1}); + auto const input2 = cudf::table_view({col2}); + + auto const output1 = cudf::hashing::sha512(input1); + auto const output2 = cudf::hashing::sha512(input2); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); +} diff --git a/java/src/main/java/ai/rapids/cudf/HashType.java b/java/src/main/java/ai/rapids/cudf/HashType.java index 081e8aa6700..170c0ce0fa9 100644 
--- a/java/src/main/java/ai/rapids/cudf/HashType.java +++ b/java/src/main/java/ai/rapids/cudf/HashType.java @@ -26,6 +26,11 @@ public enum HashType { MURMUR3(1), HASH_SPARK_MURMUR3(2), HASH_MD5(3); + HASH_SHA1(4), + HASH_SHA224(5), + HASH_SHA256(6), + HASH_SHA384(7), + HASH_SHA512(8); private static final HashType[] HASH_TYPES = HashType.values(); final int nativeId; diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index ac5e1dbe9c4..c5e435aceb7 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr @@ -16,6 +16,11 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil: HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3" HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3" HASH_MD5 "cudf::hash_id::HASH_MD5" + HASH_SHA1 "cudf::hash_id::HASH_SHA1" + HASH_SHA224 "cudf::hash_id::HASH_SHA224" + HASH_SHA256 "cudf::hash_id::HASH_SHA256" + HASH_SHA384 "cudf::hash_id::HASH_SHA384" + HASH_SHA512 "cudf::hash_id::HASH_SHA512" cdef unique_ptr[column] hash "cudf::hash" ( const table_view& input, diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 1264a9b2126..8bdbed6c2b1 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from cudf.core.buffer import acquire_spill_lock @@ -49,6 +49,16 @@ def hash(list source_columns, str method, int seed=0): c_hash_function = cpp_hash_id.HASH_MURMUR3 elif method == "md5": c_hash_function = cpp_hash_id.HASH_MD5 + elif method == "sha1": + c_hash_function = cpp_hash_id.HASH_SHA1 + elif method == "sha224": + c_hash_function = cpp_hash_id.HASH_SHA224 + elif method == "sha256": + c_hash_function = cpp_hash_id.HASH_SHA256 + elif method == "sha384": + c_hash_function = cpp_hash_id.HASH_SHA384 + elif method == "sha512": + c_hash_function = cpp_hash_id.HASH_SHA512 else: raise ValueError(f"Unsupported hash function: {method}") with nogil: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d44cf594e8b..040afe3605f 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1388,7 +1388,10 @@ def test_assign_callable(mapping): @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) -@pytest.mark.parametrize("method", ["murmur3", "md5"]) +@pytest.mark.parametrize( + "method", + ["murmur3", "md5", "sha1", "sha224", "sha256", "sha384", "sha512"], +) @pytest.mark.parametrize("seed", [None, 42]) def test_dataframe_hash_values(nrows, method, seed): gdf = cudf.DataFrame() diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8f8f87c20e0..9dac1477942 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1390,7 +1390,9 @@ def test_series_sort_index( assert_eq(expected, got, check_index_type=True) -@pytest.mark.parametrize("method", ["md5"]) +@pytest.mark.parametrize( + "method", ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] +) def test_series_hash_values(method): inputs = cudf.Series( [ From 0debae79afc4f6096e42dff48f9cba4207afc7e7 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 15 Nov 2023 14:10:52 -0600 Subject: [PATCH 02/16] Update copyright. 
--- java/src/main/java/ai/rapids/cudf/HashType.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/HashType.java b/java/src/main/java/ai/rapids/cudf/HashType.java index 170c0ce0fa9..a24f159b2e7 100644 --- a/java/src/main/java/ai/rapids/cudf/HashType.java +++ b/java/src/main/java/ai/rapids/cudf/HashType.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 0bf01fcc7386c09e31511886b45338d754552ffb Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 15 Nov 2023 18:44:05 -0600 Subject: [PATCH 03/16] Remove changes for cudf::hash and update Python/Java bindings. --- cpp/include/cudf/hashing.hpp | 7 +-- cpp/src/hash/hashing.cu | 5 -- .../main/java/ai/rapids/cudf/HashType.java | 5 -- python/cudf/cudf/_lib/cpp/hash.pxd | 47 +++++++++++++------ python/cudf/cudf/_lib/hash.pyx | 36 ++++++++++---- python/cudf/cudf/core/indexed_frame.py | 5 +- 6 files changed, 63 insertions(+), 42 deletions(-) diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 1a128089760..965de95830a 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -42,12 +42,7 @@ enum class hash_id { HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed HASH_MURMUR3, ///< Murmur3 hash function HASH_SPARK_MURMUR3, ///< Spark Murmur3 hash function - HASH_MD5, ///< MD5 hash function - HASH_SHA1, ///< SHA-1 hash function - HASH_SHA224, ///< SHA-224 hash function - HASH_SHA256, ///< SHA-256 hash function - HASH_SHA384, ///< SHA-384 hash function - HASH_SHA512 ///< SHA-512 hash function + HASH_MD5 ///< MD5 hash function }; /** diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 9986f7f27d3..68e02ef3cf4 100644 --- a/cpp/src/hash/hashing.cu +++ 
b/cpp/src/hash/hashing.cu @@ -33,11 +33,6 @@ std::unique_ptr hash(table_view const& input, case (hash_id::HASH_MURMUR3): return murmurhash3_x86_32(input, seed, stream, mr); case (hash_id::HASH_SPARK_MURMUR3): return spark_murmurhash3_x86_32(input, seed, stream, mr); case (hash_id::HASH_MD5): return md5(input, stream, mr); - case (hash_id::HASH_SHA1): return sha1(input, stream, mr); - case (hash_id::HASH_SHA224): return sha224(input, stream, mr); - case (hash_id::HASH_SHA256): return sha256(input, stream, mr); - case (hash_id::HASH_SHA384): return sha384(input, stream, mr); - case (hash_id::HASH_SHA512): return sha512(input, stream, mr); default: CUDF_FAIL("Unsupported hash function."); } } diff --git a/java/src/main/java/ai/rapids/cudf/HashType.java b/java/src/main/java/ai/rapids/cudf/HashType.java index a24f159b2e7..6e004619e57 100644 --- a/java/src/main/java/ai/rapids/cudf/HashType.java +++ b/java/src/main/java/ai/rapids/cudf/HashType.java @@ -26,11 +26,6 @@ public enum HashType { MURMUR3(1), HASH_SPARK_MURMUR3(2), HASH_MD5(3); - HASH_SHA1(4), - HASH_SHA224(5), - HASH_SHA256(6), - HASH_SHA384(7), - HASH_SHA512(8); private static final HashType[] HASH_TYPES = HashType.values(); final int nativeId; diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index c5e435aceb7..0b7cd91d72f 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -9,21 +9,38 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil: - - ctypedef enum hash_id "cudf::hash_id": - HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" - HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3" - HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3" - HASH_MD5 "cudf::hash_id::HASH_MD5" - HASH_SHA1 "cudf::hash_id::HASH_SHA1" - HASH_SHA224 "cudf::hash_id::HASH_SHA224" - HASH_SHA256 "cudf::hash_id::HASH_SHA256" - HASH_SHA384 "cudf::hash_id::HASH_SHA384" 
- HASH_SHA512 "cudf::hash_id::HASH_SHA512" - - cdef unique_ptr[column] hash "cudf::hash" ( +cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: + + cdef unique_ptr[column] murmurhash3_x86_32 "cudf::hashing::murmurhash3_x86_32" ( const table_view& input, - const hash_id hash_function, const uint32_t seed ) except + + + cdef unique_ptr[column] md5 "cudf::hashing::md5" ( + const table_view& input + ) except + + + cdef unique_ptr[column] sha1 "cudf::hashing::sha1" ( + const table_view& input + ) except + + + cdef unique_ptr[column] sha224 "cudf::hashing::sha224" ( + const table_view& input + ) except + + + cdef unique_ptr[column] sha256 "cudf::hashing::sha256" ( + const table_view& input + ) except + + + cdef unique_ptr[column] sha384 "cudf::hashing::sha384" ( + const table_view& input + ) except + + + cdef unique_ptr[column] sha512 "cudf::hashing::sha512" ( + const table_view& input + ) except + + + cdef unique_ptr[column] xxhash_64 "cudf::hashing::xxhash_64" ( + const table_view& input, + const uint64_t seed + ) except + diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 8bdbed6c2b1..43764b562a2 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -10,7 +10,16 @@ from libcpp.vector cimport vector cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id +from cudf._lib.cpp.hash cimport ( + md5, + murmurhash3_x86_32, + sha1, + sha224, + sha256, + sha384, + sha512, + xxhash_64, +) from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -44,21 +53,30 @@ def hash_partition(list source_columns, object columns_to_hash, def hash(list source_columns, str method, int seed=0): cdef table_view c_source_view = 
table_view_from_columns(source_columns) cdef unique_ptr[column] c_result - cdef cpp_hash_id c_hash_function if method == "murmur3": - c_hash_function = cpp_hash_id.HASH_MURMUR3 + with nogil: + c_result = move(murmurhash3_x86_32(c_source_view, seed)) elif method == "md5": - c_hash_function = cpp_hash_id.HASH_MD5 + with nogil: + c_result = move(md5(c_source_view)) elif method == "sha1": - c_hash_function = cpp_hash_id.HASH_SHA1 + with nogil: + c_result = move(sha1(c_source_view)) elif method == "sha224": - c_hash_function = cpp_hash_id.HASH_SHA224 + with nogil: + c_result = move(sha224(c_source_view)) elif method == "sha256": - c_hash_function = cpp_hash_id.HASH_SHA256 + with nogil: + c_result = move(sha256(c_source_view)) elif method == "sha384": - c_hash_function = cpp_hash_id.HASH_SHA384 + with nogil: + c_result = move(sha384(c_source_view)) elif method == "sha512": - c_hash_function = cpp_hash_id.HASH_SHA512 + with nogil: + c_result = move(sha512(c_source_view)) + elif method == "xxhash_64": + with nogil: + c_result = move(xxhash_64(c_source_view, seed)) else: raise ValueError(f"Unsupported hash function: {method}") with nogil: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 376bef6d0b2..04cf9b92f70 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1736,8 +1736,9 @@ def hash_values(self, method="murmur3", seed=None): ---------- method : {'murmur3', 'md5'}, default 'murmur3' Hash function to use: - * murmur3: MurmurHash3 hash function. - * md5: MD5 hash function. + + * murmur3: MurmurHash3 hash function + * md5: MD5 hash function seed : int, optional Seed value to use for the hash function. From b64c9c7e6edf7432340b2116a36708d080bb40cf Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 21 Dec 2023 12:05:11 -0600 Subject: [PATCH 04/16] Update includes and copyrights.
--- cpp/src/hash/sha1_hash.cu | 26 ++++---------------------- cpp/src/hash/sha224_hash.cu | 26 ++++---------------------- cpp/src/hash/sha256_hash.cu | 26 ++++---------------------- cpp/src/hash/sha384_hash.cu | 26 ++++---------------------- cpp/src/hash/sha512_hash.cu | 26 ++++---------------------- cpp/tests/hashing/sha1_test.cpp | 2 +- cpp/tests/hashing/sha224_test.cpp | 2 +- cpp/tests/hashing/sha256_test.cpp | 2 +- cpp/tests/hashing/sha384_test.cpp | 2 +- cpp/tests/hashing/sha512_test.cpp | 2 +- 10 files changed, 25 insertions(+), 115 deletions(-) diff --git a/cpp/src/hash/sha1_hash.cu b/cpp/src/hash/sha1_hash.cu index c0477343ca1..2104abb9f4c 100644 --- a/cpp/src/hash/sha1_hash.cu +++ b/cpp/src/hash/sha1_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,33 +16,15 @@ #include "sha_hash.cuh" -#include -#include -#include -#include -#include -#include -#include +#include #include -#include -#include -#include -#include +#include #include #include +#include -#include -#include -#include -#include -#include - -#include -#include #include -#include -#include namespace cudf { namespace hashing { diff --git a/cpp/src/hash/sha224_hash.cu b/cpp/src/hash/sha224_hash.cu index af37bde07ab..6b697f4427c 100644 --- a/cpp/src/hash/sha224_hash.cu +++ b/cpp/src/hash/sha224_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,33 +16,15 @@ #include "sha_hash.cuh" -#include -#include -#include -#include -#include -#include -#include +#include #include -#include -#include -#include -#include +#include #include #include +#include -#include -#include -#include -#include -#include - -#include -#include #include -#include -#include namespace cudf { namespace hashing { diff --git a/cpp/src/hash/sha256_hash.cu b/cpp/src/hash/sha256_hash.cu index d7a8502337e..b368f3e2630 100644 --- a/cpp/src/hash/sha256_hash.cu +++ b/cpp/src/hash/sha256_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,33 +16,15 @@ #include "sha_hash.cuh" -#include -#include -#include -#include -#include -#include -#include +#include #include -#include -#include -#include -#include +#include #include #include +#include -#include -#include -#include -#include -#include - -#include -#include #include -#include -#include namespace cudf { namespace hashing { diff --git a/cpp/src/hash/sha384_hash.cu b/cpp/src/hash/sha384_hash.cu index 83c7ceb4703..e77dadaf1c4 100644 --- a/cpp/src/hash/sha384_hash.cu +++ b/cpp/src/hash/sha384_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,33 +16,15 @@ #include "sha_hash.cuh" -#include -#include -#include -#include -#include -#include -#include +#include #include -#include -#include -#include -#include +#include #include #include +#include -#include -#include -#include -#include -#include - -#include -#include #include -#include -#include namespace cudf { namespace hashing { diff --git a/cpp/src/hash/sha512_hash.cu b/cpp/src/hash/sha512_hash.cu index f705b25a45f..88ddb58cff5 100644 --- a/cpp/src/hash/sha512_hash.cu +++ b/cpp/src/hash/sha512_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,33 +16,15 @@ #include "sha_hash.cuh" -#include -#include -#include -#include -#include -#include -#include +#include #include -#include -#include -#include -#include +#include #include #include +#include -#include -#include -#include -#include -#include - -#include -#include #include -#include -#include namespace cudf { namespace hashing { diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp index 6168fb06909..a7b73c1477a 100644 --- a/cpp/tests/hashing/sha1_test.cpp +++ b/cpp/tests/hashing/sha1_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp index 3192107e8ec..9c669d617bf 100644 --- a/cpp/tests/hashing/sha224_test.cpp +++ b/cpp/tests/hashing/sha224_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index df35755831c..70f1a3bda77 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp index 435a95bb4a8..c3e4acc8504 100644 --- a/cpp/tests/hashing/sha384_test.cpp +++ b/cpp/tests/hashing/sha384_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp index 4e0b76999b7..80f2de08721 100644 --- a/cpp/tests/hashing/sha512_test.cpp +++ b/cpp/tests/hashing/sha512_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 4822760e072e8d882a51d21c1be98957a2856581 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 2 Jan 2024 13:59:52 -0600 Subject: [PATCH 05/16] Update copyright years. 
--- cpp/CMakeLists.txt | 2 +- cpp/benchmarks/hashing/hash.cpp | 2 +- cpp/include/cudf/hashing.hpp | 2 +- cpp/include/cudf/hashing/detail/hash_functions.cuh | 2 +- cpp/include/cudf/hashing/detail/hashing.hpp | 2 +- cpp/src/hash/md5_hash.cu | 2 +- cpp/src/hash/sha1_hash.cu | 2 +- cpp/src/hash/sha224_hash.cu | 2 +- cpp/src/hash/sha256_hash.cu | 2 +- cpp/src/hash/sha384_hash.cu | 2 +- cpp/src/hash/sha512_hash.cu | 2 +- cpp/src/hash/sha_hash.cuh | 2 +- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/hashing/sha1_test.cpp | 2 +- cpp/tests/hashing/sha224_test.cpp | 2 +- cpp/tests/hashing/sha256_test.cpp | 2 +- cpp/tests/hashing/sha384_test.cpp | 2 +- cpp/tests/hashing/sha512_test.cpp | 2 +- java/src/main/java/ai/rapids/cudf/HashType.java | 2 +- python/cudf/cudf/_lib/cpp/hash.pxd | 2 +- python/cudf/cudf/_lib/hash.pyx | 2 +- python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/tests/test_series.py | 2 +- 23 files changed, 23 insertions(+), 23 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3aee24b9616..6760b8787b0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index 18a95d2f46e..b86d8136400 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 965de95830a..c3a57af1358 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh index 4b65a5e6b37..038370086fa 100644 --- a/cpp/include/cudf/hashing/detail/hash_functions.cuh +++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023, NVIDIA CORPORATION. + * Copyright (c) 2017-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index f5bc7000604..eaeb5d6b068 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index 6c09f67c055..40dde6fefc5 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/src/hash/sha1_hash.cu b/cpp/src/hash/sha1_hash.cu index 2104abb9f4c..e7e5d09a684 100644 --- a/cpp/src/hash/sha1_hash.cu +++ b/cpp/src/hash/sha1_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/hash/sha224_hash.cu b/cpp/src/hash/sha224_hash.cu index 6b697f4427c..b68c1205ac1 100644 --- a/cpp/src/hash/sha224_hash.cu +++ b/cpp/src/hash/sha224_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/hash/sha256_hash.cu b/cpp/src/hash/sha256_hash.cu index b368f3e2630..7b5fe489d17 100644 --- a/cpp/src/hash/sha256_hash.cu +++ b/cpp/src/hash/sha256_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/hash/sha384_hash.cu b/cpp/src/hash/sha384_hash.cu index e77dadaf1c4..700a23d42e1 100644 --- a/cpp/src/hash/sha384_hash.cu +++ b/cpp/src/hash/sha384_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/hash/sha512_hash.cu b/cpp/src/hash/sha512_hash.cu index 88ddb58cff5..9b533ab911d 100644 --- a/cpp/src/hash/sha512_hash.cu +++ b/cpp/src/hash/sha512_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh index eeca14f6b3b..d0dde7f1c34 100644 --- a/cpp/src/hash/sha_hash.cuh +++ b/cpp/src/hash/sha_hash.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 616550a8e66..8a63a5c4870 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp index a7b73c1477a..7ae401aa1ec 100644 --- a/cpp/tests/hashing/sha1_test.cpp +++ b/cpp/tests/hashing/sha1_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp index 9c669d617bf..fc1fbe07f2f 100644 --- a/cpp/tests/hashing/sha224_test.cpp +++ b/cpp/tests/hashing/sha224_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index 70f1a3bda77..de1c4ccc24a 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp index c3e4acc8504..3095307d070 100644 --- a/cpp/tests/hashing/sha384_test.cpp +++ b/cpp/tests/hashing/sha384_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp index 80f2de08721..01ae681106c 100644 --- a/cpp/tests/hashing/sha512_test.cpp +++ b/cpp/tests/hashing/sha512_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/java/ai/rapids/cudf/HashType.java b/java/src/main/java/ai/rapids/cudf/HashType.java index 6e004619e57..081e8aa6700 100644 --- a/java/src/main/java/ai/rapids/cudf/HashType.java +++ b/java/src/main/java/ai/rapids/cudf/HashType.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index ef45181fbbf..d55e244dc2c 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index cb413a113c8..6854cff7763 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8cf1476e533..a83498e5336 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import array as arr import contextlib diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 75775725753..87c9442e3c4 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import decimal import hashlib From 1e5017ee4f2443cb33005cdb8ebabf1c1ceb3752 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 2 Jan 2024 15:24:22 -0600 Subject: [PATCH 06/16] Add dtypes for SHA tests. 
--- python/cudf/cudf/tests/test_dataframe.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index a83498e5336..3db6bf72924 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1427,6 +1427,11 @@ def test_dataframe_hash_values(nrows, method, seed): expected_dtypes = { "murmur3": np.uint32, "md5": object, + "sha1": object, + "sha224": object, + "sha256": object, + "sha384": object, + "sha512": object, "xxhash64": np.uint64, } assert out.dtype == expected_dtypes[method] From 2da294f936f8e41ff8381a9fac05476932e0c593 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 17 Jan 2024 09:10:22 -0800 Subject: [PATCH 07/16] Move return types after __device__ and other annotations. --- cpp/include/cudf/hashing/detail/hash_functions.cuh | 2 +- cpp/src/hash/sha1_hash.cu | 2 +- cpp/src/hash/sha224_hash.cu | 2 +- cpp/src/hash/sha256_hash.cu | 2 +- cpp/src/hash/sha384_hash.cu | 2 +- cpp/src/hash/sha512_hash.cu | 2 +- cpp/src/hash/sha_hash.cuh | 14 +++++++------- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh index 038370086fa..0ec41a20ef1 100644 --- a/cpp/include/cudf/hashing/detail/hash_functions.cuh +++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh @@ -93,7 +93,7 @@ __device__ inline uint64_t swap_endian(uint64_t x) * Licensed under the MIT license. 
* See file LICENSE for detail or copy at https://opensource.org/licenses/MIT */ -void __device__ inline uint32ToLowercaseHexString(uint32_t num, char* destination) +__device__ inline void uint32ToLowercaseHexString(uint32_t num, char* destination) { // Transform 0xABCD'1234 => 0x0000'ABCD'0000'1234 => 0x0B0A'0D0C'0201'0403 uint64_t x = num; diff --git a/cpp/src/hash/sha1_hash.cu b/cpp/src/hash/sha1_hash.cu index e7e5d09a684..5d5e7099acd 100644 --- a/cpp/src/hash/sha1_hash.cu +++ b/cpp/src/hash/sha1_hash.cu @@ -53,7 +53,7 @@ struct SHA1Hash : HashBase { // Number of bytes used for the message length static constexpr uint32_t message_length_size = 8; - void __device__ inline hash_step(hash_state& state) { sha1_hash_step(state); } + __device__ inline void hash_step(hash_state& state) { sha1_hash_step(state); } hash_state state; }; diff --git a/cpp/src/hash/sha224_hash.cu b/cpp/src/hash/sha224_hash.cu index b68c1205ac1..40ddf46b2ff 100644 --- a/cpp/src/hash/sha224_hash.cu +++ b/cpp/src/hash/sha224_hash.cu @@ -54,7 +54,7 @@ struct SHA224Hash : HashBase { // Number of bytes used for the message length static constexpr uint32_t message_length_size = 8; - void __device__ inline hash_step(hash_state& state) { sha256_hash_step(state); } + __device__ inline void hash_step(hash_state& state) { sha256_hash_step(state); } hash_state state; }; diff --git a/cpp/src/hash/sha256_hash.cu b/cpp/src/hash/sha256_hash.cu index 7b5fe489d17..d5e12c5adc4 100644 --- a/cpp/src/hash/sha256_hash.cu +++ b/cpp/src/hash/sha256_hash.cu @@ -54,7 +54,7 @@ struct SHA256Hash : HashBase { // Number of bytes used for the message length static constexpr uint32_t message_length_size = 8; - void __device__ inline hash_step(hash_state& state) { sha256_hash_step(state); } + __device__ inline void hash_step(hash_state& state) { sha256_hash_step(state); } hash_state state; }; diff --git a/cpp/src/hash/sha384_hash.cu b/cpp/src/hash/sha384_hash.cu index 700a23d42e1..1d115f23cd0 100644 --- 
a/cpp/src/hash/sha384_hash.cu +++ b/cpp/src/hash/sha384_hash.cu @@ -61,7 +61,7 @@ struct SHA384Hash : HashBase { // Number of bytes used for the message length static constexpr uint32_t message_length_size = 16; - void __device__ inline hash_step(hash_state& state) { sha512_hash_step(state); } + __device__ inline void hash_step(hash_state& state) { sha512_hash_step(state); } hash_state state; }; diff --git a/cpp/src/hash/sha512_hash.cu b/cpp/src/hash/sha512_hash.cu index 9b533ab911d..dfc1f43f478 100644 --- a/cpp/src/hash/sha512_hash.cu +++ b/cpp/src/hash/sha512_hash.cu @@ -61,7 +61,7 @@ struct SHA512Hash : HashBase { // Number of bytes used for the message length static constexpr uint32_t message_length_size = 16; - void __device__ inline hash_step(hash_state& state) { sha512_hash_step(state); } + __device__ inline void hash_step(hash_state& state) { sha512_hash_step(state); } hash_state state; }; diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh index d0dde7f1c34..57cfa2d7dbc 100644 --- a/cpp/src/hash/sha_hash.cuh +++ b/cpp/src/hash/sha_hash.cuh @@ -108,7 +108,7 @@ struct HashBase : public crtp { * This accepts arbitrary data, handles it as bytes, and calls the hash step * when the buffer is filled up to message_chunk_size bytes. */ - void __device__ inline process(uint8_t const* data, uint32_t len) + __device__ inline void process(uint8_t const* data, uint32_t len) { auto& state = this->underlying().state; state.message_length += len; @@ -140,7 +140,7 @@ struct HashBase : public crtp { } template - void __device__ inline process_fixed_width(T const& key) + __device__ inline void process_fixed_width(T const& key) { uint8_t const* data = reinterpret_cast(&key); uint32_t constexpr len = sizeof(T); @@ -154,7 +154,7 @@ struct HashBase : public crtp { * the message length (in another step of the hash, if needed), and performs * the final hash step. 
*/ - void __device__ inline finalize() + __device__ inline void finalize() { auto& state = this->underlying().state; // Message length in bits. @@ -238,7 +238,7 @@ struct HasherDispatcher { } template - void __device__ inline operator()(size_type row_index) + __device__ inline void operator()(size_type row_index) { if constexpr (is_fixed_width() && !is_chrono()) { Element const& key = input_col.element(row_index); @@ -271,7 +271,7 @@ struct HasherDispatcher { * updating the hash value so far. Does not zero out the buffer contents. */ template -void __device__ inline sha1_hash_step(hash_state& state) +__device__ inline void sha1_hash_step(hash_state& state) { uint32_t words[80]; @@ -338,7 +338,7 @@ void __device__ inline sha1_hash_step(hash_state& state) * updating the hash value so far. Does not zero out the buffer contents. */ template -void __device__ inline sha256_hash_step(hash_state& state) +__device__ inline void sha256_hash_step(hash_state& state) { uint32_t words[64]; @@ -404,7 +404,7 @@ void __device__ inline sha256_hash_step(hash_state& state) * updating the hash value so far. Does not zero out the buffer contents. */ template -void __device__ inline sha512_hash_step(hash_state& state) +__device__ inline void sha512_hash_step(hash_state& state) { uint64_t words[80]; From fdb7c8fd17a35db978b9d2711d5ada33961d47b4 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 17 Jan 2024 16:44:03 -0800 Subject: [PATCH 08/16] Remove empty_result and fix some docstrings. 
--- cpp/src/hash/sha1_hash.cu | 3 +-- cpp/src/hash/sha224_hash.cu | 3 +-- cpp/src/hash/sha256_hash.cu | 4 +--- cpp/src/hash/sha384_hash.cu | 5 +---- cpp/src/hash/sha512_hash.cu | 5 +---- cpp/src/hash/sha_hash.cuh | 15 ++++----------- 6 files changed, 9 insertions(+), 26 deletions(-) diff --git a/cpp/src/hash/sha1_hash.cu b/cpp/src/hash/sha1_hash.cu index 5d5e7099acd..71253d279b9 100644 --- a/cpp/src/hash/sha1_hash.cu +++ b/cpp/src/hash/sha1_hash.cu @@ -64,8 +64,7 @@ std::unique_ptr sha1(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - string_scalar const empty_result("da39a3ee5e6b4b0d3255bfef95601890afd80709"); - return sha_hash(input, empty_result, stream, mr); + return sha_hash(input, stream, mr); } } // namespace detail diff --git a/cpp/src/hash/sha224_hash.cu b/cpp/src/hash/sha224_hash.cu index 40ddf46b2ff..61480a78776 100644 --- a/cpp/src/hash/sha224_hash.cu +++ b/cpp/src/hash/sha224_hash.cu @@ -65,8 +65,7 @@ std::unique_ptr sha224(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - string_scalar const empty_result("d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f"); - return sha_hash(input, empty_result, stream, mr); + return sha_hash(input, stream, mr); } } // namespace detail diff --git a/cpp/src/hash/sha256_hash.cu b/cpp/src/hash/sha256_hash.cu index d5e12c5adc4..b15cfe09d52 100644 --- a/cpp/src/hash/sha256_hash.cu +++ b/cpp/src/hash/sha256_hash.cu @@ -65,9 +65,7 @@ std::unique_ptr sha256(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - string_scalar const empty_result( - "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); - return sha_hash(input, empty_result, stream, mr); + return sha_hash(input, stream, mr); } } // namespace detail diff --git a/cpp/src/hash/sha384_hash.cu b/cpp/src/hash/sha384_hash.cu index 1d115f23cd0..3075d2c62f8 100644 --- a/cpp/src/hash/sha384_hash.cu +++ 
b/cpp/src/hash/sha384_hash.cu @@ -72,10 +72,7 @@ std::unique_ptr sha384(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - string_scalar const empty_result( - "38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b9" - "5b"); - return sha_hash(input, empty_result, stream, mr); + return sha_hash(input, stream, mr); } } // namespace detail diff --git a/cpp/src/hash/sha512_hash.cu b/cpp/src/hash/sha512_hash.cu index dfc1f43f478..d073cf1edca 100644 --- a/cpp/src/hash/sha512_hash.cu +++ b/cpp/src/hash/sha512_hash.cu @@ -72,10 +72,7 @@ std::unique_ptr sha512(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - string_scalar const empty_result( - "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec" - "2f63b931bd47417a81a538327af927da3e"); - return sha_hash(input, empty_result, stream, mr); + return sha_hash(input, stream, mr); } } // namespace detail diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh index 57cfa2d7dbc..ebf6e80bedf 100644 --- a/cpp/src/hash/sha_hash.cuh +++ b/cpp/src/hash/sha_hash.cuh @@ -476,24 +476,17 @@ bool inline sha_leaf_type_check(data_type dt) * * @tparam Hasher The struct used for computing SHA hashes. * - * @param input The input table. - * results. - * @param empty_result A string representing the expected result for empty inputs. - * @param stream CUDA stream on which memory may be allocated if the memory - * resource supports streams. + * @param input The input table + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Memory resource to use for the device memory allocation - * @return A new column with the computed hash function result. 
+ * @return A new column with the computed hash function result */ template std::unique_ptr sha_hash(table_view const& input, - string_scalar const& empty_result, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.num_columns() == 0 || input.num_rows() == 0) { - // Return the hash of a zero-length input. - return make_column_from_scalar(empty_result, input.num_rows(), stream, mr); - } + if (input.num_rows() == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } // Accepts string and fixed width columns. // TODO: Accept single layer list columns holding those types. From 2993c24a89a553758cf06de24cbc927280ab4590 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 17 Jan 2024 16:57:23 -0800 Subject: [PATCH 09/16] Update docstrings. --- cpp/src/hash/sha_hash.cuh | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh index ebf6e80bedf..b8c303bfb88 100644 --- a/cpp/src/hash/sha_hash.cuh +++ b/cpp/src/hash/sha_hash.cuh @@ -267,8 +267,10 @@ struct HasherDispatcher { }; /** - * @brief Core SHA-1 algorithm implementation. Processes a single 512-bit chunk, - * updating the hash value so far. Does not zero out the buffer contents. + * @brief Core SHA-1 algorithm implementation + * + * Processes a single 512-bit chunk, updating the hash value so far. + * This does not zero out the buffer contents. */ template __device__ inline void sha1_hash_step(hash_state& state) @@ -334,8 +336,10 @@ __device__ inline void sha1_hash_step(hash_state& state) } /** - * @brief Core SHA-256 algorithm implementation. Processes a single 512-bit chunk, - * updating the hash value so far. Does not zero out the buffer contents. + * @brief Core SHA-256 algorithm implementation + * + * Processes a single 512-bit chunk, updating the hash value so far. + * This does not zero out the buffer contents. 
*/ template __device__ inline void sha256_hash_step(hash_state& state) @@ -400,8 +404,10 @@ __device__ inline void sha256_hash_step(hash_state& state) } /** - * @brief Core SHA-512 algorithm implementation. Processes a single 1024-bit chunk, - * updating the hash value so far. Does not zero out the buffer contents. + * @brief Core SHA-512 algorithm implementation + * + * Processes a single 1024-bit chunk, updating the hash value so far. + * This does not zero out the buffer contents. */ template __device__ inline void sha512_hash_step(hash_state& state) From 335e99139afb949f7295438efe2918e530e818fb Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 19 Jan 2024 12:49:10 -0600 Subject: [PATCH 10/16] Fix calls to deprecated strings factory API. --- cpp/src/hash/sha_hash.cuh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh index b8c303bfb88..ebefd8108f0 100644 --- a/cpp/src/hash/sha_hash.cuh +++ b/cpp/src/hash/sha_hash.cuh @@ -506,9 +506,8 @@ std::unique_ptr sha_hash(table_view const& input, auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); + auto chars = rmm::device_uvector(bytes, stream, mr); + auto d_chars = chars.data(); auto const device_input = table_device_view::create(input, stream); @@ -527,8 +526,7 @@ std::unique_ptr sha_hash(table_view const& input, hasher.finalize(); }); - return make_strings_column( - input.num_rows(), std::move(offsets_column), std::move(chars_column), 0, {}); + return make_strings_column(input.num_rows(), std::move(offsets_column), chars.release(), 0, {}); } } // namespace detail From 8d883e4dd706956d4d3635923fb71e303ed9af7b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 19 Jan 2024 12:52:12 -0600 Subject: [PATCH 
11/16] Remove verbosity. --- cpp/tests/hashing/sha1_test.cpp | 2 -- cpp/tests/hashing/sha224_test.cpp | 2 -- cpp/tests/hashing/sha384_test.cpp | 8 +++----- cpp/tests/hashing/sha512_test.cpp | 8 +++----- 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp index 7ae401aa1ec..4ad101c5f93 100644 --- a/cpp/tests/hashing/sha1_test.cpp +++ b/cpp/tests/hashing/sha1_test.cpp @@ -23,8 +23,6 @@ #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - class SHA1HashTest : public cudf::test::BaseFixture {}; TEST_F(SHA1HashTest, EmptyTable) diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp index fc1fbe07f2f..8f47a87f330 100644 --- a/cpp/tests/hashing/sha224_test.cpp +++ b/cpp/tests/hashing/sha224_test.cpp @@ -23,8 +23,6 @@ #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - class SHA224HashTest : public cudf::test::BaseFixture {}; TEST_F(SHA224HashTest, EmptyTable) diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp index 3095307d070..e636ac5dd4e 100644 --- a/cpp/tests/hashing/sha384_test.cpp +++ b/cpp/tests/hashing/sha384_test.cpp @@ -23,8 +23,6 @@ #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - class SHA384HashTest : public cudf::test::BaseFixture {}; TEST_F(SHA384HashTest, EmptyTable) @@ -104,15 +102,15 @@ TEST_F(SHA384HashTest, MultiValue) auto const sha384_string_output2 = cudf::hashing::sha384(string_input2); EXPECT_EQ(string_input1.num_rows(), sha384_string_output1->size()); EXPECT_EQ(string_input2.num_rows(), sha384_string_output2->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_string_output1->view(), sha384_string_results1, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_string_output2->view(), sha384_string_results2, verbosity); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_string_output1->view(), sha384_string_results1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_string_output2->view(), sha384_string_results2); auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); auto const sha384_output1 = cudf::hashing::sha384(input1); auto const sha384_output2 = cudf::hashing::sha384(input2); EXPECT_EQ(input1.num_rows(), sha384_output1->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_output1->view(), sha384_output2->view(), verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_output1->view(), sha384_output2->view()); } TEST_F(SHA384HashTest, MultiValueNulls) diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp index 01ae681106c..301aa614dfa 100644 --- a/cpp/tests/hashing/sha512_test.cpp +++ b/cpp/tests/hashing/sha512_test.cpp @@ -23,8 +23,6 @@ #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - class SHA512HashTest : public cudf::test::BaseFixture {}; TEST_F(SHA512HashTest, EmptyTable) @@ -104,15 +102,15 @@ TEST_F(SHA512HashTest, MultiValue) auto const sha512_string_output2 = cudf::hashing::sha512(string_input2); EXPECT_EQ(string_input1.num_rows(), sha512_string_output1->size()); EXPECT_EQ(string_input2.num_rows(), sha512_string_output2->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_string_output1->view(), sha512_string_results1, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_string_output2->view(), sha512_string_results2, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_string_output1->view(), sha512_string_results1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_string_output2->view(), sha512_string_results2); auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); auto const sha512_output1 = cudf::hashing::sha512(input1); 
auto const sha512_output2 = cudf::hashing::sha512(input2); EXPECT_EQ(input1.num_rows(), sha512_output1->size()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_output1->view(), sha512_output2->view(), verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_output1->view(), sha512_output2->view()); } TEST_F(SHA512HashTest, MultiValueNulls) From 01881154feacfbc1fca4ee0e3b0a40ef56b5802f Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 19 Jan 2024 16:10:13 -0600 Subject: [PATCH 12/16] Remove unsanitized nulls from MD5 tests. --- cpp/tests/hashing/md5_test.cpp | 90 ++++++++++++---------------------- 1 file changed, 31 insertions(+), 59 deletions(-) diff --git a/cpp/tests/hashing/md5_test.cpp b/cpp/tests/hashing/md5_test.cpp index 52ca52eb2ff..7bfb87bdf81 100644 --- a/cpp/tests/hashing/md5_test.cpp +++ b/cpp/tests/hashing/md5_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -55,10 +55,9 @@ TEST_F(MD5HashTest, MultiValue) cudf::test::fixed_width_column_wrapper const ints_col( {0, 100, -100, limits::min(), limits::max()}); - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0}); + // Test against known outputs auto const string_input1 = cudf::table_view({strings_col}); auto const string_input2 = cudf::table_view({strings_col, strings_col}); auto const md5_string_output1 = cudf::hashing::md5(string_input1); @@ -68,47 +67,23 @@ TEST_F(MD5HashTest, MultiValue) CUDF_TEST_EXPECT_COLUMNS_EQUAL(md5_string_output1->view(), md5_string_results1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(md5_string_output2->view(), md5_string_results2); - auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); - auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); auto const md5_output1 = cudf::hashing::md5(input1); auto const md5_output2 = cudf::hashing::md5(input2); EXPECT_EQ(input1.num_rows(), md5_output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(md5_output1->view(), md5_output2->view()); } -TEST_F(MD5HashTest, MultiValueNulls) +TEST_F(MD5HashTest, EmptyNullEquivalence) { - // Nulls with different values should be equal - cudf::test::strings_column_wrapper const strings_col1( - {"", - "Different but null!", - "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " - "MD5 hash function. 
This string needed to be longer.", - "All work and no play makes Jack a dull boy", - R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}, - {1, 0, 0, 1, 0}); - cudf::test::strings_column_wrapper const strings_col2( - {"", - "A 60 character string to test MD5's message padding algorithm", - "Very different... but null", - "All work and no play makes Jack a dull boy", - ""}, - {1, 0, 0, 1, 1}); // empty string is equivalent to null - - // Nulls with different values should be equal - using limits = std::numeric_limits; - cudf::test::fixed_width_column_wrapper const ints_col1( - {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); - cudf::test::fixed_width_column_wrapper const ints_col2( - {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); - // Nulls with different values should be equal - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); - - auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); - auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); auto const output1 = cudf::hashing::md5(input1); auto const output2 = cudf::hashing::md5(input2); @@ -117,10 +92,12 @@ TEST_F(MD5HashTest, MultiValueNulls) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } -TEST_F(MD5HashTest, StringListsNulls) +TEST_F(MD5HashTest, StringLists) { auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0; }); + // Test of data serialization: a string should hash the same as a list of + 
// strings that concatenate to the same input. cudf::test::strings_column_wrapper const strings_col( {"", "A 60 character string to test MD5's message padding algorithm", @@ -131,7 +108,7 @@ TEST_F(MD5HashTest, StringListsNulls) cudf::test::lists_column_wrapper strings_list_col( {{""}, - {{"NULL", "A 60 character string to test MD5's message padding algorithm"}, validity}, + {{"", "A 60 character string to test MD5's message padding algorithm"}, validity}, {"A very long (greater than 128 bytes/char string) to test a multi hash-step data point in " "the " "MD5 hash function. This string needed to be longer.", @@ -153,7 +130,7 @@ class MD5HashTestTyped : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(MD5HashTestTyped, cudf::test::NumericTypes); -TYPED_TEST(MD5HashTestTyped, Equality) +TYPED_TEST(MD5HashTestTyped, NoNulls) { cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); auto const input = cudf::table_view({col}); @@ -166,31 +143,26 @@ TYPED_TEST(MD5HashTestTyped, Equality) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } -TYPED_TEST(MD5HashTestTyped, EqualityNulls) +TYPED_TEST(MD5HashTestTyped, WithNulls) { - using T = TypeParam; - - // Nulls with different values should be equal - cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - - auto const input1 = cudf::table_view({col1}); - auto const input2 = cudf::table_view({col2}); + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); - auto const output1 = cudf::hashing::md5(input1); - auto const output2 = cudf::hashing::md5(input2); + // Hash of same input should be equal + auto const output1 = cudf::hashing::md5(input); + auto const output2 = cudf::hashing::md5(input); - EXPECT_EQ(input1.num_rows(), output1->size()); + EXPECT_EQ(input.num_rows(), output1->size()); 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } TEST_F(MD5HashTest, TestBoolListsWithNulls) { - cudf::test::fixed_width_column_wrapper const col1({0, 255, 255, 16, 27, 18, 100, 1, 2}, + cudf::test::fixed_width_column_wrapper const col1({0, 0, 0, 0, 1, 1, 1, 0, 0}, {1, 0, 0, 0, 1, 1, 1, 0, 0}); - cudf::test::fixed_width_column_wrapper const col2({0, 255, 255, 32, 81, 68, 3, 101, 4}, + cudf::test::fixed_width_column_wrapper const col2({0, 0, 0, 1, 0, 1, 0, 1, 0}, {1, 0, 0, 1, 0, 1, 0, 1, 0}); - cudf::test::fixed_width_column_wrapper const col3({0, 255, 255, 64, 49, 42, 5, 6, 102}, + cudf::test::fixed_width_column_wrapper const col3({0, 0, 0, 1, 1, 0, 0, 0, 1}, {1, 0, 0, 1, 1, 0, 0, 0, 1}); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; }); @@ -218,16 +190,16 @@ TYPED_TEST(MD5HashListTestTyped, TestListsWithNulls) { using T = TypeParam; - cudf::test::fixed_width_column_wrapper const col1({0, 255, 255, 16, 27, 18, 100, 1, 2}, + cudf::test::fixed_width_column_wrapper const col1({0, 0, 0, 0, 27, 18, 100, 0, 0}, {1, 0, 0, 0, 1, 1, 1, 0, 0}); - cudf::test::fixed_width_column_wrapper const col2({0, 255, 255, 32, 81, 68, 3, 101, 4}, + cudf::test::fixed_width_column_wrapper const col2({0, 0, 0, 32, 0, 68, 0, 101, 0}, {1, 0, 0, 1, 0, 1, 0, 1, 0}); - cudf::test::fixed_width_column_wrapper const col3({0, 255, 255, 64, 49, 42, 5, 6, 102}, + cudf::test::fixed_width_column_wrapper const col3({0, 0, 0, 64, 49, 0, 0, 0, 102}, {1, 0, 0, 1, 1, 0, 0, 0, 1}); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; }); cudf::test::lists_column_wrapper const list_col( - {{0, 0, 0}, {127}, {}, {{32, 127, 64}, validity}, {27, 49}, {18, 68}, {100}, {101}, {102}}, + {{0, 0, 0}, {}, {}, {{32, 0, 64}, validity}, {27, 49}, {18, 68}, {100}, {101}, {102}}, validity); auto const input1 = cudf::table_view({col1, col2, col3}); From 186eace705f00b815c2a8962a8a92fbdda922adf Mon Sep 17 00:00:00 2001 
From: Bradley Dice Date: Fri, 19 Jan 2024 17:04:15 -0600 Subject: [PATCH 13/16] Add tests for multi-byte inputs, logic errors for lists/structs, comments about how to reproduce test values, remove unsanitized null tests. --- cpp/tests/hashing/md5_test.cpp | 28 ++++++--- cpp/tests/hashing/sha1_test.cpp | 94 +++++++++++++++++------------- cpp/tests/hashing/sha224_test.cpp | 94 +++++++++++++++++------------- cpp/tests/hashing/sha256_test.cpp | 95 ++++++++++++++++-------------- cpp/tests/hashing/sha384_test.cpp | 96 ++++++++++++++++++------------- cpp/tests/hashing/sha512_test.cpp | 96 ++++++++++++++++++------------- 6 files changed, 290 insertions(+), 213 deletions(-) diff --git a/cpp/tests/hashing/md5_test.cpp b/cpp/tests/hashing/md5_test.cpp index 7bfb87bdf81..f35e11699c0 100644 --- a/cpp/tests/hashing/md5_test.cpp +++ b/cpp/tests/hashing/md5_test.cpp @@ -35,29 +35,43 @@ TEST_F(MD5HashTest, MultiValue) "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the " "MD5 hash function. 
This string needed to be longer.", "All work and no play makes Jack a dull boy", - R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"}); - + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "Multi-byte characters: é¼³⅝"}); + + /* + These outputs can be generated with shell: + ``` + echo -n "input string" | md5sum + ``` + Or with Python: + ``` + import hashlib + print(hashlib.md5("input string".encode()).hexdigest()) + ``` + */ cudf::test::strings_column_wrapper const md5_string_results1( {"d41d8cd98f00b204e9800998ecf8427e", "682240021651ae166d08fe2a014d5c09", "3669d5225fddbb34676312ca3b78bbd9", "c61a4185135eda043f35e92c3505e180", - "52da74c75cb6575d25be29e66bd0adde"}); + "52da74c75cb6575d25be29e66bd0adde", + "65d1f8a3274d134f1ea9e6e854c72caa"}); cudf::test::strings_column_wrapper const md5_string_results2( {"d41d8cd98f00b204e9800998ecf8427e", "e5a5682e82278e78dbaad9a689df7a73", "4121ab1bb6e84172fd94822645862ae9", "28970886501efe20164213855afe5850", - "6bc1b872103cc6a02d882245b8516e2e"}); + "6bc1b872103cc6a02d882245b8516e2e", + "0772a7e13ec8fef61474c131598762f7"}); using limits = std::numeric_limits; cudf::test::fixed_width_column_wrapper const ints_col( - {0, 100, -100, limits::min(), limits::max()}); + {0, -1, 100, -100, limits::min(), limits::max()}); - cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0}); + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0, 1}); - // Test against known outputs + // Test string inputs against known outputs auto const string_input1 = cudf::table_view({strings_col}); auto const string_input2 = cudf::table_view({strings_col, strings_col}); auto const md5_string_output1 = cudf::hashing::md5(string_input1); diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp index 4ad101c5f93..128794cd826 100644 --- a/cpp/tests/hashing/sha1_test.cpp +++ b/cpp/tests/hashing/sha1_test.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -50,8 +51,20 @@ 
TEST_F(SHA1HashTest, MultiValue) "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " "the hash function being tested. This string needed to be longer.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); - + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "Multi-byte characters: é¼³⅝"}); + + /* + These outputs can be generated with shell: + ``` + echo -n "input string" | sha1sum + ``` + Or with Python: + ``` + import hashlib + print(hashlib.sha1("input string".encode()).hexdigest()) + ``` + */ cudf::test::strings_column_wrapper const sha1_string_results1( {"da39a3ee5e6b4b0d3255bfef95601890afd80709", "b6589fc6ab0dc82cf12099d1c2d40ab994e8410c", @@ -60,7 +73,8 @@ TEST_F(SHA1HashTest, MultiValue) "4ffaf61804c55b8c2171be548bef2e1d0baca17a", "595965dd18f38087186162c788485fe249242131", "a62ca720fbab830c8890044eacbeac216f1ca2e4", - "11e16c52273b5669a41d17ec7c187475193f88b3"}); + "11e16c52273b5669a41d17ec7c187475193f88b3", + "0826be2f7b9340eed269c7f9f3f3662c0a3ece68"}); cudf::test::strings_column_wrapper const sha1_string_results2( {"da39a3ee5e6b4b0d3255bfef95601890afd80709", @@ -70,16 +84,16 @@ TEST_F(SHA1HashTest, MultiValue) "c7abd431a775c604edf41a62f7f215e7258dc16a", "153fdf20d2bd8ae76241197314d6e0be7fe10f50", "8c3656f7cb37898f9296c1965000d6da13fed64e", - "b4a848399375ec842c2cb445d98b5f80a4dce94f"}); + "b4a848399375ec842c2cb445d98b5f80a4dce94f", + "106a56e997aa6a149cc5091750574a25c324fa65"}); using limits = std::numeric_limits; cudf::test::fixed_width_column_wrapper const ints_col( - {0, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + {0, -1, 100, -100, limits::min(), limits::max(), 1, 2, 3}); - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0, 1, 1, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0, 1, 2, 255}); + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 
1, 1, 0, 1, 1, 1, 0}); + // Test string inputs against known outputs auto const string_input1 = cudf::table_view({strings_col}); auto const string_input2 = cudf::table_view({strings_col, strings_col}); auto const sha1_string_output1 = cudf::hashing::sha1(string_input1); @@ -89,47 +103,23 @@ TEST_F(SHA1HashTest, MultiValue) CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha1_string_output1->view(), sha1_string_results1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha1_string_output2->view(), sha1_string_results2); - auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); - auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); auto const sha1_output1 = cudf::hashing::sha1(input1); auto const sha1_output2 = cudf::hashing::sha1(input2); EXPECT_EQ(input1.num_rows(), sha1_output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha1_output1->view(), sha1_output2->view()); } -TEST_F(SHA1HashTest, MultiValueNulls) +TEST_F(SHA1HashTest, EmptyNullEquivalence) { - // Nulls with different values should be equal - cudf::test::strings_column_wrapper const strings_col1( - {"", - "Different but null!", - "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " - "the hash function being tested. This string needed to be longer.", - "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, - {1, 0, 0, 1, 0}); - cudf::test::strings_column_wrapper const strings_col2( - {"", - "Another string that is null.", - "Very different... 
but null", - "All work and no play makes Jack a dull boy", - ""}, - {1, 0, 0, 1, 1}); // empty string is equivalent to null - - // Nulls with different values should be equal - using limits = std::numeric_limits; - cudf::test::fixed_width_column_wrapper const ints_col1( - {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); - cudf::test::fixed_width_column_wrapper const ints_col2( - {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); - // Nulls with different values should be equal - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); - - auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); - auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); auto const output1 = cudf::hashing::sha1(input1); auto const output2 = cudf::hashing::sha1(input2); @@ -138,6 +128,28 @@ TEST_F(SHA1HashTest, MultiValueNulls) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } +TEST_F(SHA1HashTest, ListsUnsupported) +{ + cudf::test::lists_column_wrapper strings_list_col( + {{""}, + {"", "Some inputs"}, + {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"}, + {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}}); + + auto const input = cudf::table_view({strings_list_col}); + + EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error); +} + +TEST_F(SHA1HashTest, StructsUnsupported) +{ + auto child_col = cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}; + auto struct_col = 
cudf::test::structs_column_wrapper{{child_col}}; + auto const input = cudf::table_view({struct_col}); + + EXPECT_THROW(cudf::hashing::sha1(input), cudf::logic_error); +} + template class SHA1HashTestTyped : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp index 8f47a87f330..5122d067a05 100644 --- a/cpp/tests/hashing/sha224_test.cpp +++ b/cpp/tests/hashing/sha224_test.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -50,8 +51,20 @@ TEST_F(SHA224HashTest, MultiValue) "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " "the hash function being tested. This string needed to be longer.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); - + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "Multi-byte characters: é¼³⅝"}); + + /* + These outputs can be generated with shell: + ``` + echo -n "input string" | sha224sum + ``` + Or with Python: + ``` + import hashlib + print(hashlib.sha224("input string".encode()).hexdigest()) + ``` + */ cudf::test::strings_column_wrapper const sha224_string_results1( {"d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f", "dfd5f9139a820075df69d7895015360b76d0360f3d4b77a845689614", @@ -60,7 +73,8 @@ TEST_F(SHA224HashTest, MultiValue) "f9ea303770699483f3e53263b32a3b3c876d1b8808ce84df4b8ca1c4", "2da6cd4bdaa0a99fd7236cd5507c52e12328e71192e83b32d2f110f9", "e7d0adb165079efc6c6343112f8b154aa3644ca6326f658aaa0f8e4a", - "309cc09eaa051beea7d0b0159daca9b4e8a533cb554e8f382c82709e"}); + "309cc09eaa051beea7d0b0159daca9b4e8a533cb554e8f382c82709e", + "6c728722ae8eafd058672bd92958199ff3a5a129e8c076752f7650f8"}); cudf::test::strings_column_wrapper const sha224_string_results2( {"d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f", @@ -70,16 +84,16 @@ TEST_F(SHA224HashTest, MultiValue) "d5854dfe3c32996345b103a6a16c7bdfa924723d620b150737e77370", 
"dd56deac5f2caa579a440ee814fc04a3afaf805d567087ac3317beb3", "14fb559f6309604bedd89183f585f3b433932b5b0e675848feebf8ec", - "d219eefea538491efcb69bc5bbef4177ad991d1b6e1367b5981b8c31"}); + "d219eefea538491efcb69bc5bbef4177ad991d1b6e1367b5981b8c31", + "5d5c2eace7ee553fe5cd25c8a8916e1eda81a5a5ca36a6338118a661"}); using limits = std::numeric_limits; cudf::test::fixed_width_column_wrapper const ints_col( - {0, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + {0, -1, 100, -100, limits::min(), limits::max(), 1, 2, 3}); - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0, 1, 1, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0, 1, 2, 255}); + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0, 1, 1, 1, 0}); + // Test string inputs against known outputs auto const string_input1 = cudf::table_view({strings_col}); auto const string_input2 = cudf::table_view({strings_col, strings_col}); auto const sha224_string_output1 = cudf::hashing::sha224(string_input1); @@ -89,47 +103,23 @@ TEST_F(SHA224HashTest, MultiValue) CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha224_string_output1->view(), sha224_string_results1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha224_string_output2->view(), sha224_string_results2); - auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); - auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); auto const sha224_output1 = cudf::hashing::sha224(input1); auto const sha224_output2 = cudf::hashing::sha224(input2); EXPECT_EQ(input1.num_rows(), sha224_output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha224_output1->view(), sha224_output2->view()); } -TEST_F(SHA224HashTest, MultiValueNulls) +TEST_F(SHA224HashTest, 
EmptyNullEquivalence) { - // Nulls with different values should be equal - cudf::test::strings_column_wrapper const strings_col1( - {"", - "Different but null!", - "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " - "the hash function being tested. This string needed to be longer.", - "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, - {1, 0, 0, 1, 0}); - cudf::test::strings_column_wrapper const strings_col2( - {"", - "Another string that is null.", - "Very different... but null", - "All work and no play makes Jack a dull boy", - ""}, - {1, 0, 0, 1, 1}); // empty string is equivalent to null - - // Nulls with different values should be equal - using limits = std::numeric_limits; - cudf::test::fixed_width_column_wrapper const ints_col1( - {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); - cudf::test::fixed_width_column_wrapper const ints_col2( - {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); - // Nulls with different values should be equal - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); - - auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); - auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); auto const output1 = cudf::hashing::sha224(input1); auto const output2 = cudf::hashing::sha224(input2); @@ -138,6 +128,28 @@ TEST_F(SHA224HashTest, MultiValueNulls) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), 
output2->view()); } +TEST_F(SHA224HashTest, ListsUnsupported) +{ + cudf::test::lists_column_wrapper strings_list_col( + {{""}, + {"", "Some inputs"}, + {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"}, + {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}}); + + auto const input = cudf::table_view({strings_list_col}); + + EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error); +} + +TEST_F(SHA224HashTest, StructsUnsupported) +{ + auto child_col = cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}; + auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; + auto const input = cudf::table_view({struct_col}); + + EXPECT_THROW(cudf::hashing::sha224(input), cudf::logic_error); +} + template class SHA224HashTestTyped : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index de1c4ccc24a..99f7f0ad82b 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -52,8 +53,20 @@ TEST_F(SHA256HashTest, MultiValue) "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " "the hash function being tested. 
This string needed to be longer.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); - + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "Multi-byte characters: é¼³⅝"}); + + /* + These outputs can be generated with shell: + ``` + echo -n "input string" | sha256sum + ``` + Or with Python: + ``` + import hashlib + print(hashlib.sha256("input string".encode()).hexdigest()) + ``` + */ cudf::test::strings_column_wrapper const sha256_string_results1( {"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", "5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91b46729d73a27fb57e9", @@ -62,7 +75,8 @@ TEST_F(SHA256HashTest, MultiValue) "10a7d211e692c6f71bb9f7524ba1437588c2797356f05fc585340f002fe7015e", "339d610dcb030bb4222bcf18c8ab82d911bfe7fb95b2cd9f6785fd4562b02401", "2ce9936a4a2234bf8a76c37d92e01d549d03949792242e7f8a1ad68575e4e4a8", - "255fdd4d80a72f67921eb36f3e1157ea3e995068cee80e430c034e0d3692f614"}); + "255fdd4d80a72f67921eb36f3e1157ea3e995068cee80e430c034e0d3692f614", + "9f9a89d448937f853c0067a3e2cb732d703eca971e3fb0f88fc73a730b7a85f4"}); cudf::test::strings_column_wrapper const sha256_string_results2( {"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", @@ -72,16 +86,16 @@ TEST_F(SHA256HashTest, MultiValue) "224e4dce71d5dbd5e79ba65aaced7ad9c4f45dda146278087b2b61d164f056f0", "91f3108d4e9c696fdb37ae49fdc6a2237f1d1f977b7216406cc8a6365355f43b", "490be480afe271685e9c1fdf46daac0b9bf7f25602e153ca92a0ddb0e4b662ef", - "4ddc45855d7ce3ab09efacff1fbafb33502f7dd468dc5a62826689c1c658dbce"}); + "4ddc45855d7ce3ab09efacff1fbafb33502f7dd468dc5a62826689c1c658dbce", + "bed32be19e1f432f5caec2b8bf914a968dfa5a5cba3868ea640ba9cbb0f9c9c8"}); using limits = std::numeric_limits; cudf::test::fixed_width_column_wrapper const ints_col( - {0, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + {0, -1, 100, -100, limits::min(), limits::max(), 1, 2, 3}); - // Different truth values should be equal - 
cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0, 1, 1, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0, 1, 2, 255}); + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0, 1, 1, 1, 0}); + // Test string inputs against known outputs auto const string_input1 = cudf::table_view({strings_col}); auto const string_input2 = cudf::table_view({strings_col, strings_col}); auto const sha256_string_output1 = cudf::hashing::sha256(string_input1); @@ -91,47 +105,22 @@ TEST_F(SHA256HashTest, MultiValue) CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha256_string_output1->view(), sha256_string_results1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha256_string_output2->view(), sha256_string_results2); - auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); - auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); auto const sha256_output1 = cudf::hashing::sha256(input1); auto const sha256_output2 = cudf::hashing::sha256(input2); EXPECT_EQ(input1.num_rows(), sha256_output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha256_output1->view(), sha256_output2->view()); } - -TEST_F(SHA256HashTest, MultiValueNulls) +TEST_F(SHA256HashTest, EmptyNullEquivalence) { - // Nulls with different values should be equal - cudf::test::strings_column_wrapper const strings_col1( - {"", - "Different but null!", - "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " - "the hash function being tested. This string needed to be longer.", - "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, - {1, 0, 0, 1, 0}); - cudf::test::strings_column_wrapper const strings_col2( - {"", - "Another string that is null.", - "Very different... 
but null", - "All work and no play makes Jack a dull boy", - ""}, - {1, 0, 0, 1, 1}); // empty string is equivalent to null + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); - // Nulls with different values should be equal - using limits = std::numeric_limits; - cudf::test::fixed_width_column_wrapper const ints_col1( - {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); - cudf::test::fixed_width_column_wrapper const ints_col2( - {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); - - // Nulls with different values should be equal - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); - - auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); - auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); auto const output1 = cudf::hashing::sha256(input1); auto const output2 = cudf::hashing::sha256(input2); @@ -140,6 +129,28 @@ TEST_F(SHA256HashTest, MultiValueNulls) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } +TEST_F(SHA256HashTest, ListsUnsupported) +{ + cudf::test::lists_column_wrapper strings_list_col( + {{""}, + {"", "Some inputs"}, + {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"}, + {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}}); + + auto const input = cudf::table_view({strings_list_col}); + + EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error); +} + +TEST_F(SHA256HashTest, StructsUnsupported) +{ + auto child_col = cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}; + auto struct_col = 
cudf::test::structs_column_wrapper{{child_col}}; + auto const input = cudf::table_view({struct_col}); + + EXPECT_THROW(cudf::hashing::sha256(input), cudf::logic_error); +} + template class SHA256HashTestTyped : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp index e636ac5dd4e..47d0aee4020 100644 --- a/cpp/tests/hashing/sha384_test.cpp +++ b/cpp/tests/hashing/sha384_test.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -50,8 +51,20 @@ TEST_F(SHA384HashTest, MultiValue) "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " "the hash function being tested. This string needed to be longer.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); - + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "Multi-byte characters: é¼³⅝"}); + + /* + These outputs can be generated with shell: + ``` + echo -n "input string" | sha384sum + ``` + Or with Python: + ``` + import hashlib + print(hashlib.sha384("input string".encode()).hexdigest()) + ``` + */ cudf::test::strings_column_wrapper const sha384_string_results1( {"38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b" "95b", @@ -68,7 +81,9 @@ TEST_F(SHA384HashTest, MultiValue) "281826f23bebb3f835d2f15edcb0cdb3078ae2d7dc516f3a366af172dff4db6dd5833bc1e5ee411d52c598773e939" "7b6", "3a9d1a870a5f6a4c04df1daf1808163d33852897ebc757a5b028a1214fbc758485a392159b11bc360cfadc79f9512" - "822"}); + "822", + "f6d9687e48ef1f69f7523c2a06c338e2b2e6cb251823d46bfa7f9ba65a071693919726b85f6dd77726a73c57a0e3a" + "4a5"}); cudf::test::strings_column_wrapper const sha384_string_results2( {"38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b" @@ -86,16 +101,17 @@ TEST_F(SHA384HashTest, MultiValue) "781a33adfdcdcbb514318728c074fbb59d44002995825642e0c9bfef8a2ccf3fb637b39ff3dd265df8cd93c86e945" "ce9", 
"d2efb1591c4503f23c34ddb4da6bb1017d3d4d7c9f23ee6aa52e71c98d41060bc35eb22f41b6130d5c42a6e717fb3" - "edf"}); + "edf", + "46e493cdd8b1e43ce2e90b6934a39e724949a1f8ea6709e09dbc68172089de864873ee7e10decdff98b44fbce2ba8" + "146"}); using limits = std::numeric_limits; cudf::test::fixed_width_column_wrapper const ints_col( - {0, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + {0, -1, 100, -100, limits::min(), limits::max(), 1, 2, 3}); - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0, 1, 1, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0, 1, 2, 255}); + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0, 1, 1, 1, 0}); + // Test string inputs against known outputs auto const string_input1 = cudf::table_view({strings_col}); auto const string_input2 = cudf::table_view({strings_col, strings_col}); auto const sha384_string_output1 = cudf::hashing::sha384(string_input1); @@ -105,47 +121,23 @@ TEST_F(SHA384HashTest, MultiValue) CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_string_output1->view(), sha384_string_results1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_string_output2->view(), sha384_string_results2); - auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); - auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); auto const sha384_output1 = cudf::hashing::sha384(input1); auto const sha384_output2 = cudf::hashing::sha384(input2); EXPECT_EQ(input1.num_rows(), sha384_output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha384_output1->view(), sha384_output2->view()); } -TEST_F(SHA384HashTest, MultiValueNulls) +TEST_F(SHA384HashTest, EmptyNullEquivalence) { - // Nulls with different values should be equal - 
cudf::test::strings_column_wrapper const strings_col1( - {"", - "Different but null!", - "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " - "the hash function being tested. This string needed to be longer.", - "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, - {1, 0, 0, 1, 0}); - cudf::test::strings_column_wrapper const strings_col2( - {"", - "Another string that is null.", - "Very different... but null", - "All work and no play makes Jack a dull boy", - ""}, - {1, 0, 0, 1, 1}); // empty string is equivalent to null - - // Nulls with different values should be equal - using limits = std::numeric_limits; - cudf::test::fixed_width_column_wrapper const ints_col1( - {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); - cudf::test::fixed_width_column_wrapper const ints_col2( - {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); - // Nulls with different values should be equal - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); - - auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); - auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); auto const output1 = cudf::hashing::sha384(input1); auto const output2 = cudf::hashing::sha384(input2); @@ -154,6 +146,28 @@ TEST_F(SHA384HashTest, MultiValueNulls) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } +TEST_F(SHA384HashTest, ListsUnsupported) +{ + 
cudf::test::lists_column_wrapper strings_list_col( + {{""}, + {"", "Some inputs"}, + {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"}, + {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}}); + + auto const input = cudf::table_view({strings_list_col}); + + EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error); +} + +TEST_F(SHA384HashTest, StructsUnsupported) +{ + auto child_col = cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}; + auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; + auto const input = cudf::table_view({struct_col}); + + EXPECT_THROW(cudf::hashing::sha384(input), cudf::logic_error); +} + template class SHA384HashTestTyped : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp index 301aa614dfa..3e74be07844 100644 --- a/cpp/tests/hashing/sha512_test.cpp +++ b/cpp/tests/hashing/sha512_test.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -50,8 +51,20 @@ TEST_F(SHA512HashTest, MultiValue) "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " "the hash function being tested. 
This string needed to be longer.", "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}); - + "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "Multi-byte characters: é¼³⅝"}); + + /* + These outputs can be generated with shell: + ``` + echo -n "input string" | sha512sum + ``` + Or with Python: + ``` + import hashlib + print(hashlib.sha512("input string".encode()).hexdigest()) + ``` + */ cudf::test::strings_column_wrapper const sha512_string_results1( {"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877ee" "c2f63b931bd47417a81a538327af927da3e", @@ -68,7 +81,9 @@ TEST_F(SHA512HashTest, MultiValue) "bae9eb4b5c05a4c5f85750b70b2f0ce78e387f992f0927a017eb40bd180a13004f6252a6bbf9816f195fb7d86668c" "393dc0985aaf7168f48e8b905f3b9b02df2", "05a4ca1c523dcab32edb7d8793934a4cdf41a9062b229d711f5326e297bda83fa965118b9d7636172b43688e8e149" - "008b3f967f1a969962b7e959af894a8a315"}); + "008b3f967f1a969962b7e959af894a8a315", + "1a15d73f16820b25f2af1c824a00a6ab18fe3eb91adaae31f441f4eca7ca11baf56d2f56e4f600781bf3637a49a4f" + "bdbd5d7e0d8e894c51144e28eed59b3721a"}); cudf::test::strings_column_wrapper const sha512_string_results2( {"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877ee" @@ -86,16 +101,17 @@ TEST_F(SHA512HashTest, MultiValue) "aae2e742074847889a029a8d3170f9e17177d48ec0b9dabe572aa68dd3001af0c512f164ba84aa75b13950948170a" "0912912d16c98d2f05cb633c0d5b6a9105e", "77f46e99a7a51ac04b4380ebca70c0782381629f711169a3b9dad3fc9aa6221a9c0cdaa9b9ea4329773e773e2987c" - "d1eebe0661386909684927d67819a2cf736"}); + "d1eebe0661386909684927d67819a2cf736", + "023f99dea2a46cb4f0672645c4123697a57e2911c1889bcb5339383f81d78e0efbcca11568621b732e7ac13bef576" + "a79f0dfb0a1db2a2ede8a14e860e3a9f1bc"}); using limits = std::numeric_limits; cudf::test::fixed_width_column_wrapper const ints_col( - {0, 100, -100, limits::min(), limits::max(), 1, 2, 3}); + {0, -1, 100, -100, 
limits::min(), limits::max(), 1, 2, 3}); - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0, 1, 1, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0, 1, 2, 255}); + cudf::test::fixed_width_column_wrapper const bools_col({0, 1, 1, 1, 0, 1, 1, 1, 0}); + // Test string inputs against known outputs auto const string_input1 = cudf::table_view({strings_col}); auto const string_input2 = cudf::table_view({strings_col, strings_col}); auto const sha512_string_output1 = cudf::hashing::sha512(string_input1); @@ -105,47 +121,23 @@ TEST_F(SHA512HashTest, MultiValue) CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_string_output1->view(), sha512_string_results1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_string_output2->view(), sha512_string_results2); - auto const input1 = cudf::table_view({strings_col, ints_col, bools_col1}); - auto const input2 = cudf::table_view({strings_col, ints_col, bools_col2}); + // Test non-string inputs for self-consistency + auto const input1 = cudf::table_view({strings_col, ints_col, bools_col}); + auto const input2 = cudf::table_view({strings_col, ints_col, bools_col}); auto const sha512_output1 = cudf::hashing::sha512(input1); auto const sha512_output2 = cudf::hashing::sha512(input2); EXPECT_EQ(input1.num_rows(), sha512_output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(sha512_output1->view(), sha512_output2->view()); } -TEST_F(SHA512HashTest, MultiValueNulls) +TEST_F(SHA512HashTest, EmptyNullEquivalence) { - // Nulls with different values should be equal - cudf::test::strings_column_wrapper const strings_col1( - {"", - "Different but null!", - "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in " - "the hash function being tested. 
This string needed to be longer.", - "All work and no play makes Jack a dull boy", - "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"}, - {1, 0, 0, 1, 0}); - cudf::test::strings_column_wrapper const strings_col2( - {"", - "Another string that is null.", - "Very different... but null", - "All work and no play makes Jack a dull boy", - ""}, - {1, 0, 0, 1, 1}); // empty string is equivalent to null - - // Nulls with different values should be equal - using limits = std::numeric_limits; - cudf::test::fixed_width_column_wrapper const ints_col1( - {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); - cudf::test::fixed_width_column_wrapper const ints_col2( - {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1}); + // Test that empty strings hash the same as nulls + cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0}); + cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1}); - // Nulls with different values should be equal - // Different truth values should be equal - cudf::test::fixed_width_column_wrapper const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1}); - cudf::test::fixed_width_column_wrapper const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1}); - - auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1}); - auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2}); + auto const input1 = cudf::table_view({strings_col1}); + auto const input2 = cudf::table_view({strings_col2}); auto const output1 = cudf::hashing::sha512(input1); auto const output2 = cudf::hashing::sha512(input2); @@ -154,6 +146,28 @@ TEST_F(SHA512HashTest, MultiValueNulls) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } +TEST_F(SHA512HashTest, ListsUnsupported) +{ + cudf::test::lists_column_wrapper strings_list_col( + {{""}, + {"", "Some inputs"}, + {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"}, + {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}}); + + auto 
const input = cudf::table_view({strings_list_col}); + + EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error); +} + +TEST_F(SHA512HashTest, StructsUnsupported) +{ + auto child_col = cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}; + auto struct_col = cudf::test::structs_column_wrapper{{child_col}}; + auto const input = cudf::table_view({struct_col}); + + EXPECT_THROW(cudf::hashing::sha512(input), cudf::logic_error); +} + template class SHA512HashTestTyped : public cudf::test::BaseFixture {}; From 65358ff6f297193e90ccd8f849da5cd8d482e84a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 19 Jan 2024 17:37:43 -0600 Subject: [PATCH 14/16] Refactor remaining tests relying on unsanitized nulls. --- cpp/tests/hashing/sha1_test.cpp | 21 ++++++++------------- cpp/tests/hashing/sha224_test.cpp | 21 ++++++++------------- cpp/tests/hashing/sha256_test.cpp | 21 ++++++++------------- cpp/tests/hashing/sha384_test.cpp | 21 ++++++++------------- cpp/tests/hashing/sha512_test.cpp | 21 ++++++++------------- 5 files changed, 40 insertions(+), 65 deletions(-) diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp index 128794cd826..240a0279644 100644 --- a/cpp/tests/hashing/sha1_test.cpp +++ b/cpp/tests/hashing/sha1_test.cpp @@ -155,7 +155,7 @@ class SHA1HashTestTyped : public cudf::test::BaseFixture {}; TYPED_TEST_CASE(SHA1HashTestTyped, cudf::test::NumericTypes); -TYPED_TEST(SHA1HashTestTyped, Equality) +TYPED_TEST(SHA1HashTestTyped, NoNulls) { cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); auto const input = cudf::table_view({col}); @@ -168,21 +168,16 @@ TYPED_TEST(SHA1HashTestTyped, Equality) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } -TYPED_TEST(SHA1HashTestTyped, EqualityNulls) +TYPED_TEST(SHA1HashTestTyped, WithNulls) { - using T = TypeParam; - - // Nulls with different values should be equal - cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - 
cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - - auto const input1 = cudf::table_view({col1}); - auto const input2 = cudf::table_view({col2}); + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); - auto const output1 = cudf::hashing::sha1(input1); - auto const output2 = cudf::hashing::sha1(input2); + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha1(input); + auto const output2 = cudf::hashing::sha1(input); - EXPECT_EQ(input1.num_rows(), output1->size()); + EXPECT_EQ(input.num_rows(), output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp index 5122d067a05..230cbc7dc26 100644 --- a/cpp/tests/hashing/sha224_test.cpp +++ b/cpp/tests/hashing/sha224_test.cpp @@ -155,7 +155,7 @@ class SHA224HashTestTyped : public cudf::test::BaseFixture {}; TYPED_TEST_CASE(SHA224HashTestTyped, cudf::test::NumericTypes); -TYPED_TEST(SHA224HashTestTyped, Equality) +TYPED_TEST(SHA224HashTestTyped, NoNulls) { cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); auto const input = cudf::table_view({col}); @@ -168,21 +168,16 @@ TYPED_TEST(SHA224HashTestTyped, Equality) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } -TYPED_TEST(SHA224HashTestTyped, EqualityNulls) +TYPED_TEST(SHA224HashTestTyped, WithNulls) { - using T = TypeParam; - - // Nulls with different values should be equal - cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - - auto const input1 = cudf::table_view({col1}); - auto const input2 = cudf::table_view({col2}); + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); - auto 
const output1 = cudf::hashing::sha224(input1); - auto const output2 = cudf::hashing::sha224(input2); + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha224(input); + auto const output2 = cudf::hashing::sha224(input); - EXPECT_EQ(input1.num_rows(), output1->size()); + EXPECT_EQ(input.num_rows(), output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index 99f7f0ad82b..4921f75495f 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -156,7 +156,7 @@ class SHA256HashTestTyped : public cudf::test::BaseFixture {}; TYPED_TEST_CASE(SHA256HashTestTyped, cudf::test::NumericTypes); -TYPED_TEST(SHA256HashTestTyped, Equality) +TYPED_TEST(SHA256HashTestTyped, NoNulls) { cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); auto const input = cudf::table_view({col}); @@ -169,21 +169,16 @@ TYPED_TEST(SHA256HashTestTyped, Equality) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } -TYPED_TEST(SHA256HashTestTyped, EqualityNulls) +TYPED_TEST(SHA256HashTestTyped, WithNulls) { - using T = TypeParam; - - // Nulls with different values should be equal - cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - - auto const input1 = cudf::table_view({col1}); - auto const input2 = cudf::table_view({col2}); + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); - auto const output1 = cudf::hashing::sha256(input1); - auto const output2 = cudf::hashing::sha256(input2); + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha256(input); + auto const output2 = cudf::hashing::sha256(input); - EXPECT_EQ(input1.num_rows(), output1->size()); + 
EXPECT_EQ(input.num_rows(), output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp index 47d0aee4020..c8fd528ad2f 100644 --- a/cpp/tests/hashing/sha384_test.cpp +++ b/cpp/tests/hashing/sha384_test.cpp @@ -173,7 +173,7 @@ class SHA384HashTestTyped : public cudf::test::BaseFixture {}; TYPED_TEST_CASE(SHA384HashTestTyped, cudf::test::NumericTypes); -TYPED_TEST(SHA384HashTestTyped, Equality) +TYPED_TEST(SHA384HashTestTyped, NoNulls) { cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); auto const input = cudf::table_view({col}); @@ -186,21 +186,16 @@ TYPED_TEST(SHA384HashTestTyped, Equality) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } -TYPED_TEST(SHA384HashTestTyped, EqualityNulls) +TYPED_TEST(SHA384HashTestTyped, WithNulls) { - using T = TypeParam; - - // Nulls with different values should be equal - cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - - auto const input1 = cudf::table_view({col1}); - auto const input2 = cudf::table_view({col2}); + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); - auto const output1 = cudf::hashing::sha384(input1); - auto const output2 = cudf::hashing::sha384(input2); + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha384(input); + auto const output2 = cudf::hashing::sha384(input); - EXPECT_EQ(input1.num_rows(), output1->size()); + EXPECT_EQ(input.num_rows(), output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp index 3e74be07844..f7b43328331 100644 --- a/cpp/tests/hashing/sha512_test.cpp +++ b/cpp/tests/hashing/sha512_test.cpp @@ 
-173,7 +173,7 @@ class SHA512HashTestTyped : public cudf::test::BaseFixture {}; TYPED_TEST_CASE(SHA512HashTestTyped, cudf::test::NumericTypes); -TYPED_TEST(SHA512HashTestTyped, Equality) +TYPED_TEST(SHA512HashTestTyped, NoNulls) { cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}); auto const input = cudf::table_view({col}); @@ -186,21 +186,16 @@ TYPED_TEST(SHA512HashTestTyped, Equality) CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } -TYPED_TEST(SHA512HashTestTyped, EqualityNulls) +TYPED_TEST(SHA512HashTestTyped, WithNulls) { - using T = TypeParam; - - // Nulls with different values should be equal - cudf::test::fixed_width_column_wrapper const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - cudf::test::fixed_width_column_wrapper const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); - - auto const input1 = cudf::table_view({col1}); - auto const input2 = cudf::table_view({col2}); + cudf::test::fixed_width_column_wrapper const col({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1}); + auto const input = cudf::table_view({col}); - auto const output1 = cudf::hashing::sha512(input1); - auto const output2 = cudf::hashing::sha512(input2); + // Hash of same input should be equal + auto const output1 = cudf::hashing::sha512(input); + auto const output2 = cudf::hashing::sha512(input); - EXPECT_EQ(input1.num_rows(), output1->size()); + EXPECT_EQ(input.num_rows(), output1->size()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view()); } From 7183be06d4d7b508ed0580c1eac0605aad240ad8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 19 Jan 2024 17:39:42 -0600 Subject: [PATCH 15/16] Remove extra header. 
--- cpp/tests/hashing/md5_test.cpp | 1 - cpp/tests/hashing/sha1_test.cpp | 1 - cpp/tests/hashing/sha224_test.cpp | 1 - cpp/tests/hashing/sha256_test.cpp | 1 - cpp/tests/hashing/sha384_test.cpp | 1 - cpp/tests/hashing/sha512_test.cpp | 1 - 6 files changed, 6 deletions(-) diff --git a/cpp/tests/hashing/md5_test.cpp b/cpp/tests/hashing/md5_test.cpp index f35e11699c0..9361c4e748c 100644 --- a/cpp/tests/hashing/md5_test.cpp +++ b/cpp/tests/hashing/md5_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp index 240a0279644..31145e4c3c4 100644 --- a/cpp/tests/hashing/sha1_test.cpp +++ b/cpp/tests/hashing/sha1_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp index 230cbc7dc26..9aa1ee0fac2 100644 --- a/cpp/tests/hashing/sha224_test.cpp +++ b/cpp/tests/hashing/sha224_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index 4921f75495f..4fed8c55fc2 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp index c8fd528ad2f..49b9b5ef3a5 100644 --- a/cpp/tests/hashing/sha384_test.cpp +++ b/cpp/tests/hashing/sha384_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp index f7b43328331..df0315099fb 100644 --- a/cpp/tests/hashing/sha512_test.cpp +++ b/cpp/tests/hashing/sha512_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include From 25b4db8e5481ca29d70b299b76024c89f5c4d6ec Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 19 Jan 2024 17:52:08 -0600 Subject: [PATCH 16/16] Use SFINAE instead of if constexpr. --- cpp/src/hash/sha_hash.cuh | 57 ++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh index ebefd8108f0..0a22ee34918 100644 --- a/cpp/src/hash/sha_hash.cuh +++ b/cpp/src/hash/sha_hash.cuh @@ -237,33 +237,46 @@ struct HasherDispatcher { { } - template + template () and not is_floating_point() and + not is_chrono())> __device__ inline void operator()(size_type row_index) { - if constexpr (is_fixed_width() && !is_chrono()) { - Element const& key = input_col.element(row_index); - if constexpr (is_floating_point()) { - if (isnan(key)) { - Element nan = std::numeric_limits::quiet_NaN(); - hasher->process_fixed_width(nan); - } else if (key == Element{0.0}) { - hasher->process_fixed_width(Element{0.0}); - } else { - hasher->process_fixed_width(key); - } - } else { - hasher->process_fixed_width(key); - } - } else if constexpr (std::is_same_v) { - string_view key = input_col.element(row_index); - uint8_t const* data = reinterpret_cast(key.data()); - uint32_t const len = static_cast(key.size_bytes()); - hasher->process(data, len); + Element const& key = input_col.element(row_index); + hasher->process_fixed_width(key); + } + + template ())> + __device__ inline void operator()(size_type row_index) + { + Element const& key = input_col.element(row_index); + if (isnan(key)) { + Element nan = std::numeric_limits::quiet_NaN(); + hasher->process_fixed_width(nan); + } else if (key == Element{0.0}) { + hasher->process_fixed_width(Element{0.0}); } else { - (void)row_index; - cudf_assert(false && "Unsupported type for hash function."); + hasher->process_fixed_width(key); } } + + template )> + __device__ inline void operator()(size_type row_index) + { + string_view key = 
input_col.element(row_index); + uint8_t const* data = reinterpret_cast(key.data()); + uint32_t const len = static_cast(key.size_bytes()); + hasher->process(data, len); + } + + template () or is_chrono()) and + not std::is_same_v)> + __device__ inline void operator()(size_type row_index) + { + (void)row_index; + cudf_assert(false && "Unsupported type for hash function."); + } }; /**