diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 806b6b790f3..3a1bb91b56c 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef DEVICE_OPERATORS_CUH -#define DEVICE_OPERATORS_CUH +#pragma once /** * @brief definition of the device operators @@ -24,6 +23,7 @@ #include #include +#include #include #include #include @@ -84,16 +84,6 @@ struct DeviceCount { } }; -/** - * @brief string value for sentinel which is used in min, max reduction - * operators - * This sentinel string value is the highest possible valid UTF-8 encoded - * character. This serves as identity value for maximum operator on string - * values. Also, this char pointer serves as valid device pointer of identity - * value for minimum operator on string values. - */ -static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; - /* @brief binary `min` operator */ struct DeviceMin { template @@ -123,13 +113,7 @@ struct DeviceMin { typename std::enable_if_t::value>* = nullptr> CUDA_HOST_DEVICE_CALLABLE static constexpr T identity() { - const char* psentinel{nullptr}; -#if defined(__CUDA_ARCH__) - psentinel = &max_string_sentinel[0]; -#else - CUDA_TRY(cudaGetSymbolAddress((void**)&psentinel, max_string_sentinel)); -#endif - return T(psentinel, 4); + return string_view::max(); } template ()>* = nullptr> @@ -167,13 +151,7 @@ struct DeviceMax { typename std::enable_if_t::value>* = nullptr> CUDA_HOST_DEVICE_CALLABLE static constexpr T identity() { - const char* psentinel{nullptr}; -#if defined(__CUDA_ARCH__) - psentinel = &max_string_sentinel[0]; -#else - CUDA_TRY(cudaGetSymbolAddress((void**)&psentinel, max_string_sentinel)); -#endif - return T(psentinel, 0); + return string_view::min(); } template ()>* = nullptr> @@ -242,5 +220,3 @@ struct DeviceLeadLag { }; } // namespace cudf - -#endif diff --git a/cpp/include/cudf/detail/utilities/int_fastdiv.h b/cpp/include/cudf/detail/utilities/int_fastdiv.h index 845a72c9b99..292b502cc78 100644 --- a/cpp/include/cudf/detail/utilities/int_fastdiv.h +++ b/cpp/include/cudf/detail/utilities/int_fastdiv.h @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef _INT_FASTDIV_KJGIUHFG -#define _INT_FASTDIV_KJGIUHFG +#pragma once class int_fastdiv { public: @@ -172,5 +171,3 @@ __host__ __device__ __forceinline__ int operator%(const unsigned char n, const i { return ((int)n) % divisor; } - -#endif diff --git a/cpp/include/cudf/strings/string.cuh b/cpp/include/cudf/strings/string.cuh index ba85c3759a7..82da5ad8f10 100644 --- a/cpp/include/cudf/strings/string.cuh +++ b/cpp/include/cudf/strings/string.cuh @@ -41,7 +41,7 @@ namespace string { * @param d_str String to check. * @return true if string has valid integer characters */ -__device__ bool is_integer(string_view const& d_str) +inline __device__ bool is_integer(string_view const& d_str) { if (d_str.empty()) return false; auto begin = d_str.begin(); @@ -71,7 +71,7 @@ __device__ bool is_integer(string_view const& d_str) * @param d_str String to check. * @return true if string has valid float characters */ -__device__ bool is_float(string_view const& d_str) +inline __device__ bool is_float(string_view const& d_str) { if (d_str.empty()) return false; // strings allowed by the converter @@ -105,6 +105,7 @@ __device__ bool is_float(string_view const& d_str) } return result; } + /** @} */ // end of group } // namespace string } // namespace strings diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 55d77db42d0..4bcb46e4655 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -43,9 +44,50 @@ __device__ inline size_type characters_in_string(const char* str, size_type byte return thrust::count_if( thrust::seq, ptr, ptr + bytes, [](uint8_t chr) { return is_begin_utf8_char(chr); }); } + +/** + * @brief string value for sentinel which is used in min, max reduction + * operators + * + * This sentinel string value is the highest possible valid UTF-8 encoded + * character. This serves as identity value for maximum operator on string + * values. Also, this char pointer serves as valid device pointer of identity + * value for minimum operator on string values. + */ +static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"}; } // namespace detail } // namespace strings +/** + * @brief Return minimum value associated with the string type + * + * This function is needed to be host callable because it is called by a host + * callable function DeviceMax::identity() + * + * @return An empty string + */ +CUDA_HOST_DEVICE_CALLABLE string_view string_view::min() { return string_view(); } + +/** + * @brief Return maximum value associated with the string type + * + * This function is needed to be host callable because it is called by a host + * callable function DeviceMin::identity() + * + * @return A string value which represents the highest possible valid UTF-8 encoded + * character. + */ +CUDA_HOST_DEVICE_CALLABLE string_view string_view::max() +{ + const char* psentinel{nullptr}; +#if defined(__CUDA_ARCH__) + psentinel = &cudf::strings::detail::max_string_sentinel[0]; +#else + CUDA_TRY(cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel)); +#endif + return string_view(psentinel, 4); +} + __device__ inline size_type string_view::length() const { if (_length == UNKNOWN_STRING_LENGTH) diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 9c42c216791..667a25c7641 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -290,6 +290,27 @@ class string_view { */ CUDA_DEVICE_CALLABLE string_view substr(size_type start, size_type length) const; + /** + * @brief Return minimum value associated with the string type + * + * This function is needed to be host callable because it is called by a host + * callable function DeviceMax::identity() + * + * @return An empty string + */ + CUDA_HOST_DEVICE_CALLABLE static string_view min(); + + /** + * @brief Return maximum value associated with the string type + * + * This function is needed to be host callable because it is called by a host + * callable function DeviceMin::identity() + * + * @return A string value which represents the highest possible valid UTF-8 encoded + * character. + */ + CUDA_HOST_DEVICE_CALLABLE static string_view max(); + /** * @brief Default constructor represents an empty string. */ diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 15fef5c8b82..56a55bd0a4d 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -51,7 +51,7 @@ __global__ void __launch_bounds__(init_threads_per_block) uint32_t t = threadIdx.x; statistics_group *group = &group_g[threadIdx.y]; if (chunk_id < num_rowgroups and t == 0) { - uint32_t num_rows = cols[col_id].num_rows; + uint32_t num_rows = cols[col_id].leaf_column->size(); group->col = &cols[col_id]; group->start_row = chunk_id * row_index_stride; group->num_rows = min(num_rows - min(chunk_id * row_index_stride, num_rows), row_index_stride); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 1e634849998..1c99c99369b 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -21,8 +21,11 @@ #include "writer_impl.hpp" +#include + #include #include +#include #include #include @@ -39,6 +42,7 @@ namespace detail { namespace orc { using namespace cudf::io::orc; using namespace cudf::io; +using cudf::io::orc::gpu::nvstrdesc_s; struct row_group_index_info { int32_t pos = -1; // Position @@ -775,7 +779,9 @@ std::vector writer::impl::gather_stripes( } std::vector> writer::impl::gather_statistic_blobs( - host_span columns, host_span stripe_bounds) + const table_device_view &table, + host_span columns, + host_span stripe_bounds) { auto const num_rowgroups = stripes_size(stripe_bounds); size_t num_stat_blobs = (1 + stripe_bounds.size()) * columns.size(); @@ -833,6 +839,10 @@ std::vector> writer::impl::gather_statistic_blobs( } stat_desc.host_to_device(stream); stat_merge.host_to_device(stream); + + rmm::device_uvector leaf_column_views = + create_leaf_column_device_views(stat_desc, table, stream); + gpu::orc_init_statistics_groups(stat_groups.data(), stat_desc.device_ptr(), columns.size(), @@ -1106,10 +1116,11 @@ void writer::impl::write(table_view const &table) auto stripes = gather_stripes(num_rows, num_index_streams, stripe_bounds, &enc_data.streams, &strm_descs); + auto device_columns = table_device_view::create(table); // Gather column statistics std::vector> column_stats; if (enable_statistics_ && num_columns > 0 && num_rows > 0) { - column_stats = gather_statistic_blobs(orc_columns, stripe_bounds); + column_stats = gather_statistic_blobs(*device_columns, orc_columns, stripe_bounds); } // Allocate intermediate output stream buffer diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 6f651579042..f0ec3a70cec 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -271,13 +272,16 @@ class writer::impl { * @brief Returns per-stripe and per-file column statistics encoded * in ORC protobuf format. * + * @param table Table information to be written * @param columns List of columns * @param stripe_bounds List of stripe boundaries * * @return The statistic blobs */ std::vector> gather_statistic_blobs( - host_span columns, host_span stripe_bounds); + const table_device_view& table, + host_span columns, + host_span stripe_bounds); /** * @brief Writes the specified column's row index stream. diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index d7d47c07354..8b99248e2fd 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1961,41 +1961,6 @@ void InitPageFragments(PageFragment *frag, frag, col_desc, num_fragments, num_columns, fragment_size, num_rows); } -/** - * @copydoc void init_column_device_views(EncColumnDesc *col_desc, - * column_device_view *leaf_column_views, - * const table_device_view &parent_table_device_view, - * rmm::cuda_stream_view stream) - */ -void init_column_device_views(EncColumnDesc *col_desc, - column_device_view *leaf_column_views, - const table_device_view &parent_column_table_device_view, - rmm::cuda_stream_view stream) -{ - cudf::detail::device_single_thread( - [col_desc, - parent_col_view = parent_column_table_device_view, - leaf_column_views] __device__() mutable { - for (size_type i = 0; i < parent_col_view.num_columns(); ++i) { - column_device_view col = parent_col_view.column(i); - if (col.type().id() == type_id::LIST) { - col_desc[i].parent_column = parent_col_view.begin() + i; - } else { - col_desc[i].parent_column = nullptr; - } - // traverse till leaf column - while (col.type().id() == type_id::LIST) { - col = col.child(lists_column_view::child_column_index); - } - // Store leaf_column to device storage - column_device_view *leaf_col_ptr = leaf_column_views + i; - *leaf_col_ptr = col; - col_desc[i].leaf_column = leaf_col_ptr; - } - }, - stream); -} - /** * @brief Launches kernel for initializing fragment statistics groups * diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 4271e3f4bce..43d144ec980 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -229,9 +229,6 @@ struct EncColumnDesc : stats_column_desc { size_type const *level_offsets; //!< Offset array for per-row pre-calculated rep/def level values uint8_t const *rep_values; //!< Pre-calculated repetition level values uint8_t const *def_values; //!< Pre-calculated definition level values - - column_device_view *leaf_column; //!< Pointer to leaf column - column_device_view *parent_column; //!< Pointer to parent column. Is nullptr if not list type. }; constexpr int max_page_fragment_size = 5000; //!< Max number of rows in a page fragment @@ -448,19 +445,6 @@ void InitPageFragments(PageFragment *frag, uint32_t num_rows, rmm::cuda_stream_view stream); -/** - * @brief Set column_device_view pointers in column description array - * - * @param[out] col_desc Column description array [column_id] - * @param[out] leaf_column_views Device array to store leaf columns - * @param[in] parent_table_device_view Table device view containing parent columns - * @param[in] stream CUDA stream to use, default 0 - */ -void init_column_device_views(EncColumnDesc *col_desc, - column_device_view *leaf_column_views, - const table_device_view &parent_table_device_view, - rmm::cuda_stream_view stream); - /** * @brief Launches kernel for initializing fragment statistics groups * diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index de1707ea9af..a645ca0fd91 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -22,6 +22,7 @@ #include "writer_impl.hpp" #include +#include #include #include @@ -495,17 +496,6 @@ class parquet_column_view { uint8_t _decimal_precision = 0; }; -rmm::device_uvector writer::impl::create_leaf_column_device_views( - hostdevice_vector &col_desc, - const table_device_view &parent_table_device_view) -{ - rmm::device_uvector leaf_column_views(parent_table_device_view.num_columns(), - stream); - gpu::init_column_device_views( - col_desc.device_ptr(), leaf_column_views.data(), parent_table_device_view, stream); - return leaf_column_views; -} - void writer::impl::init_page_fragments(hostdevice_vector &frag, hostdevice_vector &col_desc, uint32_t num_columns, @@ -919,7 +909,8 @@ void writer::impl::write(table_view const &table) if (fragments.size() != 0) { // Move column info to device col_desc.host_to_device(stream); - leaf_column_views = create_leaf_column_device_views(col_desc, *parent_column_table_device_view); + leaf_column_views = create_leaf_column_device_views( + col_desc, *parent_column_table_device_view, stream); init_page_fragments(fragments, col_desc, num_columns, num_fragments, num_rows, fragment_size); } diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index b1c0577821f..f5e0f7408c5 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -136,18 +136,6 @@ class writer::impl { uint32_t num_rows, uint32_t fragment_size); - /** - * @brief Create column_device_view pointers from leaf columns - * - * @param col_desc Column description array - * @param parent_table_device_view Table device view containing parent columns - * - * @return Device array containing leaf column device views - */ - rmm::device_uvector create_leaf_column_device_views( - hostdevice_vector& col_desc, - const table_device_view& parent_table_device_view); - /** * @brief Gather per-fragment statistics * diff --git a/cpp/src/io/statistics/column_stats.cu b/cpp/src/io/statistics/column_stats.cu index 5d9d41412a4..128bd905259 100644 --- a/cpp/src/io/statistics/column_stats.cu +++ b/cpp/src/io/statistics/column_stats.cu @@ -16,7 +16,7 @@ #include "column_stats.h" -#include +#include #include @@ -62,84 +62,6 @@ struct IgnoreNaNSum { } }; -/** - * Warp-wide Min reduction for string types - */ -inline __device__ string_stats WarpReduceMinString(const char *smin, uint32_t lmin) -{ - uint32_t len = shuffle_xor(lmin, 1); - const char *ptr = - reinterpret_cast(shuffle_xor(reinterpret_cast(smin), 1)); - if (!smin || (ptr && nvstr_is_lesser(ptr, len, smin, lmin))) { - smin = ptr; - lmin = len; - } - len = shuffle_xor(lmin, 2); - ptr = reinterpret_cast(shuffle_xor(reinterpret_cast(smin), 2)); - if (!smin || (ptr && nvstr_is_lesser(ptr, len, smin, lmin))) { - smin = ptr; - lmin = len; - } - len = shuffle_xor(lmin, 4); - ptr = reinterpret_cast(shuffle_xor(reinterpret_cast(smin), 4)); - if (!smin || (ptr && nvstr_is_lesser(ptr, len, smin, lmin))) { - smin = ptr; - lmin = len; - } - len = shuffle_xor(lmin, 8); - ptr = reinterpret_cast(shuffle_xor(reinterpret_cast(smin), 8)); - if (!smin || (ptr && nvstr_is_lesser(ptr, len, smin, lmin))) { - smin = ptr; - lmin = len; - } - len = shuffle_xor(lmin, 16); - ptr = reinterpret_cast(shuffle_xor(reinterpret_cast(smin), 16)); - if (!smin || (ptr && nvstr_is_lesser(ptr, len, smin, lmin))) { - smin = ptr; - lmin = len; - } - return {smin, lmin}; -} - -/** - * Warp-wide Max reduction for string types - */ -inline __device__ string_stats WarpReduceMaxString(const char *smax, uint32_t lmax) -{ - uint32_t len = shuffle_xor(lmax, 1); - const char *ptr = - reinterpret_cast(shuffle_xor(reinterpret_cast(smax), 1)); - if (!smax || (ptr && nvstr_is_greater(ptr, len, smax, lmax))) { - smax = ptr; - lmax = len; - } - len = shuffle_xor(lmax, 2); - ptr = reinterpret_cast(shuffle_xor(reinterpret_cast(smax), 2)); - if (!smax || (ptr && nvstr_is_greater(ptr, len, smax, lmax))) { - smax = ptr; - lmax = len; - } - len = shuffle_xor(lmax, 4); - ptr = reinterpret_cast(shuffle_xor(reinterpret_cast(smax), 4)); - if (!smax || (ptr && nvstr_is_greater(ptr, len, smax, lmax))) { - smax = ptr; - lmax = len; - } - len = shuffle_xor(lmax, 8); - ptr = reinterpret_cast(shuffle_xor(reinterpret_cast(smax), 8)); - if (!smax || (ptr && nvstr_is_greater(ptr, len, smax, lmax))) { - smax = ptr; - lmax = len; - } - len = shuffle_xor(lmax, 16); - ptr = reinterpret_cast(shuffle_xor(reinterpret_cast(smax), 16)); - if (!smax || (ptr && nvstr_is_greater(ptr, len, smax, lmax))) { - smax = ptr; - lmax = len; - } - return {smax, lmax}; -} - /** * @brief Gather statistics for integer-like columns * @@ -160,31 +82,25 @@ gatherIntColumnStats(stats_state_s *s, statistics_dtype dtype, uint32_t t, Stora uint32_t nn_cnt = 0; __shared__ volatile bool has_minmax; for (uint32_t i = 0; i < s->group.num_rows; i += block_size) { - uint32_t r = i + t; - uint32_t row = r + s->group.start_row; - const uint32_t *valid_map = s->col.valid_map_base; - uint32_t is_valid = (r < s->group.num_rows && row < s->col.num_values) - ? (valid_map) ? (valid_map[(row + s->col.column_offset) / 32] >> - ((row + s->col.column_offset) % 32)) & - 1 - : 1 - : 0; + uint32_t r = i + t; + uint32_t row = r + s->group.start_row; + uint32_t is_valid = (r < s->group.num_rows) ? s->col.leaf_column->is_valid(row) : 0; if (is_valid) { switch (dtype) { case dtype_int32: - case dtype_date32: v = static_cast(s->col.column_data_base)[row]; break; + case dtype_date32: v = s->col.leaf_column->element(row); break; case dtype_int64: - case dtype_decimal64: v = static_cast(s->col.column_data_base)[row]; break; - case dtype_int16: v = static_cast(s->col.column_data_base)[row]; break; + case dtype_decimal64: v = s->col.leaf_column->element(row); break; + case dtype_int16: v = s->col.leaf_column->element(row); break; case dtype_timestamp64: - v = static_cast(s->col.column_data_base)[row]; + v = s->col.leaf_column->element(row); if (s->col.ts_scale < -1) { v /= -s->col.ts_scale; } else if (s->col.ts_scale > 1) { v *= s->col.ts_scale; } break; - default: v = static_cast(s->col.column_data_base)[row]; break; + default: v = s->col.leaf_column->element(row); break; } vmin = min(vmin, v); vmax = max(vmax, v); @@ -235,23 +151,17 @@ gatherFloatColumnStats(stats_state_s *s, statistics_dtype dtype, uint32_t t, Sto uint32_t nn_cnt = 0; __shared__ volatile bool has_minmax; for (uint32_t i = 0; i < s->group.num_rows; i += block_size) { - uint32_t r = i + t; - uint32_t row = r + s->group.start_row; - const uint32_t *valid_map = s->col.valid_map_base; - uint32_t is_valid = (r < s->group.num_rows && row < s->col.num_values) - ? (valid_map) ? (valid_map[(row + s->col.column_offset) >> 5] >> - ((row + s->col.column_offset) & 0x1f)) & - 1 - : 1 - : 0; + uint32_t r = i + t; + uint32_t row = r + s->group.start_row; + uint32_t is_valid = (r < s->group.num_rows) ? s->col.leaf_column->is_valid(row) : 0; if (is_valid) { if (dtype == dtype_float64) { - v = static_cast(s->col.column_data_base)[row]; + v = s->col.leaf_column->element(row); } else { - v = static_cast(s->col.column_data_base)[row]; + v = s->col.leaf_column->element(row); } - if (v < vmin) { vmin = v; } - if (v > vmax) { vmax = v; } + vmin = min(vmin, v); + vmax = max(vmax, v); if (!isnan(v)) { vsum += v; } } nn_cnt += __syncthreads_count(is_valid); @@ -293,39 +203,25 @@ struct nvstrdesc_s { template void __device__ gatherStringColumnStats(stats_state_s *s, uint32_t t, Storage &storage) { - using block_reduce = cub::BlockReduce; - uint32_t len_sum = 0; - const char *smin = nullptr; - const char *smax = nullptr; - uint32_t lmin = 0; - uint32_t lmax = 0; - uint32_t nn_cnt = 0; - bool has_minmax; - string_stats minval, maxval; + using block_reduce = cub::BlockReduce; + using string_reduce = cub::BlockReduce; + uint32_t len_sum = 0; + uint32_t nn_cnt = 0; + bool has_minmax = false; + + string_view minimum_value = string_view::max(); + string_view maximum_value = string_view::min(); for (uint32_t i = 0; i < s->group.num_rows; i += block_size) { - uint32_t r = i + t; - uint32_t row = r + s->group.start_row; - const uint32_t *valid_map = s->col.valid_map_base; - uint32_t is_valid = (r < s->group.num_rows && row < s->col.num_values) - ? (valid_map) ? (valid_map[(row + s->col.column_offset) >> 5] >> - ((row + s->col.column_offset) & 0x1f)) & - 1 - : 1 - : 0; + uint32_t r = i + t; + uint32_t row = r + s->group.start_row; + uint32_t is_valid = (r < s->group.num_rows) ? s->col.leaf_column->is_valid(row) : 0; if (is_valid) { - const nvstrdesc_s *str_col = static_cast(s->col.column_data_base); - uint32_t len = (uint32_t)str_col[row].count; - const char *ptr = str_col[row].ptr; - len_sum += len; - if (!smin || nvstr_is_lesser(ptr, len, smin, lmin)) { - lmin = len; - smin = ptr; - } - if (!smax || nvstr_is_greater(ptr, len, smax, lmax)) { - lmax = len; - smax = ptr; - } + has_minmax = true; + auto str = s->col.leaf_column->element(row); + len_sum += str.size_bytes(); + minimum_value = thrust::min(minimum_value, str); + maximum_value = thrust::max(maximum_value, str); } nn_cnt += __syncthreads_count(is_valid); } @@ -333,35 +229,20 @@ void __device__ gatherStringColumnStats(stats_state_s *s, uint32_t t, Storage &s s->ck.non_nulls = nn_cnt; s->ck.null_count = s->group.num_rows - nn_cnt; } - minval = WarpReduceMinString(smin, lmin); - maxval = WarpReduceMaxString(smax, lmax); - __syncwarp(); - if (!(t & 0x1f)) { - s->warp_min[t >> 5].str_val.ptr = minval.ptr; - s->warp_min[t >> 5].str_val.length = minval.length; - s->warp_max[t >> 5].str_val.ptr = maxval.ptr; - s->warp_max[t >> 5].str_val.length = maxval.length; - } - has_minmax = __syncthreads_or(smin != nullptr); + minimum_value = string_reduce(storage.string_val_stats).Reduce(minimum_value, cub::Min()); + __syncthreads(); + maximum_value = string_reduce(storage.string_val_stats).Reduce(maximum_value, cub::Max()); + has_minmax = __syncthreads_or(has_minmax); if (has_minmax) { len_sum = block_reduce(storage.string_stats).Sum(len_sum); } - if (t < 32 * 1) { - minval = WarpReduceMinString(s->warp_min[t].str_val.ptr, s->warp_min[t].str_val.length); - if (!(t & 0x1f)) { - if (has_minmax) { - s->ck.min_value.str_val.ptr = minval.ptr; - s->ck.min_value.str_val.length = minval.length; - s->ck.sum.i_val = len_sum; - } - s->ck.has_minmax = has_minmax; - s->ck.has_sum = has_minmax; - } - } else if (t < 32 * 2 and has_minmax) { - maxval = - WarpReduceMaxString(s->warp_max[t & 0x1f].str_val.ptr, s->warp_max[t & 0x1f].str_val.length); - if (!(t & 0x1f)) { - s->ck.max_value.str_val.ptr = maxval.ptr; - s->ck.max_value.str_val.length = maxval.length; + + if (!t) { + if (has_minmax) { + s->ck.min_value.str_val = minimum_value; + s->ck.max_value.str_val = maximum_value; + s->ck.sum.i_val = len_sum; } + s->ck.has_minmax = has_minmax; + s->ck.has_sum = has_minmax; } } @@ -383,6 +264,7 @@ __global__ void __launch_bounds__(block_size, 1) typename cub::BlockReduce::TempStorage integer_stats; typename cub::BlockReduce::TempStorage float_stats; typename cub::BlockReduce::TempStorage string_stats; + typename cub::BlockReduce::TempStorage string_val_stats; } temp_storage; stats_state_s *const s = &state_g; @@ -501,10 +383,8 @@ void __device__ mergeFloatColumnStats(merge_state_s *s, for (uint32_t i = t; i < num_chunks; i += block_size) { const statistics_chunk *ck = &ck_in[i]; if (ck->has_minmax) { - double v0 = ck->min_value.fp_val; - double v1 = ck->max_value.fp_val; - if (v0 < vmin) { vmin = v0; } - if (v1 > vmax) { vmax = v1; } + vmin = min(vmin, ck->min_value.fp_val); + vmax = max(vmax, ck->max_value.fp_val); } if (ck->has_sum) { vsum += ck->sum.fp_val; } non_nulls += ck->non_nulls; @@ -553,71 +433,48 @@ void __device__ mergeStringColumnStats(merge_state_s *s, uint32_t t, Storage &storage) { + using block_reduce = cub::BlockReduce; + using string_reduce = cub::BlockReduce; uint32_t len_sum = 0; - const char *smin = nullptr; - const char *smax = nullptr; - uint32_t lmin = 0; - uint32_t lmax = 0; uint32_t non_nulls = 0; uint32_t null_count = 0; - bool has_minmax; - string_stats minval, maxval; + bool has_minmax = false; + + string_view minimum_value = string_view::max(); + string_view maximum_value = string_view::min(); for (uint32_t i = t; i < num_chunks; i += block_size) { const statistics_chunk *ck = &ck_in[i]; if (ck->has_minmax) { - uint32_t len0 = ck->min_value.str_val.length; - const char *ptr0 = ck->min_value.str_val.ptr; - uint32_t len1 = ck->max_value.str_val.length; - const char *ptr1 = ck->max_value.str_val.ptr; - if (!smin || (ptr0 && nvstr_is_lesser(ptr0, len0, smin, lmin))) { - lmin = len0; - smin = ptr0; - } - if (!smax || (ptr1 && nvstr_is_greater(ptr1, len1, smax, lmax))) { - lmax = len1; - smax = ptr1; - } + has_minmax = true; + minimum_value = thrust::min(minimum_value, ck->min_value.str_val); + maximum_value = thrust::max(maximum_value, ck->max_value.str_val); } if (ck->has_sum) { len_sum += (uint32_t)ck->sum.i_val; } non_nulls += ck->non_nulls; null_count += ck->null_count; } - minval = WarpReduceMinString(smin, lmin); - maxval = WarpReduceMaxString(smax, lmax); - if (!(t & 0x1f)) { - s->warp_min[t >> 5].str_val.ptr = minval.ptr; - s->warp_min[t >> 5].str_val.length = minval.length; - s->warp_max[t >> 5].str_val.ptr = maxval.ptr; - s->warp_max[t >> 5].str_val.length = maxval.length; - } - has_minmax = __syncthreads_or(smin != nullptr); + minimum_value = string_reduce(storage.str).Reduce(minimum_value, cub::Min()); + __syncthreads(); + maximum_value = string_reduce(storage.str).Reduce(maximum_value, cub::Max()); + has_minmax = __syncthreads_or(has_minmax); - non_nulls = cub::BlockReduce(storage.u32).Sum(non_nulls); + non_nulls = block_reduce(storage.u32).Sum(non_nulls); __syncthreads(); - null_count = cub::BlockReduce(storage.u32).Sum(null_count); + null_count = block_reduce(storage.u32).Sum(null_count); __syncthreads(); - if (has_minmax) { len_sum = cub::BlockReduce(storage.u32).Sum(len_sum); } - if (t < 32 * 1) { - minval = WarpReduceMinString(s->warp_min[t].str_val.ptr, s->warp_min[t].str_val.length); - if (!(t & 0x1f)) { - if (has_minmax) { - s->ck.min_value.str_val.ptr = minval.ptr; - s->ck.min_value.str_val.length = minval.length; - s->ck.sum.i_val = len_sum; - } - s->ck.has_minmax = has_minmax; - s->ck.has_sum = has_minmax; - s->ck.non_nulls = non_nulls; - s->ck.null_count = null_count; - } - } else if (t < 32 * 2) { - maxval = - WarpReduceMaxString(s->warp_max[t & 0x1f].str_val.ptr, s->warp_max[t & 0x1f].str_val.length); - if (!((t & 0x1f) and has_minmax)) { - s->ck.max_value.str_val.ptr = maxval.ptr; - s->ck.max_value.str_val.length = maxval.length; + if (has_minmax) { len_sum = block_reduce(storage.u32).Sum(len_sum); } + + if (!t) { + if (has_minmax) { + s->ck.min_value.str_val = minimum_value; + s->ck.max_value.str_val = maximum_value; + s->ck.sum.i_val = len_sum; } + s->ck.has_minmax = has_minmax; + s->ck.has_sum = has_minmax; + s->ck.non_nulls = non_nulls; + s->ck.null_count = null_count; } } @@ -641,6 +498,7 @@ __global__ void __launch_bounds__(block_size, 1) typename cub::BlockReduce::TempStorage u32; typename cub::BlockReduce::TempStorage i64; typename cub::BlockReduce::TempStorage f64; + typename cub::BlockReduce::TempStorage str; } storage; merge_state_s *const s = &state_g; diff --git a/cpp/src/io/statistics/column_stats.h b/cpp/src/io/statistics/column_stats.h index 6812678f01d..d1d414aa7b4 100644 --- a/cpp/src/io/statistics/column_stats.h +++ b/cpp/src/io/statistics/column_stats.h @@ -16,6 +16,8 @@ #pragma once #include +#include +#include #include #include @@ -47,11 +49,29 @@ struct stats_column_desc { size_type column_offset; //! < index of the first element relative to the base memory const void *column_data_base; //!< base ptr to column data int32_t ts_scale; //!< timestamp scale (>0: multiply by scale, <0: divide by -scale) + + column_device_view *leaf_column; //!< Pointer to leaf column + column_device_view *parent_column; //!< Pointer to parent column. Is nullptr if not list type. }; struct string_stats { const char *ptr; //!< ptr to character data uint32_t length; //!< length of string + __host__ __device__ __forceinline__ volatile string_stats &operator=( + const string_view &val) volatile + { + ptr = val.data(); + length = val.size_bytes(); + return *this; + } + __host__ __device__ __forceinline__ operator string_view() volatile + { + return string_view(ptr, static_cast(length)); + } + __host__ __device__ __forceinline__ operator string_view() const + { + return string_view(ptr, static_cast(length)); + } }; union statistics_val { diff --git a/cpp/src/io/utilities/column_utils.cuh b/cpp/src/io/utilities/column_utils.cuh new file mode 100644 index 00000000000..4f41e846631 --- /dev/null +++ b/cpp/src/io/utilities/column_utils.cuh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace io { + +/** + * @brief Create column_device_view pointers from leaf columns + * + * A device_uvector is created to store the leaves of parent columns. The + * column descriptor array is updated to point to these leaf columns. + * + * @tparam ColumnDescriptor Struct describing properties of columns with + * pointers to leaf and parent columns + * + * @param col_desc Column description array + * @param parent_table_device_view Table device view containing parent columns + * @param stream CUDA stream to use + * + * @return Device array containing leaf column device views + */ +template +rmm::device_uvector create_leaf_column_device_views( + typename cudf::device_span col_desc, + const table_device_view &parent_table_device_view, + rmm::cuda_stream_view stream) +{ + rmm::device_uvector leaf_column_views(parent_table_device_view.num_columns(), + stream); + auto leaf_columns = cudf::device_span{leaf_column_views}; + + auto iter = thrust::make_counting_iterator(0); + thrust::for_each(rmm::exec_policy(stream), + iter, + iter + parent_table_device_view.num_columns(), + [col_desc, parent_col_view = parent_table_device_view, leaf_columns] __device__( + size_type index) mutable { + column_device_view col = parent_col_view.column(index); + + if (col.type().id() == type_id::LIST) { + col_desc[index].parent_column = parent_col_view.begin() + index; + } else { + col_desc[index].parent_column = nullptr; + } + // traverse till leaf column + while (col.type().id() == type_id::LIST) { + col = col.child(lists_column_view::child_column_index); + } + // Store leaf_column to device storage + column_device_view *leaf_col_ptr = leaf_columns.begin() + index; + *leaf_col_ptr = col; + col_desc[index].leaf_column = leaf_col_ptr; + }); + + return leaf_column_views; +} + +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 3dc399d669f..62b87f727c4 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -92,8 +92,8 @@ class hostdevice_vector { return reinterpret_cast(d_data.data()) + offset; } - operator cudf::device_span() { return {d_data.data(), max_elements}; } - operator cudf::device_span() const { return {d_data.data(), max_elements}; } + operator cudf::device_span() { return {device_ptr(), max_elements}; } + operator cudf::device_span() const { return {device_ptr(), max_elements}; } operator cudf::host_span() { return {h_data, max_elements}; } operator cudf::host_span() const { return {h_data, max_elements}; }