Skip to content

Commit

Permalink
Simplify type dispatch with device_storage_dispatch (#7419)
Browse files Browse the repository at this point in the history
Resolves #7390

Compile times:
```
// Before
real	33m29.842s
user	300m0.478s
sys	10m46.871s

// After
real	33m20.127s
user	299m24.825s
sys	10m35.779s
```

Binary sizes:
```
Before: -rwxr-xr-x  1 rapids rapids 328M Feb 22 15:10 libcudf_base.so
After:  -rwxr-xr-x  1 rapids rapids 327M Feb 23 07:49 libcudf_base.so
```

Authors:
  - Conor Hoekstra (@codereport)

Approvers:
  - David (@davidwendt)
  - Jake Hemstad (@jrhemstad)

URL: #7419
  • Loading branch information
codereport authored Feb 23, 2021
1 parent c3b3b96 commit 2234554
Show file tree
Hide file tree
Showing 19 changed files with 229 additions and 254 deletions.
24 changes: 11 additions & 13 deletions cpp/include/cudf/detail/gather.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -176,11 +176,9 @@ struct column_gatherer_impl {
auto destination_column =
cudf::detail::allocate_like(source_column, num_rows, policy, stream, mr);

using Type = device_storage_type_t<Element>;

gather_helper(source_column.data<Type>(),
gather_helper(source_column.data<Element>(),
source_column.size(),
destination_column->mutable_view().template begin<Type>(),
destination_column->mutable_view().template begin<Element>(),
gather_map_begin,
gather_map_end,
nullify_out_of_bounds,
Expand Down Expand Up @@ -633,14 +631,14 @@ std::unique_ptr<table> gather(
for (auto const& source_column : source_table) {
// The data gather for n columns will be put on the first n streams
destination_columns.push_back(
cudf::type_dispatcher(source_column.type(),
column_gatherer{},
source_column,
gather_map_begin,
gather_map_end,
bounds_policy == out_of_bounds_policy::NULLIFY,
stream,
mr));
cudf::type_dispatcher<dispatch_storage_type>(source_column.type(),
column_gatherer{},
source_column,
gather_map_begin,
gather_map_end,
bounds_policy == out_of_bounds_policy::NULLIFY,
stream,
mr));
}

gather_bitmask_op const op = bounds_policy == out_of_bounds_policy::NULLIFY
Expand Down
24 changes: 11 additions & 13 deletions cpp/include/cudf/detail/scatter.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -91,15 +91,13 @@ struct column_scatterer_impl {
auto result = std::make_unique<column>(target, stream, mr);
auto result_view = result->mutable_view();

using Type = device_storage_type_t<Element>;

// NOTE use source.begin + scatter rows rather than source.end in case the
// scatter map is smaller than the number of source rows
thrust::scatter(rmm::exec_policy(stream),
source.begin<Type>(),
source.begin<Type>() + cudf::distance(scatter_map_begin, scatter_map_end),
source.begin<Element>(),
source.begin<Element>() + cudf::distance(scatter_map_begin, scatter_map_end),
scatter_map_begin,
result_view.begin<Type>());
result_view.begin<Element>());

return result;
}
Expand Down Expand Up @@ -285,14 +283,14 @@ std::unique_ptr<table> scatter(
target.begin(),
result.begin(),
[=](auto const& source_col, auto const& target_col) {
return type_dispatcher(source_col.type(),
scatter_functor,
source_col,
updated_scatter_map_begin,
updated_scatter_map_end,
target_col,
stream,
mr);
return type_dispatcher<dispatch_storage_type>(source_col.type(),
scatter_functor,
source_col,
updated_scatter_map_begin,
updated_scatter_map_end,
target_col,
stream,
mr);
});

auto gather_map = scatter_to_gather(
Expand Down
13 changes: 13 additions & 0 deletions cpp/include/cudf/utilities/type_dispatcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,19 @@ CUDF_TYPE_MAPPING(numeric::decimal32, type_id::DECIMAL32);
CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64);
CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT);

/**
 * @brief Use this specialization on `type_dispatcher` whenever you only need to operate on the
 * underlying stored type.
 *
 * For example, `cudf::sort` in sort.cu uses `cudf::type_dispatcher<dispatch_storage_type>(...)`.
 * `cudf::gather` in gather.cuh also uses `cudf::type_dispatcher<dispatch_storage_type>(...)`.
 * However, reductions need both `data_type` and the underlying type, so they cannot use this.
 *
 * @tparam Id The `cudf::type_id` being dispatched
 */
template <cudf::type_id Id>
struct dispatch_storage_type {
  // Resolve the logical C++ type for `Id`, then map it through `device_storage_type_t`
  // to the type actually stored on the device (e.g. a fixed-point type presumably
  // dispatches as its integer representation type — confirm against traits.hpp).
  using type = device_storage_type_t<typename id_to_type_impl<Id>::type>;
};

template <typename T>
struct type_to_scalar_type_impl {
using ScalarType = cudf::scalar;
Expand Down
12 changes: 5 additions & 7 deletions cpp/src/copying/concatenate.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -300,13 +300,11 @@ struct concatenate_dispatch {
bool const has_nulls =
std::any_of(views.cbegin(), views.cend(), [](auto const& col) { return col.has_nulls(); });

using Type = device_storage_type_t<T>;

// Use a heuristic to guess when the fused kernel will be faster
if (use_fused_kernel_heuristic(has_nulls, views.size())) {
return fused_concatenate<Type>(views, has_nulls, stream, mr);
return fused_concatenate<T>(views, has_nulls, stream, mr);
} else {
return for_each_concatenate<Type>(views, has_nulls, stream, mr);
return for_each_concatenate<T>(views, has_nulls, stream, mr);
}
}
};
Expand Down Expand Up @@ -409,8 +407,8 @@ std::unique_ptr<column> concatenate(std::vector<column_view> const& columns_to_c
return empty_like(columns_to_concat.front());
}

return type_dispatcher(columns_to_concat.front().type(),
concatenate_dispatch{columns_to_concat, stream, mr});
return type_dispatcher<dispatch_storage_type>(
columns_to_concat.front().type(), concatenate_dispatch{columns_to_concat, stream, mr});
}

std::unique_ptr<table> concatenate(std::vector<table_view> const& tables_to_concat,
Expand Down
60 changes: 29 additions & 31 deletions cpp/src/copying/copy.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -41,28 +41,26 @@ struct copy_if_else_functor_impl {
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
using Type = device_storage_type_t<T>;

if (left_nullable) {
if (right_nullable) {
auto lhs_iter = cudf::detail::make_pair_iterator<Type, true>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<Type, true>(rhs);
auto lhs_iter = cudf::detail::make_pair_iterator<T, true>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<T, true>(rhs);
return detail::copy_if_else(
true, lhs_iter, lhs_iter + size, rhs_iter, filter, lhs.type(), stream, mr);
}
auto lhs_iter = cudf::detail::make_pair_iterator<Type, true>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<Type, false>(rhs);
auto lhs_iter = cudf::detail::make_pair_iterator<T, true>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<T, false>(rhs);
return detail::copy_if_else(
true, lhs_iter, lhs_iter + size, rhs_iter, filter, lhs.type(), stream, mr);
}
if (right_nullable) {
auto lhs_iter = cudf::detail::make_pair_iterator<Type, false>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<Type, true>(rhs);
auto lhs_iter = cudf::detail::make_pair_iterator<T, false>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<T, true>(rhs);
return detail::copy_if_else(
true, lhs_iter, lhs_iter + size, rhs_iter, filter, lhs.type(), stream, mr);
}
auto lhs_iter = cudf::detail::make_pair_iterator<Type, false>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<Type, false>(rhs);
auto lhs_iter = cudf::detail::make_pair_iterator<T, false>(lhs);
auto rhs_iter = cudf::detail::make_pair_iterator<T, false>(rhs);
return detail::copy_if_else(
false, lhs_iter, lhs_iter + size, rhs_iter, filter, lhs.type(), stream, mr);
}
Expand Down Expand Up @@ -182,30 +180,30 @@ std::unique_ptr<column> copy_if_else(Left const& lhs,
auto filter = [bool_mask_device] __device__(cudf::size_type i) {
return bool_mask_device.is_valid_nocheck(i) and bool_mask_device.element<bool>(i);
};
return cudf::type_dispatcher(lhs.type(),
copy_if_else_functor{},
lhs,
rhs,
boolean_mask.size(),
left_nullable,
right_nullable,
filter,
stream,
mr);
return cudf::type_dispatcher<dispatch_storage_type>(lhs.type(),
copy_if_else_functor{},
lhs,
rhs,
boolean_mask.size(),
left_nullable,
right_nullable,
filter,
stream,
mr);
} else {
auto filter = [bool_mask_device] __device__(cudf::size_type i) {
return bool_mask_device.element<bool>(i);
};
return cudf::type_dispatcher(lhs.type(),
copy_if_else_functor{},
lhs,
rhs,
boolean_mask.size(),
left_nullable,
right_nullable,
filter,
stream,
mr);
return cudf::type_dispatcher<dispatch_storage_type>(lhs.type(),
copy_if_else_functor{},
lhs,
rhs,
boolean_mask.size(),
left_nullable,
right_nullable,
filter,
stream,
mr);
}
}

Expand Down
20 changes: 10 additions & 10 deletions cpp/src/copying/copy_range.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -108,9 +108,8 @@ struct out_of_place_copy_range_dispatch {
}

if (source_end != source_begin) { // otherwise no-op
using Type = cudf::device_storage_type_t<T>;
auto ret_view = p_ret->mutable_view();
in_place_copy_range<Type>(source, ret_view, source_begin, source_end, target_begin, stream);
in_place_copy_range<T>(source, ret_view, source_begin, source_end, target_begin, stream);
}

return p_ret;
Expand Down Expand Up @@ -261,13 +260,14 @@ std::unique_ptr<column> copy_range(column_view const& source,
"Range is out of bounds.");
CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch.");

return cudf::type_dispatcher(target.type(),
out_of_place_copy_range_dispatch{source, target},
source_begin,
source_end,
target_begin,
stream,
mr);
return cudf::type_dispatcher<dispatch_storage_type>(
target.type(),
out_of_place_copy_range_dispatch{source, target},
source_begin,
source_end,
target_begin,
stream,
mr);
}

} // namespace detail
Expand Down
24 changes: 11 additions & 13 deletions cpp/src/copying/scatter.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -106,18 +106,16 @@ struct column_scalar_scatterer_impl {
auto result = std::make_unique<column>(target, stream, mr);
auto result_view = result->mutable_view();

using Type = device_storage_type_t<Element>;

// Use permutation iterator with constant index to dereference scalar data
auto scalar_impl = static_cast<const scalar_type_t<Type>*>(&source.get());
auto scalar_impl = static_cast<const scalar_type_t<Element>*>(&source.get());
auto scalar_iter =
thrust::make_permutation_iterator(scalar_impl->data(), thrust::make_constant_iterator(0));

thrust::scatter(rmm::exec_policy(stream),
scalar_iter,
scalar_iter + scatter_rows,
scatter_iter,
result_view.begin<Type>());
result_view.begin<Element>());

return result;
}
Expand Down Expand Up @@ -300,14 +298,14 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>>
target.begin(),
result.begin(),
[=](auto const& source_scalar, auto const& target_col) {
return type_dispatcher(target_col.type(),
scatter_functor,
source_scalar,
scatter_iter,
scatter_rows,
target_col,
stream,
mr);
return type_dispatcher<dispatch_storage_type>(target_col.type(),
scatter_functor,
source_scalar,
scatter_iter,
scatter_rows,
target_col,
stream,
mr);
});

scatter_scalar_bitmask(source, scatter_iter, scatter_rows, result, stream, mr);
Expand Down
12 changes: 6 additions & 6 deletions cpp/src/copying/shift.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -60,8 +60,7 @@ struct shift_functor {
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
using Type = device_storage_type_t<T>;
using ScalarType = cudf::scalar_type_t<Type>;
using ScalarType = cudf::scalar_type_t<T>;
auto& scalar = static_cast<ScalarType const&>(fill_value);

auto device_input = column_device_view::create(input);
Expand All @@ -88,7 +87,7 @@ struct shift_functor {
output->set_null_count(std::get<1>(mask_pair));
}

auto data = device_output->data<Type>();
auto data = device_output->data<T>();

// avoid assigning elements we know to be invalid.
if (not scalar.is_valid()) {
Expand All @@ -103,7 +102,7 @@ struct shift_functor {
auto func_value =
[size, offset, fill = scalar.data(), input = *device_input] __device__(size_type idx) {
auto src_idx = idx - offset;
return out_of_bounds(size, src_idx) ? *fill : input.element<Type>(src_idx);
return out_of_bounds(size, src_idx) ? *fill : input.element<T>(src_idx);
};

thrust::transform(rmm::exec_policy(stream), index_begin, index_end, data, func_value);
Expand All @@ -128,7 +127,8 @@ std::unique_ptr<column> shift(column_view const& input,

if (input.is_empty()) { return empty_like(input); }

return type_dispatcher(input.type(), shift_functor{}, input, offset, fill_value, stream, mr);
return type_dispatcher<dispatch_storage_type>(
input.type(), shift_functor{}, input, offset, fill_value, stream, mr);
}

} // namespace detail
Expand Down
Loading

0 comments on commit 2234554

Please sign in to comment.