Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Force inlining to improve AST performance #9530

Merged
merged 6 commits into from
Nov 12, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 8 additions & 10 deletions cpp/include/cudf/ast/detail/expression_evaluator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -431,10 +431,9 @@ struct expression_evaluator {
* @param row_index Row index of all input and output data column(s).
*/
template <typename ResultSubclass, typename T, bool result_has_nulls>
CUDA_DEVICE_CALLABLE void evaluate(
expression_result<ResultSubclass, T, result_has_nulls>& output_object,
cudf::size_type const row_index,
IntermediateDataType<has_nulls>* thread_intermediate_storage)
CUDF_DFI void evaluate(expression_result<ResultSubclass, T, result_has_nulls>& output_object,
cudf::size_type const row_index,
IntermediateDataType<has_nulls>* thread_intermediate_storage)
{
evaluate(output_object, row_index, row_index, row_index, thread_intermediate_storage);
}
Expand All @@ -452,12 +451,11 @@ struct expression_evaluator {
* @param output_row_index The row in the output to insert the result.
*/
template <typename ResultSubclass, typename T, bool result_has_nulls>
CUDA_DEVICE_CALLABLE void evaluate(
expression_result<ResultSubclass, T, result_has_nulls>& output_object,
cudf::size_type const left_row_index,
cudf::size_type const right_row_index,
cudf::size_type const output_row_index,
IntermediateDataType<has_nulls>* thread_intermediate_storage)
CUDF_DFI void evaluate(expression_result<ResultSubclass, T, result_has_nulls>& output_object,
cudf::size_type const left_row_index,
cudf::size_type const right_row_index,
cudf::size_type const output_row_index,
IntermediateDataType<has_nulls>* thread_intermediate_storage)
{
cudf::size_type operator_source_index{0};
for (cudf::size_type operator_index = 0; operator_index < plan.operators.size();
Expand Down
8 changes: 8 additions & 0 deletions cpp/include/cudf/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,17 @@
// Function-qualifier macros used to annotate callables.
// When __CUDACC__ is defined the macros expand to the __host__/__device__
// specifiers shown below; otherwise every macro falls back to plain `inline`,
// so a declaration using them is spelled the same way in both configurations.
#ifdef __CUDACC__
// Original qualifiers: execution-space specifiers plus a plain `inline`.
#define CUDA_HOST_DEVICE_CALLABLE __host__ __device__ inline
#define CUDA_DEVICE_CALLABLE __device__ inline

// These versions of the macros maximize the chances of inlining when applied
// to a callable that is called on the GPU. They use __forceinline__ in place
// of the plain `inline` used by the CUDA_*_CALLABLE macros above.
// HDFI: __host__ __device__ + __forceinline__; DFI: __device__ + __forceinline__.
#define CUDF_HDFI __host__ __device__ __forceinline__
#define CUDF_DFI __device__ __forceinline__
#else
// __CUDACC__ is not defined: the CUDA specifiers are omitted and only `inline`
// remains.
#define CUDA_HOST_DEVICE_CALLABLE inline
#define CUDA_DEVICE_CALLABLE inline

// Fallbacks for the force-inline variants; here they are identical to the
// CUDA_*_CALLABLE fallbacks above.
#define CUDF_HDFI inline
#define CUDF_DFI inline
#endif

#include <cassert>
Expand Down
4 changes: 1 addition & 3 deletions cpp/include/cudf/utilities/type_dispatcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,9 +411,7 @@ using scalar_device_type_t = typename type_to_scalar_type_impl<T>::ScalarDeviceT
template <template <cudf::type_id> typename IdTypeMap = id_to_type_impl,
typename Functor,
typename... Ts>
CUDA_HOST_DEVICE_CALLABLE constexpr decltype(auto) type_dispatcher(cudf::data_type dtype,
Functor f,
Ts&&... args)
CUDF_HDFI constexpr decltype(auto) type_dispatcher(cudf::data_type dtype, Functor f, Ts&&... args)
{
switch (dtype.id()) {
case type_id::BOOL8:
Expand Down