diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a57652cb364..841a02f72e6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -221,7 +221,8 @@ $ ./build.sh dask_cudf - To run Python tests (Optional): ```bash $ cd $CUDF_HOME/python -$ py.test -v # run python tests on cudf and dask-cudf python bindings +$ py.test -v cudf # run cudf test suite +$ py.test -v dask_cudf # run dask_cudf test suite ``` - Other `build.sh` options: diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 981e886d31c..8235f9de0e5 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -11,7 +11,8 @@ LC_ALL=C.UTF-8 LANG=C.UTF-8 # Activate common conda env -source activate gdf +. /opt/conda/etc/profile.d/conda.sh +conda activate rapids # Run isort and get results/return code ISORT=`isort --check-only python/**/*.py` diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 1d0154aedc7..355b18f4543 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -201,8 +201,8 @@ fi ################################################################################ # If examples grows too large to build, should move to cpu side -gpuci_logger "Building libcudf examples" -$WORKSPACE/cpp/examples/build.sh +# gpuci_logger "Building libcudf examples" +# $WORKSPACE/cpp/examples/build.sh # set environment variable for numpy 1.16 # will be enabled for later versions by default @@ -217,7 +217,7 @@ fi cd "$WORKSPACE/python/cudf" gpuci_logger "Python py.test for cuDF" -py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term +py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term cd "$WORKSPACE/python/dask_cudf" gpuci_logger "Python py.test for dask-cudf" diff --git a/ci/gpu/java.sh b/ci/gpu/java.sh new file mode 100755 index 00000000000..8c4b597d12d --- /dev/null +++ b/ci/gpu/java.sh @@ -0,0 +1,145 @@ +#!/bin/bash +# Copyright (c) 2018-2020, NVIDIA CORPORATION. 
+############################################## +# cuDF GPU build and test script for CI # +############################################## +set -e +NUMARGS=$# +ARGS=$* + +# Arg parsing function +function hasArg { + (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") +} + +# Set path and build parallel level +export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH +export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} + +# Set home to the job's workspace +export HOME="$WORKSPACE" + +# Switch to project root; also root of repo checkout +cd "$WORKSPACE" + +# Determine CUDA release version +export CUDA_REL=${CUDA_VERSION%.*} +export CONDA_ARTIFACT_PATH="$WORKSPACE/ci/artifacts/cudf/cpu/.conda-bld/" + +# Parse git describe +export GIT_DESCRIBE_TAG=`git describe --tags` +export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` + +################################################################################ +# TRAP - Setup trap for removing jitify cache +################################################################################ + +# Set `LIBCUDF_KERNEL_CACHE_PATH` environment variable to $HOME/.jitify-cache +# because it's local to the container's virtual file system, and not shared with +# other CI jobs like `/tmp` is +export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" + +function remove_libcudf_kernel_cache_dir { + EXITCODE=$? + gpuci_logger "TRAP: Removing kernel cache dir: $LIBCUDF_KERNEL_CACHE_PATH" + rm -rf "$LIBCUDF_KERNEL_CACHE_PATH" \ + || gpuci_logger "[ERROR] TRAP: Could not rm -rf $LIBCUDF_KERNEL_CACHE_PATH" + exit $EXITCODE +} + +# Set trap to run on exit +gpuci_logger "TRAP: Set trap to remove jitify cache on exit" +trap remove_libcudf_kernel_cache_dir EXIT + +mkdir -p "$LIBCUDF_KERNEL_CACHE_PATH" \ + || gpuci_logger "[ERROR] TRAP: Could not mkdir -p $LIBCUDF_KERNEL_CACHE_PATH" + +################################################################################ +# SETUP - Check environment +################################################################################ + +gpuci_logger "Check environment variables" +env + +gpuci_logger "Check GPU usage" +nvidia-smi + +gpuci_logger "Activate conda env" +. /opt/conda/etc/profile.d/conda.sh +conda activate rapids + +gpuci_logger "Check conda environment" +conda info +conda config --show-sources +conda list --show-channel-urls + +gpuci_logger "Install dependencies" +gpuci_conda_retry install -y \ + "cudatoolkit=$CUDA_REL" \ + "rapids-build-env=$MINOR_VERSION.*" \ + "rapids-notebook-env=$MINOR_VERSION.*" \ + "dask-cuda=${MINOR_VERSION}" \ + "rmm=$MINOR_VERSION.*" \ + "ucx-py=0.21.*" \ + "openjdk=8.*" \ + "maven" + +# https://docs.rapids.ai/maintainers/depmgmt/ +# gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env +# gpuci_conda_retry install -y "your-pkg=1.0.0" + + +gpuci_logger "Check compiler versions" +python --version +$CC --version +$CXX --version + +gpuci_logger "Check conda environment" +conda info +conda config --show-sources +conda list --show-channel-urls + +function install_dask { + # Install the main version of dask, distributed, and streamz + gpuci_logger "Install the main version of dask, distributed, and streamz" + set -x + pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps + pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps + # Need to uninstall streamz that is already in the env. 
+ pip uninstall -y streamz + pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps + set +x +} + +################################################################################ +# INSTALL - Install libcudf artifacts +################################################################################ + +export LIB_BUILD_DIR="$WORKSPACE/ci/artifacts/cudf/cpu/libcudf_work/cpp/build" +export CUDF_ROOT=${LIB_BUILD_DIR} +export LD_LIBRARY_PATH="$LIB_BUILD_DIR:$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" + +CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"` +CUDF_CONDA_FILE=`basename "$CUDF_CONDA_FILE" .tar.bz2` #get filename without extension +CUDF_CONDA_FILE=${CUDF_CONDA_FILE//-/=} #convert to conda install spec +KAFKA_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf_kafka-*.tar.bz2"` +KAFKA_CONDA_FILE=`basename "$KAFKA_CONDA_FILE" .tar.bz2` #get filename without extension +KAFKA_CONDA_FILE=${KAFKA_CONDA_FILE//-/=} #convert to conda install spec + +gpuci_logger "Installing $CUDF_CONDA_FILE & $KAFKA_CONDA_FILE" +conda install -c ${CONDA_ARTIFACT_PATH} "$CUDF_CONDA_FILE" "$KAFKA_CONDA_FILE" + +install_dask + +################################################################################ +# TEST - Run java tests +################################################################################ + +gpuci_logger "Check GPU usage" +nvidia-smi + +gpuci_logger "Running Java Tests" +cd ${WORKSPACE}/java +mvn test -B -DCUDF_JNI_ARROW_STATIC=OFF + +exit ${EXITCODE} diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 1568327f88c..30586c91351 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -7,8 +7,8 @@ channels: - rapidsai-nightly - conda-forge dependencies: - - clang=8.0.1 - - clang-tools=8.0.1 + - clang=11.0.0 + - clang-tools=11.0.0 - cupy>7.1.0,<10.0.0a0 - rmm=21.08.* - cmake>=3.20.1 @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=1.0.1 + - pyarrow=4.0.1=*cuda - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -31,7 +31,6 @@ dependencies: - nbsphinx - numpydoc - ipython - - recommonmark - pandoc=<2.0.0 - cudatoolkit=11.0 - pip @@ -44,12 +43,11 @@ dependencies: - dask>=2021.6.0 - distributed>=2021.6.0 - streamz + - arrow-cpp=4.0.1 - dlpack>=0.5,<0.6.0a0 - - arrow-cpp=1.0.1 - arrow-cpp-proc * cuda - double-conversion - rapidjson - - flatbuffers - hypothesis - sphinx-markdown-tables - sphinx-copybutton diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 9d520ada253..f2bc5a21079 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -7,8 +7,8 @@ channels: - rapidsai-nightly - conda-forge dependencies: - - clang=8.0.1 - - clang-tools=8.0.1 + - clang=11.0.0 + - clang-tools=11.0.0 - cupy>7.1.0,<10.0.0a0 - rmm=21.08.* - cmake>=3.20.1 @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=1.0.1 + - pyarrow=4.0.1=*cuda - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -31,7 +31,6 @@ dependencies: - nbsphinx - numpydoc - ipython - - recommonmark - pandoc=<2.0.0 - cudatoolkit=11.2 - pip @@ -44,12 +43,11 @@ dependencies: - dask>=2021.6.0 - distributed>=2021.6.0 - streamz + - arrow-cpp=4.0.1 - dlpack>=0.5,<0.6.0a0 - - arrow-cpp=1.0.1 - arrow-cpp-proc * cuda - double-conversion - rapidjson - - flatbuffers - hypothesis - sphinx-markdown-tables - sphinx-copybutton diff
--git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index d1aaf924555..9023e89c2f5 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -30,7 +30,7 @@ requirements: - setuptools - numba >=0.53.1 - dlpack>=0.5,<0.6.0a0 - - pyarrow 1.0.1 + - pyarrow 4.0.1 *cuda - libcudf {{ version }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} @@ -42,7 +42,7 @@ requirements: - cupy >7.1.0,<10.0.0a0 - numba >=0.53.1 - numpy - - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} + - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} *cuda - fastavro >=0.22.0 - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec>=0.6.0 diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 14b94dd2249..6c4175a2539 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,11 +37,12 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 1.0.1 + - arrow-cpp 4.0.1 *cuda - arrow-cpp-proc * cuda - dlpack>=0.5,<0.6.0a0 run: - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} + - arrow-cpp * *cuda - arrow-cpp-proc * cuda - {{ pin_compatible('dlpack', max_pin='x.x') }} @@ -220,6 +221,7 @@ test: - test -f $PREFIX/include/cudf/utilities/error.hpp - test -f $PREFIX/include/cudf/utilities/traits.hpp - test -f $PREFIX/include/cudf/utilities/type_dispatcher.hpp + - test -f $PREFIX/include/cudf/utilities/type_checks.hpp - test -f $PREFIX/include/cudf/utilities/default_stream.hpp - test -f $PREFIX/include/cudf/wrappers/dictionary.hpp - test -f $PREFIX/include/cudf/wrappers/durations.hpp diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index f1ec813a17f..6b15890e7c7 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -25,8 +25,8 @@ requirements: build: - cmake >=3.20.1 host: - - libcudf {{ version }} - - librdkafka >=1.5.0,<1.5.3 + - libcudf {{version}} + - librdkafka >=1.6.0,<1.7.0a0 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not diff --git a/cpp/.clang-format b/cpp/.clang-format index 11404b0226e..0c05436e922 100644 --- a/cpp/.clang-format +++ b/cpp/.clang-format @@ -6,16 +6,22 @@ Language: Cpp AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: true +AlignConsecutiveBitFields: true AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: true AllowShortCaseLabelsOnASingleLine: true +AllowShortEnumsOnASingleLine: true AllowShortFunctionsOnASingleLine: All AllowShortIfStatementsOnASingleLine: true -AllowShortLoopsOnASingleLine: true +AllowShortLambdasOnASingleLine: true +AllowShortLoopsOnASingleLine: false # This is deprecated AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None @@ -40,14 +46,14 @@ BraceWrapping: SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false +BreakAfterJavaFieldAnnotations: false BreakBeforeBinaryOperators: None BreakBeforeBraces: WebKit BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: 
false +BreakInheritanceList: BeforeColon BreakStringLiterals: true ColumnLimit: 100 CommentPragmas: '^ IWYU pragma:' @@ -57,7 +63,7 @@ ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 2 ContinuationIndentWidth: 2 Cpp11BracedListStyle: true -DerivePointerAlignment: true +DerivePointerAlignment: false DisableFormat: false ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true @@ -139,14 +145,17 @@ SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 2 SpacesInAngles: false +SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false -Standard: Cpp11 +Standard: c++17 StatementMacros: - Q_UNUSED - QT_REQUIRE_VERSION diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 678f202d106..ab7d8389c88 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -40,10 +40,12 @@ set(THREADS_PREFER_PTHREAD_FLAG ON) option(USE_NVTX "Build with NVTX support" ON) option(BUILD_TESTS "Configure CMake to build tests" ON) -option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF) +option(BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks" OFF) option(BUILD_SHARED_LIBS "Build cuDF shared libraries" ON) option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF) +option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF) +option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF) option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" ON) option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) option(DISABLE_DEPRECATION_WARNING "Disable warnings generated from deprecated declarations." 
OFF) @@ -54,7 +56,7 @@ option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") -message(VERBOSE "CUDF: Configure CMake to build (google) benchmarks: ${BUILD_BENCHMARKS}") +message(VERBOSE "CUDF: Configure CMake to build (google & nvbench) benchmarks: ${BUILD_BENCHMARKS}") message(VERBOSE "CUDF: Build cuDF shared libraries: ${BUILD_SHARED_LIBS}") message(VERBOSE "CUDF: Use a file cache for JIT compiled kernels: ${JITIFY_USE_CACHE}") message(VERBOSE "CUDF: Build and statically link Arrow libraries: ${CUDF_USE_ARROW_STATIC}") @@ -153,6 +155,34 @@ add_library(cudf src/ast/transform.cu src/binaryop/binaryop.cpp src/binaryop/compiled/binary_ops.cu + src/binaryop/compiled/Add.cu + src/binaryop/compiled/ATan2.cu + src/binaryop/compiled/BitwiseAnd.cu + src/binaryop/compiled/BitwiseOr.cu + src/binaryop/compiled/BitwiseXor.cu + src/binaryop/compiled/Less.cu + src/binaryop/compiled/Greater.cu + src/binaryop/compiled/LessEqual.cu + src/binaryop/compiled/GreaterEqual.cu + src/binaryop/compiled/Div.cu + src/binaryop/compiled/equality_ops.cu + src/binaryop/compiled/FloorDiv.cu + src/binaryop/compiled/LogBase.cu + src/binaryop/compiled/LogicalAnd.cu + src/binaryop/compiled/LogicalOr.cu + src/binaryop/compiled/Mod.cu + src/binaryop/compiled/Mul.cu + src/binaryop/compiled/NullMax.cu + src/binaryop/compiled/NullMin.cu + src/binaryop/compiled/PMod.cu + src/binaryop/compiled/Pow.cu + src/binaryop/compiled/PyMod.cu + src/binaryop/compiled/ShiftLeft.cu + src/binaryop/compiled/ShiftRight.cu + src/binaryop/compiled/ShiftRightUnsigned.cu + src/binaryop/compiled/Sub.cu + src/binaryop/compiled/TrueDiv.cu + src/binaryop/compiled/util.cpp src/labeling/label_bins.cu src/bitmask/null_mask.cu src/bitmask/is_element_valid.cpp @@ -194,14 +224,16 @@ add_library(cudf src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/groupby.cu + src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu - src/groupby/sort/aggregate.cpp src/groupby/sort/group_collect.cu - src/groupby/sort/group_merge_lists.cu src/groupby/sort/group_count.cu + src/groupby/sort/group_m2.cu src/groupby/sort/group_max.cu src/groupby/sort/group_min.cu + src/groupby/sort/group_merge_lists.cu + src/groupby/sort/group_merge_m2.cu src/groupby/sort/group_nth_element.cu src/groupby/sort/group_nunique.cu src/groupby/sort/group_product.cu @@ -272,7 +304,7 @@ add_library(cudf src/join/join.cu src/join/semi_join.cu src/lists/contains.cu - src/lists/combine/concatenate_list_elements.cu + src/lists/combine/concatenate_list_elements.cu src/lists/combine/concatenate_rows.cu src/lists/copying/concatenate.cu src/lists/copying/copying.cu @@ -354,6 +386,7 @@ add_library(cudf src/strings/convert/convert_urls.cu src/strings/copying/concatenate.cu src/strings/copying/copying.cu + src/strings/copying/shift.cu src/strings/extract.cu src/strings/filling/fill.cu src/strings/filter_chars.cu @@ -411,6 +444,7 @@ add_library(cudf src/unary/nan_ops.cu src/unary/null_ops.cu src/utilities/default_stream.cpp + src/utilities/type_checks.cpp ) set_target_properties(cudf @@ -575,6 +609,8 @@ if(CUDF_BUILD_BENCHMARKS) GIT_SHALLOW TRUE OPTIONS "BENCHMARK_ENABLE_TESTING OFF" "BENCHMARK_ENABLE_INSTALL OFF") + # Find or install NVBench + include(cmake/thirdparty/CUDF_GetNVBench.cmake) add_subdirectory(benchmarks) endif() diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 
e8ccb24f44c..e5bee4771df 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -50,11 +50,19 @@ target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) function(ConfigureBench CMAKE_BENCH_NAME) add_executable(${CMAKE_BENCH_NAME} ${ARGN}) set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main) endfunction() +function(ConfigureNVBench CMAKE_BENCH_NAME) + add_executable(${CMAKE_BENCH_NAME} ${ARGN}) + set_target_properties(${CMAKE_BENCH_NAME} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") + target_link_libraries(${CMAKE_BENCH_NAME} + PRIVATE cudf_benchmark_common cudf_datagen nvbench::main) +endfunction() + ################################################################################################### # - column benchmarks ----------------------------------------------------------------------------- ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate_benchmark.cpp) @@ -93,7 +101,7 @@ ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchma ################################################################################################### # - join benchmark -------------------------------------------------------------------------------- -ConfigureBench(JOIN_BENCH join/join_benchmark.cu) +ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) ################################################################################################### # - iterator benchmark ---------------------------------------------------------------------------- @@ -195,6 +203,7 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) # - binaryop benchmark ---------------------------------------------------------------------------- ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cpp + binaryop/compiled_binaryop_benchmark.cpp binaryop/jit_binaryop_benchmark.cpp) ################################################################################################### diff --git a/cpp/benchmarks/ast/transform_benchmark.cpp b/cpp/benchmarks/ast/transform_benchmark.cpp index d39faec3ac4..6f131cf0d6a 100644 --- a/cpp/benchmarks/ast/transform_benchmark.cpp +++ b/cpp/benchmarks/ast/transform_benchmark.cpp @@ -30,9 +30,9 @@ #include #include -#include #include #include +#include #include enum class TreeType { @@ -40,11 +40,11 @@ enum class TreeType { // child column reference }; -template +template class AST : public cudf::benchmark { }; -template +template static void BM_ast_transform(benchmark::State& state) { const cudf::size_type table_size{(cudf::size_type)state.range(0)}; @@ -56,10 +56,24 @@ static void BM_ast_transform(benchmark::State& state) auto columns = std::vector(n_cols); auto data_iterator = thrust::make_counting_iterator(0); - std::generate_n(column_wrappers.begin(), n_cols, [=]() { - return cudf::test::fixed_width_column_wrapper(data_iterator, - data_iterator + table_size); - }); + + if constexpr (Nullable) { + auto validities = std::vector(table_size); + std::random_device rd; + std::mt19937 gen(rd()); + + std::generate( + validities.begin(), validities.end(), [&]() { return gen() > (0.5 * gen.max()); }); + std::generate_n(column_wrappers.begin(), n_cols, [=]() { + return cudf::test::fixed_width_column_wrapper( + data_iterator, data_iterator + table_size, validities.begin()); + }); + } else { + 
std::generate_n(column_wrappers.begin(), n_cols, [=]() { + return cudf::test::fixed_width_column_wrapper(data_iterator, + data_iterator + table_size); + }); + } std::transform( column_wrappers.begin(), column_wrappers.end(), columns.begin(), [](auto const& col) { return static_cast(col); @@ -113,29 +127,32 @@ static void BM_ast_transform(benchmark::State& state) (tree_levels + 1) * sizeof(key_type)); } -#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \ - BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns) \ - (::benchmark::State & st) { BM_ast_transform(st); } - -AST_TRANSFORM_BENCHMARK_DEFINE(ast_int32_imbalanced_unique, - int32_t, - TreeType::IMBALANCED_LEFT, - false); -AST_TRANSFORM_BENCHMARK_DEFINE(ast_int32_imbalanced_reuse, - int32_t, - TreeType::IMBALANCED_LEFT, - true); -AST_TRANSFORM_BENCHMARK_DEFINE(ast_double_imbalanced_unique, - double, - TreeType::IMBALANCED_LEFT, - false); +#define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \ + (::benchmark::State & st) { BM_ast_transform(st); } + +AST_TRANSFORM_BENCHMARK_DEFINE( + ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); +AST_TRANSFORM_BENCHMARK_DEFINE( + ast_int32_imbalanced_reuse, int32_t, TreeType::IMBALANCED_LEFT, true, false); +AST_TRANSFORM_BENCHMARK_DEFINE( + ast_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false, false); + +AST_TRANSFORM_BENCHMARK_DEFINE( + ast_int32_imbalanced_unique_nulls, int32_t, TreeType::IMBALANCED_LEFT, false, true); +AST_TRANSFORM_BENCHMARK_DEFINE( + ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true); +AST_TRANSFORM_BENCHMARK_DEFINE( + ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true); static void CustomRanges(benchmark::internal::Benchmark* b) { auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; auto operation_counts = std::vector{1, 5, 10}; for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { b->Args({row_count, operation_count}); } + for (auto const& operation_count : operation_counts) { + b->Args({row_count, operation_count}); + } } } @@ -153,3 +170,18 @@ BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique) ->Apply(CustomRanges) ->Unit(benchmark::kMillisecond) ->UseManualTime(); + +BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_unique_nulls) + ->Apply(CustomRanges) + ->Unit(benchmark::kMillisecond) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(AST, ast_int32_imbalanced_reuse_nulls) + ->Apply(CustomRanges) + ->Unit(benchmark::kMillisecond) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(AST, ast_double_imbalanced_unique_nulls) + ->Apply(CustomRanges) + ->Unit(benchmark::kMillisecond) + ->UseManualTime(); diff --git a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp index 753dcc83b54..314d657679b 100644 --- a/cpp/benchmarks/binaryop/binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp @@ -113,7 +113,9 @@ static void CustomRanges(benchmark::internal::Benchmark* b) auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; auto operation_counts = std::vector{1, 2, 5, 10}; for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { b->Args({row_count, operation_count}); } + for (auto const& operation_count : 
operation_counts) { + b->Args({row_count, operation_count}); + } } } diff --git a/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp new file mode 100644 index 00000000000..aa86f3bedf8 --- /dev/null +++ b/cpp/benchmarks/binaryop/compiled_binaryop_benchmark.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +#include + +template +class COMPILED_BINARYOP : public cudf::benchmark { +}; + +template +void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) +{ + const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + + auto data_it = thrust::make_counting_iterator(0); + cudf::test::fixed_width_column_wrapper input1(data_it, data_it + column_size); + cudf::test::fixed_width_column_wrapper input2(data_it, data_it + column_size); + + auto lhs = cudf::column_view(input1); + auto rhs = cudf::column_view(input2); + auto output_dtype = cudf::data_type(cudf::type_to_id()); + + // Call once for hot cache. + cudf::experimental::binary_operation(lhs, rhs, binop, output_dtype); + + for (auto _ : state) { + cuda_event_timer timer(state, true); + cudf::experimental::binary_operation(lhs, rhs, binop, output_dtype); + } +} + +// TODO tparam boolean for null. 
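(Reading aid, not part of the patch: one instantiation of the macro defined below, e.g. BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t) from the list at the end of the file, expands to roughly the following sketch. The explicit template-argument list on BM_compiled_binaryop is an assumption; everything else mirrors the macro body.)

BENCHMARK_TEMPLATE_DEFINE_F(
  COMPILED_BINARYOP, ADD, float, int64_t, int32_t, cudf::binary_operator::ADD)
(::benchmark::State& st)
{
  // Builds two counting-sequence columns, makes one warm-up call, then times
  // cudf::experimental::binary_operation per iteration (see BM_compiled_binaryop above).
  BM_compiled_binaryop<float, int64_t, int32_t>(st, cudf::binary_operator::ADD);  // template args assumed
}
BENCHMARK_REGISTER_F(COMPILED_BINARYOP, ADD)
  ->Unit(benchmark::kMicrosecond)
  ->UseManualTime()
  ->Arg(10000)      /* 10k */
  ->Arg(100000000); /* 100M; intermediate sizes as in the macro */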
+#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ + BENCHMARK_TEMPLATE_DEFINE_F( \ + COMPILED_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ + (::benchmark::State & st) \ + { \ + BM_compiled_binaryop(st, cudf::binary_operator::binop); \ + } \ + BENCHMARK_REGISTER_F(COMPILED_BINARYOP, binop) \ + ->Unit(benchmark::kMicrosecond) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ + ->Arg(100000000); /* 100M */ + +using namespace cudf; +using namespace numeric; + +// clang-format off +BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); +BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms); +BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); +BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); +BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double); +BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double); +BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); +BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); +BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t); +BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); +BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); +BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); +BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); +BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); +BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); +BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); diff --git a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp index 29ca02a843d..3c02f47eeb7 100644 --- a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp +++ b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp @@ -23,7 +23,7 @@ #include -template +template class JIT_BINARYOP : public cudf::benchmark { }; @@ -50,22 +50,24 @@ void BM_binaryop(benchmark::State& state, cudf::binary_operator binop) } // TODO tparam boolean for null. 
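(Context note: the JIT benchmark rewritten below shares its setup with the compiled benchmark above; the two differ mainly in the entry point under test. A minimal sketch of the timed region, assuming the 21.08 API in which cudf::binary_operation compiles its kernel at run time via Jitify while cudf::experimental::binary_operation is the precompiled path:)

for (auto _ : state) {
  cuda_event_timer timer(state, true);  // flush_l2_cache = true; reports GPU time via CUDA events
  cudf::binary_operation(lhs, rhs, binop, output_dtype);  // JIT path (assumed call, mirroring the compiled variant)
}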
-#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ - BENCHMARK_TEMPLATE_DEFINE_F(JIT_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut) \ - (::benchmark::State & st) \ - { \ - BM_binaryop(st, cudf::binary_operator::binop); \ - } \ - BENCHMARK_REGISTER_F(JIT_BINARYOP, binop) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ +#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ + BENCHMARK_TEMPLATE_DEFINE_F( \ + JIT_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut, cudf::binary_operator::binop) \ + (::benchmark::State & st) \ + { \ + BM_binaryop(st, cudf::binary_operator::binop); \ + } \ + BENCHMARK_REGISTER_F(JIT_BINARYOP, binop) \ + ->Unit(benchmark::kMicrosecond) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ ->Arg(100000000); /* 100M */ using namespace cudf; +using namespace numeric; // clang-format off BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); @@ -75,16 +77,23 @@ BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int6 BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); +BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); +BINARYOP_BENCHMARK_DEFINE(int32_t, uint8_t, PYMOD, int64_t); BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double); +BINARYOP_BENCHMARK_DEFINE(float, double, LOG_BASE, double); +BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); +BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); +BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t); BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t); BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); -BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, int16_t); +BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, bool); BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); +BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, EQUAL, bool); +BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NOT_EQUAL, bool); BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); -BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); -BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); -BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); -BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); -BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); +BINARYOP_BENCHMARK_DEFINE(duration_ms, duration_ns, NULL_EQUALS, bool); +BINARYOP_BENCHMARK_DEFINE(decimal32, decimal32, NULL_MAX, decimal32); +BINARYOP_BENCHMARK_DEFINE(timestamp_D, timestamp_s, NULL_MIN, timestamp_s); diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index 591e42ceddf..ea54d4daf05 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -53,7 +53,7 @@ T get_distribution_mean(distribution_params const& dist) auto const range_size = dist.lower_bound < dist.upper_bound ? 
dist.upper_bound - dist.lower_bound : dist.lower_bound - dist.upper_bound; - auto const p = geometric_dist_p(range_size); + auto const p = geometric_dist_p(range_size); if (dist.lower_bound < dist.upper_bound) return dist.lower_bound + (1. / p); else @@ -108,7 +108,8 @@ size_t avg_element_bytes(data_profile const& profile, cudf::type_id tid) /** * @brief Functor that computes a random column element with the given data profile. * - * The implementation is SFINAEd for diffent type groups. Currently only used for fixed-width types. + * The implementation is SFINAEd for different type groups. Currently only used for fixed-width + * types. */ template struct random_value_fn; diff --git a/cpp/benchmarks/common/generate_benchmark_input.hpp b/cpp/benchmarks/common/generate_benchmark_input.hpp index acb8adc98e9..6c2a43a34e2 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.hpp +++ b/cpp/benchmarks/common/generate_benchmark_input.hpp @@ -137,7 +137,7 @@ struct distribution_params< }; /** - * @brief Boolens are parameterized with the probability of getting `true` value. + * @brief Booleans are parameterized with the probability of getting `true` value. */ template struct distribution_params::value>> { @@ -195,7 +195,7 @@ std::vector get_type_or_group(int32_t id); * * If an element of the input vector is a `cudf::type_id` enumerator, function return value simply * includes this type. If an element of the input vector is a `type_group_id` enumerator, function - * return value includes all types coresponding to the group enumerator. + * return value includes all types corresponding to the group enumerator. * * @param ids Vector of integers equal to either a `cudf::type_id` enumerator or a `type_group_id` * enumerator. diff --git a/cpp/benchmarks/hashing/partition_benchmark.cpp b/cpp/benchmarks/hashing/partition_benchmark.cpp index d10b63dc4e1..185f19f28e5 100644 --- a/cpp/benchmarks/hashing/partition_benchmark.cpp +++ b/cpp/benchmarks/hashing/partition_benchmark.cpp @@ -65,7 +65,9 @@ static void CustomRanges(benchmark::internal::Benchmark* b) { for (int columns = 1; columns <= 256; columns *= 16) { for (int partitions = 64; partitions <= 1024; partitions *= 2) { - for (int rows = 1 << 17; rows <= 1 << 21; rows *= 2) { b->Args({rows, columns, partitions}); } + for (int rows = 1 << 17; rows <= 1 << 21; rows *= 2) { + b->Args({rows, columns, partitions}); + } } } } diff --git a/cpp/benchmarks/io/cuio_benchmark_common.cpp b/cpp/benchmarks/io/cuio_benchmark_common.cpp index f2aa216d413..627ac9ccc04 100644 --- a/cpp/benchmarks/io/cuio_benchmark_common.cpp +++ b/cpp/benchmarks/io/cuio_benchmark_common.cpp @@ -94,7 +94,8 @@ std::vector select_column_indexes(int num_cols, column_selection col_sel) (col_sel == column_selection::SECOND_HALF) ? 
num_cols / 2 : 0); break; case column_selection::ALTERNATE: - for (size_t i = 0; i < col_idxs.size(); ++i) col_idxs[i] = 2 * i; + for (size_t i = 0; i < col_idxs.size(); ++i) + col_idxs[i] = 2 * i; break; } return col_idxs; diff --git a/cpp/benchmarks/iterator/iterator_benchmark.cu b/cpp/benchmarks/iterator/iterator_benchmark.cu index 04307f5db25..b4bb99abdde 100644 --- a/cpp/benchmarks/iterator/iterator_benchmark.cu +++ b/cpp/benchmarks/iterator/iterator_benchmark.cu @@ -61,7 +61,7 @@ inline auto reduce_by_cub(OutputIterator result, InputIterator d_in, int num_ite // ----------------------------------------------------------------------------- template -void raw_stream_bench_cub(cudf::column_view &col, rmm::device_uvector &result) +void raw_stream_bench_cub(cudf::column_view& col, rmm::device_uvector& result) { // std::cout << "raw stream cub: " << "\t"; @@ -73,7 +73,7 @@ void raw_stream_bench_cub(cudf::column_view &col, rmm::device_uvector &result }; template -void iterator_bench_cub(cudf::column_view &col, rmm::device_uvector &result) +void iterator_bench_cub(cudf::column_view& col, rmm::device_uvector& result) { // std::cout << "iterator cub " << ( (has_null) ? ": " : ": " ) << "\t"; @@ -91,7 +91,7 @@ void iterator_bench_cub(cudf::column_view &col, rmm::device_uvector &result) // ----------------------------------------------------------------------------- template -void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_uvector &result) +void raw_stream_bench_thrust(cudf::column_view& col, rmm::device_uvector& result) { // std::cout << "raw stream thust: " << "\t\t"; @@ -102,7 +102,7 @@ void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_uvector &res } template -void iterator_bench_thrust(cudf::column_view &col, rmm::device_uvector &result) +void iterator_bench_thrust(cudf::column_view& col, rmm::device_uvector& result) { // std::cout << "iterator thust " << ( (has_null) ? 
": " : ": " ) << "\t"; @@ -124,7 +124,7 @@ class Iterator : public cudf::benchmark { }; template -void BM_iterator(benchmark::State &state) +void BM_iterator(benchmark::State& state) { const cudf::size_type column_size{(cudf::size_type)state.range(0)}; using T = TypeParam; @@ -165,8 +165,8 @@ __device__ thrust::pair operator+(thrust::pair lhs, thrust::pa } // ----------------------------------------------------------------------------- template -void pair_iterator_bench_cub(cudf::column_view &col, - rmm::device_uvector> &result) +void pair_iterator_bench_cub(cudf::column_view& col, + rmm::device_uvector>& result) { thrust::pair init{0, false}; auto d_col = cudf::column_device_view::create(col); @@ -176,8 +176,8 @@ void pair_iterator_bench_cub(cudf::column_view &col, } template -void pair_iterator_bench_thrust(cudf::column_view &col, - rmm::device_uvector> &result) +void pair_iterator_bench_thrust(cudf::column_view& col, + rmm::device_uvector>& result) { thrust::pair init{0, false}; auto d_col = cudf::column_device_view::create(col); @@ -187,7 +187,7 @@ void pair_iterator_bench_thrust(cudf::column_view &col, } template -void BM_pair_iterator(benchmark::State &state) +void BM_pair_iterator(benchmark::State& state) { const cudf::size_type column_size{(cudf::size_type)state.range(0)}; using T = TypeParam; diff --git a/cpp/benchmarks/join/conditional_join_benchmark.cu b/cpp/benchmarks/join/conditional_join_benchmark.cu new file mode 100644 index 00000000000..4a655e29f74 --- /dev/null +++ b/cpp/benchmarks/join/conditional_join_benchmark.cu @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "generate_input_tables.cuh" + +template +class ConditionalJoin : public cudf::benchmark { +}; + +template +static void BM_join(benchmark::State& state, Join JoinFunc) +{ + const cudf::size_type build_table_size{(cudf::size_type)state.range(0)}; + const cudf::size_type probe_table_size{(cudf::size_type)state.range(1)}; + const cudf::size_type rand_max_val{build_table_size * 2}; + const double selectivity = 0.3; + const bool is_build_table_key_unique = true; + + // Generate build and probe tables + cudf::test::UniformRandomGenerator rand_gen(0, build_table_size); + auto build_random_null_mask = [&rand_gen](int size) { + if (Nullable) { + // roughly 25% nulls + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); + return cudf::test::detail::make_null_mask(validity, validity + size); + } else { + return cudf::create_null_mask(size, cudf::mask_state::UNINITIALIZED); + } + }; + + std::unique_ptr build_key_column = [&]() { + return Nullable ? 
cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + build_table_size, + build_random_null_mask(build_table_size)) + : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + build_table_size); + }(); + std::unique_ptr probe_key_column = [&]() { + return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + probe_table_size, + build_random_null_mask(probe_table_size)) + : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + probe_table_size); + }(); + + generate_input_tables( + build_key_column->mutable_view().data(), + build_table_size, + probe_key_column->mutable_view().data(), + probe_table_size, + selectivity, + rand_max_val, + is_build_table_key_unique); + + auto payload_data_it = thrust::make_counting_iterator(0); + cudf::test::fixed_width_column_wrapper build_payload_column( + payload_data_it, payload_data_it + build_table_size); + + cudf::test::fixed_width_column_wrapper probe_payload_column( + payload_data_it, payload_data_it + probe_table_size); + + CHECK_CUDA(0); + + cudf::table_view build_table({build_key_column->view(), build_payload_column}); + cudf::table_view probe_table({probe_key_column->view(), probe_payload_column}); + + // Benchmark the inner join operation + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + // Common column references. + const auto col_ref_left_0 = cudf::ast::column_reference(0); + const auto col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_zero_eq_right_zero = + cudf::ast::expression(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); + + auto result = + JoinFunc(probe_table, build_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL); + } +} + +#define CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::expression binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_inner_join(left, right, binary_pred, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit, int32_t, int32_t, false); +CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit, int64_t, int64_t, false); +CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit_nulls, int32_t, int32_t, true); +CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit_nulls, int64_t, int64_t, true); + +#define CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::expression binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_left_join(left, right, binary_pred, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit, int32_t, int32_t, false); +CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit, int64_t, int64_t, false); +CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit_nulls, int32_t, int32_t, true); +CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_t, int64_t, true); + +#define 
CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::expression binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_full_join(left, right, binary_pred, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit, int32_t, int32_t, false); +CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit, int64_t, int64_t, false); +CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit_nulls, int32_t, int32_t, true); +CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit_nulls, int64_t, int64_t, true); + +#define CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::expression binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_left_anti_join(left, right, binary_pred, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit, + int32_t, + int32_t, + false); +CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit, + int64_t, + int64_t, + false); +CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit_nulls, + int32_t, + int32_t, + true); +CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit_nulls, + int64_t, + int64_t, + true); + +#define CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::ast::expression binary_pred, \ + cudf::null_equality compare_nulls) { \ + return cudf::conditional_left_semi_join(left, right, binary_pred, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit, + int32_t, + int32_t, + false); +CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit, + int64_t, + int64_t, + false); +CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit_nulls, + int32_t, + int32_t, + true); +CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit_nulls, + int64_t, + int64_t, + true); + +// inner join ----------------------------------------------------------------------- +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + // TODO: The below benchmark is slow, but can be useful to validate that the + // code works for large data sets. This benchmark was used to compare to the + // otherwise equivalent nullable benchmark below, which has memory errors for + // sufficiently large data sets. 
+ //->Args({1'000'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +// left join ----------------------------------------------------------------------- +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +// full join ----------------------------------------------------------------------- +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +// left anti-join ------------------------------------------------------------- +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + 
->Args({100'000, 1'000'000}) + ->UseManualTime(); + +// left semi-join ------------------------------------------------------------- +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->UseManualTime(); diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index 285a9241a26..d7f64716e58 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -141,7 +141,7 @@ __global__ void init_probe_tbl(key_type* const probe_tbl, * (e.g. device memory, zero copy memory or unified memory). Each value in the build table * will be from [0,rand_max] and if uniq_build_tbl_keys is true it is ensured that each value * will be uniq in the build table. Each value in the probe table will be also in the build - * table with a propability of selectivity and a random number from + * table with a probability of selectivity and a random number from * [0,rand_max] \setminus \{build_tbl\} otherwise. * * @param[out] build_tbl The build table to generate. Usually the smaller table used to @@ -150,7 +150,7 @@ __global__ void init_probe_tbl(key_type* const probe_tbl, * @param[out] probe_tbl The probe table to generate. Usually the larger table used to * probe into the hash table created from the build table. * @param[in] build_tbl_size number of keys in the build table - * @param[in] selectivity propability with which an element of the probe table is + * @param[in] selectivity probability with which an element of the probe table is * present in the build table. * @param[in] rand_max maximum random number to generate. I.e. random numbers are * integers from [0,rand_max]. @@ -169,7 +169,7 @@ void generate_input_tables(key_type* const build_tbl, // expense of not being that accurate with applying the selectivity an especially more memory // efficient implementations would be to partition the random numbers into two intervals and then // let one table choose random numbers from only one interval and the other only select with - // selectivity propability from the same interval and from the other in the other cases. + // selective probability from the same interval and from the other in the other cases. 
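// Worked example (illustration only; the 0.3 selectivity and table sizes are the
// defaults used by the conditional join benchmarks above): with selectivity = 0.3,
// build_tbl_size = 100'000 and probe_tbl_size = 1'000'000, about
// 0.3 * 1'000'000 = 300'000 probe keys are expected to come from the build
// table's key set, and the remaining ~700'000 are drawn from [0, rand_max]
// excluding those keys.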
static_assert(std::is_signed::value, "key_type needs to be signed for lottery to work"); diff --git a/cpp/benchmarks/reduction/anyall_benchmark.cpp b/cpp/benchmarks/reduction/anyall_benchmark.cpp index 97d66585f8c..3dcb433ec52 100644 --- a/cpp/benchmarks/reduction/anyall_benchmark.cpp +++ b/cpp/benchmarks/reduction/anyall_benchmark.cpp @@ -48,7 +48,7 @@ void BM_reduction_anyall(benchmark::State& state, std::unique_ptr co } #define concat(a, b, c) a##b##c -#define get_agg(op) concat(cudf::make_, op, _aggregation()) +#define get_agg(op) concat(cudf::make_, op, _aggregation()) // TYPE, OP #define RBM_BENCHMARK_DEFINE(name, type, aggregation) \ diff --git a/cpp/benchmarks/search/search_benchmark.cpp b/cpp/benchmarks/search/search_benchmark.cpp index 7fb196fb500..c3529c7e79c 100644 --- a/cpp/benchmarks/search/search_benchmark.cpp +++ b/cpp/benchmarks/search/search_benchmark.cpp @@ -131,7 +131,8 @@ BENCHMARK_DEFINE_F(Search, Table)(::benchmark::State& state) { BM_table(state); static void CustomArguments(benchmark::internal::Benchmark* b) { for (int num_cols = 1; num_cols <= 10; num_cols *= 2) - for (int col_size = 1000; col_size <= 100000000; col_size *= 10) b->Args({num_cols, col_size}); + for (int col_size = 1000; col_size <= 100000000; col_size *= 10) + b->Args({num_cols, col_size}); } BENCHMARK_REGISTER_F(Search, Table) diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask_benchmark.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask_benchmark.cpp index 5cd2278ca14..7246d113ade 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask_benchmark.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask_benchmark.cpp @@ -35,13 +35,15 @@ constexpr cudf::size_type fifty_percent = 50; void percent_range(benchmark::internal::Benchmark* b) { b->Unit(benchmark::kMillisecond); - for (int percent = 0; percent <= 100; percent += 10) b->Args({hundredM, percent}); + for (int percent = 0; percent <= 100; percent += 10) + b->Args({hundredM, percent}); } void size_range(benchmark::internal::Benchmark* b) { b->Unit(benchmark::kMillisecond); - for (int size = tenK; size <= hundredM; size *= 10) b->Args({size, fifty_percent}); + for (int size = tenK; size <= hundredM; size *= 10) + b->Args({size, fifty_percent}); } template @@ -64,9 +66,9 @@ void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns) cudf::size_type const column_size_out = fraction * column_size; int64_t const mask_size = sizeof(bool) * column_size + cudf::bitmask_allocation_size_bytes(column_size); - int64_t const validity_bytes_in = (fraction >= 1.0f / 32) - ? cudf::bitmask_allocation_size_bytes(column_size) - : 4 * column_size_out; + int64_t const validity_bytes_in = (fraction >= 1.0f / 32) + ? 
cudf::bitmask_allocation_size_bytes(column_size) + : 4 * column_size_out; int64_t const validity_bytes_out = cudf::bitmask_allocation_size_bytes(column_size_out); int64_t const column_bytes_out = sizeof(T) * column_size_out; int64_t const column_bytes_in = column_bytes_out; // we only read unmasked inputs diff --git a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp index 16bae725621..8039d7d065f 100644 --- a/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp +++ b/cpp/benchmarks/stream_compaction/drop_duplicates_benchmark.cpp @@ -50,7 +50,7 @@ void BM_compaction(benchmark::State& state, cudf::duplicate_keep_option keep) } #define concat(a, b, c) a##b##c -#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op +#define get_keep(op) cudf::duplicate_keep_option::KEEP_##op // TYPE, OP #define RBM_BENCHMARK_DEFINE(name, type, keep) \ diff --git a/cpp/benchmarks/string/extract_benchmark.cpp b/cpp/benchmarks/string/extract_benchmark.cpp index aa1e59a22bf..161e30c6f25 100644 --- a/cpp/benchmarks/string/extract_benchmark.cpp +++ b/cpp/benchmarks/string/extract_benchmark.cpp @@ -48,7 +48,9 @@ static void BM_extract(benchmark::State& state, int groups) }); std::string pattern; - while (static_cast(pattern.size()) < groups) { pattern += "(\\d+) "; } + while (static_cast(pattern.size()) < groups) { + pattern += "(\\d+) "; + } std::uniform_int_distribution distribution(0, samples.size() - 1); auto elements = cudf::detail::make_counting_transform_iterator( diff --git a/cpp/benchmarks/text/replace_benchmark.cpp b/cpp/benchmarks/text/replace_benchmark.cpp index 8f6704ab1af..0a0e6a1667c 100644 --- a/cpp/benchmarks/text/replace_benchmark.cpp +++ b/cpp/benchmarks/text/replace_benchmark.cpp @@ -41,7 +41,8 @@ static void BM_replace(benchmark::State& state) std::default_random_engine generator; std::uniform_int_distribution tokens_dist(0, words.size() - 1); std::string row; // build a row of random tokens - while (static_cast(row.size()) < n_length) row += words[tokens_dist(generator)]; + while (static_cast(row.size()) < n_length) + row += words[tokens_dist(generator)]; std::uniform_int_distribution position_dist(0, 16); diff --git a/cpp/benchmarks/text/subword_benchmark.cpp b/cpp/benchmarks/text/subword_benchmark.cpp index 3670fa7c9a7..2406ddd39ae 100644 --- a/cpp/benchmarks/text/subword_benchmark.cpp +++ b/cpp/benchmarks/text/subword_benchmark.cpp @@ -37,7 +37,8 @@ static std::string create_hash_vocab_file() std::vector> coefficients(23, {65559, 0}); std::ofstream outfile(hash_file, std::ofstream::out); outfile << "1\n0\n" << coefficients.size() << "\n"; - for (auto c : coefficients) outfile << c.first << " " << c.second << "\n"; + for (auto c : coefficients) + outfile << c.first << " " << c.second << "\n"; std::vector hash_table(23, 0); outfile << hash_table.size() << "\n"; hash_table[0] = 3015668L; @@ -45,7 +46,8 @@ static std::string create_hash_vocab_file() hash_table[5] = 6358029; hash_table[16] = 451412625363L; hash_table[20] = 6206321707968235495L; - for (auto h : hash_table) outfile << h << "\n"; + for (auto h : hash_table) + outfile << h << "\n"; outfile << "100\n101\n102\n\n"; return hash_file; } diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu index b09a7911595..8e51bcca63d 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu @@ -64,7 
+64,9 @@ __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_ using F = Functor; cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x; while (index < n_rows) { - for (int c = 0; c < n_cols; c++) { A[c][index] = F::f(A[c][index]); } + for (int c = 0; c < n_cols; c++) { + A[c][index] = F::f(A[c][index]); + } index += blockDim.x * gridDim.x; } } diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index 0eee5abd2f3..8cef3e8b9d0 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -14,11 +14,10 @@ # limitations under the License. #============================================================================= -function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) +function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON ENABLE_PARQUET) set(ARROW_BUILD_SHARED ON) set(ARROW_BUILD_STATIC OFF) - set(ARROW_BUILD_S3 OFF) set(CPMAddOrFindPackage CPMFindPackage) if(NOT ARROW_ARMV8_ARCH) @@ -36,10 +35,23 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) set(CPMAddOrFindPackage CPMAddPackage) endif() - if(ENABLE_S3) - set(ARROW_BUILD_S3 ON) + set(ARROW_PYTHON_OPTIONS "") + if(ENABLE_PYTHON) + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") + # Arrow's logic to build Boost from source is busted, so we have to get it from the system. + list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") + # Arrow's logic to find Thrift is busted, so we have to build it from + # source. Why can't we use `THRIFT_SOURCE BUNDLED` you might ask? + # Because that's _also_ busted. The only thing that seems to work is to set + # _all_ dependencies to bundled, then optionally un-set BOOST_SOURCE to + # SYSTEM. + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE BUNDLED") endif() + # Set this so Arrow correctly finds the CUDA toolkit when the build machine + # does not have the CUDA driver installed. This must be an env var. + set(ENV{CUDA_LIB_PATH} "${CUDAToolkit_LIBRARY_DIR}/stubs") + cmake_language(CALL ${CPMAddOrFindPackage} NAME Arrow VERSION ${VERSION} @@ -55,7 +67,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) "ARROW_WITH_BACKTRACE ON" "ARROW_CXXFLAGS -w" "ARROW_JEMALLOC OFF" - "ARROW_S3 ${ARROW_BUILD_S3}" + "ARROW_S3 ${ENABLE_S3}" + # e.g. needed by blazingsql-io + "ARROW_PARQUET ${ENABLE_PARQUET}" + ${ARROW_PYTHON_OPTIONS} # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off "ARROW_USE_CCACHE OFF" "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" @@ -98,13 +113,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util") file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/gpu/cuda_version.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/gpu") + if(ENABLE_PARQUET) + file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet") + endif() ### # This shouldn't be necessary! # # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` # and `arrow_shared` targets in FindArrow and FindArrowCUDA respectively, # so for static source-builds, we have to do it after-the-fact. - # + # # This only works because we know exactly which components we're using. # Don't forget to update this list if we add more!
### @@ -127,6 +146,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) endfunction() -set(CUDF_VERSION_Arrow 1.0.1) +set(CUDF_VERSION_Arrow 4.0.1) -find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3}) +find_and_configure_arrow( + ${CUDF_VERSION_Arrow} + ${CUDF_USE_ARROW_STATIC} + ${CUDF_ENABLE_ARROW_S3} + ${CUDF_ENABLE_ARROW_PYTHON} + ${CUDF_ENABLE_ARROW_PARQUET} +) diff --git a/cpp/cmake/thirdparty/CUDF_GetNVBench.cmake b/cpp/cmake/thirdparty/CUDF_GetNVBench.cmake new file mode 100644 index 00000000000..09ceffb284f --- /dev/null +++ b/cpp/cmake/thirdparty/CUDF_GetNVBench.cmake @@ -0,0 +1,34 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +# NVBench doesn't have a public release yet + +function(find_and_configure_nvbench) + + if(TARGET nvbench::main) + return() + endif() + + CPMFindPackage(NAME nvbench + GIT_REPOSITORY https://github.com/NVIDIA/nvbench.git + GIT_TAG main + GIT_SHALLOW TRUE + OPTIONS "NVBench_ENABLE_EXAMPLES OFF" + "NVBench_ENABLE_TESTING OFF") + +endfunction() + +find_and_configure_nvbench() diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 8ec111acdb2..9ec64060847 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -470,7 +470,7 @@ libcudf, and you should not use it in new code in libcudf without careful consid use `rmm::device_uvector` along with the utility factories in `device_factories.hpp`. These utilities enable creation of `uvector`s from host-side vectors, or creating zero-initialized `uvector`s, so that they are as convenient to use as `device_vector`. Avoiding `device_vector` has -a number of benefits, as described in the folling section on `rmm::device_uvector`. +a number of benefits, as described in the following section on `rmm::device_uvector`. #### `rmm::device_uvector` diff --git a/cpp/docs/TESTING.md b/cpp/docs/TESTING.md index 2c7b62b8b6d..3c741b5d4e7 100644 --- a/cpp/docs/TESTING.md +++ b/cpp/docs/TESTING.md @@ -67,7 +67,7 @@ not necessary for your test fixtures to inherit from it. 
Example: ```c++ -class MyTestFiture : public cudf::test::BaseFixture {...}; +class MyTestFixture : public cudf::test::BaseFixture {...}; ``` ## Typed Tests diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 5fab284d506..a2f59de54db 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -67,8 +67,9 @@ class aggregation { ALL, ///< all reduction SUM_OF_SQUARES, ///< sum of squares reduction MEAN, ///< arithmetic mean reduction - VARIANCE, ///< groupwise variance - STD, ///< groupwise standard deviation + M2, ///< sum of squares of differences from the mean + VARIANCE, ///< variance + STD, ///< standard deviation MEDIAN, ///< median reduction QUANTILE, ///< compute specified quantile(s) ARGMAX, ///< Index of max element @@ -78,12 +79,13 @@ class aggregation { ROW_NUMBER, ///< get row-number of current index (relative to rolling window) COLLECT_LIST, ///< collect values into a list COLLECT_SET, ///< collect values into a list without duplicate entries - MERGE_LISTS, ///< merge multiple lists values into one list - MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries LEAD, ///< window function, accesses row at specified offset following current row LAG, ///< window function, accesses row at specified offset preceding current row PTX, ///< PTX UDF based reduction - CUDA ///< CUDA UDF based reduction + CUDA, ///< CUDA UDF based reduction + MERGE_LISTS, ///< merge multiple lists values into one list + MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries + MERGE_M2 ///< merge partial values of M2 aggregation }; aggregation() = delete; @@ -159,6 +161,20 @@ std::unique_ptr make_sum_of_squares_aggregation(); template std::unique_ptr make_mean_aggregation(); +/** + * @brief Factory to create an M2 aggregation + * + * An M2 aggregation is the sum of squares of differences from the mean. That is: + * `M2 = SUM((x - MEAN) * (x - MEAN))`. + * + * This aggregation produces the intermediate values that are used to compute variance and standard + * deviation across multiple discrete sets. See + * `https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm` for more + * detail. + */ +template +std::unique_ptr make_m2_aggregation(); + /** * @brief Factory to create a VARIANCE aggregation * @@ -271,11 +287,33 @@ std::unique_ptr make_collect_set_aggregation(null_policy null_handling = n null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::UNEQUAL); +/// Factory to create a LAG aggregation +template +std::unique_ptr make_lag_aggregation(size_type offset); + +/// Factory to create a LEAD aggregation +template +std::unique_ptr make_lead_aggregation(size_type offset); + +/** + * @brief Factory to create an aggregation based on UDF for PTX or CUDA + * + * @param[in] type: either udf_type::PTX or udf_type::CUDA + * @param[in] user_defined_aggregator A string containing the aggregator code + * @param[in] output_type expected output type + * + * @return aggregation unique pointer housing user_defined_aggregator string. + */ +template +std::unique_ptr make_udf_aggregation(udf_type type, + std::string const& user_defined_aggregator, + data_type output_type); + /** * @brief Factory to create a MERGE_LISTS aggregation. * * Given a lists column, this aggregation merges all the lists corresponding to the same key value - into one list.
It is designed specifically to merge the partial results of multiple (distributed) * groupby `COLLECT_LIST` aggregations into a final `COLLECT_LIST` result. As such, it requires the * input lists column to be non-nullable (the child column containing list entries is not subjected * to this requirement). @@ -290,7 +328,7 @@ std::unique_ptr make_merge_lists_aggregation(); * value into one list, then it drops all the duplicate entries in each lists, producing a lists * column containing non-repeated entries. * - * This aggregation is designed specificly to merge the partial results of multiple (distributed) + * This aggregation is designed specifically to merge the partial results of multiple (distributed) * groupby `COLLECT_LIST` or `COLLECT_SET` aggregations into a final `COLLECT_SET` result. As such, * it requires the input lists column to be non-nullable (the child column containing list entries * is not subjected to this requirement). @@ -308,27 +346,20 @@ template std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::UNEQUAL); -/// Factory to create a LAG aggregation -template -std::unique_ptr make_lag_aggregation(size_type offset); - -/// Factory to create a LEAD aggregation -template -std::unique_ptr make_lead_aggregation(size_type offset); - /** - * @brief Factory to create an aggregation base on UDF for PTX or CUDA + * @brief Factory to create a MERGE_M2 aggregation * - * @param[in] type: either udf_type::PTX or udf_type::CUDA - * @param[in] user_defined_aggregator A string containing the aggregator code - * @param[in] output_type expected output type + * Merges the results of `M2` aggregations on independent sets into a new `M2` value equivalent to + * if a single `M2` aggregation was done across all of the sets at once. This aggregation is only + * valid on structs whose members are the result of the `COUNT_VALID`, `MEAN`, and `M2` aggregations + * on the same sets. The output of this aggregation is a struct containing the merged `COUNT_VALID`, + * `MEAN`, and `M2` aggregations. * - * @return aggregation unique pointer housing user_defined_aggregator string. + * The input `M2` aggregation values are expected to be all non-negative numbers, since they + * were output from `M2` aggregation. */ template -std::unique_ptr make_udf_aggregation(udf_type type, - std::string const& user_defined_aggregator, - data_type output_type); +std::unique_ptr make_merge_m2_aggregation(); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/ast/detail/linearizer.hpp b/cpp/include/cudf/ast/detail/linearizer.hpp index 166a0408703..67474e08877 100644 --- a/cpp/include/cudf/ast/detail/linearizer.hpp +++ b/cpp/include/cudf/ast/detail/linearizer.hpp @@ -103,10 +103,24 @@ class linearizer { /** * @brief Construct a new linearizer object * + * @param expr The expression to create an evaluable linearizer for. + * @param left The left table used for evaluating the abstract syntax tree. + * @param right The right table used for evaluating the abstract syntax tree. + */ + linearizer(detail::node const& expr, cudf::table_view left, cudf::table_view right) + : _left(left), _right(right), _node_count(0), _intermediate_counter() + { + expr.accept(*this); + } + + /** + * @brief Construct a new linearizer object + * + * @param expr The expression to create an evaluable linearizer for. * @param table The table used for evaluating the abstract syntax tree. 
*/ linearizer(detail::node const& expr, cudf::table_view table) - : _table(table), _node_count(0), _intermediate_counter() + : _left(table), _right(table), _node_count(0), _intermediate_counter() { expr.accept(*this); } @@ -217,7 +231,8 @@ class linearizer { cudf::size_type add_data_reference(detail::device_data_reference data_ref); // State information about the "linearized" GPU execution plan - cudf::table_view _table; + cudf::table_view const& _left; + cudf::table_view const& _right; cudf::size_type _node_count; intermediate_counter _intermediate_counter; std::vector _data_references; diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index f69927a3601..e56b4fb2281 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -31,6 +31,8 @@ #include +#include + #include #include @@ -40,132 +42,375 @@ namespace ast { namespace detail { -// Forward declaration -struct row_evaluator; +// Type trait for wrapping nullable types in a thrust::optional. Non-nullable +// types are returned as is. +template +struct possibly_null_value; -struct row_output { - public: - __device__ row_output(row_evaluator const& evaluator) : evaluator(evaluator) {} +template +struct possibly_null_value { + using type = thrust::optional; +}; + +template +struct possibly_null_value { + using type = T; +}; + +template +using possibly_null_value_t = typename possibly_null_value::type; + +// Type used for intermediate storage in expression evaluation. +template +using IntermediateDataType = possibly_null_value_t; +/** + * @brief A container for capturing the output of an evaluated expression. + * + * This class is designed to be passed by reference as the first argument to + * expression_evaluator::evaluate. The API is designed such that template + * specializations for specific output types will be able to customize setting + * behavior if necessary. The class leverages CRTP to define a suitable interface + * for the `expression_evaluator` at compile-time and enforce this API on its + * subclasses to get around the lack of device-side polymorphism. + * + * @tparam Subclass The subclass to dispatch methods to. + * @tparam T The underlying data type. + * @tparam has_nulls Whether or not the result data is nullable. + */ +template +struct expression_result { /** - * @brief Resolves an output data reference and assigns result value. - * - * Only output columns (COLUMN) and intermediates (INTERMEDIATE) are supported as output reference - * types. Intermediates must be of fixed width less than or equal to sizeof(std::int64_t). This - * requirement on intermediates is enforced by the linearizer. - * - * @tparam Element Type of result element. - * @param device_data_reference Data reference to resolve. - * @param row_index Row index of data column. - * @param result Value to assign to output. + * Helper function to get the subclass type to dispatch methods to. 
*/ - template ())> - __device__ void resolve_output(detail::device_data_reference device_data_reference, - cudf::size_type row_index, - Element result) const; - // Definition below after row_evaluator is a complete type - - template ())> - __device__ void resolve_output(detail::device_data_reference device_data_reference, - cudf::size_type row_index, - Element result) const + Subclass& subclass() { return static_cast(*this); } + Subclass const& subclass() const { return static_cast(*this); } + + // TODO: The index is ignored by the value subclass, but is included in this + // signature because it is required by the implementation in the template + // specialization for column views. It would be nice to clean this up, see + // the related TODO below. Note that storing the index in the class on + // construction (which would result in a cleaner delineation of the API for + // the derived types) results in a significant performance penalty because + // the index is pushed down the memory hierarchy by the time it needs to be + // used, whereas passing it as a parameter keeps it in registers for fast + // access at the point where indexing occurs. + template + __device__ void set_value(cudf::size_type index, possibly_null_value_t result) { - cudf_assert(false && "Invalid type in resolve_output."); + subclass()->set_value(); } - private: - row_evaluator const& evaluator; + __device__ bool is_valid() const { subclass()->is_valid(); } + + __device__ T value() const { subclass()->value(); } }; -template -struct unary_row_output : public row_output { - __device__ unary_row_output(row_evaluator const& evaluator) : row_output(evaluator) {} +/** + * @brief A container for capturing the output of an evaluated expression in a scalar. + * + * This subclass of `expression_result` functions as an owning container of a + * (possibly nullable) scalar type that can be written to by the + * expression_evaluator. The data (and its validity) can then be accessed. + * + * @tparam T The underlying data type. + * @tparam has_nulls Whether or not the result data is nullable. + */ +template +struct value_expression_result + : public expression_result, T, has_nulls> { + __device__ value_expression_result() {} - template < - ast_operator op, - std::enable_if_t, Input>>* = nullptr> - __device__ void operator()(cudf::size_type row_index, - Input input, - detail::device_data_reference output) const + template + __device__ void set_value(cudf::size_type index, possibly_null_value_t result) { - using OperatorFunctor = detail::operator_functor; - using Out = cuda::std::invoke_result_t; - resolve_output(output, row_index, OperatorFunctor{}(input)); + if constexpr (std::is_same_v) { + _obj = result; + } else { + cudf_assert(false && "Output type does not match container type."); + } } - template < - ast_operator op, - std::enable_if_t, Input>>* = nullptr> - __device__ void operator()(cudf::size_type row_index, - Input input, - detail::device_data_reference output) const + /** + * @brief Returns true if the underlying data is valid and false otherwise. + */ + __device__ bool is_valid() const { - cudf_assert(false && "Invalid unary dispatch operator for the provided input."); + if constexpr (has_nulls) { return _obj.has_value(); } + return true; } + + /** + * @brief Returns the underlying data. + * + * @throws thrust::bad_optional_access if the underlying data is not valid. + */ + __device__ T value() const + { + // Using two separate constexprs silences compiler warnings, whereas an + // if/else does not. 
An unconditional return is not ignored by the compiler + // when has_nulls is true and therefore raises a compiler error. + if constexpr (has_nulls) { return _obj.value(); } + if constexpr (!has_nulls) { return _obj; } + } + + possibly_null_value_t + _obj; ///< The underlying data value, or a nullable version of it. }; -template -struct binary_row_output : public row_output { - __device__ binary_row_output(row_evaluator const& evaluator) : row_output(evaluator) {} - - template < - ast_operator op, - std::enable_if_t, LHS, RHS>>* = nullptr> - __device__ void operator()(cudf::size_type row_index, - LHS lhs, - RHS rhs, - detail::device_data_reference output) const +// TODO: The below implementation significantly differs from the default +// implementation above due to the non-owning nature of the container and the +// usage of the index. It would be ideal to unify these further if possible. + +/** + * @brief A container for capturing the output of an evaluated expression in a column. + * + * This subclass of `expression_result` functions as a non-owning container + * that transparently passes calls through to an underlying mutable view to a + * column. Not all methods are implemented + * + * @tparam has_nulls Whether or not the result data is nullable. + */ +template +struct mutable_column_expression_result + : public expression_result, + mutable_column_device_view, + has_nulls> { + __device__ mutable_column_expression_result(mutable_column_device_view& obj) : _obj(obj) {} + + template + __device__ void set_value(cudf::size_type index, possibly_null_value_t result) + { + if constexpr (has_nulls) { + if (result.has_value()) { + _obj.template element(index) = *result; + _obj.set_valid(index); + } else { + _obj.set_null(index); + } + } else { + _obj.template element(index) = result; + } + } + + /** + * @brief Not implemented for this specialization. + */ + __device__ bool is_valid() const { - using OperatorFunctor = detail::operator_functor; - using Out = cuda::std::invoke_result_t; - resolve_output(output, row_index, OperatorFunctor{}(lhs, rhs)); + // Not implemented since it would require modifying the API in the parent class to accept an + // index. + cudf_assert(false && "This method is not implemented."); } - template , LHS, RHS>>* = - nullptr> - __device__ void operator()(cudf::size_type row_index, - LHS lhs, - RHS rhs, - detail::device_data_reference output) const + /** + * @brief Not implemented for this specialization. + */ + __device__ mutable_column_device_view value() const { - cudf_assert(false && "Invalid binary dispatch operator for the provided input."); + // Not implemented since it would require modifying the API in the parent class to accept an + // index. + cudf_assert(false && "This method is not implemented."); } + + mutable_column_device_view& _obj; ///< The column to which the data is written. }; /** - * @brief An expression evaluator owned by a single thread operating on rows of a table. + * @brief A container of all device data required to evaluate an expression on tables. + * + * This struct should never be instantiated directly. It is created by the + * `ast_plan` on construction, and the resulting member is publicly accessible + * for passing to kernels for constructing an `expression_evaluator`. * - * This class is designed for n-ary transform evaluation. Currently this class assumes that there's - * only one relevant "row index" in its methods, which corresponds to a row in a single input table - * and the same row index in an output column. 
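The `expression_result` documentation above leans on CRTP to recover polymorphism without virtual dispatch, which device code cannot use efficiently. A stripped-down host-only sketch of that pattern (illustrative names, a single data member, and none of the real class's template machinery) may make the mechanism easier to follow:

```cpp
#include <iostream>

// Base class dispatches to the derived class at compile time via static_cast;
// no vtable or virtual calls are involved.
template <typename Subclass, typename T>
struct expression_result_sketch {
  void set_value(T v) { static_cast<Subclass&>(*this).set_value_impl(v); }
  T value() const { return static_cast<Subclass const&>(*this).value_impl(); }
};

// A concrete "owning scalar" result, analogous in spirit to the
// value_expression_result specialization in the surrounding diff.
struct scalar_result_sketch : expression_result_sketch<scalar_result_sketch, int> {
  void set_value_impl(int v) { _v = v; }
  int value_impl() const { return _v; }
  int _v{};
};

int main()
{
  scalar_result_sketch r;
  r.set_value(42);
  std::cout << r.value() << "\n";  // prints 42
}
```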
*/ -struct row_evaluator { - friend struct row_output; - template - friend struct unary_row_output; - template - friend struct binary_row_output; +struct device_ast_plan { + device_span data_references; + device_span literals; + device_span operators; + device_span operator_source_indices; + cudf::size_type num_intermediates; + int shmem_per_thread; +}; + +/** + * @brief Preprocessor for an expression acting on tables to generate data suitable for AST + * expression evaluation on the GPU. + * + * On construction, an AST plan creates a single "packed" host buffer of all + * data arrays that will be necessary to evaluate an expression on a pair of + * tables. This data is copied to a single contiguous device buffer, and + * pointers are generated to the individual components. Because the plan tends + * to be small, this is the most efficient approach for low latency. All the + * data required on the GPU can be accessed via the convenient `dev_plan` + * member struct, which can be used to construct an `expression_evaluator` on + * the device. + * + * Note that the resulting device data cannot be used once this class goes out of scope. + */ +struct ast_plan { + /** + * @brief Construct an AST plan for an expression operating on two tables. + * + * @param expr The expression for which to construct a plan. + * @param left The left table on which the expression acts. + * @param right The right table on which the expression acts. + * @param has_nulls Boolean indicator of whether or not the data contains nulls. + * @param stream Stream view on which to allocate resources and queue execution. + * @param mr Device memory resource used to allocate the returned column's device. + */ + ast_plan(detail::node const& expr, + cudf::table_view left, + cudf::table_view right, + bool has_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _linearizer(expr, left, right) + { + std::vector sizes; + std::vector data_pointers; + + extract_size_and_pointer(_linearizer.data_references(), sizes, data_pointers); + extract_size_and_pointer(_linearizer.literals(), sizes, data_pointers); + extract_size_and_pointer(_linearizer.operators(), sizes, data_pointers); + extract_size_and_pointer(_linearizer.operator_source_indices(), sizes, data_pointers); + + // Create device buffer + auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); + auto buffer_offsets = std::vector(sizes.size()); + thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0); + + auto h_data_buffer = std::make_unique(buffer_size); + for (unsigned int i = 0; i < data_pointers.size(); ++i) { + std::memcpy(h_data_buffer.get() + buffer_offsets[i], data_pointers[i], sizes[i]); + } + + _device_data_buffer = rmm::device_buffer(h_data_buffer.get(), buffer_size, stream, mr); + stream.synchronize(); + + // Create device pointers to components of plan + auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); + dev_plan.data_references = device_span( + reinterpret_cast(device_data_buffer_ptr + + buffer_offsets[0]), + _linearizer.data_references().size()); + dev_plan.literals = device_span( + reinterpret_cast( + device_data_buffer_ptr + buffer_offsets[1]), + _linearizer.literals().size()); + dev_plan.operators = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), + _linearizer.operators().size()); + dev_plan.operator_source_indices = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), + _linearizer.operator_source_indices().size()); + 
dev_plan.num_intermediates = _linearizer.intermediate_count(); + dev_plan.shmem_per_thread = static_cast( + (has_nulls ? sizeof(IntermediateDataType) : sizeof(IntermediateDataType)) * + dev_plan.num_intermediates); + } + + /** + * @brief Construct an AST plan for an expression operating on one table. + * + * @param expr The expression for which to construct a plan. + * @param table The table on which the expression acts. + * @param has_nulls Boolean indicator of whether or not the data contains nulls. + * @param stream Stream view on which to allocate resources and queue execution. + * @param mr Device memory resource used to allocate the returned column's device. + */ + ast_plan(detail::node const& expr, + cudf::table_view table, + bool has_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : ast_plan(expr, table, table, has_nulls, stream, mr) + { + } + + cudf::data_type output_type() const { return _linearizer.root_data_type(); } + + device_ast_plan + dev_plan; ///< The collection of data required to evaluate the expression on the device. + + private: + /** + * @brief Helper function for adding components (operators, literals, etc) to AST plan + * + * @tparam T The underlying type of the input `std::vector` + * @param[in] v The `std::vector` containing components (operators, literals, etc). + * @param[in,out] sizes The `std::vector` containing the size of each data buffer. + * @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer. + */ + template + void extract_size_and_pointer(std::vector const& v, + std::vector& sizes, + std::vector& data_pointers) + { + auto const data_size = sizeof(T) * v.size(); + sizes.push_back(data_size); + data_pointers.push_back(v.data()); + } + + rmm::device_buffer + _device_data_buffer; ///< The device-side data buffer containing the plan information, which is + ///< owned by this class and persists until it is destroyed. + linearizer const _linearizer; ///< The linearizer created from the provided expression that is + ///< used to construct device-side operators and references. +}; + +/** + * @brief The principal object for evaluating AST expressions on device. + * + * This class is designed for n-ary transform evaluation. It operates on two + * tables. + */ +template +struct expression_evaluator { public: /** - * @brief Construct a row evaluator. + * @brief Construct an expression evaluator acting on two tables. + * + * @param left View of the left table view used for evaluation. + * @param right View of the right table view used for evaluation. + * @param plan The collection of device references representing the expression to evaluate. + * @param thread_intermediate_storage Pointer to this thread's portion of shared memory for + * storing intermediates. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + + */ + __device__ expression_evaluator(table_device_view const& left, + table_device_view const& right, + device_ast_plan const& plan, + IntermediateDataType* thread_intermediate_storage, + null_equality compare_nulls = null_equality::EQUAL) + : left(left), + right(right), + plan(plan), + thread_intermediate_storage(thread_intermediate_storage), + compare_nulls(compare_nulls) + { + } + + /** + * @brief Construct an expression evaluator acting on one table. * - * @param table The table device view used for evaluation. - * @param literals Array of literal values used for evaluation. + * @param table View of the table view used for evaluation. 
+ * @param plan The collection of device references representing the expression to evaluate. * @param thread_intermediate_storage Pointer to this thread's portion of shared memory for * storing intermediates. - * @param output_column The output column where results are stored. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. */ - __device__ row_evaluator( - table_device_view const& table, - device_span literals, - std::int64_t* thread_intermediate_storage, - mutable_column_device_view* output_column) - : table(table), - literals(literals), + __device__ expression_evaluator(table_device_view const& table, + device_ast_plan const& plan, + IntermediateDataType* thread_intermediate_storage, + null_equality compare_nulls = null_equality::EQUAL) + : left(table), + right(table), + plan(plan), thread_intermediate_storage(thread_intermediate_storage), - output_column(output_column) + compare_nulls(compare_nulls) { } @@ -177,241 +422,437 @@ struct row_evaluator { * sizeof(std::int64_t). This requirement on intermediates is enforced by the linearizer. * * @tparam Element Type of element to return. + * @tparam has_nulls Whether or not the result data is nullable. * @param device_data_reference Data reference to resolve. * @param row_index Row index of data column. - * @return Element + * @return Element The type- and null-resolved data. */ template ())> - __device__ Element resolve_input(detail::device_data_reference device_data_reference, - cudf::size_type row_index) const + __device__ possibly_null_value_t resolve_input( + detail::device_data_reference device_data_reference, cudf::size_type row_index) const { auto const data_index = device_data_reference.data_index; auto const ref_type = device_data_reference.reference_type; + // TODO: Everywhere in the code assumes that the table reference is either + // left or right. Should we error-check somewhere to prevent + // table_reference::OUTPUT from being specified? + auto const& table = device_data_reference.table_source == table_reference::LEFT ? left : right; + using ReturnType = possibly_null_value_t; if (ref_type == detail::device_data_reference_type::COLUMN) { - return table.column(data_index).element(row_index); + // If we have nullable data, return an empty nullable type with no value if the data is null. + if constexpr (has_nulls) { + return table.column(data_index).is_valid(row_index) + ? ReturnType(table.column(data_index).element(row_index)) + : ReturnType(); + + } else { + return ReturnType(table.column(data_index).element(row_index)); + } } else if (ref_type == detail::device_data_reference_type::LITERAL) { - return literals[data_index].value(); + return ReturnType(plan.literals[data_index].value()); } else { // Assumes ref_type == detail::device_data_reference_type::INTERMEDIATE // Using memcpy instead of reinterpret_cast for safe type aliasing // Using a temporary variable ensures that the compiler knows the result is aligned - std::int64_t intermediate = thread_intermediate_storage[data_index]; - Element tmp; - memcpy(&tmp, &intermediate, sizeof(Element)); + IntermediateDataType intermediate = thread_intermediate_storage[data_index]; + ReturnType tmp; + memcpy(&tmp, &intermediate, sizeof(ReturnType)); return tmp; } + // Unreachable return used to silence compiler warnings. 
+ return {}; } template ())> - __device__ Element resolve_input(detail::device_data_reference device_data_reference, - cudf::size_type row_index) const + __device__ possibly_null_value_t resolve_input( + detail::device_data_reference device_data_reference, cudf::size_type row_index) const { cudf_assert(false && "Unsupported type in resolve_input."); + // Unreachable return used to silence compiler warnings. return {}; } /** * @brief Callable to perform a unary operation. * - * @tparam OperatorFunctor Functor that performs desired operation when `operator()` is called. * @tparam Input Type of input value. - * @param row_index Row index of data column(s). + * @tparam OutputType The container type that data will be inserted into. + * + * @param output_object The container that data will be inserted into. + * @param input_row_index The row to pull the data from the input table. * @param input Input data reference. * @param output Output data reference. + * @param output_row_index The row in the output to insert the result. + * @param op The operator to act with. */ - template - __device__ void operator()(cudf::size_type row_index, - detail::device_data_reference input, - detail::device_data_reference output, - ast_operator op) const + template + __device__ void operator()(OutputType& output_object, + const cudf::size_type input_row_index, + const detail::device_data_reference input, + const detail::device_data_reference output, + const cudf::size_type output_row_index, + const ast_operator op) const { - auto const typed_input = resolve_input(input, row_index); - ast_operator_dispatcher(op, unary_row_output(*this), row_index, typed_input, output); + auto const typed_input = resolve_input(input, input_row_index); + ast_operator_dispatcher(op, + unary_expression_output_handler(*this), + output_object, + output_row_index, + typed_input, + output); } /** - * @brief Callable to perform a binary operation. + * @brief Callable to perform a binary operation. + * + * @tparam LHS Type of the left input value. + * @tparam RHS Type of the right input value. + * @tparam OutputType The container type that data will be inserted into. * - * @tparam OperatorFunctor Functor that performs desired operation when `operator()` is called. - * @tparam LHS Type of left input value. - * @tparam RHS Type of right input value. - * @param row_index Row index of data column(s). + * @param output_object The container that data will be inserted into. + * @param left_row_index The row to pull the data from the left table. + * @param right_row_index The row to pull the data from the right table. * @param lhs Left input data reference. * @param rhs Right input data reference. * @param output Output data reference. + * @param output_row_index The row in the output to insert the result. + * @param op The operator to act with.
*/ - template - __device__ void operator()(cudf::size_type row_index, - detail::device_data_reference lhs, - detail::device_data_reference rhs, - detail::device_data_reference output, - ast_operator op) const + template + __device__ void operator()(OutputType& output_object, + const cudf::size_type left_row_index, + const cudf::size_type right_row_index, + const detail::device_data_reference lhs, + const detail::device_data_reference rhs, + const detail::device_data_reference output, + const cudf::size_type output_row_index, + const ast_operator op) const { - auto const typed_lhs = resolve_input(lhs, row_index); - auto const typed_rhs = resolve_input(rhs, row_index); - ast_operator_dispatcher( - op, binary_row_output(*this), row_index, typed_lhs, typed_rhs, output); + auto const typed_lhs = resolve_input(lhs, left_row_index); + auto const typed_rhs = resolve_input(rhs, right_row_index); + ast_operator_dispatcher(op, + binary_expression_output_handler(*this), + output_object, + output_row_index, + typed_lhs, + typed_rhs, + output); } template >* = nullptr> - __device__ void operator()(cudf::size_type row_index, - detail::device_data_reference lhs, - detail::device_data_reference rhs, - detail::device_data_reference output) const + __device__ void operator()(OutputType& output_object, + cudf::size_type left_row_index, + cudf::size_type right_row_index, + const detail::device_data_reference lhs, + const detail::device_data_reference rhs, + const detail::device_data_reference output, + cudf::size_type output_row_index, + const ast_operator op) const { cudf_assert(false && "Invalid binary dispatch operator for the provided input."); } - private: - table_device_view const& table; - device_span literals; - std::int64_t* thread_intermediate_storage; - mutable_column_device_view* output_column; -}; - -template ()>*> -__device__ void row_output::resolve_output(detail::device_data_reference device_data_reference, - cudf::size_type row_index, - Element result) const -{ - auto const ref_type = device_data_reference.reference_type; - if (ref_type == detail::device_data_reference_type::COLUMN) { - evaluator.output_column->element(row_index) = result; - } else { // Assumes ref_type == detail::device_data_reference_type::INTERMEDIATE - // Using memcpy instead of reinterpret_cast for safe type aliasing. - // Using a temporary variable ensures that the compiler knows the result is aligned. - std::int64_t tmp; - memcpy(&tmp, &result, sizeof(Element)); - evaluator.thread_intermediate_storage[device_data_reference.data_index] = tmp; + /** + * @brief Evaluate an expression applied to a row. + * + * This function performs an n-ary transform for one row on one thread. + * + * @tparam OutputType The container type that data will be inserted into. + * + * @param output_object The container that data will be inserted into. + * @param row_index Row index of all input and output data column(s). + */ + template + __device__ void evaluate(OutputType& output_object, cudf::size_type const row_index) + { + evaluate(output_object, row_index, row_index, row_index); } -} -/** - * @brief Evaluate an expression applied to a row. - * - * This function performs an n-ary transform for one row on one thread. - * - * @param evaluator The row evaluator used for evaluation. - * @param data_references Array of data references. - * @param operators Array of operators to perform. - * @param operator_source_indices Array of source indices for the operators. - * @param num_operators Number of operators. 
- * @param row_index Row index of data column(s). - */ -__device__ void evaluate_row_expression( - detail::row_evaluator const& evaluator, - device_span data_references, - device_span operators, - device_span operator_source_indices, - cudf::size_type row_index) -{ - auto operator_source_index = static_cast(0); - for (cudf::size_type operator_index = 0; operator_index < operators.size(); operator_index++) { - // Execute operator - auto const op = operators[operator_index]; - auto const arity = ast_operator_arity(op); - if (arity == 1) { - // Unary operator - auto const input = data_references[operator_source_indices[operator_source_index]]; - auto const output = data_references[operator_source_indices[operator_source_index + 1]]; - operator_source_index += arity + 1; - type_dispatcher(input.data_type, evaluator, row_index, input, output, op); - } else if (arity == 2) { - // Binary operator - auto const lhs = data_references[operator_source_indices[operator_source_index]]; - auto const rhs = data_references[operator_source_indices[operator_source_index + 1]]; - auto const output = data_references[operator_source_indices[operator_source_index + 2]]; - operator_source_index += arity + 1; - type_dispatcher(lhs.data_type, - detail::single_dispatch_binary_operator{}, - evaluator, - row_index, - lhs, - rhs, - output, - op); - } else { - cudf_assert(false && "Invalid operator arity."); + /** + * @brief Evaluate an expression applied to a row. + * + * This function performs an n-ary transform for one row on one thread. + * + * @tparam OutputType The container type that data will be inserted into. + * + * @param output_object The container that data will be inserted into. + * @param left_row_index The row to pull the data from the left table. + * @param right_row_index The row to pull the data from the right table. + * @param output_row_index The row in the output to insert the result. + */ + template + __device__ void evaluate(OutputType& output_object, + cudf::size_type const left_row_index, + cudf::size_type const right_row_index, + cudf::size_type const output_row_index) + { + auto operator_source_index = static_cast(0); + for (cudf::size_type operator_index = 0; operator_index < plan.operators.size(); + operator_index++) { + // Execute operator + auto const op = plan.operators[operator_index]; + auto const arity = ast_operator_arity(op); + if (arity == 1) { + // Unary operator + auto const input = + plan.data_references[plan.operator_source_indices[operator_source_index]]; + auto const output = + plan.data_references[plan.operator_source_indices[operator_source_index + 1]]; + operator_source_index += arity + 1; + auto input_row_index = + input.table_source == table_reference::LEFT ? 
left_row_index : right_row_index; + type_dispatcher(input.data_type, + *this, + output_object, + input_row_index, + input, + output, + output_row_index, + op); + } else if (arity == 2) { + // Binary operator + auto const lhs = plan.data_references[plan.operator_source_indices[operator_source_index]]; + auto const rhs = + plan.data_references[plan.operator_source_indices[operator_source_index + 1]]; + auto const output = + plan.data_references[plan.operator_source_indices[operator_source_index + 2]]; + operator_source_index += arity + 1; + type_dispatcher(lhs.data_type, + detail::single_dispatch_binary_operator{}, + *this, + output_object, + left_row_index, + right_row_index, + lhs, + rhs, + output, + output_row_index, + op); + } else { + cudf_assert(false && "Invalid operator arity."); + } } } -} -/** - * @brief The AST plan creates a device buffer of data needed to execute an AST. - * - * On construction, an AST plan creates a single "packed" host buffer of all necessary data arrays, - * and copies that to the device with a single host-device memory copy. Because the plan tends to be - * small, this is the most efficient approach for low latency. - * - */ -struct ast_plan { - ast_plan(linearizer const& expr_linearizer, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _sizes{}, _data_pointers{} - { - add_to_plan(expr_linearizer.data_references()); - add_to_plan(expr_linearizer.literals()); - add_to_plan(expr_linearizer.operators()); - add_to_plan(expr_linearizer.operator_source_indices()); + private: + /** + * @brief Helper struct for type dispatch on the result of an expression. + * + * Evaluating an expression requires multiple levels of type dispatch to + * determine the input types, the operation type, and the output type. This + * helper class is a functor that handles the operator dispatch, invokes the + * operator, and dispatches output writing based on the resulting data type. + */ + struct expression_output_handler { + public: + __device__ expression_output_handler(expression_evaluator const& evaluator) + : evaluator(evaluator) + { + } - // Create device buffer - auto const buffer_size = std::accumulate(_sizes.cbegin(), _sizes.cend(), 0); - auto buffer_offsets = std::vector(_sizes.size()); - thrust::exclusive_scan(_sizes.cbegin(), _sizes.cend(), buffer_offsets.begin(), 0); + /** + * @brief Resolves an output data reference and assigns result value. + * + * Only output columns (COLUMN) and intermediates (INTERMEDIATE) are supported as output + * reference types. Intermediates must be of fixed width less than or equal to + * sizeof(std::int64_t). This requirement on intermediates is enforced by the linearizer. + * + * @tparam Element Type of result element. + * @tparam OutputType The container type that data will be inserted into. + * + * @param output_object The container that data will be inserted into. + * @param device_data_reference Data reference to resolve. + * @param row_index Row index of data column. + * @param result Value to assign to output. 
+ */ + template ())> + __device__ void resolve_output(OutputType& output_object, + const detail::device_data_reference device_data_reference, + const cudf::size_type row_index, + const possibly_null_value_t result) const + { + auto const ref_type = device_data_reference.reference_type; + if (ref_type == detail::device_data_reference_type::COLUMN) { + output_object.template set_value(row_index, result); + } else { // Assumes ref_type == detail::device_data_reference_type::INTERMEDIATE + // Using memcpy instead of reinterpret_cast for safe type aliasing. + // Using a temporary variable ensures that the compiler knows the result is aligned. + IntermediateDataType tmp; + memcpy(&tmp, &result, sizeof(possibly_null_value_t)); + evaluator.thread_intermediate_storage[device_data_reference.data_index] = tmp; + } + } - auto h_data_buffer = std::make_unique(buffer_size); - for (unsigned int i = 0; i < _data_pointers.size(); ++i) { - std::memcpy(h_data_buffer.get() + buffer_offsets[i], _data_pointers[i], _sizes[i]); + template ())> + __device__ void resolve_output(OutputType& output_object, + const detail::device_data_reference device_data_reference, + const cudf::size_type row_index, + const possibly_null_value_t result) const + { + cudf_assert(false && "Invalid type in resolve_output."); } - _device_data_buffer = rmm::device_buffer(h_data_buffer.get(), buffer_size, stream, mr); + protected: + expression_evaluator const& evaluator; + }; - stream.synchronize(); + /** + * @brief Subclass of the expression output handler for unary operations. + * + * This functor's call operator is specialized to handle unary operations, + * which only require a single operand. + */ + template + struct unary_expression_output_handler : public expression_output_handler { + __device__ unary_expression_output_handler(expression_evaluator const& evaluator) + : expression_output_handler(evaluator) + { + } - // Create device pointers to components of plan - auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); - _device_data_references = device_span( - reinterpret_cast(device_data_buffer_ptr + - buffer_offsets[0]), - expr_linearizer.data_references().size()); - _device_literals = device_span( - reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[1]), - expr_linearizer.literals().size()); - _device_operators = device_span( - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), - expr_linearizer.operators().size()); - _device_operator_source_indices = device_span( - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), - expr_linearizer.operator_source_indices().size()); - } + /** + * @brief Callable to perform a unary operation. + * + * @tparam op The operation to perform. + * @tparam OutputType The container type that data will be inserted into. + * + * @param output_object The container that data will be inserted into. + * @param output_row_index The row in the output object to insert the data. + * @param input Input to the operation. + * @param output Output data reference. + */ + template < + ast_operator op, + typename OutputType, + std::enable_if_t, Input>>* = nullptr> + __device__ void operator()(OutputType& output_object, + const cudf::size_type output_row_index, + const possibly_null_value_t input, + const detail::device_data_reference output) const + { + using OperatorFunctor = detail::operator_functor; + using Out = cuda::std::invoke_result_t; + if constexpr (has_nulls) { + auto const result = input.has_value() + ? 
possibly_null_value_t(OperatorFunctor{}(*input)) + : possibly_null_value_t(); + this->template resolve_output(output_object, output, output_row_index, result); + } else { + this->template resolve_output( + output_object, output, output_row_index, OperatorFunctor{}(input)); + } + } + + template < + ast_operator op, + typename OutputType, + std::enable_if_t, Input>>* = nullptr> + __device__ void operator()(OutputType& output_object, + const cudf::size_type output_row_index, + const possibly_null_value_t input, + const detail::device_data_reference output) const + { + cudf_assert(false && "Invalid unary dispatch operator for the provided input."); + } + }; /** - * @brief Helper function for adding components (operators, literals, etc) to AST plan + * @brief Subclass of the expression output handler for binary operations. * - * @tparam T The underlying type of the input `std::vector` - * @param v The `std::vector` containing components (operators, literals, etc) + * This functor's call operator is specialized to handle binary operations, + * which require two operands. */ - template - void add_to_plan(std::vector const& v) - { - auto const data_size = sizeof(T) * v.size(); - _sizes.push_back(data_size); - _data_pointers.push_back(v.data()); - } + template + struct binary_expression_output_handler : public expression_output_handler { + __device__ binary_expression_output_handler(expression_evaluator const& evaluator) + : expression_output_handler(evaluator) + { + } - std::vector _sizes; - std::vector _data_pointers; + /** + * @brief Callable to perform a binary operation. + * + * @tparam op The operation to perform. + * @tparam OutputType The container type that data will be inserted into. + * + * @param output_object The container that data will be inserted into. + * @param output_row_index The row in the output to insert the result. + * @param lhs Left input to the operation. + * @param rhs Right input to the operation. + * @param output Output data reference. + */ + template , LHS, RHS>>* = nullptr> + __device__ void operator()(OutputType& output_object, + const cudf::size_type output_row_index, + const possibly_null_value_t lhs, + const possibly_null_value_t rhs, + const detail::device_data_reference output) const + { + using OperatorFunctor = detail::operator_functor; + using Out = cuda::std::invoke_result_t; + if constexpr (has_nulls) { + if constexpr (op == ast_operator::EQUAL) { + // Special handling of the equality operator based on what kind + // of null handling was requested. + possibly_null_value_t result; + if (!lhs.has_value() && !rhs.has_value()) { + // Case 1: Both null, so the output is based on compare_nulls. + result = possibly_null_value_t(this->evaluator.compare_nulls == + null_equality::EQUAL); + } else if (lhs.has_value() && rhs.has_value()) { + // Case 2: Neither is null, so the output is given by the operation. + result = possibly_null_value_t(OperatorFunctor{}(*lhs, *rhs)); + } else { + // Case 3: One value is null, while the other is not, so we simply propagate nulls. + result = possibly_null_value_t(); + } + this->template resolve_output(output_object, output, output_row_index, result); + } else { + // Default behavior for all other operators is to propagate nulls. + auto result = (lhs.has_value() && rhs.has_value()) + ? 
possibly_null_value_t(OperatorFunctor{}(*lhs, *rhs)) + : possibly_null_value_t(); + this->template resolve_output(output_object, output, output_row_index, result); + } + } else { + this->template resolve_output( + output_object, output, output_row_index, OperatorFunctor{}(lhs, rhs)); + } + } - rmm::device_buffer _device_data_buffer; - device_span _device_data_references; - device_span _device_literals; - device_span _device_operators; - device_span _device_operator_source_indices; + template , LHS, RHS>>* = nullptr> + __device__ void operator()(OutputType& output_object, + const cudf::size_type output_row_index, + const possibly_null_value_t lhs, + const possibly_null_value_t rhs, + const detail::device_data_reference output) const + { + cudf_assert(false && "Invalid binary dispatch operator for the provided input."); + } + }; + + table_device_view const& left; ///< The left table to operate on. + table_device_view const& right; ///< The right table to operate on. + device_ast_plan const& + plan; ///< The container of device data representing the expression to evaluate. + IntermediateDataType* + thread_intermediate_storage; ///< The shared memory store of intermediates produced during + ///< evaluation. + null_equality + compare_nulls; ///< Whether the equality operator returns true or false for two nulls. }; /** diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 7099c29b9df..e6ff6b0eadc 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,37 +42,36 @@ enum class binary_operator : int32_t { FLOOR_DIV, ///< operator / after promoting to 64 bit floating point and then ///< flooring the result MOD, ///< operator % + PMOD, ///< positive modulo operator + ///< If remainder is negative, this returns (remainder + divisor) % divisor + ///< else, it returns (dividend % divisor) PYMOD, ///< operator % but following python's sign rules for negatives POW, ///< lhs ^ rhs + LOG_BASE, ///< logarithm to the base + ATAN2, ///< 2-argument arctangent + SHIFT_LEFT, ///< operator << + SHIFT_RIGHT, ///< operator >> + SHIFT_RIGHT_UNSIGNED, ///< operator >>> (from Java) + ///< Logical right shift. Casts to an unsigned value before shifting. + BITWISE_AND, ///< operator & + BITWISE_OR, ///< operator | + BITWISE_XOR, ///< operator ^ + LOGICAL_AND, ///< operator && + LOGICAL_OR, ///< operator || EQUAL, ///< operator == NOT_EQUAL, ///< operator != LESS, ///< operator < GREATER, ///< operator > LESS_EQUAL, ///< operator <= GREATER_EQUAL, ///< operator >= - BITWISE_AND, ///< operator & - BITWISE_OR, ///< operator | - BITWISE_XOR, ///< operator ^ - LOGICAL_AND, ///< operator && - LOGICAL_OR, ///< operator || - COALESCE, ///< operator x,y x is null ? y : x - GENERIC_BINARY, ///< generic binary operator to be generated with input - ///< ptx code - SHIFT_LEFT, ///< operator << - SHIFT_RIGHT, ///< operator >> - SHIFT_RIGHT_UNSIGNED, ///< operator >>> (from Java) - ///< Logical right shift. Casts to an unsigned value before shifting. 
- LOG_BASE, ///< logarithm to the base - ATAN2, ///< 2-argument arctangent - PMOD, ///< positive modulo operator - ///< If remainder is negative, this returns (remainder + divisor) % divisor - ///< else, it returns (dividend % divisor) NULL_EQUALS, ///< Returns true when both operands are null; false when one is null; the ///< result of equality when both are non-null NULL_MAX, ///< Returns max of operands when both are non-null; returns the non-null ///< operand when one is null; or invalid when both are null NULL_MIN, ///< Returns min of operands when both are non-null; returns the non-null ///< operand when one is null; or invalid when both are null + GENERIC_BINARY, ///< generic binary operator to be generated with input + ///< ptx code INVALID_BINARY ///< invalid operation }; /** @@ -87,6 +86,7 @@ enum class binary_operator : int32_t { * * @param lhs The left operand scalar * @param rhs The right operand column + * @param op The binary operator * @param output_type The desired data type of the output column * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of @@ -112,6 +112,7 @@ std::unique_ptr binary_operation( * * @param lhs The left operand column * @param rhs The right operand scalar + * @param op The binary operator * @param output_type The desired data type of the output column * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of @@ -135,6 +136,7 @@ std::unique_ptr binary_operation( * * @param lhs The left operand column * @param rhs The right operand column + * @param op The binary operator * @param output_type The desired data type of the output column * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of @@ -202,5 +204,89 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op, cudf::data_type const& lhs, cudf::data_type const& rhs); +namespace experimental { +/** + * @brief Performs a binary operation between a scalar and a column. + * + * The output contains the result of `op(lhs, rhs[i])` for all `0 <= i < rhs.size()` + * The scalar is the left operand and the column elements are the right operand. + * This distinction is significant in case of non-commutative binary operations + * + * Regardless of the operator, the validity of the output value is the logical + * AND of the validity of the two operands except NullMin and NullMax (logical OR). + * + * @param lhs The left operand scalar + * @param rhs The right operand column + * @param op The binary operator + * @param output_type The desired data type of the output column + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Output column of `output_type` type containing the result of + * the binary operation + * @throw cudf::logic_error if @p output_type dtype isn't fixed-width + * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical + * operations. + */ +std::unique_ptr binary_operation( + scalar const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Performs a binary operation between a column and a scalar. 
+ * + * The output contains the result of `op(lhs[i], rhs)` for all `0 <= i < lhs.size()` + * The column elements are the left operand and the scalar is the right operand. + * This distinction is significant in case of non-commutative binary operations + * + * Regardless of the operator, the validity of the output value is the logical + * AND of the validity of the two operands except NullMin and NullMax (logical OR). + * + * @param lhs The left operand column + * @param rhs The right operand scalar + * @param op The binary operator + * @param output_type The desired data type of the output column + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Output column of `output_type` type containing the result of + * the binary operation + * @throw cudf::logic_error if @p output_type dtype isn't fixed-width + * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical + * operations. + */ +std::unique_ptr binary_operation( + column_view const& lhs, + scalar const& rhs, + binary_operator op, + data_type output_type, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Performs a binary operation between two columns. + * + * The output contains the result of `op(lhs[i], rhs[i])` for all `0 <= i < lhs.size()` + * + * Regardless of the operator, the validity of the output value is the logical + * AND of the validity of the two operands except NullMin and NullMax (logical OR). + * + * @param lhs The left operand column + * @param rhs The right operand column + * @param op The binary operator + * @param output_type The desired data type of the output column + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Output column of `output_type` type containing the result of + * the binary operation + * @throw cudf::logic_error if @p lhs and @p rhs are different sizes + * @throw cudf::logic_error if @p output_type dtype isn't boolean for comparison and logical + * operations. + * @throw cudf::logic_error if @p output_type dtype isn't fixed-width + */ +std::unique_ptr binary_operation( + column_view const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +} // namespace experimental /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index ee367840644..8decce7f260 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -293,7 +293,7 @@ class column { /** * @brief Implicit conversion operator to a `mutable_column_view`. * - * This allows pasing a `column` object into a function that accepts a + * This allows passing a `column` object into a function that accepts a *`mutable_column_view`. The conversion is automatic. 
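A usage sketch for the `cudf::experimental::binary_operation` overloads declared above. The scalar-on-the-left overload matters for non-commutative operators; the wrapper name and dtype choice here are illustrative, not part of the header:

```cpp
#include <cudf/binaryop.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/scalar/scalar.hpp>

// Computes 10 + rhs[i] for every row. With SUB instead of ADD, the operand
// order (scalar first) would change the result, per the @brief above.
std::unique_ptr<cudf::column> add_ten(cudf::column_view const& rhs)
{
  cudf::numeric_scalar<int32_t> ten{10};
  return cudf::experimental::binary_operation(
    ten, rhs, cudf::binary_operator::ADD, cudf::data_type{cudf::type_id::INT32});
}
```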
* @note Creating a mutable view of a `column` invalidates the `column`'s diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 8cb05ca0bad..02e3eee6b43 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -37,7 +37,7 @@ /** * @file column_device_view.cuh - * @brief Column device view class definitons + * @brief Column device view class definitions */ namespace cudf { @@ -541,7 +541,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * * optional_begin with mode `DYNAMIC` defers the assumption of nullability to * runtime, with the user stating on construction of the iterator if column has nulls. - * `DYNAMIC` mode is nice when an algorithm is going to execute on mutliple + * `DYNAMIC` mode is nice when an algorithm is going to execute on multiple * iterators and you don't want to compile all the combinations of iterator types * * Example: diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index e5424f0fc44..bdb7fd48e60 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -399,7 +399,7 @@ std::unique_ptr make_strings_column( * one more than the total number of strings so the `offsets.back()` is the total number of bytes * in the strings array. `offsets.front()` must always be 0 to point to the beginning of `strings`. * @param[in] null_mask Device span containing the null element indicator bitmask. Arrow format for - * nulls is used for interpeting this bitmask. + * nulls is used for interpreting this bitmask. * @param[in] null_count The number of null string entries. If equal to `UNKNOWN_NULL_COUNT`, the * null count will be computed dynamically on the first invocation of `column::null_count()` * @param[in] stream CUDA stream used for device memory operations and kernel launches. @@ -428,7 +428,7 @@ std::unique_ptr make_strings_column( * strings are identified by the offsets and the nullmask. * @param[in] null_count The number of null string entries. * @param[in] null_mask The bits specifying the null strings in device memory. Arrow format for - * nulls is used for interpeting this bitmask. + * nulls is used for interpreting this bitmask. * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. @@ -491,7 +491,7 @@ std::unique_ptr make_strings_column( * further nested. * @param[in] null_count The number of null list entries. * @param[in] null_mask The bits specifying the null lists in device memory. - * Arrow format for nulls is used for interpeting this bitmask. + * Arrow format for nulls is used for interpreting this bitmask. 
 * @param[in] stream Optional stream for use with all memory allocation
 * and device kernels
 * @param[in] mr Optional resource to use for device memory
diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp
index 82326a21d7d..7ab8cc0f6b1 100644
--- a/cpp/include/cudf/column/column_view.hpp
+++ b/cpp/include/cudf/column/column_view.hpp
@@ -22,7 +22,7 @@
 
 /**
  * @file column_view.hpp
- * @brief column view class definitons
+ * @brief column view class definitions
  */
 
 namespace cudf {
diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp
index 477c53535de..6ab115196d6 100644
--- a/cpp/include/cudf/copying.hpp
+++ b/cpp/include/cudf/copying.hpp
@@ -529,6 +529,7 @@ struct packed_columns {
    * @ingroup copy_split
    */
  struct metadata {
+    metadata() = default;
    metadata(std::vector<uint8_t>&& v) : data_(std::move(v)) {}
    uint8_t const* data() const { return data_.data(); }
    size_t size() const { return data_.size(); }
@@ -537,6 +538,15 @@ struct packed_columns {
    std::vector<uint8_t> data_;
  };
 
+  packed_columns()
+    : metadata_(std::make_unique<metadata>()), gpu_data(std::make_unique<rmm::device_buffer>())
+  {
+  }
+  packed_columns(std::unique_ptr<metadata>&& md, std::unique_ptr<rmm::device_buffer>&& gd)
+    : metadata_(std::move(md)), gpu_data(std::move(gd))
+  {
+  }
+
  std::unique_ptr<metadata> metadata_;
  std::unique_ptr<rmm::device_buffer> gpu_data;
 };
@@ -629,7 +639,7 @@ packed_columns pack(cudf::table_view const& input,
 * guaranteeing that all of the columns in the table point into `contiguous_buffer`.
 *
 * @param input View of the table to pack
- * @param contgiuous_buffer A contiguous buffer of device memory which contains the data referenced
+ * @param contiguous_buffer A contiguous buffer of device memory which contains the data referenced
 * by the columns in `table`
 * @param buffer_size The size of `contiguous_buffer`.
 * @return Vector of bytes representing the metadata used to `unpack` a packed_columns struct.
diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp
index a276769c169..980c824fdf2 100644
--- a/cpp/include/cudf/datetime.hpp
+++ b/cpp/include/cudf/datetime.hpp
@@ -189,6 +189,23 @@ std::unique_ptr<cudf::column> add_calendrical_months(
   cudf::column_view const& timestamps,
   cudf::column_view const& months,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Check if the year of the given date is a leap year
+ *
+ * `output[i] == true` if year of `column[i]` is a leap year
+ * `output[i] == false` if year of `column[i]` is not a leap year
+ * `output[i] is null` if `column[i]` is null
+ *
+ * @param[in] column cudf::column_view of the input datetime values
+ *
+ * @returns cudf::column of datatype BOOL8 with the leap-year truth value for each date
+ * @throw cudf::logic_error if input column datatype is not a TIMESTAMP
+ */
+std::unique_ptr<cudf::column> is_leap_year(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace datetime
 }  // namespace cudf
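A minimal call sketch for the `is_leap_year` API added above (the wrapper name is illustrative):

```cpp
#include <cudf/column/column.hpp>
#include <cudf/datetime.hpp>

// Returns a BOOL8 column: true or false per row, null where `timestamps` is
// null, matching the contract documented above. Throws cudf::logic_error if
// the input is not a TIMESTAMP column.
std::unique_ptr<cudf::column> leap_flags(cudf::column_view const& timestamps)
{
  return cudf::datetime::is_leap_year(timestamps);
}
```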
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index 09763d66403..53c1f47c201 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -643,7 +643,7 @@ struct identity_initializer {
  * The `i`th column will be initialized with the identity value of the `i`th
  * aggregation operation in `aggs`.
 *
- * @throw cudf::logic_error if column type and corresponging agg are incompatible
+ * @throw cudf::logic_error if column type and corresponding agg are incompatible
 * @throw cudf::logic_error if column type is not fixed-width
 *
 * @param table The table of columns to initialize.
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 373d695a5b5..10d9d8c1b92 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -53,6 +53,8 @@ class simple_aggregations_collector {  // Declares the interface for the simple
     data_type col_type, class sum_of_squares_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class mean_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                          class m2_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class var_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
@@ -75,14 +77,16 @@ class simple_aggregations_collector {  // Declares the interface for the simple
     data_type col_type, class collect_list_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class collect_set_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                          class lead_lag_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                          class udf_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class merge_lists_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class merge_sets_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
-                                                          class lead_lag_aggregation const& agg);
-  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
-                                                          class udf_aggregation const& agg);
+                                                          class merge_m2_aggregation const& agg);
 };
 
 class aggregation_finalizer {  // Declares the interface for the finalizer
@@ -98,6 +102,7 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
   virtual void visit(class all_aggregation const& agg);
   virtual void visit(class sum_of_squares_aggregation const& agg);
   virtual void visit(class mean_aggregation const& agg);
+  virtual void visit(class m2_aggregation const& agg);
   virtual void visit(class var_aggregation const& agg);
   virtual void visit(class std_aggregation const& agg);
   virtual void visit(class median_aggregation const& agg);
@@ -109,10 +114,11 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
   virtual void visit(class row_number_aggregation const& agg);
   virtual void visit(class collect_list_aggregation const& agg);
   virtual void visit(class collect_set_aggregation const& agg);
-  virtual void visit(class merge_lists_aggregation const& agg);
-  virtual void visit(class merge_sets_aggregation const& agg);
   virtual void visit(class lead_lag_aggregation const& agg);
   virtual void visit(class udf_aggregation const& agg);
+  virtual void visit(class merge_lists_aggregation const& agg);
+  virtual void visit(class merge_sets_aggregation const& agg);
+  virtual void visit(class merge_m2_aggregation const& agg);
 };
 
 /**
@@ -286,6 +292,25 @@ class mean_aggregation final : public rolling_aggregation {
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 };
 
+/**
+ * @brief Derived class for specifying an M2 aggregation
+ */
+class m2_aggregation : public aggregation {
+ public:
+  m2_aggregation() : aggregation{M2} {}
+
+  std::unique_ptr<aggregation> clone() const override
+  {
+    return std::make_unique<m2_aggregation>(*this);
+  }
+
std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived class for specifying a standard deviation/variance aggregation */ @@ -633,66 +658,6 @@ class collect_set_aggregation final : public rolling_aggregation { } }; -/** - * @brief Derived aggregation class for specifying MERGE_LISTs aggregation - */ -class merge_lists_aggregation final : public aggregation { - public: - explicit merge_lists_aggregation() : aggregation{MERGE_LISTS} {} - - std::unique_ptr clone() const override - { - return std::make_unique(*this); - } - std::vector> get_simple_aggregations( - data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override - { - return collector.visit(col_type, *this); - } - void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } -}; - -/** - * @brief Derived aggregation class for specifying MERGE_SETs aggregation - */ -class merge_sets_aggregation final : public aggregation { - public: - explicit merge_sets_aggregation(null_equality nulls_equal, nan_equality nans_equal) - : aggregation{MERGE_SETS}, _nulls_equal(nulls_equal), _nans_equal(nans_equal) - { - } - - null_equality _nulls_equal; ///< whether to consider nulls as equal value - nan_equality _nans_equal; ///< whether to consider NaNs as equal value (applicable only to - ///< floating point types) - - bool is_equal(aggregation const& _other) const override - { - if (!this->aggregation::is_equal(_other)) { return false; } - auto const& other = dynamic_cast(_other); - return (_nulls_equal == other._nulls_equal && _nans_equal == other._nans_equal); - } - - size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } - - std::unique_ptr clone() const override - { - return std::make_unique(*this); - } - std::vector> get_simple_aggregations( - data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override - { - return collector.visit(col_type, *this); - } - void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } - - protected: - size_t hash_impl() const - { - return std::hash{}(static_cast(_nulls_equal) ^ static_cast(_nans_equal)); - } -}; - /** * @brief Derived aggregation class for specifying LEAD/LAG window aggregations */ @@ -783,6 +748,85 @@ class udf_aggregation final : public rolling_aggregation { } }; +/** + * @brief Derived aggregation class for specifying MERGE_LISTS aggregation + */ +class merge_lists_aggregation final : public aggregation { + public: + explicit merge_lists_aggregation() : aggregation{MERGE_LISTS} {} + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + +/** + * @brief Derived aggregation class for specifying MERGE_SETS aggregation + */ +class merge_sets_aggregation final : public aggregation { + public: + explicit merge_sets_aggregation(null_equality nulls_equal, nan_equality nans_equal) + : aggregation{MERGE_SETS}, _nulls_equal(nulls_equal), _nans_equal(nans_equal) + { + } + + null_equality _nulls_equal; ///< 
whether to consider nulls as equal value + nan_equality _nans_equal; ///< whether to consider NaNs as equal value (applicable only to + ///< floating point types) + + bool is_equal(aggregation const& _other) const override + { + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return (_nulls_equal == other._nulls_equal && _nans_equal == other._nans_equal); + } + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + protected: + size_t hash_impl() const + { + return std::hash{}(static_cast(_nulls_equal) ^ static_cast(_nans_equal)); + } +}; + +/** + * @brief Derived aggregation class for specifying MERGE_M2 aggregation + */ +class merge_m2_aggregation final : public aggregation { + public: + explicit merge_m2_aggregation() : aggregation{MERGE_M2} {} + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Sentinel value used for `ARGMAX` aggregation. * @@ -904,6 +948,12 @@ struct target_type_impl() && is_su using type = Source; }; +// Always use `double` for M2 +template +struct target_type_impl { + using type = double; +}; + // Always use `double` for VARIANCE template struct target_type_impl { @@ -970,6 +1020,18 @@ struct target_type_impl { using type = cudf::list_view; }; +// Always use Source for LEAD +template +struct target_type_impl { + using type = Source; +}; + +// Always use Source for LAG +template +struct target_type_impl { + using type = Source; +}; + // Always use list for MERGE_LISTS template struct target_type_impl { @@ -982,16 +1044,10 @@ struct target_type_impl { using type = cudf::list_view; }; -// Always use Source for LEAD -template -struct target_type_impl { - using type = Source; -}; - -// Always use Source for LAG -template -struct target_type_impl { - using type = Source; +// Always use struct for MERGE_M2 +template +struct target_type_impl { + using type = cudf::struct_view; }; /** @@ -1061,6 +1117,7 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::MEAN: return f.template operator()(std::forward(args)...); + case aggregation::M2: return f.template operator()(std::forward(args)...); case aggregation::VARIANCE: return f.template operator()(std::forward(args)...); case aggregation::STD: @@ -1083,14 +1140,16 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::COLLECT_SET: return f.template operator()(std::forward(args)...); - case aggregation::MERGE_LISTS: - return f.template operator()(std::forward(args)...); - case aggregation::MERGE_SETS: - return f.template operator()(std::forward(args)...); case aggregation::LEAD: return f.template operator()(std::forward(args)...); case aggregation::LAG: 
return f.template operator()(std::forward(args)...); + case aggregation::MERGE_LISTS: + return f.template operator()(std::forward(args)...); + case aggregation::MERGE_SETS: + return f.template operator()(std::forward(args)...); + case aggregation::MERGE_M2: + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported aggregation."); diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index aebf0c23469..79da4a997da 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -232,9 +232,10 @@ std::unique_ptr sample( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr get_element(column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr get_element( + column_view const& input, + size_type index, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 1acdcadaacf..74a94f34ad8 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -46,7 +46,7 @@ __launch_bounds__(block_size) __global__ RightIter rhs, Filter filter, mutable_column_device_view out, - size_type *__restrict__ const valid_count) + size_type* __restrict__ const valid_count) { const size_type tid = threadIdx.x + blockIdx.x * block_size; const int warp_id = tid / warp_size; @@ -166,7 +166,7 @@ std::unique_ptr copy_if_else( FilterFn filter, cudf::data_type output_type, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { using Element = typename thrust::tuple_element<0, typename thrust::iterator_traits::value_type>::type; diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 017fe0d96ff..9cc319b5011 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -124,6 +124,17 @@ std::unique_ptr add_calendrical_months( cudf::column_view const& months, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr is_leap_year( + cudf::column_view const& column, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace datetime } // namespace cudf diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 8bbd0d1aada..4a2b40e8be7 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -29,7 +29,7 @@ namespace detail { /** * @brief The base class for the input or output index normalizing iterator. * - * This implementation uses CTRP to define the `input_indexalator` and the + * This implementation uses CRTP to define the `input_indexalator` and the * `output_indexalator` classes. 
This is so this class can manipulate the * uniquely typed subclass member variable `p_` directly without requiring * virtual functions since iterator instances will be copied to device memory. @@ -241,7 +241,7 @@ struct base_indexalator { */ struct input_indexalator : base_indexalator { friend struct indexalator_factory; - friend struct base_indexalator; // for CTRP + friend struct base_indexalator; // for CRTP using reference = size_type const; // this keeps STL and thrust happy @@ -326,7 +326,7 @@ struct input_indexalator : base_indexalator { */ struct output_indexalator : base_indexalator { friend struct indexalator_factory; - friend struct base_indexalator; // for CTRP + friend struct base_indexalator; // for CRTP using reference = output_indexalator const&; // required for output iterators diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 4cb0c6e1877..deb161fd9c2 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -177,7 +177,7 @@ auto make_null_replacement_iterator(column_device_view const& column, * * make_optional_iterator with mode `DYNAMIC` defers the assumption of nullability to * runtime, with the user stating on construction of the iterator if column has nulls. - * `DYNAMIC` mode is nice when an algorithm is going to execute on mutliple + * `DYNAMIC` mode is nice when an algorithm is going to execute on multiple * iterators and you don't want to compile all the combinations of iterator types * * Example: @@ -819,7 +819,7 @@ auto inline make_pair_iterator(scalar const& scalar_value) * * Else, if the scalar is null, then the value of `p.first` is undefined and `p.second == false`. * - * The behaviour is undefined if the scalar is destroyed before iterator dereferencing. + * The behavior is undefined if the scalar is destroyed before iterator dereferencing. * * @throws cudf::logic_error if scalar datatype and Element type mismatch. 
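For the M2 and MERGE_M2 aggregations introduced in the aggregation hunks above: M2 accumulates the sum of squared deviations from the mean, and MERGE_M2 combines per-partition partial results. A host-side sketch of the standard pairwise combination (Chan et al.); the struct layout and names here are illustrative, while libcudf's MERGE_M2 consumes a STRUCT column of partials, per `target_type_impl` above:

```cpp
#include <cassert>

// Partial statistics for one partition; M2 = sum of squared deviations.
struct m2_state {
  double count{0};
  double mean{0};
  double m2{0};
};

// Pairwise merge: exact for any split of the data, which is what makes the
// aggregation safe to compute per group and then merge.
m2_state merge_m2(m2_state const& a, m2_state const& b)
{
  if (a.count == 0) { return b; }
  if (b.count == 0) { return a; }
  double const n     = a.count + b.count;
  double const delta = b.mean - a.mean;
  return {n,
          a.mean + delta * b.count / n,
          a.m2 + b.m2 + delta * delta * a.count * b.count / n};
}

int main()
{
  m2_state const lo{2, 1.5, 0.5};  // partition {1, 2}
  m2_state const hi{2, 3.5, 0.5};  // partition {3, 4}
  auto const all = merge_m2(lo, hi);
  // Full data {1, 2, 3, 4}: mean 2.5, M2 = sum((x - 2.5)^2) = 5.
  assert(all.count == 4 && all.mean == 2.5 && all.m2 == 5.0);
  return 0;
}
```

This also explains why M2's target type is always `double`: the merge arithmetic needs floating point regardless of the source type.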
* @throws cudf::logic_error if the returned iterator is dereferenced in host diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh index a938a3a053a..a779c3defbb 100644 --- a/cpp/include/cudf/detail/merge.cuh +++ b/cpp/include/cudf/detail/merge.cuh @@ -77,8 +77,8 @@ struct tagged_element_relational_comparator { { } - __device__ weak_ordering compare(index_type lhs_tagged_index, index_type rhs_tagged_index) const - noexcept + __device__ weak_ordering compare(index_type lhs_tagged_index, + index_type rhs_tagged_index) const noexcept { side const l_side = thrust::get<0>(lhs_tagged_index); side const r_side = thrust::get<0>(rhs_tagged_index); @@ -117,8 +117,8 @@ struct row_lexicographic_tagged_comparator { CUDF_EXPECTS(_lhs.num_columns() == _rhs.num_columns(), "Mismatched number of columns."); } - __device__ bool operator()(index_type lhs_tagged_index, index_type rhs_tagged_index) const - noexcept + __device__ bool operator()(index_type lhs_tagged_index, + index_type rhs_tagged_index) const noexcept { for (size_type i = 0; i < _lhs.num_columns(); ++i) { bool ascending = (_column_order == nullptr) or (_column_order[i] == order::ASCENDING); diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 08dae998944..e507bacb919 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -38,7 +38,7 @@ namespace detail { template __global__ void offset_bitmask_binop(Binop op, device_span destination, - device_span source, + device_span source, device_span source_begin_bits, size_type source_size_bits) { @@ -73,16 +73,16 @@ __global__ void offset_bitmask_binop(Binop op, template rmm::device_buffer bitmask_binop( Binop op, - host_span masks, + host_span masks, host_span masks_begin_bits, size_type mask_size_bits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr}; inplace_bitmask_binop(op, - device_span(static_cast(dest_mask.data()), + device_span(static_cast(dest_mask.data()), num_bitmask_words(mask_size_bits)), masks, masks_begin_bits, @@ -110,11 +110,11 @@ template void inplace_bitmask_binop( Binop op, device_span dest_mask, - host_span masks, + host_span masks, host_span masks_begin_bits, size_type mask_size_bits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS( std::all_of(masks_begin_bits.begin(), masks_begin_bits.end(), [](auto b) { return b >= 0; }), @@ -123,7 +123,7 @@ void inplace_bitmask_binop( CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }), "Mask pointer cannot be null"); - rmm::device_uvector d_masks(masks.size(), stream, mr); + rmm::device_uvector d_masks(masks.size(), stream, mr); rmm::device_uvector d_begin_bits(masks_begin_bits.size(), stream, mr); CUDA_TRY(cudaMemcpyAsync( diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 77cb321a12c..f757929d839 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -34,25 +34,45 @@ rmm::device_buffer create_null_mask( size_type size, mask_state state, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - 
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::set_null_mask(bitmask_type*, size_type, size_type, bool) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -void set_null_mask(bitmask_type *bitmask, +void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid, rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::count_set_bits + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type count_set_bits(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream); + +/** + * @copydoc cudf::count_unset_bits + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +cudf::size_type count_unset_bits(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream); + /** * @copydoc cudf::segmented_count_set_bits * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::vector segmented_count_set_bits(bitmask_type const *bitmask, +std::vector segmented_count_set_bits(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream); @@ -61,7 +81,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::vector segmented_count_unset_bits(bitmask_type const *bitmask, +std::vector segmented_count_unset_bits(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream); @@ -72,11 +92,11 @@ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, * @param stream CUDA stream used for device memory operations and kernel launches. */ rmm::device_buffer copy_bitmask( - bitmask_type const *mask, + bitmask_type const* mask, size_type begin_bit, size_type end_bit, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::copy_bitmask(column_view const& view, rmm::mr::device_memory_resource*) @@ -84,9 +104,9 @@ rmm::device_buffer copy_bitmask( * @param stream CUDA stream used for device memory operations and kernel launches. */ rmm::device_buffer copy_bitmask( - column_view const &view, + column_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc bitmask_and(host_span const, host_span const, @@ -95,11 +115,11 @@ rmm::device_buffer copy_bitmask( * @param stream CUDA stream used for device memory operations and kernel launches */ rmm::device_buffer bitmask_and( - host_span masks, + host_span masks, host_span masks_begin_bits, size_type mask_size_bits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::bitmask_and @@ -107,9 +127,9 @@ rmm::device_buffer bitmask_and( * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ rmm::device_buffer bitmask_and( - table_view const &view, + table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::bitmask_or @@ -117,9 +137,9 @@ rmm::device_buffer bitmask_and( * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ rmm::device_buffer bitmask_or( - table_view const &view, + table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a bitwise AND of the specified bitmasks, @@ -135,11 +155,11 @@ rmm::device_buffer bitmask_or( */ void inplace_bitmask_and( device_span dest_mask, - host_span masks, + host_span masks, host_span masks_begin_bits, size_type mask_size_bits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/nvtx/nvtx3.hpp b/cpp/include/cudf/detail/nvtx/nvtx3.hpp index add5699e34a..0e1a82a0657 100644 --- a/cpp/include/cudf/detail/nvtx/nvtx3.hpp +++ b/cpp/include/cudf/detail/nvtx/nvtx3.hpp @@ -54,7 +54,7 @@ * \code{.cpp} * #include "nvtx3.hpp" * void some_function(){ - * // Begins a NVTX range with the messsage "some_function" + * // Begins a NVTX range with the message "some_function" * // The range ends when some_function() returns and `r` is destroyed * nvtx3::thread_range r{"some_function"}; * @@ -322,7 +322,7 @@ * Example: * \code{.cpp} * // Create an `event_attributes` with the custom message "my message" - * nvtx3::event_attributes attr{nvtx3::Mesage{"my message"}}; + * nvtx3::event_attributes attr{nvtx3::message{"my message"}}; * * // strings and string literals implicitly assumed to be a `nvtx3::message` * nvtx3::event_attributes attr{"my message"}; @@ -1267,7 +1267,7 @@ class registered_message { * nvtx3::thread_range range1{attr1}; * * // `range2` contains message "message 2" - * nvtx3::thread_range range2{nvtx3::Mesage{"message 2"}}; + * nvtx3::thread_range range2{nvtx3::message{"message 2"}}; * * // `std::string` and string literals are implicitly assumed to be * // the contents of an `nvtx3::message` @@ -1525,7 +1525,7 @@ class payload { * * // For convenience, the arguments that can be passed to the * `event_attributes` - * // constructor may be passed to the `domain_thread_range` contructor where + * // constructor may be passed to the `domain_thread_range` constructor where * // they will be forwarded to the `EventAttribute`s constructor * nvtx3::thread_range r{nvtx3::payload{42}, nvtx3::category{1}, "message"}; * \endcode diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index 0615e502c60..e672cf01488 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -31,7 +31,7 @@ namespace detail { * doesn't. 
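The detail null-mask declarations above @copydoc their public counterparts; a sketch using the public API to count valid rows (the helper name is illustrative):

```cpp
#include <cudf/column/column_view.hpp>
#include <cudf/null_mask.hpp>

// Number of valid (non-null) rows in `col`: set bits in its null mask over
// the column's own slice. Non-nullable columns have no mask to inspect.
cudf::size_type valid_rows(cudf::column_view const& col)
{
  if (!col.nullable()) { return col.size(); }
  return cudf::count_set_bits(col.null_mask(), col.offset(), col.offset() + col.size());
}
```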
 *
 * @tparam InputIterator Iterator type for `begin` and `end`
- * @tparam Predicate A predicator type which will be evaludated
+ * @tparam Predicate A predicate type which will be evaluated
 * @param begin Beginning of the sequence of elements
 * @param end End of the sequence of elements
 * @param p Predicate to be applied to each element in `[begin,end)`
diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh
index 16b7da0a083..6380e76fdfa 100644
--- a/cpp/include/cudf/detail/utilities/device_atomics.cuh
+++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh
@@ -95,12 +95,12 @@ struct genericAtomicOperationImpl {
 
     do {
       assumed = old;
-      T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
+      T const target_value    = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
       uint16_t updating_value = type_reinterpret<uint16_t, T>(op(target_value, update_value));
 
-      T_int new_value = (is_32_align) ? (old & 0xffff0000) | updating_value
-                                      : (old & 0xffff) | (T_int(updating_value) << 16);
-      old = atomicCAS(address_uint32, assumed, new_value);
+      T_int const new_value = (is_32_align) ? (old & 0xffff0000) | updating_value
+                                            : (old & 0xffff) | (T_int(updating_value) << 16);
+      old                   = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
     return (is_32_align) ? T(old & 0xffff) : T(old >> 16);
@@ -161,7 +161,7 @@ struct genericAtomicOperationImpl {
 
 // -----------------------------------------------------------------------
 // specialized functions for operators
-// `atomicAdd` supports int32, float, double (signed int64 is not supproted.)
+// `atomicAdd` supports int32, float, double (signed int64 is not supported.)
 // `atomicMin`, `atomicMax` support int32_t, int64_t
 // `atomicAnd`, `atomicOr`, `atomicXor` support int32_t, int64_t
 template <>
diff --git a/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh b/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh
index 8c0abbad49f..05a788abd45 100644
--- a/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh
+++ b/cpp/include/cudf/detail/utilities/transform_unary_functions.cuh
@@ -50,7 +50,7 @@ struct null_replacing_transformer {
   }
 
   template <typename T>
-  CUDA_HOST_DEVICE_CALLABLE type operator()(thrust::pair<T, bool> const &pair_value)
+  CUDA_HOST_DEVICE_CALLABLE type operator()(thrust::pair<T, bool> const& pair_value)
   {
     if (pair_value.second)
       return f(pair_value.first);
@@ -83,7 +83,7 @@ struct meanvar {
   using this_t = cudf::meanvar<T>;
 
   CUDA_HOST_DEVICE_CALLABLE
-  this_t operator+(this_t const &rhs) const
+  this_t operator+(this_t const& rhs) const
   {
     return this_t((this->value + rhs.value),
                   (this->value_squared + rhs.value_squared),
@@ -91,7 +91,7 @@ struct meanvar {
   };
 
   CUDA_HOST_DEVICE_CALLABLE
-  bool operator==(this_t const &rhs) const
+  bool operator==(this_t const& rhs) const
   {
     return ((this->value == rhs.value) && (this->value_squared == rhs.value_squared) &&
             (this->count == rhs.count));
@@ -114,7 +114,7 @@ struct meanvar {
 template <typename ElementType>
 struct transformer_squared {
   CUDA_HOST_DEVICE_CALLABLE
-  ElementType operator()(ElementType const &value) { return (value * value); };
+  ElementType operator()(ElementType const& value) { return (value * value); };
 };
 
 /**
@@ -131,7 +131,7 @@ struct transformer_meanvar {
   using ResultType = meanvar<ElementType>;
 
   CUDA_HOST_DEVICE_CALLABLE
-  ResultType operator()(thrust::pair<ElementType, bool> const &pair)
+  ResultType operator()(thrust::pair<ElementType, bool> const& pair)
   {
     ElementType v = pair.first;
     return meanvar<ElementType>(v, v * v, (pair.second) ? 1 : 0);
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index 11ce9199c2d..4a7e9b89c80 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -117,7 +117,7 @@ std::pair<rmm::device_buffer, size_type> valid_if(
 * input ranges.
 * Given a set of bitmasks, `masks`, the state of bit `j` in mask `i` is
- * determined by `p( *(begin1 + i), *(begin2 + j))`. If the predivate evaluates
- * to true, the the bit is set to `1`. If false, set to `0`.
+ * determined by `p( *(begin1 + i), *(begin2 + j))`. If the predicate evaluates
+ * to true, the bit is set to `1`. If false, set to `0`.
 *
 * Example Arguments:
diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp
index 85c469f58f8..5656b38a0ef 100644
--- a/cpp/include/cudf/groupby.hpp
+++ b/cpp/include/cudf/groupby.hpp
@@ -116,7 +116,7 @@ class groupby {
   /**
    * @brief Performs grouped aggregations on the specified values.
    *
-   * The values to aggregate and the aggregations to perform are specifed in an
+   * The values to aggregate and the aggregations to perform are specified in an
    * `aggregation_request`. Each request contains a `column_view` of values to
    * aggregate and a set of `aggregation`s to perform on those elements.
    *
@@ -173,7 +173,7 @@ class groupby {
   /**
    * @brief Performs grouped scans on the specified values.
    *
-   * The values to aggregate and the aggregations to perform are specifed in an
+   * The values to aggregate and the aggregations to perform are specified in an
    * `aggregation_request`. Each request contains a `column_view` of values to
    * aggregate and a set of `aggregation`s to perform on those elements.
    *
diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp
index 18398ff4ceb..34410209c72 100644
--- a/cpp/include/cudf/io/avro.hpp
+++ b/cpp/include/cudf/io/avro.hpp
@@ -180,7 +180,7 @@ class avro_reader_options_builder {
   /**
    * @brief move avro_reader_options member once it's built.
    */
-  operator avro_reader_options &&() { return std::move(options); }
+  operator avro_reader_options&&() { return std::move(options); }
 
   /**
    * @brief move avro_reader_options member once it's built.
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index 8efe871ad3a..1dff99735ec 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -574,9 +574,9 @@ class csv_reader_options {
    *
    * @param types Vector of dtypes in which the column needs to be read.
    */
-  [
-    [deprecated("The string-based interface will be deprecated."
-                "Use dtypes(std::vector<data_type>) instead.")]] void
+  [[deprecated(
+    "The string-based interface will be deprecated."
+    "Use dtypes(std::vector<data_type>) instead.")]] void
   set_dtypes(std::vector<std::string> types)
   {
     _dtypes = std::move(types);
@@ -997,9 +997,9 @@ class csv_reader_options_builder {
    * @param types Vector of dtypes in which the column needs to be read.
    * @return this for chaining.
    */
-  [
-    [deprecated("The string-based interface will be deprecated."
-                "Use dtypes(std::vector<data_type>) instead.")]] csv_reader_options_builder&
+  [[deprecated(
+    "The string-based interface will be deprecated."
+    "Use dtypes(std::vector<data_type>) instead.")]] csv_reader_options_builder&
   dtypes(std::vector<std::string> types)
   {
     options._dtypes = std::move(types);
@@ -1093,7 +1093,7 @@ class csv_reader_options_builder {
   /**
    * @brief move csv_reader_options member once it's built.
    */
-  operator csv_reader_options &&() { return std::move(options); }
+  operator csv_reader_options&&() { return std::move(options); }
 
   /**
    * @brief move csv_reader_options member once it's built.
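A sketch of the non-deprecated dtype interface that the deprecation messages above point to (the file name and column types are illustrative; `build()` finalizes the builder):

```cpp
#include <cudf/io/csv.hpp>

// Reads a two-column CSV, stating each column's type as a cudf::data_type
// instead of a string such as "int32".
cudf::io::table_with_metadata read_two_ints()
{
  auto opts =
    cudf::io::csv_reader_options::builder(cudf::io::source_info{"input.csv"})
      .dtypes({cudf::data_type{cudf::type_id::INT32}, cudf::data_type{cudf::type_id::INT64}})
      .build();
  return cudf::io::read_csv(opts);
}
```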
@@ -1422,7 +1422,7 @@ class csv_writer_options_builder { /** * @brief move `csv_writer_options` member once it's built. */ - operator csv_writer_options &&() { return std::move(options); } + operator csv_writer_options&&() { return std::move(options); } /** * @brief move `csv_writer_options` member once it's built. diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 6c885a874ee..c1aff818121 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -322,9 +322,9 @@ class arrow_io_source : public datasource { filesystem = result.ValueOrDie(); // Parse the path from the URI - size_t start = arrow_uri.find(uri_start_delimiter) == std::string::npos - ? 0 - : arrow_uri.find(uri_start_delimiter) + uri_start_delimiter.size(); + size_t start = arrow_uri.find(uri_start_delimiter) == std::string::npos + ? 0 + : arrow_uri.find(uri_start_delimiter) + uri_start_delimiter.size(); size_t end = arrow_uri.find(uri_end_delimiter) - start; std::string_view path = arrow_uri.substr(start, end); diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index 4310d0e7c4b..98483d1c03e 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -46,10 +46,10 @@ class reader { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit reader(std::vector const &filepaths, - avro_reader_options const &options, + explicit reader(std::vector const& filepaths, + avro_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Constructor from an array of datasources @@ -59,10 +59,10 @@ class reader { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit reader(std::vector> &&sources, - avro_reader_options const &options, + explicit reader(std::vector>&& sources, + avro_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Destructor explicitly-declared to avoid inlined in header @@ -77,7 +77,7 @@ class reader { * * @return The set of columns along with table metadata */ - table_with_metadata read(avro_reader_options const &options, + table_with_metadata read(avro_reader_options const& options, rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; } // namespace avro diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 8ec2818c2ca..89e589d306a 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -41,10 +41,10 @@ class reader { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit reader(std::vector const &filepaths, - csv_reader_options const &options, + explicit reader(std::vector const& filepaths, + csv_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Constructor from an array of datasources @@ -54,10 +54,10 @@ class reader { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - 
explicit reader(std::vector> &&sources, - csv_reader_options const &options, + explicit reader(std::vector>&& sources, + csv_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Destructor explicitly-declared to avoid inlined in header @@ -91,9 +91,9 @@ class writer { * @param mr Device memory resource to use for device memory allocation */ writer(std::unique_ptr sinkp, - csv_writer_options const &options, + csv_writer_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); // cannot provide definition here (because + rmm::mr::device_memory_resource* mr); // cannot provide definition here (because // _impl is incomplete hence unique_ptr has // not enough sizeof() info) @@ -109,8 +109,8 @@ class writer { * @param metadata Table metadata and column names * @param stream CUDA stream used for device memory operations and kernel launches. */ - void write(table_view const &table, - const table_metadata *metadata = nullptr, + void write(table_view const& table, + const table_metadata* metadata = nullptr, rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; } // namespace csv diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 6ed93dc5c25..e6d8f2de483 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -54,10 +54,10 @@ class reader { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit reader(std::vector const &filepaths, - json_reader_options const &options, + explicit reader(std::vector const& filepaths, + json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Constructor from an array of datasources @@ -67,10 +67,10 @@ class reader { * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit reader(std::vector> &&sources, - json_reader_options const &options, + explicit reader(std::vector>&& sources, + json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Destructor explicitly-declared to avoid inlined in header @@ -83,7 +83,7 @@ class reader { * @param[in] options Settings for controlling reading behavior * @return cudf::table object that contains the array of cudf::column. */ - table_with_metadata read(json_reader_options const &options, + table_with_metadata read(json_reader_options const& options, rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7d56c1c0fc6..2f4d0936d8b 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -277,7 +277,7 @@ class json_reader_options_builder { /** * @brief move json_reader_options member once it's built. */ - operator json_reader_options &&() { return std::move(options); } + operator json_reader_options&&() { return std::move(options); } /** * @brief move json_reader_options member once it's built. 
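The `operator json_reader_options&&()` conversion shown above exists so a fully-configured builder can feed the read call directly, with no explicit `build()`; a sketch (the path parameter is illustrative):

```cpp
#include <cudf/io/json.hpp>

#include <string>

// The builder converts to json_reader_options&& at the call site, which then
// binds to read_json's const-reference parameter.
cudf::io::table_with_metadata read_json_lines(std::string const& path)
{
  return cudf::io::read_json(
    cudf::io::json_reader_options::builder(cudf::io::source_info{path}).lines(true));
}
```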
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index bd1e4e96d7d..997f35ed922 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -331,7 +331,7 @@ class orc_reader_options_builder { /** * @brief move orc_reader_options member once it's built. */ - operator orc_reader_options &&() { return std::move(options); } + operator orc_reader_options&&() { return std::move(options); } /** * @brief move orc_reader_options member once it's built. @@ -550,7 +550,7 @@ class orc_writer_options_builder { /** * @brief move orc_writer_options member once it's built. */ - operator orc_writer_options &&() { return std::move(options); } + operator orc_writer_options&&() { return std::move(options); } /** * @brief move orc_writer_options member once it's built. @@ -724,7 +724,7 @@ class chunked_orc_writer_options_builder { /** * @brief move chunked_orc_writer_options member once it's built. */ - operator chunked_orc_writer_options &&() { return std::move(options); } + operator chunked_orc_writer_options&&() { return std::move(options); } /** * @brief move chunked_orc_writer_options member once it's built. diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 178e46a0c5c..ecd9607a87e 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -339,7 +339,7 @@ class parquet_reader_options_builder { /** * @brief move parquet_reader_options member once it's built. */ - operator parquet_reader_options &&() { return std::move(options); } + operator parquet_reader_options&&() { return std::move(options); } /** * @brief move parquet_reader_options member once it's built. @@ -769,7 +769,7 @@ class parquet_writer_options_builder { /** * @brief move parquet_writer_options member once it's built. */ - operator parquet_writer_options &&() { return std::move(options); } + operator parquet_writer_options&&() { return std::move(options); } /** * @brief move parquet_writer_options member once it's built. @@ -973,7 +973,7 @@ class chunked_parquet_writer_options_builder { * @brief Set to true if timestamps should be written as * int96 types instead of int64 types. Even though int96 is deprecated and is * not an internal type for cudf, it needs to be written for backwards - * compatability reasons. + * compatibility reasons. * * @param enabled Boolean value to enable/disable int96 timestamps. * @return this for chaining. @@ -987,7 +987,7 @@ class chunked_parquet_writer_options_builder { /** * @brief move chunked_parquet_writer_options member once it's built. */ - operator chunked_parquet_writer_options &&() { return std::move(options); } + operator chunked_parquet_writer_options&&() { return std::move(options); } /** * @brief move chunked_parquet_writer_options member once it's is built. diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 1f9ed71ce8c..725c0fc3699 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -23,6 +24,7 @@ #include #include +#include #include namespace cudf { @@ -647,5 +649,206 @@ class hash_join { const std::unique_ptr impl; }; +/** + * @brief Returns a pair of row index vectors corresponding to all pairs + * of rows between the specified tables where the predicate evaluates to true. + * + * The first returned vector contains the row indices from the left + * table that have a match in the right table (in unspecified order). 
+ * The corresponding values in the second returned vector are + * the matched row indices from the right table. + * + * @code{.pseudo} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}} + * Expression: Left.Column_0 == Right.Column_0 + * Result: {{1, 2}, {0, 1}} + * + * Left: {{0, 1, 2}, {3, 4, 5}} + * Right: {{1, 2, 3}, {4, 6, 7}} + * Expression: (Left.Column_0 == Right.Column_0) AND (Left.Column_1 == Right.Column_1) + * Result: {{1}, {0}} + * @endcode + * + * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` + * mismatch. + * + * @param left The left table + * @param right The right table + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a conditional inner join between two tables `left` and `right` . + */ +std::pair>, + std::unique_ptr>> +conditional_inner_join( + table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a pair of row index vectors corresponding to all pairs + * of rows between the specified tables where the predicate evaluates to true, + * or null matches for rows in left that have no match in right. + * + * The first returned vector contains all the row indices from the left + * table (in unspecified order). The corresponding value in the + * second returned vector is either (1) the row index of the matched row + * from the right table, if there is a match or (2) an unspecified + * out-of-bounds value. + * + * @code{.pseudo} + * Left: {{0, 1, 2}} + * Right: {{1, 2, 3}} + * Expression: Left.Column_0 == Right.Column_0 + * Result: {{0, 1, 2}, {None, 0, 1}} + * + * Left: {{0, 1, 2}, {3, 4, 5}} + * Right: {{1, 2, 3}, {4, 6, 7}} + * Expression: (Left.Column_0 == Right.Column_0) AND (Left.Column_1 == Right.Column_1) + * Result: {{0, 1, 2}, {None, 0, None}} + * @endcode + * + * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` + * mismatch. + * + * @param left The left table + * @param right The right table + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a conditional left join between two tables `left` and `right` . + */ +std::pair>, + std::unique_ptr>> +conditional_left_join(table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a pair of row index vectors corresponding to all pairs + * of rows between the specified tables where the predicate evaluates to true, + * or null matches for rows in either table that have no match in the other. 
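The "unspecified out-of-bounds value" convention above is designed to compose with `cudf::gather`'s NULLIFY policy when materializing the join result; a sketch under that assumption (the wrapper name is illustrative):

```cpp
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <cudf/table/table.hpp>

#include <rmm/device_uvector.hpp>

// Rows of `right` that had no match gather through an out-of-bounds index
// and therefore come back as nulls in the output table.
std::unique_ptr<cudf::table> materialize_right(
  cudf::table_view const& right, rmm::device_uvector<cudf::size_type> const& right_indices)
{
  cudf::column_view indices(cudf::data_type{cudf::type_id::INT32},
                            static_cast<cudf::size_type>(right_indices.size()),
                            right_indices.data(),
                            nullptr,  // no null mask; the indices themselves are all valid
                            0);
  return cudf::gather(right, indices, cudf::out_of_bounds_policy::NULLIFY);
}
```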
+/**
+ * @brief Returns a pair of row index vectors corresponding to all pairs
+ * of rows between the specified tables where the predicate evaluates to true,
+ * or null matches for rows in either table that have no match in the other.
+ *
+ * Taken pairwise, the values from the returned vectors are one of:
+ * (1) row indices corresponding to matching rows from the left and
+ * right tables, (2) a row index and an unspecified out-of-bounds value,
+ * representing a row from one table without a match in the other.
+ *
+ * @code{.pseudo}
+ * Left: {{0, 1, 2}}
+ * Right: {{1, 2, 3}}
+ * Expression: Left.Column_0 == Right.Column_0
+ * Result: {{0, 1, 2, None}, {None, 0, 1, 2}}
+ *
+ * Left: {{0, 1, 2}, {3, 4, 5}}
+ * Right: {{1, 2, 3}, {4, 6, 7}}
+ * Expression: (Left.Column_0 == Right.Column_0) AND (Left.Column_1 == Right.Column_1)
+ * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}}
+ * @endcode
+ *
+ * @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
+ *
+ * @param left The left table
+ * @param right The right table
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether the equality operator returns true or false for two nulls.
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+ * the result of performing a conditional full join between two tables `left` and `right`.
+ */
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+conditional_full_join(table_view left,
+                      table_view right,
+                      ast::expression binary_predicate,
+                      null_equality compare_nulls = null_equality::EQUAL,
+                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns an index vector corresponding to all rows in the left table
+ * for which there exists some row in the right table where the predicate
+ * evaluates to true.
+ *
+ * @code{.pseudo}
+ * Left: {{0, 1, 2}}
+ * Right: {{1, 2, 3}}
+ * Expression: Left.Column_0 == Right.Column_0
+ * Result: {1, 2}
+ *
+ * Left: {{0, 1, 2}, {3, 4, 5}}
+ * Right: {{1, 2, 3}, {4, 6, 7}}
+ * Expression: (Left.Column_0 == Right.Column_0) AND (Left.Column_1 == Right.Column_1)
+ * Result: {1}
+ * @endcode
+ *
+ * @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
+ *
+ * @param left The left table
+ * @param right The right table
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether the equality operator returns true or false for two nulls.
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A vector `left_indices` that can be used to construct the result of
+ * performing a conditional left semi join between two tables `left` and
+ * `right`.
+ */
+std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
+  table_view left,
+  table_view right,
+  ast::expression binary_predicate,
+  null_equality compare_nulls = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns an index vector corresponding to all rows in the left table
+ * for which there does not exist any row in the right table where the
+ * predicate evaluates to true.
+ *
+ * @code{.pseudo}
+ * Left: {{0, 1, 2}}
+ * Right: {{1, 2, 3}}
+ * Expression: Left.Column_0 == Right.Column_0
+ * Result: {0}
+ *
+ * Left: {{0, 1, 2}, {3, 4, 5}}
+ * Right: {{1, 2, 3}, {4, 6, 7}}
+ * Expression: (Left.Column_0 == Right.Column_0) AND (Left.Column_1 == Right.Column_1)
+ * Result: {0, 2}
+ * @endcode
+ *
+ * @throw cudf::logic_error if the binary predicate outputs a non-boolean result.
+ *
+ * @param left The left table
+ * @param right The right table
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether the equality operator returns true or false for two nulls.
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A vector `left_indices` that can be used to construct the result of
+ * performing a conditional left anti join between two tables `left` and
+ * `right`.
+ */
+std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
+  table_view left,
+  table_view right,
+  ast::expression binary_predicate,
+  null_equality compare_nulls = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh
index a440e456e25..94b0e830b15 100644
--- a/cpp/include/cudf/lists/detail/scatter.cuh
+++ b/cpp/include/cudf/lists/detail/scatter.cuh
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <cudf/utilities/type_checks.hpp>
 #include
 #include
@@ -89,7 +90,7 @@ std::unique_ptr<column> scatter_impl(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
-  assert_same_data_type(source, target);
+  CUDF_EXPECTS(column_types_equal(source, target), "Mismatched column types.");
   auto const child_column_type = lists_column_view(target).child().type();
diff --git a/cpp/include/cudf/lists/detail/scatter_helper.cuh b/cpp/include/cudf/lists/detail/scatter_helper.cuh
index 76121bc35e9..7d0586ed6a6 100644
--- a/cpp/include/cudf/lists/detail/scatter_helper.cuh
+++ b/cpp/include/cudf/lists/detail/scatter_helper.cuh
@@ -129,11 +129,6 @@ struct unbound_list_view {
   size_type _size{};  // Number of elements in *this* list row.
 };
-/**
- * @brief Checks that the specified columns have matching schemas, all the way down.
- */
-void assert_same_data_type(column_view const& lhs, column_view const& rhs);
-
 std::unique_ptr<column> build_lists_child_column_recursive(
   data_type child_column_type,
   rmm::device_uvector<unbound_list_view> const& list_vector,
diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp
index 9fd913517fc..d094118293b 100644
--- a/cpp/include/cudf/reduction.hpp
+++ b/cpp/include/cudf/reduction.hpp
@@ -64,10 +64,10 @@ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE };
 * @returns Output scalar with reduce result.
 */
 std::unique_ptr<scalar> reduce(
-  column_view const &col,
-  std::unique_ptr<aggregation> const &agg,
+  column_view const& col,
+  std::unique_ptr<aggregation> const& agg,
   data_type output_dtype,
-  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
 * @brief Computes the scan of a column.
@@ -88,11 +88,11 @@ std::unique_ptr<scalar> reduce(
 * @returns unique pointer to new output column
 */
 std::unique_ptr<column> scan(
-  const column_view &input,
-  std::unique_ptr<aggregation> const &agg,
+  const column_view& input,
+  std::unique_ptr<aggregation> const& agg,
   scan_type inclusive,
   null_policy null_handling = null_policy::EXCLUDE,
-  rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
 * @brief Determines the minimum and maximum values of a column.
 *
@@ -104,8 +104,8 @@ std::unique_ptr<column> scan(
 * and the second scalar being the maximum value of the input column.
*/ std::pair, std::unique_ptr> minmax( - column_view const &col, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + column_view const& col, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 2e57e56255d..0e14b0c6bf5 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -78,7 +78,7 @@ class scalar { /** * @brief Indicates whether the scalar contains a valid value. * - * @note Using the value when `is_valid() == false` is undefined behaviour. In addition, this + * @note Using the value when `is_valid() == false` is undefined behavior. In addition, this * function does a stream synchronization. * * @param stream CUDA stream used for device memory operations. @@ -154,7 +154,7 @@ class fixed_width_scalar : public scalar { void set_value(T value, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @brief Implicit conversion operator to get the value of the scalar on the host. + * @brief Explicit conversion operator to get the value of the scalar on the host. */ explicit operator value_type() const; @@ -365,6 +365,11 @@ class fixed_point_scalar : public scalar { */ T fixed_point_value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + /** + * @brief Explicit conversion operator to get the value of the scalar on the host. + */ + explicit operator value_type() const; + /** * @brief Returns a raw pointer to the value in device memory. */ @@ -465,7 +470,7 @@ class string_scalar : public scalar { rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Implicit conversion operator to get the value of the scalar in a host std::string. + * @brief Explicit conversion operator to get the value of the scalar in a host std::string. */ explicit operator std::string() const; diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index d56d5d5eb0d..884b412d3e2 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -21,7 +21,7 @@ /** * @file scalar_device_view.cuh - * @brief Scalar device view class definitons + * @brief Scalar device view class definitions */ namespace cudf { diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 2454cfe7c7b..36a8131a78e 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -187,7 +187,7 @@ std::unique_ptr rank( /** * @brief Returns sorted order after sorting each segment in the table. * - * If segment_offsets contains values larger than number of rows, behaviour is undefined. + * If segment_offsets contains values larger than number of rows, behavior is undefined. * @throws cudf::logic_error if `segment_offsets` is not `size_type` column. * * @param keys The table that determines the ordering of elements in each segment @@ -214,7 +214,7 @@ std::unique_ptr segmented_sorted_order( /** * @brief Performs a lexicographic segmented sort of a table * - * If segment_offsets contains values larger than number of rows, behaviour is undefined. + * If segment_offsets contains values larger than number of rows, behavior is undefined. * @throws cudf::logic_error if `values.num_rows() != keys.num_rows()`. * @throws cudf::logic_error if `segment_offsets` is not `size_type` column. 
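The scalar.hpp hunks above bring the doc comments in line with the `explicit` conversion operators and add the missing one for `fixed_point_scalar`. A small host-side sketch of what call sites look like under these declarations (the wrapper function is illustrative):

```cpp
#include <cudf/scalar/scalar.hpp>

#include <string>

void read_back(cudf::numeric_scalar<int32_t> const& n, cudf::string_scalar const& s)
{
  // The operators are explicit, so a cast is required; each cast copies the
  // value from device to host, and the result is undefined behavior if
  // is_valid() would return false.
  auto host_int = static_cast<int32_t>(n);
  auto host_str = static_cast<std::string>(s);
  (void)host_int;
  (void)host_str;
}
```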
 *
diff --git a/cpp/include/cudf/strings/capitalize.hpp b/cpp/include/cudf/strings/capitalize.hpp
index 372d9faf13f..604756b5d09 100644
--- a/cpp/include/cudf/strings/capitalize.hpp
+++ b/cpp/include/cudf/strings/capitalize.hpp
@@ -16,6 +16,7 @@
 #pragma once
 
 #include
+#include <cudf/scalar/scalar.hpp>
 #include
 #include
@@ -30,21 +31,33 @@ namespace strings {
 /**
 * @brief Returns a column of capitalized strings.
 *
- * Any null string entries return corresponding null output column entries.
+ * If `delimiters` is an empty string, then only the first character of each
+ * row is capitalized. Otherwise, a non-delimiter character is capitalized after
+ * any delimiter character is found.
 *
 * @code{.pseudo}
 * Example:
- * input = ["tesT1", "a Test", "Another Test"];
+ * input = ["tesT1", "a Test", "Another Test", "a\tb"];
 * output = capitalize(input)
- * output is ["Test1", "A test", "Another test"]
+ * output is ["Test1", "A test", "Another test", "A\tb"]
+ * output = capitalize(input, " ")
+ * output is ["Test1", "A Test", "Another Test", "A\tb"]
+ * output = capitalize(input, " \t")
+ * output is ["Test1", "A Test", "Another Test", "A\tB"]
 * @endcode
 *
- * @param[in] input String column.
- * @param[in] mr Device memory resource used to allocate the returned column's device memory
+ * Any null string entries return corresponding null output column entries.
+ *
+ * @throw cudf::logic_error if `delimiters.is_valid()` is `false`.
+ *
+ * @param input String column.
+ * @param delimiters Characters for identifying words to capitalize.
+ * @param mr Device memory resource used to allocate the returned column's device memory
 * @return Column of strings capitalized from the input column.
 */
 std::unique_ptr<column> capitalize(
   strings_column_view const& input,
+  string_scalar const& delimiters = string_scalar(""),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp
index 3e069de2f0f..32f8d482a34 100644
--- a/cpp/include/cudf/strings/combine.hpp
+++ b/cpp/include/cudf/strings/combine.hpp
@@ -272,7 +272,7 @@ std::unique_ptr<column> join_list_elements(
 * delimited by the @p separator provided.
 *
 * A null list row will always result in a null string in the output row. Any non-null list row
- * having a null elenent will result in the corresponding output row to be null unless a
+ * having a null element will result in the corresponding output row being null unless a
 * @p narep string is specified to be used in its place.
 *
 * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the
diff --git a/cpp/include/cudf/strings/detail/copying.hpp b/cpp/include/cudf/strings/detail/copying.hpp
index 19dfa193207..6083ebc4a62 100644
--- a/cpp/include/cudf/strings/detail/copying.hpp
+++ b/cpp/include/cudf/strings/detail/copying.hpp
@@ -16,6 +16,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
@@ -54,6 +55,33 @@ std::unique_ptr<column> copy_slice(
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Returns a new strings column created by shifting the rows by a specified offset.
+ *
+ * @code{.pseudo}
+ * Example:
+ * s = ["a", "b", "c", "d", "e", "f"]
+ * r1 = shift(s, 2, "_")
+ * r1 is now ["_", "_", "a", "b", "c", "d"]
+ * r2 = shift(s, -2, "_")
+ * r2 is now ["c", "d", "e", "f", "_", "_"]
+ * @endcode
+ *
+ * The caller should set the validity mask in the output column.
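The extended `capitalize` API above is straightforward to exercise; a short sketch matching the pseudocode examples in its doc comment (the wrapper name is illustrative):

```cpp
#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/capitalize.hpp>
#include <cudf/strings/strings_column_view.hpp>

std::unique_ptr<cudf::column> title_case(cudf::strings_column_view const& input)
{
  // Capitalize the first character after any space or tab; with the default
  // empty delimiter only the first character of each row would be capitalized.
  return cudf::strings::capitalize(input, cudf::string_scalar(" \t"));
}
```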
+ * + * @param input Strings instance for this operation. + * @param offset The offset by which to shift the input. + * @param fill_value Fill value for indeterminable outputs. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New strings column. + */ +std::unique_ptr shift(strings_column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index 4023dbc6c84..2b39662456b 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -31,7 +31,7 @@ namespace strings { * @brief Repeat the given string scalar by a given number of times. * * For a given string scalar, an output string scalar is generated by repeating the input string by - * a number of times given by the @p `repeat_times` parameter. If `repeat_times` is not a positve + * a number of times given by the @p `repeat_times` parameter. If `repeat_times` is not a positive * value, an empty (valid) string scalar will be returned. An invalid input scalar will always * result in an invalid output scalar regardless of the value of `repeat_times` parameter. * @@ -42,7 +42,7 @@ namespace strings { * out is '123XYZ-123XYZ-123XYZ-' * @endcode * - * @throw cudf::logic_error if the size of the ouput string scalar exceeds the maximum value that + * @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that * can be stored by the index type * (i.e., `input.size() * repeat_times > numeric_limits::max()`). * @@ -61,7 +61,7 @@ std::unique_ptr repeat_strings( * * For a given strings column, an output strings column is generated by repeating each string from * the input by a number of times given by the @p `repeat_times` parameter. If `repeat_times` is not - * a positve value, all the rows of the output strings column will be an empty string. Any null row + * a positive value, all the rows of the output strings column will be an empty string. Any null row * will result in a null row regardless of the value of `repeat_times` parameter. * * Note that this function cannot handle the cases when the size of the output column exceeds the diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index e9091b88b08..40eb796eba7 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -36,7 +36,7 @@ namespace strings { * input string. If not found, the output entry is just a copy of the * corresponding input string. * - * Specifing an empty string for repl will essentially remove the target + * Specifying an empty string for repl will essentially remove the target * string if found in each string. * * Null string entries will return null output string entries. diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp index 82b191a8e1b..4978bad3bb3 100644 --- a/cpp/include/cudf/strings/split/split.hpp +++ b/cpp/include/cudf/strings/split/split.hpp @@ -139,7 +139,7 @@ std::unique_ptr
rsplit( * * @throw cudf:logic_error if `delimiter` is invalid. * - * @param strings A column of string elements to be splitted. + * @param strings A column of string elements to be split. * @param delimiter The string to identify split points in each string. * Default of empty string indicates split on whitespace. * @param maxsplit Maximum number of splits to perform. @@ -216,7 +216,7 @@ std::unique_ptr split_record( * * @throw cudf:logic_error if `delimiter` is invalid. * - * @param strings A column of string elements to be splitted. + * @param strings A column of string elements to be split. * @param delimiter The string to identify split points in each string. * Default of empty string indicates split on whitespace. * @param maxsplit Maximum number of splits to perform. diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index f5ab2046441..238d55d580e 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -17,11 +17,16 @@ #pragma once #include + +#ifndef __CUDA_ARCH__ #include +#endif +// This is defined when including this header in a https://github.com/NVIDIA/jitify +// or jitify2 source file. The jitify cannot include thrust headers at this time. +#ifndef CUDF_JIT_UDF #include -#include -#include +#endif // This file should only include device code logic. // Host-only or host/device code should be defined in the string_view.hpp header file. @@ -41,8 +46,17 @@ __device__ inline size_type characters_in_string(const char* str, size_type byte { if ((str == 0) || (bytes == 0)) return 0; auto ptr = reinterpret_cast(str); +#ifndef CUDF_JIT_UDF return thrust::count_if( thrust::seq, ptr, ptr + bytes, [](uint8_t chr) { return is_begin_utf8_char(chr); }); +#else + size_type chars = 0; + auto const end = ptr + bytes; + while (ptr < end) { + chars += is_begin_utf8_char(*ptr++); + } + return chars; +#endif } /** @@ -121,7 +135,8 @@ __device__ inline string_view::const_iterator string_view::const_iterator::opera { const_iterator tmp(*this); size_type adjust = abs(offset); - while (adjust-- > 0) offset > 0 ? ++tmp : --tmp; + while (adjust-- > 0) + offset > 0 ? ++tmp : --tmp; return tmp; } @@ -129,7 +144,8 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper string_view::const_iterator::difference_type offset) { size_type adjust = abs(offset); - while (adjust-- > 0) offset > 0 ? operator++() : operator--(); + while (adjust-- > 0) + offset > 0 ? operator++() : operator--(); return *this; } @@ -153,7 +169,8 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper string_view::const_iterator::difference_type offset) { size_type adjust = abs(offset); - while (adjust-- > 0) offset > 0 ? operator--() : operator++(); + while (adjust-- > 0) + offset > 0 ? operator--() : operator++(); return *this; } @@ -162,7 +179,8 @@ __device__ inline string_view::const_iterator string_view::const_iterator::opera { const_iterator tmp(*this); size_type adjust = abs(offset); - while (adjust-- > 0) offset > 0 ? --tmp : ++tmp; + while (adjust-- > 0) + offset > 0 ? 
--tmp : ++tmp; return tmp; } @@ -256,7 +274,8 @@ __device__ inline int string_view::compare(const char* data, size_type bytes) co size_type const len1 = size_bytes(); const unsigned char* ptr1 = reinterpret_cast(this->data()); const unsigned char* ptr2 = reinterpret_cast(data); - size_type idx = 0; + if ((ptr1 == ptr2) && (bytes == len1)) return 0; + size_type idx = 0; for (; (idx < len1) && (idx < bytes); ++idx) { if (*ptr1 != *ptr2) return static_cast(*ptr1) - static_cast(*ptr2); ++ptr1; @@ -327,7 +346,8 @@ __device__ inline size_type string_view::find(const char* str, const char* ptr2 = str; for (size_type idx = 0; idx < len1; ++idx) { bool match = true; - for (size_type jdx = 0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]); + for (size_type jdx = 0; match && (jdx < len2); ++jdx) + match = (ptr1[jdx] == ptr2[jdx]); if (match) return character_offset(idx + spos); ptr1++; } @@ -368,7 +388,8 @@ __device__ inline size_type string_view::rfind(const char* str, const char* ptr2 = str; for (int idx = 0; idx < len1; ++idx) { bool match = true; - for (size_type jdx = 0; match && (jdx < len2); ++jdx) match = (ptr1[jdx] == ptr2[jdx]); + for (size_type jdx = 0; match && (jdx < len2); ++jdx) + match = (ptr1[jdx] == ptr2[jdx]); if (match) return character_offset(epos - len2 - idx); ptr1--; // go backwards } diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 4b1a901d72f..be182cb0e9d 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -15,9 +15,8 @@ */ #pragma once -#include -#include #include + #include /** @@ -36,12 +35,6 @@ using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes */ constexpr cudf::size_type UNKNOWN_STRING_LENGTH{-1}; -/** - * @brief This value is assigned to the _char_width member if the string - * contains characters of different widths. - */ -constexpr int8_t VARIABLE_CHAR_WIDTH{0}; - /** * @brief A non-owning, immutable view of device data that is a variable length * char array representing a UTF-8 string. @@ -417,7 +410,7 @@ CUDA_HOST_DEVICE_CALLABLE size_type to_char_utf8(const char* str, char_utf8& cha * @brief Place a char_utf8 value into a char array. * * @param character Single character - * @param[out] str Allocated char array with enough space to hold the encoded characer. + * @param[out] str Allocated char array with enough space to hold the encoded character. 
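The `CUDF_JIT_UDF` fallback for `characters_in_string` above counts UTF-8 lead bytes, relying on each character contributing exactly one byte that is not a continuation byte (`10xxxxxx`). A host-side sketch of the same counting idea; `count_utf8_chars` is an illustrative name, not part of this change:

```cpp
#include <cstdint>
#include <cstring>

// Count UTF-8 characters by counting bytes outside the continuation-byte
// range; equivalent to summing is_begin_utf8_char over the buffer.
int count_utf8_chars(char const* s)
{
  int chars      = 0;
  auto const len = std::strlen(s);
  for (std::size_t i = 0; i < len; ++i) {
    chars += (static_cast<std::uint8_t>(s[i]) & 0xC0) != 0x80;
  }
  return chars;
}
```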
* @return The number of bytes in the character */ CUDA_HOST_DEVICE_CALLABLE size_type from_char_utf8(char_utf8 character, char* str) diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index bec5299ab77..d174222b2ff 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -191,8 +191,8 @@ class element_equality_comparator { */ template ()>* = nullptr> - __device__ bool operator()(size_type lhs_element_index, size_type rhs_element_index) const - noexcept + __device__ bool operator()(size_type lhs_element_index, + size_type rhs_element_index) const noexcept { if (has_nulls) { bool const lhs_is_null{lhs.is_null(lhs_element_index)}; diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 7c80c958f92..71e48370ccf 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -27,7 +27,7 @@ /** * @file table_device_view.cuh - * @brief Table device view class definitons + * @brief Table device view class definitions */ namespace cudf { diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index e99e0db21c5..460c62e3598 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -40,7 +40,7 @@ namespace cudf { * * @param input An immutable view of the input column to transform * @param unary_udf The PTX/CUDA string of the unary function to apply - * @param outout_type The output type that is compatible with the output type in the UDF + * @param output_type The output type that is compatible with the output type in the UDF * @param is_ptx true: the UDF is treated as PTX code; false: the UDF is treated as CUDA code * @param mr Device memory resource used to allocate the returned column's device memory * @return The column resulting from applying the unary function to @@ -133,7 +133,7 @@ std::pair, std::unique_ptr> encode( * @param bitmask A device pointer to the bitmask which needs to be converted * @param begin_bit position of the bit from which the conversion should start * @param end_bit position of the bit before which the conversion should stop - * @param mr Device memory resource used to allocate the returned columns's device memory + * @param mr Device memory resource used to allocate the returned columns' device memory * @return A boolean column representing the given mask from [begin_bit, end_bit). */ std::unique_ptr mask_to_bools( @@ -164,7 +164,7 @@ std::unique_ptr mask_to_bools( * row_bit_count(column(x)) >= row_bit_count(gather(column(x))) * * @param t The table view to perform the computation on. - * @param mr Device memory resource used to allocate the returned columns's device memory + * @param mr Device memory resource used to allocate the returned columns' device memory * @return A 32-bit integer column containing the per-row bit counts. 
*/ std::unique_ptr row_bit_count( diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 8116097e38e..e1037efb5c8 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -18,10 +18,10 @@ #ifdef __CUDACC__ #define CUDA_HOST_DEVICE_CALLABLE __host__ __device__ inline -#define CUDA_DEVICE_CALLABLE __device__ inline +#define CUDA_DEVICE_CALLABLE __device__ inline #else #define CUDA_HOST_DEVICE_CALLABLE inline -#define CUDA_DEVICE_CALLABLE inline +#define CUDA_DEVICE_CALLABLE inline #endif #include diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index 15613c8caa7..2036723a6ed 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -37,7 +37,7 @@ struct cuda_error : public std::runtime_error { } // namespace cudf #define STRINGIFY_DETAIL(x) #x -#define CUDF_STRINGIFY(x) STRINGIFY_DETAIL(x) +#define CUDF_STRINGIFY(x) STRINGIFY_DETAIL(x) /** * @addtogroup utility_error diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index e2f5f6db624..2cdc455e05c 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -47,16 +47,20 @@ using void_t = void; */ #define CUDF_ENABLE_IF(...) std::enable_if_t<(__VA_ARGS__)>* = nullptr -template -struct is_relationally_comparable_impl : std::false_type { -}; - template using less_comparable = decltype(std::declval() < std::declval()); template using greater_comparable = decltype(std::declval() > std::declval()); +template +using equality_comparable = decltype(std::declval() == std::declval()); + +namespace detail { +template +struct is_relationally_comparable_impl : std::false_type { +}; + template struct is_relationally_comparable_impl struct is_equality_comparable_impl : std::false_type { }; -template -using equality_comparable = decltype(std::declval() == std::declval()); - template struct is_equality_comparable_impl>> : std::true_type { }; +// has common type +template +struct has_common_type_impl : std::false_type { +}; + +template +struct has_common_type_impl>, Ts...> : std::true_type { +}; +} // namespace detail + +template +using has_common_type = typename detail::has_common_type_impl::type; + +template +constexpr inline bool has_common_type_v = detail::has_common_type_impl::value; + template using is_timestamp_t = cuda::std::disjunction, std::is_same, @@ -104,7 +121,7 @@ using is_duration_t = cuda::std::disjunction, template constexpr inline bool is_relationally_comparable() { - return is_relationally_comparable_impl::value; + return detail::is_relationally_comparable_impl::value; } /** @@ -122,7 +139,7 @@ constexpr inline bool is_relationally_comparable() template constexpr inline bool is_equality_comparable() { - return is_equality_comparable_impl::value; + return detail::is_equality_comparable_impl::value; } /** diff --git a/cpp/include/cudf/utilities/type_checks.hpp b/cpp/include/cudf/utilities/type_checks.hpp new file mode 100644 index 00000000000..8d57ab3aaa5 --- /dev/null +++ b/cpp/include/cudf/utilities/type_checks.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_view.hpp>
+
+namespace cudf {
+
+/**
+ * @brief Compares the types of two `column_view`s
+ *
+ * This function returns true if the type of `lhs` equals that of `rhs`.
+ * - For fixed point types, the scale is compared.
+ * - For dictionary types, the types of the keys are compared if both are
+ *   non-empty columns.
+ * - For list types, the types of child columns are compared recursively.
+ * - For struct types, the types of each field are compared in order.
+ * - For all other types, the `id` of `data_type` is compared.
+ *
+ * @param lhs The first `column_view` to compare
+ * @param rhs The second `column_view` to compare
+ * @return true if column types match
+ */
+bool column_types_equal(column_view const& lhs, column_view const& rhs);
+
+}  // namespace cudf
diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp
index 9fa67dccb52..cd088d81531 100644
--- a/cpp/include/cudf_test/base_fixture.hpp
+++ b/cpp/include/cudf_test/base_fixture.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -42,38 +42,38 @@ namespace test {
 * ```
 */
 class BaseFixture : public ::testing::Test {
-  rmm::mr::device_memory_resource *_mr{rmm::mr::get_current_device_resource()};
+  rmm::mr::device_memory_resource* _mr{rmm::mr::get_current_device_resource()};
 
 public:
  /**
  * @brief Returns pointer to `device_memory_resource` that should be used for
  * all tests inheriting from this fixture
  */
-  rmm::mr::device_memory_resource *mr() { return _mr; }
+  rmm::mr::device_memory_resource* mr() { return _mr; }
 };
 
 template
 struct uniform_distribution_impl {
 };
 template
-struct uniform_distribution_impl<
-  T,
-  std::enable_if_t::value && not cudf::is_boolean()>> {
+struct uniform_distribution_impl::value>> {
   using type = std::uniform_int_distribution;
 };
 
-template
-struct uniform_distribution_impl::value>> {
-  using type = std::uniform_real_distribution;
+template <>
+struct uniform_distribution_impl {
+  using type = std::bernoulli_distribution;
 };
 
 template
-struct uniform_distribution_impl()>> {
-  using type = std::bernoulli_distribution;
+struct uniform_distribution_impl::value>> {
+  using type = std::uniform_real_distribution;
 };
 
 template
-struct uniform_distribution_impl()>> {
+struct uniform_distribution_impl<
+  T,
+  std::enable_if_t() or cudf::is_fixed_point()>> {
   using type = std::uniform_int_distribution;
 };
 
@@ -131,7 +131,8 @@ class UniformRandomGenerator {
  * @param lower Lower bound of the range
  * @param upper Upper bound of the desired range
  */
-  template ()> * = nullptr>
+  template () && !cudf::is_boolean()>* = nullptr>
   UniformRandomGenerator(T lower,
                          T upper,
                          uint64_t seed = detail::random_generator_incrementing_seed())
@@ -139,6 +140,14 @@ class UniformRandomGenerator {
   {
   }
 
+  template ()>* = nullptr>
+  UniformRandomGenerator(T lower,
+                         T upper,
+                         uint64_t seed = detail::random_generator_incrementing_seed())
+    : dist{0.5}, rng{std::mt19937_64{seed}()}
+  {
+  }
+
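The `has_common_type` trait added to `cudf/utilities/traits.hpp` earlier in this diff is usable for SFINAE as well as static checks; a brief illustration, where the chosen type pairs are assumptions for the example:

```cpp
#include <cudf/utilities/traits.hpp>
#include <cudf/wrappers/timestamps.hpp>

// Arithmetic types share a std::common_type, while an integer and a
// timestamp do not, so the trait can gate mixed-type code paths.
static_assert(cudf::has_common_type_v<int32_t, int64_t>, "");
static_assert(not cudf::has_common_type_v<int32_t, cudf::timestamp_s>, "");
```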
/** * @brief Construct a new Uniform Random Generator to generate uniformly * random numbers in the range `[upper,lower]` @@ -146,7 +155,8 @@ class UniformRandomGenerator { * @param lower Lower bound of the range * @param upper Upper bound of the desired range */ - template ()> * = nullptr> + template () or cudf::is_fixed_point()>* = nullptr> UniformRandomGenerator(typename TL::rep lower, typename TL::rep upper, uint64_t seed = detail::random_generator_incrementing_seed()) @@ -157,13 +167,13 @@ class UniformRandomGenerator { /** * @brief Returns the next random number. */ - template ()> * = nullptr> + template ()>* = nullptr> T generate() { return T{dist(rng)}; } - template ()> * = nullptr> + template ()>* = nullptr> T generate() { return T{typename T::duration{dist(rng)}}; @@ -237,7 +247,7 @@ inline auto make_binning() * @return Memory resource instance */ inline std::shared_ptr create_memory_resource( - std::string const &allocation_mode) + std::string const& allocation_mode) { if (allocation_mode == "binning") return make_binning(); if (allocation_mode == "cuda") return make_cuda(); @@ -252,12 +262,12 @@ inline std::shared_ptr create_memory_resource( /** * @brief Parses the cuDF test command line options. * - * Currently only supports 'rmm_mode' string paramater, which set the rmm + * Currently only supports 'rmm_mode' string parameter, which set the rmm * allocation mode. The default value of the parameter is 'pool'. * * @return Parsing results in the form of unordered map */ -inline auto parse_cudf_test_opts(int argc, char **argv) +inline auto parse_cudf_test_opts(int argc, char** argv) { try { cxxopts::Options options(argv[0], " - cuDF tests command line options"); @@ -265,7 +275,7 @@ inline auto parse_cudf_test_opts(int argc, char **argv) "rmm_mode", "RMM allocation mode", cxxopts::value()->default_value("pool")); return options.parse(argc, argv); - } catch (const cxxopts::OptionException &e) { + } catch (const cxxopts::OptionException& e) { CUDF_FAIL("Error parsing command line options"); } } @@ -281,7 +291,7 @@ inline auto parse_cudf_test_opts(int argc, char **argv) * allocation mode used for creating the default memory resource. */ #define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char **argv) \ + int main(int argc, char** argv) \ { \ ::testing::InitGoogleTest(&argc, argv); \ auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 74d22085b26..a4857552831 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1239,7 +1239,7 @@ class lists_column_wrapper : public detail::column_wrapper { /** * @brief Construct a lists column containing a single list of fixed-width - * type from an interator range. + * type from an iterator range. * * Example: * @code{.cpp} @@ -1621,7 +1621,7 @@ class lists_column_wrapper : public detail::column_wrapper { std::back_inserter(cols), [&](lists_column_wrapper const& l) -> column_view { // depth mismatch. attempt to normalize the short column. - // this function will also catch if this is a legitmately broken + // this function will also catch if this is a legitimately broken // set of input if (l.depth < expected_depth) { if (l.root) { diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index b60c94394d1..1e2e44c79d1 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -34,10 +34,10 @@ * redefines them properly. 
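For context on the fixture being reformatted here, a minimal test translation unit built on it would look roughly like this (the test body is illustrative):

```cpp
#include <cudf_test/base_fixture.hpp>

struct ExampleTest : public cudf::test::BaseFixture {
};

TEST_F(ExampleTest, MemoryResourceIsSet)
{
  // mr() exposes the device_memory_resource selected via the --rmm_mode
  // command line option handled by parse_cudf_test_opts above.
  EXPECT_NE(nullptr, mr());
}

// Expands to a main() that parses the options and installs the resource.
CUDF_TEST_PROGRAM_MAIN()
```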
*/ -#define Types Types_NOT_USED -#define Types0 Types0_NOT_USED -#define TypeList TypeList_NOT_USED -#define Templates Templates_NOT_USED +#define Types Types_NOT_USED +#define Types0 Types0_NOT_USED +#define TypeList TypeList_NOT_USED +#define Templates Templates_NOT_USED #define Templates0 Templates0_NOT_USED #include #undef Types @@ -104,7 +104,7 @@ struct TypeList> { { \ try { \ x; \ - } catch (const exception &e) { \ + } catch (const exception& e) { \ ASSERT_NE(nullptr, e.what()); \ EXPECT_THAT(e.what(), testing::StartsWith((startswith))); \ EXPECT_THAT(e.what(), testing::EndsWith((endswith))); \ diff --git a/cpp/include/cudf_test/cxxopts.hpp b/cpp/include/cudf_test/cxxopts.hpp index 49c551ab2f1..5135fd02e21 100644 --- a/cpp/include/cudf_test/cxxopts.hpp +++ b/cpp/include/cudf_test/cxxopts.hpp @@ -89,7 +89,9 @@ inline String& stringAppend(String& s, String a) { return s.append(std::move(a)) inline String& stringAppend(String& s, int n, UChar32 c) { - for (int i = 0; i != n; ++i) { s.append(c); } + for (int i = 0; i != n; ++i) { + s.append(c); + } return s; } @@ -1449,7 +1451,9 @@ inline void Options::generate_all_groups_help(String& result) const std::vector all_groups; all_groups.reserve(m_help.size()); - for (auto& group : m_help) { all_groups.push_back(group.first); } + for (auto& group : m_help) { + all_groups.push_back(group.first); + } generate_group_help(result, all_groups); } diff --git a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp index 13394445922..90bf0cd99dc 100644 --- a/cpp/include/cudf_test/file_utilities.hpp +++ b/cpp/include/cudf_test/file_utilities.hpp @@ -28,17 +28,17 @@ class temp_directory { std::string _path; public: - temp_directory(const std::string &base_name) + temp_directory(const std::string& base_name) { std::string dir_template("/tmp"); - if (const char *env_p = std::getenv("WORKSPACE")) dir_template = env_p; + if (const char* env_p = std::getenv("WORKSPACE")) dir_template = env_p; dir_template += "/" + base_name + ".XXXXXX"; - auto const tmpdirptr = mkdtemp(const_cast(dir_template.data())); + auto const tmpdirptr = mkdtemp(const_cast(dir_template.data())); if (tmpdirptr == nullptr) CUDF_FAIL("Temporary directory creation failure: " + dir_template); _path = dir_template + "/"; } - static int rm_files(const char *pathname, const struct stat *sbuf, int type, struct FTW *ftwb) + static int rm_files(const char* pathname, const struct stat* sbuf, int type, struct FTW* ftwb) { return std::remove(pathname); } @@ -49,5 +49,5 @@ class temp_directory { nftw(_path.c_str(), rm_files, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS); } - const std::string &path() const { return _path; } + const std::string& path() const { return _path; } }; diff --git a/cpp/include/cudf_test/type_list_utilities.hpp b/cpp/include/cudf_test/type_list_utilities.hpp index a3f771c2f72..1588e3c9be9 100644 --- a/cpp/include/cudf_test/type_list_utilities.hpp +++ b/cpp/include/cudf_test/type_list_utilities.hpp @@ -32,7 +32,7 @@ * template * class TestFixture : ::testing::Test { }; * - * TYPED_TEST_CASE(TestFixure, TestTypes); + * TYPED_TEST_CASE(TestFixture, TestTypes); * * TYPED_TEST(TestFixture, mytest){ * using Type0 = GetType; // the first type element @@ -169,7 +169,7 @@ struct ConcatImpl<> { }; /** - * @brief Concantenates compile-time lists of types into a single type list. + * @brief Concatenates compile-time lists of types into a single type list. 
* * Example: * ``` diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index aeddafae253..5c1b0c6c458 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -59,7 +59,7 @@ constexpr std::array types_to_ids_impl( * array == {type_id::INT32, type_id::FLOAT}; * ``` * - * @tparam TYPES List of types to conver to `type_id`s + * @tparam TYPES List of types to convert to `type_id`s * @return `std::array` of `type_id`s corresponding to each type in `TYPES` */ template diff --git a/cpp/scripts/run-clang-format.py b/cpp/scripts/run-clang-format.py index 2a7b66d4f77..c32e984278f 100755 --- a/cpp/scripts/run-clang-format.py +++ b/cpp/scripts/run-clang-format.py @@ -22,7 +22,7 @@ import sys import tempfile -EXPECTED_VERSION = "8.0.1" +EXPECTED_VERSION = "11.0.0" VERSION_REGEX = re.compile(r"clang-format version ([0-9.]+)") # NOTE: populate this list with more top-level dirs as we add more of them to # the cudf repo diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index f0fd865f685..53a55351f8e 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -88,6 +88,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, m2_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + std::vector> simple_aggregations_collector::visit( data_type col_type, var_aggregation const& agg) { @@ -155,25 +161,31 @@ std::vector> simple_aggregations_collector::visit( } std::vector> simple_aggregations_collector::visit( - data_type col_type, merge_lists_aggregation const& agg) + data_type col_type, lead_lag_aggregation const& agg) { return visit(col_type, static_cast(agg)); } std::vector> simple_aggregations_collector::visit( - data_type col_type, merge_sets_aggregation const& agg) + data_type col_type, udf_aggregation const& agg) { return visit(col_type, static_cast(agg)); } std::vector> simple_aggregations_collector::visit( - data_type col_type, lead_lag_aggregation const& agg) + data_type col_type, merge_lists_aggregation const& agg) { return visit(col_type, static_cast(agg)); } std::vector> simple_aggregations_collector::visit( - data_type col_type, udf_aggregation const& agg) + data_type col_type, merge_sets_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, merge_m2_aggregation const& agg) { return visit(col_type, static_cast(agg)); } @@ -227,6 +239,11 @@ void aggregation_finalizer::visit(mean_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(m2_aggregation const& agg) +{ + visit(static_cast(agg)); +} + void aggregation_finalizer::visit(var_aggregation const& agg) { visit(static_cast(agg)); @@ -282,22 +299,27 @@ void aggregation_finalizer::visit(collect_set_aggregation const& agg) visit(static_cast(agg)); } -void aggregation_finalizer::visit(merge_lists_aggregation const& agg) +void aggregation_finalizer::visit(lead_lag_aggregation const& agg) { visit(static_cast(agg)); } -void aggregation_finalizer::visit(merge_sets_aggregation const& agg) +void aggregation_finalizer::visit(udf_aggregation const& agg) { visit(static_cast(agg)); } -void aggregation_finalizer::visit(lead_lag_aggregation const& agg) +void aggregation_finalizer::visit(merge_lists_aggregation const& agg) { visit(static_cast(agg)); } -void 
aggregation_finalizer::visit(udf_aggregation const& agg) +void aggregation_finalizer::visit(merge_sets_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(merge_m2_aggregation const& agg) { visit(static_cast(agg)); } @@ -311,7 +333,7 @@ std::vector> aggregation::get_simple_aggregations( } /// Factory to create a SUM aggregation -template +template std::unique_ptr make_sum_aggregation() { return std::make_unique(); @@ -320,7 +342,7 @@ template std::unique_ptr make_sum_aggregation(); template std::unique_ptr make_sum_aggregation(); /// Factory to create a PRODUCT aggregation -template +template std::unique_ptr make_product_aggregation() { return std::make_unique(); @@ -328,7 +350,7 @@ std::unique_ptr make_product_aggregation() template std::unique_ptr make_product_aggregation(); /// Factory to create a MIN aggregation -template +template std::unique_ptr make_min_aggregation() { return std::make_unique(); @@ -337,7 +359,7 @@ template std::unique_ptr make_min_aggregation(); template std::unique_ptr make_min_aggregation(); /// Factory to create a MAX aggregation -template +template std::unique_ptr make_max_aggregation() { return std::make_unique(); @@ -346,7 +368,7 @@ template std::unique_ptr make_max_aggregation(); template std::unique_ptr make_max_aggregation(); /// Factory to create a COUNT aggregation -template +template std::unique_ptr make_count_aggregation(null_policy null_handling) { auto kind = @@ -359,7 +381,7 @@ template std::unique_ptr make_count_aggregation +template std::unique_ptr make_any_aggregation() { return std::make_unique(); @@ -367,7 +389,7 @@ std::unique_ptr make_any_aggregation() template std::unique_ptr make_any_aggregation(); /// Factory to create a ALL aggregation -template +template std::unique_ptr make_all_aggregation() { return std::make_unique(); @@ -375,7 +397,7 @@ std::unique_ptr make_all_aggregation() template std::unique_ptr make_all_aggregation(); /// Factory to create a SUM_OF_SQUARES aggregation -template +template std::unique_ptr make_sum_of_squares_aggregation() { return std::make_unique(); @@ -383,7 +405,7 @@ std::unique_ptr make_sum_of_squares_aggregation() template std::unique_ptr make_sum_of_squares_aggregation(); /// Factory to create a MEAN aggregation -template +template std::unique_ptr make_mean_aggregation() { return std::make_unique(); @@ -391,8 +413,16 @@ std::unique_ptr make_mean_aggregation() template std::unique_ptr make_mean_aggregation(); template std::unique_ptr make_mean_aggregation(); +/// Factory to create a M2 aggregation +template +std::unique_ptr make_m2_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_m2_aggregation(); + /// Factory to create a VARIANCE aggregation -template +template std::unique_ptr make_variance_aggregation(size_type ddof) { return std::make_unique(ddof); @@ -400,7 +430,7 @@ std::unique_ptr make_variance_aggregation(size_type ddof) template std::unique_ptr make_variance_aggregation(size_type ddof); /// Factory to create a STD aggregation -template +template std::unique_ptr make_std_aggregation(size_type ddof) { return std::make_unique(ddof); @@ -408,7 +438,7 @@ std::unique_ptr make_std_aggregation(size_type ddof) template std::unique_ptr make_std_aggregation(size_type ddof); /// Factory to create a MEDIAN aggregation -template +template std::unique_ptr make_median_aggregation() { return std::make_unique(); @@ -416,7 +446,7 @@ std::unique_ptr make_median_aggregation() template std::unique_ptr make_median_aggregation(); /// Factory to create 
a QUANTILE aggregation -template +template std::unique_ptr make_quantile_aggregation(std::vector const& q, interpolation i) { return std::make_unique(q, i); @@ -425,7 +455,7 @@ template std::unique_ptr make_quantile_aggregation( std::vector const& q, interpolation i); /// Factory to create an ARGMAX aggregation -template +template std::unique_ptr make_argmax_aggregation() { return std::make_unique(); @@ -434,7 +464,7 @@ template std::unique_ptr make_argmax_aggregation(); template std::unique_ptr make_argmax_aggregation(); /// Factory to create an ARGMIN aggregation -template +template std::unique_ptr make_argmin_aggregation() { return std::make_unique(); @@ -443,7 +473,7 @@ template std::unique_ptr make_argmin_aggregation(); template std::unique_ptr make_argmin_aggregation(); /// Factory to create an NUNIQUE aggregation -template +template std::unique_ptr make_nunique_aggregation(null_policy null_handling) { return std::make_unique(null_handling); @@ -452,7 +482,7 @@ template std::unique_ptr make_nunique_aggregation( null_policy null_handling); /// Factory to create an NTH_ELEMENT aggregation -template +template std::unique_ptr make_nth_element_aggregation(size_type n, null_policy null_handling) { return std::make_unique(n, null_handling); @@ -461,7 +491,7 @@ template std::unique_ptr make_nth_element_aggregation( size_type n, null_policy null_handling); /// Factory to create a ROW_NUMBER aggregation -template +template std::unique_ptr make_row_number_aggregation() { return std::make_unique(); @@ -470,7 +500,7 @@ template std::unique_ptr make_row_number_aggregation() template std::unique_ptr make_row_number_aggregation(); /// Factory to create a COLLECT_LIST aggregation -template +template std::unique_ptr make_collect_list_aggregation(null_policy null_handling) { return std::make_unique(null_handling); @@ -481,7 +511,7 @@ template std::unique_ptr make_collect_list_aggregation +template std::unique_ptr make_collect_set_aggregation(null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal) @@ -493,26 +523,8 @@ template std::unique_ptr make_collect_set_aggregation( template std::unique_ptr make_collect_set_aggregation( null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); -/// Factory to create a MERGE_LISTS aggregation -template -std::unique_ptr make_merge_lists_aggregation() -{ - return std::make_unique(); -} -template std::unique_ptr make_merge_lists_aggregation(); - -/// Factory to create a MERGE_SETS aggregation -template -std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal, - nan_equality nans_equal) -{ - return std::make_unique(nulls_equal, nans_equal); -} -template std::unique_ptr make_merge_sets_aggregation(null_equality, - nan_equality); - /// Factory to create a LAG aggregation -template +template std::unique_ptr make_lag_aggregation(size_type offset) { return std::make_unique(aggregation::LAG, offset); @@ -522,7 +534,7 @@ template std::unique_ptr make_lag_aggregation +template std::unique_ptr make_lead_aggregation(size_type offset) { return std::make_unique(aggregation::LEAD, offset); @@ -532,7 +544,7 @@ template std::unique_ptr make_lead_aggregation +template std::unique_ptr make_udf_aggregation(udf_type type, std::string const& user_defined_aggregator, data_type output_type) @@ -548,6 +560,32 @@ template std::unique_ptr make_udf_aggregation( template std::unique_ptr make_udf_aggregation( udf_type type, std::string const& user_defined_aggregator, data_type output_type); +/// Factory to create a MERGE_LISTS aggregation 
+template +std::unique_ptr make_merge_lists_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_merge_lists_aggregation(); + +/// Factory to create a MERGE_SETS aggregation +template +std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal, + nan_equality nans_equal) +{ + return std::make_unique(nulls_equal, nans_equal); +} +template std::unique_ptr make_merge_sets_aggregation(null_equality, + nan_equality); + +/// Factory to create a MERGE_M2 aggregation +template +std::unique_ptr make_merge_m2_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_merge_m2_aggregation(); + namespace detail { namespace { struct target_type_functor { diff --git a/cpp/src/ast/linearizer.cpp b/cpp/src/ast/linearizer.cpp index 66a32ead35e..3e442305552 100644 --- a/cpp/src/ast/linearizer.cpp +++ b/cpp/src/ast/linearizer.cpp @@ -111,7 +111,9 @@ cudf::size_type linearizer::visit(column_reference const& expr) // Increment the node index _node_count++; // Resolve node type - auto const data_type = expr.get_data_type(_table); + auto const data_type = expr.get_table_source() == table_reference::LEFT + ? expr.get_data_type(_left) + : expr.get_data_type(_right); // Push data reference auto const source = detail::device_data_reference(detail::device_data_reference_type::COLUMN, data_type, diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index 43d3bde97c2..7aa89635c54 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -49,37 +49,37 @@ namespace detail { * This evaluates an expression over a table to produce a new column. Also called an n-ary * transform. * - * @tparam block_size + * @tparam max_block_size The size of the thread block, used to set launch + * bounds and minimize register usage. + * @tparam has_nulls whether or not the output column may contain nulls. + * * @param table The table device view used for evaluation. - * @param literals Array of literal values used for evaluation. - * @param output_column The output column where results are stored. - * @param data_references Array of data references. - * @param operators Array of operators to perform. - * @param operator_source_indices Array of source indices for the operators. - * @param num_operators Number of operators. - * @param num_intermediates Number of intermediates, used to allocate a portion of shared memory to - * each thread. + * @param plan Container of device data required to evaluate the desired expression. + * @param output_column The destination for the results of evaluating the expression. */ -template -__launch_bounds__(max_block_size) __global__ void compute_column_kernel( - table_device_view const table, - device_span literals, - mutable_column_device_view output_column, - device_span data_references, - device_span operators, - device_span operator_source_indices, - cudf::size_type num_intermediates) +template +__launch_bounds__(max_block_size) __global__ + void compute_column_kernel(table_device_view const table, + device_ast_plan plan, + mutable_column_device_view output_column) { - extern __shared__ std::int64_t intermediate_storage[]; - auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * num_intermediates]; + // The (required) extern storage of the shared memory array leads to + // conflicting declarations between different templates. The easiest + // workaround is to declare an arbitrary (here char) array type then cast it + // after the fact to the appropriate type. 
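+  // Concretely: this kernel is instantiated for both has_nulls = true and
+  // has_nulls = false in the same translation unit, and each instantiation
+  // would otherwise declare the same extern __shared__ symbol with a
+  // different IntermediateDataType, which CUDA rejects; a single char array
+  // shared by all instantiations avoids the conflicting declarations.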
+  extern __shared__ char raw_intermediate_storage[];
+  IntermediateDataType<has_nulls>* intermediate_storage =
+    reinterpret_cast<IntermediateDataType<has_nulls>*>(raw_intermediate_storage);
+
+  auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates];
   auto const start_idx = static_cast<cudf::size_type>(threadIdx.x + blockIdx.x * blockDim.x);
   auto const stride    = static_cast<cudf::size_type>(blockDim.x * gridDim.x);
-  auto const evaluator =
-    cudf::ast::detail::row_evaluator(table, literals, thread_intermediate_storage, &output_column);
+  auto evaluator =
+    cudf::ast::detail::expression_evaluator<has_nulls>(table, plan, thread_intermediate_storage);

   for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
-    evaluate_row_expression(
-      evaluator, data_references, operators, operator_source_indices, row_index);
+    auto output_dest = mutable_column_expression_result<has_nulls>(output_column);
+    evaluator.evaluate(output_dest, row_index);
   }
 }
@@ -88,22 +88,30 @@ std::unique_ptr<column> compute_column(table_view const table,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
-  auto const expr_linearizer = linearizer(expr, table);                // Linearize the AST
-  auto const plan            = ast_plan{expr_linearizer, stream, mr};  // Create ast_plan
+  // Prepare output column. Whether or not the output column is nullable is
+  // determined by whether any of the columns in the input table are nullable.
+  // If none of the input columns actually contain nulls, we can still use the
+  // non-nullable version of the expression evaluation code path for
+  // performance, so we capture that information as well.
+  auto const nullable =
+    std::any_of(table.begin(), table.end(), [](column_view c) { return c.nullable(); });
+  auto const has_nulls = nullable && std::any_of(table.begin(), table.end(), [](column_view c) {
+                           return c.nullable() && c.has_nulls();
+                         });

-  // Create table device view
-  auto table_device         = table_device_view::create(table, stream);
-  auto const table_num_rows = table.num_rows();
+  auto const plan = ast_plan{expr, table, has_nulls, stream, mr};
+
+  auto const output_column_mask_state =
+    nullable ? (has_nulls ? mask_state::UNINITIALIZED : mask_state::ALL_VALID)
+             : mask_state::UNALLOCATED;

-  // Prepare output column
   auto output_column = cudf::make_fixed_width_column(
-    expr_linearizer.root_data_type(), table_num_rows, mask_state::UNALLOCATED, stream, mr);
+    plan.output_type(), table.num_rows(), output_column_mask_state, stream, mr);
   auto mutable_output_device =
     cudf::mutable_column_device_view::create(output_column->mutable_view(), stream);

   // Configure kernel parameters
-  auto const num_intermediates     = expr_linearizer.intermediate_count();
-  auto const shmem_size_per_thread = static_cast<int>(sizeof(std::int64_t) * num_intermediates);
+  auto const& dev_plan = plan.dev_plan;
   int device_id;
   CUDA_TRY(cudaGetDevice(&device_id));
   int shmem_limit_per_block;
@@ -111,22 +119,23 @@ std::unique_ptr<column> compute_column(table_view const table,
     cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id));
   auto constexpr MAX_BLOCK_SIZE = 128;
   auto const block_size =
-    shmem_size_per_thread != 0
-      ? std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / shmem_size_per_thread)
+    dev_plan.shmem_per_thread != 0
+      ? std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / dev_plan.shmem_per_thread)
       : MAX_BLOCK_SIZE;
-  auto const config               = cudf::detail::grid_1d{table_num_rows, block_size};
-  auto const shmem_size_per_block = shmem_size_per_thread * config.num_threads_per_block;
+  auto const config          = cudf::detail::grid_1d{table.num_rows(), block_size};
+  auto const shmem_per_block = dev_plan.shmem_per_thread * config.num_threads_per_block;

   // Execute the kernel
-  cudf::ast::detail::compute_column_kernel<MAX_BLOCK_SIZE>
-    <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-      *table_device,
-      plan._device_literals,
-      *mutable_output_device,
-      plan._device_data_references,
-      plan._device_operators,
-      plan._device_operator_source_indices,
-      num_intermediates);
+  auto table_device = table_device_view::create(table, stream);
+  if (has_nulls) {
+    cudf::ast::detail::compute_column_kernel<MAX_BLOCK_SIZE, true>
+      <<<config.num_blocks, config.num_threads_per_block, shmem_per_block, stream.value()>>>(
+        *table_device, dev_plan, *mutable_output_device);
+  } else {
+    cudf::ast::detail::compute_column_kernel<MAX_BLOCK_SIZE, false>
+      <<<config.num_blocks, config.num_threads_per_block, shmem_per_block, stream.value()>>>(
+        *table_device, dev_plan, *mutable_output_device);
+  }
   CHECK_CUDA(stream.value());
   return output_column;
 }
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index 11a3383ee87..aaf193ff5cf 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -588,7 +588,7 @@ std::unique_ptr<column> binary_operation(scalar const& lhs,
                                          rmm::mr::device_memory_resource* mr)
 {
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
-    return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr);
+    return experimental::binary_operation(lhs, rhs, op, output_type, mr);

   if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
     return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
@@ -615,7 +615,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          rmm::mr::device_memory_resource* mr)
 {
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
-    return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr);
+    return experimental::binary_operation(lhs, rhs, op, output_type, mr);

   if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
     return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
@@ -644,7 +644,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match");

   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING)
-    return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr);
+    return experimental::binary_operation(lhs, rhs, op, output_type, mr);

   if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
     return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
@@ -757,4 +757,78 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
   return detail::binary_operation(lhs, rhs, ptx, output_type, rmm::cuda_stream_default, mr);
 }

+// Experimental Compiled Binary operation
+namespace experimental {
+namespace detail {
+/**
+ * @copydoc cudf::experimental::binary_operation(column_view const&, column_view const&,
+ * binary_operator, data_type, rmm::mr::device_memory_resource*)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+template <typename LhsType, typename RhsType>
+std::unique_ptr<column> binary_operation(LhsType const& lhs,
+                                         RhsType const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  if constexpr (std::is_same_v<LhsType, column_view> and std::is_same_v<RhsType, column_view>)
+    CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match");
+
+  if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and
+      output_type.id() == type_id::STRING and
+      (op == binary_operator::NULL_MAX or op == binary_operator::NULL_MIN))
+    return binops::compiled::string_null_min_max(lhs, rhs, op, output_type, stream, mr);
+
+  if (not binops::compiled::is_supported_operation(output_type, lhs.type(), rhs.type(), op))
+    CUDF_FAIL("Unsupported operator for these types");
+
+  // TODO check if scale conversion required?
+  // if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type()))
+  //   CUDF_FAIL("Not yet supported fixed_point");
+  //   return fixed_point_binary_operation(lhs, rhs, op, output_type, stream, mr);
+
+  auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr);
+
+  if constexpr (std::is_same_v<LhsType, column_view>)
+    if (lhs.is_empty()) return out;
+  if constexpr (std::is_same_v<RhsType, column_view>)
+    if (rhs.is_empty()) return out;
+
+  auto out_view = out->mutable_view();
+  cudf::binops::compiled::binary_operation(out_view, lhs, rhs, op, stream);
+  return out;
+}
+}  // namespace detail
+
+std::unique_ptr<column> binary_operation(scalar const& lhs,
+                                         column_view const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
+}
+std::unique_ptr<column> binary_operation(column_view const& lhs,
+                                         scalar const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
+}
+std::unique_ptr<column> binary_operation(column_view const& lhs,
+                                         column_view const& rhs,
+                                         binary_operator op,
+                                         data_type output_type,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr);
+}
+}  // namespace experimental
 }  // namespace cudf
diff --git a/cpp/src/binaryop/compiled/ATan2.cu b/cpp/src/binaryop/compiled/ATan2.cu
new file mode 100644
index 00000000000..8e5cbf57f55
--- /dev/null
+++ b/cpp/src/binaryop/compiled/ATan2.cu
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Add.cu b/cpp/src/binaryop/compiled/Add.cu new file mode 100644 index 00000000000..4cd2ced66f4 --- /dev/null +++ b/cpp/src/binaryop/compiled/Add.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/BitwiseAnd.cu b/cpp/src/binaryop/compiled/BitwiseAnd.cu new file mode 100644 index 00000000000..6abac2bd197 --- /dev/null +++ b/cpp/src/binaryop/compiled/BitwiseAnd.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/BitwiseOr.cu b/cpp/src/binaryop/compiled/BitwiseOr.cu new file mode 100644 index 00000000000..6d523cbf1d1 --- /dev/null +++ b/cpp/src/binaryop/compiled/BitwiseOr.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/BitwiseXor.cu b/cpp/src/binaryop/compiled/BitwiseXor.cu new file mode 100644 index 00000000000..45175681574 --- /dev/null +++ b/cpp/src/binaryop/compiled/BitwiseXor.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Div.cu b/cpp/src/binaryop/compiled/Div.cu new file mode 100644 index 00000000000..7cc895ecd06 --- /dev/null +++ b/cpp/src/binaryop/compiled/Div.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/FloorDiv.cu b/cpp/src/binaryop/compiled/FloorDiv.cu new file mode 100644 index 00000000000..99ea2706b86 --- /dev/null +++ b/cpp/src/binaryop/compiled/FloorDiv.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Greater.cu b/cpp/src/binaryop/compiled/Greater.cu new file mode 100644 index 00000000000..679e029b5fc --- /dev/null +++ b/cpp/src/binaryop/compiled/Greater.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/GreaterEqual.cu b/cpp/src/binaryop/compiled/GreaterEqual.cu new file mode 100644 index 00000000000..23b0c6aaa0d --- /dev/null +++ b/cpp/src/binaryop/compiled/GreaterEqual.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Less.cu b/cpp/src/binaryop/compiled/Less.cu new file mode 100644 index 00000000000..7ab5dfe3478 --- /dev/null +++ b/cpp/src/binaryop/compiled/Less.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/LessEqual.cu b/cpp/src/binaryop/compiled/LessEqual.cu new file mode 100644 index 00000000000..983c50c9575 --- /dev/null +++ b/cpp/src/binaryop/compiled/LessEqual.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/LogBase.cu b/cpp/src/binaryop/compiled/LogBase.cu new file mode 100644 index 00000000000..bdc709b86bf --- /dev/null +++ b/cpp/src/binaryop/compiled/LogBase.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/LogicalAnd.cu b/cpp/src/binaryop/compiled/LogicalAnd.cu new file mode 100644 index 00000000000..08112fadfff --- /dev/null +++ b/cpp/src/binaryop/compiled/LogicalAnd.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/LogicalOr.cu b/cpp/src/binaryop/compiled/LogicalOr.cu new file mode 100644 index 00000000000..bc400afd4cd --- /dev/null +++ b/cpp/src/binaryop/compiled/LogicalOr.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Mod.cu b/cpp/src/binaryop/compiled/Mod.cu new file mode 100644 index 00000000000..0b82c09c8a6 --- /dev/null +++ b/cpp/src/binaryop/compiled/Mod.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Mul.cu b/cpp/src/binaryop/compiled/Mul.cu new file mode 100644 index 00000000000..15394245259 --- /dev/null +++ b/cpp/src/binaryop/compiled/Mul.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/NullMax.cu b/cpp/src/binaryop/compiled/NullMax.cu new file mode 100644 index 00000000000..78a44041cba --- /dev/null +++ b/cpp/src/binaryop/compiled/NullMax.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} // namespace cudf::binops::compiled diff --git a/cpp/src/binaryop/compiled/NullMin.cu b/cpp/src/binaryop/compiled/NullMin.cu new file mode 100644 index 00000000000..629ab600fd7 --- /dev/null +++ b/cpp/src/binaryop/compiled/NullMin.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} // namespace cudf::binops::compiled diff --git a/cpp/src/binaryop/compiled/PMod.cu b/cpp/src/binaryop/compiled/PMod.cu new file mode 100644 index 00000000000..36902c0ed10 --- /dev/null +++ b/cpp/src/binaryop/compiled/PMod.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Pow.cu b/cpp/src/binaryop/compiled/Pow.cu new file mode 100644 index 00000000000..c6f897ee18d --- /dev/null +++ b/cpp/src/binaryop/compiled/Pow.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/PyMod.cu b/cpp/src/binaryop/compiled/PyMod.cu new file mode 100644 index 00000000000..b05dcd8e7bc --- /dev/null +++ b/cpp/src/binaryop/compiled/PyMod.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/ShiftLeft.cu b/cpp/src/binaryop/compiled/ShiftLeft.cu new file mode 100644 index 00000000000..6cc950b2d50 --- /dev/null +++ b/cpp/src/binaryop/compiled/ShiftLeft.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/ShiftRight.cu b/cpp/src/binaryop/compiled/ShiftRight.cu new file mode 100644 index 00000000000..1ddd7100a73 --- /dev/null +++ b/cpp/src/binaryop/compiled/ShiftRight.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/ShiftRightUnsigned.cu b/cpp/src/binaryop/compiled/ShiftRightUnsigned.cu new file mode 100644 index 00000000000..a87b4b9f9ac --- /dev/null +++ b/cpp/src/binaryop/compiled/ShiftRightUnsigned.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/Sub.cu b/cpp/src/binaryop/compiled/Sub.cu new file mode 100644 index 00000000000..e0cf47c1310 --- /dev/null +++ b/cpp/src/binaryop/compiled/Sub.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/TrueDiv.cu b/cpp/src/binaryop/compiled/TrueDiv.cu new file mode 100644 index 00000000000..d8f1d956340 --- /dev/null +++ b/cpp/src/binaryop/compiled/TrueDiv.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 2b24e0cfa3d..1dd00c4b981 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -15,13 +15,12 @@ */ #include "binary_ops.hpp" +#include "operation.cuh" +#include #include -#include -#include #include -#include -#include +#include #include #include @@ -32,204 +31,76 @@ namespace binops { namespace compiled { namespace { - -template -struct apply_binop { - binary_operator op; - apply_binop(binary_operator op) : op(op) {} - CUDA_DEVICE_CALLABLE Out operator()(Lhs const& x, Rhs const& y) const - { - switch (op) { - case binary_operator::EQUAL: return this->equal(x, y); - case binary_operator::NOT_EQUAL: return this->not_equal(x, y); - case binary_operator::LESS: return this->less(x, y); - case binary_operator::GREATER: return this->greater(x, y); - case binary_operator::LESS_EQUAL: return this->less_equal(x, y); - case binary_operator::GREATER_EQUAL: return this->greater_equal(x, y); - default: return Out{}; - } - } - CUDA_DEVICE_CALLABLE Out equal(Lhs const& x, Rhs const& y) const - { - return static_cast(x == y); - } - CUDA_DEVICE_CALLABLE Out not_equal(Lhs const& x, Rhs const& y) const - { - return static_cast(x != y); - } - CUDA_DEVICE_CALLABLE Out less(Lhs const& x, Rhs const& y) const - { - return static_cast(x < y); - } - CUDA_DEVICE_CALLABLE Out greater(Lhs const& x, Rhs const& y) const - { - return static_cast(x > y); - } - CUDA_DEVICE_CALLABLE Out less_equal(Lhs const& x, Rhs const& y) const - { - return static_cast(x <= y); - } - CUDA_DEVICE_CALLABLE Out greater_equal(Lhs const& x, Rhs const& y) const - { - return static_cast(x >= y); - } -}; - -template -struct apply_binop_scalar_lhs_rhs : apply_binop { - cudf::scalar_device_type_t scalar; - apply_binop_scalar_lhs_rhs(binary_operator op, cudf::scalar_device_type_t scalar) - : apply_binop(op), scalar(scalar) - { - } - CUDA_DEVICE_CALLABLE Out operator()(Lhs const& x) const - { - return apply_binop::operator()(x, scalar.value()); - } -}; - -template -struct apply_binop_scalar_rhs_lhs : apply_binop { - cudf::scalar_device_type_t scalar; - 
apply_binop_scalar_rhs_lhs(binary_operator op, cudf::scalar_device_type_t scalar) - : apply_binop(op), scalar(scalar) +/** + * @brief Converts scalar to column_device_view with single element. + * + * @return pair with column_device_view and column containing any auxilary data to create + * column_view from scalar + */ +struct scalar_as_column_device_view { + using return_type = typename std::pair>; + template ())>* = nullptr> + return_type operator()(scalar const& s, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { + auto h_scalar_type_view = static_cast&>(const_cast(s)); + auto col_v = + column_view(s.type(), 1, h_scalar_type_view.data(), (bitmask_type const*)s.validity_data()); + return std::pair{column_device_view::create(col_v, stream), std::unique_ptr(nullptr)}; } - CUDA_DEVICE_CALLABLE Out operator()(Lhs const& x) const + template ())>* = nullptr> + return_type operator()(scalar const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) { - return apply_binop::operator()(scalar.value(), x); + CUDF_FAIL("Unsupported type"); } }; +// specialization for cudf::string_view +template <> +scalar_as_column_device_view::return_type +scalar_as_column_device_view::operator()(scalar const& s, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using T = cudf::string_view; + auto h_scalar_type_view = static_cast&>(const_cast(s)); + + // build offsets column from the string size + auto offsets_transformer_itr = + thrust::make_constant_iterator(h_scalar_type_view.size()); + auto offsets_column = strings::detail::make_offsets_child_column( + offsets_transformer_itr, offsets_transformer_itr + 1, stream, mr); + + auto chars_column_v = + column_view(data_type{type_id::INT8}, h_scalar_type_view.size(), h_scalar_type_view.data()); + // Construct string column_view + auto col_v = column_view(s.type(), + 1, + nullptr, + (bitmask_type const*)s.validity_data(), + cudf::UNKNOWN_NULL_COUNT, + 0, + {offsets_column->view(), chars_column_v}); + return std::pair{column_device_view::create(col_v, stream), std::move(offsets_column)}; +} -template -struct binary_op { - std::unique_ptr operator()(column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type out_type, - bool const reversed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto new_mask = binops::detail::scalar_col_valid_mask_and(lhs, rhs, stream, mr); - auto out = make_fixed_width_column(out_type, - lhs.size(), - std::move(new_mask), - rhs.is_valid(stream) ? cudf::UNKNOWN_NULL_COUNT : lhs.size(), - stream, - mr); - - if (lhs.size() > 0 && rhs.is_valid(stream)) { - auto out_view = out->mutable_view(); - auto out_itr = out_view.begin(); - auto lhs_device_view = column_device_view::create(lhs, stream); - using rhs_type = cudf::scalar_type_t; - auto rhs_scalar = rhs_type(static_cast(rhs), stream); - auto rhs_scalar_view = get_scalar_device_view(rhs_scalar); - if (lhs.has_nulls()) { - auto lhs_itr = cudf::detail::make_null_replacement_iterator(*lhs_device_view, Lhs{}); - reversed - ? 
thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - out_itr, - apply_binop_scalar_rhs_lhs{op, rhs_scalar_view}) - : thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - out_itr, - apply_binop_scalar_lhs_rhs{op, rhs_scalar_view}); - } else { - auto lhs_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_type{0}), - [col = *lhs_device_view] __device__(size_type i) { return col.element(i); }); - reversed - ? thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - out_itr, - apply_binop_scalar_rhs_lhs{op, rhs_scalar_view}) - : thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - out_itr, - apply_binop_scalar_lhs_rhs{op, rhs_scalar_view}); - } - } - - CHECK_CUDA(stream.value()); - - return out; - } - - std::unique_ptr operator()(column_view const& lhs, - column_view const& rhs, - binary_operator op, - data_type out_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto new_mask = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); - auto out = make_fixed_width_column( - out_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); - - if (lhs.size() > 0) { - auto out_view = out->mutable_view(); - auto out_itr = out_view.begin(); - auto lhs_device_view = column_device_view::create(lhs, stream); - auto rhs_device_view = column_device_view::create(rhs, stream); - if (lhs.has_nulls() && rhs.has_nulls()) { - auto lhs_itr = cudf::detail::make_null_replacement_iterator(*lhs_device_view, Lhs{}); - auto rhs_itr = cudf::detail::make_null_replacement_iterator(*rhs_device_view, Rhs{}); - thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - rhs_itr, - out_itr, - apply_binop{op}); - } else if (lhs.has_nulls()) { - auto lhs_itr = cudf::detail::make_null_replacement_iterator(*lhs_device_view, Lhs{}); - auto rhs_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_type{0}), - [col = *rhs_device_view] __device__(size_type i) { return col.element(i); }); - thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - rhs_itr, - out_itr, - apply_binop{op}); - } else if (rhs.has_nulls()) { - auto lhs_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_type{0}), - [col = *lhs_device_view] __device__(size_type i) { return col.element(i); }); - auto rhs_itr = cudf::detail::make_null_replacement_iterator(*rhs_device_view, Rhs{}); - thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - rhs_itr, - out_itr, - apply_binop{op}); - } else { - auto lhs_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_type{0}), - [col = *lhs_device_view] __device__(size_type i) { return col.element(i); }); - auto rhs_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_type{0}), - [col = *rhs_device_view] __device__(size_type i) { return col.element(i); }); - thrust::transform(rmm::exec_policy(stream), - lhs_itr, - lhs_itr + lhs.size(), - rhs_itr, - out_itr, - apply_binop{op}); - } - } - - CHECK_CUDA(stream.value()); - - return out; - } -}; +/** + * @brief Converts scalar to column_device_view with single element. + * + * @param scal scalar to convert + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * @return pair with column_device_view and column containing any auxilary data to create + * column_view from scalar + */ +auto scalar_to_column_device_view( + scalar const& scal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + return type_dispatcher(scal.type(), scalar_as_column_device_view{}, scal, stream, mr); +} // This functor does the actual comparison between string column value and a scalar string // or between two string column values using a comparator @@ -337,152 +208,181 @@ struct null_considering_binop { // Create device views for inputs auto const lhs_dev_view = get_device_view(lhs); auto const rhs_dev_view = get_device_view(rhs); - - switch (op) { - case binary_operator::NULL_EQUALS: { - // Validate input - CUDF_EXPECTS(output_type.id() == type_id::BOOL8, "Output column type has to be bool"); - - // Make a bool8 numeric output column - out = make_numeric_column( - data_type{type_id::BOOL8}, col_size, mask_state::ALL_VALID, stream, mr); - - // Create a compare function lambda - auto equal_func = [] __device__(bool lhs_valid, - bool rhs_valid, - cudf::string_view lhs_value, - cudf::string_view rhs_value) { - if (!lhs_valid && !rhs_valid) return true; - if (lhs_valid && rhs_valid) return (lhs_value == rhs_value); - return false; - }; - - // Populate output column - populate_out_col(lhs_dev_view, - rhs_dev_view, - col_size, - stream, - equal_func, - mutable_column_view{*out}.begin()); - - break; - } - - case binary_operator::NULL_MAX: - case binary_operator::NULL_MIN: { - // Validate input - CUDF_EXPECTS(output_type.id() == lhs.type().id(), - "Output column type should match input column type"); - - // Shallow copy of the resultant strings - rmm::device_uvector out_col_strings(col_size, stream); - - // Invalid output column strings - null rows - cudf::string_view const invalid_str{nullptr, 0}; - - // Create a compare function lambda - auto minmax_func = [op, invalid_str] __device__(bool lhs_valid, - bool rhs_valid, - cudf::string_view lhs_value, - cudf::string_view rhs_value) { - if (!lhs_valid && !rhs_valid) - return invalid_str; - else if (lhs_valid && rhs_valid) { - return (op == binary_operator::NULL_MAX) - ? thrust::maximum()(lhs_value, rhs_value) - : thrust::minimum()(lhs_value, rhs_value); - } else if (lhs_valid) - return lhs_value; - else - return rhs_value; - }; - - // Populate output column - populate_out_col( - lhs_dev_view, rhs_dev_view, col_size, stream, minmax_func, out_col_strings.data()); - - // Create an output column with the resultant strings - out = cudf::make_strings_column(out_col_strings, invalid_str, stream, mr); - - break; - } - - default: { - CUDF_FAIL("Null aware binop not supported"); - } - } - - return out; + // Validate input + CUDF_EXPECTS(output_type.id() == lhs.type().id(), + "Output column type should match input column type"); + + // Shallow copy of the resultant strings + rmm::device_uvector out_col_strings(col_size, stream); + + // Invalid output column strings - null rows + cudf::string_view const invalid_str{nullptr, 0}; + + // Create a compare function lambda + auto minmax_func = + [op, invalid_str] __device__( + bool lhs_valid, bool rhs_valid, cudf::string_view lhs_value, cudf::string_view rhs_value) { + if (!lhs_valid && !rhs_valid) + return invalid_str; + else if (lhs_valid && rhs_valid) { + return (op == binary_operator::NULL_MAX) + ? 
thrust::maximum()(lhs_value, rhs_value) + : thrust::minimum()(lhs_value, rhs_value); + } else if (lhs_valid) + return lhs_value; + else + return rhs_value; + }; + + // Populate output column + populate_out_col( + lhs_dev_view, rhs_dev_view, col_size, stream, minmax_func, out_col_strings.data()); + + // Create an output column with the resultant strings + return cudf::make_strings_column(out_col_strings, invalid_str, stream, mr); } }; } // namespace -std::unique_ptr binary_operation(scalar const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr string_null_min_max(scalar const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(rhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported rhs datatype"); - if (is_null_dependent(op)) { - if (rhs.is_empty()) return cudf::make_empty_column(output_type); - auto rhs_device_view = cudf::column_device_view::create(rhs, stream); - return null_considering_binop{}(lhs, *rhs_device_view, op, output_type, rhs.size(), stream, mr); - } else { - CUDF_EXPECTS(is_boolean(output_type), "Invalid/Unsupported output datatype"); - // Should pass the right type of scalar and column_view when specializing binary_op - return binary_op{}( - rhs, lhs, op, output_type, true, stream, mr); - } + CUDF_EXPECTS(op == binary_operator::NULL_MAX or op == binary_operator::NULL_MIN, + "Unsupported binary operation"); + if (rhs.is_empty()) return cudf::make_empty_column(output_type); + auto rhs_device_view = cudf::column_device_view::create(rhs, stream); + return null_considering_binop{}(lhs, *rhs_device_view, op, output_type, rhs.size(), stream, mr); } -std::unique_ptr binary_operation(column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr string_null_min_max(column_view const& lhs, + scalar const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(rhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported rhs datatype"); - if (is_null_dependent(op)) { - if (lhs.is_empty()) return cudf::make_empty_column(output_type); - auto lhs_device_view = cudf::column_device_view::create(lhs, stream); - return null_considering_binop{}(*lhs_device_view, rhs, op, output_type, lhs.size(), stream, mr); - } else { - CUDF_EXPECTS(is_boolean(output_type), "Invalid/Unsupported output datatype"); - return binary_op{}( - lhs, rhs, op, output_type, false, stream, mr); - } + CUDF_EXPECTS(op == binary_operator::NULL_MAX or op == binary_operator::NULL_MIN, + "Unsupported binary operation"); + if (lhs.is_empty()) return cudf::make_empty_column(output_type); + auto lhs_device_view = cudf::column_device_view::create(lhs, stream); + return null_considering_binop{}(*lhs_device_view, rhs, op, output_type, lhs.size(), stream, mr); } -std::unique_ptr binary_operation(column_view const& lhs, - column_view const& rhs, - 
binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr string_null_min_max(column_view const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(rhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported rhs datatype"); - if (is_null_dependent(op)) { - CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes do not match"); - if (lhs.is_empty()) return cudf::make_empty_column(output_type); - auto lhs_device_view = cudf::column_device_view::create(lhs, stream); - auto rhs_device_view = cudf::column_device_view::create(rhs, stream); - return null_considering_binop{}( - *lhs_device_view, *rhs_device_view, op, output_type, lhs.size(), stream, mr); - } else { - CUDF_EXPECTS(is_boolean(output_type), "Invalid/Unsupported output datatype"); - return binary_op{}( - lhs, rhs, op, output_type, stream, mr); - } + CUDF_EXPECTS(op == binary_operator::NULL_MAX or op == binary_operator::NULL_MIN, + "Unsupported binary operation"); + CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes do not match"); + if (lhs.is_empty()) return cudf::make_empty_column(output_type); + auto lhs_device_view = cudf::column_device_view::create(lhs, stream); + auto rhs_device_view = cudf::column_device_view::create(rhs, stream); + return null_considering_binop{}( + *lhs_device_view, *rhs_device_view, op, output_type, lhs.size(), stream, mr); +} + +void operator_dispatcher(mutable_column_device_view& out, + column_device_view const& lhs, + column_device_view const& rhs, + bool is_lhs_scalar, + bool is_rhs_scalar, + binary_operator op, + rmm::cuda_stream_view stream) +{ + // clang-format off +switch (op) { +case binary_operator::ADD: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::SUB: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::MUL: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::DIV: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::TRUE_DIV: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::FLOOR_DIV: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::MOD: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::PYMOD: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::POW: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::EQUAL: +case binary_operator::NOT_EQUAL: +case binary_operator::NULL_EQUALS: +if(out.type().id() != type_id::BOOL8) CUDF_FAIL("Output type of Comparison operator should be bool type"); +dispatch_equality_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, op, stream); break; +case binary_operator::LESS: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::GREATER: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::LESS_EQUAL: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case 
binary_operator::GREATER_EQUAL: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::BITWISE_AND: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::BITWISE_OR: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::BITWISE_XOR: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::LOGICAL_AND: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::LOGICAL_OR: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +/* +case binary_operator::GENERIC_BINARY: // Cannot be compiled, should be called by jit::binary_operation +*/ +case binary_operator::SHIFT_LEFT: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::SHIFT_RIGHT: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::SHIFT_RIGHT_UNSIGNED: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::LOG_BASE: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::ATAN2: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::PMOD: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::NULL_MAX: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::NULL_MIN: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +default:; +} + // clang-format on +} + +// vector_vector +void binary_operation(mutable_column_view& out, + column_view const& lhs, + column_view const& rhs, + binary_operator op, + rmm::cuda_stream_view stream) +{ + auto lhsd = column_device_view::create(lhs, stream); + auto rhsd = column_device_view::create(rhs, stream); + auto outd = mutable_column_device_view::create(out, stream); + operator_dispatcher(*outd, *lhsd, *rhsd, false, false, op, stream); +} +// scalar_vector +void binary_operation(mutable_column_view& out, + scalar const& lhs, + column_view const& rhs, + binary_operator op, + rmm::cuda_stream_view stream) +{ + auto [lhsd, aux] = scalar_to_column_device_view(lhs, stream); + auto rhsd = column_device_view::create(rhs, stream); + auto outd = mutable_column_device_view::create(out, stream); + operator_dispatcher(*outd, *lhsd, *rhsd, true, false, op, stream); +} +// vector_scalar +void binary_operation(mutable_column_view& out, + column_view const& lhs, + scalar const& rhs, + binary_operator op, + rmm::cuda_stream_view stream) +{ + auto lhsd = column_device_view::create(lhs, stream); + auto [rhsd, aux] = scalar_to_column_device_view(rhs, stream); + auto outd = mutable_column_device_view::create(out, stream); + operator_dispatcher(*outd, *lhsd, *rhsd, false, true, op, stream); } } // namespace compiled diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh new file mode 100644 index 00000000000..b17f3eddc5d --- /dev/null +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "binary_ops.hpp" +#include "operation.cuh" + +#include +#include + +#include +#include + +namespace cudf { +namespace binops { +namespace compiled { + +template +constexpr bool is_bool_result() +{ + using ReturnType = std::invoke_result_t; + return std::is_same_v; +} + +/** + * @brief Type casts each element of the column to `CastType` + * + */ +template +struct type_casted_accessor { + template + CUDA_DEVICE_CALLABLE CastType operator()(cudf::size_type i, + column_device_view const& col, + bool is_scalar) const + { + if constexpr (column_device_view::has_element_accessor() and + std::is_convertible_v) + return static_cast(col.element(is_scalar ? 0 : i)); + return {}; + } +}; + +/** + * @brief Type casts value to column type and stores in `i`th row of the column + * + */ +template +struct typed_casted_writer { + template + CUDA_DEVICE_CALLABLE void operator()(cudf::size_type i, + mutable_column_device_view const& col, + FromType val) const + { + if constexpr (mutable_column_device_view::has_element_accessor() and + std::is_constructible_v) { + col.element(i) = static_cast(val); + } else if constexpr (is_fixed_point() and std::is_constructible_v) { + if constexpr (is_fixed_point()) + col.data()[i] = val.rescaled(numeric::scale_type{col.type().scale()}).value(); + else + col.data()[i] = Element{val, numeric::scale_type{col.type().scale()}}.value(); + } + } +}; + +// Functors to launch only defined operations. + +/** + * @brief Functor to launch only defined operations with common type. + * + * @tparam BinaryOperator binary operator functor + */ +template +struct ops_wrapper { + mutable_column_device_view& out; + column_device_view const& lhs; + column_device_view const& rhs; + bool const& is_lhs_scalar; + bool const& is_rhs_scalar; + template + __device__ void operator()(size_type i) + { + if constexpr (std::is_invocable_v) { + TypeCommon x = + type_dispatcher(lhs.type(), type_casted_accessor{}, i, lhs, is_lhs_scalar); + TypeCommon y = + type_dispatcher(rhs.type(), type_casted_accessor{}, i, rhs, is_rhs_scalar); + auto result = [&]() { + if constexpr (std::is_same_v or + std::is_same_v or + std::is_same_v) { + bool output_valid = false; + auto result = BinaryOperator{}.template operator()( + x, + y, + lhs.is_valid(is_lhs_scalar ? 0 : i), + rhs.is_valid(is_rhs_scalar ? 0 : i), + output_valid); + if (out.nullable() && !output_valid) out.set_null(i); + return result; + } else { + return BinaryOperator{}.template operator()(x, y); + } + // To supress nvcc warning + return std::invoke_result_t{}; + }(); + if constexpr (is_bool_result()) + out.element(i) = result; + else + type_dispatcher(out.type(), typed_casted_writer{}, i, out, result); + } + (void)i; + } +}; + +/** + * @brief Functor to launch only defined operations without common type. 
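+ * When the left- and right-hand types have no common type, the operator is
+ * invoked directly on the two element types (TypeLhs, TypeRhs) via double
+ * type dispatch, instead of first casting both sides to a shared type.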
+ *
+ * @tparam BinaryOperator binary operator functor
+ */
+template
+struct ops2_wrapper {
+  mutable_column_device_view& out;
+  column_device_view const& lhs;
+  column_device_view const& rhs;
+  bool const& is_lhs_scalar;
+  bool const& is_rhs_scalar;
+  template
+  __device__ void operator()(size_type i)
+  {
+    if constexpr (!has_common_type_v and
+                  std::is_invocable_v) {
+      TypeLhs x = lhs.element(is_lhs_scalar ? 0 : i);
+      TypeRhs y = rhs.element(is_rhs_scalar ? 0 : i);
+      auto result = [&]() {
+        if constexpr (std::is_same_v or
+                      std::is_same_v or
+                      std::is_same_v) {
+          bool output_valid = false;
+          auto result = BinaryOperator{}.template operator()(
+            x,
+            y,
+            lhs.is_valid(is_lhs_scalar ? 0 : i),
+            rhs.is_valid(is_rhs_scalar ? 0 : i),
+            output_valid);
+          if (out.nullable() && !output_valid) out.set_null(i);
+          return result;
+        } else {
+          return BinaryOperator{}.template operator()(x, y);
+        }
+        // To suppress nvcc warning
+        return std::invoke_result_t{};
+      }();
+      if constexpr (is_bool_result())
+        out.element(i) = result;
+      else
+        type_dispatcher(out.type(), typed_casted_writer{}, i, out, result);
+    }
+    (void)i;
+  }
+};
+
+/**
+ * @brief Functor that performs single- or double-type dispatch in device code.
+ *
+ * Single-type dispatch is used when lhs and rhs share a common type;
+ * double-type dispatch is used when they do not.
+ *
+ * @tparam BinaryOperator binary operator functor
+ */
+template
+struct device_type_dispatcher {
+  mutable_column_device_view out;
+  column_device_view lhs;
+  column_device_view rhs;
+  bool is_lhs_scalar;
+  bool is_rhs_scalar;
+  std::optional common_data_type;
+
+  __device__ void operator()(size_type i)
+  {
+    if (common_data_type) {
+      type_dispatcher(*common_data_type,
+                      ops_wrapper{out, lhs, rhs, is_lhs_scalar, is_rhs_scalar},
+                      i);
+    } else {
+      double_type_dispatcher(
+        lhs.type(),
+        rhs.type(),
+        ops2_wrapper{out, lhs, rhs, is_lhs_scalar, is_rhs_scalar},
+        i);
+    }
+  }
+};
+
+/**
+ * @brief Simplified for_each kernel
+ *
+ * @param size number of elements to process.
+ * @param f Functor object to call for each element.
+ */
+template
+__global__ void for_each_kernel(cudf::size_type size, Functor f)
+{
+  int tid    = threadIdx.x;
+  int blkid  = blockIdx.x;
+  int blksz  = blockDim.x;
+  int gridsz = gridDim.x;
+
+  int start = tid + blkid * blksz;
+  int step  = blksz * gridsz;
+
+#pragma unroll
+  for (cudf::size_type i = start; i < size; i += step) {
+    f(i);
+  }
+}
+
+/**
+ * @brief Launches the simplified for_each kernel with maximum-occupancy grid dimensions.
+ *
+ * @tparam Functor
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param size number of elements to process.
+ * @param f Functor object to call for each element.
+ */
+template
+void for_each(rmm::cuda_stream_view stream, cudf::size_type size, Functor f)
+{
+  int block_size;
+  int min_grid_size;
+  CUDA_TRY(
+    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, for_each_kernel));
+  // 2 elements per thread.
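+  // Illustrative arithmetic (hypothetical numbers, not from the source): if
+  // cudaOccupancyMaxPotentialBlockSize suggests block_size = 256 for this kernel,
+  // then for size = 10'000 elements the launch below computes
+  // div_rounding_up_safe(10'000, 2 * 256) = 20 blocks, and the grid-stride loop
+  // in for_each_kernel leaves each thread roughly two elements.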
+ const int grid_size = util::div_rounding_up_safe(size, 2 * block_size); + for_each_kernel<<>>(size, std::forward(f)); +} + +template +void apply_binary_op(mutable_column_device_view& outd, + column_device_view const& lhsd, + column_device_view const& rhsd, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view stream) +{ + auto common_dtype = get_common_type(outd.type(), lhsd.type(), rhsd.type()); + + // Create binop functor instance + auto binop_func = device_type_dispatcher{ + outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype}; + // Execute it on every element + for_each(stream, outd.size(), binop_func); +} + +} // namespace compiled +} // namespace binops +} // namespace cudf diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp index a3f62f5018e..2a814c16d57 100644 --- a/cpp/src/binaryop/compiled/binary_ops.hpp +++ b/cpp/src/binaryop/compiled/binary_ops.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,13 @@ #include +#include + namespace cudf { +// Forward declarations +class column_device_view; +class mutable_column_device_view; + namespace binops { namespace detail { /** @@ -45,6 +51,30 @@ inline bool is_null_dependent(binary_operator op) namespace compiled { +std::unique_ptr string_null_min_max( + scalar const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr string_null_min_max( + column_view const& lhs, + scalar const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr string_null_min_max( + column_view const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a binary operation between a string scalar and a string * column. @@ -123,6 +153,89 @@ std::unique_ptr binary_operation( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +void binary_operation(mutable_column_view& out, + scalar const& lhs, + column_view const& rhs, + binary_operator op, + rmm::cuda_stream_view stream); +void binary_operation(mutable_column_view& out, + column_view const& lhs, + scalar const& rhs, + binary_operator op, + rmm::cuda_stream_view stream); +void binary_operation(mutable_column_view& out, + column_view const& lhs, + column_view const& rhs, + binary_operator op, + rmm::cuda_stream_view stream); + +// Defined in util.cpp +/** + * @brief Get the common type among all input types. + * + * @param out type 1 + * @param lhs type 2 + * @param rhs type 3 + * @return common type among @p out, @p lhs, @p rhs. + */ +std::optional get_common_type(data_type out, data_type lhs, data_type rhs); +/** + * @brief Check if input binary operation is supported for the given input and output types. 
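+ *
+ * A sketch of the intent (mirroring apply_binary_op's dispatch): an operation is
+ * reported as supported only when the operator is invocable for the operand types,
+ * via their common type when one exists, and its result is constructible as the
+ * output type.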
+ * + * @param out output type of the binary operation + * @param lhs first operand type of the binary operation + * @param rhs second operand type of the binary operation + * @param op binary operator enum. + * @return true if given binary operator supports given input and output types. + */ +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op); + +// Defined in individual .cu files. +/** + * @brief Deploys single type or double type dispatcher that runs binary operation on each element + * of @p lhsd and @p rhsd columns. + * + * This template is instantiated for each binary operator. + * + * @tparam BinaryOperator Binary operator functor + * @param outd mutable device view of output column + * @param lhsd device view of left operand column + * @param rhsd device view of right operand column + * @param is_lhs_scalar true if @p lhsd is a single element column representing a scalar + * @param is_rhs_scalar true if @p rhsd is a single element column representing a scalar + * @param stream CUDA stream used for device memory operations + */ +template +void apply_binary_op(mutable_column_device_view&, + column_device_view const&, + column_device_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view stream); +/** + * @brief Deploys single type or double type dispatcher that runs equality operation on each element + * of @p lhsd and @p rhsd columns. + * + * Comparison operators are EQUAL, NOT_EQUAL, NULL_EQUALS. + * @p outd type is boolean. + * + * This template is instantiated for each binary operator. + * + * @param outd mutable device view of output column + * @param lhsd device view of left operand column + * @param rhsd device view of right operand column + * @param is_lhs_scalar true if @p lhsd is a single element column representing a scalar + * @param is_rhs_scalar true if @p rhsd is a single element column representing a scalar + * @param op comparison binary operator + * @param stream CUDA stream used for device memory operations + */ +void dispatch_equality_op(mutable_column_device_view& outd, + column_device_view const& lhsd, + column_device_view const& rhsd, + bool is_lhs_scalar, + bool is_rhs_scalar, + binary_operator op, + rmm::cuda_stream_view stream); } // namespace compiled } // namespace binops } // namespace cudf diff --git a/cpp/src/binaryop/compiled/equality_ops.cu b/cpp/src/binaryop/compiled/equality_ops.cu new file mode 100644 index 00000000000..feee310716a --- /dev/null +++ b/cpp/src/binaryop/compiled/equality_ops.cu @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +void dispatch_equality_op(mutable_column_device_view& outd, + column_device_view const& lhsd, + column_device_view const& rhsd, + bool is_lhs_scalar, + bool is_rhs_scalar, + binary_operator op, + rmm::cuda_stream_view stream) +{ + auto common_dtype = get_common_type(outd.type(), lhsd.type(), rhsd.type()); + + // Execute it on every element + for_each( + stream, + outd.size(), + [op, outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype] __device__(size_type i) { + // clang-format off + // Similar enabled template types should go together (better performance) + switch (op) { + case binary_operator::EQUAL: device_type_dispatcher{outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype}(i); break; + case binary_operator::NOT_EQUAL: device_type_dispatcher{outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype}(i); break; + case binary_operator::NULL_EQUALS: device_type_dispatcher{outd, lhsd, rhsd, is_lhs_scalar, is_rhs_scalar, common_dtype}(i); break; + default:; + } + // clang-format on + }); +} +} // namespace cudf::binops::compiled diff --git a/cpp/src/binaryop/compiled/operation.cuh b/cpp/src/binaryop/compiled/operation.cuh new file mode 100644 index 00000000000..86645e2cb8a --- /dev/null +++ b/cpp/src/binaryop/compiled/operation.cuh @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace binops { +namespace compiled { + +// All binary operations +namespace ops { + +struct Add { + template + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> decltype(lhs + rhs) + { + return lhs + rhs; + } +}; + +struct Sub { + template + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> decltype(lhs - rhs) + { + return lhs - rhs; + } +}; + +struct Mul { + template + static constexpr inline bool is_supported() + { + return has_common_type_v or + // FIXME: without the following line, compilation error + // _deps/libcudacxx-src/include/cuda/std/detail/libcxx/include/chrono(917): error: + // identifier "cuda::std::__3::ratio<(long)86400000000l, (long)1l> ::num" is undefined in + // device code + (is_duration() and std::is_integral()) or + (std::is_integral() and is_duration()) or + (is_fixed_point() and is_numeric()) or + (is_numeric() and is_fixed_point()); + } + template ()>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> decltype(lhs * rhs) + { + return lhs * rhs; + } +}; + +struct Div { + template + static constexpr inline bool is_supported() + { + return has_common_type_v or + // FIXME: without this, compilation error on chrono:917 + (is_duration() and (std::is_integral() or is_duration())) or + (is_fixed_point() and is_numeric()) or + (is_numeric() and is_fixed_point()); + } + template ()>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> decltype(lhs / rhs) + { + return lhs / rhs; + } +}; + +struct TrueDiv { + template + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) + -> decltype((static_cast(lhs) / static_cast(rhs))) + { + return (static_cast(lhs) / static_cast(rhs)); + } +}; + +struct FloorDiv { + template + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) + -> decltype(floor(static_cast(lhs) / static_cast(rhs))) + { + return floor(static_cast(lhs) / static_cast(rhs)); + } +}; + +struct Mod { + template + static constexpr inline bool is_supported() + { + return has_common_type_v or + // FIXME: without this, compilation error + //_deps/libcudacxx-src/include/cuda/std/detail/libcxx/include/chrono(1337): + // error : expression must have integral or unscoped enum type + (is_duration() and (std::is_integral() or is_duration())); + } + template ()>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> decltype(lhs % rhs) + { + return lhs % rhs; + } + template >)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> float + { + return fmodf(static_cast(lhs), static_cast(rhs)); + } + template >)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(T1 const& lhs, T2 const& rhs) -> double + { + return fmod(static_cast(lhs), static_cast(rhs)); + } +}; + +struct PMod { + // Ideally, these two specializations - one for integral types and one for non integral + // types shouldn't be required, as std::fmod should promote integral types automatically + // to double and call the std::fmod overload for doubles. 
Sadly, doing this in jitified + // code does not work - it is having trouble deciding between float/double overloads + template >)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) + { + using common_t = std::common_type_t; + common_t xconv = static_cast(x); + common_t yconv = static_cast(y); + auto rem = xconv % yconv; + if constexpr (std::is_signed_v) + if (rem < 0) rem = (rem + yconv) % yconv; + return rem; + } + + template < + typename TypeLhs, + typename TypeRhs, + std::enable_if_t<(std::is_floating_point_v>)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) + { + using common_t = std::common_type_t; + common_t xconv = static_cast(x); + common_t yconv = static_cast(y); + auto rem = std::fmod(xconv, yconv); + if (rem < 0) rem = std::fmod(rem + yconv, yconv); + return rem; + } +}; + +struct PyMod { + template >)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(((x % y) + y) % y) + { + return ((x % y) + y) % y; + } + + template < + typename TypeLhs, + typename TypeRhs, + std::enable_if_t<(std::is_floating_point_v>)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> double + { + double x1 = static_cast(x); + double y1 = static_cast(y); + return fmod(fmod(x1, y1) + y1, y1); + } + + template ())>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(((x % y) + y) % y) + { + return ((x % y) + y) % y; + } +}; + +struct Pow { + template and + std::is_convertible_v)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> double + { + return pow(static_cast(x), static_cast(y)); + } +}; + +struct LogBase { + template and + std::is_convertible_v)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> double + { + return (std::log(static_cast(x)) / std::log(static_cast(y))); + } +}; + +struct ATan2 { + template and + std::is_convertible_v)>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> double + { + return std::atan2(static_cast(x), static_cast(y)); + } +}; + +struct ShiftLeft { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x << y) + { + return (x << y); + } +}; + +struct ShiftRight { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x >> y) + { + return (x >> y); + } +}; + +struct ShiftRightUnsigned { + template < + typename TypeLhs, + typename TypeRhs, + std::enable_if_t<(std::is_integral_v and not is_boolean())>* = nullptr> + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) + -> decltype(static_cast>(x) >> y) + { + return (static_cast>(x) >> y); + } +}; + +struct BitwiseAnd { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x & y) + { + return (x & y); + } +}; + +struct BitwiseOr { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x | y) + { + return (x | y); + } +}; + +struct BitwiseXor { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x ^ y) + { + return (x ^ y); + } +}; + +struct LogicalAnd { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x && y) + { + return (x && y); + } +}; + +struct LogicalOr { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x || y) + { + return (x || y); + } +}; + +struct Equal { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x == y) + { + return (x == y); + } +}; + 
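+// A sketch of how these comparison functors are exercised (illustrative, not part
+// of the operator set): the std::is_invocable_v checks in ops_wrapper and
+// is_binary_operation_supported probe expressions such as
+//   ops::Equal{}(int32_t{1}, double{1.0})  // well-formed mixed comparison -> true
+//   ops::Less{}(cudf::string_view{}, 1)    // ill-formed -> operation rejected
+// so a type pair is accepted only when the underlying C++ expression compiles.
+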
+struct NotEqual { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x != y) + { + return (x != y); + } +}; + +struct Less { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x < y) + { + return (x < y); + } +}; + +struct Greater { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x > y) + { + return (x > y); + } +}; + +struct LessEqual { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x <= y) + { + return (x <= y); + } +}; + +struct GreaterEqual { + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x >= y) + { + return (x >= y); + } +}; + +struct NullEquals { + template + CUDA_DEVICE_CALLABLE auto operator()( + TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) -> decltype(x == y) + { + output_valid = true; + if (!lhs_valid && !rhs_valid) return true; + if (lhs_valid && rhs_valid) return x == y; + return false; + } + // To allow std::is_invocable_v = true + template + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) -> decltype(x == y); +}; + +struct NullMax { + template > + CUDA_DEVICE_CALLABLE auto operator()( + TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) + -> decltype(static_cast(static_cast(x) > static_cast(y) ? x : y)) + { + output_valid = true; + auto const x_conv = static_cast(x); + auto const y_conv = static_cast(y); + if (!lhs_valid && !rhs_valid) { + output_valid = false; + return common_t{}; + } else if (lhs_valid && rhs_valid) { + return (x_conv > y_conv) ? x_conv : y_conv; + } else if (lhs_valid) + return x_conv; + else + return y_conv; + } + // To allow std::is_invocable_v = true + template > + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) + -> decltype(static_cast(static_cast(x) > static_cast(y) ? x : y)); +}; + +struct NullMin { + template > + CUDA_DEVICE_CALLABLE auto operator()( + TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) + -> decltype(static_cast(static_cast(x) < static_cast(y) ? x : y)) + { + output_valid = true; + auto const x_conv = static_cast(x); + auto const y_conv = static_cast(y); + if (!lhs_valid && !rhs_valid) { + output_valid = false; + return common_t{}; + } else if (lhs_valid && rhs_valid) { + return (x_conv < y_conv) ? x_conv : y_conv; + } else if (lhs_valid) + return x_conv; + else + return y_conv; + } + // To allow std::is_invocable_v = true + template > + CUDA_DEVICE_CALLABLE auto operator()(TypeLhs x, TypeRhs y) + -> decltype(static_cast(static_cast(x) < static_cast(y) ? x : y)); +}; + +} // namespace ops +} // namespace compiled +} // namespace binops +} // namespace cudf diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp new file mode 100644 index 00000000000..89320256aec --- /dev/null +++ b/cpp/src/binaryop/compiled/util.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "operation.cuh"
+
+#include
+#include
+#include
+#include
+
+namespace cudf::binops::compiled {
+
+namespace {
+/**
+ * @brief Functor that returns the optional common type of 2 or 3 given types.
+ *
+ */
+struct common_type_functor {
+  template
+  struct nested_common_type_functor {
+    template
+    std::optional operator()()
+    {
+      // If common_type exists
+      if constexpr (cudf::has_common_type_v) {
+        using TypeCommon = typename std::common_type::type;
+        return data_type{type_to_id()};
+      } else if constexpr (cudf::has_common_type_v) {
+        using TypeCommon = typename std::common_type::type;
+        // E.g. d = t - t: duration = timestamp - timestamp
+        return data_type{type_to_id()};
+      }
+      return {};
+    }
+  };
+  template
+  std::optional operator()(data_type out)
+  {
+    return type_dispatcher(out, nested_common_type_functor{});
+  }
+};
+
+/**
+ * @brief Functor that returns true if BinaryOperator supports the given input and output types.
+ *
+ * @tparam BinaryOperator binary operator functor
+ */
+template
+struct is_binary_operation_supported {
+  // For operations where the Out type is fixed (e.g. comparison ops)
+  template
+  inline constexpr bool operator()(void)
+  {
+    if constexpr (column_device_view::has_element_accessor() and
+                  column_device_view::has_element_accessor()) {
+      if constexpr (has_common_type_v) {
+        using common_t = std::common_type_t;
+        return std::is_invocable_v;
+      } else
+        return std::is_invocable_v;
+    } else {
+      return false;
+    }
+  }
+
+  template
+  inline constexpr bool operator()(void)
+  {
+    if constexpr (column_device_view::has_element_accessor() and
+                  column_device_view::has_element_accessor() and
+                  (mutable_column_device_view::has_element_accessor() or
+                   is_fixed_point())) {
+      if constexpr (has_common_type_v) {
+        using common_t = std::common_type_t;
+        if constexpr (std::is_invocable_v) {
+          using ReturnType = std::invoke_result_t;
+          return std::is_constructible_v;
+        }
+      } else {
+        if constexpr (std::is_invocable_v) {
+          using ReturnType = std::invoke_result_t;
+          return std::is_constructible_v;
+        }
+      }
+    }
+    return false;
+  }
+};
+
+struct is_supported_operation_functor {
+  template
+  struct nested_support_functor {
+    template
+    inline constexpr bool call()
+    {
+      return is_binary_operation_supported{}
+        .template operator()();
+    }
+    template
+    inline constexpr bool operator()(binary_operator op)
+    {
+      switch (op) {
+        // clang-format off
+        case binary_operator::ADD: return call();
+        case binary_operator::SUB: return call();
+        case binary_operator::MUL: return call();
+        case binary_operator::DIV: return call();
+        case binary_operator::TRUE_DIV: return call();
+        case binary_operator::FLOOR_DIV: return call();
+        case binary_operator::MOD: return call();
+        case binary_operator::PYMOD: return call();
+        case binary_operator::POW: return call();
+        case binary_operator::BITWISE_AND: return call();
+        case binary_operator::BITWISE_OR: return call();
+        case binary_operator::BITWISE_XOR: return call();
+        case binary_operator::SHIFT_LEFT: return call();
+        case binary_operator::SHIFT_RIGHT: return call();
+        case binary_operator::SHIFT_RIGHT_UNSIGNED: return call();
+        case binary_operator::LOG_BASE: return call();
+        case binary_operator::ATAN2: return call();
+        case binary_operator::PMOD: return call();
+        case binary_operator::NULL_MAX: return call();
+        case binary_operator::NULL_MIN: return call();
+        /*
+        case binary_operator::GENERIC_BINARY: // defined in jit only.
+ */ + default: return false; + // clang-format on + } + } + }; + + template + inline constexpr bool bool_op(data_type out) + { + return out.id() == type_id::BOOL8 and + is_binary_operation_supported{}.template operator()(); + } + template + inline constexpr bool operator()(data_type out, binary_operator op) + { + switch (op) { + // output type should be bool type. + case binary_operator::LOGICAL_AND: return bool_op(out); + case binary_operator::LOGICAL_OR: return bool_op(out); + case binary_operator::EQUAL: return bool_op(out); + case binary_operator::NOT_EQUAL: return bool_op(out); + case binary_operator::LESS: return bool_op(out); + case binary_operator::GREATER: return bool_op(out); + case binary_operator::LESS_EQUAL: return bool_op(out); + case binary_operator::GREATER_EQUAL: return bool_op(out); + case binary_operator::NULL_EQUALS: return bool_op(out); + default: return type_dispatcher(out, nested_support_functor{}, op); + } + return false; + } +}; + +} // namespace + +std::optional get_common_type(data_type out, data_type lhs, data_type rhs) +{ + return double_type_dispatcher(lhs, rhs, common_type_functor{}, out); +} + +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op) +{ + return double_type_dispatcher(lhs, rhs, is_supported_operation_functor{}, out, op); +} +} // namespace cudf::binops::compiled diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 28d1411c30d..c3add0ea97e 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -80,7 +80,7 @@ namespace detail { rmm::device_buffer create_null_mask(size_type size, mask_state state, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { size_type mask_size{0}; @@ -91,14 +91,14 @@ rmm::device_buffer create_null_mask(size_type size, if (state != mask_state::UNINITIALIZED) { uint8_t fill_value = (state == mask_state::ALL_VALID) ? 
0xff : 0x00; CUDA_TRY(cudaMemsetAsync( - static_cast(mask.data()), fill_value, mask_size, stream.value())); + static_cast(mask.data()), fill_value, mask_size, stream.value())); } return mask; } namespace { -__global__ void set_null_mask_kernel(bitmask_type *__restrict__ destination, +__global__ void set_null_mask_kernel(bitmask_type* __restrict__ destination, size_type begin_bit, size_type end_bit, bool valid, @@ -130,7 +130,7 @@ __global__ void set_null_mask_kernel(bitmask_type *__restrict__ destination, // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; -void set_null_mask(bitmask_type *bitmask, +void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid, @@ -145,7 +145,7 @@ void set_null_mask(bitmask_type *bitmask, num_bitmask_words(end_bit) - begin_bit / detail::size_in_bits(); cudf::detail::grid_1d config(number_of_mask_words, 256); set_null_mask_kernel<<>>( - static_cast(bitmask), begin_bit, end_bit, valid, number_of_mask_words); + static_cast(bitmask), begin_bit, end_bit, valid, number_of_mask_words); CHECK_CUDA(stream.value()); } } @@ -155,14 +155,14 @@ void set_null_mask(bitmask_type *bitmask, // Create a device_buffer for a null mask rmm::device_buffer create_null_mask(size_type size, mask_state state, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { return detail::create_null_mask(size, state, rmm::cuda_stream_default, mr); } // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; -void set_null_mask(bitmask_type *bitmask, size_type begin_bit, size_type end_bit, bool valid) +void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid) { return detail::set_null_mask(bitmask, begin_bit, end_bit, valid); } @@ -181,10 +181,10 @@ namespace { * @param[out] global_count The number of non-zero bits in the specified range */ template -__global__ void count_set_bits_kernel(bitmask_type const *bitmask, +__global__ void count_set_bits_kernel(bitmask_type const* bitmask, size_type first_bit_index, size_type last_bit_index, - size_type *global_count) + size_type* global_count) { constexpr auto const word_size{detail::size_in_bits()}; @@ -215,7 +215,7 @@ __global__ void count_set_bits_kernel(bitmask_type const *bitmask, if (num_slack_bits > 0) { bitmask_type word = bitmask[word_index]; auto slack_mask = (first) ? 
set_least_significant_bits(num_slack_bits) - : set_most_significant_bits(num_slack_bits); + : set_most_significant_bits(num_slack_bits); thread_count -= __popc(word & slack_mask); } @@ -248,7 +248,7 @@ __global__ void count_set_bits_kernel(bitmask_type const *bitmask, * updated */ template -__global__ void subtract_set_bits_range_boundaries_kerenel(bitmask_type const *bitmask, +__global__ void subtract_set_bits_range_boundaries_kerenel(bitmask_type const* bitmask, size_type num_ranges, OffsetIterator first_bit_indices, OffsetIterator last_bit_indices, @@ -305,8 +305,8 @@ __global__ void subtract_set_bits_range_boundaries_kerenel(bitmask_type const *b * @param number_of_mask_words The number of `cudf::bitmask_type` words to copy */ // TODO: Also make binops test that uses offset in column_view -__global__ void copy_offset_bitmask(bitmask_type *__restrict__ destination, - bitmask_type const *__restrict__ source, +__global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination, + bitmask_type const* __restrict__ source, size_type source_begin_bit, size_type source_end_bit, size_type number_of_mask_words) @@ -323,7 +323,7 @@ __global__ void copy_offset_bitmask(bitmask_type *__restrict__ destination, // [first_word_index,last_word_index) struct to_word_index : public thrust::unary_function { const bool _inclusive = false; - size_type const *const _d_bit_indices = nullptr; + size_type const* const _d_bit_indices = nullptr; /** * @brief Constructor of a functor that converts bit indices to bitmask word @@ -333,12 +333,12 @@ struct to_word_index : public thrust::unary_function { * or exclusive. * @param[in] d_bit_indices Pointer to an array of bit indices */ - __host__ to_word_index(bool inclusive, size_type const *d_bit_indices) + __host__ to_word_index(bool inclusive, size_type const* d_bit_indices) : _inclusive(inclusive), _d_bit_indices(d_bit_indices) { } - __device__ size_type operator()(const size_type &i) const + __device__ size_type operator()(const size_type& i) const { auto bit_index = _d_bit_indices[i]; return word_index(bit_index) + ((_inclusive || intra_word_index(bit_index) == 0) ? 
0 : 1); @@ -350,11 +350,11 @@ struct to_word_index : public thrust::unary_function { namespace detail { // Create a bitmask from a specific range -rmm::device_buffer copy_bitmask(bitmask_type const *mask, +rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type begin_bit, size_type end_bit, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(begin_bit >= 0, "Invalid range."); @@ -363,26 +363,22 @@ rmm::device_buffer copy_bitmask(bitmask_type const *mask, auto num_bytes = bitmask_allocation_size_bytes(end_bit - begin_bit); if ((mask == nullptr) || (num_bytes == 0)) { return dest_mask; } if (begin_bit == 0) { - dest_mask = rmm::device_buffer{static_cast(mask), num_bytes, stream, mr}; + dest_mask = rmm::device_buffer{static_cast(mask), num_bytes, stream, mr}; } else { auto number_of_mask_words = num_bitmask_words(end_bit - begin_bit); dest_mask = rmm::device_buffer{num_bytes, stream, mr}; cudf::detail::grid_1d config(number_of_mask_words, 256); copy_offset_bitmask<<>>( - static_cast(dest_mask.data()), - mask, - begin_bit, - end_bit, - number_of_mask_words); + static_cast(dest_mask.data()), mask, begin_bit, end_bit, number_of_mask_words); CHECK_CUDA(stream.value()); } return dest_mask; } // Create a bitmask from a column view -rmm::device_buffer copy_bitmask(column_view const &view, +rmm::device_buffer copy_bitmask(column_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; @@ -395,11 +391,11 @@ rmm::device_buffer copy_bitmask(column_view const &view, // Inplace Bitwise AND of the masks void inplace_bitmask_and(device_span dest_mask, - host_span masks, + host_span masks, host_span begin_bits, size_type mask_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { inplace_bitmask_binop( [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, @@ -412,11 +408,11 @@ void inplace_bitmask_and(device_span dest_mask, } // Bitwise AND of the masks -rmm::device_buffer bitmask_and(host_span masks, +rmm::device_buffer bitmask_and(host_span masks, host_span begin_bits, size_type mask_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { return bitmask_binop( [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, @@ -427,7 +423,7 @@ rmm::device_buffer bitmask_and(host_span masks, mr); } -cudf::size_type count_set_bits(bitmask_type const *bitmask, +cudf::size_type count_set_bits(bitmask_type const* bitmask, size_type start, size_type stop, rmm::cuda_stream_view stream = rmm::cuda_stream_default) @@ -455,7 +451,7 @@ cudf::size_type count_set_bits(bitmask_type const *bitmask, return non_zero_count.value(stream); } -cudf::size_type count_unset_bits(bitmask_type const *bitmask, +cudf::size_type count_unset_bits(bitmask_type const* bitmask, size_type start, size_type stop, rmm::cuda_stream_view stream = rmm::cuda_stream_default) @@ -465,7 +461,7 @@ cudf::size_type count_unset_bits(bitmask_type const *bitmask, return (num_bits - detail::count_set_bits(bitmask, start, stop, stream)); } -std::vector segmented_count_set_bits(bitmask_type const *bitmask, +std::vector segmented_count_set_bits(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream) { @@ -517,7 +513,7 @@ std::vector 
segmented_count_set_bits(bitmask_type const *bitmask, // first_word_indices and last_word_indices to have the same type. to_word_index(false, d_last_indices.data())); - // first allocate temporary memroy + // first allocate temporary memory size_t temp_storage_bytes{0}; CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr, @@ -570,7 +566,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, return ret; } -std::vector segmented_count_unset_bits(bitmask_type const *bitmask, +std::vector segmented_count_unset_bits(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream) { @@ -591,17 +587,17 @@ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, } // Returns the bitwise AND of the null masks of all columns in the table view -rmm::device_buffer bitmask_and(table_view const &view, +rmm::device_buffer bitmask_and(table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; if (view.num_rows() == 0 or view.num_columns() == 0) { return null_mask; } - std::vector masks; + std::vector masks; std::vector offsets; - for (auto &&col : view) { + for (auto&& col : view) { if (col.nullable()) { masks.push_back(col.null_mask()); offsets.push_back(col.offset()); @@ -622,17 +618,17 @@ rmm::device_buffer bitmask_and(table_view const &view, } // Returns the bitwise OR of the null masks of all columns in the table view -rmm::device_buffer bitmask_or(table_view const &view, +rmm::device_buffer bitmask_or(table_view const& view, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; if (view.num_rows() == 0 or view.num_columns() == 0) { return null_mask; } - std::vector masks; + std::vector masks; std::vector offsets; - for (auto &&col : view) { + for (auto&& col : view) { if (col.nullable()) { masks.push_back(col.null_mask()); offsets.push_back(col.offset()); @@ -654,21 +650,21 @@ rmm::device_buffer bitmask_or(table_view const &view, } // namespace detail // Count non-zero bits in the specified range -cudf::size_type count_set_bits(bitmask_type const *bitmask, size_type start, size_type stop) +cudf::size_type count_set_bits(bitmask_type const* bitmask, size_type start, size_type stop) { CUDF_FUNC_RANGE(); return detail::count_set_bits(bitmask, start, stop); } // Count zero bits in the specified range -cudf::size_type count_unset_bits(bitmask_type const *bitmask, size_type start, size_type stop) +cudf::size_type count_unset_bits(bitmask_type const* bitmask, size_type start, size_type stop) { CUDF_FUNC_RANGE(); return detail::count_unset_bits(bitmask, start, stop); } // Count non-zero bits in the specified ranges -std::vector segmented_count_set_bits(bitmask_type const *bitmask, +std::vector segmented_count_set_bits(bitmask_type const* bitmask, host_span indices) { CUDF_FUNC_RANGE(); @@ -676,7 +672,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, } // Count zero bits in the specified ranges -std::vector segmented_count_unset_bits(bitmask_type const *bitmask, +std::vector segmented_count_unset_bits(bitmask_type const* bitmask, host_span indices) { CUDF_FUNC_RANGE(); @@ -684,26 +680,26 @@ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, } // Create a bitmask from a specific range -rmm::device_buffer copy_bitmask(bitmask_type const *mask, +rmm::device_buffer 
copy_bitmask(bitmask_type const* mask, size_type begin_bit, size_type end_bit, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { return detail::copy_bitmask(mask, begin_bit, end_bit, rmm::cuda_stream_default, mr); } // Create a bitmask from a column view -rmm::device_buffer copy_bitmask(column_view const &view, rmm::mr::device_memory_resource *mr) +rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_resource* mr) { return detail::copy_bitmask(view, rmm::cuda_stream_default, mr); } -rmm::device_buffer bitmask_and(table_view const &view, rmm::mr::device_memory_resource *mr) +rmm::device_buffer bitmask_and(table_view const& view, rmm::mr::device_memory_resource* mr) { return detail::bitmask_and(view, rmm::cuda_stream_default, mr); } -rmm::device_buffer bitmask_or(table_view const &view, rmm::mr::device_memory_resource *mr) +rmm::device_buffer bitmask_or(table_view const& view, rmm::mr::device_memory_resource* mr) { return detail::bitmask_or(view, rmm::cuda_stream_default, mr); } diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu index 3ee8e0a33a9..2a0496b316b 100644 --- a/cpp/src/column/column.cu +++ b/cpp/src/column/column.cu @@ -45,9 +45,9 @@ namespace cudf { // Copy ctor w/ optional stream/mr -column::column(column const &other, +column::column(column const& other, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _type{other._type}, _size{other._size}, _data{other._data, stream, mr}, @@ -55,13 +55,13 @@ column::column(column const &other, _null_count{other._null_count} { _children.reserve(other.num_children()); - for (auto const &c : other._children) { + for (auto const& c : other._children) { _children.emplace_back(std::make_unique(*c, stream, mr)); } } // Move constructor -column::column(column &&other) noexcept +column::column(column&& other) noexcept : _type{other._type}, _size{other._size}, _data{std::move(other._data)}, @@ -91,12 +91,14 @@ column_view column::view() const // Create views of children std::vector child_views; child_views.reserve(_children.size()); - for (auto const &c : _children) { child_views.emplace_back(*c); } + for (auto const& c : _children) { + child_views.emplace_back(*c); + } return column_view{type(), size(), _data.data(), - static_cast(_null_mask.data()), + static_cast(_null_mask.data()), null_count(), 0, child_views}; @@ -110,7 +112,9 @@ mutable_column_view column::mutable_view() // create views of children std::vector child_views; child_views.reserve(_children.size()); - for (auto const &c : _children) { child_views.emplace_back(*c); } + for (auto const& c : _children) { + child_views.emplace_back(*c); + } // Store the old null count before resetting it. By accessing the value directly instead of // calling `null_count()`, we can avoid a potential invocation of `count_unset_bits()`. 
This does @@ -126,7 +130,7 @@ mutable_column_view column::mutable_view() return mutable_column_view{type(), size(), _data.data(), - static_cast(_null_mask.data()), + static_cast(_null_mask.data()), current_null_count, 0, child_views}; @@ -138,12 +142,12 @@ size_type column::null_count() const CUDF_FUNC_RANGE(); if (_null_count <= cudf::UNKNOWN_NULL_COUNT) { _null_count = - cudf::count_unset_bits(static_cast(_null_mask.data()), 0, size()); + cudf::count_unset_bits(static_cast(_null_mask.data()), 0, size()); } return _null_count; } -void column::set_null_mask(rmm::device_buffer &&new_null_mask, size_type new_null_count) +void column::set_null_mask(rmm::device_buffer&& new_null_mask, size_type new_null_count) { if (new_null_count > 0) { CUDF_EXPECTS(new_null_mask.size() >= cudf::bitmask_allocation_size_bytes(this->size()), @@ -154,7 +158,7 @@ void column::set_null_mask(rmm::device_buffer &&new_null_mask, size_type new_nul _null_count = new_null_count; } -void column::set_null_mask(rmm::device_buffer const &new_null_mask, +void column::set_null_mask(rmm::device_buffer const& new_null_mask, size_type new_null_count, rmm::cuda_stream_view stream) { @@ -177,10 +181,10 @@ namespace { struct create_column_from_view { cudf::column_view view; rmm::cuda_stream_view stream{}; - rmm::mr::device_memory_resource *mr; + rmm::mr::device_memory_resource* mr; template ::value> * = nullptr> + std::enable_if_t::value>* = nullptr> std::unique_ptr operator()() { cudf::strings_column_view sview(view); @@ -188,7 +192,7 @@ struct create_column_from_view { } template ::value> * = nullptr> + std::enable_if_t::value>* = nullptr> std::unique_ptr operator()() { std::vector> children; @@ -211,10 +215,10 @@ struct create_column_from_view { std::move(children)); } - template ()> * = nullptr> + template ()>* = nullptr> std::unique_ptr operator()() { - auto op = [&](auto const &child) { return std::make_unique(child, stream, mr); }; + auto op = [&](auto const& child) { return std::make_unique(child, stream, mr); }; auto begin = thrust::make_transform_iterator(view.child_begin(), op); auto children = std::vector>(begin, begin + view.num_children()); @@ -222,7 +226,7 @@ struct create_column_from_view { view.type(), view.size(), rmm::device_buffer{ - static_cast(view.head()) + (view.offset() * cudf::size_of(view.type())), + static_cast(view.head()) + (view.offset() * cudf::size_of(view.type())), view.size() * cudf::size_of(view.type()), stream, mr}, @@ -232,7 +236,7 @@ struct create_column_from_view { } template ::value> * = nullptr> + std::enable_if_t::value>* = nullptr> std::unique_ptr operator()() { auto lists_view = lists_column_view(view); @@ -240,7 +244,7 @@ struct create_column_from_view { } template ::value> * = nullptr> + std::enable_if_t::value>* = nullptr> std::unique_ptr operator()() { if (view.is_empty()) { return cudf::empty_like(view); } @@ -271,7 +275,7 @@ struct create_column_from_view { } // anonymous namespace // Copy from a view -column::column(column_view view, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) +column::column(column_view view, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : // Move is needed here because the dereference operator of unique_ptr returns // an lvalue reference, which would otherwise dispatch to the copy constructor column{std::move(*type_dispatcher(view.type(), create_column_from_view{view, stream, mr}))} diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 4b11382a3f2..d4d54a3f94f 100644 --- 
a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -53,7 +53,7 @@ inline __device__ std::size_t _round_up_safe(std::size_t number_to_round, std::s * The definition of "buffer" used throughout this module is a component piece of a * cudf column. So for example, a fixed-width column with validity would have 2 associated * buffers : the data itself and the validity buffer. contiguous_split operates by breaking - * each column up into it's individual components and copying each one as a seperate kernel + * each column up into it's individual components and copying each one as a separate kernel * block. */ struct src_buf_info { @@ -188,7 +188,7 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, } // if we're performing a value shift (offsets), or a bit shift (validity) the # of bytes and - // alignment must be a multiple of 4. value shifting and bit shifting are mututally exclusive + // alignment must be a multiple of 4. value shifting and bit shifting are mutually exclusive // and will never both be true at the same time. if (value_shift || bit_shift) { std::size_t idx = (num_bytes - remainder) / 4; @@ -249,7 +249,7 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, * * @param num_src_bufs Total number of source buffers (N) * @param src_bufs Input source buffers (N) - * @param dst_bufs Desination buffers (N*M) + * @param dst_bufs Destination buffers (N*M) * @param buf_info Information on the range of values to be copied for each destination buffer. */ template diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index 67d96bbc7ce..9456ae06b21 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -34,17 +34,17 @@ namespace detail { namespace { struct get_element_functor { - template () && !is_fixed_point()> *p = nullptr> + template () && !is_fixed_point()>* p = nullptr> std::unique_ptr operator()( - column_view const &input, + column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto s = make_fixed_width_scalar(data_type(type_to_id()), stream, mr); using ScalarType = cudf::scalar_type_t; - auto typed_s = static_cast(s.get()); + auto typed_s = static_cast(s.get()); auto device_s = get_scalar_device_view(*typed_s); auto device_col = column_device_view::create(input, stream); @@ -58,12 +58,12 @@ struct get_element_functor { return s; } - template ::value> *p = nullptr> + template ::value>* p = nullptr> std::unique_ptr operator()( - column_view const &input, + column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto device_col = column_device_view::create(input, stream); @@ -83,12 +83,12 @@ struct get_element_functor { return std::make_unique(temp_data, temp_valid.value(stream), stream, mr); } - template ::value> *p = nullptr> + template ::value>* p = nullptr> std::unique_ptr operator()( - column_view const &input, + column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto dict_view = dictionary_column_view(input); auto indices_iter = 
detail::indexalator_factory::make_input_iterator(dict_view.indices()); @@ -119,12 +119,12 @@ struct get_element_functor { mr); } - template ::value> *p = nullptr> + template ::value>* p = nullptr> std::unique_ptr operator()( - column_view const &input, + column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { bool valid = is_element_valid_sync(input, index, stream); auto const child_col_idx = lists_column_view::child_column_index; @@ -144,12 +144,12 @@ struct get_element_functor { } } - template ()> *p = nullptr> + template ()>* p = nullptr> std::unique_ptr operator()( - column_view const &input, + column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { using Type = typename T::rep; @@ -175,12 +175,12 @@ struct get_element_functor { mr); } - template ::value> *p = nullptr> + template ::value>* p = nullptr> std::unique_ptr operator()( - column_view const &input, + column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { bool valid = is_element_valid_sync(input, index, stream); auto row_contents = @@ -192,10 +192,10 @@ struct get_element_functor { } // namespace -std::unique_ptr get_element(column_view const &input, +std::unique_ptr get_element(column_view const& input, size_type index, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(index >= 0 and index < input.size(), "Index out of bounds"); return type_dispatcher(input.type(), get_element_functor{}, input, index, stream, mr); @@ -203,9 +203,9 @@ std::unique_ptr get_element(column_view const &input, } // namespace detail -std::unique_ptr get_element(column_view const &input, +std::unique_ptr get_element(column_view const& input, size_type index, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { return detail::get_element(input, index, rmm::cuda_stream_default, mr); } diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index 182e3ff0584..89e5972f448 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -145,7 +145,7 @@ packed_columns pack(cudf::table_view const& input, // do a contiguous_split with no splits to get the memory for the table // arranged as we want it auto contig_split_result = cudf::detail::contiguous_split(input, {}, stream, mr); - return std::move(contig_split_result[0].data); + return contig_split_result.empty() ? packed_columns{} : std::move(contig_split_result[0].data); } template @@ -229,7 +229,9 @@ packed_columns::metadata pack_metadata(table_view const& table, size_t buffer_size) { CUDF_FUNC_RANGE(); - return detail::pack_metadata(table.begin(), table.end(), contiguous_buffer, buffer_size); + return table.is_empty() + ? 
packed_columns::metadata{} + : detail::pack_metadata(table.begin(), table.end(), contiguous_buffer, buffer_size); } /** @@ -238,8 +240,10 @@ packed_columns::metadata pack_metadata(table_view const& table, table_view unpack(packed_columns const& input) { CUDF_FUNC_RANGE(); - return detail::unpack(input.metadata_->data(), - reinterpret_cast(input.gpu_data->data())); + return input.metadata_->size() == 0 + ? table_view{} + : detail::unpack(input.metadata_->data(), + reinterpret_cast(input.gpu_data->data())); } /** diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index a932957ada4..3312316f548 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -63,32 +65,32 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination, } template -void scatter_scalar_bitmask(std::vector> const& source, - MapIterator scatter_map, - size_type num_scatter_rows, - std::vector>& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +void scatter_scalar_bitmask_inplace(std::reference_wrapper const& source, + MapIterator scatter_map, + size_type num_scatter_rows, + column& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { constexpr size_type block_size = 256; size_type const grid_size = grid_1d(num_scatter_rows, block_size).num_blocks; - for (size_t i = 0; i < target.size(); ++i) { - auto const source_is_valid = source[i].get().is_valid(stream); - if (target[i]->nullable() or not source_is_valid) { - if (not target[i]->nullable()) { - // Target must have a null mask if the source is not valid - auto mask = detail::create_null_mask(target[i]->size(), mask_state::ALL_VALID, stream, mr); - target[i]->set_null_mask(std::move(mask), 0); - } - - auto target_view = mutable_column_device_view::create(target[i]->mutable_view(), stream); - - auto bitmask_kernel = source_is_valid ? marking_bitmask_kernel - : marking_bitmask_kernel; - bitmask_kernel<<>>( - *target_view, scatter_map, num_scatter_rows); + auto const source_is_valid = source.get().is_valid(stream); + if (target.nullable() or not source_is_valid) { + if (not target.nullable()) { + // Target must have a null mask if the source is not valid + auto mask = detail::create_null_mask(target.size(), mask_state::ALL_VALID, stream, mr); + target.set_null_mask(std::move(mask), 0); } + + auto target_view = mutable_column_device_view::create(target, stream); + + auto bitmask_kernel = source_is_valid ? 
marking_bitmask_kernel + : marking_bitmask_kernel; + bitmask_kernel<<>>( + *target_view, scatter_map, num_scatter_rows); + + target.set_null_count(count_unset_bits(target.view().null_mask(), 0, target.size(), stream)); } } @@ -103,6 +105,7 @@ struct column_scalar_scatterer_impl { { CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match"); + // make a copy of data and null mask from source auto result = std::make_unique(target, stream, mr); auto result_view = result->mutable_view(); @@ -117,6 +120,7 @@ struct column_scalar_scatterer_impl { scatter_iter, result_view.begin()); + scatter_scalar_bitmask_inplace(source, scatter_iter, scatter_rows, *result, stream, mr); return result; } }; @@ -136,7 +140,10 @@ struct column_scalar_scatterer_impl { auto const source_view = string_view(scalar_impl->data(), scalar_impl->size()); auto const begin = thrust::make_constant_iterator(source_view); auto const end = begin + scatter_rows; - return strings::detail::scatter(begin, end, scatter_iter, target, stream, mr); + auto result = strings::detail::scatter(begin, end, scatter_iter, target, stream, mr); + + scatter_scalar_bitmask_inplace(source, scatter_iter, scatter_rows, *result, stream, mr); + return result; } }; @@ -149,17 +156,11 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return lists::detail::scatter( - source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr); - } -}; + auto result = + lists::detail::scatter(source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr); -template -struct column_scalar_scatterer_impl { - template - std::unique_ptr operator()(Args&&...) const - { - CUDF_FAIL("scatter scalar to struct_view not implemented"); + scatter_scalar_bitmask_inplace(source, scatter_iter, scatter_rows, *result, stream, mr); + return result; } }; @@ -200,10 +201,13 @@ struct column_scalar_scatterer_impl { // use the keys from the matched column std::unique_ptr keys_column(std::move(dict_target->release().children.back())); // create the output column - return make_dictionary_column(std::move(keys_column), - std::move(indices_column), - std::move(*(contents.null_mask.release())), - null_count); + auto result = make_dictionary_column(std::move(keys_column), + std::move(indices_column), + std::move(*(contents.null_mask.release())), + null_count); + + scatter_scalar_bitmask_inplace(source, scatter_iter, scatter_rows, *result, stream, mr); + return result; } }; @@ -222,6 +226,55 @@ struct column_scalar_scatterer { } }; +template +struct column_scalar_scatterer_impl { + std::unique_ptr operator()(std::reference_wrapper const& source, + MapIterator scatter_iter, + size_type scatter_rows, + column_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + // For each field of `source`, copy construct a scalar from the field + // and dispatch to the corresponding scalar scatterer + + auto typed_s = static_cast(&source.get()); + size_type const n_fields = typed_s->view().num_columns(); + CUDF_EXPECTS(n_fields == target.num_children(), "Mismatched number of fields."); + + auto scatter_functor = column_scalar_scatterer{}; + auto fields_iter_begin = make_counting_transform_iterator(0, [&](auto const& i) { + auto row_slr = get_element(typed_s->view().column(i), 0, stream); + return type_dispatcher(row_slr->type(), + scatter_functor, + *row_slr, + scatter_iter, + scatter_rows, + target.child(i), + stream, + mr); + }); + std::vector> 
fields(fields_iter_begin, fields_iter_begin + n_fields); + + // Compute null mask + rmm::device_buffer null_mask = + target.nullable() ? copy_bitmask(target, stream, mr) + : create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); + column null_mask_stub(data_type{type_id::STRUCT}, + target.size(), + rmm::device_buffer{}, + std::move(null_mask), + target.null_count()); + scatter_scalar_bitmask_inplace(source, scatter_iter, scatter_rows, null_mask_stub, stream, mr); + size_type null_count = null_mask_stub.null_count(); + auto contents = null_mask_stub.release(); + + // Null mask pushdown inside factory method + return make_structs_column( + target.size(), std::move(fields), null_count, std::move(*contents.null_mask)); + } +}; + } // namespace std::unique_ptr
scatter(table_view const& source, @@ -305,8 +358,6 @@ std::unique_ptr<table>
scatter(std::vector<std::reference_wrapper<const scalar>> mr); }); - scatter_scalar_bitmask(source, scatter_iter, scatter_rows, result, stream, mr); - return std::make_unique<table>
(std::move(result)); } diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu index ebeaf0e3b20..0b88545ffa5 100644 --- a/cpp/src/copying/shift.cu +++ b/cpp/src/copying/shift.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -44,13 +45,55 @@ inline bool __device__ out_of_bounds(size_type size, size_type idx) return idx < 0 || idx >= size; } +std::pair create_null_mask(column_device_view const& input, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const size = input.size(); + auto func_validity = + [size, offset, fill = fill_value.validity_data(), input] __device__(size_type idx) { + auto src_idx = idx - offset; + return out_of_bounds(size, src_idx) ? *fill : input.is_valid(src_idx); + }; + return detail::valid_if(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(size), + func_validity, + stream, + mr); +} + struct shift_functor { template - std::enable_if_t(), std::unique_ptr> operator()(Args&&...) + std::enable_if_t() and not std::is_same_v, + std::unique_ptr> + operator()(Args&&...) { CUDF_FAIL("shift does not support non-fixed-width types."); } + template + std::enable_if_t, std::unique_ptr> operator()( + column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto output = cudf::strings::detail::shift( + cudf::strings_column_view(input), offset, fill_value, stream, mr); + + if (input.nullable() || not fill_value.is_valid(stream)) { + auto const d_input = column_device_view::create(input, stream); + auto mask_pair = create_null_mask(*d_input, offset, fill_value, stream, mr); + output->set_null_mask(std::move(std::get<0>(mask_pair))); + output->set_null_count(std::get<1>(mask_pair)); + } + + return output; + } + template std::enable_if_t(), std::unique_ptr> operator()( column_view const& input, @@ -67,29 +110,21 @@ struct shift_functor { detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr); auto device_output = mutable_column_device_view::create(*output); - auto size = input.size(); - auto index_begin = thrust::make_counting_iterator(0); - auto index_end = thrust::make_counting_iterator(size); - - if (input.nullable() || not scalar.is_valid()) { - auto func_validity = [size, - offset, - fill = scalar.validity_data(), - input = *device_input] __device__(size_type idx) { - auto src_idx = idx - offset; - return out_of_bounds(size, src_idx) ? *fill : input.is_valid(src_idx); - }; - - auto mask_pair = detail::valid_if(index_begin, index_end, func_validity, stream, mr); + auto const scalar_is_valid = scalar.is_valid(stream); + if (input.nullable() || not scalar_is_valid) { + auto mask_pair = create_null_mask(*device_input, offset, fill_value, stream, mr); output->set_null_mask(std::move(std::get<0>(mask_pair))); output->set_null_count(std::get<1>(mask_pair)); } - auto data = device_output->data(); + auto const size = input.size(); + auto index_begin = thrust::make_counting_iterator(0); + auto index_end = thrust::make_counting_iterator(size); + auto data = device_output->data(); // avoid assigning elements we know to be invalid. 
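For concreteness, the semantics that the validity functor and element copy above implement can be sketched host-side as follows. This is a minimal sketch with a hypothetical helper `shift_sketch`, not a cuDF API; the real code operates on device memory and bitmasks:

```cpp
// Minimal host-side sketch of the shift semantics above: output element idx
// takes input[idx - offset]; positions whose source index falls out of bounds
// take the fill value (null when the fill scalar is invalid).
#include <optional>
#include <vector>

std::vector<std::optional<int>> shift_sketch(std::vector<std::optional<int>> const& in,
                                             int offset,
                                             std::optional<int> fill)
{
  auto const size = static_cast<int>(in.size());
  std::vector<std::optional<int>> out(in.size());
  for (int idx = 0; idx < size; ++idx) {
    auto const src_idx = idx - offset;
    bool const oob     = src_idx < 0 || src_idx >= size;  // mirrors out_of_bounds()
    out[idx]           = oob ? fill : in[src_idx];        // mirrors func_validity + copy
  }
  return out;
}
// shift_sketch({1, 2, 3, 4, 5}, 2, 0) -> {0, 0, 1, 2, 3}
```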
- if (not scalar.is_valid()) { + if (not scalar_is_valid) { if (offset > 0) { index_begin = thrust::make_counting_iterator(offset); data = data + offset; diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 36c3605951e..41f3e7dcfee 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -19,9 +19,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -127,6 +127,17 @@ struct extract_day_num_of_year { } }; +struct is_leap_year_op { + template + CUDA_DEVICE_CALLABLE bool operator()(Timestamp const ts) const + { + using namespace cuda::std::chrono; + auto const days_since_epoch = floor(ts); + auto const date = year_month_day(days_since_epoch); + return date.year().is_leap(); + } +}; + // Apply the functor for every element/row in the input column to create the output column template struct launch_functor { @@ -357,6 +368,14 @@ std::unique_ptr day_of_year(column_view const& column, return detail::apply_datetime_op( column, stream, mr); } + +std::unique_ptr is_leap_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return apply_datetime_op(column, stream, mr); +} + } // namespace detail std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) @@ -426,5 +445,12 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti return detail::add_calendrical_months( timestamp_column, months_column, rmm::cuda_stream_default, mr); } + +std::unique_ptr is_leap_year(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_leap_year(column, rmm::cuda_stream_default, mr); +} + } // namespace datetime } // namespace cudf diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 1dbb844a606..37118779248 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -123,7 +123,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, } CUDF_EXPECTS(input.keys().type() == replacement.type(), "keys must match scalar type"); - // first add the replacment to the keys so only the indices need to be processed + // first add the replacement to the keys so only the indices need to be processed auto input_matched = dictionary::detail::add_keys( input, make_column_from_scalar(replacement, 1, stream)->view(), stream, mr); auto const input_view = dictionary_column_view(input_matched->view()); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 51ca6f5d962..2baf336bb9e 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -205,7 +205,8 @@ std::pair>, std::vector> match_d auto dict_cols = dictionary::detail::match_dictionaries(dict_views, stream, mr); // replace the updated_columns vector entries for the set of columns at col_idx auto dict_col_idx = 0; - for (auto& v : updated_columns) v[col_idx] = dict_cols[dict_col_idx++]->view(); + for (auto& v : updated_columns) + v[col_idx] = dict_cols[dict_col_idx++]->view(); // move the updated dictionary columns into the main output vector std::move(dict_cols.begin(), dict_cols.end(), std::back_inserter(dictionary_columns)); } diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index ff62a260d5c..87f83c6edd6 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -413,7 +413,9 @@ void sparse_to_dense_results(table_view const& keys, row_bitmask_ptr, 
stream, mr); - for (auto&& agg : agg_v) { agg->finalize(finalizer); } + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } } } diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 5e202b9ef3f..4e60d8d3f7d 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -238,6 +238,21 @@ void aggregate_result_functor::operator()(aggregation const& cache.add_result(col_idx, agg, std::move(result)); }; +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) return; + + auto const mean_agg = make_mean_aggregation(); + operator()(*mean_agg); + auto const mean_result = cache.get_result(col_idx, *mean_agg); + + cache.add_result( + col_idx, + agg, + detail::group_m2(get_grouped_values(), mean_result, helper.group_labels(stream), stream, mr)); +}; + template <> void aggregate_result_functor::operator()(aggregation const& agg) { @@ -474,6 +489,35 @@ void aggregate_result_functor::operator()(aggregation c mr)); }; +/** + * @brief Perform merging for the M2 values that correspond to the same key value. + * + * The partial results input to this aggregation is a structs column with children are columns + * generated by three other groupby aggregations: `COUNT_VALID`, `MEAN`, and `M2` that were + * performed on partitioned datasets. After distributedly computed, the results output from these + * aggregations are (vertically) concatenated before assembling into a structs column given as the + * values column for this aggregation. + * + * For recursive merging of `M2` values, the aggregations values of all input (`COUNT_VALID`, + * `MEAN`, and `M2`) are all merged and stored in the output of this aggregation. As such, the + * output will be a structs column containing children columns of merged `COUNT_VALID`, `MEAN`, and + * `M2` values. + * + * The values of M2 are merged following the parallel algorithm described here: + * https://www.wikiwand.com/en/Algorithms_for_calculating_variance#/Parallel_algorithm + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) { return; } + + cache.add_result( + col_idx, + agg, + detail::group_merge_m2( + get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); +}; + } // namespace detail // Sort-based groupby diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu index bed64c5147a..6ce23ffc35b 100644 --- a/cpp/src/groupby/sort/group_argmax.cu +++ b/cpp/src/groupby/sort/group_argmax.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include -#include +#include namespace cudf { namespace groupby { @@ -39,29 +39,24 @@ std::unique_ptr group_argmax(column_view const& values, num_groups, group_labels, stream, - rmm::mr::get_current_device_resource()); + mr); // The functor returns the index of maximum in the sorted values. // We need the index of maximum in the original unsorted values. // So use indices to gather the sort order used to sort `values`. // Gather map cannot be null so we make a view with the mask removed. 
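The sorted-to-original index translation described in the comments above, sketched as standalone host code with hypothetical data:

```cpp
// Standalone sketch of the translation from sorted positions back to original
// row indices; keys, values, and orderings here are made up for illustration.
#include <cassert>
#include <vector>

int main()
{
  // keys = [B, A, B, A], values = [5, 9, 3, 7]
  std::vector<int> key_sort_order{1, 3, 0, 2};  // stable row order after sorting by key
  // Values in sorted-by-key order are [9, 7, 5, 3]: group A = {9, 7}, group B = {5, 3}.
  // The argmax functor works on the sorted values, so it yields *sorted* positions:
  std::vector<int> sorted_argmax{0, 2};
  // Gathering through key_sort_order recovers positions in the original values:
  std::vector<int> argmax(sorted_argmax.size());
  for (std::size_t g = 0; g < sorted_argmax.size(); ++g)
    argmax[g] = key_sort_order[sorted_argmax[g]];
  assert(argmax[0] == 1 && argmax[1] == 0);  // rows holding 9 (group A) and 5 (group B)
  return 0;
}
```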
// The values in data buffer of indices corresponding to null values was - // initialized to ARGMAX_SENTINEL which is an out of bounds index value (-1) - // and causes the gathered value to be null. - column_view null_removed_indices( - data_type(type_to_id()), - indices->size(), - static_cast(indices->view().template data())); - auto result_table = - cudf::detail::gather(table_view({key_sort_order}), - null_removed_indices, - indices->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - - return std::move(result_table->release()[0]); + // initialized to ARGMAX_SENTINEL. Using gather_if. + // This can't use gather because nulls in gathered column will not store ARGMAX_SENTINEL. + auto indices_view = indices->mutable_view(); + thrust::gather_if(rmm::exec_policy(stream), + indices_view.begin(), // map first + indices_view.end(), // map last + indices_view.begin(), // stencil + key_sort_order.begin(), // input + indices_view.begin(), // result + [] __device__(auto i) { return (i != cudf::detail::ARGMAX_SENTINEL); }); + return indices; } } // namespace detail diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu index ec97a609390..ab91c2c0d29 100644 --- a/cpp/src/groupby/sort/group_argmin.cu +++ b/cpp/src/groupby/sort/group_argmin.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include -#include +#include namespace cudf { namespace groupby { @@ -39,29 +39,24 @@ std::unique_ptr group_argmin(column_view const& values, num_groups, group_labels, stream, - rmm::mr::get_current_device_resource()); + mr); // The functor returns the index of minimum in the sorted values. // We need the index of minimum in the original unsorted values. // So use indices to gather the sort order used to sort `values`. - // Gather map cannot be null so we make a view with the mask removed. // The values in data buffer of indices corresponding to null values was - // initialized to ARGMIN_SENTINEL which is an out of bounds index value (-1) - // and causes the gathered value to be null. - column_view null_removed_indices( - data_type(type_to_id()), - indices->size(), - static_cast(indices->view().template data())); - auto result_table = - cudf::detail::gather(table_view({key_sort_order}), - null_removed_indices, - indices->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); + // initialized to ARGMIN_SENTINEL. Using gather_if. + // This can't use gather because nulls in gathered column will not store ARGMIN_SENTINEL. 
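A minimal, self-contained illustration of the `thrust::gather_if` pattern used here (compile with nvcc; `-1` stands in for the ARGMIN/ARGMAX sentinel):

```cpp
#include <thrust/device_vector.h>
#include <thrust/gather.h>

// Entries equal to the sentinel fail the predicate and are left untouched,
// so they keep the sentinel instead of gathering a garbage value.
struct not_sentinel {
  __host__ __device__ bool operator()(int i) const { return i != -1; }
};

int main()
{
  thrust::device_vector<int> indices(3);
  indices[0] = 2; indices[1] = -1; indices[2] = 0;   // map, stencil, and result
  thrust::device_vector<int> sort_order(3);
  sort_order[0] = 10; sort_order[1] = 20; sort_order[2] = 30;

  // Where the stencil passes: indices[i] = sort_order[old indices[i]];
  // where it fails: indices[i] keeps its old value (the sentinel).
  thrust::gather_if(indices.begin(), indices.end(),
                    indices.begin(),     // stencil
                    sort_order.begin(),  // input
                    indices.begin(),     // result (in place, as in the patch)
                    not_sentinel{});
  // indices is now {30, -1, 10}
  return 0;
}
```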
+ auto indices_view = indices->mutable_view(); + thrust::gather_if(rmm::exec_policy(stream), + indices_view.begin(), // map first + indices_view.end(), // map last + indices_view.begin(), // stencil + key_sort_order.begin(), // input + indices_view.begin(), // result + [] __device__(auto i) { return (i != cudf::detail::ARGMIN_SENTINEL); }); - return std::move(result_table->release()[0]); + return indices; } } // namespace detail diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu index 1e6a681af94..a30d4639af8 100644 --- a/cpp/src/groupby/sort/group_collect.cu +++ b/cpp/src/groupby/sort/group_collect.cu @@ -41,11 +41,11 @@ namespace detail { * @return Pair of null-eliminated grouped values and corresponding offsets */ std::pair, std::unique_ptr> purge_null_entries( - column_view const &values, - column_view const &offsets, + column_view const& values, + column_view const& offsets, size_type num_groups, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { auto values_device_view = column_device_view::create(values, stream); @@ -81,12 +81,12 @@ std::pair, std::unique_ptr> purge_null_entries( std::move(null_purged_values), std::move(null_purged_offsets)); } -std::unique_ptr group_collect(column_view const &values, +std::unique_ptr group_collect(column_view const& values, cudf::device_span group_offsets, size_type num_groups, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { auto [child_column, offsets_column] = [null_handling, num_groups, &values, &group_offsets, stream, mr] { diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu new file mode 100644 index 00000000000..a72f6c6f647 --- /dev/null +++ b/cpp/src/groupby/sort/group_m2.cu @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { +namespace { + +template +struct m2_transform { + column_device_view const d_values; + Iterator const values_iter; + ResultType const* d_means; + size_type const* d_group_labels; + + __device__ ResultType operator()(size_type const idx) const noexcept + { + if (d_values.is_null(idx)) { return 0.0; } + + auto const x = static_cast(values_iter[idx]); + auto const group_idx = d_group_labels[idx]; + auto const mean = d_means[group_idx]; + auto const diff = x - mean; + return diff * diff; + } +}; + +template +void compute_m2_fn(column_device_view const& values, + Iterator values_iter, + cudf::device_span group_labels, + ResultType const* d_means, + ResultType* d_result, + rmm::cuda_stream_view stream) +{ + auto const var_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, + m2_transform{ + values, values_iter, d_means, group_labels.data()}); + + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + var_iter, + thrust::make_discard_iterator(), + d_result); +} + +struct m2_functor { + template + std::enable_if_t::value, std::unique_ptr> operator()( + column_view const& values, + column_view const& group_means, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + using result_type = cudf::detail::target_type_t; + auto result = make_numeric_column(data_type(type_to_id()), + group_means.size(), + mask_state::UNALLOCATED, + stream, + mr); + + auto const values_dv_ptr = column_device_view::create(values, stream); + auto const d_values = *values_dv_ptr; + auto const d_means = group_means.data(); + auto const d_result = result->mutable_view().data(); + + if (!cudf::is_dictionary(values.type())) { + auto const values_iter = d_values.begin(); + compute_m2_fn(d_values, values_iter, group_labels, d_means, d_result, stream); + } else { + auto const values_iter = + cudf::dictionary::detail::make_dictionary_iterator(*values_dv_ptr); + compute_m2_fn(d_values, values_iter, group_labels, d_means, d_result, stream); + } + + // M2 column values should have the same bitmask as means's. + if (group_means.nullable()) { + result->set_null_mask(cudf::detail::copy_bitmask(group_means, stream, mr), + group_means.null_count()); + } + + return result; + } + + template + std::enable_if_t::value, std::unique_ptr> operator()(Args&&...) + { + CUDF_FAIL("Only numeric types are supported in M2 groupby aggregation"); + } +}; + +} // namespace + +std::unique_ptr group_m2(column_view const& values, + column_view const& group_means, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto values_type = cudf::is_dictionary(values.type()) + ? dictionary_column_view(values).keys().type() + : values.type(); + + return type_dispatcher(values_type, m2_functor{}, values, group_means, group_labels, stream, mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu new file mode 100644 index 00000000000..4e2a5b68abc --- /dev/null +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { +namespace { +/** + * @brief Struct to store partial results for merging. + */ +template +struct partial_result { + size_type count; + result_type mean; + result_type M2; +}; + +/** + * @brief Functor to accumulate (merge) all partial results corresponding to the same key into a + * final result storing in a member variable. It performs merging for the partial results of + * `COUNT_VALID`, `MEAN`, and `M2` at the same time. + */ +template +struct accumulate_fn { + partial_result merge_vals; + + void __device__ operator()(partial_result const& partial_vals) noexcept + { + if (partial_vals.count == 0) { return; } + + auto const n_ab = merge_vals.count + partial_vals.count; + auto const delta = partial_vals.mean - merge_vals.mean; + merge_vals.M2 += partial_vals.M2 + (delta * delta) * + static_cast(merge_vals.count) * + static_cast(partial_vals.count) / n_ab; + merge_vals.mean = + (merge_vals.mean * merge_vals.count + partial_vals.mean * partial_vals.count) / n_ab; + merge_vals.count = n_ab; + } +}; + +/** + * @brief Functor to merge partial results of `COUNT_VALID`, `MEAN`, and `M2` aggregations + * for a given group (key) index. + */ +template +struct merge_fn { + size_type const* const d_offsets; + size_type const* const d_counts; + result_type const* const d_means; + result_type const* const d_M2s; + + auto __device__ operator()(size_type const group_idx) noexcept + { + auto const start_idx = d_offsets[group_idx], end_idx = d_offsets[group_idx + 1]; + + // This case should never happen, because all groups are non-empty as the results of + // aggregation. Here we just to make sure we cover this case. + if (start_idx == end_idx) { + return thrust::make_tuple(size_type{0}, result_type{0}, result_type{0}, int8_t{0}); + } + + // If `(n = d_counts[idx]) > 0` then `d_means[idx] != null` and `d_M2s[idx] != null`. + // Otherwise (`n == 0`), these value (mean and M2) will always be nulls. + // In such cases, reading `mean` and `M2` from memory will return garbage values. + // By setting these values to zero when `n == 0`, we can safely merge the all-zero tuple without + // affecting the final result. + auto get_partial_result = [&] __device__(size_type idx) { + { + auto const n = d_counts[idx]; + return n > 0 ? partial_result{n, d_means[idx], d_M2s[idx]} + : partial_result{size_type{0}, result_type{0}, result_type{0}}; + }; + }; + + // Firstly, store tuple(count, mean, M2) of the first partial result in an accumulator. + auto accumulator = accumulate_fn{get_partial_result(start_idx)}; + + // Then, accumulate (merge) the remaining partial results into that accumulator. + for (auto idx = start_idx + 1; idx < end_idx; ++idx) { + accumulator(get_partial_result(idx)); + } + + // Get the final result after merging. 
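As a concrete check of the update implemented by `accumulate_fn`, a host-side sketch with hypothetical numbers; the merged M2 of two partitions equals the M2 of their union:

```cpp
// Host-side sketch of the pairwise merge implemented by accumulate_fn above.
#include <cassert>

struct partial { int count; double mean; double M2; };

partial merge(partial a, partial b)
{
  if (b.count == 0) return a;
  if (a.count == 0) return b;
  auto const n_ab  = a.count + b.count;
  auto const delta = b.mean - a.mean;
  return {n_ab,
          (a.mean * a.count + b.mean * b.count) / n_ab,
          a.M2 + b.M2 + delta * delta * a.count * b.count / n_ab};
}

int main()
{
  // Partition A = {1, 2, 3}: count 3, mean 2.0, M2 2.0
  // Partition B = {4, 5}:    count 2, mean 4.5, M2 0.5
  auto const m = merge({3, 2.0, 2.0}, {2, 4.5, 0.5});
  // Union {1, 2, 3, 4, 5}:   mean 3.0, M2 = 4 + 1 + 0 + 1 + 4 = 10
  assert(m.count == 5 && m.mean == 3.0 && m.M2 == 10.0);
  return 0;
}
```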
+ auto const& merge_vals = accumulator.merge_vals; + + // If there are all nulls in the partial results (i.e., sum of all valid counts is + // zero), then the output is a null. + auto const is_valid = int8_t{merge_vals.count > 0}; + + return thrust::make_tuple(merge_vals.count, merge_vals.mean, merge_vals.M2, is_valid); + } +}; + +} // namespace + +std::unique_ptr group_merge_m2(column_view const& values, + cudf::device_span group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(values.type().id() == type_id::STRUCT, + "Input to `group_merge_m2` must be a structs column."); + CUDF_EXPECTS(values.num_children() == 3, + "Input to `group_merge_m2` must be a structs column having 3 children columns."); + + using result_type = id_to_type; + static_assert( + std::is_same_v, result_type>); + CUDF_EXPECTS(values.child(0).type().id() == type_id::INT32 && + values.child(1).type().id() == type_to_id() && + values.child(2).type().id() == type_to_id(), + "Input to `group_merge_m2` must be a structs column having children columns " + "containing tuples of (M2_value, mean, valid_count)."); + + auto result_counts = make_numeric_column( + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + auto result_means = make_numeric_column( + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + auto result_M2s = make_numeric_column( + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + auto validities = rmm::device_uvector(num_groups, stream); + + // Perform merging for all the aggregations. Their output (and their validity data) are written + // out concurrently through an output zip iterator. + using iterator_tuple = thrust::tuple; + using output_iterator = thrust::zip_iterator; + auto const out_iter = + output_iterator{thrust::make_tuple(result_counts->mutable_view().template data(), + result_means->mutable_view().template data(), + result_M2s->mutable_view().template data(), + validities.begin())}; + + auto const count_valid = values.child(0); + auto const mean_values = values.child(1); + auto const M2_values = values.child(2); + auto const iter = thrust::make_counting_iterator(0); + + auto const fn = merge_fn{group_offsets.begin(), + count_valid.template begin(), + mean_values.template begin(), + M2_values.template begin()}; + thrust::transform(rmm::exec_policy(stream), iter, iter + num_groups, out_iter, fn); + + // Generate bitmask for the output. + // Only mean and M2 values can be nullable. Count column must be non-nullable. + auto [null_mask, null_count] = cudf::detail::valid_if( + validities.begin(), validities.end(), thrust::identity{}, stream, mr); + if (null_count > 0) { + result_means->set_null_mask(null_mask, null_count); // copy null_mask + result_M2s->set_null_mask(std::move(null_mask), null_count); // take over null_mask + } + + // Output is a structs column containing the merged values of `COUNT_VALID`, `MEAN`, and `M2`. 
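The merged `(count, mean, M2)` triple is what a distributed variance or standard deviation ultimately needs; recovering them is a single division. A sketch using the standard definitions (not a cuDF API):

```cpp
// Variance from a merged (count, mean, M2) triple; ddof = 1 gives sample variance.
#include <cmath>
#include <limits>

double variance_from_m2(int count, double M2, int ddof = 1)
{
  return count > ddof ? M2 / (count - ddof)
                      : std::numeric_limits<double>::quiet_NaN();
}
// variance_from_m2(5, 10.0) == 2.5; standard deviation = std::sqrt(2.5)
```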
+ std::vector> out_columns; + out_columns.emplace_back(std::move(result_counts)); + out_columns.emplace_back(std::move(result_means)); + out_columns.emplace_back(std::move(result_M2s)); + auto result = cudf::make_structs_column( + num_groups, std::move(out_columns), 0, rmm::device_buffer{0, stream, mr}, stream, mr); + + return result; +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu index c3d874f3b33..e7dc57f6c93 100644 --- a/cpp/src/groupby/sort/group_nth_element.cu +++ b/cpp/src/groupby/sort/group_nth_element.cu @@ -33,15 +33,15 @@ namespace cudf { namespace groupby { namespace detail { -std::unique_ptr group_nth_element(column_view const &values, - column_view const &group_sizes, +std::unique_ptr group_nth_element(column_view const& values, + column_view const& group_sizes, cudf::device_span group_labels, cudf::device_span group_offsets, size_type num_groups, size_type n, null_policy null_handling, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), "Size of values column should be same as that of group labels"); diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 3390af29330..2770162da2d 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -217,6 +217,30 @@ std::unique_ptr group_count_all(cudf::device_span group rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Internal API to calculate sum of squares of differences from means. + * + * If there are only nulls in the group, the output value of that group will be null. + * + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * group_means = [2.333333, -1.5, 4.0, ] + * group_m2(...) = [4.666666, 1.0, 0.0, ] + * @endcode + * + * @param values Grouped values to compute M2 values + * @param group_means Pre-computed groupwise MEAN + * @param group_labels ID of group corresponding value in @p values belongs to + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr group_m2(column_view const& values, + column_view const& group_means, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Internal API to calculate groupwise variance * @@ -392,6 +416,32 @@ std::unique_ptr group_merge_lists(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Internal API to merge grouped M2 values corresponding to the same key. + * + * The values of M2 are merged following the parallel algorithm described here: + * `https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm` + * + * Merging M2 values require accessing to partial M2 values, means, and valid counts. Thus, the + * input to this aggregation need to be a structs column containing tuples of 3 values + * `(valid_count, mean, M2)`. + * + * This aggregation not only merges the partial results of `M2` but also merged all the partial + * results of input aggregations (`COUNT_VALID`, `MEAN`, and `M2`). 
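+ * For illustration (hypothetical numbers; input tuples are `(valid_count, mean, M2)`,
+ * formatted like the `group_m2` example above):
+ * @code{.pseudo}
+ * values         = [{3, 2.0, 2.0}, {2, 4.5, 0.5}, {0, null, null}, {1, 1.0, 0.0}]
+ * group_offsets  = [0, 2, 4]
+ * group_merge_m2(...) = [{5, 3.0, 10.0}, {1, 1.0, 0.0}]
+ * @endcode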
As such, the output will be a + * structs column containing children columns of merged `COUNT_VALID`, `MEAN`, and `M2` values. + * + * @param values Grouped values (tuples of values `(valid_count, mean, M2)`) to merge. + * @param group_offsets Offsets of groups' starting points within @p values. + * @param num_groups Number of groups. + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr group_merge_m2(column_view const& values, + cudf::device_span group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** @endinternal * */ diff --git a/cpp/src/hash/concurrent_unordered_multimap.cuh b/cpp/src/hash/concurrent_unordered_multimap.cuh index 071214e80b0..2b92c9142ca 100644 --- a/cpp/src/hash/concurrent_unordered_multimap.cuh +++ b/cpp/src/hash/concurrent_unordered_multimap.cuh @@ -239,7 +239,7 @@ class concurrent_unordered_multimap { * @param[in] precomputed_hash A flag indicating whether or not a precomputed * hash value is passed in * @param[in] precomputed_hash_value A precomputed hash value to use for - * determing the write location of the key into the hash map instead of + * determining the write location of the key into the hash map instead of * computing the the hash value directly from the key * @tparam hash_value_type The datatype of the hash value * @@ -284,7 +284,7 @@ class concurrent_unordered_multimap { * @param[in] precomputed_hash A flag indicating whether or not a precomputed * hash value is passed in * @param[in] precomputed_hash_value A precomputed hash value to use for - * determing the write location of the key into the hash map instead of + * determining the write location of the key into the hash map instead of * computing the the hash value directly from the key * @param[in] keys_are_equal An optional functor for comparing if two keys are * equal @@ -375,7 +375,7 @@ class concurrent_unordered_multimap { * @param[in] precomputed_hash A flag indicating whether or not a precomputed * hash value is passed in * @param[in] precomputed_hash_value A precomputed hash value to use for - * determing the write location of the key into the hash map instead of + * determining the write location of the key into the hash map instead of * computing the the hash value directly from the key * @param[in] keys_are_equal An optional functor for comparing if two keys are * equal @@ -423,7 +423,7 @@ class concurrent_unordered_multimap { * @param[in] precomputed_hash A flag indicating whether or not a precomputed * hash value is passed in * @param[in] precomputed_hash_value A precomputed hash value to use for - * determing the write location of the key into the hash map instead of + * determining the write location of the key into the hash map instead of * computing the the hash value directly from the key * @param[in] keys_are_equal An optional functor for comparing if two keys are * equal diff --git a/cpp/src/hash/managed.cuh b/cpp/src/hash/managed.cuh index c8d3178b1d8..c6cc60a6917 100644 --- a/cpp/src/hash/managed.cuh +++ b/cpp/src/hash/managed.cuh @@ -20,15 +20,15 @@ #include struct managed { - static void *operator new(size_t n) + static void* operator new(size_t n) { - void *ptr = 0; + void* ptr = 0; cudaError_t result = cudaMallocManaged(&ptr, n); if (cudaSuccess != result || 0 == ptr) throw std::bad_alloc(); return ptr; } - static void operator delete(void *ptr) noexcept + static void 
operator delete(void* ptr) noexcept { auto const free_result = cudaFree(ptr); assert(free_result == cudaSuccess); diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh index 645d9bc5185..d28bf6f6fe5 100644 --- a/cpp/src/hash/unordered_multiset.cuh +++ b/cpp/src/hash/unordered_multiset.cuh @@ -38,8 +38,8 @@ template create(column_view const &col, rmm::cuda_stream_view stream) + static unordered_multiset create(column_view const& col, rmm::cuda_stream_view stream) { auto d_column = column_device_view::create(col, stream); auto d_col = *d_column; @@ -86,9 +86,9 @@ class unordered_multiset { auto hash_data = rmm::device_uvector(d_col.size(), stream); Hasher hasher; - size_type *d_hash_bins_start = hash_bins_start.data(); - size_type *d_hash_bins_end = hash_bins_end.data(); - Element *d_hash_data = hash_data.data(); + size_type* d_hash_bins_start = hash_bins_start.data(); + size_type* d_hash_bins_end = hash_bins_end.data(); + Element* d_hash_data = hash_data.data(); thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -134,8 +134,8 @@ class unordered_multiset { private: unordered_multiset(size_type size, - rmm::device_uvector &&hash_bins, - rmm::device_uvector &&hash_data) + rmm::device_uvector&& hash_bins, + rmm::device_uvector&& hash_data) : size{size}, hash_bins{std::move(hash_bins)}, hash_data{std::move(hash_data)} { } diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 28fc2ae9d4f..917a5b1ac9c 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -94,7 +94,7 @@ namespace { */ struct dispatch_to_cudf_column { /** - * @brief Returns mask from an array withut any offsets. + * @brief Returns mask from an array without any offsets. */ std::unique_ptr get_mask_buffer(arrow::Array const& array, rmm::cuda_stream_view stream, diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index f8fcf03a77e..3cd515e9981 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -96,7 +96,7 @@ std::shared_ptr fetch_mask_buffer(column_view input_view, */ struct dispatch_to_arrow { /** - * @brief Creates vector Arrays from given cudf column childrens + * @brief Creates vector Arrays from given cudf column children */ std::vector> fetch_child_array( column_view input_view, diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 8f0599cdd5b..7227d7e4e0b 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -49,7 +49,7 @@ std::string container::get_encoded() return (len & 1) || (m_cur >= m_end) ? 
0 : std::min(len >> 1, static_cast(m_end - m_cur)); }(); - auto const s = reinterpret_cast(m_cur); + auto const s = reinterpret_cast(m_cur); m_cur += len; return std::string(s, len); } @@ -63,7 +63,7 @@ std::string container::get_encoded() * * @returns true if successful, false if error */ -bool container::parse(file_metadata *md, size_t max_num_rows, size_t first_row) +bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) { constexpr uint32_t avro_magic = (('O' << 0) | ('b' << 8) | ('j' << 16) | (0x01 << 24)); uint32_t sig4, max_block_size; @@ -195,7 +195,7 @@ enum { * * @returns true if successful, false if error */ -bool schema_parser::parse(std::vector &schema, const std::string &json_str) +bool schema_parser::parse(std::vector& schema, const std::string& json_str) { // Empty schema if (json_str == "[]") return true; @@ -361,8 +361,8 @@ bool schema_parser::parse(std::vector &schema, const std::string & std::string schema_parser::get_str() { std::string s; - const char *start = m_cur; - const char *cur = start; + const char* start = m_cur; + const char* cur = start; while (cur < m_end && *cur++ != '"') ; int32_t len = static_cast(cur - start - 1); diff --git a/cpp/src/io/avro/avro.h b/cpp/src/io/avro/avro.h index 13f5e4ecb3c..fe8f5634815 100644 --- a/cpp/src/io/avro/avro.h +++ b/cpp/src/io/avro/avro.h @@ -82,16 +82,16 @@ class schema_parser { public: schema_parser() {} - bool parse(std::vector &schema, const std::string &str); + bool parse(std::vector& schema, const std::string& str); protected: bool more_data() const { return (m_cur < m_end); } std::string get_str(); protected: - const char *m_base; - const char *m_cur; - const char *m_end; + const char* m_base; + const char* m_cur; + const char* m_end; }; /** @@ -99,7 +99,7 @@ class schema_parser { */ class container { public: - container(uint8_t const *base, size_t len) noexcept : m_base{base}, m_cur{base}, m_end{base + len} + container(uint8_t const* base, size_t len) noexcept : m_base{base}, m_cur{base}, m_end{base + len} { } @@ -119,12 +119,12 @@ class container { T get_encoded(); public: - bool parse(file_metadata *md, size_t max_num_rows = 0x7fffffff, size_t first_row = 0); + bool parse(file_metadata* md, size_t max_num_rows = 0x7fffffff, size_t first_row = 0); protected: - const uint8_t *m_base; - const uint8_t *m_cur; - const uint8_t *m_end; + const uint8_t* m_base; + const uint8_t* m_cur; + const uint8_t* m_end; }; } // namespace avro diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index ebd7f51a08a..6fabcf00b8f 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -32,7 +32,7 @@ constexpr int max_shared_schema_len = 1000; * Avro varint encoding - see * https://avro.apache.org/docs/1.2.0/spec.html#binary_encoding */ -static inline int64_t __device__ avro_decode_zigzag_varint(const uint8_t *&cur, const uint8_t *end) +static inline int64_t __device__ avro_decode_zigzag_varint(const uint8_t*& cur, const uint8_t* end) { uint64_t u = 0; if (cur < end) { @@ -65,13 +65,13 @@ static inline int64_t __device__ avro_decode_zigzag_varint(const uint8_t *&cur, * * @return data pointer at the end of the row (start of next row) */ -static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, - schemadesc_s *schema_g, +static const uint8_t* __device__ avro_decode_row(const schemadesc_s* schema, + schemadesc_s* schema_g, uint32_t schema_len, size_t row, size_t max_rows, - const uint8_t *cur, - const uint8_t *end, + const uint8_t* cur, + const uint8_t* end, 
device_span global_dictionary) { uint32_t array_start = 0, array_repeat_count = 0; @@ -96,11 +96,11 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, skip = skip_after; } - void *dataptr = schema[i].dataptr; + void* dataptr = schema[i].dataptr; switch (kind) { case type_null: if (dataptr != nullptr && row < max_rows) { - atomicAnd(static_cast(dataptr) + (row >> 5), ~(1 << (row & 0x1f))); + atomicAnd(static_cast(dataptr) + (row >> 5), ~(1 << (row & 0x1f))); atomicAdd(&schema_g[i].count, 1); } break; @@ -113,13 +113,13 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, int64_t v = avro_decode_zigzag_varint(cur, end); if (kind == type_int) { if (dataptr != nullptr && row < max_rows) { - static_cast(dataptr)[row] = static_cast(v); + static_cast(dataptr)[row] = static_cast(v); } } else if (kind == type_long) { - if (dataptr != nullptr && row < max_rows) { static_cast(dataptr)[row] = v; } + if (dataptr != nullptr && row < max_rows) { static_cast(dataptr)[row] = v; } } else { // string or enum size_t count = 0; - const char *ptr = 0; + const char* ptr = 0; if (kind == type_enum) { // dictionary size_t idx = schema[i].count + v; if (idx < global_dictionary.size()) { @@ -127,13 +127,13 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, count = global_dictionary[idx].second; } } else if (v >= 0 && cur + v <= end) { // string - ptr = reinterpret_cast(cur); + ptr = reinterpret_cast(cur); count = (size_t)v; cur += count; } if (dataptr != nullptr && row < max_rows) { - static_cast(dataptr)[row].first = ptr; - static_cast(dataptr)[row].second = count; + static_cast(dataptr)[row].first = ptr; + static_cast(dataptr)[row].second = count; } } } break; @@ -147,7 +147,7 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, } else { v = 0; } - static_cast(dataptr)[row] = v; + static_cast(dataptr)[row] = v; } else { cur += 4; } @@ -162,7 +162,7 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, } else { v = 0; } - static_cast(dataptr)[row] = v; + static_cast(dataptr)[row] = v; } else { cur += 8; } @@ -170,8 +170,8 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, case type_boolean: if (dataptr != nullptr && row < max_rows) { - uint8_t v = (cur < end) ? *cur : 0; - static_cast(dataptr)[row] = (v) ? 1 : 0; + uint8_t v = (cur < end) ? *cur : 0; + static_cast(dataptr)[row] = (v) ? 
1 : 0; } cur++; break; @@ -228,10 +228,10 @@ static const uint8_t *__device__ avro_decode_row(const schemadesc_s *schema, */ // blockDim {32,num_warps,1} extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) - gpuDecodeAvroColumnData(block_desc_s *blocks, - schemadesc_s *schema_g, + gpuDecodeAvroColumnData(block_desc_s* blocks, + schemadesc_s* schema_g, device_span global_dictionary, - const uint8_t *avro_data, + const uint8_t* avro_data, uint32_t num_blocks, uint32_t schema_len, uint32_t min_row_size, @@ -241,8 +241,8 @@ extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) __shared__ __align__(8) schemadesc_s g_shared_schema[max_shared_schema_len]; __shared__ __align__(8) block_desc_s blk_g[num_warps]; - schemadesc_s *schema; - block_desc_s *const blk = &blk_g[threadIdx.y]; + schemadesc_s* schema; + block_desc_s* const blk = &blk_g[threadIdx.y]; uint32_t block_id = blockIdx.x * num_warps + threadIdx.y; size_t cur_row; uint32_t rows_remaining; @@ -267,7 +267,7 @@ extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) end = cur + blk->size; while (rows_remaining > 0 && cur < end) { uint32_t nrows; - const uint8_t *start = cur; + const uint8_t* start = cur; if (cur_row > first_row + max_rows) break; if (cur + min_row_size * rows_remaining == end) { @@ -311,10 +311,10 @@ extern "C" __global__ void __launch_bounds__(num_warps * 32, 2) * @param[in] min_row_size Minimum size in bytes of a row * @param[in] stream CUDA stream to use, default 0 */ -void DecodeAvroColumnData(block_desc_s *blocks, - schemadesc_s *schema, +void DecodeAvroColumnData(block_desc_s* blocks, + schemadesc_s* schema, device_span global_dictionary, - const uint8_t *avro_data, + const uint8_t* avro_data, uint32_t num_blocks, uint32_t schema_len, size_t max_rows, diff --git a/cpp/src/io/avro/avro_gpu.h b/cpp/src/io/avro/avro_gpu.h index a82d3604d02..a895d1bea02 100644 --- a/cpp/src/io/avro/avro_gpu.h +++ b/cpp/src/io/avro/avro_gpu.h @@ -33,7 +33,7 @@ struct schemadesc_s { uint32_t kind; // avro type kind uint32_t count; // for records/unions: number of following child columns, for nulls: global // null_count, for enums: dictionary ofs - void *dataptr; // Ptr to column data, or null if column not selected + void* dataptr; // Ptr to column data, or null if column not selected }; /** @@ -50,10 +50,10 @@ struct schemadesc_s { * @param[in] min_row_size Minimum size in bytes of a row * @param[in] stream CUDA stream to use, default 0 */ -void DecodeAvroColumnData(block_desc_s *blocks, - schemadesc_s *schema, +void DecodeAvroColumnData(block_desc_s* blocks, + schemadesc_s* schema, cudf::device_span global_dictionary, - const uint8_t *avro_data, + const uint8_t* avro_data, uint32_t num_blocks, uint32_t schema_len, size_t max_rows = ~0, diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 21253ce8cdf..f6ffdd99d35 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -46,7 +46,7 @@ namespace { /** * @brief Function that translates Avro data kind to cuDF type enum */ -type_id to_type_id(const avro::schema_entry *col) +type_id to_type_id(const avro::schema_entry* col) { switch (col->kind) { case avro::type_boolean: return type_id::BOOL8; @@ -69,7 +69,7 @@ type_id to_type_id(const avro::schema_entry *col) */ class metadata : public file_metadata { public: - explicit metadata(datasource *const src) : source(src) {} + explicit metadata(datasource* const src) : source(src) {} /** * @brief Initializes the parser and filters down to a subset of rows @@ -77,7 
+77,7 @@ class metadata : public file_metadata { * @param[in,out] row_start Starting row of the selection * @param[in,out] row_count Total number of rows selected */ - void init_and_select_rows(int &row_start, int &row_count) + void init_and_select_rows(int& row_start, int& row_count) { const auto buffer = source->host_read(0, source->size()); avro::container pod(buffer->data(), buffer->size()); @@ -100,7 +100,7 @@ class metadata : public file_metadata { const auto num_avro_columns = static_cast(columns.size()); if (!use_names.empty()) { int index = 0; - for (const auto &use_name : use_names) { + for (const auto& use_name : use_names) { for (int i = 0; i < num_avro_columns; ++i, ++index) { if (index >= num_avro_columns) { index = 0; } if (columns[index].name == use_name && @@ -135,10 +135,10 @@ class metadata : public file_metadata { } private: - datasource *const source; + datasource* const source; }; -rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_block_data, +rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer& comp_block_data, rmm::cuda_stream_view stream) { size_t uncompressed_data_size = 0; @@ -149,12 +149,14 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ // Guess an initial maximum uncompressed block size uint32_t initial_blk_len = (_metadata->max_block_size * 2 + 0xfff) & ~0xfff; uncompressed_data_size = initial_blk_len * _metadata->block_list.size(); - for (size_t i = 0; i < inflate_in.size(); ++i) { inflate_in[i].dstSize = initial_blk_len; } + for (size_t i = 0; i < inflate_in.size(); ++i) { + inflate_in[i].dstSize = initial_blk_len; + } } else if (_metadata->codec == "snappy") { // Extract the uncompressed length from the snappy stream for (size_t i = 0; i < _metadata->block_list.size(); i++) { const auto buffer = _source->host_read(_metadata->block_list[i].offset, 4); - const uint8_t *blk = buffer->data(); + const uint8_t* blk = buffer->data(); uint32_t blk_len = blk[0]; if (blk_len > 0x7f) { blk_len = (blk_len & 0x7f) | (blk[1] << 7); @@ -176,9 +178,9 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ for (size_t i = 0, dst_pos = 0; i < _metadata->block_list.size(); i++) { const auto src_pos = _metadata->block_list[i].offset - base_offset; - inflate_in[i].srcDevice = static_cast(comp_block_data.data()) + src_pos; + inflate_in[i].srcDevice = static_cast(comp_block_data.data()) + src_pos; inflate_in[i].srcSize = _metadata->block_list[i].size; - inflate_in[i].dstDevice = static_cast(decomp_block_data.data()) + dst_pos; + inflate_in[i].dstDevice = static_cast(decomp_block_data.data()) + dst_pos; // Update blocks offsets & sizes to refer to uncompressed data _metadata->block_list[i].offset = dst_pos; @@ -215,7 +217,7 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ if (actual_uncompressed_size > uncompressed_data_size) { decomp_block_data.resize(actual_uncompressed_size, stream); for (size_t i = 0, dst_pos = 0; i < _metadata->block_list.size(); i++) { - auto dst_base = static_cast(decomp_block_data.data()); + auto dst_base = static_cast(decomp_block_data.data()); inflate_in[i].dstDevice = dst_base + dst_pos; _metadata->block_list[i].offset = dst_pos; @@ -233,12 +235,12 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ return decomp_block_data; } -void reader::impl::decode_data(const rmm::device_buffer &block_data, - const std::vector> &dict, +void reader::impl::decode_data(const 
rmm::device_buffer& block_data, + const std::vector>& dict, device_span global_dictionary, size_t num_rows, std::vector> selection, - std::vector &out_buffers, + std::vector& out_buffers, rmm::cuda_stream_view stream) { // Build gpu schema @@ -277,7 +279,7 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, _metadata->schema[i + 2].kind == type_null)), "Union with non-null type not currently supported"); } - std::vector valid_alias(out_buffers.size(), nullptr); + std::vector valid_alias(out_buffers.size(), nullptr); for (size_t i = 0; i < out_buffers.size(); i++) { const auto col_idx = selection[i].first; int schema_data_idx = _metadata->columns[col_idx].schema_data_idx; @@ -302,10 +304,10 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, _metadata->block_list.data(), _metadata->block_list.size() * sizeof(block_desc_s), stream); schema_desc.host_to_device(stream); - gpu::DecodeAvroColumnData(static_cast(block_list.data()), + gpu::DecodeAvroColumnData(static_cast(block_list.data()), schema_desc.device_ptr(), global_dictionary, - static_cast(block_data.data()), + static_cast(block_data.data()), static_cast(_metadata->block_list.size()), static_cast(schema_desc.size()), _metadata->num_rows, @@ -333,15 +335,15 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, } reader::impl::impl(std::unique_ptr source, - avro_reader_options const &options, - rmm::mr::device_memory_resource *mr) + avro_reader_options const& options, + rmm::mr::device_memory_resource* mr) : _mr(mr), _source(std::move(source)), _columns(options.get_columns()) { // Open the source Avro dataset metadata _metadata = std::make_unique(_source.get()); } -table_with_metadata reader::impl::read(avro_reader_options const &options, +table_with_metadata reader::impl::read(avro_reader_options const& options, rmm::cuda_stream_view stream) { auto skip_rows = options.get_skip_rows(); @@ -358,8 +360,8 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, if (selected_columns.size() != 0) { // Get a list of column data types std::vector column_types; - for (const auto &col : selected_columns) { - auto &col_schema = _metadata->schema[_metadata->columns[col.first].schema_data_idx]; + for (const auto& col : selected_columns) { + auto& col_schema = _metadata->schema[_metadata->columns[col.first].schema_data_idx]; auto col_type = to_type_id(&col_schema); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); @@ -372,7 +374,7 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, block_data = rmm::device_buffer{_metadata->total_data_size, stream}; auto read_bytes = _source->device_read(_metadata->block_list[0].offset, _metadata->total_data_size, - static_cast(block_data.data()), + static_cast(block_data.data()), stream); block_data.resize(read_bytes, stream); } else { @@ -396,11 +398,13 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, std::vector> dict(column_types.size()); for (size_t i = 0; i < column_types.size(); ++i) { auto col_idx = selected_columns[i].first; - auto &col_schema = _metadata->schema[_metadata->columns[col_idx].schema_data_idx]; + auto& col_schema = _metadata->schema[_metadata->columns[col_idx].schema_data_idx]; dict[i].first = static_cast(total_dictionary_entries); dict[i].second = static_cast(col_schema.symbols.size()); total_dictionary_entries += dict[i].second; - for (const auto &sym : col_schema.symbols) { dictionary_data_size += sym.length(); } + for (const auto& sym : 
col_schema.symbols) { + dictionary_data_size += sym.length(); + } } rmm::device_uvector d_global_dict(total_dictionary_entries, stream); @@ -411,10 +415,10 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, size_t dict_pos = 0; for (size_t i = 0; i < column_types.size(); ++i) { auto const col_idx = selected_columns[i].first; - auto const &col_schema = _metadata->schema[_metadata->columns[col_idx].schema_data_idx]; + auto const& col_schema = _metadata->schema[_metadata->columns[col_idx].schema_data_idx]; auto const col_dict_entries = &(h_global_dict[dict[i].first]); for (size_t j = 0; j < dict[i].second; j++) { - auto const &symbols = col_schema.symbols[j]; + auto const& symbols = col_schema.symbols[j]; auto const data_dst = h_global_dict_data.data() + dict_pos; auto const len = symbols.length(); @@ -471,20 +475,20 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, } // Forward to implementation -reader::reader(std::vector const &filepaths, - avro_reader_options const &options, +reader::reader(std::vector const& filepaths, + avro_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported."); _impl = std::make_unique(datasource::create(filepaths[0]), options, mr); } // Forward to implementation -reader::reader(std::vector> &&sources, - avro_reader_options const &options, +reader::reader(std::vector>&& sources, + avro_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); _impl = std::make_unique(std::move(sources[0]), options, mr); @@ -494,7 +498,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(avro_reader_options const &options, rmm::cuda_stream_view stream) +table_with_metadata reader::read(avro_reader_options const& options, rmm::cuda_stream_view stream) { return _impl->read(options, stream); } diff --git a/cpp/src/io/avro/reader_impl.hpp b/cpp/src/io/avro/reader_impl.hpp index 8e09da03563..9af32ed88a0 100644 --- a/cpp/src/io/avro/reader_impl.hpp +++ b/cpp/src/io/avro/reader_impl.hpp @@ -61,8 +61,8 @@ class reader::impl { * @param mr Device memory resource to use for device memory allocation */ explicit impl(std::unique_ptr source, - avro_reader_options const &options, - rmm::mr::device_memory_resource *mr); + avro_reader_options const& options, + rmm::mr::device_memory_resource* mr); /** * @brief Read an entire set or a subset of data and returns a set of columns @@ -72,7 +72,7 @@ class reader::impl { * * @return The set of columns along with metadata */ - table_with_metadata read(avro_reader_options const &options, rmm::cuda_stream_view stream); + table_with_metadata read(avro_reader_options const& options, rmm::cuda_stream_view stream); private: /** @@ -83,7 +83,7 @@ class reader::impl { * * @return Device buffer to decompressed block data */ - rmm::device_buffer decompress_data(const rmm::device_buffer &comp_block_data, + rmm::device_buffer decompress_data(const rmm::device_buffer& comp_block_data, rmm::cuda_stream_view stream); /** @@ -95,16 +95,16 @@ class reader::impl { * @param out_buffers Output columns' device buffers * @param stream CUDA stream used for device memory operations and kernel launches. 
    */
-  void decode_data(const rmm::device_buffer &block_data,
-                   const std::vector<std::pair<uint32_t, uint32_t>> &dict,
+  void decode_data(const rmm::device_buffer& block_data,
+                   const std::vector<std::pair<uint32_t, uint32_t>>& dict,
                    cudf::device_span<string_index_pair const> global_dictionary,
                    size_t num_rows,
                    std::vector<std::pair<int, std::string>> columns,
-                   std::vector<column_buffer> &out_buffers,
+                   std::vector<column_buffer>& out_buffers,
                    rmm::cuda_stream_view stream);
 
  private:
-  rmm::mr::device_memory_resource *_mr = nullptr;
+  rmm::mr::device_memory_resource* _mr = nullptr;
   std::unique_ptr<datasource> _source;
   std::unique_ptr<metadata> _metadata;
diff --git a/cpp/src/io/comp/brotli_dict.cpp b/cpp/src/io/comp/brotli_dict.cpp
index b493ebd6bfb..3e6939bb816 100644
--- a/cpp/src/io/comp/brotli_dict.cpp
+++ b/cpp/src/io/comp/brotli_dict.cpp
@@ -6528,7 +6528,7 @@ static const brotli_dictionary_s g_dictionary = {
   136, 224, 164, 184, 224, 164, 149, 224, 165, 141, 224, 164, 176, 224, 164, 191,
   224, 164, 175, 224, 164, 164, 224, 164, 190}};
 
-const brotli_dictionary_s *get_brotli_dictionary(void) { return &g_dictionary; }
+const brotli_dictionary_s* get_brotli_dictionary(void) { return &g_dictionary; }
 
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/src/io/comp/brotli_dict.h b/cpp/src/io/comp/brotli_dict.h
index c4114b7fbcf..4c1fec1492c 100644
--- a/cpp/src/io/comp/brotli_dict.h
+++ b/cpp/src/io/comp/brotli_dict.h
@@ -79,7 +79,7 @@ struct brotli_dictionary_s {
 constexpr int brotli_min_dictionary_word_length = 4;
 constexpr int brotli_max_dictionary_word_length = 24;
 
-const brotli_dictionary_s *get_brotli_dictionary(void);
+const brotli_dictionary_s* get_brotli_dictionary(void);
 
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/src/io/comp/cpu_unbz2.cpp b/cpp/src/io/comp/cpu_unbz2.cpp
index 28d7394e485..f4cb6edd41f 100644
--- a/cpp/src/io/comp/cpu_unbz2.cpp
+++ b/cpp/src/io/comp/cpu_unbz2.cpp
@@ -101,13 +101,13 @@ namespace io {
 
 // Constants for the back end.
 
 #define BZ_MAX_ALPHA_SIZE 258
-#define BZ_MAX_CODE_LEN   23
+#define BZ_MAX_CODE_LEN 23
 
 #define BZ_RUNA 0
 #define BZ_RUNB 1
 
 #define BZ_N_GROUPS 6
-#define BZ_G_SIZE   50
+#define BZ_G_SIZE 50
 
 #define BZ_MAX_SELECTORS (2 + (900000 / BZ_G_SIZE))
 
@@ -121,16 +121,16 @@ typedef struct {
 // Decoder state
 typedef struct {
   // Input
-  const uint8_t *cur;
-  const uint8_t *end;
-  const uint8_t *base;
+  const uint8_t* cur;
+  const uint8_t* end;
+  const uint8_t* base;
   uint64_t bitbuf;
   uint32_t bitpos;
 
   // Output
-  uint8_t *out;
-  uint8_t *outend;
-  uint8_t *outbase;
+  uint8_t* out;
+  uint8_t* outend;
+  uint8_t* outbase;
 
   // misc administratium
   uint32_t blockSize100k;
@@ -156,25 +156,25 @@ typedef struct {
 } unbz_state_s;
 
 // return next 32 bits
-static inline uint32_t next32bits(const unbz_state_s *s)
+static inline uint32_t next32bits(const unbz_state_s* s)
 {
   return (uint32_t)((s->bitbuf << s->bitpos) >> 32);
 }
 
 // return next n bits
-static inline uint32_t showbits(const unbz_state_s *s, uint32_t n)
+static inline uint32_t showbits(const unbz_state_s* s, uint32_t n)
 {
   return (uint32_t)((s->bitbuf << s->bitpos) >> (64 - n));
 }
 
 // update bit position, refill bit buffer if necessary
-static void skipbits(unbz_state_s *s, uint32_t n)
+static void skipbits(unbz_state_s* s, uint32_t n)
 {
   uint32_t bitpos = s->bitpos + n;
   if (bitpos >= 32) {
-    const uint8_t *cur = s->cur + 4;
+    const uint8_t* cur = s->cur + 4;
     uint32_t next32 =
-      (cur + 4 < s->end) ? __builtin_bswap32(*reinterpret_cast<const uint32_t *>(cur + 4)) : 0;
+      (cur + 4 < s->end) ? __builtin_bswap32(*reinterpret_cast<const uint32_t*>(cur + 4)) : 0;
     s->cur = cur;
     s->bitbuf = (s->bitbuf << 32) | next32;
     bitpos &= 0x1f;
@@ -182,7 +182,7 @@ static void skipbits(unbz_state_s *s, uint32_t n)
   s->bitpos = bitpos;
 }
 
-static inline uint32_t getbits(unbz_state_s *s, uint32_t n)
+static inline uint32_t getbits(unbz_state_s* s, uint32_t n)
 {
   uint32_t bits = showbits(s, n);
   skipbits(s, n);
@@ -190,7 +190,7 @@ static inline uint32_t getbits(unbz_state_s *s, uint32_t n)
 }
 
 /*---------------------------------------------------*/
-int32_t bz2_decompress_block(unbz_state_s *s)
+int32_t bz2_decompress_block(unbz_state_s* s)
 {
   int nInUse;
 
@@ -204,7 +204,7 @@ int32_t bz2_decompress_block(unbz_state_s *s)
   int32_t groupNo;
   int32_t groupPos;
   uint32_t nblock, nblockMAX;
-  const huff_s *gSel = nullptr;
+  const huff_s* gSel = nullptr;
 
   uint32_t inUse16;
   uint32_t sig0, sig1;
@@ -263,11 +263,11 @@ int32_t bz2_decompress_block(unbz_state_s *s)
   // Now the coding tables
   for (t = 0; t < nGroups; t++) {
     int32_t pp, vec;
-    uint8_t *length = &s->len[0];
+    uint8_t* length = &s->len[0];
     int32_t curr = getbits(s, 5);
     int32_t minLen = BZ_MAX_CODE_LEN - 1;
     int32_t maxLen = 0;
-    huff_s *sel = &s->ht[t];
+    huff_s* sel = &s->ht[t];
     for (i = 0; i < alphaSize; i++) {
       for (;;) {
         uint32_t v = showbits(s, 2);
@@ -297,9 +297,11 @@ int32_t bz2_decompress_block(unbz_state_s *s)
       sel->base[i] = 0;
       sel->limit[i] = 0;
     }
-    for (i = 0; i < alphaSize; i++) sel->base[length[i] + 1]++;
+    for (i = 0; i < alphaSize; i++)
+      sel->base[length[i] + 1]++;
 
-    for (i = 1; i < BZ_MAX_CODE_LEN; i++) sel->base[i] += sel->base[i - 1];
+    for (i = 1; i < BZ_MAX_CODE_LEN; i++)
+      sel->base[i] += sel->base[i - 1];
 
     vec = 0;
     for (i = minLen; i <= maxLen; i++) {
@@ -318,7 +320,8 @@ int32_t bz2_decompress_block(unbz_state_s *s)
   EOB = nInUse + 1;
   nblockMAX = 100000 * s->blockSize100k;
 
-  for (i = 0; i <= 255; i++) s->unzftab[i] = 0;
+  for (i = 0; i <= 255; i++)
+    s->unzftab[i] = 0;
 
   // MTF init
   {
@@ -456,7 +459,7 @@ int32_t bz2_decompress_block(unbz_state_s *s)
   // Verify the end-of-block signature: should be followed by another block or an end-of-stream
   // signature
   {
-    const uint8_t *save_cur = s->cur;
+    const uint8_t* save_cur = s->cur;
     uint64_t save_bitbuf = s->bitbuf;
     uint32_t save_bitpos = s->bitpos;
     sig0 = getbits(s, 24);
@@ -476,14 +479,14 @@ int32_t bz2_decompress_block(unbz_state_s *s)
   }
 }
 
-static void bzUnRLE(unbz_state_s *s)
+static void bzUnRLE(unbz_state_s* s)
 {
-  uint8_t *out = s->out;
-  uint8_t *outend = s->outend;
+  uint8_t* out = s->out;
+  uint8_t* outend = s->outend;
 
   int32_t rle_cnt = s->save_nblock;
   int cprev = -1;
-  std::vector<uint32_t> &tt = s->tt;
+  std::vector<uint32_t>& tt = s->tt;
   uint32_t pos = tt[s->origPtr] >> 8;
   int mask = ~0;
 
@@ -520,7 +523,7 @@ static void bzUnRLE(unbz_state_s *s)
 }
 
 int32_t cpu_bz2_uncompress(
-  const uint8_t *source, size_t sourceLen, uint8_t *dest, size_t *destLen, uint64_t *block_start)
+  const uint8_t* source, size_t sourceLen, uint8_t* dest, size_t* destLen, uint64_t* block_start)
 {
   unbz_state_s s{};
   uint32_t v;
@@ -534,7 +537,7 @@ int32_t cpu_bz2_uncompress(
   s.base = source;
   s.end = source + sourceLen - 4;  // We will not read the final combined CRC (last 4 bytes of the file)
-  s.bitbuf = __builtin_bswap64(*reinterpret_cast<const uint64_t *>(source));
+  s.bitbuf = __builtin_bswap64(*reinterpret_cast<const uint64_t*>(source));
   s.bitpos = 0;
 
   s.out = dest;
@@ -560,7 +563,7 @@ int32_t cpu_bz2_uncompress(
       s.cur = source + (size_t)(bit_offs >> 3);
       s.bitpos = (uint32_t)(bit_offs & 7);
       if (s.cur + 8 > s.end) return BZ_PARAM_ERROR;
-      s.bitbuf = __builtin_bswap64(*reinterpret_cast<const uint64_t *>(s.cur));
+      s.bitbuf = __builtin_bswap64(*reinterpret_cast<const uint64_t*>(s.cur));
     }
   }
diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu
index 541163eb086..3f38dce3fa3 100644
--- a/cpp/src/io/comp/debrotli.cu
+++ b/cpp/src/io/comp/debrotli.cu
@@ -97,12 +97,12 @@ __inline__ __device__ int brotli_context_lut(int mode) { return (mode << 9); }
 
 inline __device__ uint8_t brotli_transform_type(int idx) { return kTransformsData[(idx * 3) + 1]; }
 
-inline __device__ const uint8_t *brotli_transform_prefix(int idx)
+inline __device__ const uint8_t* brotli_transform_prefix(int idx)
 {
   return &kPrefixSuffix[kPrefixSuffixMap[kTransformsData[(idx * 3)]]];
 }
 
-inline __device__ const uint8_t *brotli_transform_suffix(int idx)
+inline __device__ const uint8_t* brotli_transform_suffix(int idx)
 {
   return &kPrefixSuffix[kPrefixSuffixMap[kTransformsData[(idx * 3) + 2]]];
 }
@@ -138,12 +138,12 @@ struct debrotli_huff_tree_group_s {
   uint16_t max_symbol;
   uint16_t num_htrees;
   uint16_t pad;
-  uint16_t *htrees[1];
+  uint16_t* htrees[1];
 };
 
 // Must be able to at least hold worst-case context maps, tree groups and context modes
 constexpr int local_heap_size =
-  (256 * 64 + 256 * 4 + 3 * (sizeof(debrotli_huff_tree_group_s) + 255 * sizeof(uint16_t *)) + 256 +
+  (256 * 64 + 256 * 4 + 3 * (sizeof(debrotli_huff_tree_group_s) + 255 * sizeof(uint16_t*)) + 256 +
    3 * brotli_huffman_max_size_258 * sizeof(uint16_t) +
    3 * brotli_huffman_max_size_26 * sizeof(uint16_t));
@@ -152,15 +152,15 @@ constexpr int local_heap_size =
  */
 struct debrotli_state_s {
   // Bitstream
-  const uint8_t *cur;
-  const uint8_t *end;
-  const uint8_t *base;
+  const uint8_t* cur;
+  const uint8_t* end;
+  const uint8_t* base;
   uint2 bitbuf;
   uint32_t bitpos;
   int32_t error;
   // Output
-  uint8_t *outbase;
-  uint8_t *out;
+  uint8_t* outbase;
+  uint8_t* out;
   size_t bytes_left;
   // Decoded symbols
   uint8_t window_bits;
@@ -178,19 +178,19 @@ struct debrotli_state_s {
   uint32_t meta_block_len;
   uint16_t heap_used;
   uint16_t heap_limit;
-  uint8_t *context_map;
-  uint8_t *dist_context_map;
-  uint8_t *context_modes;
-  uint8_t *fb_base;
+  uint8_t* context_map;
+  uint8_t* dist_context_map;
+  uint8_t* context_modes;
+  uint8_t* fb_base;
   uint32_t fb_size;
   uint8_t block_type_rb[6];
   uint8_t pad[2];
   int dist_rb_idx;
   int dist_rb[4];
-  debrotli_huff_tree_group_s *literal_hgroup;
-  debrotli_huff_tree_group_s *insert_copy_hgroup;
-  debrotli_huff_tree_group_s *distance_hgroup;
-  uint16_t *block_type_vlc[3];
+  debrotli_huff_tree_group_s* literal_hgroup;
+  debrotli_huff_tree_group_s* insert_copy_hgroup;
+  debrotli_huff_tree_group_s* distance_hgroup;
+  uint16_t* block_type_vlc[3];
   huff_scratch_s hs;
   uint32_t mtf[65];
   uint64_t heap[local_heap_size / 8];
@@ -199,54 +199,54 @@ inline __device__ uint32_t Log2Floor(uint32_t value) { return 32 - __clz(value); }
 
 /// @brief initializes the bit reader
-__device__ void initbits(debrotli_state_s *s, const uint8_t *base, size_t len, size_t pos = 0)
+__device__ void initbits(debrotli_state_s* s, const uint8_t* base, size_t len, size_t pos = 0)
 {
-  const uint8_t *p = base + pos;
+  const uint8_t* p = base + pos;
   uint32_t prefix_bytes = (uint32_t)(((size_t)p) & 3);
   p -= prefix_bytes;
   s->base = base;
   s->end = base + len;
   s->cur = p;
-  s->bitbuf.x = (p < s->end) ? *reinterpret_cast<const uint32_t *>(p) : 0;
+  s->bitbuf.x = (p < s->end) ? *reinterpret_cast<const uint32_t*>(p) : 0;
   p += 4;
-  s->bitbuf.y = (p < s->end) ? *reinterpret_cast<const uint32_t *>(p) : 0;
+  s->bitbuf.y = (p < s->end) ? *reinterpret_cast<const uint32_t*>(p) : 0;
   s->bitpos = prefix_bytes * 8;
 }
 
 // return next 32 bits
-inline __device__ uint32_t next32bits(const debrotli_state_s *s)
+inline __device__ uint32_t next32bits(const debrotli_state_s* s)
 {
   return __funnelshift_rc(s->bitbuf.x, s->bitbuf.y, s->bitpos);
 }
 
 /// return next n bits
-inline __device__ uint32_t showbits(const debrotli_state_s *s, uint32_t n)
+inline __device__ uint32_t showbits(const debrotli_state_s* s, uint32_t n)
 {
   uint32_t next32 = __funnelshift_rc(s->bitbuf.x, s->bitbuf.y, s->bitpos);
   return (next32 & ((1 << n) - 1));
 }
 
-inline __device__ void skipbits(debrotli_state_s *s, uint32_t n)
+inline __device__ void skipbits(debrotli_state_s* s, uint32_t n)
 {
   uint32_t bitpos = s->bitpos + n;
   if (bitpos >= 32) {
-    const uint8_t *cur = s->cur + 8;
+    const uint8_t* cur = s->cur + 8;
     s->bitbuf.x = s->bitbuf.y;
-    s->bitbuf.y = (cur < s->end) ? *reinterpret_cast<const uint32_t *>(cur) : 0;
+    s->bitbuf.y = (cur < s->end) ? *reinterpret_cast<const uint32_t*>(cur) : 0;
     s->cur = cur - 4;
     bitpos &= 0x1f;
   }
   s->bitpos = bitpos;
 }
 
-inline __device__ uint32_t getbits(debrotli_state_s *s, uint32_t n)
+inline __device__ uint32_t getbits(debrotli_state_s* s, uint32_t n)
 {
   uint32_t bits = showbits(s, n);
   skipbits(s, n);
   return bits;
 }
 
-inline __device__ uint32_t getbits_bytealign(debrotli_state_s *s)
+inline __device__ uint32_t getbits_bytealign(debrotli_state_s* s)
 {
   uint32_t n = (uint32_t)((-(int32_t)s->bitpos) & 7);
   uint32_t bits = showbits(s, n);
@@ -271,7 +271,7 @@ inline __device__ uint32_t getbits_bytealign(debrotli_state_s *s)
  * 65..128    xxxxxx1101
  * 129..256   xxxxxxx1111
 */
-static __device__ uint32_t getbits_u8vlc(debrotli_state_s *s)
+static __device__ uint32_t getbits_u8vlc(debrotli_state_s* s)
 {
   uint32_t next32 = next32bits(s);
   uint32_t v, len;
@@ -288,7 +288,7 @@ static __device__ uint32_t getbits_u8vlc(debrotli_state_s *s)
 }
 
 /// Decode a Huffman code with 8-bit initial lookup
-static __device__ uint32_t getvlc(debrotli_state_s *s, const uint16_t *lut)
+static __device__ uint32_t getvlc(debrotli_state_s* s, const uint16_t* lut)
 {
   uint32_t next32 = next32bits(s);
   uint32_t vlc, len;
@@ -308,12 +308,12 @@ static __device__ uint32_t getvlc(debrotli_state_s *s, const uint16_t *lut)
 }
 
 /// Alloc bytes from the local (shared mem) heap
-static __device__ uint8_t *local_alloc(debrotli_state_s *s, uint32_t bytes)
+static __device__ uint8_t* local_alloc(debrotli_state_s* s, uint32_t bytes)
 {
   int heap_used = s->heap_used;
   int len = (bytes + 7) >> 3;
   if (heap_used + len <= s->heap_limit) {
-    uint8_t *ptr = reinterpret_cast<uint8_t *>(&s->heap[heap_used]);
+    uint8_t* ptr = reinterpret_cast<uint8_t*>(&s->heap[heap_used]);
     s->heap_used = (uint16_t)(heap_used + len);
     return ptr;
   } else {
@@ -323,7 +323,7 @@ static __device__ uint8_t *local_alloc(debrotli_state_s *s, uint32_t bytes)
 
 /// Shrink the size of the local heap, returns ptr to end (used for stack-like intermediate
 /// allocations at the end of the heap)
-static __device__ uint8_t *local_heap_shrink(debrotli_state_s *s, uint32_t bytes)
+static __device__ uint8_t* local_heap_shrink(debrotli_state_s* s, uint32_t bytes)
 {
   int heap_used = s->heap_used;
   int heap_limit = s->heap_limit;
@@ -331,13 +331,13 @@ static __device__ uint8_t *local_heap_shrink(debrotli_state_s *s, uint32_t bytes
   if (heap_limit - len >= heap_used) {
     heap_limit -= len;
     s->heap_limit = (uint16_t)heap_limit;
-    return reinterpret_cast<uint8_t *>(&s->heap[heap_limit]);
+    return reinterpret_cast<uint8_t*>(&s->heap[heap_limit]);
   } else {
     return nullptr;
   }
 }
 
-static __device__ void local_heap_grow(debrotli_state_s *s, uint32_t bytes)
+static __device__ void local_heap_grow(debrotli_state_s* s, uint32_t bytes)
 {
   int len = (bytes + 7) >> 3;
   int heap_limit = s->heap_limit + len;
@@ -345,16 +345,16 @@ static __device__ void local_heap_grow(debrotli_state_s *s, uint32_t bytes)
 }
 
 /// Alloc memory from the fixed-size heap shared between all blocks (thread0-only)
-static __device__ uint8_t *ext_heap_alloc(uint32_t bytes,
-                                          uint8_t *ext_heap_base,
+static __device__ uint8_t* ext_heap_alloc(uint32_t bytes,
+                                          uint8_t* ext_heap_base,
                                           uint32_t ext_heap_size)
 {
   uint32_t len = (bytes + 0xf) & ~0xf;
-  volatile uint32_t *heap_ptr = reinterpret_cast<volatile uint32_t *>(ext_heap_base);
+  volatile uint32_t* heap_ptr = reinterpret_cast<volatile uint32_t*>(ext_heap_base);
   uint32_t first_free_block = ~0;
   for (;;) {
     uint32_t blk_next, blk_prev;
-    first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block);
+    first_free_block = atomicExch((unsigned int*)heap_ptr, first_free_block);
     if (first_free_block == ~0 || first_free_block >= ext_heap_size) {
       // Some other block is holding the heap or there are no free blocks: try again later
       continue;
@@ -373,7 +373,7 @@ static __device__ uint8_t *ext_heap_alloc(uint32_t bytes,
       uint32_t next, blksz;
       if (((blk_next & 3) != 0) || (blk_next >= ext_heap_size)) {
         // Corrupted heap
-        atomicExch((unsigned int *)heap_ptr, first_free_block);
+        atomicExch((unsigned int*)heap_ptr, first_free_block);
         return nullptr;
       }
       next = heap_ptr[(blk_next >> 2) + 0];
@@ -398,14 +398,14 @@ static __device__ uint8_t *ext_heap_alloc(uint32_t bytes,
         }
         __threadfence();
         // Restore the list head
-        atomicExch((unsigned int *)heap_ptr, first_free_block);
+        atomicExch((unsigned int*)heap_ptr, first_free_block);
         return ext_heap_base + blk_next;
       } else {
         blk_prev = blk_next;
         blk_next = next;
       }
     } while (blk_next != 0 && blk_next < ext_heap_size);
-    first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block);
+    first_free_block = atomicExch((unsigned int*)heap_ptr, first_free_block);
     // Reaching here means the heap is full
     // Just in case we're trying to allocate more than the entire heap
     if (len > ext_heap_size - 4 * sizeof(uint32_t)) { break; }
@@ -414,17 +414,17 @@ static __device__ uint8_t *ext_heap_alloc(uint32_t bytes,
 }
 
 /// Free a memory block (thread0-only)
-static __device__ void ext_heap_free(void *ptr,
+static __device__ void ext_heap_free(void* ptr,
                                      uint32_t bytes,
-                                     uint8_t *ext_heap_base,
+                                     uint8_t* ext_heap_base,
                                      uint32_t ext_heap_size)
 {
   uint32_t len = (bytes + 0xf) & ~0xf;
-  volatile uint32_t *heap_ptr = (volatile uint32_t *)ext_heap_base;
+  volatile uint32_t* heap_ptr = (volatile uint32_t*)ext_heap_base;
   uint32_t first_free_block = ~0;
-  uint32_t cur_blk = static_cast<uint32_t>(static_cast<uint8_t *>(ptr) - ext_heap_base);
+  uint32_t cur_blk = static_cast<uint32_t>(static_cast<uint8_t*>(ptr) - ext_heap_base);
   for (;;) {
-    first_free_block = atomicExch((unsigned int *)heap_ptr, first_free_block);
+    first_free_block = atomicExch((unsigned int*)heap_ptr, first_free_block);
     if (first_free_block != ~0) { break; }
     // Some other block is holding the heap
   }
@@ -485,12 +485,12 @@ static __device__ void ext_heap_free(void *ptr,
     }
   }
   __threadfence();
-  atomicExch((unsigned int *)heap_ptr, first_free_block);
+  atomicExch((unsigned int*)heap_ptr, first_free_block);
 }
 
-static __device__ uint32_t BuildSimpleHuffmanTable(uint16_t *lut,
+static __device__ uint32_t BuildSimpleHuffmanTable(uint16_t* lut,
                                                    int root_bits,
-                                                   uint16_t *val,
+                                                   uint16_t* val,
                                                    uint32_t num_symbols)
 {
   uint32_t table_size = 1;
@@ -562,7 +562,7 @@ static __device__ uint32_t BuildSimpleHuffmanTable(uint16_t *lut,
   return goal_size;
 }
 
-static __device__ void BuildCodeLengthsHuffmanTable(huff_scratch_s *hs)
+static __device__ void BuildCodeLengthsHuffmanTable(huff_scratch_s* hs)
 {
   uint32_t code;  // current table entry
   int symbol;     // symbol index in original or sorted table
@@ -592,7 +592,9 @@ static __device__ void BuildCodeLengthsHuffmanTable(huff_scratch_s *hs)
   // Special case: all symbols but one have 0 code length.
   if (hs->offset[0] == 0) {
     code = huffcode(0, hs->sorted[0]);
-    for (key = 0; key < table_size; ++key) { hs->lenvlctab[key] = code; }
+    for (key = 0; key < table_size; ++key) {
+      hs->lenvlctab[key] = code;
+    }
     return;
   }
 
@@ -606,7 +608,7 @@ static __device__ void BuildCodeLengthsHuffmanTable(huff_scratch_s *hs)
     for (int bits_count = hs->code_length_histo[bits]; bits_count != 0; --bits_count) {
       int end = table_size;
       code = huffcode(bits, hs->sorted[symbol++]);
-      uint16_t *p = &hs->lenvlctab[brev8(key)];
+      uint16_t* p = &hs->lenvlctab[brev8(key)];
       do {
         end -= step;
         p[end] = code;
@@ -621,7 +623,7 @@ static __device__ void BuildCodeLengthsHuffmanTable(huff_scratch_s *hs)
 // Returns the table width of the next 2nd level table. |count| is the histogram
 // of bit lengths for the remaining symbols, |len| is the code length of the
 // next processed symbol.
-static __device__ int NextTableBitSize(const uint16_t *const count, int len, int root_bits)
+static __device__ int NextTableBitSize(const uint16_t* const count, int len, int root_bits)
 {
   int left = 1 << (len - root_bits);
   while (len < 15) {
@@ -634,13 +636,13 @@ static __device__ int NextTableBitSize(const uint16_t *const count, int len, int
 }
 
 // Build a huffman lookup table (currently thread0-only)
-static __device__ uint32_t BuildHuffmanTable(uint16_t *root_lut,
+static __device__ uint32_t BuildHuffmanTable(uint16_t* root_lut,
                                              int root_bits,
-                                             const uint16_t *const symbol_lists,
-                                             uint16_t *count)
+                                             const uint16_t* const symbol_lists,
+                                             uint16_t* count)
 {
   uint32_t code;  // current table entry
-  uint16_t *lut;  // next available space in table
+  uint16_t* lut;  // next available space in table
   int len;        // current code length
   int symbol;     // symbol index in original or sorted table
   int key;        // prefix code
@@ -654,7 +656,8 @@ static __device__ uint32_t BuildHuffmanTable(uint16_t *root_lut,
   int max_length = -1;
   int bits;
 
-  while (symbol_lists[max_length] == 0xFFFF) max_length--;
+  while (symbol_lists[max_length] == 0xFFFF)
+    max_length--;
   max_length += 16;
 
   lut = root_lut;
@@ -677,7 +680,7 @@ static __device__ uint32_t BuildHuffmanTable(uint16_t *root_lut,
     for (int bits_count = count[bits]; bits_count != 0; --bits_count) {
       symbol = symbol_lists[symbol];
       code = huffcode(bits, symbol);
-      uint16_t *p = &lut[brev8(key)];
+      uint16_t* p = &lut[brev8(key)];
       int end = table_size;
       do {
        end -= step;
@@ -715,7 +718,7 @@ static __device__ uint32_t BuildHuffmanTable(uint16_t *root_lut,
       }
       symbol = symbol_lists[symbol];
       code = huffcode(len - root_bits, symbol);
-      uint16_t *p = &lut[brev8(sub_key)];
+      uint16_t* p = &lut[brev8(sub_key)];
       int end = table_size;
       do {
         end -= step;
@@ -883,10 +886,10 @@ invalid.
 */
 
 // Decode Huffman tree (thread0-only)
-static __device__ uint32_t DecodeHuffmanTree(debrotli_state_s *s,
+static __device__ uint32_t DecodeHuffmanTree(debrotli_state_s* s,
                                              uint32_t alphabet_size,
                                              uint32_t max_symbol,
-                                             uint16_t *vlctab)
+                                             uint16_t* vlctab)
 {
   uint32_t prefix_code_type;
@@ -916,8 +919,8 @@ static __device__ uint32_t DecodeHuffmanTree(debrotli_state_s *s,
       vlctab, huffman_lookup_table_width, s->hs.symbols_lists_array, nsym);
   } else {
     // Complex prefix code
-    huff_scratch_s *const hs = &s->hs;
-    uint16_t *symbol_lists =
+    huff_scratch_s* const hs = &s->hs;
+    uint16_t* symbol_lists =
       &s->hs.symbols_lists_array[16];  // Make small negative indexes addressable
     uint32_t space = 32, num_codes = 0, i, prev_code_len, symbol, repeat, repeat_code_len;
@@ -1069,7 +1072,7 @@ formula :
 
 window size = (1 << WBITS) - 16
 
 */
-static __device__ void DecodeStreamHeader(debrotli_state_s *s)
+static __device__ void DecodeStreamHeader(debrotli_state_s* s)
 {
   uint32_t next32 = next32bits(s);
   uint32_t wbits = 0, len = 0;
@@ -1155,7 +1158,7 @@ not set(if the ignored bits are not all zeros, the stream should be rejected as
 invalid)
 
 */
-static __device__ void DecodeMetaBlockHeader(debrotli_state_s *s)
+static __device__ void DecodeMetaBlockHeader(debrotli_state_s* s)
 {
   uint32_t next32 = next32bits(s);
   uint32_t len = 1, is_empty = 0;
@@ -1195,7 +1198,9 @@ static __device__ void DecodeMetaBlockHeader(debrotli_state_s *s)
       }
       skipbits(s, len);
       if (getbits_bytealign(s) != 0) { s->error = 1; }
-      for (len = mskiplen; len >= 32; len -= 32) { skipbits(s, 32); }
+      for (len = mskiplen; len >= 32; len -= 32) {
+        skipbits(s, 32);
+      }
     }
   }
   skipbits(s, len);
@@ -1238,17 +1243,17 @@ Block count code + extra bits for first distance block count, appears only if
 NBLTYPESD >= 2
 
 */
-static __device__ void DecodeHuffmanTables(debrotli_state_s *s)
+static __device__ void DecodeHuffmanTables(debrotli_state_s* s)
 {
   for (int b = 0; b < 3; b++) {
     uint32_t nbltypes = 1 + getbits_u8vlc(s);
     s->num_block_types[b] = nbltypes;
     if (nbltypes >= 2) {
       uint32_t alphabet_size = nbltypes + 2, index, nbits, maxtblsz;
-      uint16_t *vlctab;
+      uint16_t* vlctab;
       maxtblsz = kMaxHuffmanTableSize[(alphabet_size + 31) >> 5];
       maxtblsz = (maxtblsz > brotli_huffman_max_size_258) ? brotli_huffman_max_size_258 : maxtblsz;
-      vlctab = reinterpret_cast<uint16_t *>(
+      vlctab = reinterpret_cast<uint16_t*>(
         local_alloc(s, (brotli_huffman_max_size_26 + maxtblsz) * sizeof(uint16_t)));
       s->block_type_vlc[b] = vlctab;
       DecodeHuffmanTree(s, alphabet_size, alphabet_size, vlctab + brotli_huffman_max_size_26);
@@ -1286,13 +1291,13 @@ static __device__ void DecodeHuffmanTables(debrotli_state_s *s)
 * Most of input values are 0 and 1. To reduce number of branches, we replace
 * inner for loop with do-while.
 */
-static __device__ void InverseMoveToFrontTransform(debrotli_state_s *s, uint8_t *v, uint32_t v_len)
+static __device__ void InverseMoveToFrontTransform(debrotli_state_s* s, uint8_t* v, uint32_t v_len)
 {
   // Reinitialize elements that could have been changed.
   uint32_t i = 1;
   uint32_t upper_bound = s->mtf_upper_bound;
-  uint32_t *mtf = &s->mtf[1];  // Make mtf[-1] addressable.
-  uint8_t *mtf_u8 = reinterpret_cast<uint8_t *>(mtf);
+  uint32_t* mtf = &s->mtf[1];  // Make mtf[-1] addressable.
+  uint8_t* mtf_u8 = reinterpret_cast<uint8_t*>(mtf);
   uint32_t pattern = 0x03020100;  // Little-endian
   // Initialize list using 4 consequent values pattern.
@@ -1320,10 +1325,10 @@ static __device__ void InverseMoveToFrontTransform(debrotli_state_s *s, uint8_t
   s->mtf_upper_bound = upper_bound >> 2;
 }
 
-static __device__ uint32_t DecodeContextMap(debrotli_state_s *s,
-                                            uint8_t *context_map,
+static __device__ uint32_t DecodeContextMap(debrotli_state_s* s,
+                                            uint8_t* context_map,
                                             uint32_t context_map_size,
-                                            uint16_t *context_map_vlc)
+                                            uint16_t* context_map_vlc)
 {
   uint32_t num_htrees = getbits_u8vlc(s) + 1;
   uint32_t bits, context_index, max_run_length_prefix, alphabet_size;
@@ -1367,7 +1372,7 @@ static __device__ uint32_t DecodeContextMap(debrotli_state_s *s,
   return num_htrees;
 }
 
-static __device__ void DetectTrivialLiteralBlockTypes(debrotli_state_s *s)
+static __device__ void DetectTrivialLiteralBlockTypes(debrotli_state_s* s)
 {
   uint32_t i;
   for (i = 0; i < s->num_block_types[0]; i++) {
@@ -1375,7 +1380,9 @@ static __device__ void DetectTrivialLiteralBlockTypes(debrotli_state_s *s)
     uint32_t error = 0;
     uint32_t sample = s->context_map[offset];
     uint32_t j;
-    for (j = 0; j < (1u << 6); ++j) { error |= s->context_map[offset + j] ^ sample; }
+    for (j = 0; j < (1u << 6); ++j) {
+      error |= s->context_map[offset + j] ^ sample;
+    }
     if (error == 0) { s->context_modes[i] |= 4u; }
   }
 }
@@ -1405,13 +1412,13 @@ appears only if NTREESD >= 2; otherwise, the context map has only zero values
 
 */
-static __device__ debrotli_huff_tree_group_s *HuffmanTreeGroupInit(debrotli_state_s *s,
-                                                                   uint32_t alphabet_size,
-                                                                   uint32_t max_symbol,
-                                                                   uint32_t ntrees)
+static __device__ debrotli_huff_tree_group_s* HuffmanTreeGroupInit(debrotli_state_s* s,
+                                                                   uint32_t alphabet_size,
+                                                                   uint32_t max_symbol,
+                                                                   uint32_t ntrees)
 {
-  debrotli_huff_tree_group_s *group = reinterpret_cast<debrotli_huff_tree_group_s *>(local_alloc(
-    s, sizeof(debrotli_huff_tree_group_s) + ntrees * sizeof(uint16_t *) - sizeof(uint16_t *)));
+  debrotli_huff_tree_group_s* group = reinterpret_cast<debrotli_huff_tree_group_s*>(local_alloc(
+    s, sizeof(debrotli_huff_tree_group_s) + ntrees * sizeof(uint16_t*) - sizeof(uint16_t*)));
   group->alphabet_size = (uint16_t)alphabet_size;
   group->max_symbol = (uint16_t)max_symbol;
   group->num_htrees = (uint16_t)ntrees;
@@ -1419,26 +1426,26 @@ static __device__ debrotli_huff_tree_group_s *HuffmanTreeGroupInit(debrotli_stat
   return group;
 }
 
-static __device__ void HuffmanTreeGroupAlloc(debrotli_state_s *s, debrotli_huff_tree_group_s *group)
+static __device__ void HuffmanTreeGroupAlloc(debrotli_state_s* s, debrotli_huff_tree_group_s* group)
 {
   if (!group->htrees[0]) {
     uint32_t alphabet_size = group->alphabet_size;
     uint32_t ntrees = group->num_htrees;
     uint32_t max_table_size = kMaxHuffmanTableSize[(alphabet_size + 31) >> 5];
     uint32_t code_size = sizeof(uint16_t) * ntrees * max_table_size;
-    group->htrees[0] = reinterpret_cast<uint16_t *>(local_alloc(s, code_size));
+    group->htrees[0] = reinterpret_cast<uint16_t*>(local_alloc(s, code_size));
     if (!group->htrees[0]) {
-      if (s->fb_base) { group->htrees[0] = reinterpret_cast<uint16_t *>(s->fb_base + s->fb_size); }
+      if (s->fb_base) { group->htrees[0] = reinterpret_cast<uint16_t*>(s->fb_base + s->fb_size); }
       s->fb_size += (code_size + 3) & ~3;
     }
   }
 }
 
 // Decodes a series of Huffman table using ReadHuffmanCode function.
-static __device__ void HuffmanTreeGroupDecode(debrotli_state_s *s,
-                                              debrotli_huff_tree_group_s *group)
+static __device__ void HuffmanTreeGroupDecode(debrotli_state_s* s,
+                                              debrotli_huff_tree_group_s* group)
 {
-  uint16_t *next = group->htrees[0];
+  uint16_t* next = group->htrees[0];
   for (int htree_index = 0; htree_index < group->num_htrees; htree_index++) {
     uint32_t table_size = DecodeHuffmanTree(s, group->alphabet_size, group->max_symbol, next);
@@ -1448,13 +1455,13 @@ static __device__ void HuffmanTreeGroupDecode(debrotli_state_s *s,
   }
 }
 
-static __device__ void DecodeHuffmanTreeGroups(debrotli_state_s *s,
-                                               uint8_t *fb_heap_base,
+static __device__ void DecodeHuffmanTreeGroups(debrotli_state_s* s,
+                                               uint8_t* fb_heap_base,
                                                uint32_t fb_heap_size)
 {
   uint32_t bits, npostfix, ndirect, nbltypesl;
   uint32_t context_map_size;
-  uint16_t *context_map_vlc;
+  uint16_t* context_map_vlc;
   uint32_t num_direct_codes, num_distance_codes, num_literal_htrees, num_dist_htrees;
 
   // Decode context maps
@@ -1466,8 +1473,10 @@ static __device__ void DecodeHuffmanTreeGroups(debrotli_state_s *s,
   s->distance_postfix_mask = (1 << npostfix) - 1;
   nbltypesl = s->num_block_types[0];
   s->context_modes = local_alloc(s, nbltypesl);
-  for (uint32_t i = 0; i < nbltypesl; i++) { s->context_modes[i] = getbits(s, 2); }
-  context_map_vlc = reinterpret_cast<uint16_t *>(
+  for (uint32_t i = 0; i < nbltypesl; i++) {
+    s->context_modes[i] = getbits(s, 2);
+  }
+  context_map_vlc = reinterpret_cast<uint16_t*>(
     local_heap_shrink(s, brotli_huffman_max_size_272 * sizeof(uint16_t)));
   context_map_size = nbltypesl << 6;
   s->context_map = local_alloc(s, context_map_size);
@@ -1514,7 +1523,7 @@ static __device__ void DecodeHuffmanTreeGroups(debrotli_state_s *s,
   HuffmanTreeGroupDecode(s, s->distance_hgroup);
 }
 
-static __device__ int PrepareLiteralDecoding(debrotli_state_s *s, const uint8_t *&context_map_slice)
+static __device__ int PrepareLiteralDecoding(debrotli_state_s* s, const uint8_t*& context_map_slice)
 {
   int context_mode;
   uint32_t block_type = s->block_type_rb[1];
@@ -1525,13 +1534,13 @@ static __device__ int PrepareLiteralDecoding(debrotli_state_s *s, const uint8_t
 }
 
 /// Decodes a command or literal and updates block type ring-buffer. Reads 3..54 bits.
-static __device__ uint32_t DecodeBlockTypeAndLength(debrotli_state_s *s, int tree_type)
+static __device__ uint32_t DecodeBlockTypeAndLength(debrotli_state_s* s, int tree_type)
 {
   uint32_t max_block_type = s->num_block_types[tree_type];
   if (max_block_type > 1) {
-    const uint16_t *len_tree = s->block_type_vlc[tree_type];
-    const uint16_t *type_tree = len_tree + brotli_huffman_max_size_26;
-    uint8_t *ringbuffer = &s->block_type_rb[tree_type * 2];
+    const uint16_t* len_tree = s->block_type_vlc[tree_type];
+    const uint16_t* type_tree = len_tree + brotli_huffman_max_size_26;
+    uint8_t* ringbuffer = &s->block_type_rb[tree_type * 2];
     // Read 0..15 + 3..39 bits.
     uint32_t block_type = getvlc(s, type_tree);
     uint32_t block_len = getvlc(s, len_tree);
@@ -1553,7 +1562,7 @@ static __device__ uint32_t DecodeBlockTypeAndLength(debrotli_state_s *s, int tre
   }
 }
 
-inline __device__ int ToUpperCase(uint8_t *p)
+inline __device__ int ToUpperCase(uint8_t* p)
 {
   if (p[0] < 0xC0) {
     if (p[0] >= 'a' && p[0] <= 'z') { p[0] ^= 32; }
@@ -1569,18 +1578,20 @@ inline __device__ int ToUpperCase(uint8_t *p)
   return 3;
 }
 
-static __device__ int TransformDictionaryWord(uint8_t *dst,
-                                              const uint8_t *word,
+static __device__ int TransformDictionaryWord(uint8_t* dst,
+                                              const uint8_t* word,
                                               int len,
                                               int transform_idx)
 {
   int idx = 0;
-  const uint8_t *prefix = brotli_transform_prefix(transform_idx);
+  const uint8_t* prefix = brotli_transform_prefix(transform_idx);
   uint8_t type = brotli_transform_type(transform_idx);
-  const uint8_t *suffix = brotli_transform_suffix(transform_idx);
+  const uint8_t* suffix = brotli_transform_suffix(transform_idx);
   {
     int prefix_len = *prefix++;
-    while (prefix_len--) { dst[idx++] = *prefix++; }
+    while (prefix_len--) {
+      dst[idx++] = *prefix++;
+    }
   }
   {
     const int t = type;
@@ -1592,11 +1603,13 @@ static __device__ int TransformDictionaryWord(uint8_t *dst,
       word += skip;
       len -= skip;
     }
-    while (i < len) { dst[idx++] = word[i++]; }
+    while (i < len) {
+      dst[idx++] = word[i++];
+    }
     if (t == BROTLI_TRANSFORM_UPPERCASE_FIRST) {
       ToUpperCase(&dst[idx - len]);
     } else if (t == BROTLI_TRANSFORM_UPPERCASE_ALL) {
-      uint8_t *uppercase = &dst[idx - len];
+      uint8_t* uppercase = &dst[idx - len];
       while (len > 0) {
         int step = ToUpperCase(uppercase);
         uppercase += step;
@@ -1606,24 +1619,26 @@ static __device__ int TransformDictionaryWord(uint8_t *dst,
   }
   {
     int suffix_len = *suffix++;
-    while (suffix_len--) { dst[idx++] = *suffix++; }
+    while (suffix_len--) {
+      dst[idx++] = *suffix++;
+    }
     return idx;
   }
 }
 
 /// ProcessCommands, actual decoding: 1 warp, most work done by thread0
-static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_dictionary_s *words, int t)
+static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_dictionary_s* words, int t)
 {
   int32_t meta_block_len = s->meta_block_len;
-  uint8_t *out = s->out;
+  uint8_t* out = s->out;
   int32_t pos = 0;
   int p1 = s->p1;
   int p2 = s->p2;
-  const uint16_t *htree_command;
+  const uint16_t* htree_command;
   const uint8_t *context_map_slice, *dist_context_map_slice;
   int dist_rb_idx;
   uint32_t blen_L, blen_I, blen_D;
-  uint8_t *const dict_scratch = reinterpret_cast<uint8_t *>(
+  uint8_t* const dict_scratch = reinterpret_cast<uint8_t*>(
     &s->hs);  // 24+13 bytes (max length of a dictionary word including prefix & suffix)
   int context_mode;
@@ -1678,7 +1693,7 @@ static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_diction
       insert_length -= len;
       blen_L -= len;
       if (brotli_need_context_lut(context_mode)) {
-        const debrotli_huff_tree_group_s *literal_hgroup = s->literal_hgroup;
+        const debrotli_huff_tree_group_s* literal_hgroup = s->literal_hgroup;
         do {
           int context = brotli_context(p1, p2, context_mode);
           p2 = p1;
@@ -1686,7 +1701,7 @@ static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_diction
           out[pos++] = p1;
         } while (--len);
       } else {
-        const uint16_t *literal_htree = s->literal_hgroup->htrees[context_map_slice[0]];
+        const uint16_t* literal_htree = s->literal_hgroup->htrees[context_map_slice[0]];
         do {
           p2 = p1;
          p1 = getvlc(s, literal_htree);
@@ -1704,7 +1719,7 @@ static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_diction
         distance_code = s->dist_rb[dist_rb_idx & 3];
         distance_context = 1;
       } else {
-        const uint16_t *distance_tree;
+        const uint16_t* distance_tree;
         int distval;
         // Read distance code in the command, unless it was implicitly zero.
         if (blen_D == 0) {
@@ -1847,7 +1862,7 @@ static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_diction
       if (distance_code > 0) {
         // Copy
         for (uint32_t i = t; i < copy_length; i += 32) {
-          const uint8_t *src =
+          const uint8_t* src =
             out + pos + ((i >= (uint32_t)distance_code) ? (i % (uint32_t)distance_code) : i) -
             distance_code;
           b = *src;
@@ -1855,7 +1870,7 @@ static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_diction
         }
       } else {
         // Dictionary
-        const uint8_t *src = (distance_code < 0) ? &words->data[-distance_code] : dict_scratch;
+        const uint8_t* src = (distance_code < 0) ? &words->data[-distance_code] : dict_scratch;
         if (t < copy_length) {
           b = src[t];
           out[pos + t] = b;
@@ -1891,9 +1906,9 @@ static __device__ void ProcessCommands(debrotli_state_s *s, const brotli_diction
 * @param count Number of blocks to decompress
 */
 extern "C" __global__ void __launch_bounds__(block_size, 2)
-  gpu_debrotli_kernel(gpu_inflate_input_s *inputs,
-                      gpu_inflate_status_s *outputs,
-                      uint8_t *scratch,
+  gpu_debrotli_kernel(gpu_inflate_input_s* inputs,
+                      gpu_inflate_status_s* outputs,
+                      uint8_t* scratch,
                       uint32_t scratch_size,
                       uint32_t count)
 {
@@ -1901,16 +1916,16 @@ extern "C" __global__ void __launch_bounds__(block_size, 2)
   int t = threadIdx.x;
   int z = blockIdx.x;
 
-  debrotli_state_s *const s = &state_g;
+  debrotli_state_s* const s = &state_g;
   if (z >= count) { return; }
   // Thread0: initializes shared state and decode stream header
   if (!t) {
-    uint8_t const *src = static_cast<uint8_t const *>(inputs[z].srcDevice);
+    uint8_t const* src = static_cast<uint8_t const*>(inputs[z].srcDevice);
     size_t src_size = inputs[z].srcSize;
     if (src && src_size >= 8) {
       s->error = 0;
-      s->out = s->outbase = static_cast<uint8_t *>(inputs[z].dstDevice);
+      s->out = s->outbase = static_cast<uint8_t*>(inputs[z].dstDevice);
       s->bytes_left = inputs[z].dstSize;
       s->mtf_upper_bound = 63;
       s->dist_rb[0] = 16;
@@ -1940,8 +1955,8 @@ extern "C" __global__ void __launch_bounds__(block_size, 2)
     if (!s->error && s->meta_block_len != 0) {
       if (s->is_uncompressed) {
         // Uncompressed block
-        const uint8_t *src = s->cur + ((s->bitpos + 7) >> 3);
-        uint8_t *dst = s->out;
+        const uint8_t* src = s->cur + ((s->bitpos + 7) >> 3);
+        uint8_t* dst = s->out;
         if (!t) {
           if (getbits_bytealign(s) != 0) {
             s->error = -1;
@@ -1954,7 +1969,9 @@ extern "C" __global__ void __launch_bounds__(block_size, 2)
         __syncthreads();
         if (!s->error) {
           // Simple block-wide memcpy
-          for (int32_t i = t; i < s->meta_block_len; i += block_size) { dst[i] = src[i]; }
+          for (int32_t i = t; i < s->meta_block_len; i += block_size) {
+            dst[i] = src[i];
+          }
         }
       } else {
         // Compressed block
@@ -1971,8 +1988,7 @@ extern "C" __global__ void __launch_bounds__(block_size, 2)
       if (!s->error) {
         // Warp0: Decode compressed block, warps 1..7 are all idle (!)
         if (t < 32)
-          ProcessCommands(
-            s, reinterpret_cast<brotli_dictionary_s *>(scratch + scratch_size), t);
+          ProcessCommands(s, reinterpret_cast<brotli_dictionary_s*>(scratch + scratch_size), t);
         __syncthreads();
       }
       // Free any allocated memory
@@ -2053,16 +2069,16 @@ size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs)
 #include <stdio.h>
 #endif
 
-cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s *inputs,
-                                  gpu_inflate_status_s *outputs,
-                                  void *scratch,
+cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs,
+                                  gpu_inflate_status_s* outputs,
+                                  void* scratch,
                                   size_t scratch_size,
                                   int count,
                                   rmm::cuda_stream_view stream)
 {
   uint32_t count32 = (count > 0) ? count : 0;
   uint32_t fb_heap_size;
-  uint8_t *scratch_u8 = static_cast<uint8_t *>(scratch);
+  uint8_t* scratch_u8 = static_cast<uint8_t*>(scratch);
   dim3 dim_block(block_size, 1);
   dim3 dim_grid(count32, 1);  // TODO: Check max grid dimensions vs max expected count
diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu
index eda1d37f78c..338af72e4c9 100644
--- a/cpp/src/io/comp/gpuinflate.cu
+++ b/cpp/src/io/comp/gpuinflate.cu
@@ -102,15 +102,15 @@ constexpr int prefetch_size = (1 << log2_prefetch_size);
 
 /// @brief Prefetcher state
 struct prefetch_queue_s {
-  const uint8_t *cur_p;  ///< Prefetch location
+  const uint8_t* cur_p;  ///< Prefetch location
   int run;               ///< prefetcher will exit when run=0
   uint8_t pref_data[prefetch_size];
 };
 
 template <typename T>
-inline __device__ volatile uint32_t *prefetch_addr32(volatile prefetch_queue_s &q, T *ptr)
+inline __device__ volatile uint32_t* prefetch_addr32(volatile prefetch_queue_s& q, T* ptr)
 {
-  return reinterpret_cast<volatile uint32_t *>(&q.pref_data[(prefetch_size - 4) & (size_t)(ptr)]);
+  return reinterpret_cast<volatile uint32_t*>(&q.pref_data[(prefetch_size - 4) & (size_t)(ptr)]);
 }
 
 #endif  // ENABLE_PREFETCH
@@ -120,12 +120,12 @@ inline __device__ volatile uint32_t *prefetch_addr32(volatile prefetch_queue_s &
 */
 struct inflate_state_s {
   // output state
-  uint8_t *out;      ///< output buffer
-  uint8_t *outbase;  ///< start of output buffer
-  uint8_t *outend;   ///< end of output buffer
+  uint8_t* out;      ///< output buffer
+  uint8_t* outbase;  ///< start of output buffer
+  uint8_t* outend;   ///< end of output buffer
   // Input state
-  uint8_t *cur;  ///< input buffer
-  uint8_t *end;  ///< end of input buffer
+  uint8_t* cur;  ///< input buffer
+  uint8_t* end;  ///< end of input buffer
 
   uint2 bitbuf;     ///< bit buffer (64-bit)
   uint32_t bitpos;  ///< position in bit buffer
@@ -165,24 +165,24 @@ inline __device__ unsigned int bfe(unsigned int source,
   return bits;
 };
 
-inline __device__ uint32_t showbits(inflate_state_s *s, uint32_t n)
+inline __device__ uint32_t showbits(inflate_state_s* s, uint32_t n)
 {
   uint32_t next32 = __funnelshift_rc(s->bitbuf.x, s->bitbuf.y, s->bitpos);
   return (next32 & ((1 << n) - 1));
 }
 
-inline __device__ uint32_t nextbits32(inflate_state_s *s)
+inline __device__ uint32_t nextbits32(inflate_state_s* s)
 {
   return __funnelshift_rc(s->bitbuf.x, s->bitbuf.y, s->bitpos);
 }
 
-inline __device__ void skipbits(inflate_state_s *s, uint32_t n)
+inline __device__ void skipbits(inflate_state_s* s, uint32_t n)
 {
   uint32_t bitpos = s->bitpos + n;
   if (bitpos >= 32) {
-    uint8_t *cur = s->cur + 8;
+    uint8_t* cur = s->cur + 8;
     s->bitbuf.x = s->bitbuf.y;
-    s->bitbuf.y = (cur < s->end) ? *reinterpret_cast<uint32_t *>(cur) : 0;
+    s->bitbuf.y = (cur < s->end) ? *reinterpret_cast<uint32_t*>(cur) : 0;
     s->cur = cur - 4;
     bitpos &= 0x1f;
   }
@@ -191,7 +191,7 @@ inline __device__ void skipbits(inflate_state_s *s, uint32_t n)
 
 // TODO: If we require 4-byte alignment of input bitstream & length (padded), reading bits would
 // become quite a bit faster
-__device__ uint32_t getbits(inflate_state_s *s, uint32_t n)
+__device__ uint32_t getbits(inflate_state_s* s, uint32_t n)
 {
   uint32_t v = showbits(s, n);
   skipbits(s, n);
@@ -222,7 +222,7 @@ __device__ uint32_t getbits(inflate_state_s *s, uint32_t n)
 * - Incomplete codes are handled by this decoder, since they are permitted
 *   in the deflate format.  See the format notes for fixed() and dynamic().
 */
-__device__ int decode(inflate_state_s *s, const int16_t *counts, const int16_t *symbols)
+__device__ int decode(inflate_state_s* s, const int16_t* counts, const int16_t* symbols)
 {
   unsigned int len;   // current number of bits in code
   unsigned int code;  // len bits being decoded
@@ -279,15 +279,16 @@ __device__ int decode(inflate_state_s *s, const int16_t *counts, const int16_t *
 *   the code bits definition.
 */
 __device__ int construct(
-  inflate_state_s *s, int16_t *counts, int16_t *symbols, const int16_t *length, int n)
+  inflate_state_s* s, int16_t* counts, int16_t* symbols, const int16_t* length, int n)
 {
   int symbol;  // current symbol when stepping through length[]
   int len;     // current length when stepping through counts[]
   int left;    // number of possible codes left of current length
-  int16_t *offs = s->u.scratch.offs;
+  int16_t* offs = s->u.scratch.offs;
 
   // count number of codes of each length
-  for (len = 0; len <= max_bits; len++) counts[len] = 0;
+  for (len = 0; len <= max_bits; len++)
+    counts[len] = 0;
   for (symbol = 0; symbol < n; symbol++)
     (counts[length[symbol]])++;  // assumes lengths are within bounds
   if (counts[0] == n)            // no codes!
@@ -303,7 +304,8 @@ __device__ int construct(
 
   // generate offsets into symbol table for each length for sorting
   offs[1] = 0;
-  for (len = 1; len < max_bits; len++) offs[len + 1] = offs[len] + counts[len];
+  for (len = 1; len < max_bits; len++)
+    offs[len + 1] = offs[len] + counts[len];
 
   // put symbols in table sorted by length, by symbol order within each length
   for (symbol = 0; symbol < n; symbol++)
@@ -318,12 +320,12 @@ __device__ int construct(
 static const __device__ __constant__ uint8_t g_code_order[19 + 1] = {
   16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15, 0xff};
 
 /// Dynamic block (custom huffman tables)
-__device__ int init_dynamic(inflate_state_s *s)
+__device__ int init_dynamic(inflate_state_s* s)
 {
   int nlen, ndist, ncode; /* number of lengths in descriptor */
   int index;              /* index of lengths[] */
   int err;                /* construct() return value */
-  int16_t *lengths = s->u.scratch.lengths;
+  int16_t* lengths = s->u.scratch.lengths;
 
   // get number of lengths in each table, check lengths
   nlen = getbits(s, 5) + 257;
@@ -333,8 +335,10 @@ __device__ int init_dynamic(inflate_state_s *s)
     return -3;  // bad counts
   }
   // read code length code lengths (really), missing lengths are zero
-  for (index = 0; index < ncode; index++) lengths[g_code_order[index]] = getbits(s, 3);
-  for (; index < 19; index++) lengths[g_code_order[index]] = 0;
+  for (index = 0; index < ncode; index++)
+    lengths[g_code_order[index]] = getbits(s, 3);
+  for (; index < 19; index++)
+    lengths[g_code_order[index]] = 0;
 
   // build huffman table for code lengths codes (use lencode temporarily)
   err = construct(s, s->lencnt, s->lensym, lengths, 19);
@@ -404,20 +408,25 @@ __device__ int init_dynamic(inflate_state_s *s)
 * length, this can be implemented as an incomplete code.  Then the invalid
 * codes are detected while decoding.
*/ -__device__ int init_fixed(inflate_state_s *s) +__device__ int init_fixed(inflate_state_s* s) { - int16_t *lengths = s->u.scratch.lengths; + int16_t* lengths = s->u.scratch.lengths; int symbol; // literal/length table - for (symbol = 0; symbol < 144; symbol++) lengths[symbol] = 8; - for (; symbol < 256; symbol++) lengths[symbol] = 9; - for (; symbol < 280; symbol++) lengths[symbol] = 7; - for (; symbol < fix_l_codes; symbol++) lengths[symbol] = 8; + for (symbol = 0; symbol < 144; symbol++) + lengths[symbol] = 8; + for (; symbol < 256; symbol++) + lengths[symbol] = 9; + for (; symbol < 280; symbol++) + lengths[symbol] = 7; + for (; symbol < fix_l_codes; symbol++) + lengths[symbol] = 8; construct(s, s->lencnt, s->lensym, lengths, fix_l_codes); // distance table - for (symbol = 0; symbol < max_d_codes; symbol++) lengths[symbol] = 5; + for (symbol = 0; symbol < max_d_codes; symbol++) + lengths[symbol] = 5; // build huffman table for distance codes construct(s, s->distcnt, s->distsym, lengths, max_d_codes); @@ -497,21 +506,21 @@ static const __device__ __constant__ uint16_t g_dext[30] = { // Extra bits for 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; /// @brief Thread 0 only: decode bitstreams and output symbols into the symbol queue -__device__ void decode_symbols(inflate_state_s *s) +__device__ void decode_symbols(inflate_state_s* s) { uint32_t bitpos = s->bitpos; uint2 bitbuf = s->bitbuf; - uint8_t *cur = s->cur; - uint8_t *end = s->end; + uint8_t* cur = s->cur; + uint8_t* end = s->end; int32_t batch = 0; int32_t sym, batch_len; do { - volatile uint32_t *b = &s->x.u.symqueue[batch * batch_size]; + volatile uint32_t* b = &s->x.u.symqueue[batch * batch_size]; // Wait for the next batch entry to be empty #if ENABLE_PREFETCH // Wait for prefetcher to fetch a worst-case of 48 bits per symbol - while ((*(volatile int32_t *)&s->pref.cur_p - (int32_t)(size_t)cur < batch_size * 6) || + while ((*(volatile int32_t*)&s->pref.cur_p - (int32_t)(size_t)cur < batch_size * 6) || (s->x.batch_len[batch] != 0)) {} #else while (s->x.batch_len[batch] != 0) {} @@ -544,7 +553,7 @@ __device__ void decode_symbols(inflate_state_s *s) } else { // Slow length path uint32_t next32r = __brev(next32); - const int16_t *symbols = &s->lensym[s->index_slow_len]; + const int16_t* symbols = &s->lensym[s->index_slow_len]; unsigned int first = s->first_slow_len; int lext; #pragma unroll 1 @@ -583,7 +592,7 @@ __device__ void decode_symbols(inflate_state_s *s) cur += 4; #else cur += 8; - bitbuf.y = (cur < end) ? *(const uint32_t *)cur : 0; + bitbuf.y = (cur < end) ? 
*(const uint32_t*)cur : 0; cur -= 4; #endif bitpos &= 0x1f; @@ -599,7 +608,7 @@ __device__ void decode_symbols(inflate_state_s *s) len += dext; } else { uint32_t next32r = __brev(next32); - const int16_t *symbols = &s->distsym[s->index_slow_dist]; + const int16_t* symbols = &s->distsym[s->index_slow_dist]; unsigned int first = s->first_slow_dist; #pragma unroll 1 for (len = log2_dist_lut + 1; len <= max_bits; len++) { @@ -636,7 +645,7 @@ __device__ void decode_symbols(inflate_state_s *s) #else cur += 8; if (cur < end) { - bitbuf.y = *(const uint32_t *)cur; + bitbuf.y = *(const uint32_t*)cur; cur -= 4; } else { bitbuf.y = 0; @@ -654,7 +663,7 @@ __device__ void decode_symbols(inflate_state_s *s) } while (batch_len < batch_size - 1); s->x.batch_len[batch] = batch_len; #if ENABLE_PREFETCH - ((volatile inflate_state_s *)s)->cur = cur; + ((volatile inflate_state_s*)s)->cur = cur; #endif if (batch_len != 0) batch = (batch + 1) & (batch_count - 1); } while (sym != 256); @@ -672,13 +681,13 @@ __device__ void decode_symbols(inflate_state_s *s) * @brief Build lookup tables for faster decode * LUT format is symbols*16+length */ -__device__ void init_length_lut(inflate_state_s *s, int t) +__device__ void init_length_lut(inflate_state_s* s, int t) { - int32_t *lut = s->u.lut.lenlut; + int32_t* lut = s->u.lut.lenlut; for (uint32_t bits = t; bits < (1 << log2_len_lut); bits += blockDim.x) { - const int16_t *cnt = s->lencnt; - const int16_t *symbols = s->lensym; + const int16_t* cnt = s->lencnt; + const int16_t* symbols = s->lensym; int sym = -10 << 5; unsigned int first = 0; unsigned int rbits = __brev(bits) >> (32 - log2_len_lut); @@ -704,7 +713,7 @@ __device__ void init_length_lut(inflate_state_s *s, int t) if (!t) { unsigned int first = 0; unsigned int index = 0; - const int16_t *cnt = s->lencnt; + const int16_t* cnt = s->lencnt; for (unsigned int len = 1; len <= log2_len_lut; len++) { unsigned int count = cnt[len]; index += count; @@ -720,13 +729,13 @@ __device__ void init_length_lut(inflate_state_s *s, int t) * @brief Build lookup tables for faster decode of distance symbol * LUT format is symbols*16+length */ -__device__ void init_distance_lut(inflate_state_s *s, int t) +__device__ void init_distance_lut(inflate_state_s* s, int t) { - int32_t *lut = s->u.lut.distlut; + int32_t* lut = s->u.lut.distlut; for (uint32_t bits = t; bits < (1 << log2_dist_lut); bits += blockDim.x) { - const int16_t *cnt = s->distcnt; - const int16_t *symbols = s->distsym; + const int16_t* cnt = s->distcnt; + const int16_t* symbols = s->distsym; int sym = 0; unsigned int first = 0; unsigned int rbits = __brev(bits) >> (32 - log2_dist_lut); @@ -749,7 +758,7 @@ __device__ void init_distance_lut(inflate_state_s *s, int t) if (!t) { unsigned int first = 0; unsigned int index = 0; - const int16_t *cnt = s->distcnt; + const int16_t* cnt = s->distcnt; for (unsigned int len = 1; len <= log2_dist_lut; len++) { unsigned int count = cnt[len]; index += count; @@ -762,15 +771,15 @@ __device__ void init_distance_lut(inflate_state_s *s, int t) } /// @brief WARP1: process symbols and output uncompressed stream -__device__ void process_symbols(inflate_state_s *s, int t) +__device__ void process_symbols(inflate_state_s* s, int t) { - uint8_t *out = s->out; - const uint8_t *outend = s->outend; - const uint8_t *outbase = s->outbase; + uint8_t* out = s->out; + const uint8_t* outend = s->outend; + const uint8_t* outbase = s->outbase; int batch = 0; do { - volatile uint32_t *b = &s->x.u.symqueue[batch * batch_size]; + volatile uint32_t* b = 
&s->x.u.symqueue[batch * batch_size]; int batch_len, pos; int32_t symt; uint32_t lit_mask; @@ -798,7 +807,7 @@ __device__ void process_symbols(inflate_state_s *s, int t) len = max((symbol & 0xffff) - 256, 0); // max should be unnecessary, but just in case dist = symbol >> 16; for (int i = t; i < len; i += 32) { - const uint8_t *src = out + ((i >= dist) ? (i % dist) : i) - dist; + const uint8_t* src = out + ((i >= dist) ? (i % dist) : i) - dist; uint8_t b = (src < outbase) ? 0 : *src; if (out + i < outend) { out[i] = b; } } @@ -838,7 +847,7 @@ __device__ void process_symbols(inflate_state_s *s, int t) * - A stored block can have zero length. This is sometimes used to byte-align * subsets of the compressed data for random access or partial recovery. */ -__device__ int init_stored(inflate_state_s *s) +__device__ int init_stored(inflate_state_s* s) { uint32_t len, nlen; // length of stored block @@ -863,13 +872,13 @@ __device__ int init_stored(inflate_state_s *s) } /// Copy bytes from stored block to destination -__device__ void copy_stored(inflate_state_s *s, int t) +__device__ void copy_stored(inflate_state_s* s, int t) { int len = s->stored_blk_len; - uint8_t *cur = s->cur + (s->bitpos >> 3); - uint8_t *out = s->out; - uint8_t *outend = s->outend; - uint8_t *cur4; + uint8_t* cur = s->cur + (s->bitpos >> 3); + uint8_t* out = s->out; + uint8_t* outend = s->outend; + uint8_t* cur4; int slow_bytes = min(len, (int)((16 - (size_t)out) & 0xf)); int fast_bytes, bitpos; @@ -893,18 +902,18 @@ __device__ void copy_stored(inflate_state_s *s, int t) // Fast copy 16 bytes at a time for (int i = t * 16; i < fast_bytes; i += blockDim.x * 16) { uint4 u; - u.x = *reinterpret_cast(cur4 + i + 0 * 4); - u.y = *reinterpret_cast(cur4 + i + 1 * 4); - u.z = *reinterpret_cast(cur4 + i + 2 * 4); - u.w = *reinterpret_cast(cur4 + i + 3 * 4); + u.x = *reinterpret_cast(cur4 + i + 0 * 4); + u.y = *reinterpret_cast(cur4 + i + 1 * 4); + u.z = *reinterpret_cast(cur4 + i + 2 * 4); + u.w = *reinterpret_cast(cur4 + i + 3 * 4); if (bitpos != 0) { - uint32_t v = (bitpos != 0) ? *reinterpret_cast(cur4 + i + 4 * 4) : 0; + uint32_t v = (bitpos != 0) ? *reinterpret_cast(cur4 + i + 4 * 4) : 0; u.x = __funnelshift_rc(u.x, u.y, bitpos); u.y = __funnelshift_rc(u.y, u.z, bitpos); u.z = __funnelshift_rc(u.z, u.w, bitpos); u.w = __funnelshift_rc(u.w, v, bitpos); } - *reinterpret_cast(out + i) = u; + *reinterpret_cast(out + i) = u; } } cur += fast_bytes; @@ -920,20 +929,20 @@ __device__ void copy_stored(inflate_state_s *s, int t) __syncthreads(); if (t == 0) { // Reset bitstream to end of block - uint8_t *p = cur + len; + uint8_t* p = cur + len; uint32_t prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; s->cur = p; - s->bitbuf.x = (p < s->end) ? *reinterpret_cast(p) : 0; + s->bitbuf.x = (p < s->end) ? *reinterpret_cast(p) : 0; p += 4; - s->bitbuf.y = (p < s->end) ? *reinterpret_cast(p) : 0; + s->bitbuf.y = (p < s->end) ? 
*reinterpret_cast(p) : 0; s->bitpos = prefix_bytes * 8; s->out = out; } } #if ENABLE_PREFETCH -__device__ void init_prefetcher(inflate_state_s *s, int t) +__device__ void init_prefetcher(inflate_state_s* s, int t) { if (t == 0) { s->pref.cur_p = s->cur; @@ -941,17 +950,17 @@ __device__ void init_prefetcher(inflate_state_s *s, int t) } } -__device__ void prefetch_warp(volatile inflate_state_s *s, int t) +__device__ void prefetch_warp(volatile inflate_state_s* s, int t) { - const uint8_t *cur_p = s->pref.cur_p; - const uint8_t *end = s->end; + const uint8_t* cur_p = s->pref.cur_p; + const uint8_t* end = s->end; while (shuffle((t == 0) ? s->pref.run : 0)) { int32_t cur_lo = (int32_t)(size_t)cur_p; int do_pref = - shuffle((t == 0) ? (cur_lo - *(volatile int32_t *)&s->cur < prefetch_size - 32 * 4 - 4) : 0); + shuffle((t == 0) ? (cur_lo - *(volatile int32_t*)&s->cur < prefetch_size - 32 * 4 - 4) : 0); if (do_pref) { - const uint8_t *p = cur_p + 4 * t; - *prefetch_addr32(s->pref, p) = (p < end) ? *reinterpret_cast(p) : 0; + const uint8_t* p = cur_p + 4 * t; + *prefetch_addr32(s->pref, p) = (p < end) ? *reinterpret_cast(p) : 0; cur_p += 4 * 32; __threadfence_block(); __syncwarp(); @@ -968,7 +977,7 @@ __device__ void prefetch_warp(volatile inflate_state_s *s, int t) * @brief Parse GZIP header * See https://tools.ietf.org/html/rfc1952 */ -__device__ int parse_gzip_header(const uint8_t *src, size_t src_size) +__device__ int parse_gzip_header(const uint8_t* src, size_t src_size) { int hdr_len = -1; @@ -1020,16 +1029,16 @@ __device__ int parse_gzip_header(const uint8_t *src, size_t src_size) */ template __global__ void __launch_bounds__(block_size) - inflate_kernel(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, int parse_hdr) + inflate_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, int parse_hdr) { __shared__ __align__(16) inflate_state_s state_g; int t = threadIdx.x; int z = blockIdx.x; - inflate_state_s *state = &state_g; + inflate_state_s* state = &state_g; if (!t) { - uint8_t *p = const_cast(static_cast(inputs[z].srcDevice)); + uint8_t* p = const_cast(static_cast(inputs[z].srcDevice)); size_t src_size = inputs[z].srcSize; uint32_t prefix_bytes; // Parse header if needed @@ -1045,16 +1054,16 @@ __global__ void __launch_bounds__(block_size) } } // Initialize shared state - state->out = const_cast(static_cast(inputs[z].dstDevice)); + state->out = const_cast(static_cast(inputs[z].dstDevice)); state->outbase = state->out; state->outend = state->out + inputs[z].dstSize; state->end = p + src_size; prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; state->cur = p; - state->bitbuf.x = (p < state->end) ? *reinterpret_cast(p) : 0; + state->bitbuf.x = (p < state->end) ? *reinterpret_cast(p) : 0; p += 4; - state->bitbuf.y = (p < state->end) ? *reinterpret_cast(p) : 0; + state->bitbuf.y = (p < state->end) ? 
*reinterpret_cast(p) : 0; state->bitpos = prefix_bytes * 8; } __syncthreads(); @@ -1139,21 +1148,21 @@ __global__ void __launch_bounds__(block_size) * * @param inputs Source and destination information per block */ -__global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_input_s *inputs) +__global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_input_s* inputs) { - __shared__ const uint8_t *volatile src_g; - __shared__ uint8_t *volatile dst_g; + __shared__ const uint8_t* volatile src_g; + __shared__ uint8_t* volatile dst_g; __shared__ uint32_t volatile copy_len_g; uint32_t t = threadIdx.x; uint32_t z = blockIdx.x; - const uint8_t *src; - uint8_t *dst; + const uint8_t* src; + uint8_t* dst; uint32_t len, src_align_bytes, src_align_bits, dst_align_bytes; if (!t) { - src = static_cast(inputs[z].srcDevice); - dst = static_cast(inputs[z].dstDevice); + src = static_cast(inputs[z].srcDevice); + dst = static_cast(inputs[z].dstDevice); len = min((uint32_t)inputs[z].srcSize, (uint32_t)inputs[z].dstSize); src_g = src; dst_g = dst; @@ -1175,12 +1184,12 @@ __global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_inp src_align_bytes = (uint32_t)(3 & reinterpret_cast(src)); src_align_bits = src_align_bytes << 3; while (len >= 32) { - const uint32_t *src32 = reinterpret_cast(src - src_align_bytes); + const uint32_t* src32 = reinterpret_cast(src - src_align_bytes); uint32_t copy_cnt = min(len >> 2, 1024); if (t < copy_cnt) { uint32_t v = src32[t]; if (src_align_bits != 0) { v = __funnelshift_r(v, src32[t + 1], src_align_bits); } - reinterpret_cast(dst)[t] = v; + reinterpret_cast(dst)[t] = v; } src += copy_cnt * 4; dst += copy_cnt * 4; @@ -1189,8 +1198,8 @@ __global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_inp if (t < len) { dst[t] = src[t]; } } -cudaError_t __host__ gpuinflate(gpu_inflate_input_s *inputs, - gpu_inflate_status_s *outputs, +cudaError_t __host__ gpuinflate(gpu_inflate_input_s* inputs, + gpu_inflate_status_s* outputs, int count, int parse_hdr, rmm::cuda_stream_view stream) @@ -1203,7 +1212,7 @@ cudaError_t __host__ gpuinflate(gpu_inflate_input_s *inputs, return cudaSuccess; } -cudaError_t __host__ gpu_copy_uncompressed_blocks(gpu_inflate_input_s *inputs, +cudaError_t __host__ gpu_copy_uncompressed_blocks(gpu_inflate_input_s* inputs, int count, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/comp/gpuinflate.h b/cpp/src/io/comp/gpuinflate.h index 7ca6dd13e9a..a37d282997e 100644 --- a/cpp/src/io/comp/gpuinflate.h +++ b/cpp/src/io/comp/gpuinflate.h @@ -26,9 +26,9 @@ namespace io { * @brief Input parameters for the decompression interface */ struct gpu_inflate_input_s { - const void *srcDevice; + const void* srcDevice; uint64_t srcSize; - void *dstDevice; + void* dstDevice; uint64_t dstSize; }; @@ -53,8 +53,8 @@ struct gpu_inflate_status_s { * @param[in] parse_hdr Whether or not to parse GZIP header, default false * @param[in] stream CUDA stream to use, default 0 */ -cudaError_t gpuinflate(gpu_inflate_input_s *inputs, - gpu_inflate_status_s *outputs, +cudaError_t gpuinflate(gpu_inflate_input_s* inputs, + gpu_inflate_status_s* outputs, int count = 1, int parse_hdr = 0, rmm::cuda_stream_view stream = rmm::cuda_stream_default); @@ -66,7 +66,7 @@ cudaError_t gpuinflate(gpu_inflate_input_s *inputs, * @param[in] count Number of input structures, default 1 * @param[in] stream CUDA stream to use, default 0 */ -cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s *inputs, +cudaError_t 
gpu_copy_uncompressed_blocks(gpu_inflate_input_s* inputs, int count = 1, rmm::cuda_stream_view stream = rmm::cuda_stream_default); @@ -81,8 +81,8 @@ cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s *inputs, * @param[in] count Number of input/output structures, default 1 * @param[in] stream CUDA stream to use, default 0 */ -cudaError_t gpu_unsnap(gpu_inflate_input_s *inputs, - gpu_inflate_status_s *outputs, +cudaError_t gpu_unsnap(gpu_inflate_input_s* inputs, + gpu_inflate_status_s* outputs, int count = 1, rmm::cuda_stream_view stream = rmm::cuda_stream_default); @@ -108,9 +108,9 @@ size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0); * @param[in] count Number of input/output structures, default 1 * @param[in] stream CUDA stream to use, default 0 */ -cudaError_t gpu_debrotli(gpu_inflate_input_s *inputs, - gpu_inflate_status_s *outputs, - void *scratch, +cudaError_t gpu_debrotli(gpu_inflate_input_s* inputs, + gpu_inflate_status_s* outputs, + void* scratch, size_t scratch_size, int count = 1, rmm::cuda_stream_view stream = rmm::cuda_stream_default); @@ -126,8 +126,8 @@ cudaError_t gpu_debrotli(gpu_inflate_input_s *inputs, * @param[in] count Number of input/output structures, default 1 * @param[in] stream CUDA stream to use, default 0 */ -cudaError_t gpu_snap(gpu_inflate_input_s *inputs, - gpu_inflate_status_s *outputs, +cudaError_t gpu_snap(gpu_inflate_input_s* inputs, + gpu_inflate_status_s* outputs, int count = 1, rmm::cuda_stream_view stream = rmm::cuda_stream_default); diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 999d02e3a50..a3d7bd048e8 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -31,11 +31,11 @@ constexpr int hash_bits = 12; * @brief snappy compressor state */ struct snap_state_s { - const uint8_t *src; ///< Ptr to uncompressed data + const uint8_t* src; ///< Ptr to uncompressed data uint32_t src_len; ///< Uncompressed data length - uint8_t *dst_base; ///< Base ptr to output compressed data - uint8_t *dst; ///< Current ptr to uncompressed data - uint8_t *end; ///< End of uncompressed data buffer + uint8_t* dst_base; ///< Base ptr to output compressed data + uint8_t* dst; ///< Current ptr to uncompressed data + uint8_t* end; ///< End of uncompressed data buffer volatile uint32_t literal_length; ///< Number of literal bytes volatile uint32_t copy_length; ///< Number of copy bytes volatile uint32_t copy_distance; ///< Distance for copy bytes @@ -53,10 +53,10 @@ static inline __device__ uint32_t snap_hash(uint32_t v) /** * @brief Fetches four consecutive bytes */ -static inline __device__ uint32_t fetch4(const uint8_t *src) +static inline __device__ uint32_t fetch4(const uint8_t* src) { uint32_t src_align = 3 & reinterpret_cast(src); - const uint32_t *src32 = reinterpret_cast(src - src_align); + const uint32_t* src32 = reinterpret_cast(src - src_align); uint32_t v = src32[0]; return (src_align) ? 
diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu
index 999d02e3a50..a3d7bd048e8 100644
--- a/cpp/src/io/comp/snap.cu
+++ b/cpp/src/io/comp/snap.cu
@@ -31,11 +31,11 @@ constexpr int hash_bits = 12;
 * @brief snappy compressor state
 */
 struct snap_state_s {
-  const uint8_t *src;                 ///< Ptr to uncompressed data
+  const uint8_t* src;                 ///< Ptr to uncompressed data
   uint32_t src_len;                   ///< Uncompressed data length
-  uint8_t *dst_base;                  ///< Base ptr to output compressed data
-  uint8_t *dst;                       ///< Current ptr to uncompressed data
-  uint8_t *end;                       ///< End of uncompressed data buffer
+  uint8_t* dst_base;                  ///< Base ptr to output compressed data
+  uint8_t* dst;                       ///< Current ptr to uncompressed data
+  uint8_t* end;                       ///< End of uncompressed data buffer
   volatile uint32_t literal_length;   ///< Number of literal bytes
   volatile uint32_t copy_length;      ///< Number of copy bytes
   volatile uint32_t copy_distance;    ///< Distance for copy bytes
@@ -53,10 +53,10 @@ static inline __device__ uint32_t snap_hash(uint32_t v)
 /**
 * @brief Fetches four consecutive bytes
 */
-static inline __device__ uint32_t fetch4(const uint8_t *src)
+static inline __device__ uint32_t fetch4(const uint8_t* src)
 {
   uint32_t src_align = 3 & reinterpret_cast<uintptr_t>(src);
-  const uint32_t *src32 = reinterpret_cast<const uint32_t *>(src - src_align);
+  const uint32_t* src32 = reinterpret_cast<const uint32_t*>(src - src_align);
   uint32_t v = src32[0];
   return (src_align) ? __funnelshift_r(v, src32[1], src_align * 8) : v;
 }
@@ -72,8 +72,8 @@ static inline __device__ uint32_t fetch4(const uint8_t *src)
 *
 * @return Updated pointer to compressed byte stream
 */
-static __device__ uint8_t *StoreLiterals(
-  uint8_t *dst, uint8_t *end, const uint8_t *src, uint32_t len_minus1, uint32_t t)
+static __device__ uint8_t* StoreLiterals(
+  uint8_t* dst, uint8_t* end, const uint8_t* src, uint32_t len_minus1, uint32_t t)
 {
   if (len_minus1 < 60) {
     if (!t && dst < end) dst[0] = (len_minus1 << 2);
@@ -125,8 +125,8 @@ static __device__ uint8_t *StoreLiterals(
 *
 * @return Updated pointer to compressed byte stream
 */
-static __device__ uint8_t *StoreCopy(uint8_t *dst,
-                                     uint8_t *end,
+static __device__ uint8_t* StoreCopy(uint8_t* dst,
+                                     uint8_t* end,
                                      uint32_t copy_len,
                                      uint32_t distance)
 {
@@ -178,8 +178,8 @@ static inline __device__ uint32_t HashMatchAny(uint32_t v, uint32_t t)
 *
 * @return Number of bytes before first match (literal length)
 */
-static __device__ uint32_t FindFourByteMatch(snap_state_s *s,
-                                             const uint8_t *src,
+static __device__ uint32_t FindFourByteMatch(snap_state_s* s,
+                                             const uint8_t* src,
                                              uint32_t pos0,
                                              uint32_t t)
 {
@@ -233,8 +233,8 @@ static __device__ uint32_t FindFourByteMatch(snap_state_s *s,
 }

 /// @brief Returns the number of matching bytes for two byte sequences up to 63 bytes
-static __device__ uint32_t Match60(const uint8_t *src1,
-                                   const uint8_t *src2,
+static __device__ uint32_t Match60(const uint8_t* src1,
+                                   const uint8_t* src2,
                                    uint32_t len,
                                    uint32_t t)
 {
@@ -258,21 +258,21 @@ static __device__ uint32_t Match60(const uint8_t *src1,
 * @param[in] count Number of blocks to compress
 */
 extern "C" __global__ void __launch_bounds__(128)
-  snap_kernel(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, int count)
+  snap_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, int count)
 {
   __shared__ __align__(16) snap_state_s state_g;

-  snap_state_s *const s = &state_g;
+  snap_state_s* const s = &state_g;
   uint32_t t = threadIdx.x;
   uint32_t pos;
-  const uint8_t *src;
+  const uint8_t* src;

   if (!t) {
-    const uint8_t *src = static_cast<const uint8_t *>(inputs[blockIdx.x].srcDevice);
+    const uint8_t* src = static_cast<const uint8_t*>(inputs[blockIdx.x].srcDevice);
     uint32_t src_len   = static_cast<uint32_t>(inputs[blockIdx.x].srcSize);
-    uint8_t *dst       = static_cast<uint8_t *>(inputs[blockIdx.x].dstDevice);
+    uint8_t* dst       = static_cast<uint8_t*>(inputs[blockIdx.x].dstDevice);
     uint32_t dst_len   = static_cast<uint32_t>(inputs[blockIdx.x].dstSize);
-    uint8_t *end       = dst + dst_len;
+    uint8_t* end       = dst + dst_len;
     s->src             = src;
     s->src_len         = src_len;
     s->dst_base        = dst;
@@ -289,7 +289,7 @@ extern "C" __global__ void __launch_bounds__(128)
     s->copy_distance = 0;
   }
   for (uint32_t i = t; i < sizeof(s->hash_map) / sizeof(uint32_t); i += 128) {
-    *reinterpret_cast<volatile uint32_t *>(&s->hash_map[i * 2]) = 0;
+    *reinterpret_cast<volatile uint32_t*>(&s->hash_map[i * 2]) = 0;
   }
   __syncthreads();
   src = s->src;
@@ -301,8 +301,8 @@ extern "C" __global__ void __launch_bounds__(128)
     __syncthreads();
     if (t < 32) {
       // WARP0: Encode literals and copies
-      uint8_t *dst = s->dst;
-      uint8_t *end = s->end;
+      uint8_t* dst = s->dst;
+      uint8_t* end = s->end;
       if (literal_len > 0) {
         dst = StoreLiterals(dst, end, src + pos, literal_len - 1, t);
         pos += literal_len;
@@ -341,8 +341,8 @@ extern "C" __global__ void __launch_bounds__(128)
   }
 }

-cudaError_t __host__ gpu_snap(gpu_inflate_input_s *inputs,
-                              gpu_inflate_status_s *outputs,
+cudaError_t __host__ gpu_snap(gpu_inflate_input_s* inputs,
+                              gpu_inflate_status_s* outputs,
                               int count,
                               rmm::cuda_stream_view stream)
 {
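Editor's note (not part of the diff): StoreLiterals and StoreCopy above emit standard Snappy element tags, in which the two low bits of the tag byte select the element type. A minimal sketch of the same encoding rules per the public Snappy format; the helper names are ours.

```cpp
#include <cstdint>
#include <cstring>

// Literal of length `len` (1..60): tag 0b00, (len - 1) in the upper 6 bits.
// This matches the `dst[0] = (len_minus1 << 2)` path in StoreLiterals.
uint8_t* emit_short_literal(uint8_t* dst, const uint8_t* src, uint32_t len)
{
  *dst++ = static_cast<uint8_t>((len - 1) << 2);
  std::memcpy(dst, src, len);
  return dst + len;
}

// Copy with a 2-byte offset: tag 0b10, (len - 1) in the upper 6 bits,
// then the 16-bit distance little-endian (len 1..64, distance < 65536).
uint8_t* emit_copy2(uint8_t* dst, uint32_t len, uint32_t distance)
{
  *dst++ = static_cast<uint8_t>(((len - 1) << 2) | 2);
  *dst++ = static_cast<uint8_t>(distance);
  *dst++ = static_cast<uint8_t>(distance >> 8);
  return dst;
}
```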
diff --git a/cpp/src/io/comp/unbz2.h b/cpp/src/io/comp/unbz2.h
index 8f3a6eace5a..5731db63757 100644
--- a/cpp/src/io/comp/unbz2.h
+++ b/cpp/src/io/comp/unbz2.h
@@ -82,25 +82,25 @@ namespace io {
 // If BZ_OUTBUFF_FULL is returned and block_start is non-NULL, dstlen will be updated to point to
 // the end of the last valid block, and block_start will contain the offset in bits of the beginning
 // of the block, so it can be passed in to resume decoding later on.
-#define BZ_OK 0
-#define BZ_RUN_OK 1
-#define BZ_FLUSH_OK 2
-#define BZ_FINISH_OK 3
-#define BZ_STREAM_END 4
-#define BZ_SEQUENCE_ERROR (-1)
-#define BZ_PARAM_ERROR (-2)
-#define BZ_MEM_ERROR (-3)
-#define BZ_DATA_ERROR (-4)
+#define BZ_OK               0
+#define BZ_RUN_OK           1
+#define BZ_FLUSH_OK         2
+#define BZ_FINISH_OK        3
+#define BZ_STREAM_END       4
+#define BZ_SEQUENCE_ERROR   (-1)
+#define BZ_PARAM_ERROR      (-2)
+#define BZ_MEM_ERROR        (-3)
+#define BZ_DATA_ERROR       (-4)
 #define BZ_DATA_ERROR_MAGIC (-5)
-#define BZ_IO_ERROR (-6)
-#define BZ_UNEXPECTED_EOF (-7)
-#define BZ_OUTBUFF_FULL (-8)
+#define BZ_IO_ERROR         (-6)
+#define BZ_UNEXPECTED_EOF   (-7)
+#define BZ_OUTBUFF_FULL     (-8)

-int32_t cpu_bz2_uncompress(const uint8_t *input,
+int32_t cpu_bz2_uncompress(const uint8_t* input,
                            size_t inlen,
-                           uint8_t *dst,
-                           size_t *dstlen,
-                           uint64_t *block_start = nullptr);
+                           uint8_t* dst,
+                           size_t* dstlen,
+                           uint64_t* block_start = nullptr);

 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index 44581bbc184..2cb99d897fe 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -106,32 +106,32 @@ struct bz2_file_header_s {
 #pragma pack(pop)

 struct gz_archive_s {
-  const gz_file_header_s *fhdr;
+  const gz_file_header_s* fhdr;
   uint16_t hcrc16;  // header crc16 if present
   uint16_t xlen;
-  const uint8_t *fxtra;      // xlen bytes (optional)
-  const uint8_t *fname;      // zero-terminated original filename if present
-  const uint8_t *fcomment;   // zero-terminated comment if present
-  const uint8_t *comp_data;  // compressed data
+  const uint8_t* fxtra;      // xlen bytes (optional)
+  const uint8_t* fname;      // zero-terminated original filename if present
+  const uint8_t* fcomment;   // zero-terminated comment if present
+  const uint8_t* comp_data;  // compressed data
   size_t comp_len;           // Compressed data length
   uint32_t crc32;            // CRC32 of uncompressed data
   uint32_t isize;            // Input size modulo 2^32
 };

 struct zip_archive_s {
-  const zip_eocd_s *eocd;    // end of central directory
-  const zip64_eocdl *eocdl;  // end of central dir locator (optional)
-  const zip_cdfh_s *cdfh;    // start of central directory file headers
+  const zip_eocd_s* eocd;    // end of central directory
+  const zip64_eocdl* eocdl;  // end of central dir locator (optional)
+  const zip_cdfh_s* cdfh;    // start of central directory file headers
 };

-bool ParseGZArchive(gz_archive_s *dst, const uint8_t *raw, size_t len)
+bool ParseGZArchive(gz_archive_s* dst, const uint8_t* raw, size_t len)
 {
-  const gz_file_header_s *fhdr;
+  const gz_file_header_s* fhdr;

   if (!dst) return false;
   memset(dst, 0, sizeof(gz_archive_s));
   if (len < sizeof(gz_file_header_s) + 8) return false;
-  fhdr = reinterpret_cast<const gz_file_header_s *>(raw);
+  fhdr = reinterpret_cast<const gz_file_header_s*>(raw);
   if (fhdr->id1 != 0x1f || fhdr->id2 != 0x8b) return false;
   dst->fhdr = fhdr;
   raw += sizeof(gz_file_header_s);
@@ -188,7 +188,7 @@ bool ParseGZArchive(gz_archive_s *dst, const uint8_t *raw, size_t len)
   return (fhdr->comp_mthd == 8 && len > 0);
 }

-bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len)
+bool OpenZipArchive(zip_archive_s* dst, const uint8_t* raw, size_t len)
 {
   memset(dst, 0, sizeof(zip_archive_s));
   // Find the end of central directory
@@ -196,17 +196,17 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len)
   for (ptrdiff_t i = len - sizeof(zip_eocd_s) - 2;
        i + sizeof(zip_eocd_s) + 2 + 0xffff >= len && i >= 0;
        i--) {
-    const zip_eocd_s *eocd = reinterpret_cast<const zip_eocd_s *>(raw + i);
+    const zip_eocd_s* eocd = reinterpret_cast<const zip_eocd_s*>(raw + i);
     if (eocd->sig == 0x06054b50 &&
         eocd->disk_id == eocd->start_disk  // multi-file archives not supported
         && eocd->num_entries == eocd->total_entries &&
         eocd->cdir_size >= sizeof(zip_cdfh_s) * eocd->num_entries && eocd->cdir_offset < len &&
-        i + *reinterpret_cast<const uint16_t *>(eocd + 1) <= static_cast<ptrdiff_t>(len)) {
-      const zip_cdfh_s *cdfh = reinterpret_cast<const zip_cdfh_s *>(raw + eocd->cdir_offset);
+        i + *reinterpret_cast<const uint16_t*>(eocd + 1) <= static_cast<ptrdiff_t>(len)) {
+      const zip_cdfh_s* cdfh = reinterpret_cast<const zip_cdfh_s*>(raw + eocd->cdir_offset);
       dst->eocd = eocd;
       if (i >= static_cast<ptrdiff_t>(sizeof(zip64_eocdl))) {
-        const zip64_eocdl *eocdl =
-          reinterpret_cast<const zip64_eocdl *>(raw + i - sizeof(zip64_eocdl));
+        const zip64_eocdl* eocdl =
+          reinterpret_cast<const zip64_eocdl*>(raw + i - sizeof(zip64_eocdl));
         if (eocdl->sig == 0x07064b50) { dst->eocdl = eocdl; }
       }
       // Start of central directory
@@ -217,13 +217,13 @@ bool OpenZipArchive(zip_archive_s *dst, const uint8_t *raw, size_t len)
   return (dst->eocd && dst->cdfh);
 }

-int cpu_inflate(uint8_t *uncomp_data, size_t *destLen, const uint8_t *comp_data, size_t comp_len)
+int cpu_inflate(uint8_t* uncomp_data, size_t* destLen, const uint8_t* comp_data, size_t comp_len)
 {
   int zerr;
   z_stream strm;

   memset(&strm, 0, sizeof(strm));
-  strm.next_in   = const_cast<Bytef *>(reinterpret_cast<const Bytef *>(comp_data));
+  strm.next_in   = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(comp_data));
   strm.avail_in  = comp_len;
   strm.total_in  = 0;
   strm.next_out  = uncomp_data;
@@ -250,16 +250,16 @@ int cpu_inflate(uint8_t *uncomp_data, size_t *destLen, const uint8_t *comp_data,
 * @param comp_data[in] Raw compressed data
 * @param comp_len[in] Compressed data size
 */
-int cpu_inflate_vector(std::vector<char> &dst, const uint8_t *comp_data, size_t comp_len)
+int cpu_inflate_vector(std::vector<char>& dst, const uint8_t* comp_data, size_t comp_len)
 {
   int zerr;
   z_stream strm;

   memset(&strm, 0, sizeof(strm));
-  strm.next_in   = const_cast<Bytef *>(reinterpret_cast<const Bytef *>(comp_data));
+  strm.next_in   = const_cast<Bytef*>(reinterpret_cast<const Bytef*>(comp_data));
   strm.avail_in  = comp_len;
   strm.total_in  = 0;
-  strm.next_out  = reinterpret_cast<Bytef *>(dst.data());
+  strm.next_out  = reinterpret_cast<Bytef*>(dst.data());
   strm.avail_out = dst.size();
   strm.total_out = 0;
   zerr           = inflateInit2(&strm, -15);  // -15 for raw data without GZIP headers
@@ -271,7 +271,7 @@ int cpu_inflate_vector(std::vector<char> &dst, const uint8_t *comp_data, size_t
     if (strm.avail_out == 0) {
       dst.resize(strm.total_out + (1 << 30));
       strm.avail_out = dst.size() - strm.total_out;
-      strm.next_out  = reinterpret_cast<Bytef *>(dst.data()) + strm.total_out;
+      strm.next_out  = reinterpret_cast<Bytef*>(dst.data()) + strm.total_out;
     }
     zerr = inflate(&strm, Z_SYNC_FLUSH);
   } while ((zerr == Z_BUF_ERROR || zerr == Z_OK) && strm.avail_out == 0 &&
@@ -293,10 +293,10 @@ int cpu_inflate_vector(std::vector<char> &dst, const uint8_t *comp_data, size_t
 *
 * @return Vector containing the uncompressed output
 */
-std::vector<char> io_uncompress_single_h2d(const void *src, size_t src_size, int stream_type)
+std::vector<char> io_uncompress_single_h2d(const void* src, size_t src_size, int stream_type)
 {
-  const uint8_t *raw       = static_cast<const uint8_t *>(src);
-  const uint8_t *comp_data = nullptr;
+  const uint8_t* raw       = static_cast<const uint8_t*>(src);
+  const uint8_t* comp_data = nullptr;
   size_t comp_len          = 0;
   size_t uncomp_len        = 0;
@@ -320,8 +320,8 @@ std::vector<char> io_uncompress_single_h2d(const void *src, size_t src_size, int
       if (OpenZipArchive(&za, raw, src_size)) {
         size_t cdfh_ofs = 0;
         for (int i = 0; i < za.eocd->num_entries; i++) {
-          const zip_cdfh_s *cdfh = reinterpret_cast<const zip_cdfh_s *>(
-            reinterpret_cast<const uint8_t *>(za.cdfh) + cdfh_ofs);
+          const zip_cdfh_s* cdfh = reinterpret_cast<const zip_cdfh_s*>(
+            reinterpret_cast<const uint8_t*>(za.cdfh) + cdfh_ofs);
           int cdfh_len = sizeof(zip_cdfh_s) + cdfh->fname_len + cdfh->extra_len + cdfh->comment_len;
           if (cdfh_ofs + cdfh_len > za.eocd->cdir_size || cdfh->sig != 0x02014b50) {
             // Bad cdir
@@ -330,7 +330,7 @@ std::vector<char> io_uncompress_single_h2d(const void *src, size_t src_size, int
           // For now, only accept with non-zero file sizes and DEFLATE
           if (cdfh->comp_method == 8 && cdfh->comp_size > 0 && cdfh->uncomp_size > 0) {
             size_t lfh_ofs = cdfh->hdr_ofs;
-            const zip_lfh_s *lfh = reinterpret_cast<const zip_lfh_s *>(raw + lfh_ofs);
+            const zip_lfh_s* lfh = reinterpret_cast<const zip_lfh_s*>(raw + lfh_ofs);
             if (lfh_ofs + sizeof(zip_lfh_s) <= src_size && lfh->sig == 0x04034b50 &&
                 lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len <= src_size) {
               if (lfh->comp_method == 8 && lfh->comp_size > 0 && lfh->uncomp_size > 0) {
@@ -354,7 +354,7 @@ std::vector<char> io_uncompress_single_h2d(const void *src, size_t src_size, int
       if (stream_type != IO_UNCOMP_STREAM_TYPE_INFER) break;  // Fall through for INFER
     case IO_UNCOMP_STREAM_TYPE_BZIP2:
       if (src_size > 4) {
-        const bz2_file_header_s *fhdr = reinterpret_cast<const bz2_file_header_s *>(raw);
+        const bz2_file_header_s* fhdr = reinterpret_cast<const bz2_file_header_s*>(raw);
         // Check for BZIP2 file signature "BZh1" to "BZh9"
         if (fhdr->sig[0] == 'B' && fhdr->sig[1] == 'Z' && fhdr->sig[2] == 'h' &&
             fhdr->blksz >= '1' && fhdr->blksz <= '9') {
@@ -392,7 +392,7 @@ std::vector<char> io_uncompress_single_h2d(const void *src, size_t src_size, int
     do {
       size_t dst_len = uncomp_len - dst_ofs;
       bz_err         = cpu_bz2_uncompress(
-        comp_data, comp_len, reinterpret_cast<uint8_t *>(dst.data()) + dst_ofs, &dst_len, &src_ofs);
+        comp_data, comp_len, reinterpret_cast<uint8_t*>(dst.data()) + dst_ofs, &dst_len, &src_ofs);
       if (bz_err == BZ_OUTBUFF_FULL) {
         // TBD: We could infer the compression ratio based on produced/consumed byte counts
         // in order to minimize realloc events and over-allocation
@@ -422,7 +422,7 @@ std::vector<char> io_uncompress_single_h2d(const void *src, size_t src_size, int
 * @return Vector containing the output uncompressed data
 */
 std::vector<char> get_uncompressed_data(host_span<char const> const data,
-                                        std::string const &compression)
+                                        std::string const& compression)
 {
   int comp_type = IO_UNCOMP_STREAM_TYPE_INFER;
   if (compression == "gzip")
@@ -443,9 +443,9 @@ std::vector<char> get_uncompressed_data(host_span<char const> const data,
 class HostDecompressor_ZLIB : public HostDecompressor {
  public:
  HostDecompressor_ZLIB(bool gz_hdr_) : gz_hdr(gz_hdr_) {}
-  size_t Decompress(uint8_t *dstBytes,
+  size_t Decompress(uint8_t* dstBytes,
                     size_t dstLen,
-                    const uint8_t *srcBytes,
+                    const uint8_t* srcBytes,
                     size_t srcLen) override
   {
     if (gz_hdr) {
@@ -471,14 +471,14 @@ class HostDecompressor_ZLIB : public HostDecompressor {
 class HostDecompressor_SNAPPY : public HostDecompressor {
  public:
  HostDecompressor_SNAPPY() {}
-  size_t Decompress(uint8_t *dstBytes,
+  size_t Decompress(uint8_t* dstBytes,
                     size_t dstLen,
-                    const uint8_t *srcBytes,
+                    const uint8_t* srcBytes,
                     size_t srcLen) override
   {
     uint32_t uncompressed_size, bytes_left, dst_pos;
-    const uint8_t *cur = srcBytes;
-    const uint8_t *end = srcBytes + srcLen;
+    const uint8_t* cur = srcBytes;
+    const uint8_t* end = srcBytes + srcLen;

     if (!dstBytes || srcLen < 1) { return 0; }
     // Read uncompressed length (varint)
@@ -510,12 +510,12 @@ class HostDecompressor_SNAPPY : public HostDecompressor {
         if (blen & 2) {
           // xxxxxx1x: copy with 6-bit length, 2-byte or 4-byte offset
           if (cur + 2 > end) break;
-          offset = *reinterpret_cast<const uint16_t *>(cur);
+          offset = *reinterpret_cast<const uint16_t*>(cur);
           cur += 2;
           if (blen & 1)  // 4-byte offset
           {
             if (cur + 2 > end) break;
-            offset |= (*reinterpret_cast<const uint16_t *>(cur)) << 16;
+            offset |= (*reinterpret_cast<const uint16_t*>(cur)) << 16;
             cur += 2;
           }
           blen = (blen >> 2) + 1;
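Editor's note (not part of the diff): the "Read uncompressed length (varint)" step above parses a little-endian base-128 varint, which is how a Snappy stream encodes its uncompressed size. A sketch under that assumption; the helper name is ours.

```cpp
#include <cstdint>

// Returns the number of bytes consumed, or 0 on malformed input.
int read_uncompressed_length(const uint8_t* cur, const uint8_t* end, uint32_t* out)
{
  uint32_t value = 0;
  int shift      = 0;
  for (const uint8_t* p = cur; p < end && shift < 35; ++p, shift += 7) {
    value |= static_cast<uint32_t>(*p & 0x7f) << shift;
    if ((*p & 0x80) == 0) {  // high bit clear: last varint byte
      *out = value;
      return static_cast<int>(p - cur) + 1;
    }
  }
  return 0;  // ran off the end, or varint longer than 5 bytes
}
```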
diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu
index f9d491b3cc8..5fe01735dac 100644
--- a/cpp/src/io/comp/unsnap.cu
+++ b/cpp/src/io/comp/unsnap.cu
@@ -64,8 +64,8 @@ struct unsnap_queue_s {
 * @brief snappy decompression state
 */
 struct unsnap_state_s {
-  const uint8_t *base;         ///< base ptr of compressed stream
-  const uint8_t *end;          ///< end of compressed stream
+  const uint8_t* base;         ///< base ptr of compressed stream
+  const uint8_t* end;          ///< end of compressed stream
   uint32_t uncompressed_size;  ///< uncompressed stream size
   uint32_t bytes_left;         ///< bytes to uncompressed remaining
   int32_t error;               ///< current error status
@@ -74,7 +74,7 @@ struct unsnap_state_s {
   gpu_inflate_input_s in;      ///< input parameters for current block
 };

-inline __device__ volatile uint8_t &byte_access(unsnap_state_s *s, uint32_t pos)
+inline __device__ volatile uint8_t& byte_access(unsnap_state_s* s, uint32_t pos)
 {
   return s->q.buf[pos & (prefetch_size - 1)];
 }
@@ -85,9 +85,9 @@ inline __device__ volatile uint8_t &byte_access(unsnap_state_s *s, uint32_t pos)
 * @param s decompression state
 * @param t warp lane id
 */
-__device__ void snappy_prefetch_bytestream(unsnap_state_s *s, int t)
+__device__ void snappy_prefetch_bytestream(unsnap_state_s* s, int t)
 {
-  const uint8_t *base  = s->base;
+  const uint8_t* base  = s->base;
   uint32_t end         = (uint32_t)(s->end - base);
   uint32_t align_bytes = (uint32_t)(0x20 - (0x1f & reinterpret_cast<uintptr_t>(base)));
   int32_t pos          = min(align_bytes, end);
@@ -275,7 +275,7 @@ inline __device__ uint32_t get_len5_mask(uint32_t v0, uint32_t v1)
 * @param s decompression state
 * @param t warp lane id
 */
-__device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
+__device__ void snappy_decode_symbols(unsnap_state_s* s, uint32_t t)
 {
   uint32_t cur = 0;
   uint32_t end = static_cast<uint32_t>(s->end - s->base);
@@ -285,13 +285,15 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
   for (;;) {
     int32_t batch_len;
-    volatile unsnap_batch_s *b;
+    volatile unsnap_batch_s* b;

     // Wait for prefetcher
     if (t == 0) {
       s->q.prefetch_rdpos = cur;
 #pragma unroll(1)  // We don't want unrolling here
-      while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) { busy_wait(10); }
+      while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) {
+        busy_wait(10);
+      }
       b = &s->q.batch[batch * batch_size];
     }
     // Process small symbols in parallel: for data that does not get good compression,
@@ -315,17 +317,17 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
       is_long_sym    = ((b0 & ~4) != 0) && (((b0 + 1) & 2) == 0);
       short_sym_mask = ballot(is_long_sym);
       batch_len      = 0;
-      b = reinterpret_cast<volatile unsnap_batch_s *>(shuffle(reinterpret_cast<uintptr_t>(b)));
+      b = reinterpret_cast<volatile unsnap_batch_s*>(shuffle(reinterpret_cast<uintptr_t>(b)));
       if (!(short_sym_mask & 1)) {
         batch_len = shuffle((t == 0) ? (short_sym_mask) ? __ffs(short_sym_mask) - 1 : 32 : 0);
         if (batch_len != 0) {
           uint32_t blen = 0;
           int32_t ofs   = 0;
           if (t < batch_len) {
-            blen = (b0 & 1) ? ((b0 >> 2) & 7) + 4 : ((b0 >> 2) + 1);
-            ofs  = (b0 & 1) ? ((b0 & 0xe0) << 3) | byte_access(s, cur_t + 1)
-                           : (b0 & 2) ? byte_access(s, cur_t + 1) | (byte_access(s, cur_t + 2) << 8)
-                                      : -(int32_t)(cur_t + 1);
+            blen = (b0 & 1) ? ((b0 >> 2) & 7) + 4 : ((b0 >> 2) + 1);
+            ofs  = (b0 & 1)   ? ((b0 & 0xe0) << 3) | byte_access(s, cur_t + 1)
+                   : (b0 & 2) ? byte_access(s, cur_t + 1) | (byte_access(s, cur_t + 2) << 8)
+                              : -(int32_t)(cur_t + 1);
             b[t].len    = blen;
             b[t].offset = ofs;
             ofs += blen;  // for correct out-of-range detection below
@@ -368,11 +370,10 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
           uint32_t blen = 0;
           int32_t ofs   = 0;
           if (t < batch_add) {
-            blen = (b0 & 1) ? ((b0 >> 2) & 7) + 4 : ((b0 >> 2) + 1);
-            ofs  = (b0 & 1)
-                    ? ((b0 & 0xe0) << 3) | byte_access(s, cur_t + 1)
-                    : (b0 & 2) ? byte_access(s, cur_t + 1) | (byte_access(s, cur_t + 2) << 8)
-                               : -(int32_t)(cur_t + 1);
+            blen = (b0 & 1) ? ((b0 >> 2) & 7) + 4 : ((b0 >> 2) + 1);
+            ofs  = (b0 & 1)   ? ((b0 & 0xe0) << 3) | byte_access(s, cur_t + 1)
+                   : (b0 & 2) ? byte_access(s, cur_t + 1) | (byte_access(s, cur_t + 2) << 8)
+                              : -(int32_t)(cur_t + 1);
             b[batch_len + t].len    = blen;
             b[batch_len + t].offset = ofs;
             ofs += blen;  // for correct out-of-range detection below
@@ -451,7 +452,9 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
         // Wait for prefetcher
         s->q.prefetch_rdpos = cur;
 #pragma unroll(1)  // We don't want unrolling here
-        while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) { busy_wait(10); }
+        while (s->q.prefetch_wrpos < min(cur + 5 * batch_size, end)) {
+          busy_wait(10);
+        }
         dst_pos += blen;
         if (bytes_left < blen) break;
         bytes_left -= blen;
@@ -467,7 +470,9 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
     }
     batch_len = shuffle(batch_len);
     if (t == 0) {
-      while (s->q.batch_len[batch] != 0) { busy_wait(20); }
+      while (s->q.batch_len[batch] != 0) {
+        busy_wait(20);
+      }
     }
     if (batch_len != batch_size) { break; }
   }
@@ -489,18 +494,20 @@ __device__ void snappy_decode_symbols(unsnap_state_s *s, uint32_t t)
 *would result in out-of-bounds accesses)
 */
 template <typename Storage>
-__device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_storage)
+__device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_storage)
 {
-  const uint8_t *literal_base = s->base;
-  uint8_t *out                = static_cast<uint8_t *>(s->in.dstDevice);
+  const uint8_t* literal_base = s->base;
+  uint8_t* out                = static_cast<uint8_t*>(s->in.dstDevice);
   int batch                   = 0;

   do {
-    volatile unsnap_batch_s *b = &s->q.batch[batch * batch_size];
+    volatile unsnap_batch_s* b = &s->q.batch[batch * batch_size];
     int32_t batch_len, blen_t, dist_t;

     if (t == 0) {
-      while ((batch_len = s->q.batch_len[batch]) == 0) { busy_wait(20); }
+      while ((batch_len = s->q.batch_len[batch]) == 0) {
+        busy_wait(20);
+      }
     } else {
       batch_len = 0;
     }
@@ -529,7 +536,7 @@ __device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_s
         uint32_t tr  = t - shuffle(bofs - blen_t, it);
         int32_t dist = shuffle(dist_t, it);
         if (it < n) {
-          const uint8_t *src = (dist > 0) ? (out + t - dist) : (literal_base + tr - dist);
+          const uint8_t* src = (dist > 0) ? (out + t - dist) : (literal_base + tr - dist);
           out[t] = *src;
         }
         out += shuffle(bofs, n - 1);
@@ -556,7 +563,7 @@ __device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_s
         }
         blen += blen2;
         if (t < blen) {
-          const uint8_t *src = (dist > 0) ? (out - d) : (literal_base - d);
+          const uint8_t* src = (dist > 0) ? (out - d) : (literal_base - d);
           out[t] = src[t];
         }
         out += blen;
@@ -569,12 +576,12 @@ __device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_s
       uint8_t b0, b1;
       if (t < blen) {
         uint32_t pos = t;
-        const uint8_t *src = out + ((pos >= dist) ? (pos % dist) : pos) - dist;
+        const uint8_t* src = out + ((pos >= dist) ? (pos % dist) : pos) - dist;
         b0 = *src;
       }
       if (32 + t < blen) {
         uint32_t pos = 32 + t;
-        const uint8_t *src = out + ((pos >= dist) ? (pos % dist) : pos) - dist;
+        const uint8_t* src = out + ((pos >= dist) ? (pos % dist) : pos) - dist;
         b1 = *src;
       }
       if (t < blen) { out[t] = b0; }
@@ -616,24 +623,23 @@ __device__ void snappy_process_symbols(unsnap_state_s *s, int t, Storage &temp_s
 */
 template <int block_size>
 __global__ void __launch_bounds__(block_size)
-  unsnap_kernel(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs)
+  unsnap_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs)
 {
   __shared__ __align__(16) unsnap_state_s state_g;
   __shared__ cub::WarpReduce<uint32_t>::TempStorage temp_storage;
   int t             = threadIdx.x;
-  unsnap_state_s *s = &state_g;
+  unsnap_state_s* s = &state_g;
   int strm_id       = blockIdx.x;

   if (t < sizeof(gpu_inflate_input_s) / sizeof(uint32_t)) {
-    reinterpret_cast<uint32_t *>(&s->in)[t] =
-      reinterpret_cast<const uint32_t *>(&inputs[strm_id])[t];
+    reinterpret_cast<uint32_t*>(&s->in)[t] = reinterpret_cast<const uint32_t*>(&inputs[strm_id])[t];
     __threadfence_block();
   }
   if (t < batch_count) { s->q.batch_len[t] = 0; }
   __syncthreads();
   if (!t) {
-    const uint8_t *cur = static_cast<const uint8_t *>(s->in.srcDevice);
-    const uint8_t *end = cur + s->in.srcSize;
+    const uint8_t* cur = static_cast<const uint8_t*>(s->in.srcDevice);
+    const uint8_t* end = cur + s->in.srcSize;
     s->error           = 0;
     if (log_cyclecount) { s->tstart = clock(); }
     if (cur < end) {
@@ -700,8 +706,8 @@ __global__ void __launch_bounds__(block_size)
   }
 }

-cudaError_t __host__ gpu_unsnap(gpu_inflate_input_s *inputs,
-                                gpu_inflate_status_s *outputs,
+cudaError_t __host__ gpu_unsnap(gpu_inflate_input_s* inputs,
+                                gpu_inflate_status_s* outputs,
                                 int count,
                                 rmm::cuda_stream_view stream)
 {
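Editor's note (not part of the diff): unsnap_kernel coordinates a prefetch warp, a symbol-decoding warp, and an output warp through volatile shared-memory flags such as `s->q.batch_len`, spinning with `busy_wait` until the other side makes progress. Reduced to its shape, the handshake looks roughly like the CUDA sketch below; the function and parameter names are ours.

```cuda
// Producer/consumer handoff through a shared, volatile slot counter.
__device__ void publish_batch(volatile int* batch_len, int n)
{
  while (*batch_len != 0) {}  // wait until the consumer drained the slot
  __threadfence_block();      // make batch contents visible before the flag
  *batch_len = n;             // hand the batch to the consumer
}

__device__ int consume_batch(volatile int* batch_len)
{
  int n;
  while ((n = *batch_len) == 0) {}  // spin until a batch is published
  __threadfence_block();
  *batch_len = 0;                   // return the slot to the producer
  return n;
}
```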
diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
index a3da5383196..68ac67b900d 100644
--- a/cpp/src/io/csv/csv_gpu.cu
+++ b/cpp/src/io/csv/csv_gpu.cu
@@ -269,7 +269,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
       auto const is_negative = (*trimmed_field_range.first == '-');
       auto const data_begin =
         trimmed_field_range.first + (is_negative || (*trimmed_field_range.first == '+'));
-      cudf::size_type *ptr = cudf::io::gpu::infer_integral_field_counter(
+      cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter(
         data_begin, data_begin + count_number, is_negative, d_column_data[actual_col]);
       atomicAdd(ptr, 1);
     } else if (is_floatingpoint(trimmed_field_len,
@@ -292,33 +292,33 @@ __global__ void __launch_bounds__(csvparse_block_dim)
 }

 template <typename T, int base>
-__inline__ __device__ T decode_value(char const *begin,
-                                     char const *end,
-                                     parse_options_view const &opts)
+__inline__ __device__ T decode_value(char const* begin,
+                                     char const* end,
+                                     parse_options_view const& opts)
 {
   return cudf::io::parse_numeric<T, base>(begin, end, opts);
 }

 template <typename T>
-__inline__ __device__ T decode_value(char const *begin,
-                                     char const *end,
-                                     parse_options_view const &opts)
+__inline__ __device__ T decode_value(char const* begin,
+                                     char const* end,
+                                     parse_options_view const& opts)
 {
   return cudf::io::parse_numeric<T>(begin, end, opts);
 }

 template <>
-__inline__ __device__ cudf::timestamp_D decode_value(char const *begin,
-                                                     char const *end,
-                                                     parse_options_view const &opts)
+__inline__ __device__ cudf::timestamp_D decode_value(char const* begin,
+                                                     char const* end,
+                                                     parse_options_view const& opts)
 {
   return timestamp_D{cudf::duration_D{to_date(begin, end, opts.dayfirst)}};
 }

 template <>
-__inline__ __device__ cudf::timestamp_s decode_value(char const *begin,
-                                                     char const *end,
-                                                     parse_options_view const &opts)
+__inline__ __device__ cudf::timestamp_s decode_value(char const* begin,
+                                                     char const* end,
+                                                     parse_options_view const& opts)
 {
   auto milli = to_date_time(begin, end, opts.dayfirst);
   if (milli == -1) {
@@ -329,9 +329,9 @@ __inline__ __device__ cudf::timestamp_s decode_value(char const *begin,
 }

 template <>
-__inline__ __device__ cudf::timestamp_ms decode_value(char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts)
+__inline__ __device__ cudf::timestamp_ms decode_value(char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts)
 {
   auto milli = to_date_time(begin, end, opts.dayfirst);
   if (milli == -1) {
@@ -342,9 +342,9 @@ __inline__ __device__ cudf::timestamp_ms decode_value(char const *begin,
 }

 template <>
-__inline__ __device__ cudf::timestamp_us decode_value(char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts)
+__inline__ __device__ cudf::timestamp_us decode_value(char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts)
 {
   auto milli = to_date_time(begin, end, opts.dayfirst);
   if (milli == -1) {
@@ -355,9 +355,9 @@ __inline__ __device__ cudf::timestamp_us decode_value(char const *begin,
 }

 template <>
-__inline__ __device__ cudf::timestamp_ns decode_value(char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts)
+__inline__ __device__ cudf::timestamp_ns decode_value(char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts)
 {
   auto milli = to_date_time(begin, end, opts.dayfirst);
   if (milli == -1) {
@@ -371,7 +371,7 @@ __inline__ __device__ cudf::timestamp_ns decode_value(char const *begin,
 #define DURATION_DECODE_VALUE(Type)                                 \
   template <>                                                       \
   __inline__ __device__ Type decode_value(                          \
-    const char *begin, const char *end, parse_options_view const &opts) \
+    const char* begin, const char* end, parse_options_view const& opts) \
   {                                                                 \
     return Type{to_time_delta<Type>(begin, end)};                   \
   }
@@ -385,18 +385,18 @@ DURATION_DECODE_VALUE(duration_ns)
 // The purpose of this is merely to allow compilation ONLY
 // TODO : make this work for csv
 template <>
-__inline__ __device__ cudf::string_view decode_value(char const *begin,
-                                                     char const *end,
-                                                     parse_options_view const &opts)
+__inline__ __device__ cudf::string_view decode_value(char const* begin,
+                                                     char const* end,
+                                                     parse_options_view const& opts)
 {
   return cudf::string_view{};
 }

 // The purpose of this is merely to allow compilation ONLY
 template <>
-__inline__ __device__ cudf::dictionary32 decode_value(char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts)
+__inline__ __device__ cudf::dictionary32 decode_value(char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts)
 {
   return cudf::dictionary32{};
 }
@@ -404,9 +404,9 @@ __inline__ __device__ cudf::dictionary32 decode_value(char const *begin,
 // The purpose of this is merely to allow compilation ONLY
 // TODO : make this work for csv
 template <>
-__inline__ __device__ cudf::list_view decode_value(char const *begin,
-                                                   char const *end,
-                                                   parse_options_view const &opts)
+__inline__ __device__ cudf::list_view decode_value(char const* begin,
+                                                   char const* end,
+                                                   parse_options_view const& opts)
 {
   return cudf::list_view{};
 }
@@ -414,9 +414,9 @@ __inline__ __device__ cudf::list_view decode_value(char const *begin,
 // The purpose of this is merely to allow compilation ONLY
 // TODO : make this work for csv
 template <>
-__inline__ __device__ cudf::struct_view decode_value(char const *begin,
-                                                     char const *end,
-                                                     parse_options_view const &opts)
+__inline__ __device__ cudf::struct_view decode_value(char const* begin,
+                                                     char const* end,
+                                                     parse_options_view const& opts)
 {
   return cudf::struct_view{};
 }
@@ -434,16 +434,16 @@ struct decode_op {
   */
   template <typename T,
             typename std::enable_if_t<std::is_integral_v<T> and !std::is_same_v<T, bool> and
-                                      !cudf::is_fixed_point<T>()> * = nullptr>
-  __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
+                                      !cudf::is_fixed_point<T>()>* = nullptr>
+  __host__ __device__ __forceinline__ bool operator()(void* out_buffer,
                                                       size_t row,
                                                       const data_type,
-                                                      char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts,
+                                                      char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts,
                                                       column_parse::flags flags)
   {
-    static_cast<T *>(out_buffer)[row] = [&flags, &opts, begin, end]() -> T {
+    static_cast<T*>(out_buffer)[row] = [&flags, &opts, begin, end]() -> T {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; }
@@ -460,16 +460,16 @@ struct decode_op {
   *
   * @return bool Whether the parsed value is valid.
   */
-  template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()> * = nullptr>
-  __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
+  template <typename T, typename std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
+  __host__ __device__ __forceinline__ bool operator()(void* out_buffer,
                                                       size_t row,
                                                       const data_type output_type,
-                                                      char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts,
+                                                      char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts,
                                                       column_parse::flags flags)
   {
-    static_cast<device_storage_type_t<T> *>(out_buffer)[row] =
+    static_cast<device_storage_type_t<T>*>(out_buffer)[row] =
       [&flags, &opts, output_type, begin, end]() -> device_storage_type_t<T> {
       return strings::detail::parse_decimal<device_storage_type_t<T>>(
         begin, end, output_type.scale());
@@ -481,16 +481,16 @@ struct decode_op {
  /**
   * @brief Dispatch for boolean type types.
   */
-  template <typename T, typename std::enable_if_t<std::is_same_v<T, bool>> * = nullptr>
-  __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
+  template <typename T, typename std::enable_if_t<std::is_same_v<T, bool>>* = nullptr>
+  __host__ __device__ __forceinline__ bool operator()(void* out_buffer,
                                                       size_t row,
                                                       const data_type,
-                                                      char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts,
+                                                      char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts,
                                                       column_parse::flags flags)
   {
-    static_cast<T *>(out_buffer)[row] = [&opts, begin, end]() {
+    static_cast<T*>(out_buffer)[row] = [&opts, begin, end]() {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
@@ -505,17 +505,17 @@ struct decode_op {
   * @brief Dispatch for floating points, which are set to NaN if the input
   * is not valid. In such case, the validity mask is set to zero too.
   */
-  template <typename T, typename std::enable_if_t<std::is_floating_point_v<T>> * = nullptr>
-  __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
+  template <typename T, typename std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
+  __host__ __device__ __forceinline__ bool operator()(void* out_buffer,
                                                       size_t row,
                                                       const data_type,
-                                                      char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts,
+                                                      char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts,
                                                       column_parse::flags flags)
   {
-    T const value                    = decode_value<T>(begin, end, opts);
-    static_cast<T *>(out_buffer)[row] = value;
+    T const value                    = decode_value<T>(begin, end, opts);
+    static_cast<T*>(out_buffer)[row] = value;

     return !std::isnan(value);
   }
@@ -525,16 +525,16 @@ struct decode_op {
   */
   template <typename T,
             typename std::enable_if_t<!std::is_integral_v<T> and !std::is_floating_point_v<T> and
-                                      !cudf::is_fixed_point<T>()> * = nullptr>
-  __host__ __device__ __forceinline__ bool operator()(void *out_buffer,
+                                      !cudf::is_fixed_point<T>()>* = nullptr>
+  __host__ __device__ __forceinline__ bool operator()(void* out_buffer,
                                                       size_t row,
                                                       const data_type,
-                                                      char const *begin,
-                                                      char const *end,
-                                                      parse_options_view const &opts,
+                                                      char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts,
                                                       column_parse::flags flags)
   {
-    static_cast<T *>(out_buffer)[row] = decode_value<T>(begin, end, opts);
+    static_cast<T*>(out_buffer)[row] = decode_value<T>(begin, end, opts);

     return true;
   }
@@ -559,8 +559,8 @@ __global__ void __launch_bounds__(csvparse_block_dim)
   device_span<column_parse::flags const> column_flags,
   device_span<uint64_t const> row_offsets,
   device_span<data_type const> dtypes,
-  device_span<void *const> columns,
-  device_span<cudf::bitmask_type *const> valids)
+  device_span<void* const> columns,
+  device_span<cudf::bitmask_type* const> valids)
 {
   auto const raw_csv = data.data();
   // thread IDs range per block, so also need the block id.
@@ -605,7 +605,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
             --end;
           }
         }
-        auto str_list = static_cast<std::pair<const char *, size_t> *>(columns[actual_col]);
+        auto str_list = static_cast<std::pair<const char*, size_t>*>(columns[actual_col]);
         str_list[rec_id].first  = field_start;
         str_list[rec_id].second = end - field_start;
       } else {
@@ -623,7 +623,7 @@ __global__ void __launch_bounds__(csvparse_block_dim)
           }
         }
       } else if (dtypes[actual_col].id() == cudf::type_id::STRING) {
-        auto str_list = static_cast<std::pair<const char *, size_t> *>(columns[actual_col]);
+        auto str_list = static_cast<std::pair<const char*, size_t>*>(columns[actual_col]);
         str_list[rec_id].first  = nullptr;
         str_list[rec_id].second = 0;
       }
@@ -680,7 +680,7 @@ constexpr __device__ uint32_t make_char_context(uint32_t id0,
 * The char_ctx value should be created via make_char_context, and its value should
 * have been evaluated at compile-time.
 */
-inline __device__ void merge_char_context(uint4 &ctx, uint32_t char_ctx, uint32_t pos)
+inline __device__ void merge_char_context(uint4& ctx, uint32_t char_ctx, uint32_t pos)
 {
   uint32_t id0 = (ctx.w >> 0) & 3;
   uint32_t id1 = (ctx.w >> 2) & 3;
@@ -709,9 +709,10 @@ inline __device__ packed_rowctx_t pack_rowmaps(uint4 ctx_map)
 */
 inline __device__ uint32_t select_rowmap(uint4 ctx_map, uint32_t ctxid)
 {
-  return (ctxid == ROW_CTX_NONE)
-           ? ctx_map.x
-           : (ctxid == ROW_CTX_QUOTE) ? ctx_map.y : (ctxid == ROW_CTX_COMMENT) ? ctx_map.z : 0;
+  return (ctxid == ROW_CTX_NONE)      ? ctx_map.x
+         : (ctxid == ROW_CTX_QUOTE)   ? ctx_map.y
+         : (ctxid == ROW_CTX_COMMENT) ? ctx_map.z
+                                      : 0;
 }
@@ -731,7 +732,7 @@ inline __device__ uint32_t select_rowmap(uint4 ctx_map, uint32_t ctxid)
 * @param t thread id (leaf node id)
 */
 template <uint32_t lanemask, uint32_t tmask, uint32_t base, uint32_t level_scale>
-inline __device__ void ctx_merge(uint64_t *ctxtree, packed_rowctx_t *ctxb, uint32_t t)
+inline __device__ void ctx_merge(uint64_t* ctxtree, packed_rowctx_t* ctxb, uint32_t t)
 {
   uint64_t tmp = shuffle_xor(*ctxb, lanemask);
   if (!(t & tmask)) {
@@ -754,7 +755,7 @@ inline __device__ void ctx_merge(uint64_t *ctxtree, packed_rowctx_t *ctxb, uint3
 */
 template <uint32_t rmask>
 inline __device__ void ctx_unmerge(
-  uint32_t base, uint64_t *ctxtree, uint32_t *ctx, uint32_t *brow4, uint32_t t)
+  uint32_t base, uint64_t* ctxtree, uint32_t* ctx, uint32_t* brow4, uint32_t t)
 {
   rowctx32_t ctxb_left, ctxb_right, ctxb_sum;
   ctxb_sum = get_row_context(ctxtree[base], *ctx);
@@ -869,7 +870,7 @@ static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxt
 * @param commentchar Comment line character (skip rows starting with this character)
 */
 __global__ void __launch_bounds__(rowofs_block_dim)
-  gather_row_offsets_gpu(uint64_t *row_ctx,
+  gather_row_offsets_gpu(uint64_t* row_ctx,
                          device_span<uint64_t> offsets_out,
                          device_span<char const> const data,
                          size_t chunk_size,
@@ -892,11 +893,11 @@ __global__ void __launch_bounds__(rowofs_block_dim)
     __align__(8) uint64_t ctxtree[rowofs_block_dim * 2];
   } temp_storage;

-  const char *end = start + (min(parse_pos + chunk_size, data_size) - start_offset);
+  const char* end = start + (min(parse_pos + chunk_size, data_size) - start_offset);
   uint32_t t      = threadIdx.x;
   size_t block_pos =
     (parse_pos - start_offset) + blockIdx.x * static_cast<size_t>(rowofs_block_bytes) + t * 32;
-  const char *cur = start + block_pos;
+  const char* cur = start + block_pos;

   // Initial state is neutral context (no state transitions), zero rows
   uint4 ctx_map = {
@@ -934,7 +935,7 @@ __global__ void __launch_bounds__(rowofs_block_dim)
       ctx = make_char_context(ROW_CTX_NONE, ROW_CTX_QUOTE);
     }
   } else {
-    const char *data_end = start + data_size - start_offset;
+    const char* data_end = start + data_size - start_offset;
     if (cur <= end && cur == data_end) {
       // Add a newline at data end (need the extra row offset to infer length of previous row)
       ctx = make_char_context(ROW_CTX_EOF, ROW_CTX_EOF, ROW_CTX_EOF, 1, 1, 1);
@@ -993,7 +994,7 @@ __global__ void __launch_bounds__(rowofs_block_dim)
   }
 }

-size_t __host__ count_blank_rows(const cudf::io::parse_options_view &opts,
+size_t __host__ count_blank_rows(const cudf::io::parse_options_view& opts,
                                  device_span<char const> data,
                                  device_span<uint64_t const> row_offsets,
                                  rmm::cuda_stream_view stream)
@@ -1011,7 +1012,7 @@ size_t __host__ count_blank_rows(const cudf::io::parse_options_view &opts,
     });
 }

-device_span<uint64_t> __host__ remove_blank_rows(cudf::io::parse_options_view const &options,
+device_span<uint64_t> __host__ remove_blank_rows(cudf::io::parse_options_view const& options,
                                                  device_span<char const> data,
                                                  device_span<uint64_t> row_offsets,
                                                  rmm::cuda_stream_view stream)
@@ -1032,7 +1033,7 @@ device_span<uint64_t> __host__ remove_blank_rows(cudf::io::parse_options_view co
 }

 std::vector<column_type_histogram> detect_column_types(
-  cudf::io::parse_options_view const &options,
+  cudf::io::parse_options_view const& options,
   device_span<char const> const data,
   device_span<column_parse::flags const> const column_flags,
   device_span<uint64_t const> const row_starts,
@@ -1052,13 +1053,13 @@ std::vector<column_type_histogram> detect_column_types(
   return detail::make_std_vector_sync(d_stats, stream);
 }

-void __host__ decode_row_column_data(cudf::io::parse_options_view const &options,
+void __host__ decode_row_column_data(cudf::io::parse_options_view const& options,
                                      device_span<char const> data,
                                      device_span<column_parse::flags const> column_flags,
                                      device_span<uint64_t const> row_offsets,
                                      device_span<data_type const> dtypes,
-                                     device_span<void *const> columns,
-                                     device_span<cudf::bitmask_type *const> valids,
+                                     device_span<void* const> columns,
+                                     device_span<cudf::bitmask_type* const> valids,
                                      rmm::cuda_stream_view stream)
 {
   // Calculate actual block count to use based on records count
@@ -1070,8 +1071,8 @@ void __host__ decode_row_column_data(cudf::io::parse_options_view const &options
     options, data, column_flags, row_offsets, dtypes, columns, valids);
 }

-uint32_t __host__ gather_row_offsets(const parse_options_view &options,
-                                     uint64_t *row_ctx,
+uint32_t __host__ gather_row_offsets(const parse_options_view& options,
+                                     uint64_t* row_ctx,
                                      device_span<uint64_t> const offsets_out,
                                      device_span<char const> const data,
                                      size_t chunk_size,
diff --git a/cpp/src/io/csv/csv_gpu.h b/cpp/src/io/csv/csv_gpu.h
index 838abe66b94..9b83028fa92 100644
--- a/cpp/src/io/csv/csv_gpu.h
+++ b/cpp/src/io/csv/csv_gpu.h
@@ -149,8 +149,8 @@ inline __host__ __device__ rowctx64_t select_row_context(rowctx64_t sel_ctx,
 *
 * @return Number of row contexts
 */
-uint32_t gather_row_offsets(cudf::io::parse_options_view const &options,
-                            uint64_t *row_ctx,
+uint32_t gather_row_offsets(cudf::io::parse_options_view const& options,
+                            uint64_t* row_ctx,
                             device_span<uint64_t> offsets_out,
                             device_span<char const> data,
                             size_t chunk_size,
@@ -170,7 +170,7 @@ uint32_t gather_row_offsets(cudf::io::parse_options_view const &options,
 * @param row_offsets Row offsets in the character data buffer
 * @param stream CUDA stream used for device memory operations and kernel launches.
 */
-size_t count_blank_rows(cudf::io::parse_options_view const &options,
+size_t count_blank_rows(cudf::io::parse_options_view const& options,
                         device_span<char const> data,
                         device_span<uint64_t const> row_offsets,
                         rmm::cuda_stream_view stream);
@@ -183,7 +183,7 @@ size_t count_blank_rows(cudf::io::parse_options_view const &options,
 * @param row_offsets Row offsets in the character data buffer
 * @param stream CUDA stream used for device memory operations and kernel launches.
 */
-device_span<uint64_t> remove_blank_rows(const cudf::io::parse_options_view &options,
+device_span<uint64_t> remove_blank_rows(const cudf::io::parse_options_view& options,
                                         device_span<char const> data,
                                         device_span<uint64_t> row_offsets,
                                         rmm::cuda_stream_view stream);
@@ -200,7 +200,7 @@ device_span<uint64_t> remove_blank_rows(const cudf::io::parse_options_view &opti
 * @return stats Histogram of each dtypes' occurrence for each column
 */
 std::vector<column_type_histogram> detect_column_types(
-  cudf::io::parse_options_view const &options,
+  cudf::io::parse_options_view const& options,
   device_span<char const> data,
   device_span<column_parse::flags const> column_flags,
   device_span<uint64_t const> row_offsets,
@@ -219,13 +219,13 @@ std::vector<column_type_histogram> detect_column_types(
 * @param[out] valids Device memory output of column valids bitmap data
 * @param[in] stream CUDA stream to use, default 0
 */
-void decode_row_column_data(cudf::io::parse_options_view const &options,
+void decode_row_column_data(cudf::io::parse_options_view const& options,
                             device_span<char const> data,
                             device_span<column_parse::flags const> column_flags,
                             device_span<uint64_t const> row_offsets,
                             device_span<data_type const> dtypes,
-                            device_span<void *const> columns,
-                            device_span<cudf::bitmask_type *const> valids,
+                            device_span<void* const> columns,
+                            device_span<cudf::bitmask_type* const> valids,
                             rmm::cuda_stream_view stream);

 }  // namespace gpu
diff --git a/cpp/src/io/csv/datetime.cuh b/cpp/src/io/csv/datetime.cuh
index 4e4ddd09a9f..7160041ff4e 100644
--- a/cpp/src/io/csv/datetime.cuh
+++ b/cpp/src/io/csv/datetime.cuh
@@ -232,7 +232,9 @@ __inline__ __device__ void extract_time(
   if (*last == 'M' || *last == 'm') {
     if (*(last - 1) == 'P' || *(last - 1) == 'p') { hour_adjust = 12; }
     last = last - 2;
-    while (*last == ' ') { --last; }
+    while (*last == ' ') {
+      --last;
+    }
   }
   end = last + 1;
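Editor's note (not part of the diff): decode_op above selects one operator() per column type with enable_if-gated template overloads. A self-contained sketch of the same dispatch pattern, with our own functor and simple parsing stand-ins instead of cudf's:

```cpp
#include <cmath>
#include <cstdlib>
#include <type_traits>

struct parse_dispatch {
  // Chosen for integral T (the real decode_op also excludes bool here).
  template <typename T, typename std::enable_if_t<std::is_integral_v<T>>* = nullptr>
  bool operator()(T& out, const char* s) const
  {
    out = static_cast<T>(std::strtoll(s, nullptr, 10));
    return true;  // integral parse always marks the row valid
  }

  // Chosen for floating-point T; NaN marks the row invalid, as in the diff.
  template <typename T, typename std::enable_if_t<std::is_floating_point_v<T>>* = nullptr>
  bool operator()(T& out, const char* s) const
  {
    out = static_cast<T>(std::strtod(s, nullptr));
    return !std::isnan(out);
  }
};
```

Because the `enable_if_t` conditions are mutually exclusive, exactly one overload participates in overload resolution for each `T`, which is what lets the kernel call a single functor for every column type.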
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 71391c8c444..70ce0fce1cc 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -40,6 +40,7 @@
 #include
 #include
 #include
+#include <unordered_set>

 using std::string;
 using std::vector;
@@ -57,7 +58,7 @@ struct VisitorOverload : Ts... {
   using Ts::operator()...;
 };
 template <class... Ts>
-VisitorOverload(Ts...)->VisitorOverload<Ts...>;
+VisitorOverload(Ts...) -> VisitorOverload<Ts...>;
 }  // namespace

 namespace cudf {
@@ -102,7 +103,7 @@ constexpr size_t calculateMaxRowSize(int num_columns = 0) noexcept
 *
 * @return Tuple of data_type and flags
 */
-std::tuple<data_type, column_parse::flags> get_dtype_info(const std::string &dtype)
+std::tuple<data_type, column_parse::flags> get_dtype_info(const std::string& dtype)
 {
   if (dtype == "hex" || dtype == "hex64") {
     return std::make_tuple(data_type{cudf::type_id::INT64}, column_parse::as_hexadecimal);
@@ -132,8 +133,8 @@ string removeQuotes(string str, char quotechar)
 * @brief Parse the first row to set the column names in the raw_csv parameter.
 * The first row can be either the header row, or the first data row
 */
-std::vector<std::string> setColumnNames(std::vector<char> const &header,
-                                        parse_options_view const &opts,
+std::vector<std::string> setColumnNames(std::vector<char> const& header,
+                                        parse_options_view const& opts,
                                         int header_row,
                                         std::string prefix)
 {
@@ -196,7 +197,7 @@ std::vector<std::string> setColumnNames(std::vector<char> const &header,
 }

 template <typename C>
-void erase_except_last(C &container, rmm::cuda_stream_view stream)
+void erase_except_last(C& container, rmm::cuda_stream_view stream)
 {
   cudf::detail::device_single_thread(
     [span = device_span<typename C::value_type>{container}] __device__() mutable {
@@ -222,7 +223,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
   size_t map_range_size = 0;
   if (range_size != 0) {
     auto num_given_dtypes =
-      std::visit([](const auto &dtypes) { return dtypes.size(); }, opts_.get_dtypes());
+      std::visit([](const auto& dtypes) { return dtypes.size(); }, opts_.get_dtypes());
     const auto num_columns = std::max(opts_.get_names().size(), num_given_dtypes);
     map_range_size         = range_size + calculateMaxRowSize(num_columns);
   }
@@ -240,7 +241,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
     auto buffer = source_->host_read(range_offset, data_size);
     auto h_data = host_span<char const>(  //
-      reinterpret_cast<const char *>(buffer->data()),
+      reinterpret_cast<const char*>(buffer->data()),
       buffer->size());

     std::vector<char> h_uncomp_data_owner;
@@ -269,7 +270,7 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
                                                num_rows,
                                                load_whole_file,
                                                stream);
-    auto &row_offsets = data_row_offsets.second;
+    auto& row_offsets = data_row_offsets.second;
     // Exclude the rows that are to be skipped from the end
     if (skip_end_rows > 0 && static_cast<size_t>(skip_end_rows) < row_offsets.size()) {
       row_offsets.shrink(row_offsets.size() - skip_end_rows);
@@ -282,8 +283,8 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream)
 table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
 {
   auto const data_row_offsets = select_data_and_row_offsets(stream);
-  auto const &data        = data_row_offsets.first;
-  auto const &row_offsets = data_row_offsets.second;
+  auto const& data        = data_row_offsets.first;
+  auto const& row_offsets = data_row_offsets.second;
   // Exclude the end-of-data row from number of rows with actual data
   num_records_ = std::max(row_offsets.size(), 1ul) - 1;
@@ -308,14 +309,16 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
   // Looking for duplicates
   std::unordered_map<string, int> col_names_histogram;
-  for (auto &col_name : col_names_) {
+  for (auto& col_name : col_names_) {
     // Operator [] inserts a default-initialized value if the given key is not
     // present
     if (++col_names_histogram[col_name] > 1) {
       if (opts_.is_enabled_mangle_dupe_cols()) {
         // Rename duplicates of column X as X.1, X.2, ...; First appearance
         // stays as X
-        col_name += "." + std::to_string(col_names_histogram[col_name] - 1);
+        do {
+          col_name += "." + std::to_string(col_names_histogram[col_name] - 1);
+        } while (col_names_histogram[col_name]++);
       } else {
         // All duplicate columns will be ignored; First appearance is parsed
         const auto idx = &col_name - col_names_.data();
@@ -336,13 +339,18 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
     for (const auto index : opts_.get_use_cols_indexes()) {
       column_flags_[index] = column_parse::enabled;
     }
-    num_active_cols_ = opts_.get_use_cols_indexes().size();
+    num_active_cols_ = std::unordered_set<int>(opts_.get_use_cols_indexes().begin(),
+                                               opts_.get_use_cols_indexes().end())
+                         .size();

-    for (const auto &name : opts_.get_use_cols_names()) {
+    for (const auto& name : opts_.get_use_cols_names()) {
       const auto it = std::find(col_names_.begin(), col_names_.end(), name);
       if (it != col_names_.end()) {
-        column_flags_[it - col_names_.begin()] = column_parse::enabled;
-        num_active_cols_++;
+        auto curr_it = it - col_names_.begin();
+        if (column_flags_[curr_it] == column_parse::disabled) {
+          column_flags_[curr_it] = column_parse::enabled;
+          num_active_cols_++;
+        }
       }
     }
   }
@@ -353,7 +361,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
     column_flags_[index] |= column_parse::as_datetime;
   }

-  for (const auto &name : opts_.get_infer_date_names()) {
+  for (const auto& name : opts_.get_infer_date_names()) {
     auto it = std::find(col_names_.begin(), col_names_.end(), name);
     if (it != col_names_.end()) {
       column_flags_[it - col_names_.begin()] |= column_parse::as_datetime;
@@ -368,7 +376,7 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
   auto out_columns = std::vector<std::unique_ptr<cudf::column>>();

   bool has_to_infer_column_types =
-    std::visit([](const auto &dtypes) { return dtypes.empty(); }, opts_.get_dtypes());
+    std::visit([](const auto& dtypes) { return dtypes.empty(); }, opts_.get_dtypes());

   std::vector<data_type> column_types;
   if (has_to_infer_column_types) {
@@ -376,8 +384,8 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream)
   } else {
     column_types =
       std::visit(VisitorOverload{
-                   [&](const std::vector<data_type> &data_types) { return data_types; },
-                   [&](const std::vector<std::string> &dtypes) { return parse_column_types(dtypes); }},
+                   [&](const std::vector<data_type>& data_types) { return data_types; },
+                   [&](const std::vector<std::string>& dtypes) { return parse_column_types(dtypes); }},
                  opts_.get_dtypes());
   }
@@ -422,7 +430,9 @@ size_t reader::impl::find_first_row_start(host_span<char const> data)
   // For now, look for the first terminator (assume the first terminator isn't within a quote)
   // TODO: Attempt to infer this from the data
   size_t pos = 0;
-  while (pos < data.size() && data[pos] != opts.terminator) { ++pos; }
+  while (pos < data.size() && data[pos] != opts.terminator) {
+    ++pos;
+  }
   return std::min(pos + 1, data.size());
 }
@@ -529,7 +539,9 @@ reader::impl::load_data_and_gather_row_offsets(host_span<char const> data,
   stream.synchronize();

   size_t rows_out_of_range = 0;
-  for (uint32_t i = 0; i < num_blocks; i++) { rows_out_of_range += row_ctx[i]; }
+  for (uint32_t i = 0; i < num_blocks; i++) {
+    rows_out_of_range += row_ctx[i];
+  }
   if (rows_out_of_range != 0) {
     // Keep one row out of range (used to infer length of previous row)
     auto new_row_offsets_size =
@@ -641,7 +653,7 @@ std::vector<data_type> reader::impl::infer_column_types(device_span<char const>
   }

   if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) {
-    for (auto &type : dtypes) {
+    for (auto& type : dtypes) {
       if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); }
     }
   }
@@ -655,13 +667,13 @@ std::vector<data_type> reader::impl::infer_column_types(device_span<char const>
 }

 std::vector<data_type> reader::impl::parse_column_types(
-  const std::vector<std::string> &types_as_strings)
+  const std::vector<std::string>& types_as_strings)
 {
   std::vector<data_type> dtypes;

   const bool is_dict = std::all_of(types_as_strings.begin(),
                                    types_as_strings.end(),
-                                   [](const auto &s) { return s.find(':') != std::string::npos; });
+                                   [](const auto& s) { return s.find(':') != std::string::npos; });

   if (!is_dict) {
     if (types_as_strings.size() == 1) {
@@ -670,7 +682,9 @@ std::vector<data_type> reader::impl::parse_column_types(
       column_parse::flags col_flags_;
       std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]);
       dtypes.resize(num_active_cols_, dtype_);
-      for (int col = 0; col < num_actual_cols_; col++) { column_flags_[col] |= col_flags_; }
+      for (int col = 0; col < num_actual_cols_; col++) {
+        column_flags_[col] |= col_flags_;
+      }
       CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type");
     } else {
       // If it's a list, assign dtypes to active columns in the given order
@@ -692,7 +706,7 @@ std::vector<data_type> reader::impl::parse_column_types(
     // Translate vector of `name : dtype` strings to map
     // NOTE: Incoming pairs can be out-of-order from column names in dataset
     std::unordered_map<std::string, data_type> col_type_map;
-    for (const auto &pair : types_as_strings) {
+    for (const auto& pair : types_as_strings) {
       const auto pos   = pair.find_last_of(':');
       const auto name  = pair.substr(0, pos);
       const auto dtype = pair.substr(pos + 1, pair.size());
@@ -714,7 +728,7 @@ std::vector<data_type> reader::impl::parse_column_types(
   }

   if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) {
-    for (auto &type : dtypes) {
+    for (auto& type : dtypes) {
       if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); }
     }
   }
@@ -753,8 +767,8 @@ std::vector<column_buffer> reader::impl::decode_data(device_span<char const> dat
     }
   }

-  thrust::host_vector<void *> h_data(num_active_cols_);
-  thrust::host_vector<bitmask_type *> h_valid(num_active_cols_);
+  thrust::host_vector<void*> h_data(num_active_cols_);
+  thrust::host_vector<bitmask_type*> h_valid(num_active_cols_);

   for (int i = 0; i < num_active_cols_; ++i) {
     h_data[i]  = out_buffers[i].data();
@@ -777,7 +791,7 @@ std::vector<column_buffer> reader::impl::decode_data(device_span<char const> dat
 * @brief Create a serialized trie for N/A value matching, based on the options.
 */
 cudf::detail::trie create_na_trie(char quotechar,
-                                  csv_reader_options const &reader_opts,
+                                  csv_reader_options const& reader_opts,
                                   rmm::cuda_stream_view stream)
 {
   // Default values to recognize as null values
@@ -815,7 +829,7 @@ cudf::detail::trie create_na_trie(char quotechar,
   return cudf::detail::create_serialized_trie(na_values, stream);
 }

-parse_options make_parse_options(csv_reader_options const &reader_opts,
+parse_options make_parse_options(csv_reader_options const& reader_opts,
                                  rmm::cuda_stream_view stream)
 {
   auto parse_opts = parse_options{};
@@ -873,9 +887,9 @@ parse_options make_parse_options(csv_reader_options const &reader_opts,

 reader::impl::impl(std::unique_ptr<datasource> source,
                    std::string filepath,
-                   csv_reader_options const &options,
+                   csv_reader_options const& options,
                    rmm::cuda_stream_view stream,
-                   rmm::mr::device_memory_resource *mr)
+                   rmm::mr::device_memory_resource* mr)
   : mr_(mr), source_(std::move(source)), filepath_(filepath), opts_(options)
 {
   num_actual_cols_ = opts_.get_names().size();
@@ -890,10 +904,10 @@ reader::impl::impl(std::unique_ptr<datasource> source,
 }

 // Forward to implementation
-reader::reader(std::vector<std::string> const &filepaths,
-               csv_reader_options const &options,
+reader::reader(std::vector<std::string> const& filepaths,
+               csv_reader_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource *mr)
+               rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(filepaths.size() == 1, "Only a single source is currently supported.");
   // Delay actual instantiation of data source until read to allow for
@@ -902,10 +916,10 @@ reader::reader(std::vector<std::string> const &filepaths,
 }

 // Forward to implementation
-reader::reader(std::vector<std::unique_ptr<datasource>> &&sources,
-               csv_reader_options const &options,
+reader::reader(std::vector<std::unique_ptr<datasource>>&& sources,
+               csv_reader_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource *mr)
+               rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported.");
   _impl = std::make_unique<impl>(std::move(sources[0]), "", options, stream, mr);
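Editor's note (not part of the diff): the mangle_dupe_cols change above renames later occurrences of a duplicated header X to X.1, X.2, ... and now re-probes the histogram so that a mangled name which is itself already taken gets mangled again. The same rule as a standalone sketch, with names of our choosing:

```cpp
#include <string>
#include <unordered_map>
#include <vector>

void mangle_duplicate_names(std::vector<std::string>& names)
{
  std::unordered_map<std::string, int> histogram;
  for (auto& name : names) {
    if (++histogram[name] > 1) {
      // Rename later occurrences of X to X.1, X.2, ...; keep looping in case
      // the mangled name collides with a column that already exists.
      do {
        name += "." + std::to_string(histogram[name] - 1);
      } while (histogram[name]++);
    }
  }
}
```

For example, {"a", "a", "a.1"} becomes {"a", "a.1", "a.1.1"}: the second "a" is renamed to "a.1", and the pre-existing "a.1" is then pushed to "a.1.1" by the re-probe.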
diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp
index 17f27a28e30..29c6b48bc8a 100644
--- a/cpp/src/io/csv/reader_impl.hpp
+++ b/cpp/src/io/csv/reader_impl.hpp
@@ -79,9 +79,9 @@ class reader::impl {
   */
   explicit impl(std::unique_ptr<datasource> source,
                 std::string filepath,
-                csv_reader_options const &options,
+                csv_reader_options const& options,
                 rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource *mr);
+                rmm::mr::device_memory_resource* mr);

   /**
   * @brief Read an entire set or a subset of data and returns a set of columns.
@@ -104,7 +104,7 @@ class reader::impl {
     device_span<uint64_t const> selected;

    public:
-    selected_rows_offsets(rmm::device_uvector<uint64_t> &&data,
+    selected_rows_offsets(rmm::device_uvector<uint64_t>&& data,
                           device_span<uint64_t const> selected_span)
       : all{std::move(data)}, selected{selected_span}
     {
@@ -188,7 +188,7 @@ class reader::impl {
   * types
   * @return List of columns' data types
   */
-  std::vector<data_type> parse_column_types(std::vector<std::string> const &types_as_strings);
+  std::vector<data_type> parse_column_types(std::vector<std::string> const& types_as_strings);

   /**
   * @brief Converts the row-column data and outputs to column bufferrs.
@@ -204,7 +204,7 @@ class reader::impl {
                                          rmm::cuda_stream_view stream);

  private:
-  rmm::mr::device_memory_resource *mr_ = nullptr;
+  rmm::mr::device_memory_resource* mr_ = nullptr;
   std::unique_ptr<datasource> source_;
   std::string filepath_;
   std::string compression_type_;
cudf::timestamp_ms{cudf::duration_ms{milli}}; @@ -196,9 +196,9 @@ __inline__ __device__ cudf::timestamp_ms decode_value(const char *begin, * @return The parsed timestamp_us */ template <> -__inline__ __device__ cudf::timestamp_us decode_value(const char *begin, - const char *end, - parse_options_view const &opts) +__inline__ __device__ cudf::timestamp_us decode_value(const char* begin, + const char* end, + parse_options_view const& opts) { auto milli = to_date_time(begin, end, opts.dayfirst); return cudf::timestamp_us{cudf::duration_us{milli * 1000}}; @@ -214,21 +214,21 @@ __inline__ __device__ cudf::timestamp_us decode_value(const char *begin, * @return The parsed timestamp_ns */ template <> -__inline__ __device__ cudf::timestamp_ns decode_value(const char *begin, - const char *end, - parse_options_view const &opts) +__inline__ __device__ cudf::timestamp_ns decode_value(const char* begin, + const char* end, + parse_options_view const& opts) { auto milli = to_date_time(begin, end, opts.dayfirst); return cudf::timestamp_ns{cudf::duration_ns{milli * 1000000}}; } #ifndef DURATION_DECODE_VALUE -#define DURATION_DECODE_VALUE(Type) \ - template <> \ - __inline__ __device__ Type decode_value( \ - const char *begin, const char *end, parse_options_view const &) \ - { \ - return Type{to_time_delta(begin, end)}; \ +#define DURATION_DECODE_VALUE(Type) \ + template <> \ + __inline__ __device__ Type decode_value( \ + const char* begin, const char* end, parse_options_view const&) \ + { \ + return Type{to_time_delta(begin, end)}; \ } #endif DURATION_DECODE_VALUE(duration_D) @@ -239,48 +239,48 @@ DURATION_DECODE_VALUE(duration_ns) // The purpose of these is merely to allow compilation ONLY template <> -__inline__ __device__ cudf::string_view decode_value(const char *, - const char *, - parse_options_view const &) +__inline__ __device__ cudf::string_view decode_value(const char*, + const char*, + parse_options_view const&) { return cudf::string_view{}; } template <> -__inline__ __device__ cudf::dictionary32 decode_value(const char *, - const char *, - parse_options_view const &) +__inline__ __device__ cudf::dictionary32 decode_value(const char*, + const char*, + parse_options_view const&) { return cudf::dictionary32{}; } template <> -__inline__ __device__ cudf::list_view decode_value(const char *, - const char *, - parse_options_view const &) +__inline__ __device__ cudf::list_view decode_value(const char*, + const char*, + parse_options_view const&) { return cudf::list_view{}; } template <> -__inline__ __device__ cudf::struct_view decode_value(const char *, - const char *, - parse_options_view const &) +__inline__ __device__ cudf::struct_view decode_value(const char*, + const char*, + parse_options_view const&) { return cudf::struct_view{}; } template <> -__inline__ __device__ numeric::decimal32 decode_value(const char *, - const char *, - parse_options_view const &) +__inline__ __device__ numeric::decimal32 decode_value(const char*, + const char*, + parse_options_view const&) { return numeric::decimal32{}; } template <> -__inline__ __device__ numeric::decimal64 decode_value(const char *, - const char *, - parse_options_view const &) +__inline__ __device__ numeric::decimal64 decode_value(const char*, + const char*, + parse_options_view const&) { return numeric::decimal64{}; } @@ -297,14 +297,14 @@ struct ConvertFunctor { * It is handled here rather than within convertStrToValue() as that function * is used by other types (ex. timestamp) that aren't 'booleable'. 
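The `ConvertFunctor` overloads above select an implementation per type category via `enable_if`. Below is a small host-only sketch of that dispatch, including the floating-point rule where a NaN result clears validity; names are illustrative, not libcudf's:

```cpp
#include <cmath>
#include <iostream>
#include <type_traits>

// Overload chosen for floating-point types: NaN clears the validity bit
template <typename T, std::enable_if_t<std::is_floating_point<T>::value>* = nullptr>
bool store_field(T value, T* out)
{
  *out = value;
  return !std::isnan(value);
}

// Overload chosen for everything else: always valid once parsed
template <typename T, std::enable_if_t<!std::is_floating_point<T>::value>* = nullptr>
bool store_field(T value, T* out)
{
  *out = value;
  return true;
}

int main()
{
  double d{};
  int i{};
  std::cout << store_field(std::nan(""), &d) << ' ' << store_field(42, &i) << '\n';  // 0 1
}
```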
*/ - template ::value> * = nullptr> - __host__ __device__ __forceinline__ bool operator()(char const *begin, - char const *end, - void *output_column, + template ::value>* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* output_column, cudf::size_type row, - const parse_options_view &opts) + const parse_options_view& opts) { - T &value{static_cast(output_column)[row]}; + T& value{static_cast(output_column)[row]}; value = [&opts, end, begin]() -> T { // Check for user-specified true/false values @@ -321,15 +321,15 @@ struct ConvertFunctor { * @brief Dispatch for floating points, which are set to NaN if the input * is not valid. In such case, the validity mask is set to zero too. */ - template ::value> * = nullptr> - __host__ __device__ __forceinline__ bool operator()(char const *begin, - char const *end, - void *out_buffer, + template ::value>* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, size_t row, - parse_options_view const &opts) + parse_options_view const& opts) { - T const value = decode_value(begin, end, opts); - static_cast(out_buffer)[row] = value; + T const value = decode_value(begin, end, opts); + static_cast(out_buffer)[row] = value; return !std::isnan(value); } @@ -340,14 +340,14 @@ struct ConvertFunctor { */ template ::value and - !std::is_integral::value> * = nullptr> - __host__ __device__ __forceinline__ bool operator()(char const *begin, - char const *end, - void *output_column, + !std::is_integral::value>* = nullptr> + __host__ __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* output_column, cudf::size_type row, - const parse_options_view &opts) + const parse_options_view& opts) { - static_cast(output_column)[row] = decode_value(begin, end, opts); + static_cast(output_column)[row] = decode_value(begin, end, opts); return true; } @@ -405,8 +405,8 @@ __device__ __inline__ bool is_like_float( */ struct field_descriptor { cudf::size_type column; - char const *value_begin; - char const *value_end; + char const* value_begin; + char const* value_end; }; /** @@ -416,15 +416,15 @@ struct field_descriptor { * @param[in] end pointer to the first character after the parsing range * @param[in] opts The global parsing behavior options * @param[in] field_idx Index of the current field in the input row - * @param[in] col_map Pointer to the (column name hash -> solumn index) map in device memory. + * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. * nullptr is passed when the input file does not consist of objects. * @return Descriptor of the parsed field */ -__device__ field_descriptor next_field_descriptor(const char *begin, - const char *end, - parse_options_view const &opts, +__device__ field_descriptor next_field_descriptor(const char* begin, + const char* end, + parse_options_view const& opts, cudf::size_type field_idx, - col_map_type *col_map) + col_map_type* col_map) { auto const desc_pre_trim = col_map == nullptr @@ -463,7 +463,7 @@ __device__ field_descriptor next_field_descriptor(const char *begin, * * @return The begin and end iterators of the row data. 
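`get_row_data_range` above reduces to offset arithmetic over the flat input buffer. A simplified host sketch follows, assuming the next row's offset serves as the current row's end (the device code additionally trims the range to the enclosing brackets):

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main()
{
  std::string const data = R"({"a":1}{"a":2}{"a":3})";
  // One offset per row, plus a past-the-end sentinel
  std::vector<size_t> const row_offsets{0, 7, 14, data.size()};

  for (size_t row = 0; row + 1 < row_offsets.size(); ++row) {
    auto const begin = data.begin() + row_offsets[row];
    auto const end   = data.begin() + row_offsets[row + 1];
    std::cout << "row " << row << ": " << std::string(begin, end) << '\n';
  }
}
```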
*/ -__device__ std::pair get_row_data_range( +__device__ std::pair get_row_data_range( device_span const data, device_span const row_offsets, size_type row) { auto const row_begin = data.begin() + row_offsets[row]; @@ -481,7 +481,7 @@ __device__ std::pair get_row_data_range( * @param[in] data The entire data to read * @param[in] row_offsets The offset of each row in the input * @param[in] column_types The data type of each column - * @param[in] col_map Pointer to the (column name hash -> solumn index) map in device memory. + * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. * nullptr is passed when the input file does not consist of objects. * @param[out] output_columns The output column data * @param[out] valid_fields The bitmaps indicating whether column fields are valid @@ -491,9 +491,9 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, device_span const data, device_span const row_offsets, device_span const column_types, - col_map_type *col_map, - device_span const output_columns, - device_span const valid_fields, + col_map_type* col_map, + device_span const output_columns, + device_span const valid_fields, device_span const num_valid_fields) { const auto rec_id = threadIdx.x + (blockDim.x * blockIdx.x); @@ -515,7 +515,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Type dispatcher does not handle strings if (column_types[desc.column].id() == type_id::STRING) { - auto str_list = static_cast(output_columns[desc.column]); + auto str_list = static_cast(output_columns[desc.column]); str_list[rec_id].first = desc.value_begin; str_list[rec_id].second = value_len; @@ -536,7 +536,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, } } } else if (column_types[desc.column].id() == type_id::STRING) { - auto str_list = static_cast(output_columns[desc.column]); + auto str_list = static_cast(output_columns[desc.column]); str_list[rec_id].first = nullptr; str_list[rec_id].second = 0; } @@ -562,7 +562,7 @@ __global__ void detect_data_types_kernel( parse_options_view const opts, device_span const data, device_span const row_offsets, - col_map_type *col_map, + col_map_type* col_map, int num_columns, device_span const column_infos) { @@ -645,8 +645,8 @@ __global__ void detect_data_types_kernel( atomicAdd(&column_infos[desc.column].bool_count, 1); } else if (digit_count == int_req_number_cnt) { bool is_negative = (*desc.value_begin == '-'); - char const *data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+')); - cudf::size_type *ptr = cudf::io::gpu::infer_integral_field_counter( + char const* data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+')); + cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]); atomicAdd(ptr, 1); } else if (is_like_float( @@ -685,18 +685,18 @@ __global__ void detect_data_types_kernel( * @brief Input data range that contains a field in key:value format. */ struct key_value_range { - char const *key_begin; - char const *key_end; - char const *value_begin; - char const *value_end; + char const* key_begin; + char const* key_end; + char const* value_begin; + char const* value_end; }; /** * @brief Parse the next field in key:value format and return ranges of its parts. 
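`get_next_key_value_range` is consumed in a loop that repeatedly advances over key:value fields until the row is exhausted (see `collect_keys_info_kernel` below). A host sketch of that advance-lambda pattern, with parsing simplified to colon/comma splitting (the real parser is quote-aware):

```cpp
#include <cstddef>
#include <iostream>
#include <string>

// Byte positions of one key:value field, mirroring key_value_range above
struct field_range {
  size_t key_begin, key_end, value_begin, value_end;
};

int main()
{
  std::string const row = "a:1,bb:22,ccc:333";

  // Simplified advance(): find the next key and its value within the row
  auto advance = [&](size_t begin) -> field_range {
    auto const colon = row.find(':', begin);
    auto comma       = row.find(',', colon);
    if (comma == std::string::npos) comma = row.size();
    return {begin, colon, colon + 1, comma};
  };

  for (size_t pos = 0; pos < row.size();) {
    auto const f = advance(pos);
    std::cout << row.substr(f.key_begin, f.key_end - f.key_begin) << " -> "
              << row.substr(f.value_begin, f.value_end - f.value_begin) << '\n';
    pos = f.value_end + 1;
  }
}
```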
*/ -__device__ key_value_range get_next_key_value_range(char const *begin, - char const *end, - parse_options_view const &opts) +__device__ key_value_range get_next_key_value_range(char const* begin, + char const* end, + parse_options_view const& opts) { auto const key_range = get_next_key(begin, end, opts.quotechar); @@ -721,7 +721,7 @@ __device__ key_value_range get_next_key_value_range(char const *begin, __global__ void collect_keys_info_kernel(parse_options_view const options, device_span const data, device_span const row_offsets, - unsigned long long int *keys_cnt, + unsigned long long int* keys_cnt, thrust::optional keys_info) { auto const rec_id = threadIdx.x + (blockDim.x * blockIdx.x); @@ -729,7 +729,7 @@ __global__ void collect_keys_info_kernel(parse_options_view const options, auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); - auto advance = [&](const char *begin) { + auto advance = [&](const char* begin) { return get_next_key_value_range(begin, row_data_range.second, options); }; for (auto field_range = advance(row_data_range.first); @@ -751,13 +751,13 @@ __global__ void collect_keys_info_kernel(parse_options_view const options, /** * @copydoc cudf::io::json::gpu::convert_json_to_columns */ -void convert_json_to_columns(parse_options_view const &opts, +void convert_json_to_columns(parse_options_view const& opts, device_span const data, device_span const row_offsets, device_span const column_types, - col_map_type *col_map, - device_span const output_columns, - device_span const valid_fields, + col_map_type* col_map, + device_span const output_columns, + device_span const valid_fields, device_span num_valid_fields, rmm::cuda_stream_view stream) { @@ -779,12 +779,12 @@ void convert_json_to_columns(parse_options_view const &opts, */ std::vector detect_data_types( - const parse_options_view &options, + const parse_options_view& options, device_span const data, device_span const row_offsets, bool do_set_null_count, int num_columns, - col_map_type *col_map, + col_map_type* col_map, rmm::cuda_stream_view stream) { int block_size; @@ -822,10 +822,10 @@ std::vector detect_data_types( /** * @copydoc cudf::io::json::gpu::gpu_collect_keys_info */ -void collect_keys_info(parse_options_view const &options, +void collect_keys_info(parse_options_view const& options, device_span const data, device_span const row_offsets, - unsigned long long int *keys_cnt, + unsigned long long int* keys_cnt, thrust::optional keys_info, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/json/json_gpu.h b/cpp/src/io/json/json_gpu.h index 4a68ce48f20..7a6bce5e5a5 100644 --- a/cpp/src/io/json/json_gpu.h +++ b/cpp/src/io/json/json_gpu.h @@ -44,20 +44,20 @@ using col_map_type = concurrent_unordered_map; * @param[in] data The entire data to read * @param[in] row_offsets The start of each data record * @param[in] dtypes The data type of each column - * @param[in] col_map Pointer to the (column name hash -> solumn index) map in device memory. + * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. * nullptr is passed when the input file does not consist of objects. * @param[out] output_columns The output column data * @param[out] valid_fields The bitmaps indicating whether column fields are valid * @param[out] num_valid_fields The numbers of valid fields in columns * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
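Several of the signatures above take `col_map`, the (column name hash -> column index) map. Here is a host sketch of the idea, with `std::hash` and `std::unordered_map` standing in for the device-resident `concurrent_unordered_map`:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main()
{
  std::vector<std::string> const column_names{"id", "name", "score"};

  // Build hash(name) -> column index, as the reader does once up front
  std::unordered_map<size_t, uint32_t> col_map;
  for (uint32_t i = 0; i < column_names.size(); ++i) {
    col_map[std::hash<std::string>{}(column_names[i])] = i;
  }

  // A parsed key is hashed and looked up to find its output column
  auto const it = col_map.find(std::hash<std::string>{}(std::string{"name"}));
  if (it != col_map.end()) { std::cout << "key 'name' -> column " << it->second << '\n'; }
}
```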
*/ -void convert_json_to_columns(parse_options_view const &options, +void convert_json_to_columns(parse_options_view const& options, device_span data, device_span row_offsets, device_span column_types, - col_map_type *col_map, - device_span output_columns, - device_span valid_fields, + col_map_type* col_map, + device_span output_columns, + device_span valid_fields, device_span num_valid_fields, rmm::cuda_stream_view stream); @@ -68,19 +68,19 @@ void convert_json_to_columns(parse_options_view const &options, * @param[in] data Input data buffer * @param[in] row_offsets The offset of each row in the input * @param[in] num_columns The number of columns of input data - * @param[in] col_map Pointer to the (column name hash -> solumn index) map in device memory. + * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. * nullptr is passed when the input file does not consist of objects. * @param[in] stream CUDA stream used for device memory operations and kernel launches. * * @returns The count for each column data type */ std::vector detect_data_types( - parse_options_view const &options, + parse_options_view const& options, device_span data, device_span row_offsets, bool do_set_null_count, int num_columns, - col_map_type *col_map, + col_map_type* col_map, rmm::cuda_stream_view stream); /** @@ -93,10 +93,10 @@ std::vector detect_data_types( * @param[out] keys_info optional, information (offset, length, hash) for each found key * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -void collect_keys_info(parse_options_view const &options, +void collect_keys_info(parse_options_view const& options, device_span data, device_span row_offsets, - unsigned long long int *keys_cnt, + unsigned long long int* keys_cnt, thrust::optional keys_info, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 4d5eee6cac7..b4395d6c965 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -134,9 +134,9 @@ col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, * @param[in] row_offsets Device array of row start locations in the input buffer * @param[in] stream CUDA stream used for device memory operations and kernel launches * - * @return std::unique_ptr
<table> cudf table with three columns (offsets, lenghts, hashes)
+ * @return std::unique_ptr<table> cudf table with three columns (offsets, lengths, hashes)
 */
-std::unique_ptr<table> create_json_keys_info_table(const parse_options_view &options,
+std::unique_ptr<table> create_json_keys_info_table(const parse_options_view& options,
                                                    device_span<char const> const data,
                                                    device_span<uint64_t const> const row_offsets,
                                                    rmm::cuda_stream_view stream)
@@ -167,7 +167,7 @@ std::unique_ptr<table>
create_json_keys_info_table(const parse_options_view &opt /** * @brief Extract the keys from the JSON file the name offsets/lengths. */ -std::vector create_key_strings(char const *h_data, +std::vector create_key_strings(char const* h_data, table_view sorted_info, rmm::cuda_stream_view stream) { @@ -213,7 +213,7 @@ std::pair, col_map_ptr_type> reader::impl::get_json_obj { auto info = create_json_keys_info_table( opts_.view(), - device_span(static_cast(data_.data()), data_.size()), + device_span(static_cast(data_.data()), data_.size()), rec_starts, stream); @@ -243,7 +243,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) // This allows only mapping of a subset of the file if using byte range if (sources_.empty()) { assert(!filepaths_.empty()); - for (const auto &path : filepaths_) { + for (const auto& path : filepaths_) { sources_.emplace_back(datasource::create(path, range_offset, map_range_size)); } } @@ -251,12 +251,14 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) // Iterate through the user defined sources and read the contents into the local buffer CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; - for (const auto &source : sources_) { total_source_size += source->size(); } + for (const auto& source : sources_) { + total_source_size += source->size(); + } total_source_size = total_source_size - range_offset; buffer_.resize(total_source_size); size_t bytes_read = 0; - for (const auto &source : sources_) { + for (const auto& source : sources_) { if (!source->is_empty()) { auto data_size = (map_range_size != 0) ? map_range_size : source->size(); bytes_read += source->host_read(range_offset, data_size, &buffer_[bytes_read]); @@ -282,12 +284,12 @@ void reader::impl::decompress_input(rmm::cuda_stream_view stream) {{"gz", "gzip"}, {"zip", "zip"}, {"bz2", "bz2"}, {"xz", "xz"}}); if (compression_type == "none") { // Do not use the owner vector here to avoid extra copy - uncomp_data_ = reinterpret_cast(buffer_.data()); + uncomp_data_ = reinterpret_cast(buffer_.data()); uncomp_size_ = buffer_.size(); } else { uncomp_data_owner_ = get_uncompressed_data( // host_span( // - reinterpret_cast(buffer_.data()), + reinterpret_cast(buffer_.data()), buffer_.size()), compression_type); @@ -314,7 +316,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ rmm::device_uvector rec_starts(prefilter_count, stream); - auto *find_result_ptr = rec_starts.data(); + auto* find_result_ptr = rec_starts.data(); // Manually adding an extra row to account for the first row in the file if (byte_range_offset_ == 0) { find_result_ptr++; @@ -372,7 +374,7 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. 
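`ingest_raw_input` above sizes all sources first, then reads each one into a single buffer at a running offset. A self-contained host sketch of that two-pass concatenation, using `std::ifstream` in place of cudf's `datasource` (an assumption for illustration; error handling omitted):

```cpp
#include <fstream>
#include <iostream>
#include <vector>

int main(int argc, char** argv)
{
  // First pass: open every source and accumulate the total size
  std::vector<std::ifstream> sources;
  size_t total_source_size = 0;
  for (int i = 1; i < argc; ++i) {
    sources.emplace_back(argv[i], std::ios::binary | std::ios::ate);
    total_source_size += static_cast<size_t>(sources.back().tellg());
  }

  std::vector<char> buffer(total_source_size);

  // Second pass: read each source into the buffer at the running offset
  size_t bytes_read = 0;
  for (auto& src : sources) {
    auto const data_size = static_cast<size_t>(src.tellg());
    src.seekg(0);
    src.read(buffer.data() + bytes_read, data_size);
    bytes_read += data_size;
  }
  std::cout << "read " << bytes_read << " bytes from " << sources.size() << " sources\n";
}
```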
*/ -void reader::impl::upload_data_to_device(rmm::device_uvector &rec_starts, +void reader::impl::upload_data_to_device(rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream) { size_t start_offset = 0; @@ -472,7 +474,7 @@ void reader::impl::set_data_types(device_span rec_starts, // Assume that the dtype is in dictionary format only if all elements contain a colon const bool is_dict = - std::all_of(std::cbegin(dtype), std::cend(dtype), [](const std::string &s) { + std::all_of(std::cbegin(dtype), std::cend(dtype), [](const std::string& s) { return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s); }); @@ -487,7 +489,7 @@ void reader::impl::set_data_types(device_span rec_starts, std::cbegin(dtype), std::cend(dtype), std::inserter(col_type_map, col_type_map.end()), - [&](auto const &ts) { + [&](auto const& ts) { auto const [col_name, type_str] = split_on_colon(ts); return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})}; }); @@ -496,12 +498,12 @@ void reader::impl::set_data_types(device_span rec_starts, std::transform(std::cbegin(metadata_.column_names), std::cend(metadata_.column_names), std::back_inserter(dtypes_), - [&](auto const &column_name) { return col_type_map[column_name]; }); + [&](auto const& column_name) { return col_type_map[column_name]; }); } else { std::transform(std::cbegin(dtype), std::cend(dtype), std::back_inserter(dtypes_), - [](auto const &col_dtype) { return convert_string_to_dtype(col_dtype); }); + [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); }); } } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); @@ -510,14 +512,14 @@ void reader::impl::set_data_types(device_span rec_starts, auto const h_column_infos = cudf::io::json::gpu::detect_data_types( opts_.view(), - device_span(static_cast(data_.data()), data_.size()), + device_span(static_cast(data_.data()), data_.size()), rec_starts, do_set_null_count, num_columns, get_column_map_device_ptr(), stream); - auto get_type_id = [&](auto const &cinfo) { + auto get_type_id = [&](auto const& cinfo) { auto int_count_total = cinfo.big_int_count + cinfo.negative_small_int_count + cinfo.positive_small_int_count; if (cinfo.null_count == static_cast(rec_starts.size())) { @@ -545,7 +547,7 @@ void reader::impl::set_data_types(device_span rec_starts, std::transform(std::cbegin(h_column_infos), std::cend(h_column_infos), std::back_inserter(dtypes_), - [&](auto const &cinfo) { return data_type{get_type_id(cinfo)}; }); + [&](auto const& cinfo) { return data_type{get_type_id(cinfo)}; }); } } @@ -562,8 +564,8 @@ table_with_metadata reader::impl::convert_data_to_table(device_span h_dtypes(num_columns); - thrust::host_vector h_data(num_columns); - thrust::host_vector h_valid(num_columns); + thrust::host_vector h_data(num_columns); + thrust::host_vector h_valid(num_columns); for (size_t i = 0; i < num_columns; ++i) { h_dtypes[i] = dtypes_[i]; @@ -572,14 +574,14 @@ table_with_metadata reader::impl::convert_data_to_table(device_span(h_dtypes, stream); - auto d_data = cudf::detail::make_device_uvector_async(h_data, stream); - auto d_valid = cudf::detail::make_device_uvector_async(h_valid, stream); + auto d_data = cudf::detail::make_device_uvector_async(h_data, stream); + auto d_valid = cudf::detail::make_device_uvector_async(h_valid, stream); auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async(num_columns, stream); cudf::io::json::gpu::convert_json_to_columns( opts_.view(), - device_span(static_cast(data_.data()), 
data_.size()), + device_span(static_cast(data_.data()), data_.size()), rec_starts, d_dtypes, get_column_map_device_ptr(), @@ -632,11 +634,11 @@ table_with_metadata reader::impl::convert_data_to_table(device_span(std::move(out_columns)), metadata_}; } -reader::impl::impl(std::vector> &&sources, - std::vector const &filepaths, - json_reader_options const &options, +reader::impl::impl(std::vector>&& sources, + std::vector const& filepaths, + json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : options_(options), mr_(mr), sources_(std::move(sources)), filepaths_(filepaths) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -657,7 +659,7 @@ reader::impl::impl(std::vector> &&sources, * * @return Table and its metadata */ -table_with_metadata reader::impl::read(json_reader_options const &options, +table_with_metadata reader::impl::read(json_reader_options const& options, rmm::cuda_stream_view stream) { auto range_offset = options.get_byte_range_offset(); @@ -686,10 +688,10 @@ table_with_metadata reader::impl::read(json_reader_options const &options, } // Forward to implementation -reader::reader(std::vector const &filepaths, - json_reader_options const &options, +reader::reader(std::vector const& filepaths, + json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { // Delay actual instantiation of data source until read to allow for // partial memory mapping of file using byte ranges @@ -698,10 +700,10 @@ reader::reader(std::vector const &filepaths, } // Forward to implementation -reader::reader(std::vector> &&sources, - json_reader_options const &options, +reader::reader(std::vector>&& sources, + json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { std::vector file_paths = {}; // Empty filepaths _impl = std::make_unique(std::move(sources), file_paths, options, stream, mr); @@ -711,7 +713,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(json_reader_options const &options, rmm::cuda_stream_view stream) +table_with_metadata reader::read(json_reader_options const& options, rmm::cuda_stream_view stream) { return table_with_metadata{_impl->read(options, stream)}; } diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index f22653303ce..bbda7e9ba74 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -44,7 +44,7 @@ using namespace cudf::io::json; using namespace cudf::io; using col_map_type = cudf::io::json::gpu::col_map_type; -using col_map_ptr_type = std::unique_ptr>; +using col_map_ptr_type = std::unique_ptr>; /** * @brief Class used to parse Json input and convert it into gdf columns. 
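The forwarding constructors above are the usual pimpl arrangement: the public `reader` owns only a `unique_ptr` to `impl` and delegates every call. A minimal standalone sketch of that pattern, with hypothetical, simplified signatures:

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <utility>

class reader {
 public:
  explicit reader(std::string source);
  ~reader();  // defined out of line, where impl is complete
  std::string read();

 private:
  class impl;                   // declared here, defined below
  std::unique_ptr<impl> _impl;  // all state and logic live behind this pointer
};

class reader::impl {
 public:
  explicit impl(std::string source) : source_{std::move(source)} {}
  std::string read() { return "parsed: " + source_; }

 private:
  std::string source_;
};

// Forward to implementation, as the real reader does
reader::reader(std::string source) : _impl{std::make_unique<impl>(std::move(source))} {}
reader::~reader() = default;
std::string reader::read() { return _impl->read(); }

int main() { std::cout << reader{"rows.jsonl"}.read() << '\n'; }
```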
@@ -54,13 +54,13 @@ class reader::impl { private: const json_reader_options options_{}; - rmm::mr::device_memory_resource *mr_ = nullptr; + rmm::mr::device_memory_resource* mr_ = nullptr; std::vector> sources_; std::vector filepaths_; std::vector buffer_; - const char *uncomp_data_ = nullptr; + const char* uncomp_data_ = nullptr; size_t uncomp_size_ = 0; // Used when the input data is compressed, to ensure the allocated uncompressed data is freed @@ -87,7 +87,7 @@ class reader::impl { * @brief Sets the column map data member and makes a device copy to be used as a kernel * parameter. */ - void set_column_map(col_map_ptr_type &&map, rmm::cuda_stream_view stream) + void set_column_map(col_map_ptr_type&& map, rmm::cuda_stream_view stream) { key_to_col_idx_map_ = std::move(map); d_key_col_map_ = @@ -145,7 +145,7 @@ class reader::impl { * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ - void upload_data_to_device(rmm::device_uvector &rec_starts, + void upload_data_to_device(rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream); /** @@ -183,11 +183,11 @@ class reader::impl { /** * @brief Constructor from a dataset source with reader options. */ - explicit impl(std::vector> &&sources, - std::vector const &filepaths, - json_reader_options const &options, + explicit impl(std::vector>&& sources, + std::vector const& filepaths, + json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Read an entire set or a subset of data from the source @@ -197,7 +197,7 @@ class reader::impl { * * @return Table and its metadata */ - table_with_metadata read(json_reader_options const &options, rmm::cuda_stream_view stream); + table_with_metadata read(json_reader_options const& options, rmm::cuda_stream_view stream); }; } // namespace json diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index e69a61bde66..ef39e475b93 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -54,7 +54,7 @@ static inline __device__ uint32_t hash_string(const string_view val) if (val.empty()) { return 0; } else { - char const *ptr = val.data(); + char const* ptr = val.data(); uint32_t len = val.size_bytes(); return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1); } @@ -68,13 +68,13 @@ static inline __device__ uint32_t hash_string(const string_view val) * @param[in] temp_storage shared memory storage to scan non-null positions */ template -static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, +static __device__ void LoadNonNullIndices(volatile dictinit_state_s* s, int t, - Storage &temp_storage) + Storage& temp_storage) { if (t == 0) { s->nnz = 0; } for (uint32_t i = 0; i < s->chunk.num_rows; i += block_size) { - const uint32_t *valid_map = s->chunk.leaf_column->null_mask(); + const uint32_t* valid_map = s->chunk.leaf_column->null_mask(); auto column_offset = s->chunk.leaf_column->offset(); uint32_t is_valid, nz_pos; if (t < block_size / 32) { @@ -120,12 +120,12 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, // blockDim {block_size,1,1} template __global__ void __launch_bounds__(block_size, 2) - gpuInitDictionaryIndices(DictionaryChunk *chunks, + gpuInitDictionaryIndices(DictionaryChunk* chunks, const table_device_view view, - uint32_t *dict_data, - uint32_t *dict_index, + uint32_t* dict_data, + uint32_t* dict_index, 
size_t row_index_stride, - size_type *str_col_ids, + size_type* str_col_ids, uint32_t num_columns) { __shared__ __align__(16) dictinit_state_s state_g; @@ -138,14 +138,14 @@ __global__ void __launch_bounds__(block_size, 2) typename block_scan::TempStorage scan_storage; } temp_storage; - dictinit_state_s *const s = &state_g; + dictinit_state_s* const s = &state_g; uint32_t col_id = blockIdx.x; uint32_t group_id = blockIdx.y; uint32_t nnz, start_row, dict_char_count; int t = threadIdx.x; if (t == 0) { - column_device_view *leaf_column_view = view.begin() + str_col_ids[col_id]; + column_device_view* leaf_column_view = view.begin() + str_col_ids[col_id]; s->chunk = chunks[group_id * num_columns + col_id]; s->chunk.leaf_column = leaf_column_view; s->chunk.dict_data = @@ -305,21 +305,21 @@ __global__ void __launch_bounds__(block_size, 2) */ // blockDim {1024,1,1} extern "C" __global__ void __launch_bounds__(1024) - gpuCompactChunkDictionaries(StripeDictionary *stripes, - DictionaryChunk const *chunks, + gpuCompactChunkDictionaries(StripeDictionary* stripes, + DictionaryChunk const* chunks, uint32_t num_columns) { __shared__ __align__(16) StripeDictionary stripe_g; __shared__ __align__(16) DictionaryChunk chunk_g; - __shared__ const uint32_t *volatile ck_curptr_g; + __shared__ const uint32_t* volatile ck_curptr_g; __shared__ uint32_t volatile ck_curlen_g; uint32_t col_id = blockIdx.x; uint32_t stripe_id = blockIdx.y; uint32_t chunk_len; int t = threadIdx.x; - const uint32_t *src; - uint32_t *dst; + const uint32_t* src; + uint32_t* dst; if (t == 0) stripe_g = stripes[stripe_id * num_columns + col_id]; __syncthreads(); @@ -365,7 +365,7 @@ struct build_state_s { // blockDim {1024,1,1} template __global__ void __launch_bounds__(block_size) - gpuBuildStripeDictionaries(StripeDictionary *stripes, uint32_t num_columns) + gpuBuildStripeDictionaries(StripeDictionary* stripes, uint32_t num_columns) { __shared__ __align__(16) build_state_s state_g; using block_reduce = cub::BlockReduce; @@ -375,7 +375,7 @@ __global__ void __launch_bounds__(block_size) typename block_scan::TempStorage scan_storage; } temp_storage; - build_state_s *const s = &state_g; + build_state_s* const s = &state_g; uint32_t col_id = blockIdx.x; uint32_t stripe_id = blockIdx.y; uint32_t num_strings; @@ -427,12 +427,12 @@ __global__ void __launch_bounds__(block_size) /** * @copydoc cudf::io::orc::gpu::InitDictionaryIndices */ -void InitDictionaryIndices(const table_device_view &view, - DictionaryChunk *chunks, - uint32_t *dict_data, - uint32_t *dict_index, +void InitDictionaryIndices(const table_device_view& view, + DictionaryChunk* chunks, + uint32_t* dict_data, + uint32_t* dict_index, size_t row_index_stride, - size_type *str_col_ids, + size_type* str_col_ids, uint32_t num_columns, uint32_t num_rowgroups, rmm::cuda_stream_view stream) @@ -447,9 +447,9 @@ void InitDictionaryIndices(const table_device_view &view, /** * @copydoc cudf::io::orc::gpu::BuildStripeDictionaries */ -void BuildStripeDictionaries(StripeDictionary *stripes, - StripeDictionary *stripes_host, - DictionaryChunk const *chunks, +void BuildStripeDictionaries(StripeDictionary* stripes, + StripeDictionary* stripes_host, + DictionaryChunk const* chunks, uint32_t num_stripes, uint32_t num_rowgroups, uint32_t num_columns, @@ -463,12 +463,12 @@ void BuildStripeDictionaries(StripeDictionary *stripes, if (stripes_host[i].dict_data != nullptr) { thrust::device_ptr dict_data_ptr = thrust::device_pointer_cast(stripes_host[i].dict_data); - column_device_view *string_column = 
stripes_host[i].leaf_column; + column_device_view* string_column = stripes_host[i].leaf_column; // NOTE: Requires the --expt-extended-lambda nvcc flag thrust::sort(rmm::exec_policy(stream), dict_data_ptr, dict_data_ptr + stripes_host[i].num_strings, - [string_column] __device__(const uint32_t &lhs, const uint32_t &rhs) { + [string_column] __device__(const uint32_t& lhs, const uint32_t& rhs) { return string_column->element(lhs) < string_column->element(rhs); }); diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index ea6d6b6ac85..287364c3191 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -23,7 +23,7 @@ namespace cudf { namespace io { namespace orc { -uint32_t ProtobufReader::read_field_size(const uint8_t *end) +uint32_t ProtobufReader::read_field_size(const uint8_t* end) { auto const size = get(); CUDF_EXPECTS(size <= static_cast(end - m_cur), "Protobuf parsing out of bounds"); @@ -37,13 +37,11 @@ void ProtobufReader::skip_struct_field(int t) case PB_TYPE_FIXED64: skip_bytes(8); break; case PB_TYPE_FIXEDLEN: skip_bytes(get()); break; case PB_TYPE_FIXED32: skip_bytes(4); break; - default: - // printf("invalid type (%d)\n", t); - break; + default: break; } } -void ProtobufReader::read(PostScript &s, size_t maxlen) +void ProtobufReader::read(PostScript& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.footerLength), make_field_reader(2, s.compression), @@ -54,7 +52,7 @@ void ProtobufReader::read(PostScript &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(FileFooter &s, size_t maxlen) +void ProtobufReader::read(FileFooter& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.headerLength), make_field_reader(2, s.contentLength), @@ -67,7 +65,7 @@ void ProtobufReader::read(FileFooter &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(StripeInformation &s, size_t maxlen) +void ProtobufReader::read(StripeInformation& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.offset), make_field_reader(2, s.indexLength), @@ -77,7 +75,7 @@ void ProtobufReader::read(StripeInformation &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(SchemaType &s, size_t maxlen) +void ProtobufReader::read(SchemaType& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.kind), make_packed_field_reader(2, s.subtypes), @@ -88,13 +86,13 @@ void ProtobufReader::read(SchemaType &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(UserMetadataItem &s, size_t maxlen) +void ProtobufReader::read(UserMetadataItem& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.name), make_field_reader(2, s.value)); function_builder(s, maxlen, op); } -void ProtobufReader::read(StripeFooter &s, size_t maxlen) +void ProtobufReader::read(StripeFooter& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.streams), make_field_reader(2, s.columns), @@ -102,7 +100,7 @@ void ProtobufReader::read(StripeFooter &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(Stream &s, size_t maxlen) +void ProtobufReader::read(Stream& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.kind), make_field_reader(2, s.column_id), @@ -110,59 +108,59 @@ void ProtobufReader::read(Stream &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(ColumnEncoding &s, size_t maxlen) +void ProtobufReader::read(ColumnEncoding& s, size_t maxlen) { auto op = 
std::make_tuple(make_field_reader(1, s.kind), make_field_reader(2, s.dictionarySize)); function_builder(s, maxlen, op); } -void ProtobufReader::read(integer_statistics &s, size_t maxlen) +void ProtobufReader::read(integer_statistics& s, size_t maxlen) { auto op = std::make_tuple( make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), make_field_reader(3, s.sum)); function_builder(s, maxlen, op); } -void ProtobufReader::read(double_statistics &s, size_t maxlen) +void ProtobufReader::read(double_statistics& s, size_t maxlen) { auto op = std::make_tuple( make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), make_field_reader(3, s.sum)); function_builder(s, maxlen, op); } -void ProtobufReader::read(string_statistics &s, size_t maxlen) +void ProtobufReader::read(string_statistics& s, size_t maxlen) { auto op = std::make_tuple( make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), make_field_reader(3, s.sum)); function_builder(s, maxlen, op); } -void ProtobufReader::read(bucket_statistics &s, size_t maxlen) +void ProtobufReader::read(bucket_statistics& s, size_t maxlen) { auto op = std::make_tuple(make_packed_field_reader(1, s.count)); function_builder(s, maxlen, op); } -void ProtobufReader::read(decimal_statistics &s, size_t maxlen) +void ProtobufReader::read(decimal_statistics& s, size_t maxlen) { auto op = std::make_tuple( make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), make_field_reader(3, s.sum)); function_builder(s, maxlen, op); } -void ProtobufReader::read(date_statistics &s, size_t maxlen) +void ProtobufReader::read(date_statistics& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.minimum), make_field_reader(2, s.maximum)); function_builder(s, maxlen, op); } -void ProtobufReader::read(binary_statistics &s, size_t maxlen) +void ProtobufReader::read(binary_statistics& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.sum)); function_builder(s, maxlen, op); } -void ProtobufReader::read(timestamp_statistics &s, size_t maxlen) +void ProtobufReader::read(timestamp_statistics& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.minimum), make_field_reader(2, s.maximum), @@ -171,7 +169,7 @@ void ProtobufReader::read(timestamp_statistics &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(column_statistics &s, size_t maxlen) +void ProtobufReader::read(column_statistics& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.number_of_values), make_field_reader(2, s.int_stats), @@ -185,13 +183,13 @@ void ProtobufReader::read(column_statistics &s, size_t maxlen) function_builder(s, maxlen, op); } -void ProtobufReader::read(StripeStatistics &s, size_t maxlen) +void ProtobufReader::read(StripeStatistics& s, size_t maxlen) { auto op = std::make_tuple(make_raw_field_reader(1, s.colStats)); function_builder(s, maxlen, op); } -void ProtobufReader::read(Metadata &s, size_t maxlen) +void ProtobufReader::read(Metadata& s, size_t maxlen) { auto op = std::make_tuple(make_field_reader(1, s.stripeStats)); function_builder(s, maxlen, op); @@ -245,7 +243,7 @@ void ProtobufWriter::put_row_index_entry(int32_t present_blk, m_buf->data()[lpos + 2] = (uint8_t)(sz); } -size_t ProtobufWriter::write(const PostScript &s) +size_t ProtobufWriter::write(const PostScript& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.footerLength); @@ -257,7 +255,7 @@ size_t ProtobufWriter::write(const PostScript &s) return w.value(); } -size_t ProtobufWriter::write(const FileFooter &s) 
+size_t ProtobufWriter::write(const FileFooter& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.headerLength); @@ -271,7 +269,7 @@ size_t ProtobufWriter::write(const FileFooter &s) return w.value(); } -size_t ProtobufWriter::write(const StripeInformation &s) +size_t ProtobufWriter::write(const StripeInformation& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.offset); @@ -282,7 +280,7 @@ size_t ProtobufWriter::write(const StripeInformation &s) return w.value(); } -size_t ProtobufWriter::write(const SchemaType &s) +size_t ProtobufWriter::write(const SchemaType& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.kind); @@ -294,7 +292,7 @@ size_t ProtobufWriter::write(const SchemaType &s) return w.value(); } -size_t ProtobufWriter::write(const UserMetadataItem &s) +size_t ProtobufWriter::write(const UserMetadataItem& s) { ProtobufFieldWriter w(this); w.field_string(1, s.name); @@ -302,7 +300,7 @@ size_t ProtobufWriter::write(const UserMetadataItem &s) return w.value(); } -size_t ProtobufWriter::write(const StripeFooter &s) +size_t ProtobufWriter::write(const StripeFooter& s) { ProtobufFieldWriter w(this); w.field_repeated_struct(1, s.streams); @@ -311,7 +309,7 @@ size_t ProtobufWriter::write(const StripeFooter &s) return w.value(); } -size_t ProtobufWriter::write(const Stream &s) +size_t ProtobufWriter::write(const Stream& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.kind); @@ -320,7 +318,7 @@ size_t ProtobufWriter::write(const Stream &s) return w.value(); } -size_t ProtobufWriter::write(const ColumnEncoding &s) +size_t ProtobufWriter::write(const ColumnEncoding& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.kind); @@ -328,14 +326,14 @@ size_t ProtobufWriter::write(const ColumnEncoding &s) return w.value(); } -size_t ProtobufWriter::write(const StripeStatistics &s) +size_t ProtobufWriter::write(const StripeStatistics& s) { ProtobufFieldWriter w(this); w.field_repeated_struct_blob(1, s.colStats); return w.value(); } -size_t ProtobufWriter::write(const Metadata &s) +size_t ProtobufWriter::write(const Metadata& s) { ProtobufFieldWriter w(this); w.field_repeated_struct(1, s.stripeStats); @@ -376,7 +374,7 @@ OrcDecompressor::OrcDecompressor(CompressionKind kind, uint32_t blockSize) * * @returns pointer to uncompressed data, nullptr if error */ -const uint8_t *OrcDecompressor::Decompress(const uint8_t *srcBytes, size_t srcLen, size_t *dstLen) +const uint8_t* OrcDecompressor::Decompress(const uint8_t* srcBytes, size_t srcLen, size_t* dstLen) { // If uncompressed, just pass-through the input if (m_kind == NONE) { @@ -431,7 +429,7 @@ const uint8_t *OrcDecompressor::Decompress(const uint8_t *srcBytes, size_t srcLe return m_buf.data(); } -metadata::metadata(datasource *const src) : source(src) +metadata::metadata(datasource* const src) : source(src) { const auto len = source->size(); const auto max_ps_size = std::min(len, static_cast(256)); @@ -439,7 +437,7 @@ metadata::metadata(datasource *const src) : source(src) // Read uncompressed postscript section (max 255 bytes + 1 byte for length) auto buffer = source->host_read(len - max_ps_size, max_ps_size); const size_t ps_length = buffer->data()[max_ps_size - 1]; - const uint8_t *ps_data = &buffer->data()[max_ps_size - ps_length - 1]; + const uint8_t* ps_data = &buffer->data()[max_ps_size - ps_length - 1]; ProtobufReader(ps_data, ps_length).read(ps); CUDF_EXPECTS(ps.footerLength + ps_length < len, "Invalid footer length"); @@ -461,30 +459,23 @@ metadata::metadata(datasource *const src) : source(src) auto md_data = 
decompressor->Decompress(buffer->data(), ps.metadataLength, &md_length); orc::ProtobufReader(md_data, md_length).read(md); - // Initilize the column names + // Initialize the column names init_column_names(); } void metadata::init_column_names() const { auto const schema_idxs = get_schema_indexes(); - auto const &types = ff.types; + auto const& types = ff.types; for (int32_t col_id = 0; col_id < get_num_columns(); ++col_id) { std::string col_name; - uint32_t parent_idx = col_id; - uint32_t idx = col_id; - do { - idx = parent_idx; - parent_idx = (idx < types.size()) ? static_cast(schema_idxs[idx].parent) : ~0; - if (parent_idx >= types.size()) break; - - auto const field_idx = - (parent_idx < types.size()) ? static_cast(schema_idxs[idx].field) : ~0; + if (schema_idxs[col_id].parent >= 0 and schema_idxs[col_id].field >= 0) { + auto const parent_idx = static_cast(schema_idxs[col_id].parent); + auto const field_idx = static_cast(schema_idxs[col_id].field); if (field_idx < types[parent_idx].fieldNames.size()) { - col_name = - types[parent_idx].fieldNames[field_idx] + (col_name.empty() ? "" : ("." + col_name)); + col_name = types[parent_idx].fieldNames[field_idx]; } - } while (parent_idx != idx); + } // If we have no name (root column), generate a name column_names.push_back(col_name.empty() ? "col" + std::to_string(col_id) : col_name); } @@ -496,7 +487,7 @@ std::vector metadata::get_schema_indexes() const auto const schema_size = static_cast(result.size()); for (uint32_t i = 0; i < schema_size; i++) { - auto const &subtypes = ff.types[i].subtypes; + auto const& subtypes = ff.types[i].subtypes; auto const num_children = static_cast(subtypes.size()); if (result[i].parent == -1) { // Not initialized result[i].parent = i; // set root node as its own parent diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h index e6fec8afb0f..474f404be0f 100644 --- a/cpp/src/io/orc/orc.h +++ b/cpp/src/io/orc/orc.h @@ -135,32 +135,32 @@ struct Metadata { */ class ProtobufReader { public: - ProtobufReader(const uint8_t *base, size_t len) : m_base(base), m_cur(base), m_end(base + len) {} + ProtobufReader(const uint8_t* base, size_t len) : m_base(base), m_cur(base), m_end(base + len) {} template - void read(T &s) + void read(T& s) { read(s, m_end - m_cur); } - void read(PostScript &, size_t maxlen); - void read(FileFooter &, size_t maxlen); - void read(StripeInformation &, size_t maxlen); - void read(SchemaType &, size_t maxlen); - void read(UserMetadataItem &, size_t maxlen); - void read(StripeFooter &, size_t maxlen); - void read(Stream &, size_t maxlen); - void read(ColumnEncoding &, size_t maxlen); - void read(integer_statistics &, size_t maxlen); - void read(double_statistics &, size_t maxlen); - void read(string_statistics &, size_t maxlen); - void read(bucket_statistics &, size_t maxlen); - void read(decimal_statistics &, size_t maxlen); - void read(date_statistics &, size_t maxlen); - void read(binary_statistics &, size_t maxlen); - void read(timestamp_statistics &, size_t maxlen); - void read(column_statistics &, size_t maxlen); - void read(StripeStatistics &, size_t maxlen); - void read(Metadata &, size_t maxlen); + void read(PostScript&, size_t maxlen); + void read(FileFooter&, size_t maxlen); + void read(StripeInformation&, size_t maxlen); + void read(SchemaType&, size_t maxlen); + void read(UserMetadataItem&, size_t maxlen); + void read(StripeFooter&, size_t maxlen); + void read(Stream&, size_t maxlen); + void read(ColumnEncoding&, size_t maxlen); + void read(integer_statistics&, size_t maxlen); + 
void read(double_statistics&, size_t maxlen); + void read(string_statistics&, size_t maxlen); + void read(bucket_statistics&, size_t maxlen); + void read(decimal_statistics&, size_t maxlen); + void read(date_statistics&, size_t maxlen); + void read(binary_statistics&, size_t maxlen); + void read(timestamp_statistics&, size_t maxlen); + void read(column_statistics&, size_t maxlen); + void read(StripeStatistics&, size_t maxlen); + void read(Metadata&, size_t maxlen); private: template @@ -178,11 +178,11 @@ class ProtobufReader { void skip_struct_field(int t); template - void function_builder(T &s, size_t maxlen, std::tuple &op); + void function_builder(T& s, size_t maxlen, std::tuple& op); template ::value and - !std::is_enum::value> * = nullptr> + !std::is_enum::value>* = nullptr> int static constexpr encode_field_number_base(int field_number) noexcept { return (field_number * 8) + PB_TYPE_FIXEDLEN; @@ -190,21 +190,21 @@ class ProtobufReader { template ::value or - std::is_enum::value> * = nullptr> + std::is_enum::value>* = nullptr> int static constexpr encode_field_number_base(int field_number) noexcept { return (field_number * 8) + PB_TYPE_VARINT; } template ::value> * = nullptr> + typename std::enable_if_t::value>* = nullptr> int static constexpr encode_field_number_base(int field_number) noexcept { return (field_number * 8) + PB_TYPE_FIXED32; } template ::value> * = nullptr> + typename std::enable_if_t::value>* = nullptr> int static constexpr encode_field_number_base(int field_number) noexcept { return (field_number * 8) + PB_TYPE_FIXED64; @@ -212,7 +212,7 @@ class ProtobufReader { template ::value or - std::is_same::value> * = nullptr> + std::is_same::value>* = nullptr> int static constexpr encode_field_number(int field_number) noexcept { return encode_field_number_base(field_number); @@ -220,8 +220,8 @@ class ProtobufReader { // containters change the field number encoding template >::value> - * = nullptr> + typename std::enable_if_t< + std::is_same>::value>* = nullptr> int static constexpr encode_field_number(int field_number) noexcept { return encode_field_number_base(field_number); @@ -229,49 +229,49 @@ class ProtobufReader { // optional fields don't change the field number encoding template >::value> - * = nullptr> + typename std::enable_if_t< + std::is_same>::value>* = nullptr> int static constexpr encode_field_number(int field_number) noexcept { return encode_field_number_base(field_number); } - uint32_t read_field_size(const uint8_t *end); + uint32_t read_field_size(const uint8_t* end); - template ::value> * = nullptr> - void read_field(T &value, const uint8_t *end) + template ::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { value = get(); } - template ::value> * = nullptr> - void read_field(T &value, const uint8_t *end) + template ::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { value = static_cast(get()); } - template ::value> * = nullptr> - void read_field(T &value, const uint8_t *end) + template ::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { auto const size = read_field_size(end); - value.assign(reinterpret_cast(m_cur), size); + value.assign(reinterpret_cast(m_cur), size); m_cur += size; } template >::value> * = nullptr> - void read_field(T &value, const uint8_t *end) + typename std::enable_if_t>::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { auto const size = read_field_size(end); - value.emplace_back(reinterpret_cast(m_cur), size); + value.emplace_back(reinterpret_cast(m_cur), 
size); m_cur += size; } - template >::value and - !std::is_same::value> * = nullptr> - void read_field(T &value, const uint8_t *end) + template < + typename T, + typename std::enable_if_t>::value and + !std::is_same::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { auto const size = read_field_size(end); value.emplace_back(); @@ -279,9 +279,9 @@ class ProtobufReader { } template >::value> - * = nullptr> - void read_field(T &value, const uint8_t *end) + typename std::enable_if_t< + std::is_same>::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { typename T::value_type contained_value; read_field(contained_value, end); @@ -289,29 +289,30 @@ class ProtobufReader { } template - auto read_field(T &value, const uint8_t *end) -> decltype(read(value, 0)) + auto read_field(T& value, const uint8_t* end) -> decltype(read(value, 0)) { auto const size = read_field_size(end); read(value, size); } - template ::value> * = nullptr> - void read_field(T &value, const uint8_t *end) + template ::value>* = nullptr> + void read_field(T& value, const uint8_t* end) { memcpy(&value, m_cur, sizeof(T)); m_cur += sizeof(T); } template - void read_packed_field(T &value, const uint8_t *end) + void read_packed_field(T& value, const uint8_t* end) { auto const len = get(); auto const field_end = std::min(m_cur + len, end); - while (m_cur < field_end) value.push_back(get()); + while (m_cur < field_end) + value.push_back(get()); } template - void read_raw_field(T &value, const uint8_t *end) + void read_raw_field(T& value, const uint8_t* end) { auto const size = read_field_size(end); value.emplace_back(m_cur, m_cur + size); @@ -321,14 +322,14 @@ class ProtobufReader { template struct field_reader { int const encoded_field_number; - T &output_value; + T& output_value; - field_reader(int field_number, T &field_value) + field_reader(int field_number, T& field_value) : encoded_field_number(encode_field_number(field_number)), output_value(field_value) { } - inline void operator()(ProtobufReader *pbr, const uint8_t *end) + inline void operator()(ProtobufReader* pbr, const uint8_t* end) { pbr->read_field(output_value, end); } @@ -337,14 +338,14 @@ class ProtobufReader { template struct packed_field_reader { int const encoded_field_number; - T &output_value; + T& output_value; - packed_field_reader(int field_number, T &field_value) + packed_field_reader(int field_number, T& field_value) : encoded_field_number(encode_field_number(field_number)), output_value(field_value) { } - inline void operator()(ProtobufReader *pbr, const uint8_t *end) + inline void operator()(ProtobufReader* pbr, const uint8_t* end) { pbr->read_packed_field(output_value, end); } @@ -353,22 +354,22 @@ class ProtobufReader { template struct raw_field_reader { int const encoded_field_number; - T &output_value; + T& output_value; - raw_field_reader(int field_number, T &field_value) + raw_field_reader(int field_number, T& field_value) : encoded_field_number(encode_field_number(field_number)), output_value(field_value) { } - inline void operator()(ProtobufReader *pbr, const uint8_t *end) + inline void operator()(ProtobufReader* pbr, const uint8_t* end) { pbr->read_raw_field(output_value, end); } }; - const uint8_t *const m_base; - const uint8_t *m_cur; - const uint8_t *const m_end; + const uint8_t* const m_base; + const uint8_t* m_cur; + const uint8_t* const m_end; public: /** @@ -381,7 +382,7 @@ class ProtobufReader { * @return the field reader object of the right type */ template - static auto make_field_reader(int field_number, 
T &field_value) + static auto make_field_reader(int field_number, T& field_value) { return field_reader(field_number, field_value); } @@ -395,7 +396,7 @@ class ProtobufReader { * @return the packed field reader object of the right type */ template - static auto make_packed_field_reader(int field_number, T &field_value) + static auto make_packed_field_reader(int field_number, T& field_value) { return packed_field_reader(field_number, field_value); } @@ -410,7 +411,7 @@ class ProtobufReader { * @return the raw field reader object of the right type */ template - static auto make_raw_field_reader(int field_number, T &field_value) + static auto make_raw_field_reader(int field_number, T& field_value) { return raw_field_reader(field_number, field_value); } @@ -469,7 +470,7 @@ inline int64_t ProtobufReader::get() class ProtobufWriter { public: ProtobufWriter() { m_buf = nullptr; } - ProtobufWriter(std::vector *output) { m_buf = output; } + ProtobufWriter(std::vector* output) { m_buf = output; } void putb(uint8_t v) { m_buf->push_back(v); } uint32_t put_uint(uint64_t v) { @@ -496,19 +497,19 @@ class ProtobufWriter { TypeKind kind); public: - size_t write(const PostScript &); - size_t write(const FileFooter &); - size_t write(const StripeInformation &); - size_t write(const SchemaType &); - size_t write(const UserMetadataItem &); - size_t write(const StripeFooter &); - size_t write(const Stream &); - size_t write(const ColumnEncoding &); - size_t write(const StripeStatistics &); - size_t write(const Metadata &); + size_t write(const PostScript&); + size_t write(const FileFooter&); + size_t write(const StripeInformation&); + size_t write(const SchemaType&); + size_t write(const UserMetadataItem&); + size_t write(const StripeFooter&); + size_t write(const Stream&); + size_t write(const ColumnEncoding&); + size_t write(const StripeStatistics&); + size_t write(const Metadata&); protected: - std::vector *m_buf; + std::vector* m_buf; struct ProtobufFieldWriter; }; @@ -519,7 +520,7 @@ class ProtobufWriter { class OrcDecompressor { public: OrcDecompressor(CompressionKind kind, uint32_t blockSize); - const uint8_t *Decompress(const uint8_t *srcBytes, size_t srcLen, size_t *dstLen); + const uint8_t* Decompress(const uint8_t* srcBytes, size_t srcLen, size_t* dstLen); uint32_t GetLog2MaxCompressionRatio() const { return m_log2MaxRatio; } uint32_t GetMaxUncompressedBlockSize(uint32_t block_len) const { @@ -537,12 +538,38 @@ class OrcDecompressor { std::vector m_buf; }; +/** + * @brief Stores orc id for each column and its adjacent number of children + * in case of struct or number of children in case of list column. + * If list column has struct column, then all child columns of that struct are treated as child + * column of list. + * + * @code{.pseudo} + * Consider following data where a struct has two members and a list column + * {"struct": [{"a": 1, "b": 2}, {"a":3, "b":5}], "list":[[1, 2], [2, 3]]} + * + * `orc_column_meta` for struct column would be + * id = 0 + * num_children = 2 + * + * `orc_column_meta` for list column would be + * id = 3 + * num_children = 1 + * @endcode + * + */ +struct orc_column_meta { + // orc_column_meta(uint32_t _id, uint32_t _num_children) : id(_id), num_children(_num_children){}; + uint32_t id; // orc id for the column + uint32_t num_children; // number of children at the same level of nesting in case of struct +}; + /** * @brief A helper class for ORC file metadata. Provides some additional * convenience methods for initializing and accessing metadata. 
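The `orc_column_meta` documentation above pairs each ORC column id with its adjacent child count. A tiny sketch populating it for the struct-plus-list example from that doc block:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

struct orc_column_meta {
  uint32_t id;            // orc id for the column
  uint32_t num_children;  // number of children at the same level of nesting
};

int main()
{
  // {"struct": [{"a":1, "b":2}, ...], "list": [[1, 2], ...]} per the @code example
  std::vector<orc_column_meta> metas{
    {0, 2},  // struct column: two children, a and b
    {3, 1},  // list column: one child
  };
  for (auto const& m : metas) {
    std::cout << "id=" << m.id << " num_children=" << m.num_children << '\n';
  }
}
```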
 /**
  * @brief A helper class for ORC file metadata. Provides some additional
  * convenience methods for initializing and accessing metadata.
  */
 class metadata {
-  using OrcStripeInfo = std::pair<const StripeInformation *, const StripeFooter *>;
+  using OrcStripeInfo = std::pair<const StripeInformation*, const StripeFooter*>;

  public:
   struct stripe_source_mapping {
@@ -551,12 +578,12 @@ class metadata {
   };

  public:
-  explicit metadata(datasource *const src);
+  explicit metadata(datasource* const src);

   size_t get_total_rows() const { return ff.numberOfRows; }
   int get_num_stripes() const { return ff.stripes.size(); }
   int get_num_columns() const { return ff.types.size(); }
-  std::string const &get_column_name(int32_t column_id) const
+  std::string const& get_column_name(int32_t column_id) const
   {
     if (column_names.empty() && get_num_columns() != 0) { init_column_names(); }
     return column_names[column_id];
@@ -569,7 +596,7 @@ class metadata {
   Metadata md;
   std::vector<StripeFooter> stripefooters;
   std::unique_ptr<OrcDecompressor> decompressor;
-  datasource *const source;
+  datasource* const source;

  private:
   struct schema_indexes {
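For orientation before the next file: the `make_*_field_reader` factories above are meant to be aggregated into a tuple that `function_builder` walks for every encoded field. A sketch of the intended call pattern, consistent with how the `PostScript` overload is assembled elsewhere in the ORC code (treat the exact member list and access as illustrative):

```cpp
// Sketch: how a ProtobufReader::read overload binds ORC field numbers
// to output members and dispatches over them.
void ProtobufReader::read(PostScript& s, size_t maxlen)
{
  auto op = std::make_tuple(make_field_reader(1, s.footerLength),
                            make_field_reader(2, s.compression),
                            make_field_reader(3, s.compressionBlockSize),
                            make_packed_field_reader(4, s.version),
                            make_field_reader(5, s.metadataLength),
                            make_field_reader(8000, s.magic));
  function_builder(s, maxlen, op);
}
```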
diff --git a/cpp/src/io/orc/orc_field_reader.hpp b/cpp/src/io/orc/orc_field_reader.hpp
index 8e9bca44340..45d2cbe3bf2 100644
--- a/cpp/src/io/orc/orc_field_reader.hpp
+++ b/cpp/src/io/orc/orc_field_reader.hpp
@@ -41,10 +41,10 @@ namespace orc {
 template <int index>
 struct FunctionSwitchImpl {
   template <typename... Operator>
-  static inline void run(ProtobufReader *pbr,
-                         const uint8_t *end,
-                         const int &encoded_field_number,
-                         std::tuple<Operator...> &ops)
+  static inline void run(ProtobufReader* pbr,
+                         const uint8_t* end,
+                         const int& encoded_field_number,
+                         std::tuple<Operator...>& ops)
   {
     if (encoded_field_number == std::get<index>(ops).encoded_field_number) {
       std::get<index>(ops)(pbr, end);
@@ -57,10 +57,10 @@ struct FunctionSwitchImpl {
 template <>
 struct FunctionSwitchImpl<0> {
   template <typename... Operator>
-  static inline void run(ProtobufReader *pbr,
-                         const uint8_t *end,
-                         const int &encoded_field_number,
-                         std::tuple<Operator...> &ops)
+  static inline void run(ProtobufReader* pbr,
+                         const uint8_t* end,
+                         const int& encoded_field_number,
+                         std::tuple<Operator...>& ops)
   {
     if (encoded_field_number == std::get<0>(ops).encoded_field_number) {
       std::get<0>(ops)(pbr, end);
@@ -78,10 +78,10 @@ struct FunctionSwitchImpl<0> {
  * pointed to by the functors.
  */
 template <typename T, typename... Operator>
-inline void ProtobufReader::function_builder(T &s, size_t maxlen, std::tuple<Operator...> &op)
+inline void ProtobufReader::function_builder(T& s, size_t maxlen, std::tuple<Operator...>& op)
 {
   constexpr int index = std::tuple_size<std::tuple<Operator...>>::value - 1;
-  auto *const end     = std::min(m_cur + maxlen, m_end);
+  auto* const end     = std::min(m_cur + maxlen, m_end);
   while (m_cur < end) {
     auto const field = get<uint32_t>();
     FunctionSwitchImpl<index>::run(this, end, field, op);
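The `FunctionSwitchImpl` recursion above is a standard compile-time linear search over the operator tuple; a self-contained analogue of the dispatch idea (simplified types, not the cudf code):

```cpp
#include <cstdio>
#include <tuple>

struct op {
  int tag;
  void (*fn)();
};

// Walk the tuple from the highest index down and invoke the matching handler,
// mirroring how FunctionSwitchImpl<index>::run recurses toward index 0.
template <int index>
struct switch_impl {
  template <typename... Ops>
  static void run(int tag, std::tuple<Ops...>& ops)
  {
    if (tag == std::get<index>(ops).tag) {
      std::get<index>(ops).fn();
    } else {
      switch_impl<index - 1>::run(tag, ops);
    }
  }
};

template <>
struct switch_impl<0> {
  template <typename... Ops>
  static void run(int tag, std::tuple<Ops...>& ops)
  {
    // An unmatched tag simply falls through; the real reader skips the field.
    if (tag == std::get<0>(ops).tag) { std::get<0>(ops).fn(); }
  }
};

int main()
{
  auto ops = std::make_tuple(op{8, [] { std::puts("field 8"); }},
                             op{24, [] { std::puts("field 24"); }});
  switch_impl<std::tuple_size<decltype(ops)>::value - 1>::run(24, ops);
}
```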
diff --git a/cpp/src/io/orc/orc_field_writer.hpp b/cpp/src/io/orc/orc_field_writer.hpp
index 13c7befa3a1..7882810b50d 100644
--- a/cpp/src/io/orc/orc_field_writer.hpp
+++ b/cpp/src/io/orc/orc_field_writer.hpp
@@ -31,15 +31,15 @@ namespace orc {
 struct ProtobufWriter::ProtobufFieldWriter {
   int struct_size;
-  ProtobufWriter *p;
+  ProtobufWriter* p;

-  ProtobufFieldWriter(ProtobufWriter *pbw) : struct_size(0), p(pbw) {}
+  ProtobufFieldWriter(ProtobufWriter* pbw) : struct_size(0), p(pbw) {}

   /**
    * @brief Function to write an unsigned integer to the internal buffer
    */
   template <typename T>
-  void field_uint(int field, const T &value)
+  void field_uint(int field, const T& value)
   {
     struct_size += p->put_uint(field * 8 + PB_TYPE_VARINT);
     struct_size += p->put_uint(static_cast<uint64_t>(value));
@@ -50,7 +50,7 @@ struct ProtobufWriter::ProtobufFieldWriter {
    * buffer
    */
   template <typename T>
-  void field_packed_uint(int field, const std::vector<T> &value)
+  void field_packed_uint(int field, const std::vector<T>& value)
   {
     struct_size += p->put_uint(field * 8 + PB_TYPE_FIXEDLEN);
     auto lpos = p->m_buf->size();
@@ -68,31 +68,33 @@ struct ProtobufWriter::ProtobufFieldWriter {
   /**
    * @brief Function to write a string to the internal buffer
    */
-  void field_string(int field, const std::string &value)
+  void field_string(int field, const std::string& value)
   {
     size_t len = value.length();
     struct_size += p->put_uint(field * 8 + PB_TYPE_FIXEDLEN);
     struct_size += p->put_uint(len) + len;
-    for (size_t i = 0; i < len; i++) p->putb(value[i]);
+    for (size_t i = 0; i < len; i++)
+      p->putb(value[i]);
   }

   /**
    * @brief Function to write a blob to the internal buffer
    */
   template <typename T>
-  void field_blob(int field, const std::vector<T> &value)
+  void field_blob(int field, const std::vector<T>& value)
   {
     size_t len = value.size();
     struct_size += p->put_uint(field * 8 + PB_TYPE_FIXEDLEN);
     struct_size += p->put_uint(len) + len;
-    for (size_t i = 0; i < len; i++) p->putb(value[i]);
+    for (size_t i = 0; i < len; i++)
+      p->putb(value[i]);
   }

   /**
    * @brief Function to write a struct to the internal buffer
    */
   template <typename T>
-  void field_struct(int field, const T &value)
+  void field_struct(int field, const T& value)
   {
     struct_size += p->put_uint((field)*8 + PB_TYPE_FIXEDLEN);
     auto lpos = p->m_buf->size();
@@ -107,18 +109,20 @@ struct ProtobufWriter::ProtobufFieldWriter {
   /**
    * @brief Function to write a vector of strings to the internal buffer
    */
-  void field_repeated_string(int field, const std::vector<std::string> &value)
+  void field_repeated_string(int field, const std::vector<std::string>& value)
   {
-    for (const auto &elem : value) field_string(field, elem);
+    for (const auto& elem : value)
+      field_string(field, elem);
   }

   /**
    * @brief Function to write a vector of structs to the internal buffer
    */
   template <typename T>
-  void field_repeated_struct(int field, const std::vector<T> &value)
+  void field_repeated_struct(int field, const std::vector<T>& value)
   {
-    for (const auto &elem : value) field_struct(field, elem);
+    for (const auto& elem : value)
+      field_struct(field, elem);
   }

   /**
@@ -126,9 +130,10 @@ struct ProtobufWriter::ProtobufFieldWriter {
    * buffer
    */
   template <typename T>
-  void field_repeated_struct_blob(int field, const std::vector<T> &value)
+  void field_repeated_struct_blob(int field, const std::vector<T>& value)
   {
-    for (const auto &elem : value) field_blob(field, elem);
+    for (const auto& elem : value)
+      field_blob(field, elem);
   }

   /**
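For context, `field_uint` and `field_string` above emit standard protobuf framing: a key varint of `field * 8 + wire_type`, then the payload. A standalone sketch of that framing (hypothetical helper names; `PB_TYPE_VARINT`/`PB_TYPE_FIXEDLEN` correspond to wire types 0 and 2):

```cpp
#include <cstdint>
#include <string>
#include <vector>

// Append v as a base-128 varint; returns the number of bytes written,
// which is what ProtobufFieldWriter accumulates into struct_size.
static size_t put_uint(std::vector<uint8_t>& buf, uint64_t v)
{
  size_t n = 0;
  for (; v > 0x7f; v >>= 7, ++n)
    buf.push_back(static_cast<uint8_t>(v | 0x80));
  buf.push_back(static_cast<uint8_t>(v));
  return n + 1;
}

// Length-delimited field: key varint, payload length, then the raw bytes.
static void field_string(std::vector<uint8_t>& buf, int field, const std::string& value)
{
  put_uint(buf, field * 8 + 2);  // key: (field number << 3) | wire type 2
  put_uint(buf, value.size());
  buf.insert(buf.end(), value.begin(), value.end());
}
```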
diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h
index 66734df86c0..fa91dd13755 100644
--- a/cpp/src/io/orc/orc_gpu.h
+++ b/cpp/src/io/orc/orc_gpu.h
@@ -37,7 +37,7 @@ using cudf::detail::device_2dspan;
 struct CompressedStreamInfo {
   CompressedStreamInfo() = default;
-  explicit constexpr CompressedStreamInfo(const uint8_t *compressed_data_, size_t compressed_size_)
+  explicit constexpr CompressedStreamInfo(const uint8_t* compressed_data_, size_t compressed_size_)
     : compressed_data(compressed_data_),
       uncompressed_data(nullptr),
       compressed_data_size(compressed_size_),
@@ -49,13 +49,13 @@ struct CompressedStreamInfo {
       max_uncompressed_size(0)
   {
   }
-  const uint8_t *compressed_data;  // [in] base ptr to compressed stream data
-  uint8_t *uncompressed_data;  // [in] base ptr to uncompressed stream data or NULL if not known yet
+  const uint8_t* compressed_data;  // [in] base ptr to compressed stream data
+  uint8_t* uncompressed_data;  // [in] base ptr to uncompressed stream data or NULL if not known yet
   size_t compressed_data_size;      // [in] compressed data size for this stream
-  gpu_inflate_input_s *decctl;      // [in] base ptr to decompression structure to be filled
-  gpu_inflate_status_s *decstatus;  // [in] results of decompression
-  gpu_inflate_input_s
-    *copyctl;  // [in] base ptr to copy structure to be filled for uncompressed blocks
+  gpu_inflate_input_s* decctl;      // [in] base ptr to decompression structure to be filled
+  gpu_inflate_status_s* decstatus;  // [in] results of decompression
+  gpu_inflate_input_s*
+    copyctl;  // [in] base ptr to copy structure to be filled for uncompressed blocks
   uint32_t num_compressed_blocks;    // [in,out] number of entries in decctl(in), number of compressed
                                      // blocks(out)
   uint32_t num_uncompressed_blocks;  // [in,out] number of entries in copyctl(in), number of
@@ -89,13 +89,16 @@ constexpr int orc_decimal2float64_scale = 0x80;
  * @brief Struct to describe per stripe's column information
  */
 struct ColumnDesc {
-  const uint8_t *streams[CI_NUM_STREAMS];  // ptr to data stream index
+  const uint8_t* streams[CI_NUM_STREAMS];  // ptr to data stream index
   uint32_t strm_id[CI_NUM_STREAMS];        // stream ids
   uint32_t strm_len[CI_NUM_STREAMS];       // stream length
-  uint32_t *valid_map_base;                // base pointer of valid bit map for this column
-  void *column_data_base;                  // base pointer of column data
+  uint32_t* valid_map_base;                // base pointer of valid bit map for this column
+  void* column_data_base;                  // base pointer of column data
   uint32_t start_row;                      // starting row of the stripe
-  uint32_t num_rows;                       // starting row of the stripe
+  uint32_t num_rows;                       // number of rows in stripe
+  uint32_t column_num_rows;                // number of rows in whole column
+  uint32_t num_child_rows;                 // store number of child rows if it's list column
+  uint32_t num_rowgroups;                  // number of rowgroups in the chunk
   uint32_t dictionary_start;               // start position in global dictionary
   uint32_t dict_len;                       // length of local dictionary
   uint32_t null_count;                     // number of null values in this stripe's column
@@ -115,6 +118,9 @@ struct RowGroup {
   uint32_t chunk_id;        // Column chunk this entry belongs to
   uint32_t strm_offset[2];  // Index offset for CI_DATA and CI_DATA2 streams
   uint16_t run_pos[2];      // Run position for CI_DATA and CI_DATA2
+  uint32_t num_rows;        // number of rows in rowgroup
+  uint32_t start_row;       // starting row of the rowgroup
+  uint32_t num_child_rows;  // number of rows of children in rowgroup in case of list type
 };

 /**
@@ -128,16 +134,16 @@ struct EncChunk {
   uint8_t dtype_len;   // data type length
   int32_t scale;       // scale for decimals or timestamps

-  uint32_t *dict_index;  // dictionary index from row index
+  uint32_t* dict_index;  // dictionary index from row index
   device_span<uint32_t> decimal_offsets;
-  column_device_view *leaf_column;
+  column_device_view* leaf_column;
 };

 /**
  * @brief Struct to describe the streams that correspond to a single `EncChunk`.
  */
 struct encoder_chunk_streams {
-  uint8_t *data_ptrs[CI_NUM_STREAMS];  // encoded output
+  uint8_t* data_ptrs[CI_NUM_STREAMS];  // encoded output
   int32_t ids[CI_NUM_STREAMS];         // stream id; -1 if stream is not present
   uint32_t lengths[CI_NUM_STREAMS];    // in: max length, out: actual length
 };
@@ -160,8 +166,8 @@ struct StripeStream {
  * @brief Struct to describe a dictionary chunk
 */
 struct DictionaryChunk {
-  uint32_t *dict_data;   // dictionary data (index of non-null rows)
-  uint32_t *dict_index;  // row indices of corresponding string (row from dictionary index)
+  uint32_t* dict_data;   // dictionary data (index of non-null rows)
+  uint32_t* dict_index;  // row indices of corresponding string (row from dictionary index)
   uint32_t start_row;    // start row of this chunk
   uint32_t num_rows;     // num rows in this chunk
   uint32_t num_strings;  // number of strings in this chunk
@@ -170,22 +176,22 @@ struct DictionaryChunk {
   uint32_t num_dict_strings;  // number of strings in dictionary
   uint32_t dict_char_count;   // size of dictionary string data for this chunk

-  column_device_view *leaf_column;  //!< Pointer to string column
+  column_device_view* leaf_column;  //!< Pointer to string column
 };

 /**
  * @brief Struct to describe a dictionary
  */
 struct StripeDictionary {
-  uint32_t *dict_data;   // row indices of corresponding string (row from dictionary index)
-  uint32_t *dict_index;  // dictionary index from row index
+  uint32_t* dict_data;   // row indices of corresponding string (row from dictionary index)
+  uint32_t* dict_index;  // dictionary index from row index
   uint32_t column_id;    // real column id
   uint32_t start_chunk;  // first chunk in stripe
   uint32_t num_chunks;   // number of chunks in the stripe
   uint32_t num_strings;  // number of unique strings in the dictionary
   uint32_t dict_char_count;  // total size of dictionary string data

-  column_device_view *leaf_column;  //!< Pointer to string column
+  column_device_view* leaf_column;  //!< Pointer to string column
 };

 /**
@@ -198,7 +204,7 @@ struct StripeDictionary {
  *compressed size)
  * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
  */
-void ParseCompressedStripeData(CompressedStreamInfo *strm_info,
+void ParseCompressedStripeData(CompressedStreamInfo* strm_info,
                                int32_t num_streams,
                                uint32_t compression_block_size,
                                uint32_t log2maxcr = 24,
@@ -211,7 +217,7 @@ void ParseCompressedStripeData(CompressedStreamInfo *strm_info,
 * @param[in] num_streams Number of compressed streams
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void PostDecompressionReassemble(CompressedStreamInfo *strm_info,
+void PostDecompressionReassemble(CompressedStreamInfo* strm_info,
                                  int32_t num_streams,
                                  rmm::cuda_stream_view stream = rmm::cuda_stream_default);
@@ -224,15 +230,19 @@ void PostDecompressionReassemble(CompressedStreamInfo *strm_info,
 * @param[in] num_columns Number of columns
 * @param[in] num_stripes Number of stripes
 * @param[in] num_rowgroups Number of row groups
+ * @param[in] rowidx_stride Row index stride
+ * @param[in] use_base_stride Whether to use base stride obtained from meta or use the computed
+ * value
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void ParseRowGroupIndex(RowGroup *row_groups,
-                        CompressedStreamInfo *strm_info,
-                        ColumnDesc *chunks,
+void ParseRowGroupIndex(RowGroup* row_groups,
+                        CompressedStreamInfo* strm_info,
+                        ColumnDesc* chunks,
                         uint32_t num_columns,
                         uint32_t num_stripes,
                         uint32_t num_rowgroups,
                         uint32_t rowidx_stride,
+                        bool use_base_stride,
                         rmm::cuda_stream_view stream = rmm::cuda_stream_default);

 /**
@@ -242,15 +252,13 @@ void ParseRowGroupIndex(RowGroup *row_groups,
 * @param[in] global_dictionary Global dictionary device array
 * @param[in] num_columns Number of columns
 * @param[in] num_stripes Number of stripes
- * @param[in] max_rows Maximum number of rows to load
 * @param[in] first_row Crop all rows below first_row
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void DecodeNullsAndStringDictionaries(ColumnDesc *chunks,
-                                      DictionaryEntry *global_dictionary,
+void DecodeNullsAndStringDictionaries(ColumnDesc* chunks,
+                                      DictionaryEntry* global_dictionary,
                                       uint32_t num_columns,
                                       uint32_t num_stripes,
-                                      size_t max_rows = ~0,
                                       size_t first_row = 0,
                                       rmm::cuda_stream_view stream = rmm::cuda_stream_default);

 /**
@@ -261,25 +269,25 @@ void DecodeNullsAndStringDictionaries(ColumnDesc *chunks,
 * @param[in] global_dictionary Global dictionary device array
 * @param[in] num_columns Number of columns
 * @param[in] num_stripes Number of stripes
- * @param[in] max_rows Maximum number of rows to load
 * @param[in] first_row Crop all rows below first_row
 * @param[in] tz_table Timezone translation table
 * @param[in] tz_len Length of timezone translation table
- * @param[in] row_groups Optional row index data
+ * @param[in] row_groups Optional row index data [rowgroup][column]
 * @param[in] num_rowgroups Number of row groups in row index data
 * @param[in] rowidx_stride Row index stride
+ * @param[in] level Current nesting level being processed
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void DecodeOrcColumnData(ColumnDesc const *chunks,
-                         DictionaryEntry *global_dictionary,
+void DecodeOrcColumnData(ColumnDesc* chunks,
+                         DictionaryEntry* global_dictionary,
+                         device_2dspan<RowGroup const> row_groups,
                          uint32_t num_columns,
                          uint32_t num_stripes,
-                         size_t max_rows = ~0,
                          size_t first_row = 0,
                          timezone_table_view tz_table = {},
-                         const RowGroup *row_groups = 0,
                          uint32_t num_rowgroups = 0,
                          uint32_t rowidx_stride = 0,
+                         size_t level = 0,
                         rmm::cuda_stream_view stream = rmm::cuda_stream_default);

 /**
@@ -303,7 +311,7 @@ void EncodeOrcColumnData(device_2dspan<EncChunk const> chunks,
 * @param[in,out] enc_streams chunk streams device array [column][rowgroup]
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void EncodeStripeDictionaries(StripeDictionary *stripes,
+void EncodeStripeDictionaries(StripeDictionary* stripes,
                               device_2dspan<EncChunk const> chunks,
                               uint32_t num_string_columns,
                               uint32_t num_stripes,
@@ -317,7 +325,7 @@ void EncodeStripeDictionaries(StripeDictionary *stripes,
 * @param[in,out] chunks encoder chunk device array [column][rowgroup]
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void set_chunk_columns(const table_device_view &view,
+void set_chunk_columns(const table_device_view& view,
                        device_2dspan<EncChunk> chunks,
                        rmm::cuda_stream_view stream);
@@ -345,14 +353,14 @@ void CompactOrcDataStreams(device_2dspan<StripeStream> strm_desc,
 * @param[out] comp_in Per-block compression input parameters
 * @param[out] comp_out Per-block compression status
 */
-void CompressOrcDataStreams(uint8_t *compressed_data,
+void CompressOrcDataStreams(uint8_t* compressed_data,
                             uint32_t num_compressed_blocks,
                             CompressionKind compression,
                             uint32_t comp_blk_size,
                             device_2dspan<StripeStream> strm_desc,
                             device_2dspan<encoder_chunk_streams> enc_streams,
-                            gpu_inflate_input_s *comp_in,
-                            gpu_inflate_status_s *comp_out,
+                            gpu_inflate_input_s* comp_in,
+                            gpu_inflate_status_s* comp_out,
                             rmm::cuda_stream_view stream = rmm::cuda_stream_default);

 /**
@@ -368,12 +376,12 @@ void CompressOrcDataStreams(uint8_t *compressed_data,
 * @param[in] num_rowgroups Number of row groups
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void InitDictionaryIndices(const table_device_view &view,
-                           DictionaryChunk *chunks,
-                           uint32_t *dict_data,
-                           uint32_t *dict_index,
+void InitDictionaryIndices(const table_device_view& view,
+                           DictionaryChunk* chunks,
+                           uint32_t* dict_data,
+                           uint32_t* dict_index,
                            size_t row_index_stride,
-                           size_type *str_col_ids,
+                           size_type* str_col_ids,
                            uint32_t num_columns,
                            uint32_t num_rowgroups,
                            rmm::cuda_stream_view stream);
@@ -389,9 +397,9 @@ void InitDictionaryIndices(const table_device_view &view,
 * @param[in] num_columns Number of columns
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void BuildStripeDictionaries(StripeDictionary *stripes_dev,
-                             StripeDictionary *stripes_host,
-                             DictionaryChunk const *chunks,
+void BuildStripeDictionaries(StripeDictionary* stripes_dev,
+                             StripeDictionary* stripes_host,
+                             DictionaryChunk const* chunks,
                              uint32_t num_stripes,
                              uint32_t num_rowgroups,
                              uint32_t num_columns,
@@ -407,8 +415,8 @@ void BuildStripeDictionaries(StripeDictionary *stripes_dev,
 * @param[in] row_index_stride Rowgroup size in rows
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void orc_init_statistics_groups(statistics_group *groups,
-                                const stats_column_desc *cols,
+void orc_init_statistics_groups(statistics_group* groups,
+                                const stats_column_desc* cols,
                                 uint32_t num_columns,
                                 uint32_t num_rowgroups,
                                 uint32_t row_index_stride,
@@ -422,8 +430,8 @@ void orc_init_statistics_groups(statistics_group *groups,
 * @param[in] statistics_count Number of statistics buffers to encode
 * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default`
 */
-void orc_init_statistics_buffersize(statistics_merge_group *groups,
-                                    const statistics_chunk *chunks,
+void orc_init_statistics_buffersize(statistics_merge_group* groups,
+                                    const statistics_chunk* chunks,
                                     uint32_t statistics_count,
                                     rmm::cuda_stream_view stream = rmm::cuda_stream_default);

 /**
@@ -435,9 +443,9 @@ void orc_init_statistics_buffersize(statistics_merge_group *groups,
 * @param[in,out] chunks Statistics data
 * @param[in] statistics_count Number of statistics buffers
 */
-void orc_encode_statistics(uint8_t *blob_bfr,
-                           statistics_merge_group *groups,
-                           const statistics_chunk *chunks,
+void orc_encode_statistics(uint8_t* blob_bfr,
+                           statistics_merge_group* groups,
+                           const statistics_chunk* chunks,
                            uint32_t statistics_count,
                            rmm::cuda_stream_view stream = rmm::cuda_stream_default);
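One detail worth highlighting before the reader changes that follow: for list columns, ORC encodes per-row child counts, and the reader turns each decoded count buffer into list offsets with an in-place exclusive scan (`generate_offsets_for_list` does this on device). The host-side equivalent of that transform, as a sketch:

```cpp
#include <numeric>
#include <vector>

// Counts per row -> list offsets. For rows with 2 and 3 children this yields
// {0, 2, 5}: size + 1 entries, which is why the reader allocates one extra
// row for LIST offset buffers.
std::vector<int> counts_to_offsets(std::vector<int> counts)
{
  counts.push_back(0);  // room for the final offset
  std::exclusive_scan(counts.begin(), counts.end(), counts.begin(), 0);
  return counts;
}
```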
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 499cb3f0432..b2b4538994e 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -26,6 +26,7 @@
 #include <...>
 #include "orc.h"
+#include <...>
 #include <...>
 #include <...>
 #include <...>
@@ -34,6 +35,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <...>
 #include <...>
 #include <...>
@@ -50,7 +52,7 @@ namespace {
 /**
  * @brief Function that translates ORC data kind to cuDF type enum
  */
-constexpr type_id to_type_id(const orc::SchemaType &schema,
+constexpr type_id to_type_id(const orc::SchemaType& schema,
                              bool use_np_dtypes,
                              type_id timestamp_type_id,
                              bool decimals_as_float64)
@@ -76,6 +78,8 @@ constexpr type_id to_type_id(const orc::SchemaType &schema,
       // There isn't a (DAYS -> np.dtype) mapping
       return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS;
     case orc::DECIMAL: return (decimals_as_float64) ? type_id::FLOAT64 : type_id::DECIMAL64;
+    case orc::LIST: return type_id::LIST;
+    case orc::STRUCT: return type_id::STRUCT;
     default: break;
   }
@@ -123,6 +127,26 @@ constexpr std::pair<gpu::StreamIndexType, uint32_t> get_index_type_and_pos(
 }  // namespace

 namespace {
+/**
+ * @brief struct to store buffer data and size of list buffer
+ */
+struct list_buffer_data {
+  size_type* data;
+  size_type size;
+};
+
+// Generates offsets for list buffer from number of elements in a row.
+void generate_offsets_for_list(rmm::device_uvector<list_buffer_data> const& buff_data,
+                               rmm::cuda_stream_view stream)
+{
+  auto transformer = [] __device__(list_buffer_data list_data) {
+    thrust::exclusive_scan(
+      thrust::seq, list_data.data, list_data.data + list_data.size, list_data.data);
+  };
+  thrust::for_each(rmm::exec_policy(stream), buff_data.begin(), buff_data.end(), transformer);
+  stream.synchronize();
+}
+
 /**
  * @brief Struct that maps ORC streams to columns
  */
@@ -148,20 +172,19 @@ struct orc_stream_info {
 /**
  * @brief Function that populates column descriptors stream/chunk
  */
 size_t gather_stream_info(const size_t stripe_index,
-                          const orc::StripeInformation *stripeinfo,
-                          const orc::StripeFooter *stripefooter,
-                          const std::vector<int> &orc2gdf,
-                          const std::vector<int> &gdf2orc,
+                          const orc::StripeInformation* stripeinfo,
+                          const orc::StripeFooter* stripefooter,
+                          const std::vector<int>& orc2gdf,
+                          const std::vector<orc_column_meta>& gdf2orc,
                           const std::vector<orc::SchemaType> types,
                           bool use_index,
-                          size_t *num_dictionary_entries,
-                          hostdevice_vector<gpu::ColumnDesc> &chunks,
-                          std::vector<orc_stream_info> &stream_info)
+                          size_t* num_dictionary_entries,
+                          cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
+                          std::vector<orc_stream_info>& stream_info)
 {
-  const auto num_columns = gdf2orc.size();
-  uint64_t src_offset    = 0;
-  uint64_t dst_offset    = 0;
-  for (const auto &stream : stripefooter->streams) {
+  uint64_t src_offset = 0;
+  uint64_t dst_offset = 0;
+  for (const auto& stream : stripefooter->streams) {
     if (!stream.column_id || *stream.column_id >= orc2gdf.size()) {
       dst_offset += stream.length;
       continue;
     }
@@ -177,11 +200,11 @@ size_t gather_stream_info(const size_t stripe_index,
       const auto schema_type = types[column_id];
       if (schema_type.subtypes.size() != 0) {
         if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) {
-          for (const auto &idx : schema_type.subtypes) {
+          for (const auto& idx : schema_type.subtypes) {
             auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1;
             if (child_idx >= 0) {
               col = child_idx;
-              auto &chunk = chunks[stripe_index * num_columns + col];
+              auto& chunk = chunks[stripe_index][col];
               chunk.strm_id[gpu::CI_PRESENT]  = stream_info.size();
               chunk.strm_len[gpu::CI_PRESENT] = stream.length;
             }
@@ -192,7 +215,7 @@ size_t gather_stream_info(const size_t stripe_index,
     if (col != -1) {
       if (src_offset >= stripeinfo->indexLength || use_index) {
         // NOTE: skip_count field is temporarily used to track index ordering
-        auto &chunk = chunks[stripe_index * num_columns + col];
+        auto& chunk = chunks[stripe_index][col];
         const auto idx =
           get_index_type_and_pos(stream.kind, chunk.skip_count, col == orc2gdf[column_id]);
         if (idx.first < gpu::CI_NUM_STREAMS) {
@@ -220,8 +243,8 @@ size_t gather_stream_info(const size_t stripe_index,
 /**
  * @brief Determines if a column should be converted from decimal to float
  */
-bool should_convert_decimal_column_to_float(const std::vector<std::string> &columns_to_convert,
-                                            cudf::io::orc::metadata &metadata,
+bool should_convert_decimal_column_to_float(const std::vector<std::string>& columns_to_convert,
+                                            cudf::io::orc::metadata& metadata,
                                             int column_index)
 {
   return (std::find(columns_to_convert.begin(),
@@ -237,7 +260,7 @@ bool should_convert_decimal_column_to_float(const std::vector<std::string> &colu
 * to aggregate that metadata from all the files.
 */
 class aggregate_orc_metadata {
-  using OrcStripeInfo = std::pair<const StripeInformation *, const StripeFooter *>;
+  using OrcStripeInfo = std::pair<const StripeInformation*, const StripeFooter*>;

  public:
   mutable std::vector<cudf::io::orc::metadata> per_file_metadata;
@@ -248,11 +271,11 @@ class aggregate_orc_metadata {
   /**
    * @brief Create a metadata object from each element in the source vector
    */
-  auto metadatas_from_sources(std::vector<std::unique_ptr<datasource>> const &sources)
+  auto metadatas_from_sources(std::vector<std::unique_ptr<datasource>> const& sources)
   {
     std::vector<cudf::io::orc::metadata> metadatas;
     std::transform(
-      sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const &source) {
+      sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) {
         return cudf::io::orc::metadata(source.get());
       });
     return metadatas;
@@ -264,7 +287,7 @@ class aggregate_orc_metadata {
   size_type calc_num_rows() const
   {
     return std::accumulate(
-      per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto &sum, auto &pfm) {
+      per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) {
         return sum + pfm.get_total_rows();
       });
   }
@@ -284,12 +307,12 @@ class aggregate_orc_metadata {
   size_type calc_num_stripes() const
   {
     return std::accumulate(
-      per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto &sum, auto &pfm) {
+      per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) {
         return sum + pfm.get_num_stripes();
       });
   }

-  aggregate_orc_metadata(std::vector<std::unique_ptr<datasource>> const &sources)
+  aggregate_orc_metadata(std::vector<std::unique_ptr<datasource>> const& sources)
     : per_file_metadata(metadatas_from_sources(sources)),
       num_rows(calc_num_rows()),
       num_columns(calc_num_cols()),
@@ -297,7 +320,7 @@ class aggregate_orc_metadata {
   {
     // Verify that the input files have the same number of columns,
     // as well as matching types, compression, and names
-    for (auto const &pfm : per_file_metadata) {
+    for (auto const& pfm : per_file_metadata) {
       CUDF_EXPECTS(per_file_metadata[0].get_num_columns() == pfm.get_num_columns(),
                    "All sources must have the same number of columns");
       CUDF_EXPECTS(per_file_metadata[0].ps.compression == pfm.ps.compression,
@@ -318,7 +341,7 @@ class aggregate_orc_metadata {
     }
   }

-  auto const &get_schema(int schema_idx) const { return per_file_metadata[0].ff.types[schema_idx]; }
+  auto const& get_schema(int schema_idx) const { return per_file_metadata[0].ff.types[schema_idx]; }
   auto get_col_type(int col_idx) const { return per_file_metadata[0].ff.types[col_idx]; }
@@ -330,7 +353,7 @@ class aggregate_orc_metadata {
   auto get_num_source_files() const { return per_file_metadata.size(); }

-  auto const &get_types() const { return per_file_metadata[0].ff.types; }
+  auto const& get_types() const { return per_file_metadata[0].ff.types; }

   int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; }
@@ -344,9 +367,9 @@ class aggregate_orc_metadata {
   }

   std::vector<metadata::stripe_source_mapping> select_stripes(
-    std::vector<std::vector<size_t>> const &user_specified_stripes,
-    size_type &row_start,
-    size_type &row_count)
+    std::vector<std::vector<size_t>> const& user_specified_stripes,
+    size_type& row_start,
+    size_type& row_count)
   {
     std::vector<metadata::stripe_source_mapping> selected_stripes_mapping;
@@ -365,7 +388,7 @@ class aggregate_orc_metadata {
         // Coalesce stripe info at the source file later since that makes downstream processing much
         // easier in impl::read
-        for (const size_t &stripe_idx : user_specified_stripes[src_file_idx]) {
+        for (const size_t& stripe_idx : user_specified_stripes[src_file_idx]) {
           CUDF_EXPECTS(stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size(),
                        "Invalid stripe index");
           stripe_infos.push_back(
@@ -384,7 +407,8 @@ class aggregate_orc_metadata {
       CUDF_EXPECTS(row_count >= 0, "Invalid row count");
       CUDF_EXPECTS(row_start <= get_num_rows(), "Invalid row start");

-      size_type count = 0;
+      size_type count            = 0;
+      size_type stripe_skip_rows = 0;
       // Iterate all source files; each source file has correlating metadata
       for (size_t src_file_idx = 0;
            src_file_idx < per_file_metadata.size() && count < row_start + row_count;
@@ -399,16 +423,20 @@ class aggregate_orc_metadata {
           if (count > row_start || count == 0) {
             stripe_infos.push_back(
               std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr));
+          } else {
+            stripe_skip_rows = count;
           }
         }

         selected_stripes_mapping.push_back({static_cast<int>(src_file_idx), stripe_infos});
       }
+      // Need to remove skipped rows from the stripes which are not selected.
+      row_start -= stripe_skip_rows;
     }

     // Read each stripe's stripefooter metadata
     if (not selected_stripes_mapping.empty()) {
-      for (auto &mapping : selected_stripes_mapping) {
+      for (auto& mapping : selected_stripes_mapping) {
         // Resize to all stripe_info for the source level
         per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size());
@@ -434,66 +462,121 @@ class aggregate_orc_metadata {
     return selected_stripes_mapping;
   }
+  /**
+   * @brief Adds a column as per the request and saves metadata about its children.
+   *        Struct children stay on the same level as the struct; only list column
+   *        children are pushed to the next level.
+   *
+   * @param selection A vector that saves the list of columns per level of nesting.
+   * @param types A vector of schema types of columns.
+   * @param level Current level of nesting.
+   * @param id Current column id that needs to be added.
+   * @param has_timestamp_column True if a timestamp column is present and false otherwise.
+   *
+   * @return The number of child columns at the same level in case of struct and at the next level
+   * in case of list
+   */
+  uint32_t add_column(std::vector<std::vector<orc_column_meta>>& selection,
+                      std::vector<orc::SchemaType> const& types,
+                      const size_t level,
+                      const uint32_t id,
+                      bool& has_timestamp_column,
+                      bool& has_list_column)
+  {
+    uint32_t num_lvl_child_columns = 0;
+    if (level == selection.size()) { selection.emplace_back(); }
+    selection[level].push_back({id, 0});
+    const int col_id = selection[level].size() - 1;
+    if (types[id].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
+
+    switch (types[id].kind) {
+      case orc::LIST: {
+        uint32_t lvl_cols = 0;
+        if (not types[id].subtypes.empty()) {
+          has_list_column = true;
+          // Since the list column needs to be processed before its child can be processed,
+          // the child column is added to the next level
+          lvl_cols =
+            add_column(selection, types, level + 1, id + 1, has_timestamp_column, has_list_column);
+        }
+        // The list child column may be a struct in which case lvl_cols will be > 1
+        selection[level][col_id].num_children = lvl_cols;
+      } break;
+
+      case orc::STRUCT:
+        for (const auto child_id : types[id].subtypes) {
+          num_lvl_child_columns +=
+            add_column(selection, types, level, child_id, has_timestamp_column, has_list_column);
+        }
+        selection[level][col_id].num_children = num_lvl_child_columns;
+        break;

+      default: break;
+    }
+
+    return num_lvl_child_columns + 1;
+  }
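A hand-worked illustration of what this recursion produces for the schema from the `orc_column_meta` example above (one plausible id numbering: root struct = 0, "struct" = 1 with children a = 2 and b = 3, "list" = 4 with element column 5):

```cpp
// Expected per-level selection (illustrative ids, not output of the code):
std::vector<std::vector<orc_column_meta>> expected_selection = {
  // level 0: struct children stay on the same level as the struct itself
  {{1, 2}, {2, 0}, {3, 0}, {4, 1}},
  // level 1: the list's element column is pushed one nesting level down
  {{5, 0}},
};
```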
   /**
    * @brief Filters and reduces down to a selection of columns
    *
    * @param use_names List of column names to select
    * @param has_timestamp_column True if timestamp column present and false otherwise
    *
-   * @return input column information, output column information, list of output column schema
-   * indices
+   * @return Vector of lists of ORC column meta-data
    */
-  std::vector<int> select_columns(std::vector<std::string> const &use_names,
-                                  bool &has_timestamp_column) const
+  std::vector<std::vector<orc_column_meta>> select_columns(
+    std::vector<std::string> const& use_names, bool& has_timestamp_column, bool& has_list_column)
   {
-    auto const &pfm = per_file_metadata[0];
+    auto const& pfm = per_file_metadata[0];
+    std::vector<std::vector<orc_column_meta>> selection;

-    std::vector<int> output_column_schema_idxs;
     if (not use_names.empty()) {
-      int index = 0;
-      for (auto const &use_name : use_names) {
+      uint32_t index = 0;
+      // Have to check only parent columns
+      auto const num_columns = pfm.ff.types[0].subtypes.size();
+
+      for (const auto& use_name : use_names) {
         bool name_found = false;
-        for (int i = 0; i < pfm.get_num_columns(); ++i, ++index) {
-          if (index >= pfm.get_num_columns()) { index = 0; }
-          if (pfm.get_column_name(index).compare(use_name) == 0) {
+        for (uint32_t i = 0; i < num_columns; ++i, ++index) {
+          if (index >= num_columns) { index = 0; }
+          auto col_id = pfm.ff.types[0].subtypes[index];
+          if (pfm.get_column_name(col_id) == use_name) {
             name_found = true;
-            output_column_schema_idxs.emplace_back(index);
-            if (pfm.ff.types[index].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
-            index++;
+            add_column(selection, pfm.ff.types, 0, col_id, has_timestamp_column, has_list_column);
+            // Should start with next index
+            index = i + 1;
             break;
           }
         }
         CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name));
       }
     } else {
-      // For now, only select all leaf nodes
-      for (int i = 1; i < pfm.get_num_columns(); ++i) {
-        if (pfm.ff.types[i].subtypes.empty()) {
-          output_column_schema_idxs.emplace_back(i);
-          if (pfm.ff.types[i].kind == orc::TIMESTAMP) { has_timestamp_column = true; }
-        }
+      for (auto const& col_id : pfm.ff.types[0].subtypes) {
+        add_column(selection, pfm.ff.types, 0, col_id, has_timestamp_column, has_list_column);
       }
     }
-    return output_column_schema_idxs;
+    return selection;
   }
 };

 rmm::device_buffer reader::impl::decompress_stripe_data(
-  hostdevice_vector<gpu::ColumnDesc> &chunks,
-  const std::vector<rmm::device_buffer> &stripe_data,
-  const OrcDecompressor *decompressor,
-  std::vector<orc_stream_info> &stream_info,
+  cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
+  const std::vector<rmm::device_buffer>& stripe_data,
+  const OrcDecompressor* decompressor,
+  std::vector<orc_stream_info>& stream_info,
   size_t num_stripes,
-  device_span<gpu::RowGroup> row_groups,
+  cudf::detail::hostdevice_2dvector<gpu::RowGroup>& row_groups,
   size_t row_index_stride,
+  bool use_base_stride,
   rmm::cuda_stream_view stream)
 {
   // Parse the columns' compressed info
   hostdevice_vector<gpu::CompressedStreamInfo> compinfo(0, stream_info.size(), stream);
-  for (const auto &info : stream_info) {
+  for (const auto& info : stream_info) {
     compinfo.insert(gpu::CompressedStreamInfo(
-      static_cast<const uint8_t *>(stripe_data[info.stripe_idx].data()) + info.dst_pos,
+      static_cast<const uint8_t*>(stripe_data[info.stripe_idx].data()) + info.dst_pos,
       info.length));
   }
   compinfo.host_to_device(stream);
@@ -525,7 +608,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
   uint32_t start_pos        = 0;
   uint32_t start_pos_uncomp = (uint32_t)num_compressed_blocks;
   for (size_t i = 0; i < compinfo.size(); ++i) {
-    auto dst_base                 = static_cast<uint8_t *>(decomp_data.data());
+    auto dst_base                 = static_cast<uint8_t*>(decomp_data.data());
     compinfo[i].uncompressed_data = dst_base + decomp_offset;
     compinfo[i].decctl            = inflate_in.data() + start_pos;
     compinfo[i].decstatus         = inflate_out.data() + start_pos;
@@ -569,11 +652,11 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
   // decompression failed.
   compinfo.device_to_host(stream, true);

-  const size_t num_columns = chunks.size() / num_stripes;
+  const size_t num_columns = chunks.size().second;

   for (size_t i = 0; i < num_stripes; ++i) {
     for (size_t j = 0; j < num_columns; ++j) {
-      auto &chunk = chunks[i * num_columns + j];
+      auto& chunk = chunks[i][j];
       for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) {
         if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) {
           chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data;
@@ -583,38 +666,40 @@ rmm::device_buffer reader::impl::decompress_stripe_data(
     }
   }

-  if (not row_groups.empty()) {
+  if (row_groups.size().first) {
     chunks.host_to_device(stream);
-    gpu::ParseRowGroupIndex(row_groups.data(),
+    row_groups.host_to_device(stream);
+    gpu::ParseRowGroupIndex(row_groups.base_device_ptr(),
                             compinfo.device_ptr(),
-                            chunks.device_ptr(),
+                            chunks.base_device_ptr(),
                             num_columns,
                             num_stripes,
-                            row_groups.size() / num_columns,
+                            row_groups.size().first,
                             row_index_stride,
+                            use_base_stride,
                             stream);
   }

   return decomp_data;
 }
-void reader::impl::decode_stream_data(hostdevice_vector<gpu::ColumnDesc> &chunks,
+void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
                                       size_t num_dicts,
                                       size_t skip_rows,
-                                      size_t num_rows,
                                       timezone_table_view tz_table,
-                                      device_span<gpu::RowGroup> row_groups,
+                                      cudf::detail::hostdevice_2dvector<gpu::RowGroup>& row_groups,
                                       size_t row_index_stride,
-                                      std::vector<column_buffer> &out_buffers,
+                                      std::vector<column_buffer>& out_buffers,
+                                      size_t level,
                                       rmm::cuda_stream_view stream)
 {
-  const auto num_columns = out_buffers.size();
-  const auto num_stripes = chunks.size() / out_buffers.size();
+  const auto num_stripes = chunks.size().first;
+  const auto num_columns = chunks.size().second;

   // Update chunks with pointers to column data
   for (size_t i = 0; i < num_stripes; ++i) {
     for (size_t j = 0; j < num_columns; ++j) {
-      auto &chunk            = chunks[i * num_columns + j];
+      auto& chunk            = chunks[i][j];
       chunk.column_data_base = out_buffers[j].data();
       chunk.valid_map_base   = out_buffers[j].null_mask();
     }
@@ -625,37 +710,203 @@ void reader::impl::decode_stream_data(hostdevice_vector<gpu::ColumnDesc> &chunks
   chunks.host_to_device(stream);

   gpu::DecodeNullsAndStringDictionaries(
-    chunks.device_ptr(), global_dict.data(), num_columns, num_stripes, num_rows, skip_rows, stream);
-  gpu::DecodeOrcColumnData(chunks.device_ptr(),
+    chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream);
+  gpu::DecodeOrcColumnData(chunks.base_device_ptr(),
                            global_dict.data(),
+                           row_groups,
                            num_columns,
                            num_stripes,
-                           num_rows,
                            skip_rows,
                            tz_table,
-                           row_groups.data(),
-                           row_groups.size() / num_columns,
+                           row_groups.size().first,
                            row_index_stride,
+                           level,
                            stream);
   chunks.device_to_host(stream, true);

   for (size_t i = 0; i < num_stripes; ++i) {
     for (size_t j = 0; j < num_columns; ++j) {
-      out_buffers[j].null_count() += chunks[i * num_columns + j].null_count;
+      out_buffers[j].null_count() += chunks[i][j].null_count;
     }
   }
 }

-reader::impl::impl(std::vector<std::unique_ptr<datasource>> &&sources,
-                   orc_reader_options const &options,
-                   rmm::mr::device_memory_resource *mr)
+// Aggregate child column metadata per stripe and per column
+void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDesc> chunks,
+                                        cudf::detail::host_2dspan<gpu::RowGroup> row_groups,
+                                        std::vector<orc_column_meta> const& list_col,
+                                        const int32_t level)
+{
+  const auto num_of_stripes         = chunks.size().first;
+  const auto num_of_rowgroups       = row_groups.size().first;
+  const auto num_parent_cols        = _selected_columns[level].size();
+  const auto num_child_cols         = _selected_columns[level + 1].size();
+  const auto number_of_child_chunks = num_child_cols * num_of_stripes;
+  auto& num_child_rows              = _col_meta.num_child_rows;
+
+  // Reset the meta to store child column details.
+  num_child_rows.resize(_selected_columns[level + 1].size());
+  std::fill(num_child_rows.begin(), num_child_rows.end(), 0);
+  _col_meta.child_start_row.resize(number_of_child_chunks);
+  _col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks);
+  _col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols);
+
+  auto child_start_row = cudf::detail::host_2dspan<uint32_t>(
+    _col_meta.child_start_row.data(), num_of_stripes, num_child_cols);
+  auto num_child_rows_per_stripe = cudf::detail::host_2dspan<uint32_t>(
+    _col_meta.num_child_rows_per_stripe.data(), num_of_stripes, num_child_cols);
+  auto rwgrp_meta = cudf::detail::host_2dspan<reader_column_meta::row_group_meta>(
+    _col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols);
+
+  int index = 0;  // number of child columns processed
+
+  // For each parent column, update its child column meta for each stripe.
+  std::for_each(list_col.cbegin(), list_col.cend(), [&](const auto p_col) {
+    const auto parent_col_idx = _col_meta.orc_col_map[level][p_col.id];
+    auto start_row            = 0;
+    auto processed_row_groups = 0;
+
+    for (size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) {
+      // Aggregate num_rows and start_row from processed parent columns per row groups
+      if (num_of_rowgroups) {
+        auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups;
+        auto processed_child_rows  = 0;
+
+        for (size_t rowgroup_id = 0; rowgroup_id < stripe_num_row_groups;
+             rowgroup_id++, processed_row_groups++) {
+          const auto child_rows = row_groups[processed_row_groups][parent_col_idx].num_child_rows;
+          for (uint32_t id = 0; id < p_col.num_children; id++) {
+            const auto child_col_idx                                  = index + id;
+            rwgrp_meta[processed_row_groups][child_col_idx].start_row = processed_child_rows;
+            rwgrp_meta[processed_row_groups][child_col_idx].num_rows  = child_rows;
+          }
+          processed_child_rows += child_rows;
+        }
+      }
+
+      // Aggregate start row, number of rows per chunk and total number of rows in a column
+      const auto child_rows = chunks[stripe_id][parent_col_idx].num_child_rows;
+      for (uint32_t id = 0; id < p_col.num_children; id++) {
+        const auto child_col_idx = index + id;
+
+        num_child_rows[child_col_idx] += child_rows;
+        num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows;
+        // start row could be different for each column when there is nesting at each stripe level
+        child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row;
+      }
+      start_row += child_rows;
+    }
+    index += p_col.num_children;
+  });
+}
+
+std::unique_ptr<column> reader::impl::create_empty_column(const int32_t orc_col_id,
+                                                          column_name_info& schema_info,
+                                                          rmm::cuda_stream_view stream)
+{
+  schema_info.name = _metadata->get_column_name(0, orc_col_id);
+  // If the column type is orc::DECIMAL see if the user
+  // desires it to be converted to float64 or not
+  auto const decimal_as_float64 = should_convert_decimal_column_to_float(
+    _decimal_cols_as_float, _metadata->per_file_metadata[0], orc_col_id);
+  auto const type = to_type_id(
+    _metadata->get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
+  int32_t scale = 0;
+  std::vector<std::unique_ptr<column>> child_columns;
+  std::unique_ptr<column> out_col = nullptr;
+
+  switch (type) {
+    case type_id::LIST:
+      schema_info.children.emplace_back("offsets");
+      schema_info.children.emplace_back("");
+      out_col = make_lists_column(
+        0,
+        make_empty_column(data_type(type_id::INT32)),
+        create_empty_column(
+          _metadata->get_col_type(orc_col_id).subtypes[0], schema_info.children.back(), stream),
+        0,
+        rmm::device_buffer{0, stream},
+        stream);
+
+      break;
+
+    case type_id::STRUCT:
+      for (const auto col : _metadata->get_col_type(orc_col_id).subtypes) {
+        schema_info.children.emplace_back("");
+        child_columns.push_back(create_empty_column(col, schema_info.children.back(), stream));
+      }
+      out_col =
+        make_structs_column(0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream);
+      break;
+
+    case type_id::DECIMAL64:
+      scale   = -static_cast<int32_t>(_metadata->get_types()[orc_col_id].scale.value_or(0));
+      out_col = make_empty_column(data_type(type, scale));
+      break;
+
+    default: out_col = make_empty_column(data_type(type));
+  }
+
+  return out_col;
+}
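create_empty_column leans on the libcudf column factories; the same calls can be used directly to assemble, say, a zero-row LIST of INT32 column (a sketch mirroring the LIST branch above, with default memory-resource arguments elided as in the patch):

```cpp
#include <cudf/column/column_factories.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

// An empty LIST<INT32> column: empty INT32 offsets plus an empty INT32 child.
std::unique_ptr<cudf::column> empty_list_of_int32(rmm::cuda_stream_view stream)
{
  return cudf::make_lists_column(0,
                                 cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)),
                                 cudf::make_empty_column(cudf::data_type(cudf::type_id::INT32)),
                                 0,
                                 rmm::device_buffer{0, stream},
                                 stream);
}
```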
+// Adds child column buffers to parent column
+column_buffer&& reader::impl::assemble_buffer(const int32_t orc_col_id,
+                                              std::vector<std::vector<column_buffer>>& col_buffers,
+                                              const size_t level)
+{
+  auto const col_id = _col_meta.orc_col_map[level][orc_col_id];
+  auto& col_buffer  = col_buffers[level][col_id];
+
+  col_buffer.name = _metadata->get_column_name(0, orc_col_id);
+  switch (col_buffer.type.id()) {
+    case type_id::LIST:
+      col_buffer.children.emplace_back(
+        assemble_buffer(_metadata->get_col_type(orc_col_id).subtypes[0], col_buffers, level + 1));
+      break;
+
+    case type_id::STRUCT:
+      for (auto const& col : _metadata->get_col_type(orc_col_id).subtypes) {
+        col_buffer.children.emplace_back(assemble_buffer(col, col_buffers, level));
+      }
+
+      break;
+
+    default: break;
+  }
+
+  return std::move(col_buffer);
+}
+
+// Creates columns along with schema information for each column
+void reader::impl::create_columns(std::vector<std::vector<column_buffer>>&& col_buffers,
+                                  std::vector<std::unique_ptr<column>>& out_columns,
+                                  std::vector<column_name_info>& schema_info,
+                                  rmm::cuda_stream_view stream)
+{
+  for (size_t i = 0; i < _selected_columns[0].size();) {
+    auto const& col_meta = _selected_columns[0][i];
+    schema_info.emplace_back("");
+
+    auto col_buffer = assemble_buffer(col_meta.id, col_buffers, 0);
+    out_columns.emplace_back(make_column(col_buffer, &schema_info.back(), stream, _mr));
+
+    // Need to skip child columns of struct which are at the same level and have been processed
+    i += (col_buffers[0][i].type.id() == type_id::STRUCT) ? col_meta.num_children + 1 : 1;
+  }
+}
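With schema_info threaded through column creation, nested ORC columns surface directly in the public API; a typical caller-side view (the path and column name are placeholders):

```cpp
#include <cudf/io/orc.hpp>

// Sketch: read a single list column from an ORC file with the public API.
cudf::io::table_with_metadata read_list_column()
{
  auto options = cudf::io::orc_reader_options::builder(cudf::io::source_info{"example.orc"})
                   .columns({"list"})
                   .build();
  auto result = cudf::io::read_orc(options);
  // result.tbl->view().column(0).type().id() == cudf::type_id::LIST
  return result;
}
```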
+reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
+                   orc_reader_options const& options,
+                   rmm::mr::device_memory_resource* mr)
   : _mr(mr), _sources(std::move(sources))
 {
   // Open and parse the source(s) dataset metadata
   _metadata = std::make_unique<aggregate_orc_metadata>(_sources);

   // Select only columns required by the options
-  _selected_columns = _metadata->select_columns(options.get_columns(), _has_timestamp_column);
+  _selected_columns =
+    _metadata->select_columns(options.get_columns(), _has_timestamp_column, _has_list_column);

   // Override output timestamp resolution if requested
   if (options.get_timestamp_type().id() != type_id::EMPTY) {
@@ -674,272 +925,360 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>> &&sources,

 table_with_metadata reader::impl::read(size_type skip_rows,
                                        size_type num_rows,
-                                       const std::vector<std::vector<size_type>> &stripes,
+                                       const std::vector<std::vector<size_type>>& stripes,
                                        rmm::cuda_stream_view stream)
 {
+  CUDF_EXPECTS(skip_rows == 0 or (not _has_list_column),
+               "skip_rows is not supported by list column");
+
+  std::vector<std::unique_ptr<column>> out_columns;
+  // buffer and stripe data are stored per nesting level
+  std::vector<std::vector<column_buffer>> out_buffers(_selected_columns.size());
+  std::vector<column_name_info> schema_info;
+  std::vector<std::vector<rmm::device_buffer>> lvl_stripe_data(_selected_columns.size());
   table_metadata out_metadata;

-  // There are no columns in table
+  // There are no columns in the table
   if (_selected_columns.size() == 0) return {std::make_unique<table>(), std::move(out_metadata)};

   // Select only stripes required (aka row groups)
   const auto selected_stripes = _metadata->select_stripes(stripes, skip_rows, num_rows);

-  // Association between each ORC column and its cudf::column
-  std::vector<int32_t> orc_col_map(_metadata->get_num_cols(), -1);
-
-  // Get a list of column data types
-  std::vector<data_type> column_types;
-  for (const auto &col : _selected_columns) {
-    // If the column type is orc::DECIMAL see if the user
-    // desires it to be converted to float64 or not
-    auto const decimal_as_float64 = should_convert_decimal_column_to_float(
-      _decimal_cols_as_float, _metadata->per_file_metadata[0], col);
-
-    auto col_type = to_type_id(
-      _metadata->get_col_type(col), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
-    CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
-    // Remove this once we support Decimal128 data type
-    CUDF_EXPECTS((col_type != type_id::DECIMAL64) or (_metadata->get_col_type(col).precision <= 18),
-                 "Decimal data has precision > 18, Decimal64 data type doesn't support it.");
-    if (col_type == type_id::DECIMAL64) {
-      // sign of the scale is changed since cuDF follows c++ libraries like CNL
-      // which uses negative scaling, but liborc and other libraries
-      // follow positive scaling.
-      auto const scale = -static_cast<int32_t>(_metadata->get_col_type(col).scale.value_or(0));
-      column_types.emplace_back(col_type, scale);
-    } else {
-      column_types.emplace_back(col_type);
-    }
+  // Iterate through levels of nested columns: struct columns and their children stay
+  // on the same level, since the children have the same number of rows, while list
+  // column children are one level down from their parent.
+  for (size_t level = 0; level < _selected_columns.size(); level++) {
+    auto& selected_columns = _selected_columns[level];
+    // Association between each ORC column and its cudf::column
+    _col_meta.orc_col_map.emplace_back(_metadata->get_num_cols(), -1);
+    std::vector<orc_column_meta> list_col;
+
+    // Get a list of column data types
+    std::vector<data_type> column_types;
+    for (auto& col : selected_columns) {
+      // If the column type is orc::DECIMAL see if the user
+      // desires it to be converted to float64 or not
+      auto const decimal_as_float64 = should_convert_decimal_column_to_float(
+        _decimal_cols_as_float, _metadata->per_file_metadata[0], col.id);
+      auto col_type = to_type_id(
+        _metadata->get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64);
+      CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
+      // Remove this once we support Decimal128 data type
+      CUDF_EXPECTS(
+        (col_type != type_id::DECIMAL64) or (_metadata->get_col_type(col.id).precision <= 18),
+        "Decimal data has precision > 18, Decimal64 data type doesn't support it.");
+      if (col_type == type_id::DECIMAL64) {
+        // sign of the scale is changed since cuDF follows c++ libraries like CNL
+        // which uses negative scaling, but liborc and other libraries
+        // follow positive scaling.
+        auto const scale = -static_cast<int32_t>(_metadata->get_col_type(col.id).scale.value_or(0));
+        column_types.emplace_back(col_type, scale);
+      } else {
+        column_types.emplace_back(col_type);
+      }

-    // Map each ORC column to its column
-    orc_col_map[col] = column_types.size() - 1;
-  }
+      // Map each ORC column to its column
+      _col_meta.orc_col_map[level][col.id] = column_types.size() - 1;
+      if (col_type == type_id::LIST) list_col.emplace_back(col);
+    }

-  // If no rows or stripes to read, return empty columns
-  if (num_rows <= 0 || selected_stripes.empty()) {
-    std::transform(column_types.cbegin(),
-                   column_types.cend(),
-                   std::back_inserter(out_columns),
-                   [](auto const &dtype) { return make_empty_column(dtype); });
-  } else {
-    // Get the total number of stripes across all input files.
-    size_t total_num_stripes =
-      std::accumulate(selected_stripes.begin(),
-                      selected_stripes.end(),
-                      0,
-                      [](size_t sum, auto &stripe_source_mapping) {
-                        return sum + stripe_source_mapping.stripe_info.size();
-                      });
-
-    const auto num_columns = _selected_columns.size();
-    const auto num_chunks  = total_num_stripes * num_columns;
-    hostdevice_vector<gpu::ColumnDesc> chunks(num_chunks, stream);
-    memset(chunks.host_ptr(), 0, chunks.memory_size());
-
-    const bool use_index =
-      (_use_index == true) &&
-      // Only use if we don't have much work with complete columns & stripes
-      // TODO: Consider nrows, gpu, and tune the threshold
-      (num_rows > _metadata->get_row_index_stride() && !(_metadata->get_row_index_stride() & 7) &&
-       _metadata->get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) &&
-      // Only use if first row is aligned to a stripe boundary
-      // TODO: Fix logic to handle unaligned rows
-      (skip_rows == 0);
-
-    // Logically view streams as columns
-    std::vector<orc_stream_info> stream_info;
-
-    // Tracker for eventually deallocating compressed and uncompressed data
-    std::vector<rmm::device_buffer> stripe_data;
-
-    size_t stripe_start_row   = 0;
-    size_t num_dict_entries   = 0;
-    size_t num_rowgroups      = 0;
-    size_t stripe_chunk_index = 0;
-
-    for (auto &stripe_source_mapping : selected_stripes) {
-      // Iterate through the source files selected stripes
-      for (size_t stripe_pos_index = 0; stripe_pos_index < stripe_source_mapping.stripe_info.size();
-           stripe_pos_index++) {
-        auto &stripe_pair        = stripe_source_mapping.stripe_info[stripe_pos_index];
-        const auto stripe_info   = stripe_pair.first;
-        const auto stripe_footer = stripe_pair.second;
-
-        auto stream_count          = stream_info.size();
-        const auto total_data_size = gather_stream_info(stripe_chunk_index,
-                                                        stripe_info,
-                                                        stripe_footer,
-                                                        orc_col_map,
-                                                        _selected_columns,
-                                                        _metadata->get_types(),
-                                                        use_index,
-                                                        &num_dict_entries,
-                                                        chunks,
-                                                        stream_info);
-
-        CUDF_EXPECTS(total_data_size > 0, "Expected streams data within stripe");
-
-        stripe_data.emplace_back(total_data_size, stream);
-        auto dst_base = static_cast<uint8_t *>(stripe_data.back().data());
-
-        // Coalesce consecutive streams into one read
-        while (stream_count < stream_info.size()) {
-          const auto d_dst  = dst_base + stream_info[stream_count].dst_pos;
-          const auto offset = stream_info[stream_count].offset;
-          auto len          = stream_info[stream_count].length;
-          stream_count++;
-
-          while (stream_count < stream_info.size() &&
-                 stream_info[stream_count].offset == offset + len) {
-            len += stream_info[stream_count].length;
+    // If no rows or stripes to read, return empty columns
+    if (num_rows <= 0 || selected_stripes.empty()) {
+      for (size_t i = 0; i < _selected_columns[0].size();) {
+        auto const& col_meta = _selected_columns[0][i];
+        auto const schema    = _metadata->get_schema(col_meta.id);
+        schema_info.emplace_back("");
+        out_columns.push_back(
+          std::move(create_empty_column(col_meta.id, schema_info.back(), stream)));
+        // Since struct children will be in the same level, have to skip them.
+        i += (schema.kind == orc::STRUCT) ? col_meta.num_children + 1 : 1;
+      }
+      break;
+    } else {
+      // Get the total number of stripes across all input files.
+      size_t total_num_stripes =
+        std::accumulate(selected_stripes.begin(),
+                        selected_stripes.end(),
+                        0,
+                        [](size_t sum, auto& stripe_source_mapping) {
+                          return sum + stripe_source_mapping.stripe_info.size();
+                        });
+      const auto num_columns = selected_columns.size();
+      cudf::detail::hostdevice_2dvector<gpu::ColumnDesc> chunks(
+        total_num_stripes, num_columns, stream);
+      memset(chunks.base_host_ptr(), 0, chunks.memory_size());
+
+      const bool use_index =
+        (_use_index == true) &&
+        // Only use if we don't have much work with complete columns & stripes
+        // TODO: Consider nrows, gpu, and tune the threshold
+        (num_rows > _metadata->get_row_index_stride() && !(_metadata->get_row_index_stride() & 7) &&
+         _metadata->get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) &&
+        // Only use if first row is aligned to a stripe boundary
+        // TODO: Fix logic to handle unaligned rows
+        (skip_rows == 0);
+
+      // Logically view streams as columns
+      std::vector<orc_stream_info> stream_info;
+
+      // Tracker for eventually deallocating compressed and uncompressed data
+      auto& stripe_data = lvl_stripe_data[level];
+
+      size_t stripe_start_row = 0;
+      size_t num_dict_entries = 0;
+      size_t num_rowgroups    = 0;
+      int stripe_idx          = 0;
+
+      for (auto const& stripe_source_mapping : selected_stripes) {
+        // Iterate through the source files selected stripes
+        for (auto const& stripe : stripe_source_mapping.stripe_info) {
+          const auto stripe_info   = stripe.first;
+          const auto stripe_footer = stripe.second;
+
+          auto stream_count          = stream_info.size();
+          const auto total_data_size = gather_stream_info(stripe_idx,
+                                                          stripe_info,
+                                                          stripe_footer,
+                                                          _col_meta.orc_col_map[level],
+                                                          selected_columns,
+                                                          _metadata->get_types(),
+                                                          use_index,
+                                                          &num_dict_entries,
+                                                          chunks,
+                                                          stream_info);
+
+          CUDF_EXPECTS(total_data_size > 0, "Expected streams data within stripe");
+
+          stripe_data.emplace_back(total_data_size, stream);
+          auto dst_base = static_cast<uint8_t*>(stripe_data.back().data());
+
+          // Coalesce consecutive streams into one read
+          while (stream_count < stream_info.size()) {
+            const auto d_dst  = dst_base + stream_info[stream_count].dst_pos;
+            const auto offset = stream_info[stream_count].offset;
+            auto len          = stream_info[stream_count].length;
+            stream_count++;
+
+            while (stream_count < stream_info.size() &&
+                   stream_info[stream_count].offset == offset + len) {
+              len += stream_info[stream_count].length;
+              stream_count++;
+            }
+            if (_metadata->per_file_metadata[stripe_source_mapping.source_idx]
+                  .source->is_device_read_preferred(len)) {
+              CUDF_EXPECTS(
+                _metadata->per_file_metadata[stripe_source_mapping.source_idx].source->device_read(
+                  offset, len, d_dst, stream) == len,
+                "Unexpected discrepancy in bytes read.");
+            } else {
+              const auto buffer =
+                _metadata->per_file_metadata[stripe_source_mapping.source_idx].source->host_read(
+                  offset, len);
+              CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read.");
+              CUDA_TRY(cudaMemcpyAsync(
+                d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value()));
+              stream.synchronize();
+            }
+          }
-          if (_metadata->per_file_metadata[stripe_source_mapping.source_idx]
-                .source->is_device_read_preferred(len)) {
-            CUDF_EXPECTS(
-              _metadata->per_file_metadata[stripe_source_mapping.source_idx].source->device_read(
-                offset, len, d_dst, stream) == len,
-              "Unexpected discrepancy in bytes read.");
-          } else {
-            const auto buffer =
-              _metadata->per_file_metadata[stripe_source_mapping.source_idx].source->host_read(
-                offset, len);
-            CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read.");
-            CUDA_TRY(
-              cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value()));
-            stream.synchronize();
-          }
-        }
-
-        // Update chunks to reference streams pointers
-        for (size_t col_idx = 0; col_idx < num_columns; col_idx++) {
-          auto &chunk         = chunks[stripe_chunk_index * num_columns + col_idx];
-          chunk.start_row     = stripe_start_row;
-          chunk.num_rows      = stripe_info->numberOfRows;
-          chunk.encoding_kind = stripe_footer->columns[_selected_columns[col_idx]].kind;
-          chunk.type_kind     = _metadata->per_file_metadata[stripe_source_mapping.source_idx]
-                                  .ff.types[_selected_columns[col_idx]]
-                                  .kind;
-          auto const decimal_as_float64 = should_convert_decimal_column_to_float(
-            _decimal_cols_as_float, _metadata->per_file_metadata[0], _selected_columns[col_idx]);
-          chunk.decimal_scale = _metadata->per_file_metadata[stripe_source_mapping.source_idx]
-                                  .ff.types[_selected_columns[col_idx]]
-                                  .scale.value_or(0) |
-                                (decimal_as_float64 ? orc::gpu::orc_decimal2float64_scale : 0);
-          chunk.rowgroup_id = num_rowgroups;
-          chunk.dtype_len   = (column_types[col_idx].id() == type_id::STRING)
-                              ? sizeof(string_index_pair)
-                              : cudf::size_of(column_types[col_idx]);
-          if (chunk.type_kind == orc::TIMESTAMP) {
-            chunk.ts_clock_rate = to_clockrate(_timestamp_type.id());
+          const auto num_rows_per_stripe = stripe_info->numberOfRows;
+          const auto rowgroup_id         = num_rowgroups;
+          auto stripe_num_rowgroups      = 0;
+          if (use_index) {
+            stripe_num_rowgroups = (num_rows_per_stripe + _metadata->get_row_index_stride() - 1) /
+                                   _metadata->get_row_index_stride();
           }
-          for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
-            chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
+          // Update chunks to reference streams pointers
+          for (size_t col_idx = 0; col_idx < num_columns; col_idx++) {
+            auto& chunk = chunks[stripe_idx][col_idx];
+            // start row, number of rows in each stripe and total number of rows
+            // may change in lower levels of nesting
+            chunk.start_row = (level == 0)
+                                ? stripe_start_row
+                                : _col_meta.child_start_row[stripe_idx * num_columns + col_idx];
+            chunk.num_rows =
+              (level == 0)
+                ? stripe_info->numberOfRows
+                : _col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx];
+            chunk.column_num_rows = (level == 0) ? num_rows : _col_meta.num_child_rows[col_idx];
+            chunk.encoding_kind   = stripe_footer->columns[selected_columns[col_idx].id].kind;
+            chunk.type_kind       = _metadata->per_file_metadata[stripe_source_mapping.source_idx]
+                                      .ff.types[selected_columns[col_idx].id]
+                                      .kind;
+            auto const decimal_as_float64 =
+              should_convert_decimal_column_to_float(_decimal_cols_as_float,
+                                                     _metadata->per_file_metadata[0],
+                                                     selected_columns[col_idx].id);
+            chunk.decimal_scale = _metadata->per_file_metadata[stripe_source_mapping.source_idx]
+                                    .ff.types[selected_columns[col_idx].id]
+                                    .scale.value_or(0) |
+                                  (decimal_as_float64 ? orc::gpu::orc_decimal2float64_scale : 0);
+
+            chunk.rowgroup_id = rowgroup_id;
+            chunk.dtype_len   = (column_types[col_idx].id() == type_id::STRING)
+                                ? sizeof(string_index_pair)
+                                : ((column_types[col_idx].id() == type_id::LIST) or
+                                   (column_types[col_idx].id() == type_id::STRUCT))
sizeof(int32_t) + : cudf::size_of(column_types[col_idx]); + chunk.num_rowgroups = stripe_num_rowgroups; + if (chunk.type_kind == orc::TIMESTAMP) { + chunk.ts_clock_rate = to_clockrate(_timestamp_type.id()); + } + for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; + } } - } + stripe_start_row += num_rows_per_stripe; + num_rowgroups += stripe_num_rowgroups; - stripe_start_row += stripe_info->numberOfRows; - if (use_index) { - num_rowgroups += (stripe_info->numberOfRows + _metadata->get_row_index_stride() - 1) / - _metadata->get_row_index_stride(); + stripe_idx++; } - stripe_chunk_index++; } - } - // Process dataset chunk pages into output columns - if (stripe_data.size() != 0) { - // Setup row group descriptors if using indexes - rmm::device_uvector row_groups(num_rowgroups * num_columns, stream); - if (_metadata->per_file_metadata[0].ps.compression != orc::NONE) { - auto decomp_data = - decompress_stripe_data(chunks, - stripe_data, - _metadata->per_file_metadata[0].decompressor.get(), - stream_info, - total_num_stripes, - row_groups, - _metadata->get_row_index_stride(), - stream); - stripe_data.clear(); - stripe_data.push_back(std::move(decomp_data)); - } else { - if (not row_groups.is_empty()) { - chunks.host_to_device(stream); - gpu::ParseRowGroupIndex(row_groups.data(), - nullptr, - chunks.device_ptr(), - num_columns, - total_num_stripes, - num_rowgroups, - _metadata->get_row_index_stride(), - stream); + // Process dataset chunk pages into output columns + if (stripe_data.size() != 0) { + auto row_groups = + cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, stream); + if (level > 0 and row_groups.size().first) { + cudf::host_span row_groups_span(row_groups.base_host_ptr(), + num_rowgroups * num_columns); + auto& rw_grp_meta = _col_meta.rwgrp_meta; + + // Update start row and num rows per row group + std::transform(rw_grp_meta.begin(), + rw_grp_meta.end(), + row_groups_span.begin(), + rw_grp_meta.begin(), + [&](auto meta, auto& row_grp) { + row_grp.num_rows = meta.num_rows; + row_grp.start_row = meta.start_row; + return meta; + }); + } + // Setup row group descriptors if using indexes + if (_metadata->per_file_metadata[0].ps.compression != orc::NONE) { + auto decomp_data = + decompress_stripe_data(chunks, + stripe_data, + _metadata->per_file_metadata[0].decompressor.get(), + stream_info, + total_num_stripes, + row_groups, + _metadata->get_row_index_stride(), + level == 0, + stream); + stripe_data.clear(); + stripe_data.push_back(std::move(decomp_data)); + } else { + if (row_groups.size().first) { + chunks.host_to_device(stream); + row_groups.host_to_device(stream); + gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), + nullptr, + chunks.base_device_ptr(), + num_columns, + total_num_stripes, + num_rowgroups, + _metadata->get_row_index_stride(), + level == 0, + stream); + } } - } - // Setup table for converting timestamp columns from local to UTC time - auto const tz_table = _has_timestamp_column - ? build_timezone_transition_table( - selected_stripes[0].stripe_info[0].second->writerTimezone, stream) - : timezone_table{}; - - std::vector out_buffers; - for (size_t i = 0; i < column_types.size(); ++i) { - bool is_nullable = false; - for (size_t j = 0; j < total_num_stripes; ++j) { - if (chunks[j * num_columns + i].strm_len[gpu::CI_PRESENT] != 0) { - is_nullable = true; - break; + // Setup table for converting timestamp columns from local to UTC time + auto const tz_table = + _has_timestamp_column + ? 
build_timezone_transition_table(
+            selected_stripes[0].stripe_info[0].second->writerTimezone, stream)
+        : timezone_table{};
+
+      for (size_t i = 0; i < column_types.size(); ++i) {
+        bool is_nullable = false;
+        for (size_t j = 0; j < total_num_stripes; ++j) {
+          if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) {
+            is_nullable = true;
+            break;
+          }
+        }
+        auto is_list_type = (column_types[i].id() == type_id::LIST);
+        auto n_rows       = (level == 0) ? num_rows : _col_meta.num_child_rows[i];
+        // For list column, offset column will be always size + 1
+        if (is_list_type) n_rows++;
+        out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, stream, _mr);
       }
-      out_buffers.emplace_back(column_types[i], num_rows, is_nullable, stream, _mr);
-    }
-    decode_stream_data(chunks,
-                       num_dict_entries,
-                       skip_rows,
-                       num_rows,
-                       tz_table.view(),
-                       row_groups,
-                       _metadata->get_row_index_stride(),
-                       out_buffers,
-                       stream);
-
-    for (size_t i = 0; i < column_types.size(); ++i) {
-      out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, _mr));
+      decode_stream_data(chunks,
+                         num_dict_entries,
+                         skip_rows,
+                         tz_table.view(),
+                         row_groups,
+                         _metadata->get_row_index_stride(),
+                         out_buffers[level],
+                         level,
+                         stream);
+
+      // Extract information to process list child columns
+      if (list_col.size()) {
+        row_groups.device_to_host(stream, true);
+        aggregate_child_meta(chunks, row_groups, list_col, level);
+      }
+
+      // ORC stores number of elements at each row, so we need to generate offsets from that
+      if (list_col.size()) {
+        std::vector<list_buffer_data> buff_data;
+        std::for_each(
+          out_buffers[level].begin(), out_buffers[level].end(), [&buff_data](auto& out_buffer) {
+            if (out_buffer.type.id() == type_id::LIST) {
+              auto data = static_cast<size_type*>(out_buffer.data());
+              buff_data.emplace_back(list_buffer_data{data, out_buffer.size});
+            }
+          });
+
+        auto const dev_buff_data = cudf::detail::make_device_uvector_async(buff_data, stream);
+        generate_offsets_for_list(dev_buff_data, stream);
+      }
+    }
     }
   }
 
-  // Return column names (must match order of returned columns)
-  out_metadata.column_names.resize(_selected_columns.size());
-  for (size_t i = 0; i < _selected_columns.size(); i++) {
-    out_metadata.column_names[i] = _metadata->get_column_name(0, _selected_columns[i]);
+  // If out_columns is empty, then create columns from buffer.
+  if (out_columns.empty()) {
+    create_columns(std::move(out_buffers), out_columns, schema_info, stream);
   }
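// For intuition: generate_offsets_for_list above turns ORC's per-row element
// counts into cudf offsets, i.e. an exclusive prefix sum with one extra
// trailing entry (which is why list buffers are allocated with n_rows + 1).
// A minimal host-side sketch of the same transform, with illustrative names
// that are not part of this patch:
#include <cstdint>
#include <numeric>
#include <vector>

std::vector<int32_t> lengths_to_offsets(std::vector<int32_t> const& lengths)
{
  // offsets[0] = 0; offsets[i + 1] = lengths[0] + ... + lengths[i]
  std::vector<int32_t> offsets(lengths.size() + 1, 0);
  std::partial_sum(lengths.begin(), lengths.end(), offsets.begin() + 1);
  return offsets;  // e.g. lengths {2, 0, 3} -> offsets {0, 2, 2, 5}
}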
-  for (const auto &meta : _metadata->per_file_metadata) {
-    for (const auto &kv : meta.ff.metadata) { out_metadata.user_data.insert({kv.name, kv.value}); }
+  // Return column names (must match order of returned columns)
+  out_metadata.column_names.reserve(schema_info.size());
+  std::transform(schema_info.cbegin(),
+                 schema_info.cend(),
+                 std::back_inserter(out_metadata.column_names),
+                 [](auto info) { return info.name; });
+
+  out_metadata.schema_info = std::move(schema_info);
+
+  for (const auto& meta : _metadata->per_file_metadata) {
+    for (const auto& kv : meta.ff.metadata) {
+      out_metadata.user_data.insert({kv.name, kv.value});
+    }
   }
 
   return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
 }
 
 // Forward to implementation
-reader::reader(std::vector<std::string> const &filepaths,
-               orc_reader_options const &options,
+reader::reader(std::vector<std::string> const& filepaths,
+               orc_reader_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource *mr)
+               rmm::mr::device_memory_resource* mr)
 {
   _impl = std::make_unique<impl>(datasource::create(filepaths), options, mr);
 }
 
 // Forward to implementation
-reader::reader(std::vector<std::unique_ptr<datasource>> &&sources,
-               orc_reader_options const &options,
+reader::reader(std::vector<std::unique_ptr<datasource>>&& sources,
+               orc_reader_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource *mr)
+               rmm::mr::device_memory_resource* mr)
 {
   _impl = std::make_unique<impl>(std::move(sources), options, mr);
 }
@@ -948,7 +1287,7 @@ reader::reader(std::vector<std::unique_ptr<datasource>>&& sources,
 reader::~reader() = default;
 
 // Forward to implementation
-table_with_metadata reader::read(orc_reader_options const &options, rmm::cuda_stream_view stream)
+table_with_metadata reader::read(orc_reader_options const& options, rmm::cuda_stream_view stream)
 {
   return _impl->read(
     options.get_skip_rows(), options.get_num_rows(), options.get_stripes(), stream);
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 0307d84cd1b..1769fb6f193 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -48,6 +48,24 @@ struct stripe_source_mapping;
 }  // namespace
 class aggregate_orc_metadata;
 
+/**
+ * @brief Keeps track of orc mapping and child column details.
+ */
+struct reader_column_meta {
+  std::vector<std::vector<int32_t>>
+    orc_col_map;                         // Mapping between column id in orc to processing order.
+  std::vector<uint32_t> num_child_rows;  // number of rows in child columns
+  std::vector<uint32_t> child_start_row;  // start row of child columns [stripe][column]
+  std::vector<uint32_t>
+    num_child_rows_per_stripe;  // number of rows of child columns [stripe][column]
+  struct row_group_meta {
+    uint32_t num_rows;   // number of rows in a column in a row group
+    uint32_t start_row;  // start row in a column in a row group
+  };
+  // num_rowgroups * num_columns
+  std::vector<row_group_meta> rwgrp_meta;  // rowgroup metadata [rowgroup][column]
+};
+
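// A toy illustration of what reader_column_meta::orc_col_map encodes: one
// vector per nesting level, indexed by ORC column id, holding that column's
// processing slot at the level. The -1 sentinel and the example schema below
// are assumptions for illustration, not taken from this patch.
#include <cstdio>
#include <vector>

int main()
{
  // Hypothetical schema struct<a:int, b:list<int>>: level 0 processes ORC
  // ids 1 ("a") and 2 ("b"); level 1 processes id 3 (the list's elements).
  std::vector<std::vector<int>> orc_col_map = {
    /*level 0*/ {-1, 0, 1, -1},
    /*level 1*/ {-1, -1, -1, 0},
  };
  std::printf("orc column 2 maps to slot %d of level 0\n", orc_col_map[0][2]);
  return 0;
}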
 /**
  * @brief Implementation for ORC reader
  */
@@ -60,9 +78,9 @@ class reader::impl {
    * @param options Settings for controlling reading behavior
    * @param mr Device memory resource to use for device memory allocation
    */
-  explicit impl(std::vector<std::unique_ptr<datasource>> &&sources,
-                orc_reader_options const &options,
-                rmm::mr::device_memory_resource *mr);
+  explicit impl(std::vector<std::unique_ptr<datasource>>&& sources,
+                orc_reader_options const& options,
+                rmm::mr::device_memory_resource* mr);
 
   /**
    * @brief Read an entire set or a subset of data and returns a set of columns
@@ -76,68 +94,123 @@ class reader::impl {
    */
   table_with_metadata read(size_type skip_rows,
                            size_type num_rows,
-                           const std::vector<std::vector<size_type>> &stripes,
+                           const std::vector<std::vector<size_type>>& stripes,
                            rmm::cuda_stream_view stream);
 
  private:
   /**
    * @brief Decompresses the stripe data, at stream granularity
    *
-   * @param chunks List of column chunk descriptors
+   * @param chunks Vector of list of column chunk descriptors
    * @param stripe_data List of source stripe column data
    * @param decompressor Originally host decompressor
    * @param stream_info List of stream to column mappings
    * @param num_stripes Number of stripes making up column chunks
-   * @param row_groups List of row index descriptors
+   * @param row_groups Vector of list of row index descriptors
    * @param row_index_stride Distance between each row index
+   * @param use_base_stride Whether to use base stride obtained from meta or use the computed value
    * @param stream CUDA stream used for device memory operations and kernel launches.
    *
    * @return Device buffer to decompressed page data
    */
-  rmm::device_buffer decompress_stripe_data(hostdevice_vector<gpu::ColumnDesc> &chunks,
-                                            const std::vector<rmm::device_buffer> &stripe_data,
-                                            const OrcDecompressor *decompressor,
-                                            std::vector<orc_stream_info> &stream_info,
-                                            size_t num_stripes,
-                                            device_span<gpu::RowGroup> row_groups,
-                                            size_t row_index_stride,
-                                            rmm::cuda_stream_view stream);
+  rmm::device_buffer decompress_stripe_data(
+    cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
+    const std::vector<rmm::device_buffer>& stripe_data,
+    const OrcDecompressor* decompressor,
+    std::vector<orc_stream_info>& stream_info,
+    size_t num_stripes,
+    cudf::detail::hostdevice_2dvector<gpu::RowGroup>& row_groups,
+    size_t row_index_stride,
+    bool use_base_stride,
+    rmm::cuda_stream_view stream);
 
   /**
    * @brief Converts the stripe column data and outputs to columns
    *
-   * @param chunks List of column chunk descriptors
+   * @param chunks Vector of list of column chunk descriptors
    * @param num_dicts Number of dictionary entries required
    * @param skip_rows Number of rows to offset from start
-   * @param num_rows Number of rows to output
    * @param tz_table Local time to UTC conversion table
-   * @param row_groups List of row index descriptors
+   * @param row_groups Vector of list of row index descriptors
    * @param row_index_stride Distance between each row index
    * @param out_buffers Output columns' device buffers
+   * @param level Current nesting level being processed
    * @param stream CUDA stream used for device memory operations and kernel launches.
    */
-  void decode_stream_data(hostdevice_vector<gpu::ColumnDesc> &chunks,
+  void decode_stream_data(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
                           size_t num_dicts,
                           size_t skip_rows,
-                          size_t num_rows,
                           timezone_table_view tz_table,
-                          device_span<gpu::RowGroup> row_groups,
+                          cudf::detail::hostdevice_2dvector<gpu::RowGroup>& row_groups,
                           size_t row_index_stride,
-                          std::vector<column_buffer> &out_buffers,
+                          std::vector<column_buffer>& out_buffers,
+                          size_t level,
                           rmm::cuda_stream_view stream);
 
+  /**
+   * @brief Aggregate child metadata from parent column chunks.
+   *
+   * @param chunks Vector of list of parent column chunks.
+   * @param row_groups Vector of list of parent column row groups.
+   * @param list_col Vector of column metadata of list type parent columns.
+   * @param level Current nesting level being processed.
+   */
+  void aggregate_child_meta(cudf::detail::host_2dspan<gpu::ColumnDesc> chunks,
+                            cudf::detail::host_2dspan<gpu::RowGroup> row_groups,
+                            std::vector<orc_column_meta> const& list_col,
+                            const int32_t level);
+
+  /**
+   * @brief Assemble the buffer with child columns.
+   *
+   * @param orc_col_id Column id in orc.
+   * @param col_buffers Column buffers for columns and children.
+   * @param level Current nesting level.
+   */
+  column_buffer&& assemble_buffer(const int32_t orc_col_id,
+                                  std::vector<std::vector<column_buffer>>& col_buffers,
+                                  const size_t level);
+
+  /**
+   * @brief Create columns and respective schema information from the buffer.
+   *
+   * @param col_buffers Column buffers for columns and children.
+   * @param out_columns Vector of columns formed from column buffers.
+   * @param schema_info Vector of schema information formed from column buffers.
+   * @param stream CUDA stream used for device memory operations and kernel launches.
+   */
+  void create_columns(std::vector<std::vector<column_buffer>>&& col_buffers,
+                      std::vector<std::unique_ptr<column>>& out_columns,
+                      std::vector<column_name_info>& schema_info,
+                      rmm::cuda_stream_view stream);
+
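// The assemble_buffer/create_columns pair declared above walks the per-level
// buffers depth-first: a parent at level L claims its children from level
// L + 1 before it is turned into a column. A simplified standalone sketch of
// that recursion (the node type and names here are assumptions, not the
// patch's types):
#include <cstddef>
#include <vector>

struct node_buffer {
  int orc_id = -1;
  std::vector<int> child_ids;          // ORC ids of the children, empty for leaves
  std::vector<node_buffer*> children;  // filled in while assembling
};

// levels[L] holds the buffers decoded at nesting level L, in processing order.
node_buffer* assemble(int orc_id, std::vector<std::vector<node_buffer>>& levels, std::size_t level)
{
  for (auto& candidate : levels[level]) {
    if (candidate.orc_id != orc_id) continue;
    // Depth-first: resolve children one level down before returning the parent.
    for (int child : candidate.child_ids) {
      candidate.children.push_back(assemble(child, levels, level + 1));
    }
    return &candidate;
  }
  return nullptr;  // orc_id not decoded at this level
}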
+  /**
+   * @brief Create empty columns and respective schema information from the buffer.
+   *
+   * @param orc_col_id Column id in orc.
+   * @param schema_info Vector of schema information formed from column buffers.
+   * @param stream CUDA stream used for device memory operations and kernel launches.
+   *
+   * @return An empty column equivalent to orc column type.
+   */
+  std::unique_ptr<column> create_empty_column(const int32_t orc_col_id,
+                                              column_name_info& schema_info,
+                                              rmm::cuda_stream_view stream);
+
  private:
-  rmm::mr::device_memory_resource *_mr = nullptr;
+  rmm::mr::device_memory_resource* _mr = nullptr;
   std::vector<std::unique_ptr<datasource>> _sources;
   std::unique_ptr<aggregate_orc_metadata> _metadata;
   // _output_columns associated schema indices
-  std::vector<size_type> _selected_columns;
+  std::vector<std::vector<orc_column_meta>> _selected_columns;
 
   bool _use_index = true;
   bool _use_np_dtypes = true;
   bool _has_timestamp_column = false;
+  bool _has_list_column = false;
   std::vector<std::string> _decimal_cols_as_float;
   data_type _timestamp_type{type_id::EMPTY};
+  reader_column_meta _col_meta;
 };
 
 }  // namespace orc
diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu
index 4c85150a9f0..517a1e0e689 100644
--- a/cpp/src/io/orc/stats_enc.cu
+++ b/cpp/src/io/orc/stats_enc.cu
@@ -39,8 +39,8 @@ constexpr unsigned int init_groups_per_block = 4;
 constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block;
 
 __global__ void __launch_bounds__(init_threads_per_block)
-  gpu_init_statistics_groups(statistics_group *groups,
-                             const stats_column_desc *cols,
+  gpu_init_statistics_groups(statistics_group* groups,
+                             const stats_column_desc* cols,
                              uint32_t num_columns,
                              uint32_t num_rowgroups,
                              uint32_t row_index_stride)
@@ -49,7 +49,7 @@ __global__ void __launch_bounds__(init_threads_per_block)
   uint32_t col_id   = blockIdx.y;
   uint32_t chunk_id = (blockIdx.x * init_groups_per_block) + threadIdx.y;
   uint32_t t        = threadIdx.x;
-  statistics_group *group = &group_g[threadIdx.y];
+  statistics_group* group = &group_g[threadIdx.y];
   if (chunk_id < num_rowgroups and t == 0) {
     uint32_t num_rows = cols[col_id].leaf_column->size();
     group->col        = &cols[col_id];
@@ -78,8 +78,8 @@ constexpr unsigned int pb_fldlen_common = 2 * pb_fld_hdrlen + pb_fldlen_int64;
 
 template <unsigned int block_size>
 __global__ void __launch_bounds__(block_size, 1)
-  gpu_init_statistics_buffersize(statistics_merge_group *groups,
-                                 const statistics_chunk *chunks,
+  gpu_init_statistics_buffersize(statistics_merge_group* groups,
+                                 const statistics_chunk* chunks,
                                  uint32_t statistics_count)
 {
   using block_scan = cub::BlockScan<uint32_t, block_size>;
@@ -91,7 +91,7 @@ __global__ void __launch_bounds__(block_size, 1)
   uint32_t stats_len = 0, stats_pos;
   uint32_t idx = start + t;
   if (idx < statistics_count) {
-    const stats_column_desc *col = groups[idx].col;
+    const stats_column_desc* col = groups[idx].col;
     statistics_dtype dtype       = col->stats_dtype;
     switch (dtype) {
       case dtype_bool: stats_len = pb_fldlen_common + pb_fld_hdrlen + pb_fldlen_bucket1; break;
@@ -131,8 +131,8 @@ __global__ void __launch_bounds__(block_size, 1)
 }
 
 struct stats_state_s {
-  uint8_t *base;  ///< Output buffer start
-  uint8_t *end;   ///< Output buffer end
+  uint8_t* base;  ///< Output buffer start
+  uint8_t* end;   ///< Output buffer end
   statistics_chunk chunk;
   statistics_merge_group group;
   stats_column_desc col;
@@ -146,7 +146,7 @@ struct stats_state_s {
  * https://developers.google.com/protocol-buffers/docs/encoding
  */
 // Protobuf varint encoding for unsigned int
-__device__ inline uint8_t *pb_encode_uint(uint8_t *p, uint64_t v)
+__device__ inline uint8_t* pb_encode_uint(uint8_t* p, uint64_t v)
 {
   while (v > 0x7f) {
     *p++ = ((uint32_t)v | 0x80);
@@ -157,30 +157,30 @@ __device__ inline uint8_t *pb_encode_uint(uint8_t *p, uint64_t v)
 }
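// For reference, pb_encode_uint's LEB128 varint layout plus the zigzag
// mapping used by pb_put_int below ((v ^ -s) * 2 + s), as a host-side
// round-trip check. This is a sketch with illustrative names, not part of
// the patch:
#include <cassert>
#include <cstdint>

static uint8_t* host_encode_uint(uint8_t* p, uint64_t v)
{
  while (v > 0x7f) {
    *p++ = static_cast<uint8_t>(v) | 0x80;  // low 7 bits, continuation bit set
    v >>= 7;
  }
  *p++ = static_cast<uint8_t>(v);
  return p;
}

static uint64_t host_decode_uint(const uint8_t* p)
{
  uint64_t v = 0;
  for (int shift = 0;; shift += 7) {
    uint8_t b = *p++;
    v |= static_cast<uint64_t>(b & 0x7f) << shift;
    if (!(b & 0x80)) return v;
  }
}

int main()
{
  uint8_t buf[10];
  host_encode_uint(buf, 300);  // 300 encodes as 0xAC 0x02
  assert(host_decode_uint(buf) == 300);
  int64_t v = -3, s = (v < 0);
  assert(uint64_t((v ^ -s) * 2 + s) == 5);  // zigzag: -3 -> 5
  return 0;
}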
// Protobuf field encoding for unsigned int -__device__ inline uint8_t *pb_put_uint(uint8_t *p, uint32_t id, uint64_t v) +__device__ inline uint8_t* pb_put_uint(uint8_t* p, uint32_t id, uint64_t v) { p[0] = id * 8 + PB_TYPE_VARINT; // NOTE: Assumes id < 16 return pb_encode_uint(p + 1, v); } // Protobuf field encoding for signed int -__device__ inline uint8_t *pb_put_int(uint8_t *p, uint32_t id, int64_t v) +__device__ inline uint8_t* pb_put_int(uint8_t* p, uint32_t id, int64_t v) { int64_t s = (v < 0); return pb_put_uint(p, id, (v ^ -s) * 2 + s); } // Protobuf field encoding for 'packed' unsigned int (single value) -__device__ inline uint8_t *pb_put_packed_uint(uint8_t *p, uint32_t id, uint64_t v) +__device__ inline uint8_t* pb_put_packed_uint(uint8_t* p, uint32_t id, uint64_t v) { - uint8_t *p2 = pb_encode_uint(p + 2, v); + uint8_t* p2 = pb_encode_uint(p + 2, v); p[0] = id * 8 + PB_TYPE_FIXEDLEN; p[1] = static_cast(p2 - (p + 2)); return p2; } // Protobuf field encoding for binary/string -__device__ inline uint8_t *pb_put_binary(uint8_t *p, uint32_t id, const void *bytes, uint32_t len) +__device__ inline uint8_t* pb_put_binary(uint8_t* p, uint32_t id, const void* bytes, uint32_t len) { p[0] = id * 8 + PB_TYPE_FIXEDLEN; p = pb_encode_uint(p + 1, len); @@ -189,7 +189,7 @@ __device__ inline uint8_t *pb_put_binary(uint8_t *p, uint32_t id, const void *by } // Protobuf field encoding for 64-bit raw encoding (double) -__device__ inline uint8_t *pb_put_fixed64(uint8_t *p, uint32_t id, const void *raw64) +__device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, const void* raw64) { p[0] = id * 8 + PB_TYPE_FIXED64; memcpy(p + 1, raw64, 8); @@ -226,15 +226,15 @@ constexpr unsigned int encode_threads_per_block = encode_threads_per_chunk * encode_chunks_per_block; __global__ void __launch_bounds__(encode_threads_per_block) - gpu_encode_statistics(uint8_t *blob_bfr, - statistics_merge_group *groups, - const statistics_chunk *chunks, + gpu_encode_statistics(uint8_t* blob_bfr, + statistics_merge_group* groups, + const statistics_chunk* chunks, uint32_t statistics_count) { __shared__ __align__(8) stats_state_s state_g[encode_chunks_per_block]; uint32_t t = threadIdx.x; uint32_t idx = blockIdx.x * encode_chunks_per_block + threadIdx.y; - stats_state_s *const s = &state_g[threadIdx.y]; + stats_state_s* const s = &state_g[threadIdx.y]; // Encode and update actual bfr size if (idx < statistics_count && t == 0) { @@ -243,8 +243,8 @@ __global__ void __launch_bounds__(encode_threads_per_block) s->col = *(s->group.col); s->base = blob_bfr + s->group.start_chunk; s->end = blob_bfr + s->group.start_chunk + s->group.num_chunks; - uint8_t *cur = pb_put_uint(s->base, 1, s->chunk.non_nulls); - uint8_t *fld_start = cur; + uint8_t* cur = pb_put_uint(s->base, 1, s->chunk.non_nulls); + uint8_t* fld_start = cur; switch (s->col.stats_dtype) { case dtype_int8: case dtype_int16: @@ -373,8 +373,8 @@ __global__ void __launch_bounds__(encode_threads_per_block) * @param[in] row_index_stride Rowgroup size in rows * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ -void orc_init_statistics_groups(statistics_group *groups, - const stats_column_desc *cols, +void orc_init_statistics_groups(statistics_group* groups, + const stats_column_desc* cols, uint32_t num_columns, uint32_t num_rowgroups, uint32_t row_index_stride, @@ -394,8 +394,8 @@ void orc_init_statistics_groups(statistics_group *groups, * @param[in] statistics_count Number of statistics buffers to encode * @param[in] stream CUDA stream to 
use, default `rmm::cuda_stream_default` */ -void orc_init_statistics_buffersize(statistics_merge_group *groups, - const statistics_chunk *chunks, +void orc_init_statistics_buffersize(statistics_merge_group* groups, + const statistics_chunk* chunks, uint32_t statistics_count, rmm::cuda_stream_view stream) { @@ -411,9 +411,9 @@ void orc_init_statistics_buffersize(statistics_merge_group *groups, * @param[in,out] chunks Statistics data * @param[in] statistics_count Number of statistics buffers */ -void orc_encode_statistics(uint8_t *blob_bfr, - statistics_merge_group *groups, - const statistics_chunk *chunks, +void orc_encode_statistics(uint8_t* blob_bfr, + statistics_merge_group* groups, + const statistics_chunk* chunks, uint32_t statistics_count, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 6bc0e475a27..903f9475e2a 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -50,7 +50,7 @@ struct int128_s { }; struct orc_bytestream_s { - const uint8_t *base; + const uint8_t* base; uint32_t pos; uint32_t len; uint32_t fill_pos; @@ -93,7 +93,7 @@ struct orc_rowdec_state_s { }; struct orc_strdict_state_s { - DictionaryEntry *local_dict; + DictionaryEntry* local_dict; uint32_t dict_pos; uint32_t dict_len; }; @@ -113,6 +113,7 @@ struct orcdec_state_s { orc_bytestream_s bs; orc_bytestream_s bs2; int is_string; + uint64_t num_child_rows; union { orc_strdict_state_s dict; uint32_t nulls_desc_row; // number of rows processed for nulls. @@ -145,8 +146,8 @@ struct orcdec_state_s { * @param[in] base Pointer to raw byte stream data * @param[in] len Stream length in bytes */ -static __device__ void bytestream_init(volatile orc_bytestream_s *bs, - const uint8_t *base, +static __device__ void bytestream_init(volatile orc_bytestream_s* bs, + const uint8_t* base, uint32_t len) { uint32_t pos = (len > 0) ? 
static_cast(7 & reinterpret_cast(base)) : 0; @@ -163,7 +164,7 @@ static __device__ void bytestream_init(volatile orc_bytestream_s *bs, * @param[in] bs Byte stream input * @param[in] bytes_consumed Number of bytes that were consumed */ -static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s *bs, +static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s* bs, uint32_t bytes_consumed) { uint32_t pos = bs->pos; @@ -182,7 +183,7 @@ static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s *bs, * @param[in] bs Byte stream input * @param[in] t thread id */ -static __device__ void bytestream_fill(orc_bytestream_s *bs, int t) +static __device__ void bytestream_fill(orc_bytestream_s* bs, int t) { auto const count = bs->fill_count; if (t < count) { @@ -200,7 +201,7 @@ static __device__ void bytestream_fill(orc_bytestream_s *bs, int t) * @param[in] pos Position in byte stream * @return byte */ -inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s *bs, int pos) +inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int pos) { return bs->buf.u8[pos & (bytestream_buffer_size - 1)]; } @@ -212,7 +213,7 @@ inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s *bs, int * @param[in] pos Position in byte stream * @result bits */ -inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s *bs, int pos) +inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -227,7 +228,7 @@ inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s *bs, int * @param[in] numbits number of bits * @return bits */ -inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s *bs, int pos) +inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -248,7 +249,7 @@ inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s *bs, int * @param[in] numbits number of bits * @return decoded value */ -inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s *bs, +inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, int bitpos, uint32_t numbits) { @@ -266,7 +267,7 @@ inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s *bs, * @param[in] numbits number of bits * @return decoded value */ -inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s *bs, +inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, int bitpos, uint32_t numbits) { @@ -291,10 +292,10 @@ inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s *bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, +inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, int bitpos, uint32_t numbits, - uint32_t &result) + uint32_t& result) { result = bytestream_readbits(bs, bitpos, numbits); } @@ -307,10 +308,10 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, +inline __device__ void 
bytestream_readbe(volatile orc_bytestream_s* bs, int bitpos, uint32_t numbits, - int32_t &result) + int32_t& result) { uint32_t u = bytestream_readbits(bs, bitpos, numbits); result = (int32_t)((u >> 1u) ^ -(int32_t)(u & 1)); @@ -324,10 +325,10 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, +inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, int bitpos, uint32_t numbits, - uint64_t &result) + uint64_t& result) { result = bytestream_readbits64(bs, bitpos, numbits); } @@ -340,10 +341,10 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, +inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, int bitpos, uint32_t numbits, - int64_t &result) + int64_t& result) { uint64_t u = bytestream_readbits64(bs, bitpos, numbits); result = (int64_t)((u >> 1u) ^ -(int64_t)(u & 1)); @@ -357,7 +358,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s *bs, * @return length of varint in bytes */ template -inline __device__ uint32_t varint_length(volatile orc_bytestream_s *bs, int pos) +inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) { if (bytestream_readbyte(bs, pos) > 0x7f) { uint32_t next32 = bytestream_readu32(bs, pos + 1); @@ -395,7 +396,7 @@ inline __device__ uint32_t varint_length(volatile orc_bytestream_s *bs, int pos) * @return new position in byte stream buffer */ template -inline __device__ int decode_base128_varint(volatile orc_bytestream_s *bs, int pos, T &result) +inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int pos, T& result) { uint32_t v = bytestream_readbyte(bs, pos++); if (v > 0x7f) { @@ -449,7 +450,7 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s *bs, int p /** * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals) */ -inline __device__ int128_s decode_varint128(volatile orc_bytestream_s *bs, int pos) +inline __device__ int128_s decode_varint128(volatile orc_bytestream_s* bs, int pos) { uint32_t b = bytestream_readbyte(bs, pos++); int64_t sign_mask = -(int32_t)(b & 1); @@ -477,7 +478,7 @@ inline __device__ int128_s decode_varint128(volatile orc_bytestream_s *bs, int p /** * @brief Decodes an unsigned 32-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, uint32_t &result) +inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -488,7 +489,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, uint /** * @brief Decodes an unsigned 64-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, uint64_t &result) +inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -499,7 +500,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, uint /** * @brief Signed version of 32-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, int32_t &result) +inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int32_t& result) { 
uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -510,7 +511,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, int3 /** * @brief Signed version of 64-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, int64_t &result) +inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -528,7 +529,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s *bs, int pos, int6 * @return number of values decoded */ template -inline __device__ void lengths_to_positions(volatile T *vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -549,7 +550,7 @@ inline __device__ void lengths_to_positions(volatile T *vals, uint32_t numvals, */ template static __device__ uint32_t Integer_RLEv1( - orc_bytestream_s *bs, volatile orc_rlev1_state_s *rle, volatile T *vals, uint32_t maxvals, int t) + orc_bytestream_s* bs, volatile orc_rlev1_state_s* rle, volatile T* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; if (t == 0) { @@ -602,7 +603,9 @@ static __device__ uint32_t Integer_RLEv1( int delta = run_data >> 24; uint32_t base = run_data & 0x3ff; uint32_t pos = vals[base] & 0xffff; - for (int i = 1 + tr; i < n; i += 32) { vals[base + i] = ((delta * i) << 16) | pos; } + for (int i = 1 + tr; i < n; i += 32) { + vals[base + i] = ((delta * i) << 16) | pos; + } } __syncthreads(); } @@ -648,12 +651,17 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { * @param[in] vals buffer for output values (uint32_t, int32_t, uint64_t or int64_t) * @param[in] maxvals maximum number of values to decode * @param[in] t thread id + * @param[in] has_buffered_values If true, means there are already buffered values * * @return number of values decoded */ template -static __device__ uint32_t Integer_RLEv2( - orc_bytestream_s *bs, volatile orc_rlev2_state_s *rle, volatile T *vals, uint32_t maxvals, int t) +static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, + volatile orc_rlev2_state_s* rle, + volatile T* vals, + uint32_t maxvals, + int t, + bool has_buffered_values = false) { uint32_t numvals, numruns; int r, tr; @@ -700,11 +708,16 @@ static __device__ uint32_t Integer_RLEv2( l += deltapos; } } - if (numvals + n > maxvals) break; + if ((numvals != 0) and (numvals + n > maxvals)) break; + // case where there are buffered values and can't consume a whole chunk + // from decoded values, so skip adding any more to buffer, work on buffered values and then + // start fresh in next iteration with empty buffer. + if ((numvals == 0) and (n > maxvals) and (has_buffered_values)) break; + pos += l; if (pos > maxpos) break; + ((numvals == 0) and (n > maxvals)) ? 
numvals = maxvals : numvals += n; lastpos = pos; - numvals += n; numruns++; } rle->num_vals = numvals; @@ -864,7 +877,9 @@ static __device__ uint32_t Integer_RLEv2( baseval = rle->baseval.u32[r]; else baseval = rle->baseval.u64[r]; - for (uint32_t j = tr; j < n; j += 32) { vals[base + j] += baseval; } + for (uint32_t j = tr; j < n; j += 32) { + vals[base + j] += baseval; + } } } __syncthreads(); @@ -879,7 +894,7 @@ static __device__ uint32_t Integer_RLEv2( * * @return 32-bit value */ -inline __device__ uint32_t rle8_read_bool32(volatile uint32_t *vals, uint32_t bitpos) +inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bitpos) { uint32_t a = vals[(bitpos >> 5) + 0]; uint32_t b = vals[(bitpos >> 5) + 1]; @@ -899,9 +914,9 @@ inline __device__ uint32_t rle8_read_bool32(volatile uint32_t *vals, uint32_t bi * * @return number of values decoded */ -static __device__ uint32_t Byte_RLE(orc_bytestream_s *bs, - volatile orc_byterle_state_s *rle, - volatile uint8_t *vals, +static __device__ uint32_t Byte_RLE(orc_bytestream_s* bs, + volatile orc_byterle_state_s* rle, + volatile uint8_t* vals, uint32_t maxvals, int t) { @@ -926,9 +941,10 @@ static __device__ uint32_t Byte_RLE(orc_bytestream_s *bs, n = 0x100 - n; pos += n; } - if (pos > maxpos || numvals + n > maxvals) { break; } + if ((numvals != 0) and (numvals + n > maxvals)) break; + if (pos > maxpos) break; numruns++; - numvals += n; + ((numvals == 0) and (n > maxvals)) ? numvals = maxvals : numvals += n; lastpos = pos; } rle->num_runs = numruns; @@ -1009,9 +1025,9 @@ static const __device__ __constant__ int64_t kPow5i[28] = {1, * * @return number of values decoded */ -static __device__ int Decode_Decimals(orc_bytestream_s *bs, - volatile orc_byterle_state_s *scratch, - volatile orcdec_state_s::values &vals, +static __device__ int Decode_Decimals(orc_bytestream_s* bs, + volatile orc_byterle_state_s* scratch, + volatile orcdec_state_s::values& vals, int val_scale, int numvals, int col_scale, @@ -1113,11 +1129,10 @@ static __device__ int Decode_Decimals(orc_bytestream_s *bs, // blockDim {block_size,1,1} template __global__ void __launch_bounds__(block_size) - gpuDecodeNullsAndStringDictionaries(ColumnDesc *chunks, - DictionaryEntry *global_dictionary, + gpuDecodeNullsAndStringDictionaries(ColumnDesc* chunks, + DictionaryEntry* global_dictionary, uint32_t num_columns, uint32_t num_stripes, - size_t max_num_rows, size_t first_row) { __shared__ __align__(16) orcdec_state_s state_g; @@ -1128,15 +1143,17 @@ __global__ void __launch_bounds__(block_size) typename block_reduce::TempStorage bk_storage; } temp_storage; - orcdec_state_s *const s = &state_g; - bool is_nulldec = (blockIdx.y >= num_stripes); - uint32_t column = blockIdx.x; - uint32_t stripe = (is_nulldec) ? blockIdx.y - num_stripes : blockIdx.y; - uint32_t chunk_id = stripe * num_columns + column; + orcdec_state_s* const s = &state_g; + const bool is_nulldec = (blockIdx.y >= num_stripes); + const uint32_t column = blockIdx.x; + const uint32_t stripe = (is_nulldec) ? 
blockIdx.y - num_stripes : blockIdx.y; + const uint32_t chunk_id = stripe * num_columns + column; int t = threadIdx.x; if (t == 0) s->chunk = chunks[chunk_id]; __syncthreads(); + const size_t max_num_rows = s->chunk.column_num_rows; + if (is_nulldec) { uint32_t null_count = 0; // Decode NULLs @@ -1176,7 +1193,7 @@ __global__ void __launch_bounds__(block_size) int64_t dst_pos = max(dst_row, (int64_t)0); uint32_t startbit = -static_cast(min(dst_row, (int64_t)0)); uint32_t nbits = nrows - min(startbit, nrows); - uint32_t *valid = s->chunk.valid_map_base + (dst_pos >> 5); + uint32_t* valid = s->chunk.valid_map_base + (dst_pos >> 5); uint32_t bitpos = static_cast(dst_pos) & 0x1f; if ((size_t)(dst_pos + nbits) > max_num_rows) { nbits = static_cast(max_num_rows - min((size_t)dst_pos, max_num_rows)); @@ -1251,7 +1268,7 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); while (s->top.dict.dict_len > 0) { uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; - volatile uint32_t *vals = s->vals.u32; + volatile uint32_t* vals = s->vals.u32; bytestream_fill(&s->bs, t); __syncthreads(); if (is_rlev1(s->chunk.encoding_kind)) { @@ -1293,10 +1310,10 @@ __global__ void __launch_bounds__(block_size) * @param[in] temp_storage shared memory storage to perform block reduce */ template -static __device__ void DecodeRowPositions(orcdec_state_s *s, +static __device__ void DecodeRowPositions(orcdec_state_s* s, size_t first_row, int t, - Storage &temp_storage) + Storage& temp_storage) { using block_reduce = cub::BlockReduce; @@ -1322,10 +1339,9 @@ static __device__ void DecodeRowPositions(orcdec_state_s *s, uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); uint32_t r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); uint32_t valid = (t < nrows && r < rmax) - ? (((const uint8_t *)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 + ? 
(((const uint8_t*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 : 0; - volatile uint16_t *row_ofs_plus1 = - (volatile uint16_t *)&s->u.rowdec.row[s->u.rowdec.nz_count]; + volatile uint16_t* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; uint32_t nz_pos, row_plus1, nz_count = s->u.rowdec.nz_count, last_row; if (t < nrows) { row_ofs_plus1[t] = valid; } lengths_to_positions(row_ofs_plus1, nrows, t); @@ -1372,54 +1388,62 @@ static const __device__ __constant__ uint32_t kTimestampNanoScale[8] = { * @param[in] global_dictionary Global dictionary device array * @param[in] tz_table Timezone translation table * @param[in] row_groups Optional row index data - * @param[in] max_num_rows Maximum number of rows to load * @param[in] first_row Crop all rows below first_row - * @param[in] num_chunks Number of column chunks (num_columns * num_stripes) - * @param[in] num_rowgroups Number of row groups in row index data * @param[in] rowidx_stride Row index stride + * @param[in] level nesting level being processed */ // blockDim {block_size,1,1} template __global__ void __launch_bounds__(block_size) - gpuDecodeOrcColumnData(ColumnDesc const *chunks, - DictionaryEntry *global_dictionary, + gpuDecodeOrcColumnData(ColumnDesc* chunks, + DictionaryEntry* global_dictionary, timezone_table_view tz_table, - const RowGroup *row_groups, - size_t max_num_rows, + device_2dspan row_groups, size_t first_row, - uint32_t num_columns, - uint32_t num_rowgroups, - uint32_t rowidx_stride) + uint32_t rowidx_stride, + size_t level) { __shared__ __align__(16) orcdec_state_s state_g; - __shared__ typename cub::BlockReduce::TempStorage temp_storage; + using block_reduce = cub::BlockReduce; + __shared__ union { + typename cub::BlockReduce::TempStorage blk_uint32; + typename cub::BlockReduce::TempStorage blk_uint64; + } temp_storage; - orcdec_state_s *const s = &state_g; + orcdec_state_s* const s = &state_g; uint32_t chunk_id; - int t = threadIdx.x; + int t = threadIdx.x; + auto num_rowgroups = row_groups.size().first; if (num_rowgroups > 0) { - if (t == 0) s->top.data.index = row_groups[blockIdx.y * num_columns + blockIdx.x]; + if (t == 0) { s->top.data.index = row_groups[blockIdx.y][blockIdx.x]; } __syncthreads(); chunk_id = s->top.data.index.chunk_id; } else { chunk_id = blockIdx.x; } - if (t == 0) s->chunk = chunks[chunk_id]; - - __syncthreads(); if (t == 0) { + s->chunk = chunks[chunk_id]; + s->num_child_rows = 0; + } + __syncthreads(); + // Struct doesn't have any data in itself, so skip + const bool is_valid = s->chunk.type_kind != STRUCT; + const size_t max_num_rows = s->chunk.column_num_rows; + if (t == 0 and is_valid) { // If we have an index, seek to the initial run and update row positions if (num_rowgroups > 0) { uint32_t ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]); uint32_t ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]); - uint32_t rowgroup_rowofs; + uint32_t rowgroup_rowofs = + (level == 0) ? 
(blockIdx.y - min(s->chunk.rowgroup_id, blockIdx.y)) * rowidx_stride + : s->top.data.index.start_row; + ; s->chunk.streams[CI_DATA] += ofs0; s->chunk.strm_len[CI_DATA] -= ofs0; s->chunk.streams[CI_DATA2] += ofs1; s->chunk.strm_len[CI_DATA2] -= ofs1; - rowgroup_rowofs = min((blockIdx.y - min(s->chunk.rowgroup_id, blockIdx.y)) * rowidx_stride, - s->chunk.num_rows); + rowgroup_rowofs = min(rowgroup_rowofs, s->chunk.num_rows); s->chunk.start_row += rowgroup_rowofs; s->chunk.num_rows -= rowgroup_rowofs; } @@ -1433,7 +1457,8 @@ __global__ void __launch_bounds__(block_size) s->top.data.end_row = static_cast(first_row + max_num_rows); } if (num_rowgroups > 0) { - s->top.data.end_row = min(s->top.data.end_row, s->chunk.start_row + rowidx_stride); + s->top.data.end_row = + min(s->top.data.end_row, s->chunk.start_row + s->top.data.index.num_rows); } if (!is_dictionary(s->chunk.encoding_kind)) { s->chunk.dictionary_start = 0; } @@ -1443,7 +1468,9 @@ __global__ void __launch_bounds__(block_size) bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); } __syncthreads(); - while (s->top.data.cur_row < s->top.data.end_row) { + + while (is_valid && (s->top.data.cur_row < s->top.data.end_row)) { + uint32_t list_child_elements = 0; bytestream_fill(&s->bs, t); bytestream_fill(&s->bs2, t); __syncthreads(); @@ -1467,7 +1494,7 @@ __global__ void __launch_bounds__(block_size) uint32_t vals_skipped = 0; if (s->is_string || s->chunk.type_kind == TIMESTAMP) { // For these data types, we have a secondary unsigned 32-bit data stream - orc_bytestream_s *bs = (is_dictionary(s->chunk.encoding_kind)) ? &s->bs : &s->bs2; + orc_bytestream_s* bs = (is_dictionary(s->chunk.encoding_kind)) ? &s->bs : &s->bs2; uint32_t ofs = 0; if (s->chunk.type_kind == TIMESTAMP) { // Restore buffered secondary stream values, if any @@ -1485,9 +1512,11 @@ __global__ void __launch_bounds__(block_size) numvals = ofs + Integer_RLEv1(bs, &s->u.rlev1, &s->vals.u32[ofs], numvals - ofs, t); } else { if (s->chunk.type_kind == TIMESTAMP) - numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u64[ofs], numvals - ofs, t); + numvals = + ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u64[ofs], numvals - ofs, t, ofs > 0); else - numvals = ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u32[ofs], numvals - ofs, t); + numvals = + ofs + Integer_RLEv2(bs, &s->u.rlev2, &s->vals.u32[ofs], numvals - ofs, t, ofs > 0); } __syncthreads(); if (numvals <= ofs && t >= ofs && t < s->top.data.max_vals) { s->vals.u32[t] = 0; } @@ -1533,8 +1562,9 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); // Account for skipped values if (num_rowgroups > 0 && !s->is_string) { - uint32_t run_pos = (s->chunk.type_kind == DECIMAL) ? s->top.data.index.run_pos[CI_DATA2] - : s->top.data.index.run_pos[CI_DATA]; + uint32_t run_pos = (s->chunk.type_kind == DECIMAL || s->chunk.type_kind == LIST) + ? s->top.data.index.run_pos[CI_DATA2] + : s->top.data.index.run_pos[CI_DATA]; numvals = min(numvals + run_pos, (s->chunk.type_kind == BOOLEAN) ? 
blockDim.x * 2 : blockDim.x); } @@ -1547,6 +1577,13 @@ __global__ void __launch_bounds__(block_size) numvals = Integer_RLEv2(&s->bs, &s->u.rlev2, s->vals.i32, numvals, t); } __syncthreads(); + } else if (s->chunk.type_kind == LIST) { + if (is_rlev1(s->chunk.encoding_kind)) { + numvals = Integer_RLEv1(&s->bs2, &s->u.rlev1, s->vals.u64, numvals, t); + } else { + numvals = Integer_RLEv2(&s->bs2, &s->u.rlev2, s->vals.u64, numvals, t); + } + __syncthreads(); } else if (s->chunk.type_kind == BYTE) { numvals = Byte_RLE(&s->bs, &s->u.rle8, s->vals.u8, numvals, t); __syncthreads(); @@ -1583,7 +1620,7 @@ __global__ void __launch_bounds__(block_size) } else if (s->chunk.type_kind == LONG || s->chunk.type_kind == TIMESTAMP || s->chunk.type_kind == DECIMAL) { - orc_bytestream_s *bs = (s->chunk.type_kind == DECIMAL) ? &s->bs2 : &s->bs; + orc_bytestream_s* bs = (s->chunk.type_kind == DECIMAL) ? &s->bs2 : &s->bs; if (is_rlev1(s->chunk.encoding_kind)) { numvals = Integer_RLEv1(bs, &s->u.rlev1, s->vals.i64, numvals, t); } else { @@ -1629,12 +1666,16 @@ __global__ void __launch_bounds__(block_size) } else { vals_skipped = 0; if (num_rowgroups > 0) { - uint32_t run_pos = s->top.data.index.run_pos[CI_DATA]; + uint32_t run_pos = (s->chunk.type_kind == LIST) ? s->top.data.index.run_pos[CI_DATA2] + : s->top.data.index.run_pos[CI_DATA]; if (run_pos) { vals_skipped = min(numvals, run_pos); numvals -= vals_skipped; __syncthreads(); - if (t == 0) { s->top.data.index.run_pos[CI_DATA] = 0; } + if (t == 0) { + (s->chunk.type_kind == LIST) ? s->top.data.index.run_pos[CI_DATA2] = 0 + : s->top.data.index.run_pos[CI_DATA] = 0; + } } } } @@ -1647,56 +1688,66 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); // Use the valid bits to compute non-null row positions until we get a full batch of values to // decode - DecodeRowPositions(s, first_row, t, temp_storage); + DecodeRowPositions(s, first_row, t, temp_storage.blk_uint32); if (!s->top.data.nrows && !s->u.rowdec.nz_count && !vals_skipped) { // This is a bug (could happen with bitstream errors with a bad run that would produce more // values than the number of remaining rows) return; } + // Store decoded values to output if (t < min(min(s->top.data.max_vals, s->u.rowdec.nz_count), s->top.data.nrows) && s->u.rowdec.row[t] != 0 && s->top.data.cur_row + s->u.rowdec.row[t] - 1 < s->top.data.end_row) { size_t row = s->top.data.cur_row + s->u.rowdec.row[t] - 1 - first_row; if (row < max_num_rows) { - void *data_out = s->chunk.column_data_base; + void* data_out = s->chunk.column_data_base; switch (s->chunk.type_kind) { case FLOAT: - case INT: static_cast(data_out)[row] = s->vals.u32[t + vals_skipped]; break; + case INT: static_cast(data_out)[row] = s->vals.u32[t + vals_skipped]; break; case DOUBLE: case LONG: case DECIMAL: - static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; + static_cast(data_out)[row] = s->vals.u64[t + vals_skipped]; break; + case LIST: { + // Since the offsets column in cudf is `size_type`, + // If the limit exceeds then value will be 0, which is Fail. 
+            cudf_assert(
+              (s->vals.u64[t + vals_skipped] <= std::numeric_limits<size_type>::max()) and
+              "Number of elements is more than what size_type can handle");
+            list_child_elements = s->vals.u64[t + vals_skipped];
+            static_cast<uint32_t*>(data_out)[row] = list_child_elements;
+          } break;
           case SHORT:
-            static_cast<int16_t *>(data_out)[row] =
+            static_cast<int16_t*>(data_out)[row] =
               static_cast<int16_t>(s->vals.u32[t + vals_skipped]);
             break;
-          case BYTE: static_cast<uint8_t *>(data_out)[row] = s->vals.u8[t + vals_skipped]; break;
+          case BYTE: static_cast<uint8_t*>(data_out)[row] = s->vals.u8[t + vals_skipped]; break;
           case BOOLEAN:
-            static_cast<uint8_t *>(data_out)[row] =
+            static_cast<uint8_t*>(data_out)[row] =
               (s->vals.u8[(t + vals_skipped) >> 3] >> ((~(t + vals_skipped)) & 7)) & 1;
             break;
           case DATE:
             if (s->chunk.dtype_len == 8) {
               // Convert from days to milliseconds by multiplying by 24*3600*1000
-              static_cast<int64_t *>(data_out)[row] =
+              static_cast<int64_t*>(data_out)[row] =
                 86400000ll * (int64_t)s->vals.i32[t + vals_skipped];
             } else {
-              static_cast<uint32_t *>(data_out)[row] = s->vals.u32[t + vals_skipped];
+              static_cast<uint32_t*>(data_out)[row] = s->vals.u32[t + vals_skipped];
             }
             break;
           case STRING:
           case BINARY:
           case VARCHAR:
           case CHAR: {
-            string_index_pair *strdesc = &static_cast<string_index_pair *>(data_out)[row];
-            void const *ptr            = nullptr;
+            string_index_pair* strdesc = &static_cast<string_index_pair*>(data_out)[row];
+            void const* ptr            = nullptr;
             uint32_t count             = 0;
             if (is_dictionary(s->chunk.encoding_kind)) {
               auto const dict_idx = s->vals.u32[t + vals_skipped];
               if (dict_idx < s->chunk.dict_len) {
-                auto const &g_entry = global_dictionary[s->chunk.dictionary_start + dict_idx];
+                auto const& g_entry = global_dictionary[s->chunk.dictionary_start + dict_idx];
 
                 ptr   = s->chunk.streams[CI_DICTIONARY] + g_entry.pos;
                 count = g_entry.len;
@@ -1710,7 +1761,7 @@ __global__ void __launch_bounds__(block_size)
                 count = secondary_val;
               }
             }
-            strdesc->first  = static_cast<char const *>(ptr);
+            strdesc->first  = static_cast<char const*>(ptr);
             strdesc->second = count;
             break;
           }
@@ -1723,17 +1774,21 @@ __global__ void __launch_bounds__(block_size)
             }
             if (seconds < 0 && nanos != 0) { seconds -= 1; }
             if (s->chunk.ts_clock_rate)
-              static_cast<int64_t *>(data_out)[row] =
+              static_cast<int64_t*>(data_out)[row] =
                 seconds * s->chunk.ts_clock_rate +
                 (nanos + (499999999 / s->chunk.ts_clock_rate)) /
                   (1000000000 / s->chunk.ts_clock_rate);  // Output to desired clock rate
             else
-              static_cast<int64_t *>(data_out)[row] = seconds * 1000000000 + nanos;
+              static_cast<int64_t*>(data_out)[row] = seconds * 1000000000 + nanos;
             break;
           }
         }
       }
     }
+    // Aggregate num of elements for the chunk
+    if (s->chunk.type_kind == LIST) {
+      list_child_elements = block_reduce(temp_storage.blk_uint64).Sum(list_child_elements);
+    }
     __syncthreads();
     // Buffer secondary stream values
     if (s->chunk.type_kind == TIMESTAMP) {
@@ -1748,12 +1803,19 @@ __global__ void __launch_bounds__(block_size)
     __syncthreads();
     if (t == 0) {
       s->top.data.cur_row += s->top.data.nrows;
+      if (s->chunk.type_kind == LIST) { s->num_child_rows += list_child_elements; }
       if (s->is_string && !is_dictionary(s->chunk.encoding_kind) && s->top.data.max_vals > 0) {
         s->chunk.dictionary_start += s->vals.u32[s->top.data.max_vals - 1];
       }
     }
     __syncthreads();
   }
+  if (t == 0 and s->chunk.type_kind == LIST) {
+    if (num_rowgroups > 0) {
+      row_groups[blockIdx.y][blockIdx.x].num_child_rows = s->num_child_rows;
+    }
+    atomicAdd(&chunks[chunk_id].num_child_rows, s->num_child_rows);
+  }
 }
 
 /**
@@ -1763,22 +1825,20 @@ __global__ void __launch_bounds__(block_size)
  * @param[in] global_dictionary Global dictionary device array
  * @param[in] num_columns Number of columns
  * @param[in] num_stripes Number of stripes
- * @param[in] max_rows Maximum number
of rows to load * @param[in] first_row Crop all rows below first_row * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ -void __host__ DecodeNullsAndStringDictionaries(ColumnDesc *chunks, - DictionaryEntry *global_dictionary, +void __host__ DecodeNullsAndStringDictionaries(ColumnDesc* chunks, + DictionaryEntry* global_dictionary, uint32_t num_columns, uint32_t num_stripes, - size_t max_num_rows, size_t first_row, rmm::cuda_stream_view stream) { dim3 dim_block(block_size, 1); dim3 dim_grid(num_columns, num_stripes * 2); // 1024 threads per chunk gpuDecodeNullsAndStringDictionaries<<>>( - chunks, global_dictionary, num_columns, num_stripes, max_num_rows, first_row); + chunks, global_dictionary, num_columns, num_stripes, first_row); } /** @@ -1788,39 +1848,32 @@ void __host__ DecodeNullsAndStringDictionaries(ColumnDesc *chunks, * @param[in] global_dictionary Global dictionary device array * @param[in] num_columns Number of columns * @param[in] num_stripes Number of stripes - * @param[in] max_rows Maximum number of rows to load * @param[in] first_row Crop all rows below first_row * @param[in] tz_table Timezone translation table - * @param[in] row_groups Optional row index data + * @param[in] row_groups Optional row index data [row_group][column] * @param[in] num_rowgroups Number of row groups in row index data * @param[in] rowidx_stride Row index stride + * @param[in] level nesting level being processed * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ -void __host__ DecodeOrcColumnData(ColumnDesc const *chunks, - DictionaryEntry *global_dictionary, +void __host__ DecodeOrcColumnData(ColumnDesc* chunks, + DictionaryEntry* global_dictionary, + device_2dspan row_groups, uint32_t num_columns, uint32_t num_stripes, - size_t max_num_rows, size_t first_row, timezone_table_view tz_table, - const RowGroup *row_groups, uint32_t num_rowgroups, uint32_t rowidx_stride, + size_t level, rmm::cuda_stream_view stream) { uint32_t num_chunks = num_columns * num_stripes; dim3 dim_block(block_size, 1); // 1024 threads per chunk dim3 dim_grid((num_rowgroups > 0) ? num_columns : num_chunks, (num_rowgroups > 0) ? 
num_rowgroups : 1); - gpuDecodeOrcColumnData<<>>(chunks, - global_dictionary, - tz_table, - row_groups, - max_num_rows, - first_row, - num_columns, - num_rowgroups, - rowidx_stride); + gpuDecodeOrcColumnData<<>>( + chunks, global_dictionary, tz_table, row_groups, first_row, rowidx_stride, level); } } // namespace gpu diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index b469d7215b4..e007c49e61c 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -57,7 +57,7 @@ struct intrle_enc_state_s { struct strdata_enc_state_s { uint32_t char_count; uint32_t lengths_red[(512 / 32)]; - const char *str_data[512]; + const char* str_data[512]; }; struct orcenc_state_s { @@ -115,9 +115,9 @@ static inline __device__ uint32_t CountLeadingBytes64(uint64_t v) { return __clz /** * @brief Raw data output * - * @param[in] cid stream type (strm_pos[cid] will be updated and output stored at - *streams[cid]+strm_pos[cid]) - * @param[in] inmask input buffer position mask for circular buffers + * @tparam cid stream type (strm_pos[cid] will be updated and output stored at + * streams[cid]+strm_pos[cid]) + * @tparam inmask input buffer position mask for circular buffers * @param[in] s encoder state * @param[in] inbuf base input buffer * @param[in] inpos position in input buffer @@ -126,9 +126,9 @@ static inline __device__ uint32_t CountLeadingBytes64(uint64_t v) { return __clz */ template static __device__ void StoreBytes( - orcenc_state_s *s, const uint8_t *inbuf, uint32_t inpos, uint32_t count, int t) + orcenc_state_s* s, const uint8_t* inbuf, uint32_t inpos, uint32_t count, int t) { - uint8_t *dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; + uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; while (count > 0) { uint32_t n = min(count, 512); if (t < n) { dst[t] = inbuf[(inpos + t) & inmask]; } @@ -143,12 +143,12 @@ static __device__ void StoreBytes( /** * @brief ByteRLE encoder * - * @param[in] cid stream type (strm_pos[cid] will be updated and output stored at - *streams[cid]+strm_pos[cid]) + * @tparam cid stream type (strm_pos[cid] will be updated and output stored at + * streams[cid]+strm_pos[cid]) + * @tparam inmask input buffer position mask for circular buffers * @param[in] s encoder state * @param[in] inbuf base input buffer * @param[in] inpos position in input buffer - * @param[in] inmask input buffer position mask for circular buffers * @param[in] numvals max number of values to encode * @param[in] flush encode all remaining values if nonzero * @param[in] t thread id @@ -157,9 +157,9 @@ static __device__ void StoreBytes( */ template static __device__ uint32_t ByteRLE( - orcenc_state_s *s, const uint8_t *inbuf, uint32_t inpos, uint32_t numvals, uint32_t flush, int t) + orcenc_state_s* s, const uint8_t* inbuf, uint32_t inpos, uint32_t numvals, uint32_t flush, int t) { - uint8_t *dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; + uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; uint32_t out_cnt = 0; while (numvals > 0) { @@ -272,7 +272,7 @@ static const __device__ __constant__ uint8_t kByteLengthToRLEv2_W[9] = { /** * @brief Encode a varint value, return the number of bytes written */ -static inline __device__ uint32_t StoreVarint(uint8_t *dst, uint64_t v) +static inline __device__ uint32_t StoreVarint(uint8_t* dst, uint64_t v) { uint32_t bytecnt = 0; for (;;) { @@ -289,7 +289,7 @@ static inline __device__ uint32_t StoreVarint(uint8_t *dst, uint64_t v) } template -static inline __device__ void StoreBytesBigEndian(uint8_t *dst, T v, 
uint32_t w) +static inline __device__ void StoreBytesBigEndian(uint8_t* dst, T v, uint32_t w) { for (uint32_t i = 0, b = w * 8; i < w; ++i) { b -= 8; @@ -299,7 +299,7 @@ static inline __device__ void StoreBytesBigEndian(uint8_t *dst, T v, uint32_t w) // Combine and store bits for symbol widths less than 8 static inline __device__ void StoreBitsBigEndian( - uint8_t *dst, uint32_t v, uint32_t w, int num_vals, int t) + uint8_t* dst, uint32_t v, uint32_t w, int num_vals, int t) { if (t <= (num_vals | 0x1f)) { uint32_t mask; @@ -324,12 +324,12 @@ static inline __device__ void StoreBitsBigEndian( /** * @brief Integer RLEv2 encoder * - * @param[in] cid stream type (strm_pos[cid] will be updated and output stored at - *streams[cid]+strm_pos[cid]) + * @tparam cid stream type (strm_pos[cid] will be updated and output stored at + * streams[cid]+strm_pos[cid]) + * @tparam inmask input buffer position mask for circular buffers * @param[in] s encoder state * @param[in] inbuf base input buffer * @param[in] inpos position in input buffer - * @param[in] inmask input buffer position mask for circular buffers * @param[in] numvals max number of values to encode * @param[in] flush encode all remaining values if nonzero * @param[in] t thread id @@ -343,16 +343,16 @@ template -static __device__ uint32_t IntegerRLE(orcenc_state_s *s, - const T *inbuf, +static __device__ uint32_t IntegerRLE(orcenc_state_s* s, + const T* inbuf, uint32_t inpos, uint32_t numvals, uint32_t flush, int t, - Storage &temp_storage) + Storage& temp_storage) { using block_reduce = cub::BlockReduce; - uint8_t *dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; + uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; uint32_t out_cnt = 0; __shared__ volatile uint64_t block_vmin; @@ -473,7 +473,7 @@ static __device__ uint32_t IntegerRLE(orcenc_state_s *s, uint32_t bw, pw = 1, pll, pgw = 1, bv_scale = (is_signed) ? 0 : 1; vmax = (is_signed) ? ((vmin < 0) ? -vmin : vmin) * 2 : vmin; bw = (sizeof(T) > 4) ? 
(8 - min(CountLeadingBytes64(vmax << bv_scale), 7)) - : (4 - min(CountLeadingBytes32(vmax << bv_scale), 3)); + : (4 - min(CountLeadingBytes32(vmax << bv_scale), 3)); if (zero_pll_war) { // Insert a dummy zero patch pll = 1; @@ -560,8 +560,8 @@ static __device__ uint32_t IntegerRLE(orcenc_state_s *s, * @param[in] len(t) string length (per thread) * @param[in] t thread id */ -static __device__ void StoreStringData(uint8_t *dst, - strdata_enc_state_s *strenc, +static __device__ void StoreStringData(uint8_t* dst, + strdata_enc_state_s* strenc, uint32_t len, int t) { @@ -601,7 +601,7 @@ static __device__ void StoreStringData(uint8_t *dst, * @param[in] t thread id */ template -inline __device__ void lengths_to_positions(volatile T *vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -619,7 +619,7 @@ static const __device__ __constant__ int32_t kTimeScale[10] = { * @brief Encode column data * * @param[in] chunks encoder chunks device array [column][rowgroup] - * @param[in, out] chunks cunk streams device array [column][rowgroup] + * @param[in, out] streams chunk streams device array [column][rowgroup] */ // blockDim {512,1,1} template @@ -635,7 +635,7 @@ __global__ void __launch_bounds__(block_size) typename cub::BlockReduce::TempStorage u64; } temp_storage; - orcenc_state_s *const s = &state_g; + orcenc_state_s* const s = &state_g; uint32_t col_id = blockIdx.x; uint32_t group_id = blockIdx.y; int t = threadIdx.x; @@ -913,7 +913,7 @@ __global__ void __launch_bounds__(block_size) streams[col_id][group_id].lengths[t] = s->strm_pos[t]; if (!s->stream.data_ptrs[t]) { streams[col_id][group_id].data_ptrs[t] = - static_cast(const_cast(s->chunk.leaf_column->head())) + + static_cast(const_cast(s->chunk.leaf_column->head())) + (s->chunk.leaf_column->offset() + s->chunk.start_row) * s->chunk.dtype_len; } } @@ -929,14 +929,14 @@ __global__ void __launch_bounds__(block_size) // blockDim {512,1,1} template __global__ void __launch_bounds__(block_size) - gpuEncodeStringDictionaries(StripeDictionary *stripes, + gpuEncodeStringDictionaries(StripeDictionary* stripes, device_2dspan chunks, device_2dspan streams) { __shared__ __align__(16) orcenc_state_s state_g; __shared__ typename cub::BlockReduce::TempStorage temp_storage; - orcenc_state_s *const s = &state_g; + orcenc_state_s* const s = &state_g; uint32_t stripe_id = blockIdx.x; uint32_t cid = (blockIdx.y) ? CI_DICTIONARY : CI_DATA2; int t = threadIdx.x; @@ -953,7 +953,7 @@ __global__ void __launch_bounds__(block_size) s->nrows = s->u.dict_stripe.num_strings; s->cur_row = 0; } - column_device_view *string_column = s->u.dict_stripe.leaf_column; + column_device_view* string_column = s->u.dict_stripe.leaf_column; auto const dict_data = s->u.dict_stripe.dict_data; __syncthreads(); if (s->chunk.encoding_kind != DICTIONARY_V2) { @@ -965,7 +965,7 @@ __global__ void __launch_bounds__(block_size) uint32_t string_idx = (t < numvals) ? 
dict_data[s->cur_row + t] : 0; if (cid == CI_DICTIONARY) { // Encoding string contents - const char *ptr = 0; + const char* ptr = 0; uint32_t count = 0; if (t < numvals) { auto string_val = string_column->element(string_idx); @@ -1026,7 +1026,7 @@ __global__ void __launch_bounds__(1024) { __shared__ __align__(16) StripeStream ss; __shared__ __align__(16) encoder_chunk_streams strm0; - __shared__ uint8_t *volatile ck_curptr_g; + __shared__ uint8_t* volatile ck_curptr_g; __shared__ uint32_t volatile ck_curlen_g; auto const stripe_id = blockIdx.x; @@ -1041,7 +1041,7 @@ __global__ void __launch_bounds__(1024) auto const cid = ss.stream_type; auto dst_ptr = strm0.data_ptrs[cid] + strm0.lengths[cid]; for (auto group = ss.first_chunk_id + 1; group < ss.first_chunk_id + ss.num_chunks; ++group) { - uint8_t *src_ptr; + uint8_t* src_ptr; uint32_t len; if (t == 0) { src_ptr = streams[ss.column_id][group].data_ptrs[cid]; @@ -1080,13 +1080,13 @@ __global__ void __launch_bounds__(1024) __global__ void __launch_bounds__(256) gpuInitCompressionBlocks(device_2dspan strm_desc, device_2dspan streams, // const? - gpu_inflate_input_s *comp_in, - gpu_inflate_status_s *comp_out, - uint8_t *compressed_bfr, + gpu_inflate_input_s* comp_in, + gpu_inflate_status_s* comp_out, + uint8_t* compressed_bfr, uint32_t comp_blk_size) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t *volatile uncomp_base_g; + __shared__ uint8_t* volatile uncomp_base_g; auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; @@ -1103,8 +1103,8 @@ __global__ void __launch_bounds__(256) dst = compressed_bfr + ss.bfr_offset; num_blocks = (ss.stream_size > 0) ? (ss.stream_size - 1) / comp_blk_size + 1 : 1; for (uint32_t b = t; b < num_blocks; b += 256) { - gpu_inflate_input_s *blk_in = &comp_in[ss.first_block + b]; - gpu_inflate_status_s *blk_out = &comp_out[ss.first_block + b]; + gpu_inflate_input_s* blk_in = &comp_in[ss.first_block + b]; + gpu_inflate_status_s* blk_out = &comp_out[ss.first_block + b]; uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); blk_in->srcDevice = src + b * comp_blk_size; blk_in->srcSize = blk_size; @@ -1130,21 +1130,21 @@ __global__ void __launch_bounds__(256) // blockDim {1024,1,1} __global__ void __launch_bounds__(1024) gpuCompactCompressedBlocks(device_2dspan strm_desc, - gpu_inflate_input_s *comp_in, - gpu_inflate_status_s *comp_out, - uint8_t *compressed_bfr, + gpu_inflate_input_s* comp_in, + gpu_inflate_status_s* comp_out, + uint8_t* compressed_bfr, uint32_t comp_blk_size) { __shared__ __align__(16) StripeStream ss; - __shared__ const uint8_t *volatile comp_src_g; + __shared__ const uint8_t* volatile comp_src_g; __shared__ uint32_t volatile comp_len_g; auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; uint32_t t = threadIdx.x; uint32_t num_blocks, b, blk_size; - const uint8_t *src; - uint8_t *dst; + const uint8_t* src; + uint8_t* dst; if (t == 0) ss = strm_desc[stripe_id][stream_id]; __syncthreads(); @@ -1154,21 +1154,21 @@ __global__ void __launch_bounds__(1024) b = 0; do { if (t == 0) { - gpu_inflate_input_s *blk_in = &comp_in[ss.first_block + b]; - gpu_inflate_status_s *blk_out = &comp_out[ss.first_block + b]; + gpu_inflate_input_s* blk_in = &comp_in[ss.first_block + b]; + gpu_inflate_status_s* blk_out = &comp_out[ss.first_block + b]; uint32_t src_len = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); uint32_t dst_len = (blk_out->status == 0) ? 
blk_out->bytes_written : src_len; uint32_t blk_size24; if (dst_len >= src_len) { // Copy from uncompressed source - src = static_cast(blk_in->srcDevice); + src = static_cast(blk_in->srcDevice); blk_out->bytes_written = src_len; dst_len = src_len; blk_size24 = dst_len * 2 + 1; } else { // Compressed block - src = static_cast(blk_in->dstDevice); + src = static_cast(blk_in->dstDevice); blk_size24 = dst_len * 2 + 0; } dst[0] = static_cast(blk_size24 >> 0); @@ -1207,7 +1207,7 @@ void EncodeOrcColumnData(device_2dspan chunks, gpuEncodeOrcColumnData<512><<>>(chunks, streams); } -void EncodeStripeDictionaries(StripeDictionary *stripes, +void EncodeStripeDictionaries(StripeDictionary* stripes, device_2dspan chunks, uint32_t num_string_columns, uint32_t num_stripes, @@ -1220,7 +1220,7 @@ void EncodeStripeDictionaries(StripeDictionary *stripes, <<>>(stripes, chunks, enc_streams); } -void set_chunk_columns(const table_device_view &view, +void set_chunk_columns(const table_device_view& view, device_2dspan chunks, rmm::cuda_stream_view stream) { @@ -1239,14 +1239,14 @@ void CompactOrcDataStreams(device_2dspan strm_desc, gpuCompactOrcDataStreams<<>>(strm_desc, enc_streams); } -void CompressOrcDataStreams(uint8_t *compressed_data, +void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - gpu_inflate_input_s *comp_in, - gpu_inflate_status_s *comp_out, + gpu_inflate_input_s* comp_in, + gpu_inflate_status_s* comp_out, rmm::cuda_stream_view stream) { dim3 dim_block_init(256, 1); diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 42cb15a56b7..317b7255718 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -32,11 +32,11 @@ struct compressed_stream_s { // blockDim {128,1,1} extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeData( - CompressedStreamInfo *strm_info, int32_t num_streams, uint32_t block_size, uint32_t log2maxcr) + CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t block_size, uint32_t log2maxcr) { __shared__ compressed_stream_s strm_g[4]; - compressed_stream_s *const s = &strm_g[threadIdx.x / 32]; + compressed_stream_s* const s = &strm_g[threadIdx.x / 32]; int strm_id = blockIdx.x * 4 + (threadIdx.x / 32); int lane_id = threadIdx.x % 32; @@ -45,9 +45,9 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat __syncthreads(); if (strm_id < num_streams) { // Walk through the compressed blocks - const uint8_t *cur = s->info.compressed_data; - const uint8_t *end = cur + s->info.compressed_data_size; - uint8_t *uncompressed = s->info.uncompressed_data; + const uint8_t* cur = s->info.compressed_data; + const uint8_t* end = cur + s->info.compressed_data_size; + uint8_t* uncompressed = s->info.uncompressed_data; size_t max_uncompressed_size = 0; uint32_t num_compressed_blocks = 0; uint32_t num_uncompressed_blocks = 0; @@ -55,7 +55,7 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat uint32_t block_len = shuffle((lane_id == 0) ? 
cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); uint32_t is_uncompressed = block_len & 1; uint32_t uncompressed_size; - gpu_inflate_input_s *init_ctl = nullptr; + gpu_inflate_input_s* init_ctl = nullptr; block_len >>= 1; cur += 3; if (block_len > block_size || cur + block_len > end) { @@ -67,10 +67,9 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat // TBD: For some codecs like snappy, it wouldn't be too difficult to get the actual // uncompressed size and avoid waste due to block size alignment For now, rely on the max // compression ratio to limit waste for the most extreme cases (small single-block streams) - uncompressed_size = - (is_uncompressed) - ? block_len - : (block_len < (block_size >> log2maxcr)) ? block_len << log2maxcr : block_size; + uncompressed_size = (is_uncompressed) ? block_len + : (block_len < (block_size >> log2maxcr)) ? block_len << log2maxcr + : block_size; if (is_uncompressed) { if (uncompressed_size <= 32) { // For short blocks, copy the uncompressed data to output @@ -94,7 +93,7 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat num_compressed_blocks++; } if (!lane_id && init_ctl) { - s->ctl.srcDevice = const_cast(cur); + s->ctl.srcDevice = const_cast(cur); s->ctl.srcSize = block_len; s->ctl.dstDevice = uncompressed + max_uncompressed_size; s->ctl.dstSize = uncompressed_size; @@ -118,11 +117,11 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat // blockDim {128,1,1} extern "C" __global__ void __launch_bounds__(128, 8) - gpuPostDecompressionReassemble(CompressedStreamInfo *strm_info, int32_t num_streams) + gpuPostDecompressionReassemble(CompressedStreamInfo* strm_info, int32_t num_streams) { __shared__ compressed_stream_s strm_g[4]; - compressed_stream_s *const s = &strm_g[threadIdx.x / 32]; + compressed_stream_s* const s = &strm_g[threadIdx.x / 32]; int strm_id = blockIdx.x * 4 + (threadIdx.x / 32); int lane_id = threadIdx.x % 32; @@ -133,12 +132,12 @@ extern "C" __global__ void __launch_bounds__(128, 8) s->info.num_compressed_blocks + s->info.num_uncompressed_blocks > 0 && s->info.max_uncompressed_size > 0) { // Walk through the compressed blocks - const uint8_t *cur = s->info.compressed_data; - const uint8_t *end = cur + s->info.compressed_data_size; - const gpu_inflate_input_s *dec_in = s->info.decctl; - const gpu_inflate_status_s *dec_out = s->info.decstatus; - uint8_t *uncompressed_actual = s->info.uncompressed_data; - uint8_t *uncompressed_estimated = uncompressed_actual; + const uint8_t* cur = s->info.compressed_data; + const uint8_t* end = cur + s->info.compressed_data_size; + const gpu_inflate_input_s* dec_in = s->info.decctl; + const gpu_inflate_status_s* dec_out = s->info.decstatus; + uint8_t* uncompressed_actual = s->info.uncompressed_data; + uint8_t* uncompressed_estimated = uncompressed_actual; uint32_t num_compressed_blocks = 0; uint32_t max_compressed_blocks = s->info.num_compressed_blocks; @@ -159,9 +158,9 @@ extern "C" __global__ void __launch_bounds__(128, 8) break; } uncompressed_size_est = - shuffle((lane_id == 0) ? *(const uint32_t *)&dec_in[num_compressed_blocks].dstSize : 0); + shuffle((lane_id == 0) ? *(const uint32_t*)&dec_in[num_compressed_blocks].dstSize : 0); uncompressed_size_actual = shuffle( - (lane_id == 0) ? *(const uint32_t *)&dec_out[num_compressed_blocks].bytes_written : 0); + (lane_id == 0) ? 
*(const uint32_t*)&dec_out[num_compressed_blocks].bytes_written : 0); } // In practice, this should never happen with a well-behaved writer, as we would expect the // uncompressed size to always be equal to the compression block size except for the last @@ -219,13 +218,13 @@ enum row_entry_state_e { * @param[in] end end of byte stream * @return bytes consumed */ -static uint32_t __device__ ProtobufParseRowIndexEntry(rowindex_state_s *s, - const uint8_t *start, - const uint8_t *end) +static uint32_t __device__ ProtobufParseRowIndexEntry(rowindex_state_s* s, + const uint8_t* start, + const uint8_t* end) { constexpr uint32_t pb_rowindexentry_id = static_cast(PB_TYPE_FIXEDLEN) + 8; - const uint8_t *cur = start; + const uint8_t* cur = start; row_entry_state_e state = NOT_FOUND; uint32_t length = 0, strm_idx_id = s->chunk.skip_count >> 8, idx_id = 1, ci_id = CI_PRESENT, pos_end = 0; @@ -268,9 +267,9 @@ static uint32_t __device__ ProtobufParseRowIndexEntry(rowindex_state_s *s, } break; case STORE_INDEX0: - ci_id = (idx_id == (strm_idx_id & 0xff)) - ? CI_DATA - : (idx_id == ((strm_idx_id >> 8) & 0xff)) ? CI_DATA2 : CI_PRESENT; + ci_id = (idx_id == (strm_idx_id & 0xff)) ? CI_DATA + : (idx_id == ((strm_idx_id >> 8) & 0xff)) ? CI_DATA2 + : CI_PRESENT; idx_id++; if (s->is_compressed) { if (ci_id < CI_PRESENT) s->row_index_entry[0][ci_id] = v; @@ -313,9 +312,9 @@ static uint32_t __device__ ProtobufParseRowIndexEntry(rowindex_state_s *s, * @param[in,out] s row group index state * @param[in] num_rowgroups Number of index entries to read */ -static __device__ void gpuReadRowGroupIndexEntries(rowindex_state_s *s, int num_rowgroups) +static __device__ void gpuReadRowGroupIndexEntries(rowindex_state_s* s, int num_rowgroups) { - const uint8_t *index_data = s->chunk.streams[CI_INDEX]; + const uint8_t* index_data = s->chunk.streams[CI_INDEX]; int index_data_len = s->chunk.strm_len[CI_INDEX]; for (int i = 0; i < num_rowgroups; i++) { s->row_index_entry[0][0] = 0; @@ -347,7 +346,7 @@ static __device__ void gpuReadRowGroupIndexEntries(rowindex_state_s *s, int num_ * @param[in] num_rowgroups Number of index entries * @param[in] t thread id */ -static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s *s, +static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, int ci_id, int num_rowgroups, int t) @@ -356,10 +355,10 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s *s, if (strm_len > 0) { int32_t compressed_offset = (t < num_rowgroups) ? 
s->compressed_offset[t][ci_id] : 0; if (compressed_offset > 0) { - const uint8_t *start = s->strm_info[ci_id].compressed_data; - const uint8_t *cur = start; - const uint8_t *end = cur + s->strm_info[ci_id].compressed_data_size; - gpu_inflate_status_s *decstatus = s->strm_info[ci_id].decstatus; + const uint8_t* start = s->strm_info[ci_id].compressed_data; + const uint8_t* cur = start; + const uint8_t* end = cur + s->strm_info[ci_id].compressed_data_size; + gpu_inflate_status_s* decstatus = s->strm_info[ci_id].decstatus; uint32_t uncomp_offset = 0; for (;;) { uint32_t block_len, is_uncompressed; @@ -392,19 +391,23 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s *s, * @param[in] num_columns Number of columns * @param[in] num_stripes Number of stripes * @param[in] num_rowgroups Number of row groups + * @param[in] rowidx_stride Row index stride + * @param[in] use_base_stride Whether to use base stride obtained from meta or use the computed + * value */ // blockDim {128,1,1} extern "C" __global__ void __launch_bounds__(128, 8) - gpuParseRowGroupIndex(RowGroup *row_groups, - CompressedStreamInfo *strm_info, - ColumnDesc *chunks, + gpuParseRowGroupIndex(RowGroup* row_groups, + CompressedStreamInfo* strm_info, + ColumnDesc* chunks, uint32_t num_columns, uint32_t num_stripes, uint32_t num_rowgroups, - uint32_t rowidx_stride) + uint32_t rowidx_stride, + bool use_base_stride) { __shared__ __align__(16) rowindex_state_s state_g; - rowindex_state_s *const s = &state_g; + rowindex_state_s* const s = &state_g; uint32_t chunk_id = blockIdx.y * num_columns + blockIdx.x; int t = threadIdx.x; @@ -415,11 +418,10 @@ extern "C" __global__ void __launch_bounds__(128, 8) if (s->chunk.strm_len[1] > 0) s->strm_info[1] = strm_info[s->chunk.strm_id[1]]; } - uint32_t rowgroups_in_chunk = - (rowidx_stride > 0) ? (s->chunk.num_rows + rowidx_stride - 1) / rowidx_stride : 1; - s->rowgroup_start = s->chunk.rowgroup_id; - s->rowgroup_end = s->rowgroup_start + rowgroups_in_chunk; - s->is_compressed = (strm_info != NULL); + uint32_t rowgroups_in_chunk = s->chunk.num_rowgroups; + s->rowgroup_start = s->chunk.rowgroup_id; + s->rowgroup_end = s->rowgroup_start + rowgroups_in_chunk; + s->is_compressed = (strm_info != NULL); } __syncthreads(); while (s->rowgroup_start < s->rowgroup_end) { @@ -443,10 +445,19 @@ extern "C" __global__ void __launch_bounds__(128, 8) t4 = t & 3; t32 = t >> 2; for (int i = t32; i < num_rowgroups; i += 32) { + auto const num_rows = + (use_base_stride) ? rowidx_stride + : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows; + auto const start_row = + (use_base_stride) + ? 
rowidx_stride + : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row; for (int j = t4; j < rowgroup_size4; j += 4) { - ((uint32_t *)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] = - ((volatile uint32_t *)&s->rowgroups[i])[j]; + ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] = + ((volatile uint32_t*)&s->rowgroups[i])[j]; } + row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows = num_rows; + row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row = start_row; } __syncthreads(); if (t == 0) { s->rowgroup_start += num_rowgroups; } @@ -454,7 +465,7 @@ extern "C" __global__ void __launch_bounds__(128, 8) } } -void __host__ ParseCompressedStripeData(CompressedStreamInfo *strm_info, +void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t compression_block_size, uint32_t log2maxcr, @@ -466,7 +477,7 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo *strm_info, strm_info, num_streams, compression_block_size, log2maxcr); } -void __host__ PostDecompressionReassemble(CompressedStreamInfo *strm_info, +void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info, int32_t num_streams, rmm::cuda_stream_view stream) { @@ -485,21 +496,31 @@ void __host__ PostDecompressionReassemble(CompressedStreamInfo *strm_info, * @param[in] num_columns Number of columns * @param[in] num_stripes Number of stripes * @param[in] num_rowgroups Number of row groups + * @param[in] rowidx_stride Row index stride + * @param[in] use_base_stride Whether to use base stride obtained from meta or use the computed + * value * @param[in] stream CUDA stream to use, default `rmm::cuda_stream_default` */ -void __host__ ParseRowGroupIndex(RowGroup *row_groups, - CompressedStreamInfo *strm_info, - ColumnDesc *chunks, +void __host__ ParseRowGroupIndex(RowGroup* row_groups, + CompressedStreamInfo* strm_info, + ColumnDesc* chunks, uint32_t num_columns, uint32_t num_stripes, uint32_t num_rowgroups, uint32_t rowidx_stride, + bool use_base_stride, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid(num_columns, num_stripes); // 1 column chunk per block - gpuParseRowGroupIndex<<>>( - row_groups, strm_info, chunks, num_columns, num_stripes, num_rowgroups, rowidx_stride); + gpuParseRowGroupIndex<<>>(row_groups, + strm_info, + chunks, + num_columns, + num_stripes, + num_rowgroups, + rowidx_stride, + use_base_stride); } } // namespace gpu diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index 81ffa954c1a..f5bda3401c0 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -76,7 +76,7 @@ struct timezone_file { { return (is_64bit ? 
                       sizeof(uint64_t) : sizeof(uint32_t)) + sizeof(uint32_t);
   }
 
-  static constexpr auto file_content_size_32(timezone_file_header const &header) noexcept
+  static constexpr auto file_content_size_32(timezone_file_header const& header) noexcept
   {
     return header.timecnt * sizeof(uint32_t) +  // transition times
            header.timecnt * sizeof(uint8_t) +   // transition time index
@@ -100,9 +100,9 @@ struct timezone_file {
     header.charcnt = __builtin_bswap32(header.charcnt);
   }
 
-  void read_header(std::ifstream &input_file, size_t file_size)
+  void read_header(std::ifstream& input_file, size_t file_size)
   {
-    input_file.read(reinterpret_cast<char *>(&header), sizeof(header));
+    input_file.read(reinterpret_cast<char*>(&header), sizeof(header));
     CUDF_EXPECTS(!input_file.fail() && header.magic == tzif_magic,
                  "Error reading time zones file header.");
     header_to_little_endian();
@@ -113,7 +113,7 @@ struct timezone_file {
       // skip the 32-bit content
       input_file.seekg(file_content_size_32(header), std::ios_base::cur);
       // read the 64-bit header
-      input_file.read(reinterpret_cast<char *>(&header), sizeof(header));
+      input_file.read(reinterpret_cast<char*>(&header), sizeof(header));
       header_to_little_endian();
       is_header_from_64bit = true;
     }
@@ -125,7 +125,7 @@ struct timezone_file {
                  "Number of transition times is larger than the file size.");
   }
 
-  timezone_file(std::string const &timezone_name)
+  timezone_file(std::string const& timezone_name)
   {
     using std::ios_base;
@@ -142,23 +142,25 @@ struct timezone_file {
     // Read transition times (convert from 32-bit to 64-bit if necessary)
     transition_times.resize(timecnt());
     if (is_header_from_64bit) {
-      fin.read(reinterpret_cast<char *>(transition_times.data()),
+      fin.read(reinterpret_cast<char*>(transition_times.data()),
               transition_times.size() * sizeof(int64_t));
-      for (auto &tt : transition_times) { tt = __builtin_bswap64(tt); }
+      for (auto& tt : transition_times) {
+        tt = __builtin_bswap64(tt);
+      }
     } else {
       std::vector<int32_t> tt32(timecnt());
-      fin.read(reinterpret_cast<char *>(tt32.data()), tt32.size() * sizeof(int32_t));
+      fin.read(reinterpret_cast<char*>(tt32.data()), tt32.size() * sizeof(int32_t));
       std::transform(
-        tt32.cbegin(), tt32.cend(), std::back_inserter(transition_times), [](auto &tt) {
+        tt32.cbegin(), tt32.cend(), std::back_inserter(transition_times), [](auto& tt) {
           return __builtin_bswap32(tt);
         });
     }
     ttime_idx.resize(timecnt());
-    fin.read(reinterpret_cast<char *>(ttime_idx.data()), timecnt() * sizeof(uint8_t));
+    fin.read(reinterpret_cast<char*>(ttime_idx.data()), timecnt() * sizeof(uint8_t));
 
     // Read time types
     ttype.resize(typecnt());
-    fin.read(reinterpret_cast<char *>(ttype.data()), typecnt() * sizeof(localtime_type_record_s));
+    fin.read(reinterpret_cast<char*>(ttype.data()), typecnt() * sizeof(localtime_type_record_s));
     CUDF_EXPECTS(!fin.fail(), "Failed to read time types from the time zone file.");
     for (uint32_t i = 0; i < typecnt(); i++) {
       ttype[i].utcoff = __builtin_bswap32(ttype[i].utcoff);
@@ -182,7 +184,7 @@ struct timezone_file {
 
 template <class Container>
 class posix_parser {
  public:
-  posix_parser(Container const &tz_string) : cur{tz_string.begin()}, end{tz_string.end()} {}
+  posix_parser(Container const& tz_string) : cur{tz_string.begin()}, end{tz_string.end()} {}
 
   /**
    * @brief Advances the parser past a name from the posix TZ string.
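
As a concrete aside on the TZif handling in the hunks above: every multi-byte field in a zoneinfo file is big-endian, so the reader byte-swaps each value on the host, and when only the 32-bit section is present it widens the transition times to 64 bits. A minimal host-only sketch of that widening step; `widen_transition_times` is an illustrative name rather than the cuDF API, and `__builtin_bswap32` assumes GCC or Clang:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Convert a buffer of big-endian 32-bit TZif transition times to host-order
// int64_t. The cast through int32_t sign-extends, so pre-1902 (negative)
// timestamps survive the widening.
std::vector<int64_t> widen_transition_times(std::vector<uint8_t> const& raw)
{
  std::vector<int64_t> out;
  out.reserve(raw.size() / sizeof(int32_t));
  for (std::size_t pos = 0; pos + sizeof(int32_t) <= raw.size(); pos += sizeof(int32_t)) {
    uint32_t be = 0;
    std::memcpy(&be, raw.data() + pos, sizeof(be));  // alias-safe unaligned load
    out.push_back(static_cast<int32_t>(__builtin_bswap32(be)));
  }
  return out;
}
```

The same pattern covers the header counts (`timecnt`, `typecnt`, `charcnt`) and, with `__builtin_bswap64`, the 64-bit section.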
@@ -340,7 +342,7 @@ static int days_in_month(int month, bool is_leap_year) * * @return transition time in seconds from the beginning of the year */ -static int64_t get_transition_time(dst_transition_s const &trans, int year) +static int64_t get_transition_time(dst_transition_s const& trans, int year) { auto day = trans.day; @@ -365,7 +367,9 @@ static int64_t get_transition_time(dst_transition_s const &trans, int year) day += 7; } // Add months - for (int m = 1; m < month; m++) { day += days_in_month(m, is_leap); } + for (int m = 1; m < month; m++) { + day += days_in_month(m, is_leap); + } } else if (trans.type == 'J') { // Account for 29th of February on leap years day += (day > 31 + 29 && is_leap_year(year)); @@ -374,7 +378,7 @@ static int64_t get_transition_time(dst_transition_s const &trans, int year) return trans.time + day * day_seconds; } -timezone_table build_timezone_transition_table(std::string const &timezone_name, +timezone_table build_timezone_transition_table(std::string const& timezone_name, rmm::cuda_stream_view stream) { if (timezone_name == "UTC" || timezone_name.empty()) { diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index b0231ca9e7d..e5341573418 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -56,8 +56,8 @@ static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; * * @return GMT offset */ -CUDA_HOST_DEVICE_CALLABLE int32_t get_gmt_offset_impl(int64_t const *ttimes, - int32_t const *offsets, +CUDA_HOST_DEVICE_CALLABLE int32_t get_gmt_offset_impl(int64_t const* ttimes, + int32_t const* offsets, size_t count, int64_t ts) { @@ -112,8 +112,8 @@ struct timezone_table { rmm::device_uvector offsets; timezone_table() : ttimes{0, rmm::cuda_stream_default}, offsets{0, rmm::cuda_stream_default} {} timezone_table(int32_t gmt_offset, - rmm::device_uvector &&ttimes, - rmm::device_uvector &&offsets) + rmm::device_uvector&& ttimes, + rmm::device_uvector&& offsets) : gmt_offset{gmt_offset}, ttimes{std::move(ttimes)}, offsets{std::move(offsets)} { } @@ -130,7 +130,7 @@ struct timezone_table { * * @return The transition table for the given timezone */ -timezone_table build_timezone_transition_table(std::string const &timezone_name, +timezone_table build_timezone_transition_table(std::string const& timezone_name, rmm::cuda_stream_view stream); } // namespace io diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 4a2330d479b..0cd3f333ba3 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -138,8 +138,8 @@ class orc_column_view { */ explicit orc_column_view(size_t index, size_t str_id, - column_view const &col, - const table_metadata *metadata, + column_view const& col, + const table_metadata* metadata, rmm::cuda_stream_view stream) : _index(index), _str_id(str_id), @@ -168,7 +168,7 @@ class orc_column_view { /** * @brief Function that associates an existing dictionary chunk allocation */ - void attach_dict_chunk(gpu::DictionaryChunk *host_dict, gpu::DictionaryChunk *dev_dict) + void attach_dict_chunk(gpu::DictionaryChunk* host_dict, gpu::DictionaryChunk* dev_dict) { dict = host_dict; d_dict = dev_dict; @@ -180,14 +180,14 @@ class orc_column_view { } auto device_dict_chunk() const { return d_dict; } - auto const &decimal_offsets() const { return d_decimal_offsets; } - void attach_decimal_offsets(uint32_t *sizes_ptr) { d_decimal_offsets = sizes_ptr; } + auto const& decimal_offsets() const { return d_decimal_offsets; } + void attach_decimal_offsets(uint32_t* sizes_ptr) { 
d_decimal_offsets = sizes_ptr; } /** * @brief Function that associates an existing stripe dictionary allocation */ - void attach_stripe_dict(gpu::StripeDictionary *host_stripe_dict, - gpu::StripeDictionary *dev_stripe_dict) + void attach_stripe_dict(gpu::StripeDictionary* host_stripe_dict, + gpu::StripeDictionary* dev_stripe_dict) { stripe_dict = host_stripe_dict; d_stripe_dict = dev_stripe_dict; @@ -207,7 +207,7 @@ class orc_column_view { auto data_count() const noexcept { return _data_count; } size_t null_count() const noexcept { return _null_count; } bool nullable() const noexcept { return (_nulls != nullptr); } - uint32_t const *nulls() const noexcept { return _nulls; } + uint32_t const* nulls() const noexcept { return _nulls; } auto scale() const noexcept { return _scale; } auto precision() const noexcept { return _precision; } @@ -226,7 +226,7 @@ class orc_column_view { size_t _type_width = 0; size_type _data_count = 0; size_t _null_count = 0; - uint32_t const *_nulls = nullptr; + uint32_t const* _nulls = nullptr; // ORC-related members std::string _name{}; @@ -238,21 +238,21 @@ class orc_column_view { // String dictionary-related members size_t dict_stride = 0; - gpu::DictionaryChunk const *dict = nullptr; - gpu::StripeDictionary const *stripe_dict = nullptr; - gpu::DictionaryChunk *d_dict = nullptr; - gpu::StripeDictionary *d_stripe_dict = nullptr; + gpu::DictionaryChunk const* dict = nullptr; + gpu::StripeDictionary const* stripe_dict = nullptr; + gpu::DictionaryChunk* d_dict = nullptr; + gpu::StripeDictionary* d_stripe_dict = nullptr; // Offsets for encoded decimal elements. Used to enable direct writing of encoded decimal elements // into the output stream. - uint32_t *d_decimal_offsets = nullptr; + uint32_t* d_decimal_offsets = nullptr; }; std::vector writer::impl::gather_stripe_info( host_span columns, size_t num_rowgroups) { auto const is_any_column_string = - std::any_of(columns.begin(), columns.end(), [](auto const &col) { return col.is_string(); }); + std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.is_string(); }); // Apply rows per stripe limit to limit string dictionaries size_t const max_stripe_rows = is_any_column_string ? 
1000000 : 5000000; @@ -260,7 +260,7 @@ std::vector writer::impl::gather_stripe_info( for (size_t rowgroup = 0, stripe_start = 0, stripe_size = 0; rowgroup < num_rowgroups; ++rowgroup) { auto const rowgroup_size = - std::accumulate(columns.begin(), columns.end(), 0ul, [&](size_t total_size, auto const &col) { + std::accumulate(columns.begin(), columns.end(), 0ul, [&](size_t total_size, auto const& col) { if (col.is_string()) { const auto dt = col.host_dict_chunk(rowgroup); return total_size + row_index_stride_ + dt->string_char_count; @@ -285,19 +285,19 @@ std::vector writer::impl::gather_stripe_info( return infos; } -void writer::impl::init_dictionaries(const table_device_view &view, - orc_column_view *columns, - std::vector const &str_col_ids, +void writer::impl::init_dictionaries(const table_device_view& view, + orc_column_view* columns, + std::vector const& str_col_ids, device_span d_str_col_ids, - uint32_t *dict_data, - uint32_t *dict_index, - hostdevice_vector *dict) + uint32_t* dict_data, + uint32_t* dict_index, + hostdevice_vector* dict) { const size_t num_rowgroups = dict->size() / str_col_ids.size(); // Setup per-rowgroup dictionary indexes for each dictionary-aware column for (size_t i = 0; i < str_col_ids.size(); ++i) { - auto &str_column = columns[str_col_ids[i]]; + auto& str_column = columns[str_col_ids[i]]; str_column.set_dict_stride(str_col_ids.size()); str_column.attach_dict_chunk(dict->host_ptr(), dict->device_ptr()); } @@ -314,21 +314,21 @@ void writer::impl::init_dictionaries(const table_device_view &view, dict->device_to_host(stream, true); } -void writer::impl::build_dictionaries(orc_column_view *columns, - std::vector const &str_col_ids, +void writer::impl::build_dictionaries(orc_column_view* columns, + std::vector const& str_col_ids, host_span stripe_bounds, - hostdevice_vector const &dict, - uint32_t *dict_index, - hostdevice_vector &stripe_dict) + hostdevice_vector const& dict, + uint32_t* dict_index, + hostdevice_vector& stripe_dict) { const auto num_rowgroups = dict.size() / str_col_ids.size(); for (size_t col_idx = 0; col_idx < str_col_ids.size(); ++col_idx) { - auto &str_column = columns[str_col_ids[col_idx]]; + auto& str_column = columns[str_col_ids[col_idx]]; str_column.attach_stripe_dict(stripe_dict.host_ptr(), stripe_dict.device_ptr()); - for (auto const &stripe : stripe_bounds) { - auto &sd = stripe_dict[stripe.id * str_col_ids.size() + col_idx]; + for (auto const& stripe : stripe_bounds) { + auto& sd = stripe_dict[stripe.id * str_col_ids.size() + col_idx]; sd.dict_data = str_column.host_dict_chunk(stripe.first)->dict_data; sd.dict_index = dict_index + col_idx * str_column.data_count(); // Indexed by abs row sd.column_id = str_col_ids[col_idx]; @@ -337,7 +337,7 @@ void writer::impl::build_dictionaries(orc_column_view *columns, sd.dict_char_count = 0; sd.num_strings = std::accumulate(stripe.cbegin(), stripe.cend(), 0, [&](auto dt_str_cnt, auto rg_idx) { - const auto &dt = dict[rg_idx * str_col_ids.size() + col_idx]; + const auto& dt = dict[rg_idx * str_col_ids.size() + col_idx]; return dt_str_cnt + dt.num_dict_strings; }); sd.leaf_column = dict[col_idx].leaf_column; @@ -353,13 +353,13 @@ void writer::impl::build_dictionaries(orc_column_view *columns, stripe_bounds.back().cend(), string_column_cost{}, [&](auto cost, auto rg_idx) -> string_column_cost { - const auto &dt = dict[rg_idx * str_col_ids.size() + col_idx]; + const auto& dt = dict[rg_idx * str_col_ids.size() + col_idx]; return {cost.direct + dt.string_char_count, cost.dictionary + dt.dict_char_count 
+ dt.num_dict_strings}; }); // Disable dictionary if it does not reduce the output size if (col_cost.dictionary >= col_cost.direct) { - for (auto const &stripe : stripe_bounds) { + for (auto const& stripe : stripe_bounds) { stripe_dict[stripe.id * str_col_ids.size() + col_idx].dict_data = nullptr; } } @@ -379,19 +379,19 @@ void writer::impl::build_dictionaries(orc_column_view *columns, orc_streams writer::impl::create_streams(host_span columns, host_span stripe_bounds, - std::map const &decimal_column_sizes) + std::map const& decimal_column_sizes) { // 'column 0' row index stream std::vector streams{{ROW_INDEX, 0}}; // TODO: Separate index and data streams? // First n + 1 streams are row index streams streams.reserve(columns.size() + 1); - std::transform(columns.begin(), columns.end(), std::back_inserter(streams), [](auto const &col) { + std::transform(columns.begin(), columns.end(), std::back_inserter(streams), [](auto const& col) { return Stream{ROW_INDEX, col.id()}; }); std::vector ids(columns.size() * gpu::CI_NUM_STREAMS, -1); - for (auto &column : columns) { + for (auto& column : columns) { TypeKind kind = column.orc_kind(); StreamKind data_kind = DATA; StreamKind data2_kind = LENGTH; @@ -454,7 +454,7 @@ orc_streams writer::impl::create_streams(host_span columns, size_t dict_data_size = 0; size_t dict_strings = 0; size_t dict_lengths_div512 = 0; - for (auto const &stripe : stripe_bounds) { + for (auto const& stripe : stripe_bounds) { const auto sd = column.host_stripe_dict(stripe.id); enable_dict = (enable_dict && sd->dict_data != nullptr); if (enable_dict) { @@ -546,13 +546,13 @@ orc_streams::orc_stream_offsets orc_streams::compute_offsets( size_t non_rle_data_size = 0; size_t rle_data_size = 0; for (size_t i = 0; i < streams.size(); ++i) { - const auto &stream = streams[i]; + const auto& stream = streams[i]; auto const is_rle_data = [&]() { // First stream is an index stream, don't check types, etc. 
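
To make the bookkeeping in `compute_offsets` concrete: streams whose bytes are copied straight out of column memory (index streams, direct string characters, dictionary characters) are packed into one region, and RLE-encoded streams into another, with each stream recording its offset within its own bucket. A minimal host-side sketch of that two-bucket layout; `stream_desc` and `compute_bucketed_offsets` are illustrative names, not cuDF types:

```cpp
#include <cstddef>
#include <vector>

struct stream_desc {
  std::size_t size;  // bytes this stream will occupy
  bool is_rle;       // true if the bytes are RLE-encoded rather than copied
};

struct stream_offsets {
  std::vector<std::size_t> offsets;  // each stream's offset within its bucket
  std::size_t non_rle_size = 0;      // total bytes of directly-copied data
  std::size_t rle_size     = 0;      // total bytes of RLE-encoded data
};

stream_offsets compute_bucketed_offsets(std::vector<stream_desc> const& streams)
{
  stream_offsets result;
  result.offsets.reserve(streams.size());
  for (auto const& s : streams) {
    auto& bucket = s.is_rle ? result.rle_size : result.non_rle_size;
    result.offsets.push_back(bucket);  // next free byte in that bucket
    bucket += s.size;
  }
  return result;
}
```

One apparent motivation for keeping the directly-copied bytes contiguous is that they can go to the sink without a host round trip, as `write_data_stream` later in this diff does when `is_device_write_preferred` returns true.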
if (!stream.column_index().has_value()) return true; - auto const &column = columns[stream.column_index().value()]; + auto const& column = columns[stream.column_index().value()]; // Dictionary encoded string column - dictionary characters or // directly encoded string - column characters if (column.orc_kind() == TypeKind::STRING && @@ -581,18 +581,18 @@ orc_streams::orc_stream_offsets orc_streams::compute_offsets( } struct segmented_valid_cnt_input { - bitmask_type const *mask; + bitmask_type const* mask; std::vector indices; }; -encoded_data writer::impl::encode_columns(const table_device_view &view, +encoded_data writer::impl::encode_columns(const table_device_view& view, host_span columns, - std::vector const &str_col_ids, - rmm::device_uvector &&dict_data, - rmm::device_uvector &&dict_index, - encoder_decimal_info &&dec_chunk_sizes, + std::vector const& str_col_ids, + rmm::device_uvector&& dict_data, + rmm::device_uvector&& dict_index, + encoder_decimal_info&& dec_chunk_sizes, host_span stripe_bounds, - orc_streams const &streams) + orc_streams const& streams) { auto const num_columns = columns.size(); auto const num_rowgroups = stripes_size(stripe_bounds); @@ -604,11 +604,11 @@ encoded_data writer::impl::encode_columns(const table_device_view &view, // Initialize column chunks' descriptions std::map validity_check_inputs; - for (auto const &column : columns) { - for (auto const &stripe : stripe_bounds) { + for (auto const& column : columns) { + for (auto const& stripe : stripe_bounds) { for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend(); ++rg_idx_it) { auto const rg_idx = *rg_idx_it; - auto &ck = chunks[column.index()][rg_idx]; + auto& ck = chunks[column.index()][rg_idx]; ck.start_row = (rg_idx * row_index_stride_); ck.num_rows = std::min(row_index_stride_, column.data_count() - ck.start_row); @@ -618,7 +618,7 @@ encoded_data writer::impl::encode_columns(const table_device_view &view, ck.dict_index = (ck.encoding_kind == DICTIONARY_V2) ? 
column.host_stripe_dict(stripe.id)->dict_index : nullptr; - ck.dtype_len = 1; + ck.dtype_len = 1; } else { ck.dtype_len = column.type_width(); } @@ -632,22 +632,22 @@ encoded_data writer::impl::encode_columns(const table_device_view &view, auto validity_check_indices = [&](size_t col_idx) { std::vector indices; - for (auto const &stripe : stripe_bounds) { + for (auto const& stripe : stripe_bounds) { for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend() - 1; ++rg_idx_it) { - auto const &chunk = chunks[col_idx][*rg_idx_it]; + auto const& chunk = chunks[col_idx][*rg_idx_it]; indices.push_back(chunk.start_row); indices.push_back(chunk.start_row + chunk.num_rows); } } return indices; }; - for (auto const &column : columns) { + for (auto const& column : columns) { if (column.orc_kind() == TypeKind::BOOLEAN && column.nullable()) { validity_check_inputs[column.index()] = {column.nulls(), validity_check_indices(column.index())}; } } - for (auto &cnt_in : validity_check_inputs) { + for (auto& cnt_in : validity_check_inputs) { auto const valid_counts = segmented_count_set_bits(cnt_in.second.mask, cnt_in.second.indices); CUDF_EXPECTS( std::none_of(valid_counts.cbegin(), @@ -659,13 +659,13 @@ encoded_data writer::impl::encode_columns(const table_device_view &view, } for (size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto const &column = columns[col_idx]; + auto const& column = columns[col_idx]; auto col_streams = chunk_streams[col_idx]; - for (auto const &stripe : stripe_bounds) { + for (auto const& stripe : stripe_bounds) { for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend(); ++rg_idx_it) { auto const rg_idx = *rg_idx_it; - auto const &ck = chunks[col_idx][rg_idx]; - auto &strm = col_streams[rg_idx]; + auto const& ck = chunks[col_idx][rg_idx]; + auto& strm = col_streams[rg_idx]; for (int strm_type = 0; strm_type < gpu::CI_NUM_STREAMS; ++strm_type) { auto const strm_id = streams.id(col_idx * gpu::CI_NUM_STREAMS + strm_type); @@ -688,7 +688,7 @@ encoded_data writer::impl::encode_columns(const table_device_view &view, if (strm_type == gpu::CI_DATA2 && ck.encoding_kind == DICTIONARY_V2) strm.data_ptrs[strm_type] += stream_offsets.non_rle_data_size; } else { - auto const &strm_up = col_streams[stripe_dict[-dict_stride].start_chunk]; + auto const& strm_up = col_streams[stripe_dict[-dict_stride].start_chunk]; strm.data_ptrs[strm_type] = strm_up.data_ptrs[strm_type] + strm_up.lengths[strm_type]; } @@ -754,19 +754,19 @@ std::vector writer::impl::gather_stripes( size_t num_rows, size_t num_index_streams, host_span stripe_bounds, - hostdevice_2dvector *enc_streams, - hostdevice_2dvector *strm_desc) + hostdevice_2dvector* enc_streams, + hostdevice_2dvector* strm_desc) { std::vector stripes(stripe_bounds.size()); - for (auto const &stripe : stripe_bounds) { + for (auto const& stripe : stripe_bounds) { for (size_t col_idx = 0; col_idx < enc_streams->size().first; col_idx++) { - const auto &strm = (*enc_streams)[col_idx][stripe.first]; + const auto& strm = (*enc_streams)[col_idx][stripe.first]; // Assign stream data of column data stream(s) for (int k = 0; k < gpu::CI_INDEX; k++) { const auto stream_id = strm.ids[k]; if (stream_id != -1) { - auto *ss = &(*strm_desc)[stripe.id][stream_id - num_index_streams]; + auto* ss = &(*strm_desc)[stripe.id][stream_id - num_index_streams]; ss->stream_size = 0; ss->first_chunk_id = stripe.first; ss->num_chunks = stripe.size; @@ -790,7 +790,7 @@ std::vector writer::impl::gather_stripes( } std::vector> writer::impl::gather_statistic_blobs( - 
const table_device_view &table, + const table_device_view& table, host_span columns, host_span stripe_bounds) { @@ -804,8 +804,8 @@ std::vector> writer::impl::gather_statistic_blobs( rmm::device_uvector stat_chunks(num_chunks + num_stat_blobs, stream); rmm::device_uvector stat_groups(num_chunks, stream); - for (auto const &column : columns) { - stats_column_desc *desc = &stat_desc[column.index()]; + for (auto const& column : columns) { + stats_column_desc* desc = &stat_desc[column.index()]; switch (column.orc_kind()) { case TypeKind::BYTE: desc->stats_dtype = dtype_int8; break; case TypeKind::SHORT: desc->stats_dtype = dtype_int16; break; @@ -834,13 +834,13 @@ std::vector> writer::impl::gather_statistic_blobs( } else { desc->ts_scale = 0; } - for (auto const &stripe : stripe_bounds) { + for (auto const& stripe : stripe_bounds) { auto grp = &stat_merge[column.index() * stripe_bounds.size() + stripe.id]; grp->col = stat_desc.device_ptr(column.index()); grp->start_chunk = static_cast(column.index() * num_rowgroups + stripe.first); grp->num_chunks = stripe.size; } - statistics_merge_group *col_stats = + statistics_merge_group* col_stats = &stat_merge[stripe_bounds.size() * columns.size() + column.index()]; col_stats->col = stat_desc.device_ptr(column.index()); col_stats->start_chunk = static_cast(column.index() * stripe_bounds.size()); @@ -888,8 +888,8 @@ std::vector> writer::impl::gather_statistic_blobs( blobs.device_to_host(stream, true); for (size_t i = 0; i < num_stat_blobs; i++) { - const uint8_t *stat_begin = blobs.host_ptr(stat_merge[i].start_chunk); - const uint8_t *stat_end = stat_begin + stat_merge[i].num_chunks; + const uint8_t* stat_begin = blobs.host_ptr(stat_merge[i].start_chunk); + const uint8_t* stat_end = stat_begin + stat_merge[i].num_chunks; stat_blobs[i].assign(stat_begin, stat_end); } @@ -899,13 +899,13 @@ std::vector> writer::impl::gather_statistic_blobs( void writer::impl::write_index_stream(int32_t stripe_id, int32_t stream_id, host_span columns, - stripe_rowgroups const &rowgroups_range, + stripe_rowgroups const& rowgroups_range, host_2dspan enc_streams, host_2dspan strm_desc, host_span comp_out, - StripeInformation *stripe, - orc_streams *streams, - ProtobufWriter *pbw) + StripeInformation* stripe, + orc_streams* streams, + ProtobufWriter* pbw) { row_group_index_info present; row_group_index_info data; @@ -913,13 +913,13 @@ void writer::impl::write_index_stream(int32_t stripe_id, auto kind = TypeKind::STRUCT; auto const column_id = stream_id - 1; - auto find_record = [=, &strm_desc](gpu::encoder_chunk_streams const &stream, + auto find_record = [=, &strm_desc](gpu::encoder_chunk_streams const& stream, gpu::StreamIndexType type) { row_group_index_info record; if (stream.ids[type] > 0) { record.pos = 0; if (compression_kind_ != NONE) { - auto const &ss = strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)]; + auto const& ss = strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)]; record.blk_pos = ss.first_block; record.comp_pos = 0; record.comp_size = ss.stream_size; @@ -927,9 +927,9 @@ void writer::impl::write_index_stream(int32_t stripe_id, } return record; }; - auto scan_record = [=, &comp_out](gpu::encoder_chunk_streams const &stream, + auto scan_record = [=, &comp_out](gpu::encoder_chunk_streams const& stream, gpu::StreamIndexType type, - row_group_index_info &record) { + row_group_index_info& record) { if (record.pos >= 0) { record.pos += stream.lengths[type]; while ((record.pos >= 0) && (record.blk_pos >= 0) && @@ -945,7 +945,7 @@ void 
writer::impl::write_index_stream(int32_t stripe_id, // TBD: Not sure we need an empty index stream for column 0 if (stream_id != 0) { - const auto &strm = enc_streams[column_id][0]; + const auto& strm = enc_streams[column_id][0]; present = find_record(strm, gpu::CI_PRESENT); data = find_record(strm, gpu::CI_DATA); data2 = find_record(strm, gpu::CI_DATA2); @@ -965,7 +965,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, present.comp_pos, present.pos, data.comp_pos, data.pos, data2.comp_pos, data2.pos, kind); if (stream_id != 0) { - const auto &strm = enc_streams[column_id][rowgroup]; + const auto& strm = enc_streams[column_id][rowgroup]; scan_record(strm, gpu::CI_PRESENT, present); scan_record(strm, gpu::CI_DATA, data); scan_record(strm, gpu::CI_DATA2, data2); @@ -983,18 +983,18 @@ void writer::impl::write_index_stream(int32_t stripe_id, stripe->indexLength += buffer_.size(); } -void writer::impl::write_data_stream(gpu::StripeStream const &strm_desc, - gpu::encoder_chunk_streams const &enc_stream, - uint8_t const *compressed_data, - uint8_t *stream_out, - StripeInformation *stripe, - orc_streams *streams) +void writer::impl::write_data_stream(gpu::StripeStream const& strm_desc, + gpu::encoder_chunk_streams const& enc_stream, + uint8_t const* compressed_data, + uint8_t* stream_out, + StripeInformation* stripe, + orc_streams* streams) { const auto length = strm_desc.stream_size; (*streams)[enc_stream.ids[strm_desc.stream_type]].length = length; if (length == 0) { return; } - const auto *stream_in = (compression_kind_ == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] + const auto* stream_in = (compression_kind_ == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] : (compressed_data + strm_desc.bfr_offset); if (out_sink_->is_device_write_preferred(length)) { @@ -1009,7 +1009,7 @@ void writer::impl::write_data_stream(gpu::StripeStream const &strm_desc, stripe->dataLength += length; } -void writer::impl::add_uncompressed_block_headers(std::vector &v) +void writer::impl::add_uncompressed_block_headers(std::vector& v) { if (compression_kind_ != NONE) { size_t uncomp_len = v.size() - 3, pos = 0, block_len; @@ -1030,10 +1030,10 @@ void writer::impl::add_uncompressed_block_headers(std::vector &v) } writer::impl::impl(std::unique_ptr sink, - orc_writer_options const &options, + orc_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.enable_statistics()), out_sink_(std::move(sink)), @@ -1046,10 +1046,10 @@ writer::impl::impl(std::unique_ptr sink, } writer::impl::impl(std::unique_ptr sink, - chunked_orc_writer_options const &options, + chunked_orc_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.enable_statistics()), out_sink_(std::move(sink)), @@ -1073,7 +1073,7 @@ void writer::impl::init_state() out_sink_->host_write(MAGIC, std::strlen(MAGIC)); } -rmm::device_uvector get_string_column_ids(const table_device_view &view, +rmm::device_uvector get_string_column_ids(const table_device_view& view, rmm::cuda_stream_view stream) { rmm::device_uvector string_column_ids(view.num_columns(), stream); @@ -1096,8 +1096,8 @@ rmm::device_uvector get_string_column_ids(const table_device_view &vi 
struct rowgroup_iterator { using difference_type = long; using value_type = int; - using pointer = int *; - using reference = int &; + using pointer = int*; + using reference = int&; using iterator_category = thrust::output_device_iterator_tag; size_type idx; size_type rowgroup_size; @@ -1111,7 +1111,7 @@ struct rowgroup_iterator { { return rowgroup_iterator{idx + i, rowgroup_size}; } - CUDA_HOST_DEVICE_CALLABLE rowgroup_iterator &operator++() + CUDA_HOST_DEVICE_CALLABLE rowgroup_iterator& operator++() { ++idx; return *this; @@ -1120,14 +1120,14 @@ struct rowgroup_iterator { { return (idx + offset) / rowgroup_size; } - CUDA_HOST_DEVICE_CALLABLE bool operator!=(rowgroup_iterator const &other) + CUDA_HOST_DEVICE_CALLABLE bool operator!=(rowgroup_iterator const& other) { return idx != other.idx; } }; // returns host vector of per-rowgroup sizes -encoder_decimal_info decimal_chunk_sizes(table_view const &table, +encoder_decimal_info decimal_chunk_sizes(table_view const& table, host_span orc_columns, size_type rowgroup_size, host_span stripes, @@ -1138,21 +1138,21 @@ encoder_decimal_info decimal_chunk_sizes(table_view const &table, auto const d_table = table_device_view::create(table, stream); // Compute per-element offsets (within each row group) on the device for (size_t col_idx = 0; col_idx < orc_columns.size(); ++col_idx) { - auto &orc_col = orc_columns[col_idx]; + auto& orc_col = orc_columns[col_idx]; if (orc_col.orc_kind() == DECIMAL) { - auto const &col = table.column(col_idx); - auto ¤t_sizes = + auto const& col = table.column(col_idx); + auto& current_sizes = elem_sizes.insert({col_idx, rmm::device_uvector(col.size(), stream)}) .first->second; thrust::tabulate(rmm::exec_policy(stream), current_sizes.begin(), current_sizes.end(), [table = *d_table, col_idx] __device__(auto idx) { - auto const &col = table.column(col_idx); + auto const& col = table.column(col_idx); if (col.is_null(idx)) return 0u; - int64_t const element = (col.type().id() == type_id::DECIMAL32) - ? col.element(idx) - : col.element(idx); + int64_t const element = (col.type().id() == type_id::DECIMAL32) + ? col.element(idx) + : col.element(idx); int64_t const sign = (element < 0) ? 
1 : 0; uint64_t zigzaged_value = ((element ^ -sign) * 2) + sign; @@ -1180,7 +1180,7 @@ encoder_decimal_info decimal_chunk_sizes(table_view const &table, auto const num_rowgroups = stripes_size(stripes); auto d_tmp_rowgroup_sizes = rmm::device_uvector(num_rowgroups, stream); std::map> rg_sizes; - for (auto const &[col_idx, esizes] : elem_sizes) { + for (auto const& [col_idx, esizes] : elem_sizes) { // Copy last elem in each row group - equal to row group size thrust::tabulate( rmm::exec_policy(stream), @@ -1196,13 +1196,13 @@ encoder_decimal_info decimal_chunk_sizes(table_view const &table, } std::map decimal_column_sizes( - std::map> const &chunk_sizes) + std::map> const& chunk_sizes) { std::map column_sizes; std::transform(chunk_sizes.cbegin(), chunk_sizes.cend(), std::inserter(column_sizes, column_sizes.end()), - [](auto const &chunk_size) -> std::pair { + [](auto const& chunk_size) -> std::pair { return { chunk_size.first, std::accumulate(chunk_size.second.cbegin(), chunk_size.second.cend(), 0lu)}; @@ -1210,7 +1210,7 @@ std::map decimal_column_sizes( return column_sizes; } -void writer::impl::write(table_view const &table) +void writer::impl::write(table_view const& table) { CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); auto const num_columns = table.num_columns(); @@ -1231,7 +1231,7 @@ void writer::impl::write(table_view const &table) orc_columns.reserve(num_columns); // Mapping of string columns for quick look-up std::vector str_col_ids; - for (auto const &column : table) { + for (auto const& column : table) { auto const current_id = orc_columns.size(); auto const current_str_id = str_col_ids.size(); @@ -1302,7 +1302,7 @@ void writer::impl::write(table_view const &table) for (size_t stripe_id = 0; stripe_id < stripe_bounds.size(); stripe_id++) { for (size_t i = 0; i < num_data_streams; i++) { // TODO range for (at least) - gpu::StripeStream *ss = &strm_descs[stripe_id][i]; + gpu::StripeStream* ss = &strm_descs[stripe_id][i]; if (!out_sink_->is_device_write_preferred(ss->stream_size)) { all_device_write = false; } size_t stream_size = ss->stream_size; if (compression_kind_ != NONE) { @@ -1323,7 +1323,7 @@ void writer::impl::write(table_view const &table) return pinned_buffer{nullptr, cudaFreeHost}; } else { return pinned_buffer{[](size_t size) { - uint8_t *ptr = nullptr; + uint8_t* ptr = nullptr; CUDA_TRY(cudaMallocHost(&ptr, size)); return ptr; }(max_stream_size), @@ -1337,7 +1337,7 @@ void writer::impl::write(table_view const &table) hostdevice_vector comp_in(num_compressed_blocks, stream); if (compression_kind_ != NONE) { strm_descs.host_to_device(stream); - gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), + gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), num_compressed_blocks, compression_kind_, compression_blocksize_, @@ -1354,8 +1354,8 @@ void writer::impl::write(table_view const &table) // Write stripes for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { - auto const &rowgroup_range = stripe_bounds[stripe_id]; - auto &stripe = stripes[stripe_id]; + auto const& rowgroup_range = stripe_bounds[stripe_id]; + auto& stripe = stripes[stripe_id]; stripe.offset = out_sink_->bytes_written(); @@ -1374,10 +1374,10 @@ void writer::impl::write(table_view const &table) } // Column data consisting one or more separate streams - for (auto const &strm_desc : strm_descs[stripe_id]) { + for (auto const& strm_desc : strm_descs[stripe_id]) { write_data_stream(strm_desc, 
enc_data.streams[strm_desc.column_id][rowgroup_range.first], - static_cast(compressed_data.data()), + static_cast(compressed_data.data()), stream_output.get(), &stripe, &streams); @@ -1450,7 +1450,7 @@ void writer::impl::write(table_view const &table) ff.types[0].kind = STRUCT; ff.types[0].subtypes.resize(num_columns); ff.types[0].fieldNames.resize(num_columns); - for (auto const &column : orc_columns) { + for (auto const& column : orc_columns) { ff.types[column.id()].kind = column.orc_kind(); if (column.orc_kind() == DECIMAL) { ff.types[column.id()].scale = static_cast(column.scale()); @@ -1465,7 +1465,7 @@ void writer::impl::write(table_view const &table) "Mismatch in table structure between multiple calls to write"); CUDF_EXPECTS(std::all_of(orc_columns.cbegin(), orc_columns.cend(), - [&](auto const &col) { + [&](auto const& col) { return ff.types[1 + col.index()].kind == col.orc_kind(); }), "Mismatch in column types between multiple calls to write"); @@ -1517,20 +1517,20 @@ void writer::impl::close() // Forward to implementation writer::writer(std::unique_ptr sink, - orc_writer_options const &options, + orc_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _impl(std::make_unique(std::move(sink), options, mode, stream, mr)) { } // Forward to implementation writer::writer(std::unique_ptr sink, - chunked_orc_writer_options const &options, + chunked_orc_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _impl(std::make_unique(std::move(sink), options, mode, stream, mr)) { } @@ -1539,7 +1539,7 @@ writer::writer(std::unique_ptr sink, writer::~writer() = default; // Forward to implementation -void writer::write(table_view const &table) { _impl->write(table); } +void writer::write(table_view const& table) { _impl->write(table); } // Forward to implementation void writer::close() { _impl->close(); } diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 155c83a88d9..db5cd349198 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -74,7 +74,7 @@ struct encoder_decimal_info { }; /** - * @brief Returns the total number of rowgroups in the list of contigious stripes. + * @brief Returns the total number of rowgroups in the list of contiguous stripes. 
*/ inline auto stripes_size(host_span stripes) { @@ -110,7 +110,7 @@ class orc_streams { orc_stream_offsets compute_offsets(host_span columns, size_t num_rowgroups) const; - operator std::vector const&() const { return streams; } + operator std::vector const &() const { return streams; } private: std::vector streams; diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index a9b8eb0ac6b..dde86af68c8 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -24,7 +24,7 @@ namespace parquet { * @Brief Parquet CompactProtocolWriter class */ -size_t CompactProtocolWriter::write(const FileMetaData &f) +size_t CompactProtocolWriter::write(const FileMetaData& f) { CompactProtocolFieldWriter c(*this); c.field_int(1, f.version); @@ -48,7 +48,7 @@ size_t CompactProtocolWriter::write(const FileMetaData &f) return c.value(); } -size_t CompactProtocolWriter::write(const SchemaElement &s) +size_t CompactProtocolWriter::write(const SchemaElement& s) { CompactProtocolFieldWriter c(*this); if (s.type != UNDEFINED_TYPE) { @@ -69,7 +69,7 @@ size_t CompactProtocolWriter::write(const SchemaElement &s) return c.value(); } -size_t CompactProtocolWriter::write(const RowGroup &r) +size_t CompactProtocolWriter::write(const RowGroup& r) { CompactProtocolFieldWriter c(*this); c.field_struct_list(1, r.columns); @@ -78,7 +78,7 @@ size_t CompactProtocolWriter::write(const RowGroup &r) return c.value(); } -size_t CompactProtocolWriter::write(const KeyValue &k) +size_t CompactProtocolWriter::write(const KeyValue& k) { CompactProtocolFieldWriter c(*this); c.field_string(1, k.key); @@ -86,7 +86,7 @@ size_t CompactProtocolWriter::write(const KeyValue &k) return c.value(); } -size_t CompactProtocolWriter::write(const ColumnChunk &s) +size_t CompactProtocolWriter::write(const ColumnChunk& s) { CompactProtocolFieldWriter c(*this); if (s.file_path.size() != 0) { c.field_string(1, s.file_path); } @@ -103,7 +103,7 @@ size_t CompactProtocolWriter::write(const ColumnChunk &s) return c.value(); } -size_t CompactProtocolWriter::write(const ColumnChunkMetaData &s) +size_t CompactProtocolWriter::write(const ColumnChunkMetaData& s) { CompactProtocolFieldWriter c(*this); c.field_int(1, s.type); @@ -122,9 +122,10 @@ size_t CompactProtocolWriter::write(const ColumnChunkMetaData &s) void CompactProtocolFieldWriter::put_byte(uint8_t v) { writer.m_buf.push_back(v); } -void CompactProtocolFieldWriter::put_byte(const uint8_t *raw, uint32_t len) +void CompactProtocolFieldWriter::put_byte(const uint8_t* raw, uint32_t len) { - for (uint32_t i = 0; i < len; i++) writer.m_buf.push_back(raw[i]); + for (uint32_t i = 0; i < len; i++) + writer.m_buf.push_back(raw[i]); } uint32_t CompactProtocolFieldWriter::put_uint(uint64_t v) @@ -170,17 +171,19 @@ inline void CompactProtocolFieldWriter::field_int(int field, int64_t val) } template -inline void CompactProtocolFieldWriter::field_int_list(int field, const std::vector &val) +inline void CompactProtocolFieldWriter::field_int_list(int field, const std::vector& val) { put_field_header(field, current_field_value, ST_FLD_LIST); put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_I32)); if (val.size() >= 0xf) put_uint(val.size()); - for (auto &v : val) { put_int(static_cast(v)); } + for (auto& v : val) { + put_int(static_cast(v)); + } current_field_value = field; } template -inline void CompactProtocolFieldWriter::field_struct(int field, const T &val) +inline void 
CompactProtocolFieldWriter::field_struct(int field, const T& val) { put_field_header(field, current_field_value, ST_FLD_STRUCT); writer.write(val); @@ -188,12 +191,14 @@ inline void CompactProtocolFieldWriter::field_struct(int field, const T &val) } template -inline void CompactProtocolFieldWriter::field_struct_list(int field, const std::vector &val) +inline void CompactProtocolFieldWriter::field_struct_list(int field, const std::vector& val) { put_field_header(field, current_field_value, ST_FLD_LIST); put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_STRUCT)); if (val.size() >= 0xf) put_uint(val.size()); - for (auto &v : val) { writer.write(v); } + for (auto& v : val) { + writer.write(v); + } current_field_value = field; } @@ -204,7 +209,7 @@ inline size_t CompactProtocolFieldWriter::value() } inline void CompactProtocolFieldWriter::field_struct_blob(int field, - const std::vector &val) + const std::vector& val) { put_field_header(field, current_field_value, ST_FLD_STRUCT); put_byte(val.data(), (uint32_t)val.size()); @@ -212,32 +217,32 @@ inline void CompactProtocolFieldWriter::field_struct_blob(int field, current_field_value = field; } -inline void CompactProtocolFieldWriter::field_string(int field, const std::string &val) +inline void CompactProtocolFieldWriter::field_string(int field, const std::string& val) { put_field_header(field, current_field_value, ST_FLD_BINARY); put_uint(val.size()); // FIXME : replace reinterpret_cast - put_byte(reinterpret_cast(val.data()), (uint32_t)val.size()); + put_byte(reinterpret_cast(val.data()), (uint32_t)val.size()); current_field_value = field; } inline void CompactProtocolFieldWriter::field_string_list(int field, - const std::vector &val) + const std::vector& val) { put_field_header(field, current_field_value, ST_FLD_LIST); put_byte((uint8_t)((std::min(val.size(), (size_t)0xfu) << 4) | ST_FLD_BINARY)); if (val.size() >= 0xf) put_uint(val.size()); - for (auto &v : val) { + for (auto& v : val) { put_uint(v.size()); // FIXME : replace reinterpret_cast - put_byte(reinterpret_cast(v.data()), (uint32_t)v.size()); + put_byte(reinterpret_cast(v.data()), (uint32_t)v.size()); } current_field_value = field; } inline int CompactProtocolFieldWriter::current_field() { return current_field_value; } -inline void CompactProtocolFieldWriter::set_current_field(const int &field) +inline void CompactProtocolFieldWriter::set_current_field(const int& field) { current_field_value = field; } diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 2ce9245490e..633bbdf1e19 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -36,34 +36,34 @@ namespace parquet { */ class CompactProtocolWriter { public: - CompactProtocolWriter(std::vector *output) : m_buf(*output) {} + CompactProtocolWriter(std::vector* output) : m_buf(*output) {} - size_t write(const FileMetaData &); - size_t write(const SchemaElement &); - size_t write(const RowGroup &); - size_t write(const KeyValue &); - size_t write(const ColumnChunk &); - size_t write(const ColumnChunkMetaData &); + size_t write(const FileMetaData&); + size_t write(const SchemaElement&); + size_t write(const RowGroup&); + size_t write(const KeyValue&); + size_t write(const ColumnChunk&); + size_t write(const ColumnChunkMetaData&); protected: - std::vector &m_buf; + std::vector& m_buf; friend class CompactProtocolFieldWriter; }; class CompactProtocolFieldWriter { - CompactProtocolWriter &writer; + 
CompactProtocolWriter& writer; size_t struct_start_pos; int current_field_value; public: - CompactProtocolFieldWriter(CompactProtocolWriter &caller) + CompactProtocolFieldWriter(CompactProtocolWriter& caller) : writer(caller), struct_start_pos(writer.m_buf.size()), current_field_value(0) { } void put_byte(uint8_t v); - void put_byte(const uint8_t *raw, uint32_t len); + void put_byte(const uint8_t* raw, uint32_t len); uint32_t put_uint(uint64_t v); @@ -76,25 +76,25 @@ class CompactProtocolFieldWriter { inline void field_int(int field, int64_t val); template - inline void field_int_list(int field, const std::vector &val); + inline void field_int_list(int field, const std::vector& val); template - inline void field_struct(int field, const T &val); + inline void field_struct(int field, const T& val); template - inline void field_struct_list(int field, const std::vector &val); + inline void field_struct_list(int field, const std::vector& val); inline size_t value(); - inline void field_struct_blob(int field, const std::vector &val); + inline void field_struct_blob(int field, const std::vector& val); - inline void field_string(int field, const std::string &val); + inline void field_string(int field, const std::string& val); - inline void field_string_list(int field, const std::vector &val); + inline void field_string_list(int field, const std::vector& val); inline int current_field(); - inline void set_current_field(const int &field); + inline void set_current_field(const int& field); }; } // namespace parquet diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index dfd9c1384c5..f8158eaa6e9 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -45,10 +45,10 @@ namespace parquet { namespace gpu { struct page_state_s { - const uint8_t *data_start; - const uint8_t *data_end; - const uint8_t *lvl_end; - const uint8_t *dict_base; // ptr to dictionary page data + const uint8_t* data_start; + const uint8_t* data_end; + const uint8_t* lvl_end; + const uint8_t* dict_base; // ptr to dictionary page data int32_t dict_size; // size of dictionary data int32_t first_row; // First row in page to output int32_t num_rows; // Rows in page to decode (including rows to be skipped) @@ -80,7 +80,7 @@ struct page_state_s { int32_t input_leaf_count; // how many leaf values of the input we've processed uint32_t rep[non_zero_buffer_size]; // circular buffer of repetition level values uint32_t def[non_zero_buffer_size]; // circular buffer of definition level values - const uint8_t *lvl_start[NUM_LEVEL_TYPES]; // [def,rep] + const uint8_t* lvl_start[NUM_LEVEL_TYPES]; // [def,rep] int32_t lvl_count[NUM_LEVEL_TYPES]; // how many of each of the streams we've decoded int32_t row_index_lower_bound; // lower bound of row indices we should process }; @@ -100,9 +100,9 @@ struct page_state_s { * * @return The hash value */ -__device__ uint32_t device_str2hash32(const char *key, size_t len, uint32_t seed = 33) +__device__ uint32_t device_str2hash32(const char* key, size_t len, uint32_t seed = 33) { - const uint8_t *p = reinterpret_cast(key); + const uint8_t* p = reinterpret_cast(key); uint32_t h1 = seed, k1; const uint32_t c1 = 0xcc9e2d51; const uint32_t c2 = 0x1b873593; @@ -149,7 +149,7 @@ __device__ uint32_t device_str2hash32(const char *key, size_t len, uint32_t seed * * @return The 32-bit value read */ -inline __device__ uint32_t get_vlq32(const uint8_t *&cur, const uint8_t *end) +inline __device__ uint32_t get_vlq32(const uint8_t*& cur, const uint8_t* end) { uint32_t v 
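The `c1`/`c2` constants in `device_str2hash32` above are the MurmurHash3 32-bit mixing constants. A host-side sketch of that style of hash follows; it assumes the kernel tracks the public MurmurHash3_x86_32 layout, though the tail handling here follows the reference algorithm and need not match the device code byte-for-byte:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Host-side sketch of a MurmurHash3_x86_32-style string hash. The c1/c2
// constants match device_str2hash32; tail and finalizer details follow the
// public MurmurHash3 reference, not necessarily the kernel exactly.
uint32_t str2hash32(char const* key, size_t len, uint32_t seed = 33)
{
  uint32_t const c1 = 0xcc9e2d51u;
  uint32_t const c2 = 0x1b873593u;
  uint32_t h1       = seed;

  for (size_t i = 0; i + 4 <= len; i += 4) {  // body: 4 bytes per round
    uint32_t k1;
    std::memcpy(&k1, key + i, 4);
    k1 *= c1; k1 = (k1 << 15) | (k1 >> 17); k1 *= c2;
    h1 ^= k1; h1 = (h1 << 13) | (h1 >> 19); h1 = h1 * 5 + 0xe6546b64u;
  }

  uint32_t k1 = 0;  // tail: fold in the remaining 0-3 bytes
  for (size_t i = len & ~size_t{3}; i < len; ++i) k1 = (k1 << 8) | (uint8_t)key[i];
  k1 *= c1; k1 = (k1 << 15) | (k1 >> 17); k1 *= c2; h1 ^= k1;

  h1 ^= (uint32_t)len;  // fmix32 finalizer
  h1 ^= h1 >> 16; h1 *= 0x85ebca6bu;
  h1 ^= h1 >> 13; h1 *= 0xc2b2ae35u;
  h1 ^= h1 >> 16;
  return h1;
}

int main() { std::printf("%08x\n", str2hash32("parquet", 7)); }
```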
= *cur++; if (v >= 0x80 && cur < end) { @@ -178,9 +178,9 @@ inline __device__ uint32_t get_vlq32(const uint8_t *&cur, const uint8_t *end) * * @return The length of the section */ -__device__ uint32_t InitLevelSection(page_state_s *s, - const uint8_t *cur, - const uint8_t *end, +__device__ uint32_t InitLevelSection(page_state_s* s, + const uint8_t* cur, + const uint8_t* end, level_type lvl) { int32_t len; @@ -236,10 +236,10 @@ __device__ uint32_t InitLevelSection(page_state_s *s, * @param[in] lvl The level type we are decoding - DEFINITION or REPETITION */ __device__ void gpuDecodeStream( - uint32_t *output, page_state_s *s, int32_t target_count, int t, level_type lvl) + uint32_t* output, page_state_s* s, int32_t target_count, int t, level_type lvl) { - const uint8_t *cur_def = s->lvl_start[lvl]; - const uint8_t *end = s->lvl_end; + const uint8_t* cur_def = s->lvl_start[lvl]; + const uint8_t* end = s->lvl_end; uint32_t level_run = s->initial_rle_run[lvl]; int32_t level_val = s->initial_rle_value[lvl]; int level_bits = s->col.level_bits[lvl]; @@ -253,7 +253,7 @@ __device__ void gpuDecodeStream( // Get a new run symbol from the byte stream int sym_len = 0; if (!t) { - const uint8_t *cur = cur_def; + const uint8_t* cur = cur_def; if (cur < end) { level_run = get_vlq32(cur, end); } if (!(level_run & 1)) { if (cur < end) level_val = cur[0]; @@ -282,7 +282,7 @@ __device__ void gpuDecodeStream( batch_len8 = (batch_len + 7) >> 3; if (t < batch_len) { int bitpos = t * level_bits; - const uint8_t *cur = cur_def + (bitpos >> 3); + const uint8_t* cur = cur_def + (bitpos >> 3); bitpos &= 7; if (cur < end) level_val = cur[0]; cur++; @@ -327,9 +327,9 @@ __device__ void gpuDecodeStream( * * @return The new output position */ -__device__ int gpuDecodeDictionaryIndices(volatile page_state_s *s, int target_pos, int t) +__device__ int gpuDecodeDictionaryIndices(volatile page_state_s* s, int target_pos, int t) { - const uint8_t *end = s->data_end; + const uint8_t* end = s->data_end; int dict_bits = s->dict_bits; int pos = s->dict_pos; @@ -337,7 +337,7 @@ __device__ int gpuDecodeDictionaryIndices(volatile page_state_s *s, int target_p int is_literal, batch_len; if (!t) { uint32_t run = s->dict_run; - const uint8_t *cur = s->data_start; + const uint8_t* cur = s->data_start; if (run <= 1) { run = (cur < end) ? get_vlq32(cur, end) : 0; if (!(run & 1)) { @@ -380,7 +380,7 @@ __device__ int gpuDecodeDictionaryIndices(volatile page_state_s *s, int target_p int dict_idx = s->dict_val; if (is_literal) { int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; - const uint8_t *p = s->data_start + (ofs >> 3); + const uint8_t* p = s->data_start + (ofs >> 3); ofs &= 7; if (p < end) { uint32_t c = 8 - ofs; @@ -413,16 +413,16 @@ __device__ int gpuDecodeDictionaryIndices(volatile page_state_s *s, int target_p * * @return The new output position */ -__device__ int gpuDecodeRleBooleans(volatile page_state_s *s, int target_pos, int t) +__device__ int gpuDecodeRleBooleans(volatile page_state_s* s, int target_pos, int t) { - const uint8_t *end = s->data_end; + const uint8_t* end = s->data_end; int pos = s->dict_pos; while (pos < target_pos) { int is_literal, batch_len; if (!t) { uint32_t run = s->dict_run; - const uint8_t *cur = s->data_start; + const uint8_t* cur = s->data_start; if (run <= 1) { run = (cur < end) ? 
get_vlq32(cur, end) : 0; if (!(run & 1)) { @@ -455,7 +455,7 @@ __device__ int gpuDecodeRleBooleans(volatile page_state_s *s, int target_pos, in int dict_idx; if (is_literal) { int32_t ofs = t - ((batch_len + 7) & ~7); - const uint8_t *p = s->data_start + (ofs >> 3); + const uint8_t* p = s->data_start + (ofs >> 3); dict_idx = (p < end) ? (p[0] >> (ofs & 7u)) & 1 : 0; } else { dict_idx = s->dict_val; @@ -476,12 +476,12 @@ __device__ int gpuDecodeRleBooleans(volatile page_state_s *s, int target_pos, in * * @return The new output position */ -__device__ void gpuInitStringDescriptors(volatile page_state_s *s, int target_pos, int t) +__device__ void gpuInitStringDescriptors(volatile page_state_s* s, int target_pos, int t) { int pos = s->dict_pos; // This step is purely serial if (!t) { - const uint8_t *cur = s->data_start; + const uint8_t* cur = s->data_start; int dict_size = s->dict_size; int k = s->dict_val; @@ -511,9 +511,9 @@ __device__ void gpuInitStringDescriptors(volatile page_state_s *s, int target_po * @param[in] src_pos Source position * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) */ -inline __device__ void gpuOutputString(volatile page_state_s *s, int src_pos, void *dstv) +inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, void* dstv) { - const char *ptr = NULL; + const char* ptr = NULL; size_t len = 0; if (s->dict_base) { @@ -522,8 +522,8 @@ inline __device__ void gpuOutputString(volatile page_state_s *s, int src_pos, vo sizeof(string_index_pair) : 0; if (dict_pos < (uint32_t)s->dict_size) { - const string_index_pair *src = - reinterpret_cast(s->dict_base + dict_pos); + const string_index_pair* src = + reinterpret_cast(s->dict_base + dict_pos); ptr = src->first; len = src->second; } @@ -531,16 +531,16 @@ inline __device__ void gpuOutputString(volatile page_state_s *s, int src_pos, vo // Plain encoding uint32_t dict_pos = s->dict_idx[src_pos & (non_zero_buffer_size - 1)]; if (dict_pos <= (uint32_t)s->dict_size) { - ptr = reinterpret_cast(s->data_start + dict_pos); + ptr = reinterpret_cast(s->data_start + dict_pos); len = s->str_len[src_pos & (non_zero_buffer_size - 1)]; } } if (s->dtype_len == 4) { // Output hash - *static_cast(dstv) = device_str2hash32(ptr, len); + *static_cast(dstv) = device_str2hash32(ptr, len); } else { // Output string descriptor - string_index_pair *dst = static_cast(dstv); + string_index_pair* dst = static_cast(dstv); dst->first = ptr; dst->second = len; } @@ -553,7 +553,7 @@ inline __device__ void gpuOutputString(volatile page_state_s *s, int src_pos, vo * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputBoolean(volatile page_state_s *s, int src_pos, uint8_t *dst) +inline __device__ void gpuOutputBoolean(volatile page_state_s* s, int src_pos, uint8_t* dst) { *dst = s->dict_idx[src_pos & (non_zero_buffer_size - 1)]; } @@ -566,8 +566,8 @@ inline __device__ void gpuOutputBoolean(volatile page_state_s *s, int src_pos, u * @param[in] dict_pos byte position in dictionary * @param[in] dict_size size of dictionary */ -inline __device__ void gpuStoreOutput(uint32_t *dst, - const uint8_t *src8, +inline __device__ void gpuStoreOutput(uint32_t* dst, + const uint8_t* src8, uint32_t dict_pos, uint32_t dict_size) { @@ -576,9 +576,9 @@ inline __device__ void gpuStoreOutput(uint32_t *dst, src8 -= ofs; // align to 32-bit boundary ofs <<= 3; // bytes -> bits if (dict_pos < dict_size) { - bytebuf = *reinterpret_cast(src8 + dict_pos); + bytebuf = 
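`gpuInitStringDescriptors` above walks Parquet's PLAIN-encoded BYTE_ARRAY layout: each value is a 4-byte little-endian length followed by that many bytes, and the walk is inherently serial, which is why only thread 0 performs it. A host sketch of the same walk, producing the `(pointer, length)` descriptors the kernel records:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <utility>
#include <vector>

// Sketch of the PLAIN BYTE_ARRAY walk in gpuInitStringDescriptors:
// every value is <uint32 LE length><length bytes>. The walk is serial
// because each length must be read before the next value can be found.
std::vector<std::pair<char const*, uint32_t>> init_string_descriptors(
  uint8_t const* data, size_t size, int num_values)
{
  std::vector<std::pair<char const*, uint32_t>> out;
  size_t pos = 0;
  for (int i = 0; i < num_values && pos + 4 <= size; ++i) {
    uint32_t len;
    std::memcpy(&len, data + pos, 4);  // assumes a little-endian host
    pos += 4;
    if (pos + len > size) break;       // truncated page: stop early
    out.emplace_back(reinterpret_cast<char const*>(data + pos), len);
    pos += len;
  }
  return out;
}

int main()
{
  uint8_t const page[] = {3, 0, 0, 0, 'f', 'o', 'o', 2, 0, 0, 0, 'h', 'i'};
  for (auto [p, n] : init_string_descriptors(page, sizeof(page), 2))
    std::printf("%.*s\n", (int)n, p);
}
```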
*reinterpret_cast(src8 + dict_pos); if (ofs) { - uint32_t bytebufnext = *reinterpret_cast(src8 + dict_pos + 4); + uint32_t bytebufnext = *reinterpret_cast(src8 + dict_pos + 4); bytebuf = __funnelshift_r(bytebuf, bytebufnext, ofs); } } else { @@ -595,8 +595,8 @@ inline __device__ void gpuStoreOutput(uint32_t *dst, * @param[in] dict_pos byte position in dictionary * @param[in] dict_size size of dictionary */ -inline __device__ void gpuStoreOutput(uint2 *dst, - const uint8_t *src8, +inline __device__ void gpuStoreOutput(uint2* dst, + const uint8_t* src8, uint32_t dict_pos, uint32_t dict_size) { @@ -605,10 +605,10 @@ inline __device__ void gpuStoreOutput(uint2 *dst, src8 -= ofs; // align to 32-bit boundary ofs <<= 3; // bytes -> bits if (dict_pos < dict_size) { - v.x = *reinterpret_cast(src8 + dict_pos + 0); - v.y = *reinterpret_cast(src8 + dict_pos + 4); + v.x = *reinterpret_cast(src8 + dict_pos + 0); + v.y = *reinterpret_cast(src8 + dict_pos + 4); if (ofs) { - uint32_t next = *reinterpret_cast(src8 + dict_pos + 8); + uint32_t next = *reinterpret_cast(src8 + dict_pos + 8); v.x = __funnelshift_r(v.x, v.y, ofs); v.y = __funnelshift_r(v.y, next, ofs); } @@ -625,9 +625,9 @@ inline __device__ void gpuStoreOutput(uint2 *dst, * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s *s, int src_pos, int64_t *dst) +inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src_pos, int64_t* dst) { - const uint8_t *src8; + const uint8_t* src8; uint32_t dict_pos, dict_size = s->dict_size, ofs; int64_t ts; @@ -647,11 +647,11 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s *s, int src if (dict_pos + 4 < dict_size) { uint3 v; int64_t nanos, secs, days; - v.x = *reinterpret_cast(src8 + dict_pos + 0); - v.y = *reinterpret_cast(src8 + dict_pos + 4); - v.z = *reinterpret_cast(src8 + dict_pos + 8); + v.x = *reinterpret_cast(src8 + dict_pos + 0); + v.y = *reinterpret_cast(src8 + dict_pos + 4); + v.z = *reinterpret_cast(src8 + dict_pos + 8); if (ofs) { - uint32_t next = *reinterpret_cast(src8 + dict_pos + 12); + uint32_t next = *reinterpret_cast(src8 + dict_pos + 12); v.x = __funnelshift_r(v.x, v.y, ofs); v.y = __funnelshift_r(v.y, v.z, ofs); v.z = __funnelshift_r(v.z, next, ofs); @@ -681,9 +681,9 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s *s, int src * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s *s, int src_pos, int64_t *dst) +inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s, int src_pos, int64_t* dst) { - const uint8_t *src8; + const uint8_t* src8; uint32_t dict_pos, dict_size = s->dict_size, ofs; int64_t ts; @@ -704,10 +704,10 @@ inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s *s, int src uint2 v; int64_t val; int32_t ts_scale; - v.x = *reinterpret_cast(src8 + dict_pos + 0); - v.y = *reinterpret_cast(src8 + dict_pos + 4); + v.x = *reinterpret_cast(src8 + dict_pos + 0); + v.y = *reinterpret_cast(src8 + dict_pos + 4); if (ofs) { - uint32_t next = *reinterpret_cast(src8 + dict_pos + 8); + uint32_t next = *reinterpret_cast(src8 + dict_pos + 8); v.x = __funnelshift_r(v.x, v.y, ofs); v.y = __funnelshift_r(v.y, next, ofs); } @@ -746,12 +746,12 @@ static const __device__ __constant__ double kPow10[40] = { * @param[in] dst Pointer to row output data * @param[in] dtype Stored data type */ -inline 
__device__ void gpuOutputDecimalAsFloat(volatile page_state_s *s, +inline __device__ void gpuOutputDecimalAsFloat(volatile page_state_s* s, int src_pos, - double *dst, + double* dst, int dtype) { - const uint8_t *dict; + const uint8_t* dict; uint32_t dict_pos, dict_size = s->dict_size, dtype_len_in; int64_t i128_hi, i128_lo; int32_t scale; @@ -823,12 +823,12 @@ inline __device__ void gpuOutputDecimalAsFloat(volatile page_state_s *s, * @param[in] src_pos Source position * @param[in] dst Pointer to row output data */ -inline __device__ void gpuOutputFixedLenByteArrayAsInt64(volatile page_state_s *s, +inline __device__ void gpuOutputFixedLenByteArrayAsInt64(volatile page_state_s* s, int src_pos, - int64_t *dst) + int64_t* dst) { uint32_t const dtype_len_in = s->dtype_len_in; - uint8_t const *data = s->dict_base ? s->dict_base : s->data_start; + uint8_t const* data = s->dict_base ? s->dict_base : s->data_start; uint32_t const pos = (s->dict_base ? ((s->dict_bits > 0) ? s->dict_idx[src_pos & (non_zero_buffer_size - 1)] : 0) : src_pos) * @@ -857,9 +857,9 @@ inline __device__ void gpuOutputFixedLenByteArrayAsInt64(volatile page_state_s * * @param[in] dst Pointer to row output data */ template -inline __device__ void gpuOutputFast(volatile page_state_s *s, int src_pos, T *dst) +inline __device__ void gpuOutputFast(volatile page_state_s* s, int src_pos, T* dst) { - const uint8_t *dict; + const uint8_t* dict; uint32_t dict_pos, dict_size = s->dict_size; if (s->dict_base) { @@ -883,12 +883,12 @@ inline __device__ void gpuOutputFast(volatile page_state_s *s, int src_pos, T *d * @param[in] dst8 Pointer to row output data * @param[in] len Length of element */ -static __device__ void gpuOutputGeneric(volatile page_state_s *s, +static __device__ void gpuOutputGeneric(volatile page_state_s* s, int src_pos, - uint8_t *dst8, + uint8_t* dst8, int len) { - const uint8_t *dict; + const uint8_t* dict; uint32_t dict_pos, dict_size = s->dict_size; if (s->dict_base) { @@ -908,23 +908,23 @@ static __device__ void gpuOutputGeneric(volatile page_state_s *s, } } else { // Copy 4 bytes at a time - const uint8_t *src8 = dict; + const uint8_t* src8 = dict; unsigned int ofs = 3 & reinterpret_cast(src8); src8 -= ofs; // align to 32-bit boundary ofs <<= 3; // bytes -> bits for (unsigned int i = 0; i < len; i += 4) { uint32_t bytebuf; if (dict_pos < dict_size) { - bytebuf = *reinterpret_cast(src8 + dict_pos); + bytebuf = *reinterpret_cast(src8 + dict_pos); if (ofs) { - uint32_t bytebufnext = *reinterpret_cast(src8 + dict_pos + 4); + uint32_t bytebufnext = *reinterpret_cast(src8 + dict_pos + 4); bytebuf = __funnelshift_r(bytebuf, bytebufnext, ofs); } } else { bytebuf = 0; } dict_pos += 4; - *reinterpret_cast(dst8 + i) = bytebuf; + *reinterpret_cast(dst8 + i) = bytebuf; } } } @@ -939,9 +939,9 @@ static __device__ void gpuOutputGeneric(volatile page_state_s *s, * @param[in] min_row crop all rows below min_row * @param[in] num_chunk Number of column chunks */ -static __device__ bool setupLocalPageInfo(page_state_s *const s, - PageInfo *p, - ColumnChunkDesc const *chunks, +static __device__ bool setupLocalPageInfo(page_state_s* const s, + PageInfo* p, + ColumnChunkDesc const* chunks, size_t min_row, size_t num_rows, int32_t num_chunks) @@ -984,8 +984,8 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, // - On page N, the remaining 4/6 values are encoded, but there are no new rows. 
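`gpuStoreOutput` and `gpuOutputGeneric` above avoid unaligned device loads by rounding the source pointer down to a 4-byte boundary, reading whole 32-bit words, and recombining adjacent words with `__funnelshift_r`. A portable host sketch of the same trick (with `memcpy` standing in for the aligned load, and noting that the real kernels guard the extra word read with `dict_size`):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Portable stand-in for CUDA's __funnelshift_r(lo, hi, s): shift the 64-bit
// value {hi:lo} right by s bits (0..31) and return the low 32 bits.
static uint32_t funnelshift_r(uint32_t lo, uint32_t hi, uint32_t s)
{
  return s ? (lo >> s) | (hi << (32 - s)) : lo;
}

// The gpuStoreOutput/gpuOutputGeneric trick: align the pointer down to a
// 4-byte boundary, do only aligned 32-bit loads, then funnel-shift the two
// neighboring words to reassemble the unaligned value.
static uint32_t load_unaligned_u32(uint8_t const* src)
{
  uint32_t ofs        = (uint32_t)(reinterpret_cast<uintptr_t>(src) & 3);
  uint8_t const* base = src - ofs;  // aligned base
  uint32_t lo, hi = 0;
  std::memcpy(&lo, base, 4);        // memcpy stands in for the aligned load
  if (ofs) std::memcpy(&hi, base + 4, 4);  // kernels bound this read by dict_size
  return funnelshift_r(lo, hi, ofs * 8);   // bytes -> bits
}

int main()
{
  alignas(4) uint8_t buf[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};
  std::printf("%08x\n", load_unaligned_u32(buf + 1));  // 0x55443322 on LE hosts
}
```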
// if (s->page.num_input_values > 0 && s->page.num_rows > 0) { if (s->page.num_input_values > 0) { - uint8_t *cur = s->page.page_data; - uint8_t *end = cur + s->page.uncompressed_page_size; + uint8_t* cur = s->page.page_data; + uint8_t* end = cur + s->page.uncompressed_page_size; uint32_t dtype_len_out = s->col.data_type >> 3; s->ts_scale = 0; @@ -1052,7 +1052,7 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, if (s->col.column_data_base != nullptr) { int max_depth = s->col.max_nesting_depth; for (int idx = 0; idx < max_depth; idx++) { - PageNestingInfo *pni = &s->page.nesting[idx]; + PageNestingInfo* pni = &s->page.nesting[idx]; size_t output_offset; // schemas without lists @@ -1064,7 +1064,7 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, output_offset = pni->page_start_value; } - pni->data_out = static_cast(s->col.column_data_base[idx]); + pni->data_out = static_cast(s->col.column_data_base[idx]); if (pni->data_out != nullptr) { // anything below max depth with a valid data pointer must be a list, so the // element size is the size of the offset type. @@ -1094,7 +1094,7 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, // RLE-packed dictionary indices, first byte indicates index length in bits if (((s->col.data_type & 7) == BYTE_ARRAY) && (s->col.str_dict_index)) { // String dictionary: use index - s->dict_base = reinterpret_cast(s->col.str_dict_index); + s->dict_base = reinterpret_cast(s->col.str_dict_index); s->dict_size = s->col.page_info[0].num_input_values * sizeof(string_index_pair); } else { s->dict_base = @@ -1195,7 +1195,7 @@ static __device__ bool setupLocalPageInfo(page_state_s *const s, * @param[in] valid_mask The validity mask to be stored * @param[in] value_count # of bits in the validity mask */ -static __device__ void store_validity(PageNestingInfo *pni, +static __device__ void store_validity(PageNestingInfo* pni, uint32_t valid_mask, int32_t value_count) { @@ -1249,10 +1249,10 @@ static __device__ void store_validity(PageNestingInfo *pni, * @param[in] target_input_value_count The desired # of input level values we want to process * @param[in] t Thread index */ -inline __device__ void get_nesting_bounds(int &start_depth, - int &end_depth, - int &d, - page_state_s *s, +inline __device__ void get_nesting_bounds(int& start_depth, + int& end_depth, + int& d, + page_state_s* s, int input_value_count, int32_t target_input_value_count, int t) @@ -1288,7 +1288,7 @@ inline __device__ void get_nesting_bounds(int &start_depth, * @param[in] t Thread index */ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value_count, - page_state_s *s, + page_state_s* s, int t) { // max nesting depth of the column @@ -1339,7 +1339,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu // walk from 0 to max_depth uint32_t next_thread_value_count, next_warp_value_count; for (int s_idx = 0; s_idx < max_depth; s_idx++) { - PageNestingInfo *pni = &s->page.nesting[s_idx]; + PageNestingInfo* pni = &s->page.nesting[s_idx]; // if we are within the range of nesting levels we should be adding value indices for int const in_nesting_bounds = @@ -1391,7 +1391,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu cudf::size_type const ofs = s->page.nesting[s_idx + 1].value_count + next_thread_value_count + s->page.nesting[s_idx + 1].page_start_value; - (reinterpret_cast(pni->data_out))[idx] = ofs; + (reinterpret_cast(pni->data_out))[idx] = ofs; } } @@ -1455,7 
+1455,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu * @param[in] target_leaf_count Target count of non-null leaf values to generate indices for * @param[in] t Thread index */ -__device__ void gpuDecodeLevels(page_state_s *s, int32_t target_leaf_count, int t) +__device__ void gpuDecodeLevels(page_state_s* s, int32_t target_leaf_count, int t) { bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; @@ -1467,7 +1467,7 @@ __device__ void gpuDecodeLevels(page_state_s *s, int32_t target_leaf_count, int gpuDecodeStream(s->def, s, cur_leaf_count, t, level_type::DEFINITION); __syncwarp(); - // because the rep and def streams are encoded seperately, we cannot request an exact + // because the rep and def streams are encoded separately, we cannot request an exact // # of values to be decoded at once. we can only process the lowest # of decoded rep/def // levels we get. int actual_leaf_count = has_repetition ? min(s->lvl_count[level_type::REPETITION], @@ -1494,7 +1494,7 @@ __device__ void gpuDecodeLevels(page_state_s *s, int32_t target_leaf_count, int * @param[in] bounds_set Whether or not s->row_index_lower_bound, s->first_row and s->num_rows * have been computed for this page (they will only be set in the second/trim pass). */ -static __device__ void gpuUpdatePageSizes(page_state_s *s, +static __device__ void gpuUpdatePageSizes(page_state_s* s, int32_t target_input_value_count, int t, bool bounds_set) @@ -1586,8 +1586,8 @@ static __device__ void gpuUpdatePageSizes(page_state_s *s, */ // blockDim {block_size,1,1} extern "C" __global__ void __launch_bounds__(block_size) - gpuComputePageSizes(PageInfo *pages, - ColumnChunkDesc const *chunks, + gpuComputePageSizes(PageInfo* pages, + ColumnChunkDesc const* chunks, size_t min_row, size_t num_rows, int32_t num_chunks, @@ -1595,10 +1595,10 @@ extern "C" __global__ void __launch_bounds__(block_size) { __shared__ __align__(16) page_state_s state_g; - page_state_s *const s = &state_g; + page_state_s* const s = &state_g; int page_idx = blockIdx.x; int t = threadIdx.x; - PageInfo *pp = &pages[page_idx]; + PageInfo* pp = &pages[page_idx]; if (!setupLocalPageInfo( s, pp, chunks, trim_pass ? min_row : 0, trim_pass ? 
num_rows : INT_MAX, num_chunks)) { @@ -1678,15 +1678,15 @@ extern "C" __global__ void __launch_bounds__(block_size) */ // blockDim {block_size,1,1} extern "C" __global__ void __launch_bounds__(block_size) - gpuDecodePageData(PageInfo *pages, - ColumnChunkDesc const *chunks, + gpuDecodePageData(PageInfo* pages, + ColumnChunkDesc const* chunks, size_t min_row, size_t num_rows, int32_t num_chunks) { __shared__ __align__(16) page_state_s state_g; - page_state_s *const s = &state_g; + page_state_s* const s = &state_g; int page_idx = blockIdx.x; int t = threadIdx.x; int out_thread0; @@ -1732,7 +1732,7 @@ extern "C" __global__ void __launch_bounds__(block_size) } else if ((s->col.data_type & 7) == BYTE_ARRAY) { gpuInitStringDescriptors(s, src_target_pos, t & 0x1f); } - if (t == 32) { *(volatile int32_t *)&s->dict_pos = src_target_pos; } + if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values int dtype = s->col.data_type & 7; @@ -1767,52 +1767,52 @@ extern "C" __global__ void __launch_bounds__(block_size) int leaf_level_index = s->col.max_nesting_depth - 1; uint32_t dtype_len = s->dtype_len; - void *dst = + void* dst = s->page.nesting[leaf_level_index].data_out + static_cast(dst_pos) * dtype_len; if (dtype == BYTE_ARRAY) { gpuOutputString(s, val_src_pos, dst); } else if (dtype == BOOLEAN) { - gpuOutputBoolean(s, val_src_pos, static_cast(dst)); + gpuOutputBoolean(s, val_src_pos, static_cast(dst)); } else if (s->col.converted_type == DECIMAL) { switch (dtype) { - case INT32: gpuOutputFast(s, val_src_pos, static_cast(dst)); break; - case INT64: gpuOutputFast(s, val_src_pos, static_cast(dst)); break; + case INT32: gpuOutputFast(s, val_src_pos, static_cast(dst)); break; + case INT64: gpuOutputFast(s, val_src_pos, static_cast(dst)); break; default: // we currently do not support reading byte arrays larger than DECIMAL64 if (s->dtype_len_in <= 8) { - gpuOutputFixedLenByteArrayAsInt64(s, val_src_pos, static_cast(dst)); + gpuOutputFixedLenByteArrayAsInt64(s, val_src_pos, static_cast(dst)); } else { - gpuOutputDecimalAsFloat(s, val_src_pos, static_cast(dst), dtype); + gpuOutputDecimalAsFloat(s, val_src_pos, static_cast(dst), dtype); } break; } } else if (dtype == INT96) { - gpuOutputInt96Timestamp(s, val_src_pos, static_cast(dst)); + gpuOutputInt96Timestamp(s, val_src_pos, static_cast(dst)); } else if (dtype_len == 8) { if (s->ts_scale) { - gpuOutputInt64Timestamp(s, val_src_pos, static_cast(dst)); + gpuOutputInt64Timestamp(s, val_src_pos, static_cast(dst)); } else { - gpuOutputFast(s, val_src_pos, static_cast(dst)); + gpuOutputFast(s, val_src_pos, static_cast(dst)); } } else if (dtype_len == 4) { - gpuOutputFast(s, val_src_pos, static_cast(dst)); + gpuOutputFast(s, val_src_pos, static_cast(dst)); } else { - gpuOutputGeneric(s, val_src_pos, static_cast(dst), dtype_len); + gpuOutputGeneric(s, val_src_pos, static_cast(dst), dtype_len); } } - if (t == out_thread0) { *(volatile int32_t *)&s->src_pos = target_pos; } + if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; } } __syncthreads(); } } struct chunk_row_output_iter { - PageInfo *p; + PageInfo* p; using value_type = size_type; using difference_type = size_type; - using pointer = size_type *; - using reference = size_type &; + using pointer = size_type*; + using reference = size_type&; using iterator_category = thrust::output_device_iterator_tag; __host__ __device__ chunk_row_output_iter operator+(int i) @@ -1828,16 +1828,16 @@ struct chunk_row_output_iter { }; struct 
start_offset_output_iterator { - PageInfo *pages; - int *page_indices; + PageInfo* pages; + int* page_indices; int cur_index; int src_col_schema; int nesting_depth; int empty = 0; using value_type = size_type; using difference_type = size_type; - using pointer = size_type *; - using reference = size_type &; + using pointer = size_type*; + using reference = size_type&; using iterator_category = thrust::output_device_iterator_tag; __host__ __device__ start_offset_output_iterator operator+(int i) @@ -1854,7 +1854,7 @@ struct start_offset_output_iterator { private: __device__ reference dereference(int index) { - PageInfo const &p = pages[page_indices[index]]; + PageInfo const& p = pages[page_indices[index]]; if (p.src_col_schema != src_col_schema || p.flags & PAGEINFO_FLAGS_DICTIONARY) { return empty; } return p.nesting[nesting_depth].page_start_value; } @@ -1863,14 +1863,14 @@ struct start_offset_output_iterator { /** * @copydoc cudf::io::parquet::gpu::PreprocessColumnData */ -void PreprocessColumnData(hostdevice_vector &pages, - hostdevice_vector const &chunks, - std::vector &input_columns, - std::vector &output_columns, +void PreprocessColumnData(hostdevice_vector& pages, + hostdevice_vector const& chunks, + std::vector& input_columns, + std::vector& output_columns, size_t num_rows, size_t min_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { dim3 dim_block(block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page @@ -1885,9 +1885,9 @@ void PreprocessColumnData(hostdevice_vector &pages, // computes: // PageInfo::chunk_row for all pages auto key_input = thrust::make_transform_iterator( - pages.device_ptr(), [] __device__(PageInfo const &page) { return page.chunk_idx; }); + pages.device_ptr(), [] __device__(PageInfo const& page) { return page.chunk_idx; }); auto page_input = thrust::make_transform_iterator( - pages.device_ptr(), [] __device__(PageInfo const &page) { return page.num_rows; }); + pages.device_ptr(), [] __device__(PageInfo const& page) { return page.num_rows; }); thrust::exclusive_scan_by_key(rmm::exec_policy(stream), key_input, key_input + pages.size(), @@ -1927,7 +1927,7 @@ void PreprocessColumnData(hostdevice_vector &pages, pages.device_ptr(), pages.device_ptr() + pages.size(), page_keys.begin(), - [] __device__(PageInfo const &page) { return page.src_col_schema; }); + [] __device__(PageInfo const& page) { return page.src_col_schema; }); thrust::sequence(rmm::exec_policy(stream), page_index.begin(), page_index.end()); thrust::stable_sort_by_key(rmm::exec_policy(stream), @@ -1939,20 +1939,20 @@ void PreprocessColumnData(hostdevice_vector &pages, // compute output column sizes by examining the pages of the -input- columns for (size_t idx = 0; idx < input_columns.size(); idx++) { - auto const &input_col = input_columns[idx]; + auto const& input_col = input_columns[idx]; auto src_col_schema = input_col.schema_idx; size_t max_depth = input_col.nesting_depth(); - auto *cols = &output_columns; + auto* cols = &output_columns; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto &out_buf = (*cols)[input_col.nesting[l_idx]]; + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; cols = &out_buf.children; // size iterator. 
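The `exclusive_scan_by_key` call in `PreprocessColumnData` above computes `PageInfo::chunk_row`: pages are keyed by `chunk_idx`, values are `num_rows`, and the exclusive scan gives each page the number of rows that precede it within its own chunk. A serial host sketch of the same computation (with a hypothetical `PageLite` in place of `PageInfo`):

```cpp
#include <cstdio>
#include <vector>

struct PageLite {     // hypothetical slimmed-down PageInfo
  int chunk_idx;
  int num_rows;
  int chunk_row = 0;  // output: first row of this page within its chunk
};

// Host sketch of thrust::exclusive_scan_by_key as used above: the running
// sum resets whenever the key (chunk_idx) changes, and each page receives
// the sum of num_rows of the pages before it in the same chunk.
void compute_chunk_rows(std::vector<PageLite>& pages)
{
  int running = 0;
  for (size_t i = 0; i < pages.size(); ++i) {
    if (i == 0 || pages[i].chunk_idx != pages[i - 1].chunk_idx) running = 0;
    pages[i].chunk_row = running;  // exclusive: value before adding self
    running += pages[i].num_rows;
  }
}

int main()
{
  std::vector<PageLite> pages{{0, 100}, {0, 50}, {1, 80}, {1, 20}};
  compute_chunk_rows(pages);
  for (auto const& p : pages)
    std::printf("chunk %d starts at row %d\n", p.chunk_idx, p.chunk_row);
  // chunk 0: 0, 100; chunk 1: 0, 80
}
```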
indexes pages by sorted order auto size_input = thrust::make_transform_iterator( page_index.begin(), [src_col_schema, l_idx, pages = pages.device_ptr()] __device__(int index) { - auto const &page = pages[index]; + auto const& page = pages[index]; if (page.src_col_schema != src_col_schema || page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } @@ -1989,8 +1989,8 @@ void PreprocessColumnData(hostdevice_vector &pages, /** * @copydoc cudf::io::parquet::gpu::DecodePageData */ -void __host__ DecodePageData(hostdevice_vector &pages, - hostdevice_vector const &chunks, +void __host__ DecodePageData(hostdevice_vector& pages, + hostdevice_vector const& chunks, size_t num_rows, size_t min_row, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu index 2d505b99981..0c55828b120 100644 --- a/cpp/src/io/parquet/page_dict.cu +++ b/cpp/src/io/parquet/page_dict.cu @@ -29,8 +29,8 @@ namespace parquet { namespace gpu { struct dict_state_s { uint32_t row_cnt; - PageFragment *cur_fragment; - uint32_t *hashmap; + PageFragment* cur_fragment; + uint32_t* hashmap; uint32_t total_dict_entries; //!< Total number of entries in dictionary uint32_t dictionary_size; //!< Total dictionary size in bytes uint32_t num_dict_entries; //!< Dictionary entries in current fragment to add @@ -52,14 +52,14 @@ inline __device__ uint32_t uint64_hash16(uint64_t v) return uint32_hash16((uint32_t)(v + (v >> 32))); } -inline __device__ uint32_t hash_string(const string_view &val) +inline __device__ uint32_t hash_string(const string_view& val) { - const char *p = val.data(); + const char* p = val.data(); uint32_t len = val.size_bytes(); uint32_t hash = len; if (len > 0) { uint32_t align_p = 3 & reinterpret_cast(p); - const uint32_t *p32 = reinterpret_cast(p - align_p); + const uint32_t* p32 = reinterpret_cast(p - align_p); uint32_t ofs = align_p * 8; uint32_t v; while (len > 4) { @@ -85,8 +85,8 @@ inline __device__ uint32_t hash_string(const string_view &val) * @param[in] frag_start_row row position of current fragment * @param[in] t thread id */ -__device__ void FetchDictionaryFragment(dict_state_s *s, - uint32_t *dict_data, +__device__ void FetchDictionaryFragment(dict_state_s* s, + uint32_t* dict_data, uint32_t frag_start_row, uint32_t t) { @@ -108,12 +108,12 @@ __device__ void FetchDictionaryFragment(dict_state_s *s, /// Generate dictionary indices in ascending row order template -__device__ void GenerateDictionaryIndices(dict_state_s *s, uint32_t t) +__device__ void GenerateDictionaryIndices(dict_state_s* s, uint32_t t) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage temp_storage; - uint32_t *dict_index = s->col.dict_index; - uint32_t *dict_data = s->col.dict_data + s->ck.start_row; + uint32_t* dict_index = s->col.dict_index; + uint32_t* dict_data = s->col.dict_data + s->ck.start_row; uint32_t num_dict_entries = 0; for (uint32_t i = 0; i < s->row_cnt; i += 1024) { @@ -150,13 +150,13 @@ __device__ void GenerateDictionaryIndices(dict_state_s *s, uint32_t t) // blockDim(1024, 1, 1) template __global__ void __launch_bounds__(block_size, 1) - gpuBuildChunkDictionaries(device_span chunks, uint32_t *dev_scratch) + gpuBuildChunkDictionaries(device_span chunks, uint32_t* dev_scratch) { __shared__ __align__(8) dict_state_s state_g; using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage temp_storage; - dict_state_s *const s = &state_g; + dict_state_s* const s = &state_g; uint32_t t = threadIdx.x; uint32_t dtype, dtype_len, 
dtype_len_in; @@ -227,23 +227,19 @@ __global__ void __launch_bounds__(block_size, 1) val = s->col.leaf_column->element(row); hash = uint64_hash16(val); } else { - val = (dtype_len_in == 4) - ? s->col.leaf_column->element(row) - : (dtype_len_in == 2) ? s->col.leaf_column->element(row) - : s->col.leaf_column->element(row); + val = (dtype_len_in == 4) ? s->col.leaf_column->element(row) + : (dtype_len_in == 2) ? s->col.leaf_column->element(row) + : s->col.leaf_column->element(row); hash = uint32_hash16(val); } // Walk the list of rows with the same hash next_addr = &s->hashmap[hash]; while ((next = atomicCAS(next_addr, 0, row + 1)) != 0) { auto const current = next - 1; - uint64_t val2 = (dtype_len_in == 8) - ? s->col.leaf_column->element(current) - : (dtype_len_in == 4) - ? s->col.leaf_column->element(current) - : (dtype_len_in == 2) - ? s->col.leaf_column->element(current) - : s->col.leaf_column->element(current); + uint64_t val2 = (dtype_len_in == 8) ? s->col.leaf_column->element(current) + : (dtype_len_in == 4) ? s->col.leaf_column->element(current) + : (dtype_len_in == 2) ? s->col.leaf_column->element(current) + : s->col.leaf_column->element(current); if (val2 == val) { is_dupe = 1; break; @@ -274,7 +270,9 @@ __global__ void __launch_bounds__(block_size, 1) bool reorder_check = (is_valid && is_dupe && next - 1 > row); if (reorder_check) { next = s->col.dict_index[next - 1]; - while (next & (1u << 31)) { next = s->col.dict_index[next & 0x7fffffff]; } + while (next & (1u << 31)) { + next = s->col.dict_index[next & 0x7fffffff]; + } } if (__syncthreads_or(reorder_check)) { if (reorder_check) { atomicMin(&s->col.dict_index[next], row); } @@ -324,7 +322,7 @@ __global__ void __launch_bounds__(block_size, 1) * @param[in] stream CUDA stream to use, default 0 */ void BuildChunkDictionaries(device_span chunks, - uint32_t *dev_scratch, + uint32_t* dev_scratch, rmm::cuda_stream_view stream) { auto num_chunks = chunks.size(); diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index bf9114949aa..3c62dcf7eea 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -59,8 +59,8 @@ struct frag_init_state_s { }; struct page_enc_state_s { - uint8_t *cur; //!< current output ptr - uint8_t *rle_out; //!< current RLE write ptr + uint8_t* cur; //!< current output ptr + uint8_t* rle_out; //!< current RLE write ptr uint32_t rle_run; //!< current RLE run uint32_t run_val; //!< current RLE run value uint32_t rle_pos; //!< RLE encoder positions @@ -81,9 +81,9 @@ struct page_enc_state_s { /** * @brief Return a 12-bit hash from a byte sequence */ -inline __device__ uint32_t hash_string(const string_view &val) +inline __device__ uint32_t hash_string(const string_view& val) { - char const *ptr = val.data(); + char const* ptr = val.data(); uint32_t len = val.size_bytes(); if (len != 0) { return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1); @@ -130,7 +130,7 @@ __global__ void __launch_bounds__(block_size) typename block_scan::TempStorage scan_storage; } temp_storage; - frag_init_state_s *const s = &state_g; + frag_init_state_s* const s = &state_g; uint32_t t = threadIdx.x; uint32_t start_row, dtype_len, dtype_len_in, dtype; @@ -190,9 +190,11 @@ __global__ void __launch_bounds__(block_size) s->frag.num_values = s->frag.num_rows; } } - dtype = s->col.physical_type; - dtype_len = - (dtype == INT96) ? 12 : (dtype == INT64 || dtype == DOUBLE) ? 8 : (dtype == BOOLEAN) ? 1 : 4; + dtype = s->col.physical_type; + dtype_len = (dtype == INT96) ? 
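The `gpuBuildChunkDictionaries` loop above deduplicates values through a hash table whose slots store `row + 1` (so `0` means empty), chasing collisions with `atomicCAS` and comparing element values to flag duplicates. A single-threaded analogue, with a simple linear probe standing in for the kernel's atomic chains and warp votes:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Single-threaded analogue of the dictionary dedup pass: a 16-bit-hash
// table stores row + 1 (0 marks an empty slot); a row is a duplicate if a
// previously inserted row holds an equal value. The real kernel links
// colliding rows with atomicCAS and resolves ordering with warp ballots.
struct dict_builder {
  std::vector<uint32_t> slots = std::vector<uint32_t>(1 << 16, 0);

  // Returns true if `row` introduces a new dictionary entry.
  bool insert(std::vector<uint32_t> const& values, uint32_t row)
  {
    uint32_t h = (values[row] * 2654435761u) >> 16;  // any 16-bit hash works
    for (;; h = (h + 1) & 0xffffu) {                 // probe until hit or empty
      uint32_t prev = slots[h];
      if (prev == 0) { slots[h] = row + 1; return true; }  // new entry
      if (values[prev - 1] == values[row]) return false;   // duplicate
    }
  }
};

int main()
{
  std::vector<uint32_t> vals{7, 7, 3, 7, 3, 9};
  dict_builder b;
  int uniques = 0;
  for (uint32_t r = 0; r < vals.size(); ++r) uniques += b.insert(vals, r);
  std::printf("dictionary entries: %d\n", uniques);  // 3
}
```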
12 + : (dtype == INT64 || dtype == DOUBLE) ? 8 + : (dtype == BOOLEAN) ? 1 + : 4; if (dtype == INT32) { dtype_len_in = GetDtypeLogicalLen(s->col.leaf_column); } else if (dtype == INT96) { @@ -224,11 +226,10 @@ __global__ void __launch_bounds__(block_size) } else if (dtype_len_in == 8) { hash = uint64_init_hash(s->col.leaf_column->element(val_idx)); } else { - hash = uint32_init_hash((dtype_len_in == 4) - ? s->col.leaf_column->element(val_idx) - : (dtype_len_in == 2) - ? s->col.leaf_column->element(val_idx) - : s->col.leaf_column->element(val_idx)); + hash = + uint32_init_hash((dtype_len_in == 4) ? s->col.leaf_column->element(val_idx) + : (dtype_len_in == 2) ? s->col.leaf_column->element(val_idx) + : s->col.leaf_column->element(val_idx)); } } } else { @@ -246,7 +247,7 @@ __global__ void __launch_bounds__(block_size) } __syncthreads(); if (is_valid && dtype != BOOLEAN) { - uint32_t *dict_index = s->col.dict_index; + uint32_t* dict_index = s->col.dict_index; if (dict_index) { atomicAdd(&s->map.u32[hash >> 1], (hash & 1) ? 1 << 16 : 1); dict_index[start_value_idx + nz_pos] = @@ -283,7 +284,7 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); // Put the indices back in hash order if (s->col.dict_index) { - uint32_t *dict_index = s->col.dict_index + start_row; + uint32_t* dict_index = s->col.dict_index + start_row; uint32_t nnz = s->frag.non_nulls; for (uint32_t i = 0; i < nnz; i += block_size) { uint32_t pos = 0, hash = 0, pos_old, pos_new, sh, colliding_row, val = 0; @@ -393,7 +394,7 @@ __global__ void __launch_bounds__(128) uint32_t frag_id = blockIdx.y * 4 + (threadIdx.x >> 5); uint32_t column_id = blockIdx.x; auto num_fragments_per_column = fragments.size().second; - statistics_group *const g = &group_g[threadIdx.x >> 5]; + statistics_group* const g = &group_g[threadIdx.x >> 5]; if (!lane_id && frag_id < num_fragments_per_column) { g->col = &col_desc[column_id]; g->start_row = fragments[column_id][frag_id].start_value_idx; @@ -408,8 +409,8 @@ __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, device_span pages, device_span col_desc, - statistics_merge_group *page_grstats, - statistics_merge_group *chunk_grstats, + statistics_merge_group* page_grstats, + statistics_merge_group* chunk_grstats, int32_t num_columns) { // TODO: All writing seems to be done by thread 0. Could be replaced by thrust foreach @@ -502,9 +503,9 @@ __global__ void __launch_bounds__(128) fragment_data_size = frag_g.fragment_data_size; } // TODO (dm): this convoluted logic to limit page size needs refactoring - max_page_size = (values_in_page * 2 >= ck_g.num_values) - ? 256 * 1024 - : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024 : 512 * 1024; + max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 + : (values_in_page * 3 >= ck_g.num_values) ? 
384 * 1024 + : 512 * 1024; if (num_rows >= ck_g.num_rows || (values_in_page > 0 && (page_size + fragment_data_size > max_page_size || @@ -632,7 +633,7 @@ static __device__ __constant__ uint32_t kRleRunMask[16] = { /** * @brief Variable-length encode an integer */ -inline __device__ uint8_t *VlqEncode(uint8_t *p, uint32_t v) +inline __device__ uint8_t* VlqEncode(uint8_t* p, uint32_t v) { while (v > 0x7f) { *p++ = (v | 0x80); @@ -646,7 +647,7 @@ inline __device__ uint8_t *VlqEncode(uint8_t *p, uint32_t v) * @brief Pack literal values in output bitstream (1,2,4,8,12 or 16 bits per value) */ inline __device__ void PackLiterals( - uint8_t *dst, uint32_t v, uint32_t count, uint32_t w, uint32_t t) + uint8_t* dst, uint32_t v, uint32_t count, uint32_t w, uint32_t t) { if (w == 1 || w == 2 || w == 4 || w == 8 || w == 12 || w == 16) { if (t <= (count | 0x1f)) { @@ -713,7 +714,7 @@ inline __device__ void PackLiterals( // Copy scratch data to final destination auto available_bytes = (count * w + 7) / 8; - auto scratch_bytes = reinterpret_cast(&scratch[0]); + auto scratch_bytes = reinterpret_cast(&scratch[0]); if (t < available_bytes) { dst[t] = scratch_bytes[t]; } if (t + 128 < available_bytes) { dst[t + 128] = scratch_bytes[t + 128]; } __syncthreads(); @@ -730,7 +731,7 @@ inline __device__ void PackLiterals( * @param[in] t thread id (0..127) */ static __device__ void RleEncode( - page_enc_state_s *s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) + page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) { uint32_t rle_pos = s->rle_pos; uint32_t rle_run = s->rle_run; @@ -759,7 +760,7 @@ static __device__ void RleEncode( if (rle_rpt_count < max_rpt_count || (flush && rle_pos == numvals)) { if (t == 0) { uint32_t const run_val = s->run_val; - uint8_t *dst = VlqEncode(s->rle_out, rle_run); + uint8_t* dst = VlqEncode(s->rle_out, rle_run); *dst++ = run_val; if (nbits > 8) { *dst++ = run_val >> 8; } s->rle_out = dst; @@ -823,7 +824,7 @@ static __device__ void RleEncode( rle_rpt_count = 0; // Defer repeat run } if (lit_div8 != 0) { - uint8_t *dst = s->rle_out + 1 + (rle_run >> 1) * nbits; + uint8_t* dst = s->rle_out + 1 + (rle_run >> 1) * nbits; PackLiterals(dst, (rle_pos + t < numvals) ? 
v0 : 0, lit_div8 * 8, nbits, t); rle_run = (rle_run + lit_div8 * 2) | 1; rle_pos = min(rle_pos + lit_div8 * 8, numvals); @@ -833,7 +834,7 @@ static __device__ void RleEncode( __syncthreads(); // Complete literal run if (!t) { - uint8_t *dst = s->rle_out; + uint8_t* dst = s->rle_out; dst[0] = rle_run; // At most 0x7f dst += 1 + nbits * (rle_run >> 1); s->rle_out = dst; @@ -868,13 +869,13 @@ static __device__ void RleEncode( * @param[in] flush nonzero if last batch in block * @param[in] t thread id (0..127) */ -static __device__ void PlainBoolEncode(page_enc_state_s *s, +static __device__ void PlainBoolEncode(page_enc_state_s* s, uint32_t numvals, uint32_t flush, uint32_t t) { uint32_t rle_pos = s->rle_pos; - uint8_t *dst = s->rle_out; + uint8_t* dst = s->rle_out; while (rle_pos < numvals) { uint32_t pos = rle_pos + t; @@ -935,7 +936,7 @@ __global__ void __launch_bounds__(128, 8) using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage temp_storage; - page_enc_state_s *const s = &state_g; + page_enc_state_s* const s = &state_g; uint32_t t = threadIdx.x; uint32_t dtype, dtype_len_in, dtype_len_out; int32_t dict_bits; @@ -1002,8 +1003,8 @@ __global__ void __launch_bounds__(128, 8) __syncthreads(); } if (t < 32) { - uint8_t *cur = s->cur; - uint8_t *rle_out = s->rle_out; + uint8_t* cur = s->cur; + uint8_t* rle_out = s->rle_out; if (t < 4) { uint32_t rle_bytes = (uint32_t)(rle_out - cur) - 4; cur[t] = rle_bytes >> (t * 8); @@ -1015,7 +1016,7 @@ __global__ void __launch_bounds__(128, 8) } else if (s->page.page_type != PageType::DICTIONARY_PAGE && s->col.num_rep_level_bits() != 0 // This means there ARE repetition levels (has list) ) { - auto encode_levels = [&](uint8_t const *lvl_val_data, uint32_t nbits) { + auto encode_levels = [&](uint8_t const* lvl_val_data, uint32_t nbits) { // For list types, the repetition and definition levels are pre-calculated. We just need to // encode and write them now. if (!t) { @@ -1040,8 +1041,8 @@ __global__ void __launch_bounds__(128, 8) __syncthreads(); } if (t < 32) { - uint8_t *cur = s->cur; - uint8_t *rle_out = s->rle_out; + uint8_t* cur = s->cur; + uint8_t* rle_out = s->rle_out; if (t < 4) { uint32_t rle_bytes = (uint32_t)(rle_out - cur) - 4; cur[t] = rle_bytes >> (t * 8); @@ -1056,9 +1057,11 @@ __global__ void __launch_bounds__(128, 8) } // Encode data values __syncthreads(); - dtype = s->col.physical_type; - dtype_len_out = - (dtype == INT96) ? 12 : (dtype == INT64 || dtype == DOUBLE) ? 8 : (dtype == BOOLEAN) ? 1 : 4; + dtype = s->col.physical_type; + dtype_len_out = (dtype == INT96) ? 12 + : (dtype == INT64 || dtype == DOUBLE) ? 8 + : (dtype == BOOLEAN) ? 1 + : 4; if (dtype == INT32) { dtype_len_in = GetDtypeLogicalLen(s->col.leaf_column); } else if (dtype == INT96) { @@ -1068,7 +1071,7 @@ __global__ void __launch_bounds__(128, 8) } dict_bits = (dtype == BOOLEAN) ? 
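`RleEncode` above emits Parquet's RLE/bit-packing hybrid: a varint header that is even for a repeated run (`count << 1`, followed by the value) and odd for bit-packed literals (`groups_of_8 << 1 | 1`, followed by the packed groups), which is why the literal run length is always adjusted in multiples of 8. A much-simplified host sketch, specialized to an 8-bit value width so "bit packing" degenerates to a byte copy, and using a fixed run-length threshold where the kernel makes a dynamic choice:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// LEB128 varint, as in VlqEncode above.
static void put_vlq(std::vector<uint8_t>& out, uint32_t v)
{
  while (v > 0x7f) { out.push_back(uint8_t(v | 0x80)); v >>= 7; }
  out.push_back(uint8_t(v));
}

// Minimal sketch of the RLE/bit-packing hybrid for an 8-bit width:
// even header = repeated run of (header >> 1) values, odd header =
// (header >> 1) groups of 8 literal values. The real encoder coalesces
// literal groups and picks run boundaries cooperatively across threads.
std::vector<uint8_t> rle_encode8(std::vector<uint8_t> const& in)
{
  std::vector<uint8_t> out;
  size_t i = 0;
  while (i < in.size()) {
    size_t run = 1;
    while (i + run < in.size() && in[i + run] == in[i]) ++run;
    if (run >= 8) {                      // long enough: emit a repeated run
      put_vlq(out, uint32_t(run) << 1);  // even header
      out.push_back(in[i]);
      i += run;
    } else {                             // emit one literal group of 8
      size_t n = std::min<size_t>(8, in.size() - i);
      put_vlq(out, (1u << 1) | 1);       // odd header: 1 group of 8
      for (size_t k = 0; k < 8; ++k)     // zero-pad the tail group
        out.push_back(k < n ? in[i + k] : 0);
      i += n;
    }
  }
  return out;
}

int main()
{
  std::vector<uint8_t> v(20, 5);  // 20 repeated values -> 2-byte run + value
  auto enc = rle_encode8(v);
  std::printf("encoded %zu values into %zu bytes\n", v.size(), enc.size());
}
```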
1 : (s->page.dict_bits_plus1 - 1); if (t == 0) { - uint8_t *dst = s->cur; + uint8_t* dst = s->cur; s->rle_run = 0; s->rle_pos = 0; s->rle_numvals = 0; @@ -1138,7 +1141,7 @@ __global__ void __launch_bounds__(128, 8) __syncthreads(); } else { // Non-dictionary encoding - uint8_t *dst = s->cur; + uint8_t* dst = s->cur; if (is_valid) { len = dtype_len_out; @@ -1250,7 +1253,7 @@ __global__ void __launch_bounds__(128, 8) } } if (t == 0) { - uint8_t *base = s->page.page_data + s->page.max_hdr_size; + uint8_t* base = s->page.page_data + s->page.max_hdr_size; uint32_t actual_data_size = static_cast(s->cur - base); uint32_t compressed_bfr_size = GetMaxCompressedBfrSize(actual_data_size); s->page.max_data_size = actual_data_size; @@ -1298,7 +1301,7 @@ __global__ void __launch_bounds__(128) gpuDecideCompression(device_span 0x7f) { *p++ = v | 0x80; @@ -1339,7 +1342,7 @@ inline __device__ uint8_t *cpw_put_uint32(uint8_t *p, uint32_t v) return p; } -inline __device__ uint8_t *cpw_put_uint64(uint8_t *p, uint64_t v) +inline __device__ uint8_t* cpw_put_uint64(uint8_t* p, uint64_t v) { while (v > 0x7f) { *p++ = v | 0x80; @@ -1349,19 +1352,19 @@ inline __device__ uint8_t *cpw_put_uint64(uint8_t *p, uint64_t v) return p; } -inline __device__ uint8_t *cpw_put_int32(uint8_t *p, int32_t v) +inline __device__ uint8_t* cpw_put_int32(uint8_t* p, int32_t v) { int32_t s = (v < 0); return cpw_put_uint32(p, (v ^ -s) * 2 + s); } -inline __device__ uint8_t *cpw_put_int64(uint8_t *p, int64_t v) +inline __device__ uint8_t* cpw_put_int64(uint8_t* p, int64_t v) { int64_t s = (v < 0); return cpw_put_uint64(p, (v ^ -s) * 2 + s); } -inline __device__ uint8_t *cpw_put_fldh(uint8_t *p, int f, int cur, int t) +inline __device__ uint8_t* cpw_put_fldh(uint8_t* p, int f, int cur, int t) { if (f > cur && f <= cur + 15) { *p++ = ((f - cur) << 4) | t; @@ -1373,11 +1376,11 @@ inline __device__ uint8_t *cpw_put_fldh(uint8_t *p, int f, int cur, int t) } class header_encoder { - uint8_t *current_header_ptr; + uint8_t* current_header_ptr; int current_field_index; public: - inline __device__ header_encoder(uint8_t *header_start) + inline __device__ header_encoder(uint8_t* header_start) : current_header_ptr(header_start), current_field_index(0) { } @@ -1411,7 +1414,7 @@ class header_encoder { current_field_index = field; } - inline __device__ void field_binary(int field, const void *value, uint32_t length) + inline __device__ void field_binary(int field, const void* value, uint32_t length) { current_header_ptr = cpw_put_fldh(current_header_ptr, field, current_field_index, ST_FLD_BINARY); @@ -1421,21 +1424,21 @@ class header_encoder { current_field_index = field; } - inline __device__ void end(uint8_t **header_end, bool termination_flag = true) + inline __device__ void end(uint8_t** header_end, bool termination_flag = true) { if (termination_flag == false) { *current_header_ptr++ = 0; } *header_end = current_header_ptr; } - inline __device__ uint8_t *get_ptr(void) { return current_header_ptr; } + inline __device__ uint8_t* get_ptr(void) { return current_header_ptr; } - inline __device__ void set_ptr(uint8_t *ptr) { current_header_ptr = ptr; } + inline __device__ void set_ptr(uint8_t* ptr) { current_header_ptr = ptr; } }; -__device__ uint8_t *EncodeStatistics(uint8_t *start, - const statistics_chunk *s, +__device__ uint8_t* EncodeStatistics(uint8_t* start, + const statistics_chunk* s, uint8_t dtype, - float *fp_scratch) + float* fp_scratch) { uint8_t *end, dtype_len; switch (dtype) { @@ -1488,7 +1491,7 @@ __global__ void __launch_bounds__(128) 
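`cpw_put_fldh` and `cpw_put_int32` above are Thrift compact-protocol primitives: field ids are delta-encoded against the previous field, with `((delta << 4) | type)` as the short form when the delta fits in 1..15, and signed integers go through the zigzag map `(v ^ -s) * 2 + s`. A host sketch of both; the long-form branch (elided in the hunk) is assumed here to follow the standard compact protocol, writing the type byte and then the absolute field id as a zigzag varint:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// LEB128 varint, matching cpw_put_uint32.
static void put_uvarint(std::vector<uint8_t>& out, uint64_t v)
{
  while (v > 0x7f) { out.push_back(uint8_t(v | 0x80)); v >>= 7; }
  out.push_back(uint8_t(v));
}

// Zigzag map, matching cpw_put_int32: 0,-1,1,-2,... -> 0,1,2,3,...
static uint32_t zigzag32(int32_t v)
{
  int32_t s = (v < 0);
  return uint32_t(v ^ -s) * 2 + uint32_t(s);
}

// Compact-protocol field header, matching cpw_put_fldh: short form packs
// the delta to the previous field id with the type nibble; the long form
// (assumed, standard Thrift) writes the type then the id as zigzag varint.
static void put_field_header(std::vector<uint8_t>& out, int field, int& cur, int type)
{
  if (field > cur && field <= cur + 15) {
    out.push_back(uint8_t(((field - cur) << 4) | type));  // short form
  } else {
    out.push_back(uint8_t(type));                         // long form
    put_uvarint(out, zigzag32(field));
  }
  cur = field;
}

int main()
{
  std::vector<uint8_t> buf;
  int cur = 0;
  put_field_header(buf, 1, cur, /*ST_FLD_I32=*/5);  // delta 1 -> byte 0x15
  put_uvarint(buf, zigzag32(-3));                   // value -3 -> byte 0x05
  std::printf("%02x %02x\n", buf[0], buf[1]);
}
```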
gpuEncodePageHeaders(device_span pages, device_span comp_stat, device_span page_stats, - const statistics_chunk *chunk_stats) + const statistics_chunk* chunk_stats) { // When this whole kernel becomes single thread, the following variables need not be __shared__ __shared__ __align__(8) parquet_column_device_view col_g; @@ -1579,7 +1582,7 @@ __global__ void __launch_bounds__(1024) uint32_t t = threadIdx.x; uint8_t *dst, *dst_base; - const EncPage *first_page; + const EncPage* first_page; uint32_t num_pages, uncompressed_size; if (t == 0) ck_g = chunks[blockIdx.x]; @@ -1592,7 +1595,7 @@ __global__ void __launch_bounds__(1024) dst_base = dst; uncompressed_size = ck_g.bfr_size; for (uint32_t page = 0; page < num_pages; page++) { - const uint8_t *src; + const uint8_t* src; uint32_t hdr_len, data_len; if (t == 0) { page_g = first_page[page]; } @@ -1625,8 +1628,8 @@ __global__ void __launch_bounds__(1024) * */ struct def_level_fn { - column_device_view const *parent_col; - uint8_t const *d_nullability; + column_device_view const* parent_col; + uint8_t const* d_nullability; uint8_t sub_level_start; uint8_t curr_def_level; @@ -1757,12 +1760,14 @@ struct def_level_fn { */ dremel_data get_dremel_data(column_view h_col, // TODO(cp): use device_span once it is converted to a single hd_vec - rmm::device_uvector const &d_nullability, - std::vector const &nullability, + rmm::device_uvector const& d_nullability, + std::vector const& nullability, rmm::cuda_stream_view stream) { auto get_list_level = [](column_view col) { - while (col.type().id() == type_id::STRUCT) { col = col.child(0); } + while (col.type().id() == type_id::STRUCT) { + col = col.child(0); + } return col; }; @@ -1832,7 +1837,7 @@ dremel_data get_dremel_data(column_view h_col, } std::unique_ptr device_view_owners; - column_device_view *d_nesting_levels; + column_device_view* d_nesting_levels; std::tie(device_view_owners, d_nesting_levels) = contiguous_copy_column_device_views(nesting_levels, stream); @@ -2147,8 +2152,8 @@ void InitEncoderPages(device_2dspan chunks, device_span pages, device_span col_desc, int32_t num_columns, - statistics_merge_group *page_grstats, - statistics_merge_group *chunk_grstats, + statistics_merge_group* page_grstats, + statistics_merge_group* chunk_grstats, rmm::cuda_stream_view stream) { auto num_rowgroups = chunks.size().first; @@ -2199,7 +2204,7 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view void EncodePageHeaders(device_span pages, device_span comp_stat, device_span page_stats, - const statistics_chunk *chunk_stats, + const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream) { // TODO: single thread task. No need for 128 threads/block. Earlier it used to employ rest of the diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index bc10fd92566..a5536775116 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -45,9 +45,9 @@ static const __device__ __constant__ uint8_t g_list2struct[16] = {0, ST_FLD_LIST}; struct byte_stream_s { - const uint8_t *cur; - const uint8_t *end; - const uint8_t *base; + const uint8_t* cur; + const uint8_t* end; + const uint8_t* base; // Parsed symbols PageType page_type; PageInfo page; @@ -61,12 +61,12 @@ struct byte_stream_s { * * @return Current byte pointed to by the byte stream */ -inline __device__ unsigned int getb(byte_stream_s *bs) +inline __device__ unsigned int getb(byte_stream_s* bs) { return (bs->cur < bs->end) ? 
*bs->cur++ : 0; } -inline __device__ void skip_bytes(byte_stream_s *bs, size_t bytecnt) +inline __device__ void skip_bytes(byte_stream_s* bs, size_t bytecnt) { bytecnt = min(bytecnt, (size_t)(bs->end - bs->cur)); bs->cur += bytecnt; @@ -83,7 +83,7 @@ inline __device__ void skip_bytes(byte_stream_s *bs, size_t bytecnt) * * @return Decoded 32 bit integer */ -__device__ uint32_t get_u32(byte_stream_s *bs) +__device__ uint32_t get_u32(byte_stream_s* bs) { uint32_t v = 0, l = 0, c; do { @@ -105,13 +105,13 @@ __device__ uint32_t get_u32(byte_stream_s *bs) * * @return Decoded 32 bit integer */ -inline __device__ int32_t get_i32(byte_stream_s *bs) +inline __device__ int32_t get_i32(byte_stream_s* bs) { uint32_t u = get_u32(bs); return (int32_t)((u >> 1u) ^ -(int32_t)(u & 1)); } -__device__ void skip_struct_field(byte_stream_s *bs, int field_type) +__device__ void skip_struct_field(byte_stream_s* bs, int field_type) { int struct_depth = 0; int rep_cnt = 0; @@ -161,11 +161,11 @@ __device__ void skip_struct_field(byte_stream_s *bs, int field_type) */ struct ParquetFieldInt32 { int field; - int32_t &val; + int32_t& val; - __device__ ParquetFieldInt32(int f, int32_t &v) : field(f), val(v) {} + __device__ ParquetFieldInt32(int f, int32_t& v) : field(f), val(v) {} - inline __device__ bool operator()(byte_stream_s *bs, int field_type) + inline __device__ bool operator()(byte_stream_s* bs, int field_type) { val = get_i32(bs); return (field_type != ST_FLD_I32); @@ -180,11 +180,11 @@ struct ParquetFieldInt32 { template struct ParquetFieldEnum { int field; - Enum &val; + Enum& val; - __device__ ParquetFieldEnum(int f, Enum &v) : field(f), val(v) {} + __device__ ParquetFieldEnum(int f, Enum& v) : field(f), val(v) {} - inline __device__ bool operator()(byte_stream_s *bs, int field_type) + inline __device__ bool operator()(byte_stream_s* bs, int field_type) { val = static_cast(get_i32(bs)); return (field_type != ST_FLD_I32); @@ -204,7 +204,7 @@ struct ParquetFieldStruct { __device__ ParquetFieldStruct(int f) : field(f) {} - inline __device__ bool operator()(byte_stream_s *bs, int field_type) + inline __device__ bool operator()(byte_stream_s* bs, int field_type) { return ((field_type != ST_FLD_STRUCT) || !op(bs)); } @@ -226,10 +226,10 @@ struct ParquetFieldStruct { template struct FunctionSwitchImpl { template - static inline __device__ bool run(byte_stream_s *bs, + static inline __device__ bool run(byte_stream_s* bs, int field_type, - const int &field, - thrust::tuple &ops) + const int& field, + thrust::tuple& ops) { if (field == thrust::get(ops).field) { return thrust::get(ops)(bs, field_type); @@ -242,10 +242,10 @@ struct FunctionSwitchImpl { template <> struct FunctionSwitchImpl<0> { template - static inline __device__ bool run(byte_stream_s *bs, + static inline __device__ bool run(byte_stream_s* bs, int field_type, - const int &field, - thrust::tuple &ops) + const int& field, + thrust::tuple& ops) { if (field == thrust::get<0>(ops).field) { return thrust::get<0>(ops)(bs, field_type); @@ -267,7 +267,7 @@ struct FunctionSwitchImpl<0> { * byte stream. Otherwise true is returned. 
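`get_u32` and `get_i32` above are the decode-side mirrors of those writer helpers: a ULEB128 varint accumulated 7 bits at a time, then the zigzag inverse `(u >> 1) ^ -(u & 1)` to recover the signed value. A host mirror of both, reproducing the exact expressions from the byte-stream readers:

```cpp
#include <cstdint>
#include <cstdio>

// Host mirror of get_u32: ULEB128 varint decode, 7 payload bits per byte,
// reading 0 past the end exactly as the bounds-checked device reader does.
static uint32_t get_u32(uint8_t const*& cur, uint8_t const* end)
{
  uint32_t v = 0, l = 0, c;
  do {
    c = (cur < end) ? *cur++ : 0;
    v |= (c & 0x7f) << l;
    l += 7;
  } while ((c & 0x80) && l < 32);
  return v;
}

// Host mirror of get_i32: zigzag inverse, mapping 0,1,2,3,... back to
// 0,-1,1,-2,... with the same expression used above.
static int32_t get_i32(uint8_t const*& cur, uint8_t const* end)
{
  uint32_t u = get_u32(cur, end);
  return (int32_t)((u >> 1u) ^ -(int32_t)(u & 1));
}

int main()
{
  uint8_t const bytes[] = {0x05, 0xac, 0x02};  // zigzag(-3), varint(300)
  uint8_t const* p      = bytes;
  int32_t a  = get_i32(p, bytes + 3);  // sequenced: p advances between calls
  uint32_t b = get_u32(p, bytes + 3);
  std::printf("%d %u\n", a, b);  // -3 300
}
```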
*/ template -inline __device__ bool parse_header(thrust::tuple &op, byte_stream_s *bs) +inline __device__ bool parse_header(thrust::tuple& op, byte_stream_s* bs) { constexpr int index = thrust::tuple_size>::value - 1; int field = 0; @@ -284,7 +284,7 @@ inline __device__ bool parse_header(thrust::tuple &op, byte_stream_ } struct gpuParseDataPageHeader { - __device__ bool operator()(byte_stream_s *bs) + __device__ bool operator()(byte_stream_s* bs) { auto op = thrust::make_tuple(ParquetFieldInt32(1, bs->page.num_input_values), ParquetFieldEnum(2, bs->page.encoding), @@ -295,7 +295,7 @@ struct gpuParseDataPageHeader { }; struct gpuParseDictionaryPageHeader { - __device__ bool operator()(byte_stream_s *bs) + __device__ bool operator()(byte_stream_s* bs) { auto op = thrust::make_tuple(ParquetFieldInt32(1, bs->page.num_input_values), ParquetFieldEnum(2, bs->page.encoding)); @@ -304,7 +304,7 @@ struct gpuParseDictionaryPageHeader { }; struct gpuParseDataPageHeaderV2 { - __device__ bool operator()(byte_stream_s *bs) + __device__ bool operator()(byte_stream_s* bs) { auto op = thrust::make_tuple(ParquetFieldInt32(1, bs->page.num_input_values), ParquetFieldInt32(3, bs->page.num_rows), @@ -316,7 +316,7 @@ struct gpuParseDataPageHeaderV2 { }; struct gpuParsePageHeader { - __device__ bool operator()(byte_stream_s *bs) + __device__ bool operator()(byte_stream_s* bs) { auto op = thrust::make_tuple(ParquetFieldEnum(1, bs->page_type), ParquetFieldInt32(2, bs->page.uncompressed_page_size), @@ -336,14 +336,14 @@ struct gpuParsePageHeader { */ // blockDim {128,1,1} extern "C" __global__ void __launch_bounds__(128) - gpuDecodePageHeaders(ColumnChunkDesc *chunks, int32_t num_chunks) + gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks) { gpuParsePageHeader parse_page_header; __shared__ byte_stream_s bs_g[4]; int lane_id = threadIdx.x % 32; int chunk = (blockIdx.x * 4) + (threadIdx.x / 32); - byte_stream_s *const bs = &bs_g[threadIdx.x / 32]; + byte_stream_s* const bs = &bs_g[threadIdx.x / 32]; if (chunk < num_chunks and lane_id == 0) bs->ck = chunks[chunk]; __syncthreads(); @@ -354,7 +354,7 @@ extern "C" __global__ void __launch_bounds__(128) uint32_t dictionary_page_count = 0; int32_t max_num_pages; int32_t num_dict_pages = bs->ck.num_dict_pages; - PageInfo *page_info; + PageInfo* page_info; if (!lane_id) { bs->base = bs->cur = bs->ck.compressed_data; @@ -402,7 +402,7 @@ extern "C" __global__ void __launch_bounds__(128) break; default: index_out = -1; break; } - bs->page.page_data = const_cast(bs->cur); + bs->page.page_data = const_cast(bs->cur); bs->cur += bs->page.compressed_page_size; } else { bs->cur = bs->end; @@ -434,21 +434,21 @@ extern "C" __global__ void __launch_bounds__(128) */ // blockDim {128,1,1} extern "C" __global__ void __launch_bounds__(128) - gpuBuildStringDictionaryIndex(ColumnChunkDesc *chunks, int32_t num_chunks) + gpuBuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks) { __shared__ ColumnChunkDesc chunk_g[4]; int lane_id = threadIdx.x % 32; int chunk = (blockIdx.x * 4) + (threadIdx.x / 32); - ColumnChunkDesc *const ck = &chunk_g[threadIdx.x / 32]; + ColumnChunkDesc* const ck = &chunk_g[threadIdx.x / 32]; if (chunk < num_chunks and lane_id == 0) *ck = chunks[chunk]; __syncthreads(); if (chunk >= num_chunks) { return; } if (!lane_id && ck->num_dict_pages > 0 && ck->str_dict_index) { // Data type to describe a string - string_index_pair *dict_index = ck->str_dict_index; - const uint8_t *dict = ck->page_info[0].page_data; + string_index_pair* dict_index = 
ck->str_dict_index; + const uint8_t* dict = ck->page_info[0].page_data; int dict_size = ck->page_info[0].uncompressed_page_size; int num_entries = ck->page_info[0].num_input_values; int pos = 0, cur = 0; @@ -464,13 +464,13 @@ extern "C" __global__ void __launch_bounds__(128) } } // TODO: Could store 8 entries in shared mem, then do a single warp-wide store - dict_index[i].first = reinterpret_cast(dict + pos + 4); + dict_index[i].first = reinterpret_cast(dict + pos + 4); dict_index[i].second = len; } } } -void __host__ DecodePageHeaders(ColumnChunkDesc *chunks, +void __host__ DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream) { @@ -479,7 +479,7 @@ void __host__ DecodePageHeaders(ColumnChunkDesc *chunks, gpuDecodePageHeaders<<>>(chunks, num_chunks); } -void __host__ BuildStringDictionaryIndex(ColumnChunkDesc *chunks, +void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/parquet/parquet.cpp b/cpp/src/io/parquet/parquet.cpp index 2a1bd0d5a18..6c658788fa1 100644 --- a/cpp/src/io/parquet/parquet.cpp +++ b/cpp/src/io/parquet/parquet.cpp @@ -63,7 +63,8 @@ bool CompactProtocolReader::skip_struct_field(int t, int depth) if (n == 0xf) n = get_i32(); t = g_list2struct[c & 0xf]; if (depth > 10) return false; - for (int32_t i = 0; i < n; i++) skip_struct_field(t, depth + 1); + for (int32_t i = 0; i < n; i++) + skip_struct_field(t, depth + 1); } break; case ST_FLD_STRUCT: for (;;) { @@ -84,10 +85,10 @@ bool CompactProtocolReader::skip_struct_field(int t, int depth) template struct FunctionSwitchImpl { template - static inline bool run(CompactProtocolReader *cpr, + static inline bool run(CompactProtocolReader* cpr, int field_type, - const int &field, - std::tuple &ops) + const int& field, + std::tuple& ops) { if (field == std::get(ops).field()) { return std::get(ops)(cpr, field_type); @@ -100,10 +101,10 @@ struct FunctionSwitchImpl { template <> struct FunctionSwitchImpl<0> { template - static inline bool run(CompactProtocolReader *cpr, + static inline bool run(CompactProtocolReader* cpr, int field_type, - const int &field, - std::tuple &ops) + const int& field, + std::tuple& ops) { if (field == std::get<0>(ops).field()) { return std::get<0>(ops)(cpr, field_type); @@ -115,7 +116,7 @@ struct FunctionSwitchImpl<0> { }; template -inline bool function_builder(CompactProtocolReader *cpr, std::tuple &op) +inline bool function_builder(CompactProtocolReader* cpr, std::tuple& op) { constexpr int index = std::tuple_size>::value - 1; int field = 0; @@ -131,7 +132,7 @@ inline bool function_builder(CompactProtocolReader *cpr, std::tuple return true; } -bool CompactProtocolReader::read(FileMetaData *f) +bool CompactProtocolReader::read(FileMetaData* f) { auto op = std::make_tuple(ParquetFieldInt32(1, f->version), ParquetFieldStructList(2, f->schema), @@ -142,7 +143,7 @@ bool CompactProtocolReader::read(FileMetaData *f) return function_builder(this, op); } -bool CompactProtocolReader::read(SchemaElement *s) +bool CompactProtocolReader::read(SchemaElement* s) { auto op = std::make_tuple(ParquetFieldEnum(1, s->type), ParquetFieldInt32(2, s->type_length), @@ -156,7 +157,7 @@ bool CompactProtocolReader::read(SchemaElement *s) return function_builder(this, op); } -bool CompactProtocolReader::read(LogicalType *l) +bool CompactProtocolReader::read(LogicalType* l) { auto op = std::make_tuple(ParquetFieldUnion(1, l->isset.STRING, l->STRING), @@ -174,40 +175,40 @@ bool 
CompactProtocolReader::read(LogicalType *l) return function_builder(this, op); } -bool CompactProtocolReader::read(DecimalType *d) +bool CompactProtocolReader::read(DecimalType* d) { auto op = std::make_tuple(ParquetFieldInt32(1, d->scale), ParquetFieldInt32(2, d->precision)); return function_builder(this, op); } -bool CompactProtocolReader::read(TimeType *t) +bool CompactProtocolReader::read(TimeType* t) { auto op = std::make_tuple(ParquetFieldBool(1, t->isAdjustedToUTC), ParquetFieldStruct(2, t->unit)); return function_builder(this, op); } -bool CompactProtocolReader::read(TimestampType *t) +bool CompactProtocolReader::read(TimestampType* t) { auto op = std::make_tuple(ParquetFieldBool(1, t->isAdjustedToUTC), ParquetFieldStruct(2, t->unit)); return function_builder(this, op); } -bool CompactProtocolReader::read(TimeUnit *u) +bool CompactProtocolReader::read(TimeUnit* u) { auto op = std::make_tuple(ParquetFieldUnion(1, u->isset.MILLIS, u->MILLIS), ParquetFieldUnion(2, u->isset.MICROS, u->MICROS)); return function_builder(this, op); } -bool CompactProtocolReader::read(IntType *i) +bool CompactProtocolReader::read(IntType* i) { auto op = std::make_tuple(ParquetFieldInt8(1, i->bitWidth), ParquetFieldBool(2, i->isSigned)); return function_builder(this, op); } -bool CompactProtocolReader::read(RowGroup *r) +bool CompactProtocolReader::read(RowGroup* r) { auto op = std::make_tuple(ParquetFieldStructList(1, r->columns), ParquetFieldInt64(2, r->total_byte_size), @@ -215,7 +216,7 @@ bool CompactProtocolReader::read(RowGroup *r) return function_builder(this, op); } -bool CompactProtocolReader::read(ColumnChunk *c) +bool CompactProtocolReader::read(ColumnChunk* c) { auto op = std::make_tuple(ParquetFieldString(1, c->file_path), ParquetFieldInt64(2, c->file_offset), @@ -227,7 +228,7 @@ bool CompactProtocolReader::read(ColumnChunk *c) return function_builder(this, op); } -bool CompactProtocolReader::read(ColumnChunkMetaData *c) +bool CompactProtocolReader::read(ColumnChunkMetaData* c) { auto op = std::make_tuple(ParquetFieldEnum(1, c->type), ParquetFieldEnumList(2, c->encodings), @@ -243,7 +244,7 @@ bool CompactProtocolReader::read(ColumnChunkMetaData *c) return function_builder(this, op); } -bool CompactProtocolReader::read(PageHeader *p) +bool CompactProtocolReader::read(PageHeader* p) { auto op = std::make_tuple(ParquetFieldEnum(1, p->type), ParquetFieldInt32(2, p->uncompressed_page_size), @@ -253,7 +254,7 @@ bool CompactProtocolReader::read(PageHeader *p) return function_builder(this, op); } -bool CompactProtocolReader::read(DataPageHeader *d) +bool CompactProtocolReader::read(DataPageHeader* d) { auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), ParquetFieldEnum(2, d->encoding), @@ -262,14 +263,14 @@ bool CompactProtocolReader::read(DataPageHeader *d) return function_builder(this, op); } -bool CompactProtocolReader::read(DictionaryPageHeader *d) +bool CompactProtocolReader::read(DictionaryPageHeader* d) { auto op = std::make_tuple(ParquetFieldInt32(1, d->num_values), ParquetFieldEnum(2, d->encoding)); return function_builder(this, op); } -bool CompactProtocolReader::read(KeyValue *k) +bool CompactProtocolReader::read(KeyValue* k) { auto op = std::make_tuple(ParquetFieldString(1, k->key), ParquetFieldString(2, k->value)); return function_builder(this, op); @@ -282,24 +283,24 @@ bool CompactProtocolReader::read(KeyValue *k) * * @return True if schema constructed completely, false otherwise */ -bool CompactProtocolReader::InitSchema(FileMetaData *md) +bool 
CompactProtocolReader::InitSchema(FileMetaData* md) { if (static_cast(WalkSchema(md)) != md->schema.size()) return false; /* Inside FileMetaData, there is a std::vector of RowGroups and each RowGroup contains a * a std::vector of ColumnChunks. Each ColumnChunk has a member ColumnMetaData, which contains * a std::vector of std::strings representing paths. The purpose of the code below is to set the - * schema_idx of each column of each row to it corresonding row_group. This is effectively + * schema_idx of each column of each row to it corresponding row_group. This is effectively * mapping the columns to the schema. */ - for (auto &row_group : md->row_groups) { + for (auto& row_group : md->row_groups) { int current_schema_index = 0; - for (auto &column : row_group.columns) { + for (auto& column : row_group.columns) { int parent = 0; // root of schema - for (auto const &path : column.meta_data.path_in_schema) { + for (auto const& path : column.meta_data.path_in_schema) { auto const it = [&] { // find_if starting at (current_schema_index + 1) and then wrapping - auto schema = [&](auto const &e) { return e.parent_idx == parent && e.name == path; }; + auto schema = [&](auto const& e) { return e.parent_idx == parent && e.name == path; }; auto mid = md->schema.cbegin() + current_schema_index + 1; auto it = std::find_if(mid, md->schema.cend(), schema); if (it != md->schema.cend()) return it; @@ -328,10 +329,10 @@ bool CompactProtocolReader::InitSchema(FileMetaData *md) * @return The node index that was populated */ int CompactProtocolReader::WalkSchema( - FileMetaData *md, int idx, int parent_idx, int max_def_level, int max_rep_level) + FileMetaData* md, int idx, int parent_idx, int max_def_level, int max_rep_level) { if (idx >= 0 && (size_t)idx < md->schema.size()) { - SchemaElement *e = &md->schema[idx]; + SchemaElement* e = &md->schema[idx]; if (e->repetition_type == OPTIONAL) { ++max_def_level; } else if (e->repetition_type == REPEATED) { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index eefff518a9a..2232017409d 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -166,7 +166,7 @@ struct SchemaElement { int max_repetition_level = 0; int parent_idx = 0; - bool operator==(SchemaElement const &other) const + bool operator==(SchemaElement const& other) const { return type == other.type && converted_type == other.converted_type && type_length == other.type_length && repetition_type == other.repetition_type && @@ -232,7 +232,7 @@ struct ColumnChunkMetaData { * column * * Each column chunk lives in a particular row group and are guaranteed to be - * contiguous in the file. Any mssing or corrupted chunks can be skipped during + * contiguous in the file. Any missing or corrupted chunks can be skipped during * reading. 
*/ struct ColumnChunk { @@ -356,8 +356,8 @@ class CompactProtocolReader { static const uint8_t g_list2struct[16]; public: - explicit CompactProtocolReader(const uint8_t *base = nullptr, size_t len = 0) { init(base, len); } - void init(const uint8_t *base, size_t len) + explicit CompactProtocolReader(const uint8_t* base = nullptr, size_t len = 0) { init(base, len); } + void init(const uint8_t* base, size_t len) { m_base = m_cur = base; m_end = base + len; @@ -400,7 +400,7 @@ class CompactProtocolReader { uint64_t u = get_u64(); return (int64_t)((u >> 1u) ^ -(int64_t)(u & 1)); } - int32_t get_listh(uint8_t *el_type) noexcept + int32_t get_listh(uint8_t* el_type) noexcept { uint32_t c = getb(); int32_t sz = c >> 4; @@ -412,40 +412,40 @@ class CompactProtocolReader { public: // Generate Thrift structure parsing routines - bool read(FileMetaData *f); - bool read(SchemaElement *s); - bool read(LogicalType *l); - bool read(DecimalType *d); - bool read(TimeType *t); - bool read(TimeUnit *u); - bool read(TimestampType *t); - bool read(IntType *t); - bool read(RowGroup *r); - bool read(ColumnChunk *c); - bool read(ColumnChunkMetaData *c); - bool read(PageHeader *p); - bool read(DataPageHeader *d); - bool read(DictionaryPageHeader *d); - bool read(KeyValue *k); + bool read(FileMetaData* f); + bool read(SchemaElement* s); + bool read(LogicalType* l); + bool read(DecimalType* d); + bool read(TimeType* t); + bool read(TimeUnit* u); + bool read(TimestampType* t); + bool read(IntType* t); + bool read(RowGroup* r); + bool read(ColumnChunk* c); + bool read(ColumnChunkMetaData* c); + bool read(PageHeader* p); + bool read(DataPageHeader* d); + bool read(DictionaryPageHeader* d); + bool read(KeyValue* k); public: static int NumRequiredBits(uint32_t max_level) noexcept { return 32 - CountLeadingZeros32(max_level); } - bool InitSchema(FileMetaData *md); + bool InitSchema(FileMetaData* md); protected: - int WalkSchema(FileMetaData *md, + int WalkSchema(FileMetaData* md, int idx = 0, int parent_idx = 0, int max_def_level = 0, int max_rep_level = 0); protected: - const uint8_t *m_base = nullptr; - const uint8_t *m_cur = nullptr; - const uint8_t *m_end = nullptr; + const uint8_t* m_base = nullptr; + const uint8_t* m_cur = nullptr; + const uint8_t* m_end = nullptr; friend class ParquetFieldBool; friend class ParquetFieldInt8; @@ -473,12 +473,12 @@ class CompactProtocolReader { */ class ParquetFieldBool { int field_val; - bool &val; + bool& val; public: - ParquetFieldBool(int f, bool &v) : field_val(f), val(v) {} + ParquetFieldBool(int f, bool& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { return (field_type != ST_FLD_TRUE && field_type != ST_FLD_FALSE) || !(val = (field_type == ST_FLD_TRUE), true); @@ -494,12 +494,12 @@ class ParquetFieldBool { */ class ParquetFieldInt8 { int field_val; - int8_t &val; + int8_t& val; public: - ParquetFieldInt8(int f, int8_t &v) : field_val(f), val(v) {} + ParquetFieldInt8(int f, int8_t& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { val = cpr->getb(); return (field_type != ST_FLD_BYTE); @@ -515,12 +515,12 @@ class ParquetFieldInt8 { */ class ParquetFieldInt32 { int field_val; - int32_t &val; + int32_t& val; public: - ParquetFieldInt32(int f, int32_t &v) : field_val(f), val(v) {} + ParquetFieldInt32(int f, int32_t& v) : field_val(f), val(v) {} - 
inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { val = cpr->get_i32(); return (field_type != ST_FLD_I32); @@ -536,12 +536,12 @@ class ParquetFieldInt32 { */ class ParquetFieldInt64 { int field_val; - int64_t &val; + int64_t& val; public: - ParquetFieldInt64(int f, int64_t &v) : field_val(f), val(v) {} + ParquetFieldInt64(int f, int64_t& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { val = cpr->get_i64(); return (field_type < ST_FLD_I16 || field_type > ST_FLD_I64); @@ -559,12 +559,12 @@ class ParquetFieldInt64 { template class ParquetFieldStructListFunctor { int field_val; - std::vector &val; + std::vector& val; public: - ParquetFieldStructListFunctor(int f, std::vector &v) : field_val(f), val(v) {} + ParquetFieldStructListFunctor(int f, std::vector& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { if (field_type != ST_FLD_LIST) return true; @@ -584,7 +584,7 @@ class ParquetFieldStructListFunctor { }; template -ParquetFieldStructListFunctor ParquetFieldStructList(int f, std::vector &v) +ParquetFieldStructListFunctor ParquetFieldStructList(int f, std::vector& v) { return ParquetFieldStructListFunctor(f, v); } @@ -597,17 +597,17 @@ ParquetFieldStructListFunctor ParquetFieldStructList(int f, std::vector &v */ class ParquetFieldString { int field_val; - std::string &val; + std::string& val; public: - ParquetFieldString(int f, std::string &v) : field_val(f), val(v) {} + ParquetFieldString(int f, std::string& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { if (field_type != ST_FLD_BINARY) return true; uint32_t n = cpr->get_u32(); if (n < (size_t)(cpr->m_end - cpr->m_cur)) { - val.assign((const char *)cpr->m_cur, n); + val.assign((const char*)cpr->m_cur, n); cpr->m_cur += n; return false; } else { @@ -627,12 +627,12 @@ class ParquetFieldString { template class ParquetFieldStructFunctor { int field_val; - T &val; + T& val; public: - ParquetFieldStructFunctor(int f, T &v) : field_val(f), val(v) {} + ParquetFieldStructFunctor(int f, T& v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { return (field_type != ST_FLD_STRUCT || !(cpr->read(&val))); } @@ -641,7 +641,7 @@ class ParquetFieldStructFunctor { }; template -ParquetFieldStructFunctor ParquetFieldStruct(int f, T &v) +ParquetFieldStructFunctor ParquetFieldStruct(int f, T& v) { return ParquetFieldStructFunctor(f, v); } @@ -657,13 +657,13 @@ ParquetFieldStructFunctor ParquetFieldStruct(int f, T &v) template class ParquetFieldUnionFunctor { int field_val; - bool &is_set; - T &val; + bool& is_set; + T& val; public: - ParquetFieldUnionFunctor(int f, bool &b, T &v) : field_val(f), is_set(b), val(v) {} + ParquetFieldUnionFunctor(int f, bool& b, T& v) : field_val(f), is_set(b), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { if (field_type != ST_FLD_STRUCT) { return true; @@ -679,13 +679,13 @@ class ParquetFieldUnionFunctor { template struct ParquetFieldUnionFunctor { int field_val; - bool 
&is_set; - T &val; + bool& is_set; + T& val; public: - ParquetFieldUnionFunctor(int f, bool &b, T &v) : field_val(f), is_set(b), val(v) {} + ParquetFieldUnionFunctor(int f, bool& b, T& v) : field_val(f), is_set(b), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + inline bool operator()(CompactProtocolReader* cpr, int field_type) { if (field_type != ST_FLD_STRUCT) { return true; @@ -700,7 +700,7 @@ struct ParquetFieldUnionFunctor { }; template -ParquetFieldUnionFunctor::value> ParquetFieldUnion(int f, bool &b, T &v) +ParquetFieldUnionFunctor::value> ParquetFieldUnion(int f, bool& b, T& v) { return ParquetFieldUnionFunctor::value>(f, b, v); } @@ -713,11 +713,11 @@ ParquetFieldUnionFunctor::value> ParquetFieldUnion(int f, bo template class ParquetFieldEnum { int field_val; - Enum &val; + Enum& val; public: - ParquetFieldEnum(int f, Enum &v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + ParquetFieldEnum(int f, Enum& v) : field_val(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int field_type) { val = static_cast(cpr->get_i32()); return (field_type != ST_FLD_I32); @@ -735,11 +735,11 @@ class ParquetFieldEnum { template class ParquetFieldEnumListFunctor { int field_val; - std::vector &val; + std::vector& val; public: - ParquetFieldEnumListFunctor(int f, std::vector &v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + ParquetFieldEnumListFunctor(int f, std::vector& v) : field_val(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int field_type) { if (field_type != ST_FLD_LIST) return true; int current_byte = cpr->getb(); @@ -747,7 +747,9 @@ class ParquetFieldEnumListFunctor { int n = current_byte >> 4; if (n == 0xf) n = cpr->get_u32(); val.resize(n); - for (int32_t i = 0; i < n; i++) { val[i] = static_cast(cpr->get_i32()); } + for (int32_t i = 0; i < n; i++) { + val[i] = static_cast(cpr->get_i32()); + } return false; } @@ -755,7 +757,7 @@ class ParquetFieldEnumListFunctor { }; template -ParquetFieldEnumListFunctor ParquetFieldEnumList(int field, std::vector &v) +ParquetFieldEnumListFunctor ParquetFieldEnumList(int field, std::vector& v) { return ParquetFieldEnumListFunctor(field, v); } @@ -768,11 +770,11 @@ ParquetFieldEnumListFunctor ParquetFieldEnumList(int field, std::vector &v */ class ParquetFieldStringList { int field_val; - std::vector &val; + std::vector& val; public: - ParquetFieldStringList(int f, std::vector &v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + ParquetFieldStringList(int f, std::vector& v) : field_val(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int field_type) { if (field_type != ST_FLD_LIST) return true; int current_byte = cpr->getb(); @@ -783,7 +785,7 @@ class ParquetFieldStringList { for (int32_t i = 0; i < n; i++) { uint32_t l = cpr->get_u32(); if (l < (size_t)(cpr->m_end - cpr->m_cur)) { - val[i].assign((const char *)cpr->m_cur, l); + val[i].assign((const char*)cpr->m_cur, l); cpr->m_cur += l; } else return true; @@ -801,14 +803,14 @@ class ParquetFieldStringList { */ class ParquetFieldStructBlob { int field_val; - std::vector &val; + std::vector& val; public: - ParquetFieldStructBlob(int f, std::vector &v) : field_val(f), val(v) {} - inline bool operator()(CompactProtocolReader *cpr, int field_type) + ParquetFieldStructBlob(int f, std::vector& v) : field_val(f), val(v) {} + inline bool operator()(CompactProtocolReader* cpr, int 
field_type) { if (field_type != ST_FLD_STRUCT) return true; - const uint8_t *start = cpr->m_cur; + const uint8_t* start = cpr->m_cur; cpr->skip_struct_field(field_type); if (cpr->m_cur > start) { val.assign(start, cpr->m_cur - 1); } return false; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 1b6bb9ad7ca..abd7ccef523 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -95,15 +95,15 @@ struct PageNestingInfo { int32_t value_count; // total # of values decoded in this page/nesting-level int32_t null_count; // null count int32_t valid_map_offset; // current offset in bits relative to valid_map - uint8_t *data_out; // pointer into output buffer - uint32_t *valid_map; // pointer into output validity buffer + uint8_t* data_out; // pointer into output buffer + uint32_t* valid_map; // pointer into output validity buffer }; /** * @brief Struct describing a particular page of column chunk data */ struct PageInfo { - uint8_t *page_data; // Compressed page data before decompression, or uncompressed data after + uint8_t* page_data; // Compressed page data before decompression, or uncompressed data after // decompression int32_t compressed_page_size; // compressed data size in bytes int32_t uncompressed_page_size; // uncompressed data size in bytes @@ -139,7 +139,7 @@ struct PageInfo { // nesting information (input/output) for each page int num_nesting_levels; - PageNestingInfo *nesting; + PageNestingInfo* nesting; }; /** @@ -148,7 +148,7 @@ struct PageInfo { struct ColumnChunkDesc { ColumnChunkDesc() = default; explicit constexpr ColumnChunkDesc(size_t compressed_size_, - uint8_t *compressed_data_, + uint8_t* compressed_data_, size_t num_values_, uint16_t datatype_, uint16_t datatype_length_, @@ -190,7 +190,7 @@ struct ColumnChunkDesc { { } - uint8_t const *compressed_data; // pointer to compressed column chunk data + uint8_t const* compressed_data; // pointer to compressed column chunk data size_t compressed_size; // total compressed data size for this chunk size_t num_values; // total number of values in this column size_t start_row; // starting row of this chunk @@ -204,11 +204,11 @@ struct ColumnChunkDesc { int32_t num_data_pages; // number of data pages int32_t num_dict_pages; // number of dictionary pages int32_t max_num_pages; // size of page_info array - PageInfo *page_info; // output page info for up to num_dict_pages + + PageInfo* page_info; // output page info for up to num_dict_pages + // num_data_pages (dictionary pages first) - string_index_pair *str_dict_index; // index for string dictionary - uint32_t **valid_map_base; // base pointers of valid bit map for this column - void **column_data_base; // base pointers of column data + string_index_pair* str_dict_index; // index for string dictionary + uint32_t** valid_map_base; // base pointers of valid bit map for this column + void** column_data_base; // base pointers of column data int8_t codec; // compressed codec enum int8_t converted_type; // converted type enum int8_t decimal_scale; // decimal scale pow(10, -decimal_scale) @@ -222,21 +222,21 @@ struct ColumnChunkDesc { * @brief Struct describing an encoder column */ struct parquet_column_device_view : stats_column_desc { - uint32_t *dict_index; //!< Dictionary index [row] - uint32_t *dict_data; //!< Dictionary data (unique row indices) + uint32_t* dict_index; //!< Dictionary index [row] + uint32_t* dict_data; //!< Dictionary data (unique row indices) uint8_t physical_type; //!< physical data type uint8_t 
converted_type; //!< logical data type uint8_t level_bits; //!< bits to encode max definition (lower nibble) & repetition (upper nibble) //!< levels constexpr uint8_t num_def_level_bits() { return level_bits & 0xf; } constexpr uint8_t num_rep_level_bits() { return level_bits >> 4; } - size_type const *const - *nesting_offsets; //!< If column is a nested type, contains offset array of each nesting level + size_type const* const* + nesting_offsets; //!< If column is a nested type, contains offset array of each nesting level - size_type const *level_offsets; //!< Offset array for per-row pre-calculated rep/def level values - uint8_t const *rep_values; //!< Pre-calculated repetition level values - uint8_t const *def_values; //!< Pre-calculated definition level values - uint8_t *nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is + size_type const* level_offsets; //!< Offset array for per-row pre-calculated rep/def level values + uint8_t const* rep_values; //!< Pre-calculated repetition level values + uint8_t const* def_values; //!< Pre-calculated definition level values + uint8_t* nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is //!< nullability of parent_column. May be different from col.nullable() in //!< case of chunked writing. }; @@ -265,7 +265,7 @@ constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t); /** * @brief Return the byte length of parquet dtypes that are physically represented by INT32 */ -inline uint32_t __device__ GetDtypeLogicalLen(column_device_view *col) +inline uint32_t __device__ GetDtypeLogicalLen(column_device_view* col) { switch (col->type().id()) { case cudf::type_id::INT8: @@ -291,18 +291,18 @@ struct EncPage; * @brief Struct describing an encoder column chunk */ struct EncColumnChunk { - parquet_column_device_view const *col_desc; //!< Column description - PageFragment *fragments; //!< First fragment in chunk - uint8_t *uncompressed_bfr; //!< Uncompressed page data - uint8_t *compressed_bfr; //!< Compressed page data - statistics_chunk const *stats; //!< Fragment statistics + parquet_column_device_view const* col_desc; //!< Column description + PageFragment* fragments; //!< First fragment in chunk + uint8_t* uncompressed_bfr; //!< Uncompressed page data + uint8_t* compressed_bfr; //!< Compressed page data + statistics_chunk const* stats; //!< Fragment statistics uint32_t bfr_size; //!< Uncompressed buffer size uint32_t compressed_size; //!< Compressed buffer size uint32_t start_row; //!< First row of chunk uint32_t num_rows; //!< Number of rows in chunk uint32_t num_values; //!< Number of values in chunk. 
Different from num_rows for nested types uint32_t first_fragment; //!< First fragment of chunk - EncPage *pages; //!< Ptr to pages that belong to this chunk + EncPage* pages; //!< Ptr to pages that belong to this chunk uint32_t first_page; //!< First page of chunk uint32_t num_pages; //!< Number of pages in chunk uint32_t dictionary_id; //!< Dictionary id for this chunk @@ -318,12 +318,12 @@ struct EncColumnChunk { * @brief Struct describing an encoder data page */ struct EncPage { - uint8_t *page_data; //!< Ptr to uncompressed page - uint8_t *compressed_data; //!< Ptr to compressed page + uint8_t* page_data; //!< Ptr to uncompressed page + uint8_t* compressed_data; //!< Ptr to compressed page uint16_t num_fragments; //!< Number of fragments in page PageType page_type; //!< Page type uint8_t dict_bits_plus1; //!< 0=plain, nonzero:bits to encoding dictionary indices + 1 - EncColumnChunk *chunk; //!< Chunk that this page belongs to + EncColumnChunk* chunk; //!< Chunk that this page belongs to uint32_t chunk_id; //!< Index in chunk array uint32_t hdr_size; //!< Size of page header uint32_t max_hdr_size; //!< Maximum size of page header @@ -333,7 +333,7 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. Includes null/empty elements in //!< non-leaf levels - gpu_inflate_status_s *comp_stat; //!< Ptr to compression status + gpu_inflate_status_s* comp_stat; //!< Ptr to compression status }; /** @@ -343,7 +343,7 @@ struct EncPage { * @param[in] num_chunks Number of column chunks * @param[in] stream CUDA stream to use, default 0 */ -void DecodePageHeaders(ColumnChunkDesc *chunks, int32_t num_chunks, rmm::cuda_stream_view stream); +void DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); /** * @brief Launches kernel for building the dictionary index for the column @@ -353,7 +353,7 @@ void DecodePageHeaders(ColumnChunkDesc *chunks, int32_t num_chunks, rmm::cuda_st * @param[in] num_chunks Number of column chunks * @param[in] stream CUDA stream to use, default 0 */ -void BuildStringDictionaryIndex(ColumnChunkDesc *chunks, +void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); @@ -376,14 +376,14 @@ void BuildStringDictionaryIndex(ColumnChunkDesc *chunks, * @param[in] min_rows crop all rows below min_row * @param[in] stream Cuda stream */ -void PreprocessColumnData(hostdevice_vector &pages, - hostdevice_vector const &chunks, - std::vector &input_columns, - std::vector &output_columns, +void PreprocessColumnData(hostdevice_vector& pages, + hostdevice_vector const& chunks, + std::vector& input_columns, + std::vector& output_columns, size_t num_rows, size_t min_row, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); + rmm::mr::device_memory_resource* mr); /** * @brief Launches kernel for reading the column data stored in the pages @@ -397,8 +397,8 @@ void PreprocessColumnData(hostdevice_vector &pages, * @param[in] min_row Minimum number of rows to read * @param[in] stream CUDA stream to use, default 0 */ -void DecodePageData(hostdevice_vector &pages, - hostdevice_vector const &chunks, +void DecodePageData(hostdevice_vector& pages, + hostdevice_vector const& chunks, size_t num_rows, size_t min_row, rmm::cuda_stream_view stream); @@ -436,8 +436,8 @@ struct dremel_data { * @return A struct containing dremel data */ dremel_data get_dremel_data(column_view h_col, - 
rmm::device_uvector const &d_nullability, - std::vector const &nullability, + rmm::device_uvector const& d_nullability, + std::vector const& nullability, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** @@ -486,8 +486,8 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, device_span pages, device_span col_desc, int32_t num_columns, - statistics_merge_group *page_grstats = nullptr, - statistics_merge_group *chunk_grstats = nullptr, + statistics_merge_group* page_grstats = nullptr, + statistics_merge_group* chunk_grstats = nullptr, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** @@ -524,7 +524,7 @@ void DecideCompression(device_span chunks, void EncodePageHeaders(device_span pages, device_span comp_out = {}, device_span page_stats = {}, - const statistics_chunk *chunk_stats = nullptr, + const statistics_chunk* chunk_stats = nullptr, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** @@ -546,7 +546,7 @@ void GatherPages(device_span chunks, * @param[in] stream CUDA stream to use, default 0 */ void BuildChunkDictionaries(device_span chunks, - uint32_t *dev_scratch, + uint32_t* dev_scratch, rmm::cuda_stream_view stream); } // namespace gpu diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 0863bca7b03..3bf11063035 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -51,7 +51,7 @@ constexpr uint32_t PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED = (1 << 24); namespace { -parquet::ConvertedType logical_type_to_converted_type(parquet::LogicalType const &logical) +parquet::ConvertedType logical_type_to_converted_type(parquet::LogicalType const& logical) { if (logical.isset.STRING) { return parquet::UTF8; @@ -96,7 +96,7 @@ parquet::ConvertedType logical_type_to_converted_type(parquet::LogicalType const /** * @brief Function that translates Parquet datatype to cuDF type enum */ -type_id to_type_id(SchemaElement const &schema, +type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id, bool strict_decimal_types) @@ -232,7 +232,7 @@ std::tuple conversion_info(type_id column_type_id, } // namespace -std::string name_from_path(const std::vector &path_in_schema) +std::string name_from_path(const std::vector& path_in_schema) { // For the case of lists, we will see a schema that looks like: // a.list.element.list.element @@ -273,16 +273,16 @@ std::string name_from_path(const std::vector &path_in_schema) * @brief Class for parsing dataset metadata */ struct metadata : public FileMetaData { - explicit metadata(datasource *source) + explicit metadata(datasource* source) { constexpr auto header_len = sizeof(file_header_s); constexpr auto ender_len = sizeof(file_ender_s); const auto len = source->size(); const auto header_buffer = source->host_read(0, header_len); - const auto header = reinterpret_cast(header_buffer->data()); + const auto header = reinterpret_cast(header_buffer->data()); const auto ender_buffer = source->host_read(len - ender_len, ender_len); - const auto ender = reinterpret_cast(ender_buffer->data()); + const auto ender = reinterpret_cast(ender_buffer->data()); CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source"); CUDF_EXPECTS(header->magic == parquet_magic && ender->magic == parquet_magic, "Corrupted header or footer"); @@ -304,11 +304,11 @@ class aggregate_metadata { /** * @brief Create a metadata object from each element in the source vector */ - auto metadatas_from_sources(std::vector> const &sources) + auto 
metadatas_from_sources(std::vector> const& sources) { std::vector metadatas; std::transform( - sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const &source) { + sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) { return metadata(source.get()); }); return metadatas; @@ -321,8 +321,10 @@ class aggregate_metadata { { std::map merged; // merge key/value maps TODO: warn/throw if there are mismatches? - for (auto const &pfm : per_file_metadata) { - for (auto const &kv : pfm.key_value_metadata) { merged[kv.key] = kv.value; } + for (auto const& pfm : per_file_metadata) { + for (auto const& kv : pfm.key_value_metadata) { + merged[kv.key] = kv.value; + } } return merged; } @@ -333,7 +335,7 @@ class aggregate_metadata { size_type calc_num_rows() const { return std::accumulate( - per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto &sum, auto &pfm) { + per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { return sum + pfm.num_rows; }); } @@ -344,13 +346,13 @@ class aggregate_metadata { size_type calc_num_row_groups() const { return std::accumulate( - per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto &sum, auto &pfm) { + per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { return sum + pfm.row_groups.size(); }); } public: - aggregate_metadata(std::vector> const &sources) + aggregate_metadata(std::vector> const& sources) : per_file_metadata(metadatas_from_sources(sources)), agg_keyval_map(merge_keyval_metadata()), num_rows(calc_num_rows()), @@ -358,7 +360,7 @@ class aggregate_metadata { { // Verify that the input files have matching numbers of columns size_type num_cols = -1; - for (auto const &pfm : per_file_metadata) { + for (auto const& pfm : per_file_metadata) { if (pfm.row_groups.size() != 0) { if (num_cols == -1) num_cols = pfm.row_groups[0].columns.size(); @@ -368,27 +370,27 @@ class aggregate_metadata { } } // Verify that the input files have matching schemas - for (auto const &pfm : per_file_metadata) { + for (auto const& pfm : per_file_metadata) { CUDF_EXPECTS(per_file_metadata[0].schema == pfm.schema, "All sources must have the same schemas"); } } - auto const &get_row_group(size_type row_group_index, size_type src_idx) const + auto const& get_row_group(size_type row_group_index, size_type src_idx) const { CUDF_EXPECTS(src_idx >= 0 && src_idx < static_cast(per_file_metadata.size()), "invalid source index"); return per_file_metadata[src_idx].row_groups[row_group_index]; } - auto const &get_column_metadata(size_type row_group_index, + auto const& get_column_metadata(size_type row_group_index, size_type src_idx, int schema_idx) const { auto col = std::find_if( per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), per_file_metadata[src_idx].row_groups[row_group_index].columns.end(), - [schema_idx](ColumnChunk const &col) { return col.schema_idx == schema_idx ? true : false; }); + [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx ? 
true : false; }); CUDF_EXPECTS(col != std::end(per_file_metadata[src_idx].row_groups[row_group_index].columns), "Found no metadata for schema index"); return col->meta_data; @@ -398,9 +400,9 @@ class aggregate_metadata { auto get_num_row_groups() const { return num_row_groups; } - auto const &get_schema(int schema_idx) const { return per_file_metadata[0].schema[schema_idx]; } + auto const& get_schema(int schema_idx) const { return per_file_metadata[0].schema[schema_idx]; } - auto const &get_key_value_metadata() const { return agg_keyval_map; } + auto const& get_key_value_metadata() const { return agg_keyval_map; } /** * @brief Gets the concrete nesting depth of output cudf columns @@ -411,7 +413,7 @@ class aggregate_metadata { */ inline int get_output_nesting_depth(int schema_index) const { - auto &pfm = per_file_metadata[0]; + auto& pfm = per_file_metadata[0]; int depth = 0; // walk upwards, skipping repeated fields @@ -462,7 +464,7 @@ class aggregate_metadata { * * @param names List of column names to load, where index column name(s) will be added */ - void add_pandas_index_names(std::vector &names) const + void add_pandas_index_names(std::vector& names) const { auto str = get_pandas_index(); if (str.length() != 0) { @@ -499,9 +501,9 @@ class aggregate_metadata { * * @return List of row group indexes and its starting row */ - auto select_row_groups(std::vector> const &row_groups, - size_type &row_start, - size_type &row_count) const + auto select_row_groups(std::vector> const& row_groups, + size_type& row_start, + size_type& row_count) const { if (!row_groups.empty()) { std::vector selection; @@ -510,7 +512,7 @@ class aggregate_metadata { row_count = 0; for (size_t src_idx = 0; src_idx < row_groups.size(); ++src_idx) { - for (auto const &rowgroup_idx : row_groups[src_idx]) { + for (auto const& rowgroup_idx : row_groups[src_idx]) { CUDF_EXPECTS( rowgroup_idx >= 0 && rowgroup_idx < static_cast(per_file_metadata[src_idx].row_groups.size()), @@ -561,16 +563,16 @@ class aggregate_metadata { * @param[in] strict_decimal_types True if it is an error to load an unsupported decimal type * */ - void build_column_info(int &schema_idx, - std::vector &input_columns, - std::vector &output_columns, - std::deque &nesting, + void build_column_info(int& schema_idx, + std::vector& input_columns, + std::vector& output_columns, + std::deque& nesting, bool strings_to_categorical, type_id timestamp_type_id, bool strict_decimal_types) const { int start_schema_idx = schema_idx; - auto const &schema = get_schema(schema_idx); + auto const& schema = get_schema(schema_idx); schema_idx++; // if I am a stub, continue on @@ -595,7 +597,7 @@ class aggregate_metadata { ? data_type{col_type, numeric::scale_type{-schema.decimal_scale}} : data_type{col_type}; output_columns.emplace_back(dtype, schema.repetition_type == OPTIONAL ? true : false); - column_buffer &output_col = output_columns.back(); + column_buffer& output_col = output_columns.back(); output_col.name = schema.name; // build each child @@ -613,7 +615,7 @@ class aggregate_metadata { // data stored) so add me to the list. 
if (schema.num_children == 0) { input_columns.emplace_back(input_column_info{start_schema_idx, schema.name}); - input_column_info &input_col = input_columns.back(); + input_column_info& input_col = input_columns.back(); std::copy(nesting.begin(), nesting.end(), std::back_inserter(input_col.nesting)); } @@ -631,13 +633,13 @@ class aggregate_metadata { * @return input column information, output column information, list of output column schema * indices */ - auto select_columns(std::vector const &use_names, + auto select_columns(std::vector const& use_names, bool include_index, bool strings_to_categorical, type_id timestamp_type_id, bool strict_decimal_types) const { - auto const &pfm = per_file_metadata[0]; + auto const& pfm = per_file_metadata[0]; // determine the list of output columns // @@ -659,16 +661,16 @@ class aggregate_metadata { if (use_names.empty()) { // walk the schema and choose all top level columns for (size_t schema_idx = 1; schema_idx < pfm.schema.size(); schema_idx++) { - auto const &schema = pfm.schema[schema_idx]; + auto const& schema = pfm.schema[schema_idx]; if (schema.parent_idx == 0) { output_column_schemas.push_back(schema_idx); } } } else { // Load subset of columns; include PANDAS index unless excluded std::vector local_use_names = use_names; if (include_index) { add_pandas_index_names(local_use_names); } - for (const auto &use_name : local_use_names) { + for (const auto& use_name : local_use_names) { for (size_t schema_idx = 1; schema_idx < pfm.schema.size(); schema_idx++) { - auto const &schema = pfm.schema[schema_idx]; + auto const& schema = pfm.schema[schema_idx]; // We select only top level columns by name. Selecting nested columns by name is not // supported. Top level columns are identified by their parent being the root (idx == 0) if (use_name == schema.name and schema.parent_idx == 0) { @@ -711,9 +713,9 @@ class aggregate_metadata { * @param src_col_schema The column schema to generate the new mapping for * @param md File metadata information */ -void generate_depth_remappings(std::map, std::vector>> &remap, +void generate_depth_remappings(std::map, std::vector>>& remap, int src_col_schema, - aggregate_metadata const &md) + aggregate_metadata const& md) { // already generated for this level if (remap.find(src_col_schema) != remap.end()) { return; } @@ -724,11 +726,11 @@ void generate_depth_remappings(std::map, std::ve "Attempting to remap a schema more than once"); auto inserted = remap.insert(std::pair, std::vector>>{src_col_schema, {}}); - auto &depth_remap = inserted.first->second; + auto& depth_remap = inserted.first->second; - std::vector &rep_depth_remap = (depth_remap.first); + std::vector& rep_depth_remap = (depth_remap.first); rep_depth_remap.resize(schema.max_repetition_level + 1); - std::vector &def_depth_remap = (depth_remap.second); + std::vector& def_depth_remap = (depth_remap.second); def_depth_remap.resize(schema.max_definition_level + 1); // the key: @@ -822,12 +824,12 @@ void generate_depth_remappings(std::map, std::ve * @copydoc cudf::io::detail::parquet::read_column_chunks */ void reader::impl::read_column_chunks( - std::vector> &page_data, - hostdevice_vector &chunks, // TODO const? + std::vector>& page_data, + hostdevice_vector& chunks, // TODO const? 
size_t begin_chunk, size_t end_chunk, - const std::vector &column_chunk_offsets, - std::vector const &chunk_source_map, + const std::vector& column_chunk_offsets, + std::vector const& chunk_source_map, rmm::cuda_stream_view stream) { // Transfer chunk data, coalescing adjacent chunks @@ -850,7 +852,7 @@ void reader::impl::read_column_chunks( next_chunk++; } if (io_size != 0) { - auto &source = _sources[chunk_source_map[chunk]]; + auto& source = _sources[chunk_source_map[chunk]]; if (source->is_device_read_preferred(io_size)) { page_data[chunk] = source->device_read(io_offset, io_size, stream); } else { @@ -872,7 +874,7 @@ void reader::impl::read_column_chunks( /** * @copydoc cudf::io::detail::parquet::count_page_headers */ -size_t reader::impl::count_page_headers(hostdevice_vector &chunks, +size_t reader::impl::count_page_headers(hostdevice_vector& chunks, rmm::cuda_stream_view stream) { size_t total_pages = 0; @@ -891,8 +893,8 @@ size_t reader::impl::count_page_headers(hostdevice_vector /** * @copydoc cudf::io::detail::parquet::decode_page_headers */ -void reader::impl::decode_page_headers(hostdevice_vector &chunks, - hostdevice_vector &pages, +void reader::impl::decode_page_headers(hostdevice_vector& chunks, + hostdevice_vector& pages, rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), @@ -912,15 +914,17 @@ void reader::impl::decode_page_headers(hostdevice_vector & * @copydoc cudf::io::detail::parquet::decompress_page_data */ rmm::device_buffer reader::impl::decompress_page_data( - hostdevice_vector &chunks, - hostdevice_vector &pages, + hostdevice_vector& chunks, + hostdevice_vector& pages, rmm::cuda_stream_view stream) { - auto for_each_codec_page = [&](parquet::Compression codec, const std::function &f) { + auto for_each_codec_page = [&](parquet::Compression codec, const std::function& f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { const auto page_stride = chunks[c].max_num_pages; if (chunks[c].codec == codec) { - for (int k = 0; k < page_stride; k++) { f(page_count + k); } + for (int k = 0; k < page_stride; k++) { + f(page_count + k); + } } page_count += page_stride; } @@ -936,7 +940,7 @@ rmm::device_buffer reader::impl::decompress_page_data( std::make_pair(parquet::SNAPPY, 0), std::make_pair(parquet::BROTLI, 0)}; - for (auto &codec : codecs) { + for (auto& codec : codecs) { for_each_codec_page(codec.first, [&](size_t page) { total_decomp_size += pages[page].uncompressed_page_size; codec.second++; @@ -954,12 +958,12 @@ rmm::device_buffer reader::impl::decompress_page_data( size_t decomp_offset = 0; int32_t argc = 0; - for (const auto &codec : codecs) { + for (const auto& codec : codecs) { if (codec.second > 0) { int32_t start_pos = argc; for_each_codec_page(codec.first, [&](size_t page) { - auto dst_base = static_cast(decomp_pages.data()); + auto dst_base = static_cast(decomp_pages.data()); inflate_in[argc].srcDevice = pages[page].page_data; inflate_in[argc].srcSize = pages[page].compressed_page_size; inflate_in[argc].dstDevice = dst_base + decomp_offset; @@ -969,7 +973,7 @@ rmm::device_buffer reader::impl::decompress_page_data( inflate_out[argc].status = static_cast(-1000); inflate_out[argc].reserved = 0; - pages[page].page_data = static_cast(inflate_in[argc].dstDevice); + pages[page].page_data = static_cast(inflate_in[argc].dstDevice); decomp_offset += inflate_in[argc].dstSize; argc++; }); @@ -1027,17 +1031,17 @@ rmm::device_buffer reader::impl::decompress_page_data( /** * @copydoc 
cudf::io::detail::parquet::allocate_nesting_info */ -void reader::impl::allocate_nesting_info(hostdevice_vector const &chunks, - hostdevice_vector &pages, - hostdevice_vector &page_nesting_info, +void reader::impl::allocate_nesting_info(hostdevice_vector const& chunks, + hostdevice_vector& pages, + hostdevice_vector& page_nesting_info, rmm::cuda_stream_view stream) { // compute total # of page_nesting infos needed and allocate space. doing this in one // buffer to keep it to a single gpu allocation size_t const total_page_nesting_infos = std::accumulate( - chunks.host_ptr(), chunks.host_ptr() + chunks.size(), 0, [&](int total, auto &chunk) { + chunks.host_ptr(), chunks.host_ptr() + chunks.size(), 0, [&](int total, auto& chunk) { // the schema of the input column - auto const &schema = _metadata->get_schema(chunk.src_col_schema); + auto const& schema = _metadata->get_schema(chunk.src_col_schema); auto const per_page_nesting_info_size = max( schema.max_definition_level + 1, _metadata->get_output_nesting_depth(chunk.src_col_schema)); return total + (per_page_nesting_info_size * chunk.num_data_pages); @@ -1053,7 +1057,7 @@ void reader::impl::allocate_nesting_info(hostdevice_vector int src_info_index = 0; for (size_t idx = 0; idx < chunks.size(); idx++) { int src_col_schema = chunks[idx].src_col_schema; - auto &schema = _metadata->get_schema(src_col_schema); + auto& schema = _metadata->get_schema(src_col_schema); auto const per_page_nesting_info_size = std::max( schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); @@ -1078,7 +1082,7 @@ void reader::impl::allocate_nesting_info(hostdevice_vector int src_col_schema = chunks[idx].src_col_schema; // schema of the input column - auto &schema = _metadata->get_schema(src_col_schema); + auto& schema = _metadata->get_schema(src_col_schema); // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) int max_depth = _metadata->get_output_nesting_depth(src_col_schema); @@ -1101,7 +1105,7 @@ void reader::impl::allocate_nesting_info(hostdevice_vector if (!cur_schema.is_stub()) { // initialize each page within the chunk for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo *pni = + gpu::PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; // if we have lists, set our start and end depth remappings @@ -1109,8 +1113,8 @@ void reader::impl::allocate_nesting_info(hostdevice_vector auto remap = depth_remapping.find(src_col_schema); CUDF_EXPECTS(remap != depth_remapping.end(), "Could not find depth remapping for schema"); - std::vector const &rep_depth_remap = (remap->second.first); - std::vector const &def_depth_remap = (remap->second.second); + std::vector const& rep_depth_remap = (remap->second.first); + std::vector const& def_depth_remap = (remap->second.second); for (size_t m = 0; m < rep_depth_remap.size(); m++) { pni[m].start_depth = rep_depth_remap[m]; @@ -1145,8 +1149,8 @@ void reader::impl::allocate_nesting_info(hostdevice_vector /** * @copydoc cudf::io::detail::parquet::preprocess_columns */ -void reader::impl::preprocess_columns(hostdevice_vector &chunks, - hostdevice_vector &pages, +void reader::impl::preprocess_columns(hostdevice_vector& chunks, + hostdevice_vector& pages, size_t min_row, size_t total_rows, bool has_lists, @@ -1158,10 +1162,10 @@ void reader::impl::preprocess_columns(hostdevice_vector &c // if there are no lists, simply allocate every allocate every output // column to be of size num_rows if 
(!has_lists) { - std::function &)> create_columns = - [&](std::vector &cols) { + std::function&)> create_columns = + [&](std::vector& cols) { for (size_t idx = 0; idx < cols.size(); idx++) { - auto &col = cols[idx]; + auto& col = cols[idx]; col.create(total_rows, stream, _mr); create_columns(col.children); } @@ -1178,14 +1182,14 @@ void reader::impl::preprocess_columns(hostdevice_vector &c /** * @copydoc cudf::io::detail::parquet::decode_page_data */ -void reader::impl::decode_page_data(hostdevice_vector &chunks, - hostdevice_vector &pages, - hostdevice_vector &page_nesting, +void reader::impl::decode_page_data(hostdevice_vector& chunks, + hostdevice_vector& pages, + hostdevice_vector& page_nesting, size_t min_row, size_t total_rows, rmm::cuda_stream_view stream) { - auto is_dict_chunk = [](const gpu::ColumnChunkDesc &chunk) { + auto is_dict_chunk = [](const gpu::ColumnChunkDesc& chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; }; @@ -1207,20 +1211,20 @@ void reader::impl::decode_page_data(hostdevice_vector &chu std::accumulate(chunks.host_ptr(), chunks.host_ptr(chunks.size()), 0, - [&](size_t cursum, gpu::ColumnChunkDesc const &chunk) { + [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` - auto chunk_nested_valids = hostdevice_vector(sum_max_depths); - auto chunk_nested_data = hostdevice_vector(sum_max_depths); + auto chunk_nested_valids = hostdevice_vector(sum_max_depths); + auto chunk_nested_data = hostdevice_vector(sum_max_depths); auto chunk_offsets = std::vector(); // Update chunks with pointers to column data. for (size_t c = 0, page_count = 0, str_ofs = 0, chunk_off = 0; c < chunks.size(); c++) { - input_column_info const &input_col = _input_columns[chunks[c].src_col_index]; + input_column_info const& input_col = _input_columns[chunks[c].src_col_index]; CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, "Column/page schema index mismatch"); @@ -1275,9 +1279,9 @@ void reader::impl::decode_page_data(hostdevice_vector &chu // // we do this by only handing out the pointers to the first child we come across. // - auto *cols = &_output_columns; + auto* cols = &_output_columns; for (size_t idx = 0; idx < max_depth; idx++) { - auto &out_buf = (*cols)[input_col.nesting[idx]]; + auto& out_buf = (*cols)[input_col.nesting[idx]]; cols = &out_buf.children; int owning_schema = out_buf.user_data & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; @@ -1317,11 +1321,11 @@ void reader::impl::decode_page_data(hostdevice_vector &chu // last value that should then be followed by a terminator (because rows can span // page boundaries). 
for (size_t idx = 0; idx < _input_columns.size(); idx++) { - input_column_info const &input_col = _input_columns[idx]; + input_column_info const& input_col = _input_columns[idx]; - auto *cols = &_output_columns; + auto* cols = &_output_columns; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto &out_buf = (*cols)[input_col.nesting[l_idx]]; + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; cols = &out_buf.children; if (out_buf.type.id() != type_id::LIST || @@ -1329,11 +1333,11 @@ void reader::impl::decode_page_data(hostdevice_vector &chu continue; } CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); - auto &child = (*cols)[input_col.nesting[l_idx + 1]]; + auto& child = (*cols)[input_col.nesting[l_idx + 1]]; // the final offset for a list at level N is the size of its child int offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; - cudaMemcpyAsync(static_cast<int32_t *>(out_buf.data()) + (out_buf.size - 1), + cudaMemcpyAsync(static_cast<int32_t*>(out_buf.data()) + (out_buf.size - 1), &offset, sizeof(offset), cudaMemcpyHostToDevice, @@ -1344,17 +1348,17 @@ void reader::impl::decode_page_data(hostdevice_vector &chu // update null counts in the final column buffers for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo *pi = &pages[idx]; + gpu::PageInfo* pi = &pages[idx]; if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc *col = &chunks[pi->chunk_idx]; - input_column_info const &input_col = _input_columns[col->src_col_index]; + gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + input_column_info const& input_col = _input_columns[col->src_col_index]; int index = pi->nesting - page_nesting.device_ptr(); - gpu::PageNestingInfo *pni = &page_nesting[index]; + gpu::PageNestingInfo* pni = &page_nesting[index]; - auto *cols = &_output_columns; + auto* cols = &_output_columns; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto &out_buf = (*cols)[input_col.nesting[l_idx]]; + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; cols = &out_buf.children; // if I wasn't the one who wrote out the validity bits, skip it @@ -1368,9 +1372,9 @@ void reader::impl::decode_page_data(hostdevice_vector &chu stream.synchronize(); } -reader::impl::impl(std::vector<std::unique_ptr<datasource>> &&sources, - parquet_reader_options const &options, - rmm::mr::device_memory_resource *mr) +reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources, + parquet_reader_options const& options, + rmm::mr::device_memory_resource* mr) : _mr(mr), _sources(std::move(sources)) { // Open and parse the source dataset metadata @@ -1397,7 +1401,7 @@ reader::impl::impl(std::vector<std::unique_ptr<datasource>> &&sources, table_with_metadata reader::impl::read(size_type skip_rows, size_type num_rows, - std::vector<std::vector<size_type>> const &row_group_list, + std::vector<std::vector<size_type>> const& row_group_list, rmm::cuda_stream_view stream) { // Select only row groups required @@ -1431,8 +1435,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Initialize column chunk information size_t total_decompressed_size = 0; auto remaining_rows = num_rows; - for (const auto &rg : selected_row_groups) { - const auto &row_group = _metadata->get_row_group(rg.index, rg.source_index); + for (const auto& rg : selected_row_groups) { + const auto& row_group = _metadata->get_row_group(rg.index, rg.source_index); auto const row_group_start = rg.start_row; auto const row_group_source = rg.source_index; auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); @@ -1442,8 +1446,8 @@ table_with_metadata
reader::impl::read(size_type skip_rows, for (size_t i = 0; i < num_input_columns; ++i) { auto col = _input_columns[i]; // look up metadata - auto &col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto &schema = _metadata->get_schema(col.schema_idx); + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); // this column contains repetition levels and will require a preprocess if (schema.max_repetition_level > 0) { has_lists = true; } @@ -1579,7 +1583,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Return column names (must match order of returned columns) out_metadata.column_names.resize(_output_columns.size()); for (size_t i = 0; i < _output_column_schemas.size(); i++) { - auto const &schema = _metadata->get_schema(_output_column_schemas[i]); + auto const& schema = _metadata->get_schema(_output_column_schemas[i]); out_metadata.column_names[i] = schema.name; } @@ -1590,19 +1594,19 @@ table_with_metadata reader::impl::read(size_type skip_rows, } // Forward to implementation -reader::reader(std::vector const &filepaths, - parquet_reader_options const &options, +reader::reader(std::vector const& filepaths, + parquet_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _impl(std::make_unique(datasource::create(filepaths), options, mr)) { } // Forward to implementation -reader::reader(std::vector> &&sources, - parquet_reader_options const &options, +reader::reader(std::vector>&& sources, + parquet_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _impl(std::make_unique(std::move(sources), options, mr)) { } @@ -1611,7 +1615,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(parquet_reader_options const &options, +table_with_metadata reader::read(parquet_reader_options const& options, rmm::cuda_stream_view stream) { return _impl->read( diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index ffd8975a8d2..b93107aa9b2 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -60,9 +60,9 @@ class reader::impl { * @param options Settings for controlling reading behavior * @param mr Device memory resource to use for device memory allocation */ - explicit impl(std::vector> &&sources, - parquet_reader_options const &options, - rmm::mr::device_memory_resource *mr); + explicit impl(std::vector>&& sources, + parquet_reader_options const& options, + rmm::mr::device_memory_resource* mr); /** * @brief Read an entire set or a subset of data and returns a set of columns @@ -76,7 +76,7 @@ class reader::impl { */ table_with_metadata read(size_type skip_rows, size_type num_rows, - std::vector> const &row_group_indices, + std::vector> const& row_group_indices, rmm::cuda_stream_view stream); private: @@ -91,12 +91,12 @@ class reader::impl { * @param stream CUDA stream used for device memory operations and kernel launches. 
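The `reader::reader(...)` constructors above do nothing but forward into `reader::impl`, the pimpl idiom that keeps `reader_impl.hpp` out of the public headers and lets `reader::~reader() = default;` live next to the complete type. A generic sketch of that structure, with illustrative names (`parser` is not a cudf class):

```cpp
#include <memory>
#include <string>

// Public class: only declares the implementation type and owns it.
class parser {
 public:
  explicit parser(std::string source);
  ~parser();          // defined below, where impl is a complete type
  int parse() const;  // forwards to the implementation

 private:
  class impl;
  std::unique_ptr<impl> _impl;
};

// Private implementation: would normally live in the .cpp/.cu file.
class parser::impl {
 public:
  explicit impl(std::string source) : _source(std::move(source)) {}
  int parse() const { return static_cast<int>(_source.size()); }

 private:
  std::string _source;
};

parser::parser(std::string source) : _impl(std::make_unique<impl>(std::move(source))) {}
parser::~parser() = default;  // defaulted here, as reader::~reader() is above
int parser::parse() const { return _impl->parse(); }

int main() { return parser("abc").parse() == 3 ? 0 : 1; }
```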
* */ - void read_column_chunks(std::vector> &page_data, - hostdevice_vector &chunks, + void read_column_chunks(std::vector>& page_data, + hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, - const std::vector &column_chunk_offsets, - std::vector const &chunk_source_map, + const std::vector& column_chunk_offsets, + std::vector const& chunk_source_map, rmm::cuda_stream_view stream); /** @@ -107,7 +107,7 @@ class reader::impl { * * @return The total number of pages */ - size_t count_page_headers(hostdevice_vector &chunks, + size_t count_page_headers(hostdevice_vector& chunks, rmm::cuda_stream_view stream); /** @@ -117,8 +117,8 @@ class reader::impl { * @param pages List of page information * @param stream CUDA stream used for device memory operations and kernel launches. */ - void decode_page_headers(hostdevice_vector &chunks, - hostdevice_vector &pages, + void decode_page_headers(hostdevice_vector& chunks, + hostdevice_vector& pages, rmm::cuda_stream_view stream); /** @@ -130,8 +130,8 @@ class reader::impl { * * @return Device buffer to decompressed page data */ - rmm::device_buffer decompress_page_data(hostdevice_vector &chunks, - hostdevice_vector &pages, + rmm::device_buffer decompress_page_data(hostdevice_vector& chunks, + hostdevice_vector& pages, rmm::cuda_stream_view stream); /** @@ -149,9 +149,9 @@ class reader::impl { * @param page_nesting_info The allocated nesting info structs. * @param stream CUDA stream used for device memory operations and kernel launches. */ - void allocate_nesting_info(hostdevice_vector const &chunks, - hostdevice_vector &pages, - hostdevice_vector &page_nesting_info, + void allocate_nesting_info(hostdevice_vector const& chunks, + hostdevice_vector& pages, + hostdevice_vector& page_nesting_info, rmm::cuda_stream_view stream); /** @@ -172,8 +172,8 @@ class reader::impl { * a preprocess. * @param[in] stream Cuda stream */ - void preprocess_columns(hostdevice_vector &chunks, - hostdevice_vector &pages, + void preprocess_columns(hostdevice_vector& chunks, + hostdevice_vector& pages, size_t min_row, size_t total_rows, bool has_lists, @@ -189,15 +189,15 @@ class reader::impl { * @param total_rows Number of rows to output * @param stream CUDA stream used for device memory operations and kernel launches. */ - void decode_page_data(hostdevice_vector &chunks, - hostdevice_vector &pages, - hostdevice_vector &page_nesting, + void decode_page_data(hostdevice_vector& chunks, + hostdevice_vector& pages, + hostdevice_vector& page_nesting, size_t min_row, size_t total_rows, rmm::cuda_stream_view stream); private: - rmm::mr::device_memory_resource *_mr = nullptr; + rmm::mr::device_memory_resource* _mr = nullptr; std::vector> _sources; std::unique_ptr _metadata; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 77210b5a2ab..73924512bce 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -87,14 +87,14 @@ struct linked_column_view : public column_view { // copy of this object. Options: // 1. Inherit from column_view_base. Only lose out on children vector. That is not needed. // 2. Don't inherit at all. 
make linked_column_view keep a reference wrapper to its column_view - linked_column_view(column_view const &col) : column_view(col), parent(nullptr) + linked_column_view(column_view const& col) : column_view(col), parent(nullptr) { for (auto child_it = col.child_begin(); child_it < col.child_end(); ++child_it) { children.push_back(std::make_shared(this, *child_it)); } } - linked_column_view(linked_column_view *parent, column_view const &col) + linked_column_view(linked_column_view* parent, column_view const& col) : column_view(col), parent(parent) { for (auto child_it = col.child_begin(); child_it < col.child_end(); ++child_it) { @@ -102,7 +102,7 @@ struct linked_column_view : public column_view { } } - linked_column_view *parent; //!< Pointer to parent of this column. Nullptr if root + linked_column_view* parent; //!< Pointer to parent of this column. Nullptr if root LinkedColVector children; }; @@ -112,10 +112,10 @@ struct linked_column_view : public column_view { * @param table table of columns to convert * @return Vector of converted linked_column_views */ -LinkedColVector input_table_to_linked_columns(table_view const &table) +LinkedColVector input_table_to_linked_columns(table_view const& table) { LinkedColVector result; - for (column_view const &col : table) { + for (column_view const& col : table) { result.emplace_back(std::make_shared(col)); } @@ -144,9 +144,9 @@ struct schema_tree_node : public SchemaElement { }; struct leaf_schema_fn { - schema_tree_node &col_schema; - LinkedColPtr const &col; - column_in_metadata const &col_meta; + schema_tree_node& col_schema; + LinkedColPtr const& col; + column_in_metadata const& col_meta; bool timestamp_is_int96; template @@ -370,8 +370,8 @@ struct leaf_schema_fn { * Recursively traverses through linked_columns and corresponding metadata to construct schema tree. * The resulting schema tree is stored in a vector in pre-order traversal order. 
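The `linked_column_view` hunk above wraps an existing tree so every node also knows its parent, with children held by `shared_ptr` so the parent pointers stay stable. A stripped-down sketch of the same linking pass over a toy tree (`node` and `linked_node` are illustrative, not cudf types):

```cpp
#include <memory>
#include <vector>

// A plain tree node, standing in for column_view and its children.
struct node {
  int id = 0;
  std::vector<node> children;
};

// Wrapper that mirrors the tree and records each node's parent.
struct linked_node {
  explicit linked_node(node const& n) : linked_node(nullptr, n) {}
  linked_node(linked_node* parent, node const& n) : id(n.id), parent(parent)
  {
    for (auto const& child : n.children) {
      children.push_back(std::make_shared<linked_node>(this, child));
    }
  }
  int id;
  linked_node* parent;  // nullptr at the root, as in the hunk above
  std::vector<std::shared_ptr<linked_node>> children;
};

int main()
{
  node root{1, {{2, {}}, {3, {}}}};
  linked_node linked(root);
  // Each child can now walk back up to its parent.
  return linked.children[0]->parent == &linked ? 0 : 1;
}
```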
*/ -std::vector construct_schema_tree(LinkedColVector const &linked_columns, - table_input_metadata const &metadata, +std::vector construct_schema_tree(LinkedColVector const& linked_columns, + table_input_metadata const& metadata, bool single_write_mode, bool int96_timestamps) { @@ -384,8 +384,8 @@ std::vector construct_schema_tree(LinkedColVector const &linke root.parent_idx = -1; // root schema has no parent schema.push_back(std::move(root)); - std::function add_schema = - [&](LinkedColPtr const &col, column_in_metadata const &col_meta, size_t parent_idx) { + std::function add_schema = + [&](LinkedColPtr const& col, column_in_metadata const& col_meta, size_t parent_idx) { bool col_nullable = [&]() { if (single_write_mode) { return col->nullable(); @@ -500,8 +500,8 @@ std::vector construct_schema_tree(LinkedColVector const &linke * */ struct parquet_column_view { - parquet_column_view(schema_tree_node const &schema_node, - std::vector const &schema_tree, + parquet_column_view(schema_tree_node const& schema_node, + std::vector const& schema_tree, rmm::cuda_stream_view stream); column_view leaf_column_view() const; @@ -510,7 +510,7 @@ struct parquet_column_view { column_view cudf_column_view() const { return cudf_col; } parquet::Type physical_type() const { return schema_node.type; } - std::vector const &get_path_in_schema() { return path_in_schema; } + std::vector const& get_path_in_schema() { return path_in_schema; } // LIST related member functions uint8_t max_def_level() const noexcept { return _max_def_level; } @@ -518,8 +518,8 @@ struct parquet_column_view { bool is_list() const noexcept { return _is_list; } // Dictionary related member functions - uint32_t *get_dict_data() { return (_dict_data.size()) ? _dict_data.data() : nullptr; } - uint32_t *get_dict_index() { return (_dict_index.size()) ? _dict_index.data() : nullptr; } + uint32_t* get_dict_data() { return (_dict_data.size()) ? _dict_data.data() : nullptr; } + uint32_t* get_dict_index() { return (_dict_index.size()) ? _dict_index.data() : nullptr; } void use_dictionary(bool use_dict) { _dictionary_used = use_dict; } void alloc_dictionary(size_t max_num_rows, rmm::cuda_stream_view stream) { @@ -563,8 +563,8 @@ struct parquet_column_view { rmm::device_uvector _dict_index; }; -parquet_column_view::parquet_column_view(schema_tree_node const &schema_node, - std::vector const &schema_tree, +parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, + std::vector const& schema_tree, rmm::cuda_stream_view stream) : schema_node(schema_node), _d_nullability(0, stream), @@ -578,7 +578,7 @@ parquet_column_view::parquet_column_view(schema_tree_node const &schema_node, auto curr_col = schema_node.leaf_column.get(); column_view single_inheritance_cudf_col = *curr_col; while (curr_col->parent) { - auto const &parent = *curr_col->parent; + auto const& parent = *curr_col->parent; // For list columns, we still need to retain the offset child column. 
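As the doc comment above says, `construct_schema_tree` flattens the tree into a vector in pre-order, with each node remembering its parent's index; it also uses the same self-referencing `std::function` recursion as `create_columns` and `add_schema`. A compact sketch of that flattening under assumed toy types (`tree_node`, `flat_node`):

```cpp
#include <functional>
#include <vector>

struct tree_node {
  char name;
  std::vector<tree_node> children;
};

// Flat representation: pre-order position in the vector plus parent index.
struct flat_node {
  char name;
  int parent_idx;  // -1 for the root, matching the hunk above
};

int main()
{
  tree_node root{'a', {{'b', {}}, {'c', {{'d', {}}}}}};

  std::vector<flat_node> flat;
  // std::function lets the lambda call itself, as add_schema does above.
  std::function<void(tree_node const&, int)> add_node =
    [&](tree_node const& n, int parent_idx) {
      flat.push_back({n.name, parent_idx});
      int const my_idx = static_cast<int>(flat.size()) - 1;
      for (auto const& child : n.children) add_node(child, my_idx);
    };
  add_node(root, -1);

  // Pre-order gives a b c d, with d's parent being c at index 2.
  return (flat.size() == 4 && flat[3].parent_idx == 2) ? 0 : 1;
}
```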
auto children = @@ -718,7 +718,7 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s return desc; } -void writer::impl::init_page_fragments(cudf::detail::hostdevice_2dvector &frag, +void writer::impl::init_page_fragments(cudf::detail::hostdevice_2dvector& frag, device_span col_desc, uint32_t num_rows, uint32_t fragment_size) @@ -745,7 +745,7 @@ void writer::impl::gather_fragment_statistics( } void writer::impl::build_chunk_dictionaries( - hostdevice_2dvector &chunks, + hostdevice_2dvector& chunks, device_span col_desc, uint32_t num_columns, uint32_t num_dictionaries) @@ -762,11 +762,11 @@ void writer::impl::build_chunk_dictionaries( chunks.device_to_host(stream, true); } -void writer::impl::init_encoder_pages(hostdevice_2dvector &chunks, +void writer::impl::init_encoder_pages(hostdevice_2dvector& chunks, device_span col_desc, device_span pages, - statistics_chunk *page_stats, - statistics_chunk *frag_stats, + statistics_chunk* page_stats, + statistics_chunk* frag_stats, uint32_t num_columns, uint32_t num_pages, uint32_t num_stats_bfr) @@ -795,14 +795,14 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector & stream.synchronize(); } -void writer::impl::encode_pages(hostdevice_2dvector &chunks, +void writer::impl::encode_pages(hostdevice_2dvector& chunks, device_span pages, uint32_t pages_in_batch, uint32_t first_page_in_batch, uint32_t rowgroups_in_batch, uint32_t first_rowgroup, - const statistics_chunk *page_stats, - const statistics_chunk *chunk_stats) + const statistics_chunk* page_stats, + const statistics_chunk* chunk_stats) { auto batch_pages = pages.subspan(first_page_in_batch, pages_in_batch); @@ -844,10 +844,10 @@ void writer::impl::encode_pages(hostdevice_2dvector &chunks } writer::impl::impl(std::unique_ptr sink, - parquet_writer_options const &options, + parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), compression_(to_parquet_compression(options.get_compression())), @@ -863,10 +863,10 @@ writer::impl::impl(std::unique_ptr sink, } writer::impl::impl(std::unique_ptr sink, - chunked_parquet_writer_options const &options, + chunked_parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), compression_(to_parquet_compression(options.get_compression())), @@ -892,7 +892,7 @@ void writer::impl::init_state() current_chunk_offset = sizeof(file_header_s); } -void writer::impl::write(table_view const &table) +void writer::impl::write(table_view const& table) { CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); @@ -901,8 +901,8 @@ void writer::impl::write(table_view const &table) if (not table_meta) { table_meta = std::make_unique(table); } // Fill unnamed columns' names in table_meta - std::function add_default_name = - [&](column_in_metadata &col_meta, std::string default_name) { + std::function add_default_name = + [&](column_in_metadata& col_meta, std::string default_name) { if (col_meta.get_name().empty()) col_meta.set_name(default_name); for (size_type i = 0; i < col_meta.num_children(); ++i) { add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); @@ -917,14 +917,16 @@ void writer::impl::write(table_view const &table) // Construct parquet_column_views from the schema tree leaf nodes. 
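Steps like `init_page_fragments` and `build_chunk_dictionaries` above follow one round-trip pattern: fill descriptor structs on the host, copy them to the device, let a kernel update them, then `device_to_host` the results. A self-contained CUDA sketch of that round trip, with illustrative names (`frag_desc`, `count_fragments` are not cudf symbols):

```cpp
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// POD descriptor filled on the host; num_fragments is computed on device.
struct frag_desc {
  int num_rows;
  int fragment_size;
  int num_fragments;
};

__global__ void count_fragments(frag_desc* descs, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    descs[i].num_fragments =
      (descs[i].num_rows + descs[i].fragment_size - 1) / descs[i].fragment_size;
  }
}

int main()
{
  std::vector<frag_desc> h_descs{{5000, 1024, 0}, {100, 1024, 0}};
  frag_desc* d_descs  = nullptr;
  size_t const bytes  = h_descs.size() * sizeof(frag_desc);

  cudaMalloc(&d_descs, bytes);
  cudaMemcpy(d_descs, h_descs.data(), bytes, cudaMemcpyHostToDevice);
  count_fragments<<<1, 32>>>(d_descs, static_cast<int>(h_descs.size()));
  cudaMemcpy(h_descs.data(), d_descs, bytes, cudaMemcpyDeviceToHost);
  cudaFree(d_descs);

  std::printf("%d %d\n", h_descs[0].num_fragments, h_descs[1].num_fragments);  // 5 1
  return 0;
}
```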
std::vector parquet_columns; - for (schema_tree_node const &schema_node : schema_tree) { + for (schema_tree_node const& schema_node : schema_tree) { if (schema_node.leaf_column) { parquet_columns.emplace_back(schema_node, schema_tree, stream); } } // Mass allocation of column_device_views for each parquet_column_view std::vector cudf_cols; cudf_cols.reserve(parquet_columns.size()); - for (auto const &parq_col : parquet_columns) { cudf_cols.push_back(parq_col.cudf_column_view()); } + for (auto const& parq_col : parquet_columns) { + cudf_cols.push_back(parq_col.cudf_column_view()); + } table_view single_streams_table(cudf_cols); size_type num_columns = single_streams_table.num_columns(); @@ -938,7 +940,7 @@ void writer::impl::write(table_view const &table) std::transform(table_meta->user_data.begin(), table_meta->user_data.end(), std::back_inserter(md.key_value_metadata), - [](auto const &kv) { + [](auto const& kv) { return KeyValue{kv.first, kv.second}; }); md.schema = this_table_schema; @@ -960,7 +962,7 @@ void writer::impl::write(table_view const &table) // This should've been `auto const&` but isn't since dictionary space is allocated when calling // get_device_view(). Fix during dictionary refactor. std::transform( - parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto &pcol) { + parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto& pcol) { return pcol.get_device_view(stream); }); @@ -1039,7 +1041,7 @@ void writer::impl::write(table_view const &table) md.row_groups[global_r].total_byte_size = 0; md.row_groups[global_r].columns.resize(num_columns); for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk *ck = &chunks[r][i]; + gpu::EncColumnChunk* ck = &chunks[r][i]; bool dict_enable = false; *ck = {}; @@ -1088,7 +1090,9 @@ void writer::impl::write(table_view const &table) } // Free unused dictionaries - for (auto &col : parquet_columns) { col.check_dictionary_used(stream); } + for (auto& col : parquet_columns) { + col.check_dictionary_used(stream); + } // Build chunk dictionaries and count pages if (num_chunks != 0) { @@ -1107,7 +1111,7 @@ void writer::impl::write(table_view const &table) size_t rowgroup_size = 0; if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk *ck = &chunks[r][i]; + gpu::EncColumnChunk* ck = &chunks[r][i]; ck->first_page = num_pages; num_pages += ck->num_pages; pages_in_batch += ck->num_pages; @@ -1146,11 +1150,11 @@ void writer::impl::write(table_view const &table) // This contains stats for both the pages and the rowgroups. TODO: make them separate. 
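The batched writeback a couple of hunks below stages each compressed chunk through pinned host memory allocated once with `cudaMallocHost` and released by a `unique_ptr` custom deleter. A minimal sketch of that ownership pattern (`make_pinned` is an illustrative helper, not a cudf function):

```cpp
#include <cuda_runtime.h>
#include <cstdint>
#include <memory>

// unique_ptr that owns page-locked (pinned) host memory and releases it
// with cudaFreeHost, mirroring the pinned_buffer usage in the next hunk.
using pinned_buffer = std::unique_ptr<uint8_t, decltype(&cudaFreeHost)>;

pinned_buffer make_pinned(size_t size)
{
  uint8_t* ptr = nullptr;
  cudaMallocHost(&ptr, size);  // production code should check the status
  return pinned_buffer{ptr, cudaFreeHost};
}

int main()
{
  auto buf = make_pinned(1 << 20);  // 1 MiB of pinned staging memory
  buf.get()[0] = 42;                // usable directly from the host
  return buf.get()[0] == 42 ? 0 : 1;  // cudaFreeHost runs automatically
}
```

Pinned memory is what makes the device-to-host copies of each chunk truly asynchronous, which is why the buffer is sized once for the largest chunk and reused.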
rmm::device_uvector page_stats(num_stats_bfr, stream); for (uint32_t b = 0, r = 0; b < (uint32_t)batch_list.size(); b++) { - uint8_t *bfr = static_cast(uncomp_bfr.data()); - uint8_t *bfr_c = static_cast(comp_bfr.data()); + uint8_t* bfr = static_cast(uncomp_bfr.data()); + uint8_t* bfr_c = static_cast(comp_bfr.data()); for (uint32_t j = 0; j < batch_list[b]; j++, r++) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk *ck = &chunks[r][i]; + gpu::EncColumnChunk* ck = &chunks[r][i]; ck->uncompressed_bfr = bfr; ck->compressed_bfr = bfr_c; bfr += ck->bfr_size; @@ -1194,8 +1198,8 @@ void writer::impl::write(table_view const &table) : nullptr); for (; r < rnext; r++, global_r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk *ck = &chunks[r][i]; - uint8_t *dev_bfr; + gpu::EncColumnChunk* ck = &chunks[r][i]; + uint8_t* dev_bfr; if (ck->is_compressed) { md.row_groups[global_r].columns[i].meta_data.codec = compression_; dev_bfr = ck->compressed_bfr; @@ -1220,7 +1224,7 @@ void writer::impl::write(table_view const &table) } else { if (!host_bfr) { host_bfr = pinned_buffer{[](size_t size) { - uint8_t *ptr = nullptr; + uint8_t* ptr = nullptr; CUDA_TRY(cudaMallocHost(&ptr, size)); return ptr; }(max_chunk_bfr_size), @@ -1255,7 +1259,7 @@ void writer::impl::write(table_view const &table) } std::unique_ptr> writer::impl::close( - std::string const &column_chunks_file_path) + std::string const& column_chunks_file_path) { if (closed) { return nullptr; } closed = true; @@ -1273,15 +1277,17 @@ std::unique_ptr> writer::impl::close( file_header_s fhdr = {parquet_magic}; buffer_.resize(0); buffer_.insert(buffer_.end(), - reinterpret_cast(&fhdr), - reinterpret_cast(&fhdr) + sizeof(fhdr)); - for (auto &rowgroup : md.row_groups) { - for (auto &col : rowgroup.columns) { col.file_path = column_chunks_file_path; } + reinterpret_cast(&fhdr), + reinterpret_cast(&fhdr) + sizeof(fhdr)); + for (auto& rowgroup : md.row_groups) { + for (auto& col : rowgroup.columns) { + col.file_path = column_chunks_file_path; + } } fendr.footer_len = static_cast(cpw.write(md)); buffer_.insert(buffer_.end(), - reinterpret_cast(&fendr), - reinterpret_cast(&fendr) + sizeof(fendr)); + reinterpret_cast(&fendr), + reinterpret_cast(&fendr) + sizeof(fendr)); return std::make_unique>(std::move(buffer_)); } else { return {nullptr}; @@ -1290,19 +1296,19 @@ std::unique_ptr> writer::impl::close( // Forward to implementation writer::writer(std::unique_ptr sink, - parquet_writer_options const &options, + parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _impl(std::make_unique(std::move(sink), options, mode, stream, mr)) { } writer::writer(std::unique_ptr sink, - chunked_parquet_writer_options const &options, + chunked_parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : _impl(std::make_unique(std::move(sink), options, mode, stream, mr)) { } @@ -1311,23 +1317,23 @@ writer::writer(std::unique_ptr sink, writer::~writer() = default; // Forward to implementation -void writer::write(table_view const &table) { _impl->write(table); } +void writer::write(table_view const& table) { _impl->write(table); } // Forward to implementation -std::unique_ptr> writer::close(std::string const &column_chunks_file_path) +std::unique_ptr> writer::close(std::string const& column_chunks_file_path) { return 
_impl->close(column_chunks_file_path); } std::unique_ptr> writer::merge_rowgroup_metadata( - const std::vector>> &metadata_list) + const std::vector>>& metadata_list) { std::vector output; CompactProtocolWriter cpw(&output); FileMetaData md; md.row_groups.reserve(metadata_list.size()); - for (const auto &blob : metadata_list) { + for (const auto& blob : metadata_list) { CompactProtocolReader cpreader( blob.get()->data(), std::max(blob.get()->size(), sizeof(file_ender_s)) - sizeof(file_ender_s)); @@ -1356,13 +1362,13 @@ std::unique_ptr> writer::merge_rowgroup_metadata( file_ender_s fendr; fhdr.magic = parquet_magic; output.insert(output.end(), - reinterpret_cast(&fhdr), - reinterpret_cast(&fhdr) + sizeof(fhdr)); + reinterpret_cast(&fhdr), + reinterpret_cast(&fhdr) + sizeof(fhdr)); fendr.footer_len = static_cast(cpw.write(md)); fendr.magic = parquet_magic; output.insert(output.end(), - reinterpret_cast(&fendr), - reinterpret_cast(&fendr) + sizeof(fendr)); + reinterpret_cast(&fendr), + reinterpret_cast(&fendr) + sizeof(fendr)); return std::make_unique>(std::move(output)); } diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh index fd148724712..333f0e1aae7 100644 --- a/cpp/src/io/statistics/column_statistics.cuh +++ b/cpp/src/io/statistics/column_statistics.cuh @@ -60,21 +60,21 @@ using block_reduce_storage = detail::block_reduce_storage; */ template struct calculate_group_statistics_functor { - block_reduce_storage &temp_storage; + block_reduce_storage& temp_storage; /** * @brief Construct a statistics calculator * * @param d_temp_storage Temporary storage to be used by cub calls */ - __device__ calculate_group_statistics_functor(block_reduce_storage &d_temp_storage) + __device__ calculate_group_statistics_functor(block_reduce_storage& d_temp_storage) : temp_storage(d_temp_storage) { } template ::is_ignored> * = nullptr> - __device__ void operator()(stats_state_s &s, uint32_t t) + std::enable_if_t::is_ignored>* = nullptr> + __device__ void operator()(stats_state_s& s, uint32_t t) { // No-op for unsupported aggregation types } @@ -88,8 +88,8 @@ struct calculate_group_statistics_functor { * @param t thread id */ template ::is_ignored> * = nullptr> - __device__ void operator()(stats_state_s &s, uint32_t t) + std::enable_if_t::is_ignored>* = nullptr> + __device__ void operator()(stats_state_s& s, uint32_t t) { detail::storage_wrapper storage(temp_storage); @@ -123,17 +123,17 @@ struct calculate_group_statistics_functor { */ template struct merge_group_statistics_functor { - block_reduce_storage &temp_storage; + block_reduce_storage& temp_storage; - __device__ merge_group_statistics_functor(block_reduce_storage &d_temp_storage) + __device__ merge_group_statistics_functor(block_reduce_storage& d_temp_storage) : temp_storage(d_temp_storage) { } template ::is_ignored> * = nullptr> - __device__ void operator()(merge_state_s &s, - const statistics_chunk *chunks, + std::enable_if_t::is_ignored>* = nullptr> + __device__ void operator()(merge_state_s& s, + const statistics_chunk* chunks, const uint32_t num_chunks, uint32_t t) { @@ -141,9 +141,9 @@ struct merge_group_statistics_functor { } template ::is_ignored> * = nullptr> - __device__ void operator()(merge_state_s &s, - const statistics_chunk *chunks, + std::enable_if_t::is_ignored>* = nullptr> + __device__ void operator()(merge_state_s& s, + const statistics_chunk* chunks, const uint32_t num_chunks, uint32_t t) { @@ -151,7 +151,9 @@ struct merge_group_statistics_functor { 

typed_statistics_chunk::is_aggregated> chunk; - for (uint32_t i = t; i < num_chunks; i += block_size) { chunk.reduce(chunks[i]); } + for (uint32_t i = t; i < num_chunks; i += block_size) { + chunk.reduce(chunks[i]); + } chunk.has_minmax = (chunk.minimum_value <= chunk.maximum_value); chunk = block_reduce(chunk, storage); @@ -170,17 +172,16 @@ struct merge_group_statistics_functor { * @tparam T Type of object */ template -__device__ void cooperative_load(T &destination, const T *source = nullptr) +__device__ void cooperative_load(T& destination, const T* source = nullptr) { using load_type = std::conditional_t<((sizeof(T) % sizeof(uint32_t)) == 0), uint32_t, uint8_t>; if (source == nullptr) { for (auto i = threadIdx.x; i < (sizeof(T) / sizeof(load_type)); i += blockDim.x) { - reinterpret_cast(&destination)[i] = load_type{0}; + reinterpret_cast(&destination)[i] = load_type{0}; } } else { for (auto i = threadIdx.x; i < sizeof(T) / sizeof(load_type); i += blockDim.x) { - reinterpret_cast(&destination)[i] = - reinterpret_cast(source)[i]; + reinterpret_cast(&destination)[i] = reinterpret_cast(source)[i]; } } } @@ -195,7 +196,7 @@ __device__ void cooperative_load(T &destination, const T *source = nullptr) */ template __global__ void __launch_bounds__(block_size, 1) - gpu_calculate_group_statistics(statistics_chunk *chunks, const statistics_group *groups) + gpu_calculate_group_statistics(statistics_chunk* chunks, const statistics_group* groups) { __shared__ __align__(8) stats_state_s state; __shared__ block_reduce_storage storage; @@ -229,8 +230,8 @@ namespace detail { * @tparam IO File format for which statistics calculation is being done */ template -void calculate_group_statistics(statistics_chunk *chunks, - const statistics_group *groups, +void calculate_group_statistics(statistics_chunk* chunks, + const statistics_group* groups, uint32_t num_chunks, rmm::cuda_stream_view stream) { @@ -250,9 +251,9 @@ void calculate_group_statistics(statistics_chunk *chunks, */ template __global__ void __launch_bounds__(block_size, 1) - gpu_merge_group_statistics(statistics_chunk *chunks_out, - const statistics_chunk *chunks_in, - const statistics_merge_group *groups) + gpu_merge_group_statistics(statistics_chunk* chunks_out, + const statistics_chunk* chunks_in, + const statistics_merge_group* groups) { __shared__ __align__(8) merge_state_s state; __shared__ block_reduce_storage storage; @@ -284,9 +285,9 @@ __global__ void __launch_bounds__(block_size, 1) * @tparam IO File format for which statistics calculation is being done */ template -void merge_group_statistics(statistics_chunk *chunks_out, - const statistics_chunk *chunks_in, - const statistics_merge_group *groups, +void merge_group_statistics(statistics_chunk* chunks_out, + const statistics_chunk* chunks_in, + const statistics_merge_group* groups, uint32_t num_chunks, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/statistics/conversion_type_select.cuh b/cpp/src/io/statistics/conversion_type_select.cuh index 225377bfc4b..b76a5fcf3cd 100644 --- a/cpp/src/io/statistics/conversion_type_select.cuh +++ b/cpp/src/io/statistics/conversion_type_select.cuh @@ -70,7 +70,7 @@ template class Detect; /** - * @brief Utility class to detect multiple occurences of a type in the first element of pairs in a + * @brief Utility class to detect multiple occurrences of a type in the first element of pairs in a * tuple For eg. 
with the following tuple : * * using conversion_types = diff --git a/cpp/src/io/statistics/orc_column_statistics.cu b/cpp/src/io/statistics/orc_column_statistics.cu index ad8a05a56f5..9e0dc1c1b7d 100644 --- a/cpp/src/io/statistics/orc_column_statistics.cu +++ b/cpp/src/io/statistics/orc_column_statistics.cu @@ -26,14 +26,14 @@ namespace io { namespace detail { template <> -void merge_group_statistics(statistics_chunk *chunks_out, - const statistics_chunk *chunks_in, - const statistics_merge_group *groups, +void merge_group_statistics(statistics_chunk* chunks_out, + const statistics_chunk* chunks_in, + const statistics_merge_group* groups, uint32_t num_chunks, rmm::cuda_stream_view stream); template <> -void calculate_group_statistics(statistics_chunk *chunks, - const statistics_group *groups, +void calculate_group_statistics(statistics_chunk* chunks, + const statistics_group* groups, uint32_t num_chunks, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/statistics/parquet_column_statistics.cu b/cpp/src/io/statistics/parquet_column_statistics.cu index ad067cd4aad..525065576de 100644 --- a/cpp/src/io/statistics/parquet_column_statistics.cu +++ b/cpp/src/io/statistics/parquet_column_statistics.cu @@ -26,14 +26,14 @@ namespace io { namespace detail { template <> -void merge_group_statistics(statistics_chunk *chunks_out, - const statistics_chunk *chunks_in, - const statistics_merge_group *groups, +void merge_group_statistics(statistics_chunk* chunks_out, + const statistics_chunk* chunks_in, + const statistics_merge_group* groups, uint32_t num_chunks, rmm::cuda_stream_view stream); template <> -void calculate_group_statistics(statistics_chunk *chunks, - const statistics_group *groups, +void calculate_group_statistics(statistics_chunk* chunks, + const statistics_group* groups, uint32_t num_chunks, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh index f7bf6e407c1..c60e4eebaa0 100644 --- a/cpp/src/io/statistics/statistics.cuh +++ b/cpp/src/io/statistics/statistics.cuh @@ -53,15 +53,15 @@ struct stats_column_desc { //!< nested columns int32_t ts_scale; //!< timestamp scale (>0: multiply by scale, <0: divide by -scale) - column_device_view *leaf_column; //!< Pointer to leaf column - column_device_view *parent_column; //!< Pointer to parent column. Is nullptr if not list type. + column_device_view* leaf_column; //!< Pointer to leaf column + column_device_view* parent_column; //!< Pointer to parent column. Is nullptr if not list type. 
}; struct string_stats { - const char *ptr; //!< ptr to character data + const char* ptr; //!< ptr to character data uint32_t length; //!< length of string - __host__ __device__ __forceinline__ volatile string_stats &operator=( - const string_view &val) volatile + __host__ __device__ __forceinline__ volatile string_stats& operator=( + const string_view& val) volatile { ptr = val.data(); length = val.size_bytes(); @@ -99,13 +99,13 @@ struct statistics_chunk { }; struct statistics_group { - const stats_column_desc *col; //!< Column information + const stats_column_desc* col; //!< Column information uint32_t start_row; //!< Start row of this group uint32_t num_rows; //!< Number of rows in group }; struct statistics_merge_group { - const stats_column_desc *col; //!< Column information + const stats_column_desc* col; //!< Column information uint32_t start_chunk; //!< Start chunk of this group uint32_t num_chunks; //!< Number of chunks in group }; diff --git a/cpp/src/io/statistics/statistics_type_identification.cuh b/cpp/src/io/statistics/statistics_type_identification.cuh index 84399a307a5..869e2833285 100644 --- a/cpp/src/io/statistics/statistics_type_identification.cuh +++ b/cpp/src/io/statistics/statistics_type_identification.cuh @@ -55,8 +55,8 @@ struct conversion_map { std::pair>; }; -// In Parquet timestamps and durations with second resoluion are converted to -// milliseconds. Timestamps and durations with nanosecond resoluion are +// In Parquet timestamps and durations with second resolution are converted to +// milliseconds. Timestamps and durations with nanosecond resolution are // converted to microseconds. template <> struct conversion_map { diff --git a/cpp/src/io/utilities/block_utils.cuh b/cpp/src/io/utilities/block_utils.cuh index 759aa2517b6..2b4f69df10f 100644 --- a/cpp/src/io/utilities/block_utils.cuh +++ b/cpp/src/io/utilities/block_utils.cuh @@ -124,18 +124,18 @@ inline __device__ double Int128ToDouble_rn(uint64_t lo, int64_t hi) return sign * __fma_rn(__ll2double_rn(hi), 4294967296.0 * 4294967296.0, __ull2double_rn(lo)); } -inline __device__ uint32_t unaligned_load32(const uint8_t *p) +inline __device__ uint32_t unaligned_load32(const uint8_t* p) { uint32_t ofs = 3 & reinterpret_cast(p); - const uint32_t *p32 = reinterpret_cast(p - ofs); + const uint32_t* p32 = reinterpret_cast(p - ofs); uint32_t v = p32[0]; return (ofs) ? 
__funnelshift_r(v, p32[1], ofs * 8) : v; } -inline __device__ uint64_t unaligned_load64(const uint8_t *p) +inline __device__ uint64_t unaligned_load64(const uint8_t* p) { uint32_t ofs = 3 & reinterpret_cast(p); - const uint32_t *p32 = reinterpret_cast(p - ofs); + const uint32_t* p32 = reinterpret_cast(p - ofs); uint32_t v0 = p32[0]; uint32_t v1 = p32[1]; if (ofs) { @@ -146,10 +146,10 @@ inline __device__ uint64_t unaligned_load64(const uint8_t *p) } template -inline __device__ void memcpy_block(void *dstv, const void *srcv, uint32_t len, uint32_t t) +inline __device__ void memcpy_block(void* dstv, const void* srcv, uint32_t len, uint32_t t) { - uint8_t *dst = static_cast(dstv); - const uint8_t *src = static_cast(srcv); + uint8_t* dst = static_cast(dstv); + const uint8_t* src = static_cast(srcv); uint32_t dst_align_bytes, src_align_bytes, src_align_bits; // Align output to 32-bit dst_align_bytes = 3 & -reinterpret_cast(dst); @@ -166,7 +166,7 @@ inline __device__ void memcpy_block(void *dstv, const void *srcv, uint32_t len, src_align_bytes = (uint32_t)(3 & reinterpret_cast(src)); src_align_bits = src_align_bytes * 8; while (len >= 4) { - const uint32_t *src32 = reinterpret_cast(src - src_align_bytes); + const uint32_t* src32 = reinterpret_cast(src - src_align_bytes); uint32_t copy_cnt = min(len >> 2, nthreads); uint32_t v; if (t < copy_cnt) { @@ -174,7 +174,7 @@ inline __device__ void memcpy_block(void *dstv, const void *srcv, uint32_t len, if (src_align_bits != 0) { v = __funnelshift_r(v, src32[t + 1], src_align_bits); } } if (sync_before_store) { __syncthreads(); } - if (t < copy_cnt) { reinterpret_cast(dst)[t] = v; } + if (t < copy_cnt) { reinterpret_cast(dst)[t] = v; } src += copy_cnt * 4; dst += copy_cnt * 4; len -= copy_cnt * 4; diff --git a/cpp/src/io/utilities/column_utils.cuh b/cpp/src/io/utilities/column_utils.cuh index c08f42583ef..03ea041706a 100644 --- a/cpp/src/io/utilities/column_utils.cuh +++ b/cpp/src/io/utilities/column_utils.cuh @@ -49,7 +49,7 @@ namespace io { template rmm::device_uvector create_leaf_column_device_views( typename cudf::device_span col_desc, - const table_device_view &parent_table_device_view, + const table_device_view& parent_table_device_view, rmm::cuda_stream_view stream) { rmm::device_uvector leaf_column_views(parent_table_device_view.num_columns(), @@ -71,7 +71,7 @@ rmm::device_uvector create_leaf_column_device_views( : col.child(0); } // Store leaf_column to device storage - column_device_view *leaf_col_ptr = leaf_columns.begin() + index; + column_device_view* leaf_col_ptr = leaf_columns.begin() + index; *leaf_col_ptr = col; col_desc[index].leaf_column = leaf_col_ptr; }); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index ac8deccd078..4b23d008344 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -32,7 +32,7 @@ namespace { */ class file_source : public datasource { public: - explicit file_source(const char *filepath) + explicit file_source(const char* filepath) : _file(filepath, O_RDONLY), _cufile_in(detail::make_cufile_input(filepath)) { } @@ -58,7 +58,7 @@ class file_source : public datasource { size_t device_read(size_t offset, size_t size, - uint8_t *dst, + uint8_t* dst, rmm::cuda_stream_view stream) override { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); @@ -84,7 +84,7 @@ class file_source : public datasource { */ class memory_mapped_source : public file_source { public: - explicit memory_mapped_source(const char 
*filepath, size_t offset, size_t size) + explicit memory_mapped_source(const char* filepath, size_t offset, size_t size) : file_source(filepath) { if (_file.size() != 0) map(_file.desc(), offset, size); @@ -103,17 +103,17 @@ class memory_mapped_source : public file_source { auto const read_size = std::min(size, _map_size - (offset - _map_offset)); return std::make_unique( - static_cast(_map_addr) + (offset - _map_offset), read_size); + static_cast(_map_addr) + (offset - _map_offset), read_size); } - size_t host_read(size_t offset, size_t size, uint8_t *dst) override + size_t host_read(size_t offset, size_t size, uint8_t* dst) override { CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); // Clamp length to available data in the mapped region auto const read_size = std::min(size, _map_size - (offset - _map_offset)); - auto const src = static_cast(_map_addr) + (offset - _map_offset); + auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); return read_size; } @@ -139,7 +139,7 @@ class memory_mapped_source : public file_source { private: size_t _map_size = 0; size_t _map_offset = 0; - void *_map_addr = nullptr; + void* _map_addr = nullptr; }; /** @@ -150,7 +150,7 @@ class memory_mapped_source : public file_source { */ class direct_read_source : public file_source { public: - explicit direct_read_source(const char *filepath) : file_source(filepath) {} + explicit direct_read_source(const char* filepath) : file_source(filepath) {} std::unique_ptr host_read(size_t offset, size_t size) override { @@ -164,7 +164,7 @@ class direct_read_source : public file_source { return buffer::create(std::move(v)); } - size_t host_read(size_t offset, size_t size, uint8_t *dst) override + size_t host_read(size_t offset, size_t size, uint8_t* dst) override { lseek(_file.desc(), offset, SEEK_SET); @@ -186,9 +186,9 @@ class direct_read_source : public file_source { */ class user_datasource_wrapper : public datasource { public: - explicit user_datasource_wrapper(datasource *const source) : source(source) {} + explicit user_datasource_wrapper(datasource* const source) : source(source) {} - size_t host_read(size_t offset, size_t size, uint8_t *dst) override + size_t host_read(size_t offset, size_t size, uint8_t* dst) override { return source->host_read(offset, size, dst); } @@ -202,7 +202,7 @@ class user_datasource_wrapper : public datasource { size_t device_read(size_t offset, size_t size, - uint8_t *dst, + uint8_t* dst, rmm::cuda_stream_view stream) override { return source->device_read(offset, size, dst, stream); @@ -218,12 +218,12 @@ class user_datasource_wrapper : public datasource { size_t size() const override { return source->size(); } private: - datasource *const source; ///< A non-owning pointer to the user-implemented datasource + datasource* const source; ///< A non-owning pointer to the user-implemented datasource }; } // namespace -std::unique_ptr datasource::create(const std::string &filepath, +std::unique_ptr datasource::create(const std::string& filepath, size_t offset, size_t size) { @@ -237,14 +237,14 @@ std::unique_ptr datasource::create(const std::string &filepath, return std::make_unique(filepath.c_str(), offset, size); } -std::unique_ptr datasource::create(host_buffer const &buffer) +std::unique_ptr datasource::create(host_buffer const& buffer) { // Use Arrow IO buffer class for zero-copy reads of host memory return std::make_unique(std::make_shared( - reinterpret_cast(buffer.data), buffer.size)); + reinterpret_cast(buffer.data), 
buffer.size)); } -std::unique_ptr datasource::create(datasource *source) +std::unique_ptr datasource::create(datasource* source) { // instantiate a wrapper that forwards the calls to the user implementation return std::make_unique(source); diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index abf3a3fdef0..b5fb9fb51bc 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -32,13 +32,13 @@ size_t get_file_size(int file_descriptor) return static_cast(st.st_size); } -file_wrapper::file_wrapper(std::string const &filepath, int flags) +file_wrapper::file_wrapper(std::string const& filepath, int flags) : fd(open(filepath.c_str(), flags)), _size{get_file_size(fd)} { CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); } -file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) +file_wrapper::file_wrapper(std::string const& filepath, int flags, mode_t mode) : fd(open(filepath.c_str(), flags, mode)), _size{get_file_size(fd)} { CUDF_EXPECTS(fd != -1, "Cannot open file " + filepath); @@ -46,7 +46,7 @@ file_wrapper::file_wrapper(std::string const &filepath, int flags, mode_t mode) file_wrapper::~file_wrapper() { close(fd); } -std::string getenv_or(std::string const &env_var_name, std::string const &default_val) +std::string getenv_or(std::string const& env_var_name, std::string const& default_val) { auto const env_val = std::getenv(env_var_name.c_str()); return (env_val == nullptr) ? default_val : std::string(env_val); @@ -81,7 +81,7 @@ cufile_config::cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", defau } } } -cufile_config const *cufile_config::instance() +cufile_config const* cufile_config::instance() { static cufile_config _instance; return &_instance; @@ -94,18 +94,18 @@ class cufile_shim { private: cufile_shim(); - void *cf_lib = nullptr; - decltype(cuFileDriverOpen) *driver_open = nullptr; - decltype(cuFileDriverClose) *driver_close = nullptr; + void* cf_lib = nullptr; + decltype(cuFileDriverOpen)* driver_open = nullptr; + decltype(cuFileDriverClose)* driver_close = nullptr; std::unique_ptr init_error; auto is_valid() const noexcept { return init_error == nullptr; } public: - cufile_shim(cufile_shim const &) = delete; - cufile_shim &operator=(cufile_shim const &) = delete; + cufile_shim(cufile_shim const&) = delete; + cufile_shim& operator=(cufile_shim const&) = delete; - static cufile_shim const *instance(); + static cufile_shim const* instance(); ~cufile_shim() { @@ -113,10 +113,10 @@ class cufile_shim { dlclose(cf_lib); } - decltype(cuFileHandleRegister) *handle_register = nullptr; - decltype(cuFileHandleDeregister) *handle_deregister = nullptr; - decltype(cuFileRead) *read = nullptr; - decltype(cuFileWrite) *write = nullptr; + decltype(cuFileHandleRegister)* handle_register = nullptr; + decltype(cuFileHandleDeregister)* handle_deregister = nullptr; + decltype(cuFileRead)* read = nullptr; + decltype(cuFileWrite)* write = nullptr; }; cufile_shim::cufile_shim() @@ -140,12 +140,12 @@ cufile_shim::cufile_shim() CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite symbol"); CUDF_EXPECTS(driver_open().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); - } catch (cudf::logic_error const &err) { + } catch (cudf::logic_error const& err) { init_error = std::make_unique(err); } } -cufile_shim const *cufile_shim::instance() +cufile_shim const* cufile_shim::instance() { static cufile_shim _instance; // Defer throwing to avoid repeated 
attempts to load the library @@ -165,7 +165,7 @@ void cufile_registered_file::register_handle() cufile_registered_file::~cufile_registered_file() { shim->handle_deregister(cf_handle); } -cufile_input_impl::cufile_input_impl(std::string const &filepath) +cufile_input_impl::cufile_input_impl(std::string const& filepath) : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_RDONLY | O_DIRECT) { } @@ -183,7 +183,7 @@ std::unique_ptr cufile_input_impl::read(size_t offset, size_t cufile_input_impl::read(size_t offset, size_t size, - uint8_t *dst, + uint8_t* dst, rmm::cuda_stream_view stream) { CUDF_EXPECTS(shim->read(cf_file.handle(), dst, size, offset, 0) != -1, @@ -192,19 +192,19 @@ size_t cufile_input_impl::read(size_t offset, return size; } -cufile_output_impl::cufile_output_impl(std::string const &filepath) +cufile_output_impl::cufile_output_impl(std::string const& filepath) : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) { } -void cufile_output_impl::write(void const *data, size_t offset, size_t size) +void cufile_output_impl::write(void const* data, size_t offset, size_t size) { CUDF_EXPECTS(shim->write(cf_file.handle(), data, size, offset, 0) != -1, "cuFile error writing to a file"); } #endif -std::unique_ptr make_cufile_input(std::string const &filepath) +std::unique_ptr make_cufile_input(std::string const& filepath) { #ifdef CUFILE_FOUND if (cufile_config::instance()->is_enabled()) { @@ -218,7 +218,7 @@ std::unique_ptr make_cufile_input(std::string const &filepath return nullptr; } -std::unique_ptr make_cufile_output(std::string const &filepath) +std::unique_ptr make_cufile_output(std::string const& filepath) { #ifdef CUFILE_FOUND if (cufile_config::instance()->is_enabled()) { diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 8a742076338..e92191095e3 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -40,8 +40,8 @@ class file_wrapper { size_t _size; public: - explicit file_wrapper(std::string const &filepath, int flags); - explicit file_wrapper(std::string const &filepath, int flags, mode_t mode); + explicit file_wrapper(std::string const& filepath, int flags); + explicit file_wrapper(std::string const& filepath, int flags, mode_t mode); ~file_wrapper(); auto size() const { return _size; } auto desc() const { return fd; } @@ -105,7 +105,7 @@ class cufile_input : public cufile_io_base { * * @return The number of bytes read */ - virtual size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) = 0; + virtual size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) = 0; }; /** @@ -122,7 +122,7 @@ class cufile_output : public cufile_io_base { * @param offset Number of bytes from the start * @param size Number of bytes to write */ - virtual void write(void const *data, size_t offset, size_t size) = 0; + virtual void write(void const* data, size_t offset, size_t size) = 0; }; #ifdef CUFILE_FOUND @@ -152,7 +152,7 @@ class cufile_config { */ bool is_required() const { return policy == "ALWAYS"; } - static cufile_config const *instance(); + static cufile_config const* instance(); }; /** @@ -162,14 +162,14 @@ struct cufile_registered_file { void register_handle(); public: - cufile_registered_file(cufile_shim const *shim, std::string const &filepath, int flags) + cufile_registered_file(cufile_shim const* shim, std::string const& filepath, int flags) : _file(filepath, flags), 
shim{shim} { register_handle(); } - cufile_registered_file(cufile_shim const *shim, - std::string const &filepath, + cufile_registered_file(cufile_shim const* shim, + std::string const& filepath, int flags, mode_t mode) : _file(filepath, flags, mode), shim{shim} @@ -177,14 +177,14 @@ struct cufile_registered_file { register_handle(); } - auto const &handle() const noexcept { return cf_handle; } + auto const& handle() const noexcept { return cf_handle; } ~cufile_registered_file(); private: file_wrapper const _file; CUfileHandle_t cf_handle = nullptr; - cufile_shim const *shim = nullptr; + cufile_shim const* shim = nullptr; }; /** @@ -194,16 +194,16 @@ struct cufile_registered_file { */ class cufile_input_impl final : public cufile_input { public: - cufile_input_impl(std::string const &filepath); + cufile_input_impl(std::string const& filepath); std::unique_ptr read(size_t offset, size_t size, rmm::cuda_stream_view stream) override; - size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override; + size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) override; private: - cufile_shim const *shim = nullptr; + cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; }; @@ -214,12 +214,12 @@ class cufile_input_impl final : public cufile_input { */ class cufile_output_impl final : public cufile_output { public: - cufile_output_impl(std::string const &filepath); + cufile_output_impl(std::string const& filepath); - void write(void const *data, size_t offset, size_t size) override; + void write(void const* data, size_t offset, size_t size) override; private: - cufile_shim const *shim = nullptr; + cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; }; #else @@ -233,7 +233,7 @@ class cufile_input_impl final : public cufile_input { CUDF_FAIL("Only used to compile without cufile library, should not be called"); } - size_t read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream) override + size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) override { CUDF_FAIL("Only used to compile without cufile library, should not be called"); } @@ -241,7 +241,7 @@ class cufile_input_impl final : public cufile_input { class cufile_output_impl final : public cufile_output { public: - void write(void const *data, size_t offset, size_t size) override + void write(void const* data, size_t offset, size_t size) override { CUDF_FAIL("Only used to compile without cufile library, should not be called"); } @@ -254,7 +254,7 @@ class cufile_output_impl final : public cufile_output { * Returns a null pointer if an exception occurs in the `cufile_input_impl` constructor, or if the * cuFile library is not installed. */ -std::unique_ptr make_cufile_input(std::string const &filepath); +std::unique_ptr make_cufile_input(std::string const& filepath); /** * @brief Creates a `cufile_output_impl` object @@ -262,7 +262,7 @@ std::unique_ptr make_cufile_input(std::string const &filepath * Returns a null pointer if an exception occurs in the `cufile_output_impl` constructor, or if the * cuFile library is not installed. 
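The `cufile_shim` a few hunks above resolves the cuFile entry points at runtime with `dlopen`/`dlsym`, so cudf links and runs even when libcufile is absent. A small sketch of that technique using libm's `cos` purely as a stand-in symbol (compile with `-ldl` on older glibc):

```cpp
#include <dlfcn.h>
#include <cstdio>

int main()
{
  // Open a shared library at runtime instead of linking against it.
  void* lib = dlopen("libm.so.6", RTLD_LAZY | RTLD_LOCAL);
  if (lib == nullptr) { std::fprintf(stderr, "%s\n", dlerror()); return 1; }

  // Look up a symbol by name and cast it to the right function type; the
  // shim above uses decltype(cuFileDriverOpen)* members for the same effect.
  auto cos_fn = reinterpret_cast<double (*)(double)>(dlsym(lib, "cos"));
  if (cos_fn == nullptr) { dlclose(lib); return 1; }

  std::printf("cos(0) = %f\n", cos_fn(0.0));  // 1.000000
  dlclose(lib);
  return 0;
}
```

Deferring the error (as `cufile_shim` does with `init_error`) rather than throwing from the constructor keeps a missing optional dependency from aborting unrelated code paths.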
*/ -std::unique_ptr make_cufile_output(std::string const &filepath); +std::unique_ptr make_cufile_output(std::string const& filepath); } // namespace detail } // namespace io diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index ee4b23bf831..147e53ba32b 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -38,8 +38,8 @@ class hostdevice_vector { hostdevice_vector() {} - hostdevice_vector(hostdevice_vector &&v) { move(std::move(v)); } - hostdevice_vector &operator=(hostdevice_vector &&v) + hostdevice_vector(hostdevice_vector&& v) { move(std::move(v)); } + hostdevice_vector& operator=(hostdevice_vector&& v) { move(std::move(v)); return *this; @@ -70,7 +70,7 @@ class hostdevice_vector { } } - bool insert(const T &data) + bool insert(const T& data) { if (num_elements < max_elements) { h_data[num_elements] = data; @@ -84,12 +84,12 @@ class hostdevice_vector { size_t size() const noexcept { return num_elements; } size_t memory_size() const noexcept { return sizeof(T) * num_elements; } - T &operator[](size_t i) const { return h_data[i]; } - T *host_ptr(size_t offset = 0) const { return h_data + offset; } - T *device_ptr(size_t offset = 0) { return reinterpret_cast(d_data.data()) + offset; } - T const *device_ptr(size_t offset = 0) const + T& operator[](size_t i) const { return h_data[i]; } + T* host_ptr(size_t offset = 0) const { return h_data + offset; } + T* device_ptr(size_t offset = 0) { return reinterpret_cast(d_data.data()) + offset; } + T const* device_ptr(size_t offset = 0) const { - return reinterpret_cast(d_data.data()) + offset; + return reinterpret_cast(d_data.data()) + offset; } operator cudf::device_span() { return {device_ptr(), max_elements}; } @@ -113,7 +113,7 @@ class hostdevice_vector { } private: - void move(hostdevice_vector &&v) + void move(hostdevice_vector&& v) { stream = v.stream; max_elements = v.max_elements; @@ -129,7 +129,7 @@ class hostdevice_vector { rmm::cuda_stream_view stream{}; size_t max_elements{}; size_t num_elements{}; - T *h_data{}; + T* h_data{}; rmm::device_buffer d_data{}; }; @@ -175,6 +175,15 @@ class hostdevice_2dvector { auto size() const noexcept { return _size; } + T* base_host_ptr(size_t offset = 0) { return _data.host_ptr(offset); } + T* base_device_ptr(size_t offset = 0) { return _data.device_ptr(offset); } + + T const* base_host_ptr(size_t offset = 0) const { return _data.host_ptr(offset); } + + T const* base_device_ptr(size_t offset = 0) const { return _data.device_ptr(offset); } + + size_t memory_size() const noexcept { return _data.memory_size(); } + void host_to_device(rmm::cuda_stream_view stream, bool synchronize = false) { _data.host_to_device(stream, synchronize); diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index c7eae48cbbc..a6b4978aeab 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -335,7 +335,9 @@ __device__ __inline__ cudf::size_type* infer_integral_field_counter(char const* // Remove preceding zeros if (digit_count >= (sizeof(int64_max_abs) - 1)) { // Trim zeros at the beginning of raw_data - while (*data_begin == '0' && (data_begin < data_end)) { data_begin++; } + while (*data_begin == '0' && (data_begin < data_end)) { + data_begin++; + } } digit_count = data_end - data_begin; diff --git a/cpp/src/io/utilities/trie.cu b/cpp/src/io/utilities/trie.cu index 82d8f5e8336..bf03d6a6a89 100644 --- a/cpp/src/io/utilities/trie.cu +++ 
b/cpp/src/io/utilities/trie.cu @@ -33,7 +33,7 @@ namespace cudf { namespace detail { -rmm::device_uvector create_serialized_trie(const std::vector &keys, +rmm::device_uvector create_serialized_trie(const std::vector& keys, rmm::cuda_stream_view stream) { static constexpr int alphabet_size = std::numeric_limits::max() + 1; @@ -47,8 +47,8 @@ rmm::device_uvector create_serialized_trie(const std::vectorchildren[character] == nullptr) @@ -61,9 +61,9 @@ rmm::device_uvector create_serialized_trie(const std::vector 0 && is_white(no_comments[stop])) { stop--; } + while (stop > 0 && is_white(no_comments[stop])) { + stop--; + } CUDF_EXPECTS(stop != 0 || !is_white(no_comments[0]), "No CUDA device function name found in the input CUDA code.\n"); start = stop; - while (start > 0 && !is_white(no_comments[start])) { start--; } + while (start > 0 && !is_white(no_comments[start])) { + start--; + } start++; stop++; CUDF_EXPECTS(start < stop, "No CUDA device function name found in the input CUDA code.\n"); diff --git a/cpp/src/jit/parser.hpp b/cpp/src/jit/parser.hpp index 61228d7ffce..0b752d77d1f 100644 --- a/cpp/src/jit/parser.hpp +++ b/cpp/src/jit/parser.hpp @@ -106,7 +106,7 @@ class ptx_parser { std::vector parse_function_body(const std::string& src); /** - * @brief Remove leading white chractors and call `parse_instruction`. + * @brief Remove leading white characters and call `parse_instruction`. * * @param src The statement to be parsed. * @return The resulting CUDA statement. @@ -124,8 +124,8 @@ class ptx_parser { * * ---> asm volatile (" fma.rn.f32 _f4, _f3, _f1, _f2;"); * - * If a regiter from the input parameters list is used in an instruction - * its type is inferred from the intruction and saved in the `input_arg_list` + * If a register from the input parameters list is used in an instruction + * its type is inferred from the instruction and saved in the `input_arg_list` * to be used in when parsing the function header. * * See the document at https://github.com/hummingtree/cudf/wiki/PTX-parser diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index dfe3231e897..e6110edfaa8 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -33,14 +33,14 @@ namespace cudf { namespace detail { std::pair, std::unique_ptr
<table>> get_empty_joined_table( - table_view const &probe, table_view const &build) + table_view const& probe, table_view const& build) { std::unique_ptr<table> empty_probe = empty_like(probe); std::unique_ptr<table>
empty_build = empty_like(build); return std::make_pair(std::move(empty_probe), std::move(empty_build)); } -VectorPair concatenate_vector_pairs(VectorPair &a, VectorPair &b, rmm::cuda_stream_view stream) +VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream) { CUDF_EXPECTS((a.first->size() == a.second->size()), "Mismatch between sizes of vectors in vector pair"); @@ -90,12 +90,11 @@ struct valid_range { */ std::pair>, std::unique_ptr>> -get_left_join_indices_complement( - std::unique_ptr> &right_indices, - size_type left_table_row_count, - size_type right_table_row_count, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) +get_left_join_indices_complement(std::unique_ptr>& right_indices, + size_type left_table_row_count, + size_type right_table_row_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Get array of indices that do not appear in right_indices @@ -169,8 +168,8 @@ get_left_join_indices_complement( * * @return Built hash table. */ -std::unique_ptr> build_join_hash_table( - cudf::table_view const &build, null_equality compare_nulls, rmm::cuda_stream_view stream) +std::unique_ptr> build_join_hash_table( + cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) { auto build_device_table = cudf::table_device_view::create(build, stream); @@ -198,7 +197,7 @@ std::unique_ptr> build_join_ *hash_table, hash_build, build_table_num_rows, - static_cast(row_bitmask.data()), + static_cast(row_bitmask.data()), failure.data()); // Check error code from the kernel if (failure.value(stream) == 1) { CUDF_FAIL("Hash Table insert failure."); } @@ -228,11 +227,11 @@ std::pair>, std::unique_ptr>> probe_join_hash_table(cudf::table_device_view build_table, cudf::table_device_view probe_table, - multimap_type const &hash_table, + multimap_type const& hash_table, null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { // Use the output size directly if provided. 
Otherwise, compute the exact output size constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == cudf::detail::join_kind::FULL_JOIN) @@ -308,10 +307,10 @@ probe_join_hash_table(cudf::table_device_view build_table, */ std::size_t get_full_join_size(cudf::table_device_view build_table, cudf::table_device_view probe_table, - multimap_type const &hash_table, + multimap_type const& hash_table, null_equality compare_nulls, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { std::size_t join_size = compute_join_output_size( build_table, probe_table, hash_table, compare_nulls, stream); @@ -342,7 +341,7 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, right_indices->data(), write_index.data(), join_size); - // Rlease intermediate memory alloation + // Release intermediate memory allocation left_indices->resize(0, stream); auto const left_table_row_count = probe_table.num_rows(); @@ -383,8 +382,8 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, return join_size + left_join_complement_size; } -std::unique_ptr combine_table_pair(std::unique_ptr &&left, - std::unique_ptr &&right) +std::unique_ptr combine_table_pair(std::unique_ptr&& left, + std::unique_ptr&& right) { auto joined_cols = left->release(); auto right_cols = right->release(); @@ -398,7 +397,7 @@ std::unique_ptr combine_table_pair(std::unique_ptr &&l hash_join::hash_join_impl::~hash_join_impl() = default; -hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, +hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) : _hash_table(nullptr) @@ -421,11 +420,11 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, +hash_join::hash_join_impl::inner_join(cudf::table_view const& probe, null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join( @@ -434,11 +433,11 @@ hash_join::hash_join_impl::inner_join(cudf::table_view const &probe, std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::left_join(cudf::table_view const &probe, +hash_join::hash_join_impl::left_join(cudf::table_view const& probe, null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join( @@ -447,18 +446,18 @@ hash_join::hash_join_impl::left_join(cudf::table_view const &probe, std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::full_join(cudf::table_view const &probe, +hash_join::hash_join_impl::full_join(cudf::table_view const& probe, null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join( probe, compare_nulls, output_size, stream, mr); } -std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const &probe, +std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream) const { @@ -472,7 +471,7 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const 
&p *build_table, *probe_table, *_hash_table, compare_nulls, stream); } -std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const &probe, +std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream) const { @@ -488,10 +487,10 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const &pr *build_table, *probe_table, *_hash_table, compare_nulls, stream); } -std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const &probe, +std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& probe, null_equality compare_nulls, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); @@ -507,11 +506,11 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const &pr template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, +hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const + rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, @@ -533,7 +532,7 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, std::cend(_build), std::cbegin(flattened_probe_table), std::cend(flattened_probe_table), - [](const auto &b, const auto &p) { return b.type() == p.type(); }), + [](const auto& b, const auto& p) { return b.type() == p.type(); }), "Mismatch in joining column data types"); return probe_join_indices( @@ -543,11 +542,11 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, +hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe, null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) const + rmm::mr::device_memory_resource* mr) const { // Trivial left join case - exit early if (!_hash_table && JoinKind != cudf::detail::join_kind::INNER_JOIN) { diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index f9ccbd68c74..1b4cbf4ba1d 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -153,6 +153,17 @@ std::pair, std::unique_ptr
> get_empty_joined_table std::unique_ptr combine_table_pair(std::unique_ptr&& left, std::unique_ptr&& right); +VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream); + +std::pair>, + std::unique_ptr>> +get_left_join_indices_complement( + std::unique_ptr>& right_indices, + size_type left_table_row_count, + size_type right_table_row_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail struct hash_join::hash_join_impl { diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 6cb04cadcac..cf711524f0b 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -15,6 +15,7 @@ */ #include #include +#include #include #include @@ -219,6 +220,21 @@ std::unique_ptr
full_join(table_view const& left_input, return combine_table_pair(std::move(left_result), std::move(right_result)); } +std::pair>, + std::unique_ptr>> +conditional_join(table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls, + join_kind JoinKind, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return get_conditional_join_indices( + left, right, JoinKind, binary_predicate, compare_nulls, stream, mr); +} + } // namespace detail hash_join::~hash_join() = default; @@ -356,4 +372,88 @@ std::unique_ptr
full_join(table_view const& left, left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } +std::pair>, + std::unique_ptr>> +conditional_inner_join(table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + return detail::conditional_join(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::INNER_JOIN, + rmm::cuda_stream_default, + mr); +} + +std::pair>, + std::unique_ptr>> +conditional_left_join(table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + return detail::conditional_join(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_JOIN, + rmm::cuda_stream_default, + mr); +} + +std::pair>, + std::unique_ptr>> +conditional_full_join(table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + return detail::conditional_join(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::FULL_JOIN, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr> conditional_left_semi_join( + table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + return std::move(detail::conditional_join(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_SEMI_JOIN, + rmm::cuda_stream_default, + mr) + .first); +} + +std::unique_ptr> conditional_left_anti_join( + table_view left, + table_view right, + ast::expression binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + return std::move(detail::conditional_join(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_ANTI_JOIN, + rmm::cuda_stream_default, + mr) + .first); +} } // namespace cudf diff --git a/cpp/src/join/join_kernels.cuh b/cpp/src/join/join_kernels.cuh index 4298706987c..6d0810ea800 100644 --- a/cpp/src/join/join_kernels.cuh +++ b/cpp/src/join/join_kernels.cuh @@ -18,12 +18,18 @@ #include #include +#include +#include +#include #include #include #include +#include #include "join_common_utils.hpp" +#include + namespace cudf { namespace detail { /** @@ -203,39 +209,63 @@ __global__ void compute_join_output_size(multimap_type multi_map, * @brief Computes the output size of joining the left table to the right table. * * This method uses a nested loop to iterate over the left and right tables and count the number of - * matches. + * matches according to a boolean expression. * * @tparam block_size The number of threads per block for this kernel + * @tparam has_nulls Whether or not the inputs may contain nulls. * * @param[in] left_table The left table * @param[in] right_table The right table * @param[in] JoinKind The type of join to be performed - * @param[in] check_row_equality The row equality comparator + * @param[in] compare_nulls Controls whether null join-key values should match or not. + * @param[in] plan Container of device data required to evaluate the desired expression. 
* @param[out] output_size The resulting output size */ -template -__global__ void compute_nested_loop_join_output_size(table_device_view left_table, +template +__global__ void compute_conditional_join_output_size(table_device_view left_table, table_device_view right_table, join_kind JoinKind, - row_equality check_row_equality, + null_equality compare_nulls, + ast::detail::device_ast_plan plan, cudf::size_type* output_size) { + // The (required) extern storage of the shared memory array leads to + // conflicting declarations between different templates. The easiest + // workaround is to declare an arbitrary (here char) array type then cast it + // after the fact to the appropriate type. + extern __shared__ char raw_intermediate_storage[]; + cudf::ast::detail::IntermediateDataType* intermediate_storage = + reinterpret_cast*>(raw_intermediate_storage); + auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + cudf::size_type thread_counter(0); const cudf::size_type left_start_idx = threadIdx.x + blockIdx.x * blockDim.x; const cudf::size_type left_stride = blockDim.x * gridDim.x; const cudf::size_type left_num_rows = left_table.num_rows(); const cudf::size_type right_num_rows = right_table.num_rows(); + auto evaluator = cudf::ast::detail::expression_evaluator( + left_table, right_table, plan, thread_intermediate_storage, compare_nulls); + for (cudf::size_type left_row_index = left_start_idx; left_row_index < left_num_rows; left_row_index += left_stride) { bool found_match = false; for (cudf::size_type right_row_index = 0; right_row_index < right_num_rows; right_row_index++) { - if (check_row_equality(left_row_index, right_row_index)) { - ++thread_counter; + auto output_dest = cudf::ast::detail::value_expression_result(); + evaluator.evaluate(output_dest, left_row_index, right_row_index, 0); + if (output_dest.is_valid() && output_dest.value()) { + if ((JoinKind != join_kind::LEFT_ANTI_JOIN) && + !(JoinKind == join_kind::LEFT_SEMI_JOIN && found_match)) { + ++thread_counter; + } found_match = true; } } - if ((JoinKind == join_kind::LEFT_JOIN) && (!found_match)) { ++thread_counter; } + if ((JoinKind == join_kind::LEFT_JOIN || JoinKind == join_kind::LEFT_ANTI_JOIN || + JoinKind == join_kind::FULL_JOIN) && + (!found_match)) { + ++thread_counter; + } } using BlockReduce = cub::BlockReduce; @@ -428,32 +458,35 @@ __global__ void probe_hash_table(multimap_type multi_map, } /** - * @brief Performs a nested loop join to find all matching rows between the - * left and right tables and generate the output for the desired Join - * operation. + * @brief Performs a join conditioned on a predicate to find all matching rows + * between the left and right tables and generate the output for the desired + * Join operation. * * @tparam block_size The number of threads per block for this kernel * @tparam output_cache_size The side of the shared memory buffer to cache join * output results - + * @tparam has_nulls Whether or not the inputs may contain nulls. + * * @param[in] left_table The left table * @param[in] right_table The right table * @param[in] JoinKind The type of join to be performed - * @param[in] check_row_equality The row equality comparator + * @param compare_nulls Controls whether null join-key values should match or not. 
* @param[out] join_output_l The left result of the join operation * @param[out] join_output_r The right result of the join operation * @param[in,out] current_idx A global counter used by threads to coordinate * writes to the global output + * @param plan Container of device data required to evaluate the desired expression. * @param[in] max_size The maximum size of the output */ -template -__global__ void nested_loop_join(table_device_view left_table, +template +__global__ void conditional_join(table_device_view left_table, table_device_view right_table, join_kind JoinKind, - row_equality check_row_equality, + null_equality compare_nulls, cudf::size_type* join_output_l, cudf::size_type* join_output_r, cudf::size_type* current_idx, + cudf::ast::detail::device_ast_plan plan, const cudf::size_type max_size) { constexpr int num_warps = block_size / detail::warp_size; @@ -461,6 +494,15 @@ __global__ void nested_loop_join(table_device_view left_table, __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size]; __shared__ cudf::size_type join_shared_r[num_warps][output_cache_size]; + // Normally the casting of a shared memory array is used to create multiple + // arrays of different types from the shared memory buffer, but here it is + // used to circumvent conflicts between arrays of different types between + // different template instantiations due to the extern specifier. + extern __shared__ char raw_intermediate_storage[]; + cudf::ast::detail::IntermediateDataType* intermediate_storage = + reinterpret_cast*>(raw_intermediate_storage); + auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + const int warp_id = threadIdx.x / detail::warp_size; const int lane_id = threadIdx.x % detail::warp_size; const cudf::size_type left_num_rows = left_table.num_rows(); @@ -473,18 +515,34 @@ __global__ void nested_loop_join(table_device_view left_table, cudf::size_type left_row_index = threadIdx.x + blockIdx.x * blockDim.x; const unsigned int activemask = __ballot_sync(0xffffffff, left_row_index < left_num_rows); + + auto evaluator = cudf::ast::detail::expression_evaluator( + left_table, right_table, plan, thread_intermediate_storage, compare_nulls); + if (left_row_index < left_num_rows) { bool found_match = false; - for (size_type right_row_index(0); right_row_index < right_num_rows; right_row_index++) { - if (check_row_equality(left_row_index, right_row_index)) { + for (size_type right_row_index(0); right_row_index < right_num_rows; ++right_row_index) { + auto output_dest = cudf::ast::detail::value_expression_result(); + evaluator.evaluate(output_dest, left_row_index, right_row_index, 0); + + if (output_dest.is_valid() && output_dest.value()) { // If the rows are equal, then we have found a true match + // In the case of left anti joins we only add indices from left after + // the loop if we have found _no_ matches from the right. + // In the case of left semi joins we only add the first match (note + // that the current logic relies on the fact that we process all right + // table rows for a single left table row on a single thread so that no + // synchronization of found_match is required). 
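The `raw_intermediate_storage` declarations in these kernels work around a genuine CUDA restriction: every declaration of a given `extern __shared__` array must have the same type, which conflicting template instantiations would violate, so the buffer is declared once as `char` and reinterpreted per instantiation. A self-contained sketch of the same trick (hypothetical kernel, not part of this patch):

```cpp
#include <cuda_runtime.h>

// The extern __shared__ array is type-erased to char so that every template
// instantiation of the kernel declares it with one consistent type.
template <typename T>
__global__ void scale_with_scratch(T const* in, T* out, int n, T factor)
{
  extern __shared__ char raw_scratch[];            // single type-erased declaration
  T* scratch = reinterpret_cast<T*>(raw_scratch);  // per-instantiation view

  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n) {
    scratch[threadIdx.x] = in[i] * factor;  // stage the value through shared memory
    out[i] = scratch[threadIdx.x];
  }
}

int main()
{
  float *d_in, *d_out;
  cudaMalloc(&d_in, 256 * sizeof(float));
  cudaMalloc(&d_out, 256 * sizeof(float));
  // Dynamic shared memory is sized at launch, much like shmem_size_per_block
  // is computed from shmem_per_thread in the host code below.
  scale_with_scratch<float><<<1, 256, 256 * sizeof(float)>>>(d_in, d_out, 256, 2.0f);
  cudaDeviceSynchronize();
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```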
+ if ((JoinKind != join_kind::LEFT_ANTI_JOIN) && + !(JoinKind == join_kind::LEFT_SEMI_JOIN && found_match)) { + add_pair_to_cache(left_row_index, + right_row_index, + current_idx_shared, + warp_id, + join_shared_l[warp_id], + join_shared_r[warp_id]); + } found_match = true; - add_pair_to_cache(left_row_index, - right_row_index, - current_idx_shared, - warp_id, - join_shared_l[warp_id], - join_shared_r[warp_id]); } __syncwarp(activemask); @@ -506,8 +564,11 @@ __global__ void nested_loop_join(table_device_view left_table, } } - // If performing a LEFT join and no match was found, insert a Null into the output - if ((JoinKind == join_kind::LEFT_JOIN) && (!found_match)) { + // Left, left anti, and full joins all require saving left columns that + // aren't present in the right. + if ((JoinKind == join_kind::LEFT_JOIN || JoinKind == join_kind::LEFT_ANTI_JOIN || + JoinKind == join_kind::FULL_JOIN) && + (!found_match)) { add_pair_to_cache(left_row_index, static_cast(JoinNoneValue), current_idx_shared, diff --git a/cpp/src/join/nested_loop_join.cuh b/cpp/src/join/nested_loop_join.cuh index 5054305a41a..9848477a894 100644 --- a/cpp/src/join/nested_loop_join.cuh +++ b/cpp/src/join/nested_loop_join.cuh @@ -19,7 +19,8 @@ #include "join_common_utils.hpp" #include "join_kernels.cuh" -#include +#include +#include #include #include #include @@ -28,167 +29,153 @@ #include #include +#include -#include +#include + +#include namespace cudf { namespace detail { + /** - * @brief Gives an estimate of the size of the join output produced when - * joining two tables together. - * - * @throw cudf::logic_error if JoinKind is not INNER_JOIN or LEFT_JOIN + * @brief Computes the join operation between two tables and returns the + * output indices of left and right table as a combined table * - * @param left The left hand table - * @param right The right hand table + * @param left Table of left columns to join + * @param right Table of right columns to join + * tables have been flipped, meaning the output indices should also be flipped * @param JoinKind The type of join to be performed * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches * - * @return An estimate of the size of the output of the join operation + * @return Join output indices vector pair */ -size_type estimate_nested_loop_join_output_size(table_device_view left, - table_device_view right, - join_kind JoinKind, - null_equality compare_nulls, - rmm::cuda_stream_view stream) +std::pair>, + std::unique_ptr>> +get_conditional_join_indices(table_view const& left, + table_view const& right, + join_kind JoinKind, + ast::expression binary_pred, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - const size_type left_num_rows{left.num_rows()}; - const size_type right_num_rows{right.num_rows()}; - - if (right_num_rows == 0) { - // If the right table is empty, we know exactly how large the output - // will be for the different types of joins and can return immediately + // We can immediately filter out cases where the right table is empty. In + // some cases, we return all the rows of the left table with a corresponding + // null index for the right table; in others, we return an empty output. 
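Because the match-gating conditions above are easy to misread, here is a host-side reference rendering (hypothetical and simplified to `int` indices with a caller-supplied predicate) of the per-row rules the kernel applies: inner and left joins record every predicate match, a left semi join records only the first match per left row, a left anti join records a left row only when nothing matched, and full joins take the left-join path here and are completed later by a right-complement pass:

```cpp
#include <functional>
#include <utility>
#include <vector>

enum class join_kind { INNER_JOIN, LEFT_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN };

constexpr int JoinNoneValue = -1;  // sentinel for "no matching right row"

// Hypothetical mirror of the kernel's per-left-row loop. On the device each
// left row is owned by one thread, which is why found_match needs no
// synchronization there.
std::vector<std::pair<int, int>> conditional_join_reference(
  int left_rows, int right_rows, std::function<bool(int, int)> const& pred, join_kind kind)
{
  std::vector<std::pair<int, int>> out;
  for (int l = 0; l < left_rows; ++l) {
    bool found_match = false;
    for (int r = 0; r < right_rows; ++r) {
      if (pred(l, r)) {
        // Anti joins never emit matches; semi joins emit only the first one.
        if (kind != join_kind::LEFT_ANTI_JOIN &&
            !(kind == join_kind::LEFT_SEMI_JOIN && found_match)) {
          out.emplace_back(l, r);
        }
        found_match = true;
      }
    }
    // Left and anti joins keep left rows that matched nothing.
    if ((kind == join_kind::LEFT_JOIN || kind == join_kind::LEFT_ANTI_JOIN) && !found_match) {
      out.emplace_back(l, JoinNoneValue);
    }
  }
  return out;
}
```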
+ if (right.num_rows() == 0) { switch (JoinKind) { - // Inner join with an empty table will have no output - case join_kind::INNER_JOIN: return 0; - - // Left join with an empty table will have an output of NULL rows - // equal to the number of rows in the left table - case join_kind::LEFT_JOIN: return left_num_rows; - - default: CUDF_FAIL("Unsupported join type"); + // Left, left anti, and full (which are effectively left because we are + // guaranteed that left has more rows than right) all return all the + // row indices from left with a corresponding NULL from the right. + case join_kind::LEFT_JOIN: + case join_kind::LEFT_ANTI_JOIN: + case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream); + // Inner and left semi joins return empty output because no matches can exist. + case join_kind::INNER_JOIN: + case join_kind::LEFT_SEMI_JOIN: + return std::make_pair(std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr), + std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr)); } } - // Allocate storage for the counter used to get the size of the join output - size_type h_size_estimate{0}; - rmm::device_scalar<size_type> size_estimate(0, stream); + // Prepare output column. Whether or not the output column is nullable is + // determined by whether any of the columns in the input table are nullable. + // If none of the input columns actually contain nulls, we can still use the + // non-nullable version of the expression evaluation code path for + // performance, so we capture that information as well. + auto const nullable = + std::any_of(left.begin(), left.end(), [](column_view c) { return c.nullable(); }) || + std::any_of(right.begin(), right.end(), [](column_view c) { return c.nullable(); }); + auto const has_nulls = + nullable && + (std::any_of( + left.begin(), left.end(), [](column_view c) { return c.nullable() && c.has_nulls(); }) || + std::any_of( + right.begin(), right.end(), [](column_view c) { return c.nullable() && c.has_nulls(); })); + + auto const plan = ast::detail::ast_plan{binary_pred, left, right, has_nulls, stream, mr}; + CUDF_EXPECTS(plan.output_type().id() == type_id::BOOL8, + "The expression must produce a boolean output."); - CHECK_CUDA(stream.value()); + auto left_table = table_device_view::create(left, stream); + auto right_table = table_device_view::create(right, stream); + // Allocate storage for the counter used to get the size of the join output + rmm::device_scalar<size_type> size(0, stream, mr); + CHECK_CUDA(stream.value()); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - int numBlocks{-1}; + detail::grid_1d config(left_table->num_rows(), block_size); + auto const shmem_size_per_block = plan.dev_plan.shmem_per_thread * config.num_threads_per_block; - CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, compute_nested_loop_join_output_size<block_size>, block_size, 0)); - - int dev_id{-1}; - CUDA_TRY(cudaGetDevice(&dev_id)); - - int num_sms{-1}; - CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); - - size_estimate.set_value_zero(stream); - - row_equality equality{left, right, compare_nulls == null_equality::EQUAL}; // Determine number of output rows without actually building the output to simply // find what the size of the output will be. - compute_nested_loop_join_output_size - <<<numBlocks * num_sms, block_size, 0, stream.value()>>>( - left, right, JoinKind, equality, size_estimate.data()); + join_kind KernelJoinKind = JoinKind == join_kind::FULL_JOIN ?
join_kind::LEFT_JOIN : JoinKind; + if (has_nulls) { + compute_conditional_join_output_size + <<>>( + *left_table, *right_table, KernelJoinKind, compare_nulls, plan.dev_plan, size.data()); + } else { + compute_conditional_join_output_size + <<>>( + *left_table, *right_table, KernelJoinKind, compare_nulls, plan.dev_plan, size.data()); + } CHECK_CUDA(stream.value()); - h_size_estimate = size_estimate.value(stream); + size_type const join_size = size.value(stream); - return h_size_estimate; -} - -/** - * @brief Computes the join operation between two tables and returns the - * output indices of left and right table as a combined table - * - * @param left Table of left columns to join - * @param right Table of right columns to join - * @param flip_join_indices Flag that indicates whether the left and right - * tables have been flipped, meaning the output indices should also be flipped - * @param JoinKind The type of join to be performed - * @param compare_nulls Controls whether null join-key values should match or not. - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return Join output indices vector pair - */ -std::pair, rmm::device_uvector> -get_base_nested_loop_join_indices(table_view const& left, - table_view const& right, - bool flip_join_indices, - join_kind JoinKind, - null_equality compare_nulls, - rmm::cuda_stream_view stream) -{ - // The `right` table is always used for the inner loop. We want to use the smaller table - // for the inner loop. Thus, if `left` is smaller than `right`, swap `left/right`. - if ((JoinKind == join_kind::INNER_JOIN) && (right.num_rows() > left.num_rows())) { - return get_base_nested_loop_join_indices(right, left, true, JoinKind, compare_nulls, stream); + // If the output size will be zero, we can return immediately. + if (join_size == 0) { + return std::make_pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } - // Trivial left join case - exit early - if ((JoinKind == join_kind::LEFT_JOIN) && (right.num_rows() == 0)) { - return get_trivial_left_join_indices(left, stream); + + rmm::device_scalar write_index(0, stream); + + auto left_indices = std::make_unique>(join_size, stream, mr); + auto right_indices = std::make_unique>(join_size, stream, mr); + + const auto& join_output_l = left_indices->data(); + const auto& join_output_r = right_indices->data(); + if (has_nulls) { + conditional_join + <<>>( + *left_table, + *right_table, + KernelJoinKind, + compare_nulls, + join_output_l, + join_output_r, + write_index.data(), + plan.dev_plan, + join_size); + } else { + conditional_join + <<>>( + *left_table, + *right_table, + KernelJoinKind, + compare_nulls, + join_output_l, + join_output_r, + write_index.data(), + plan.dev_plan, + join_size); } - auto left_table = table_device_view::create(left, stream); - auto right_table = table_device_view::create(right, stream); + CHECK_CUDA(stream.value()); - size_type estimated_size = estimate_nested_loop_join_output_size( - *left_table, *right_table, JoinKind, compare_nulls, stream); + auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); - // If the estimated output size is zero, return immediately - if (estimated_size == 0) { - return std::make_pair(rmm::device_uvector{0, stream}, - rmm::device_uvector{0, stream}); + // For full joins, get the indices in the right table that were not joined to + // by any row in the left table. 
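The complement step described in this comment can be stated compactly on the host: every right row that never appears in the left join's right-index column gets one extra `(NULL, r)` pair. A hypothetical host-side sketch (cudf performs the equivalent on-device via `get_left_join_indices_complement` and `concatenate_vector_pairs`):

```cpp
#include <utility>
#include <vector>

// Assemble full-join indices from a completed left join: append one
// (JoinNoneValue, r) pair for each right row r that no left row joined to.
std::vector<std::pair<int, int>> full_join_from_left_join(
  std::vector<std::pair<int, int>> pairs, int right_rows)
{
  constexpr int JoinNoneValue = -1;
  std::vector<bool> right_seen(right_rows, false);
  for (auto const& p : pairs) {
    if (p.second != JoinNoneValue) { right_seen[p.second] = true; }
  }
  for (int r = 0; r < right_rows; ++r) {
    if (!right_seen[r]) { pairs.emplace_back(JoinNoneValue, r); }
  }
  return pairs;
}
```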
+ if (JoinKind == join_kind::FULL_JOIN) { + auto complement_indices = detail::get_left_join_indices_complement( + join_indices.second, left.num_rows(), right.num_rows(), stream, mr); + join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); } - - // Because we are approximating the number of joined elements, our approximation - // might be incorrect and we might have underestimated the number of joined elements. - // As such we will need to de-allocate memory and re-allocate memory to ensure - // that the final output is correct. - rmm::device_scalar write_index(0, stream); - size_type join_size{0}; - - rmm::device_uvector left_indices{0, stream}; - rmm::device_uvector right_indices{0, stream}; - auto current_estimated_size = estimated_size; - do { - left_indices.resize(estimated_size, stream); - right_indices.resize(estimated_size, stream); - - constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - detail::grid_1d config(left_table->num_rows(), block_size); - write_index.set_value_zero(stream); - - row_equality equality{*left_table, *right_table, compare_nulls == null_equality::EQUAL}; - const auto& join_output_l = flip_join_indices ? right_indices.data() : left_indices.data(); - const auto& join_output_r = flip_join_indices ? left_indices.data() : right_indices.data(); - nested_loop_join - <<>>(*left_table, - *right_table, - JoinKind, - equality, - join_output_l, - join_output_r, - write_index.data(), - estimated_size); - - CHECK_CUDA(stream.value()); - - join_size = write_index.value(stream); - current_estimated_size = estimated_size; - estimated_size *= 2; - } while ((current_estimated_size < join_size)); - - left_indices.resize(join_size, stream); - right_indices.resize(join_size, stream); - return std::make_pair(std::move(left_indices), std::move(right_indices)); + return join_indices; } } // namespace detail diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index c57327569a4..d451540deb6 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -30,16 +30,6 @@ namespace cudf { namespace lists { namespace detail { -void assert_same_data_type(column_view const& lhs, column_view const& rhs) -{ - CUDF_EXPECTS(lhs.type().id() == rhs.type().id(), "Mismatched Data types."); - // Empty string column has no children - CUDF_EXPECTS(lhs.type().id() == type_id::STRING or lhs.num_children() == rhs.num_children(), - "Mismatched number of child columns."); - - for (int i{0}; i < lhs.num_children(); ++i) { assert_same_data_type(lhs.child(i), rhs.child(i)); } -} - /** * @brief Constructs null mask for a scattered list's child column * diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 3ce0f91fd71..55a6523ebdd 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -251,9 +251,9 @@ std::unique_ptr
explode_outer(table_view const& input_table, } } if (null_or_empty[idx]) { - auto invalid_index = null_or_empty_offset_p[idx] == 0 - ? offsets[idx] - : offsets[idx] + null_or_empty_offset_p[idx] - 1; + auto invalid_index = null_or_empty_offset_p[idx] == 0 + ? offsets[idx] + : offsets[idx] + null_or_empty_offset_p[idx] - 1; gather_map_p[invalid_index] = idx; explode_col_gather_map_p[invalid_index] = InvalidIndex; diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index c99b366c2dd..5baef2c7639 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -103,7 +103,7 @@ rmm::device_scalar reduce_device(InputIterator d_in, template struct minmax_binary_op : public thrust::binary_function, minmax_pair, minmax_pair> { - __device__ minmax_pair operator()(minmax_pair const &lhs, minmax_pair const &rhs) const + __device__ minmax_pair operator()(minmax_pair const& lhs, minmax_pair const& rhs) const { return minmax_pair{thrust::min(lhs.min_val, rhs.min_val), thrust::max(lhs.max_val, rhs.max_val)}; @@ -148,7 +148,7 @@ struct minmax_functor { } template - auto reduce(column_view const &col, rmm::cuda_stream_view stream) + auto reduce(column_view const& col, rmm::cuda_stream_view stream) { auto device_col = column_device_view::create(col, stream); // compute minimum and maximum values @@ -174,16 +174,16 @@ struct minmax_functor { *max_data = result->max_val; } - ResultType *result; - T *min_data; - T *max_data; + ResultType* result; + T* min_data; + T* max_data; }; template () and !std::is_same::value and - !cudf::is_dictionary()> * = nullptr> + !cudf::is_dictionary()>* = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) + cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // compute minimum and maximum values auto dev_result = reduce(col, stream); @@ -200,9 +200,9 @@ struct minmax_functor { /** * @brief Specialization for strings column. */ - template ::value> * = nullptr> + template ::value>* = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) + cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // compute minimum and maximum values auto dev_result = reduce(col, stream); @@ -219,9 +219,9 @@ struct minmax_functor { /** * @brief Specialization for dictionary column. 
*/ - template ()> * = nullptr> + template ()>* = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) + cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // compute minimum and maximum values auto dev_result = reduce(col, stream); @@ -236,9 +236,9 @@ struct minmax_functor { get_element(keys, static_cast(host_result.max_val), stream, mr)}; } - template ()> * = nullptr> + template ()>* = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &, rmm::cuda_stream_view, rmm::mr::device_memory_resource *) + cudf::column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) { CUDF_FAIL("type not supported for minmax() operation"); } @@ -247,7 +247,7 @@ struct minmax_functor { } // namespace std::pair, std::unique_ptr> minmax( - cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) + cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (col.null_count() == col.size()) { // this handles empty and all-null columns @@ -264,7 +264,7 @@ std::pair, std::unique_ptr> minmax( * @copydoc cudf::minmax */ std::pair, std::unique_ptr> minmax( - const column_view &col, rmm::mr::device_memory_resource *mr) + const column_view& col, rmm::mr::device_memory_resource* mr) { return detail::minmax(col, rmm::cuda_stream_default, mr); } diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 00539b6d7a5..a8117373ca4 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -32,19 +32,19 @@ namespace detail { struct reduce_dispatch_functor { column_view const col; data_type output_dtype; - rmm::mr::device_memory_resource *mr; + rmm::mr::device_memory_resource* mr; rmm::cuda_stream_view stream; - reduce_dispatch_functor(column_view const &col, + reduce_dispatch_functor(column_view const& col, data_type output_dtype, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) : col(col), output_dtype(output_dtype), mr(mr), stream(stream) { } template - std::unique_ptr operator()(std::unique_ptr const &agg) + std::unique_ptr operator()(std::unique_ptr const& agg) { switch (k) { case aggregation::SUM: return reduction::sum(col, output_dtype, stream, mr); break; @@ -58,11 +58,11 @@ struct reduce_dispatch_functor { break; case aggregation::MEAN: return reduction::mean(col, output_dtype, stream, mr); break; case aggregation::VARIANCE: { - auto var_agg = dynamic_cast(agg.get()); + auto var_agg = dynamic_cast(agg.get()); return reduction::variance(col, output_dtype, var_agg->_ddof, stream, mr); } break; case aggregation::STD: { - auto var_agg = dynamic_cast(agg.get()); + auto var_agg = dynamic_cast(agg.get()); return reduction::standard_deviation(col, output_dtype, var_agg->_ddof, stream, mr); } break; case aggregation::MEDIAN: { @@ -73,7 +73,7 @@ struct reduce_dispatch_functor { return get_element(*col_ptr, 0, stream, mr); } break; case aggregation::QUANTILE: { - auto quantile_agg = dynamic_cast(agg.get()); + auto quantile_agg = dynamic_cast(agg.get()); CUDF_EXPECTS(quantile_agg->_quantiles.size() == 1, "Reduction quantile accepts only one quantile value"); auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, mr); @@ -89,7 +89,7 @@ struct reduce_dispatch_functor { return get_element(*col_ptr, 0, stream, mr); } break; 
case aggregation::NUNIQUE: { - auto nunique_agg = dynamic_cast(agg.get()); + auto nunique_agg = dynamic_cast(agg.get()); return make_fixed_width_scalar( detail::distinct_count( col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream), @@ -97,7 +97,7 @@ struct reduce_dispatch_functor { mr); } break; case aggregation::NTH_ELEMENT: { - auto nth_agg = dynamic_cast(agg.get()); + auto nth_agg = dynamic_cast(agg.get()); return reduction::nth_element(col, nth_agg->_n, nth_agg->_null_handling, stream, mr); } break; default: CUDF_FAIL("Unsupported reduction operator"); @@ -106,11 +106,11 @@ struct reduce_dispatch_functor { }; std::unique_ptr reduce( - column_view const &col, - std::unique_ptr const &agg, + column_view const& col, + std::unique_ptr const& agg, data_type output_dtype, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { std::unique_ptr result = make_default_constructed_scalar(output_dtype, stream, mr); result->set_valid_async(false, stream); @@ -124,10 +124,10 @@ std::unique_ptr reduce( } } // namespace detail -std::unique_ptr reduce(column_view const &col, - std::unique_ptr const &agg, +std::unique_ptr reduce(column_view const& col, + std::unique_ptr const& agg, data_type output_dtype, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::reduce(col, agg, output_dtype, rmm::cuda_stream_default, mr); diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index f729f812b28..1beb9ecb282 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -67,7 +68,46 @@ rmm::device_buffer mask_scan(const column_view& input_view, namespace { /** - * @brief Dispatcher for running Scan operation on input column + * @brief Strings inclusive scan operator + * + * This was specifically created to workaround a thrust issue + * https://github.com/NVIDIA/thrust/issues/1479 + * where invalid values are passed to the operator. + * + * This operator will accept index values, check them and then + * run the `Op` operation on the individual string_view objects. + * The returned result is the appropriate index value. + */ +template +struct string_scan_operator { + column_device_view const col; ///< strings column device view + string_view const null_replacement{}; ///< value used when element is null + bool const has_nulls; ///< true if col has null elements + + string_scan_operator(column_device_view const& col, bool has_nulls = true) + : col{col}, null_replacement{Op::template identity()}, has_nulls{has_nulls} + { + CUDF_EXPECTS(type_id::STRING == col.type().id(), "the data type mismatch"); + // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash + if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask"); + } + + CUDA_DEVICE_CALLABLE + size_type operator()(size_type lhs, size_type rhs) const + { + // thrust::inclusive_scan may pass us garbage values so we need to protect ourselves; + // in these cases the return value does not matter since the result is not used + if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0; + string_view d_lhs = + has_nulls && col.is_null_nocheck(lhs) ? 
null_replacement : col.element(lhs); + string_view d_rhs = + has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element(rhs); + return Op{}(d_lhs, d_rhs) == d_lhs ? lhs : rhs; + } +}; + +/** + * @brief Dispatcher for running a Scan operation on an input column * * @tparam Op device binary operator */ @@ -117,22 +157,25 @@ struct scan_dispatcher { { auto d_input = column_device_view::create(input_view, stream); - rmm::device_uvector result(input_view.size(), stream); - auto begin = - make_null_replacement_iterator(*d_input, Op::template identity(), input_view.has_nulls()); - thrust::inclusive_scan( - rmm::exec_policy(stream), begin, begin + input_view.size(), result.data(), Op{}); - - CHECK_CUDA(stream.value()); - return cudf::make_strings_column(result, Op::template identity(), stream, mr); + // build indices of the scan operation results + rmm::device_uvector result(input_view.size(), stream); + thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input_view.size()), + result.begin(), + string_scan_operator{*d_input, input_view.has_nulls()}); + + // call gather using the indices to build the output column + return cudf::strings::detail::gather( + strings_column_view(input_view), result.begin(), result.end(), false, stream, mr); } public: /** - * @brief creates new column from input column by applying scan operation + * @brief Creates a new column from the input column by applying the scan operation * - * @param input input column view - * @param inclusive inclusive or exclusive scan + * @param input Input column view + * @param null_handling How null row entries are to be processed * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 5bbdb5988e7..98156224cfe 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -108,7 +108,7 @@ std::unique_ptr byte_list_conversion::operator()( } // namespace /** - * @copydoc cudf::byte_cast(input_column,flip_endianess,rmm::mr::device_memory_resource) + * @copydoc cudf::byte_cast(input_column,flip_endianness,rmm::mr::device_memory_resource) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -124,7 +124,7 @@ std::unique_ptr byte_cast(column_view const& input_column, } // namespace detail /** - * @copydoc cudf::byte_cast(input_column,flip_endianess,rmm::mr::device_memory_resource) + * @copydoc cudf::byte_cast(input_column,flip_endianness,rmm::mr::device_memory_resource) */ std::unique_ptr byte_cast(column_view const& input_column, flip_endianness endian_configuration, diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu index 2f19c8158c5..fa12fabffdc 100644 --- a/cpp/src/reshape/tile.cu +++ b/cpp/src/reshape/tile.cu @@ -40,10 +40,10 @@ struct tile_functor { } // anonymous namespace namespace detail { -std::unique_ptr
<table> tile(const table_view &in, +std::unique_ptr<table> tile(const table_view& in, size_type count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(count >= 0, "Count cannot be negative"); @@ -59,9 +59,9 @@ std::unique_ptr<table> tile(const table_view &in, } } // namespace detail -std::unique_ptr<table> tile(const table_view &in, +std::unique_ptr<table>
tile(const table_view& in, size_type count, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::tile(in, count, rmm::cuda_stream_default, mr); diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index d7114608787..862e44a0d2b 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -339,8 +339,8 @@ std::unique_ptr empty_output_for_rolling_aggregation(column_view const& // TODO: // Ideally, for UDF aggregations, the returned column would match // the agg's return type. It currently returns empty_like(input), because: - // 1. This preserves prior behaviour for empty input columns. - // 2. There is insufficient information to construct nested return colums. + // 1. This preserves prior behavior for empty input columns. + // 2. There is insufficient information to construct nested return columns. // `cudf::make_udf_aggregation()` expresses the return type as a `data_type` // which cannot express recursively nested types (e.g. `STRUCT>`.) // 3. In any case, UDFs that return nested types are not currently supported. @@ -616,7 +616,7 @@ class rolling_aggregation_preprocessor final : public cudf::detail::simple_aggre return aggs; } - // COLLECT_LIST aggregations do not peform a rolling operation at all. They get processed + // COLLECT_LIST aggregations do not perform a rolling operation at all. They get processed // entirely in the finalize() step. std::vector> visit( data_type, cudf::detail::collect_list_aggregation const&) override @@ -624,7 +624,7 @@ class rolling_aggregation_preprocessor final : public cudf::detail::simple_aggre return {}; } - // COLLECT_SET aggregations do not peform a rolling operation at all. They get processed + // COLLECT_SET aggregations do not perform a rolling operation at all. They get processed // entirely in the finalize() step. 
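The `rolling_store_output_functor` hunk above is a small instance of compile-time dispatch on the aggregation: the primary template stores the windowed result unchanged, while the MEAN specialization finishes the aggregation by dividing the accumulated value by the window's observation count. A condensed host-only sketch of the pattern (hypothetical names, omitting the SFINAE overloads for bool and timestamp types):

```cpp
#include <cstdint>

template <typename T, bool is_mean>
struct store_output {
  // Primary template: the rolling kernel already produced the final value.
  void operator()(T& out, T& val, int64_t /*count*/) const { out = val; }
};

template <typename T>
struct store_output<T, true> {
  // MEAN: the kernel accumulates a sum, so divide by the number of valid
  // observations in the window to finish the mean.
  void operator()(T& out, T& val, int64_t count) const { out = val / count; }
};

int main()
{
  double out = 0.0;
  double sum = 10.0;
  store_output<double, true>{}(out, sum, 4);  // out == 2.5
  return static_cast<int>(out != 2.5);
}
```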
std::vector> visit( data_type, cudf::detail::collect_set_aggregation const&) override diff --git a/cpp/src/rolling/rolling_detail.hpp b/cpp/src/rolling/rolling_detail.hpp index bd64cc39f47..5fabcf5b14e 100644 --- a/cpp/src/rolling/rolling_detail.hpp +++ b/cpp/src/rolling/rolling_detail.hpp @@ -29,30 +29,30 @@ namespace detail { // store functor template struct rolling_store_output_functor { - CUDA_HOST_DEVICE_CALLABLE void operator()(T &out, T &val, size_type count) { out = val; } + CUDA_HOST_DEVICE_CALLABLE void operator()(T& out, T& val, size_type count) { out = val; } }; // Specialization for MEAN template struct rolling_store_output_functor<_T, true> { // SFINAE for non-bool types - template () || cudf::is_timestamp())> * = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(T &out, T &val, size_type count) + template () || cudf::is_timestamp())>* = nullptr> + CUDA_HOST_DEVICE_CALLABLE void operator()(T& out, T& val, size_type count) { out = val / count; } // SFINAE for bool type - template ()> * = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(T &out, T &val, size_type count) + template ()>* = nullptr> + CUDA_HOST_DEVICE_CALLABLE void operator()(T& out, T& val, size_type count) { out = static_cast(val) / count; } // SFINAE for timestamp types - template ()> * = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(T &out, T &val, size_type count) + template ()>* = nullptr> + CUDA_HOST_DEVICE_CALLABLE void operator()(T& out, T& val, size_type count) { out = static_cast(val.time_since_epoch() / count); } diff --git a/cpp/src/rolling/rolling_jit_detail.hpp b/cpp/src/rolling/rolling_jit_detail.hpp index bba82f4d669..7fe9b68103e 100644 --- a/cpp/src/rolling/rolling_jit_detail.hpp +++ b/cpp/src/rolling/rolling_jit_detail.hpp @@ -30,8 +30,8 @@ T minimum(T a, T b) } struct preceding_window_wrapper { - const cudf::size_type *d_group_offsets; - const cudf::size_type *d_group_labels; + const cudf::size_type* d_group_offsets; + const cudf::size_type* d_group_labels; cudf::size_type preceding_window; cudf::size_type operator[](cudf::size_type idx) @@ -43,8 +43,8 @@ struct preceding_window_wrapper { }; struct following_window_wrapper { - const cudf::size_type *d_group_offsets; - const cudf::size_type *d_group_labels; + const cudf::size_type* d_group_offsets; + const cudf::size_type* d_group_labels; cudf::size_type following_window; cudf::size_type operator[](cudf::size_type idx) diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 546eb050a60..045bfbe0327 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -181,6 +181,12 @@ T fixed_point_scalar::fixed_point_value(rmm::cuda_stream_view stream) const numeric::scaled_integer{_data.value(stream), numeric::scale_type{type().scale()}}}; } +template +fixed_point_scalar::operator value_type() const +{ + return this->fixed_point_value(rmm::cuda_stream_default); +} + template typename fixed_point_scalar::rep_type* fixed_point_scalar::data() { diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index 66548ac1e73..c8a908e44cd 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -101,7 +101,7 @@ template void tie_break_ranks_transform(cudf::device_span dense_rank_sorted, TieIterator tie_iter, - column_view const &sorted_order_view, + column_view const& sorted_order_view, outputIterator rank_iter, TieBreaker tie_breaker, Transformer transformer, @@ -227,18 +227,18 @@ void rank_average(cudf::device_span group_keys, } // anonymous namespace -std::unique_ptr rank(column_view const &input, 
+std::unique_ptr rank(column_view const& input, rank_method method, order column_order, null_policy null_handling, null_order null_precedence, bool percentage, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { - data_type const output_type = (percentage or method == rank_method::AVERAGE) - ? data_type(type_id::FLOAT64) - : data_type(type_to_id()); + data_type const output_type = (percentage or method == rank_method::AVERAGE) + ? data_type(type_id::FLOAT64) + : data_type(type_to_id()); std::unique_ptr rank_column = [&null_handling, &output_type, &input, &stream, &mr] { // na_option=keep assign NA to NA values if (null_handling == null_policy::EXCLUDE) @@ -329,13 +329,13 @@ std::unique_ptr rank(column_view const &input, } } // namespace detail -std::unique_ptr rank(column_view const &input, +std::unique_ptr rank(column_view const& input, rank_method method, order column_order, null_policy null_handling, null_order null_precedence, bool percentage, - rmm::mr::device_memory_resource *mr) + rmm::mr::device_memory_resource* mr) { return detail::rank(input, method, diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index c1e341217ab..29ff7b242e6 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -38,12 +39,26 @@ namespace { * @brief Base class for capitalize and title functors. * * Utility functions here manage access to the character case and flags tables. + * Any derived class must supply a `capitalize_next` member function. + * + * @tparam Derived class uses the CRTP pattern to reuse code logic. */ +template struct base_fn { character_flags_table_type const* d_flags; character_cases_table_type const* d_case_table; + special_case_mapping const* d_special_case_mapping; + column_device_view const d_column; + offset_type* d_offsets{}; + char* d_chars{}; - base_fn() : d_flags(get_character_flags_table()), d_case_table(get_character_cases_table()) {} + base_fn(column_device_view const& d_column) + : d_flags(get_character_flags_table()), + d_case_table(get_character_cases_table()), + d_special_case_mapping(get_special_case_mapping_table()), + d_column(d_column) + { + } using char_info = thrust::pair; @@ -54,94 +69,113 @@ struct base_fn { return char_info{code_point, flag}; } - __device__ char_utf8 convert_char(char_info const& info) const + __device__ int32_t convert_char(char_info const& info, char* d_buffer) const { - return codepoint_to_utf8(d_case_table[info.first]); - } -}; + auto const code_point = info.first; + auto const flag = info.second; -/** - * @brief Capitalize functor. - * - * This capitalizes the first letter of the string. - * Also lower-case any characters after the first letter. - */ -struct capitalize_fn : base_fn { - column_device_view const d_column; - offset_type* d_offsets{}; - char* d_chars{}; + if (!IS_SPECIAL(flag)) { + auto const new_char = codepoint_to_utf8(d_case_table[code_point]); + return d_buffer ? detail::from_char_utf8(new_char, d_buffer) + : detail::bytes_in_char_utf8(new_char); + } - capitalize_fn(column_device_view const& d_column) : base_fn(), d_column(d_column) {} + special_case_mapping m = d_special_case_mapping[get_special_case_hash_index(code_point)]; + + auto const count = IS_LOWER(flag) ? m.num_upper_chars : m.num_lower_chars; + auto const* chars = IS_LOWER(flag) ? 
m.upper : m.lower; + size_type bytes = 0; + for (uint16_t idx = 0; idx < count; idx++) { + bytes += d_buffer + ? detail::from_char_utf8(detail::codepoint_to_utf8(chars[idx]), d_buffer + bytes) + : detail::bytes_in_char_utf8(detail::codepoint_to_utf8(chars[idx])); + } + return bytes; + } + /** + * @brief Operator called for each row in `d_column`. + * + * This logic is shared by capitalize() and title() functions. + * The derived class must supply a `capitalize_next` member function. + */ __device__ void operator()(size_type idx) { if (d_column.is_null(idx)) { if (!d_chars) d_offsets[idx] = 0; } + Derived& derived = static_cast(*this); auto const d_str = d_column.element(idx); offset_type bytes = 0; auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) { - auto const info = get_char_info(*itr); + bool capitalize = true; + for (auto const chr : d_str) { + auto const info = get_char_info(chr); auto const flag = info.second; - auto const change_case = (itr == d_str.begin()) ? IS_LOWER(flag) : IS_UPPER(flag); - auto const new_char = change_case ? convert_char(info) : *itr; - - if (d_buffer) - d_buffer += detail::from_char_utf8(new_char, d_buffer); - else - bytes += detail::bytes_in_char_utf8(new_char); + auto const change_case = capitalize ? IS_LOWER(flag) : IS_UPPER(flag); + + if (change_case) { + auto const char_bytes = convert_char(info, d_buffer); + bytes += char_bytes; + d_buffer += d_buffer ? char_bytes : 0; + } else { + if (d_buffer) { + d_buffer += detail::from_char_utf8(chr, d_buffer); + } else { + bytes += detail::bytes_in_char_utf8(chr); + } + } + + // capitalize the next char if this one is a delimiter + capitalize = derived.capitalize_next(chr, flag); } if (!d_chars) d_offsets[idx] = bytes; } }; +/** + * @brief Capitalize functor. + * + * This capitalizes the first character of the string and lower-cases + * the remaining characters. + * If a delimiter is specified, capitalization continues within the string + * on the first eligible character after any delimiter. + */ +struct capitalize_fn : base_fn { + string_view const d_delimiters; + + capitalize_fn(column_device_view const& d_column, string_view const& d_delimiters) + : base_fn(d_column), d_delimiters(d_delimiters) + { + } + + __device__ bool capitalize_next(char_utf8 const chr, character_flags_table_type const) + { + return !d_delimiters.empty() && (d_delimiters.find(chr) >= 0); + } +}; + /** * @brief Title functor. * * This capitalizes the first letter of each word. - * The beginning of a word is identified as the first alphabetic - * character after a non-alphabetic character. - * Also, lower-case all other alpabetic characters. + * The beginning of a word is identified as the first sequence_type + * character after a non-sequence_type character. + * Also, lower-case all other alphabetic characters. 
*/ -struct title_fn : base_fn { - column_device_view const d_column; +struct title_fn : base_fn { string_character_types sequence_type; - offset_type* d_offsets{}; - char* d_chars{}; title_fn(column_device_view const& d_column, string_character_types sequence_type) - : base_fn(), d_column(d_column), sequence_type(sequence_type) + : base_fn(d_column), sequence_type(sequence_type) { } - __device__ void operator()(size_type idx) + __device__ bool capitalize_next(char_utf8 const, character_flags_table_type const flag) { - if (d_column.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; - } - - auto const d_str = d_column.element(idx); - offset_type bytes = 0; - auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - bool capitalize = true; - for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) { - auto const info = get_char_info(*itr); - auto const flag = info.second; - auto const change_case = - (flag & sequence_type) && (capitalize ? IS_LOWER(flag) : IS_UPPER(flag)); - auto const new_char = change_case ? convert_char(info) : *itr; - // capitalize the next char if this one is not a sequence_type - capitalize = (flag & sequence_type) == 0; - - if (d_buffer) - d_buffer += detail::from_char_utf8(new_char, d_buffer); - else - bytes += detail::bytes_in_char_utf8(new_char); - } - if (!d_chars) d_offsets[idx] = bytes; - } + return (flag & sequence_type) == 0; + }; }; /** @@ -154,10 +188,10 @@ struct title_fn : base_fn { * @param mr Device memory resource used for allocating the new device_buffer */ template -std::unique_ptr capitalize_utility(CapitalFn cfn, - strings_column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr capitalizer(CapitalFn cfn, + strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto children = cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr); @@ -173,12 +207,15 @@ std::unique_ptr capitalize_utility(CapitalFn cfn, } // namespace std::unique_ptr capitalize(strings_column_view const& input, + string_scalar const& delimiters, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(delimiters.is_valid(stream), "Delimiter must be a valid string"); if (input.is_empty()) return make_empty_column(data_type{type_id::STRING}); - auto d_column = column_device_view::create(input.parent(), stream); - return capitalize_utility(capitalize_fn{*d_column}, input, stream, mr); + auto const d_column = column_device_view::create(input.parent(), stream); + auto const d_delimiters = delimiters.value(stream); + return capitalizer(capitalize_fn{*d_column, d_delimiters}, input, stream, mr); } std::unique_ptr title(strings_column_view const& input, @@ -188,16 +225,17 @@ std::unique_ptr title(strings_column_view const& input, { if (input.is_empty()) return make_empty_column(data_type{type_id::STRING}); auto d_column = column_device_view::create(input.parent(), stream); - return capitalize_utility(title_fn{*d_column, sequence_type}, input, stream, mr); + return capitalizer(title_fn{*d_column, sequence_type}, input, stream, mr); } } // namespace detail std::unique_ptr capitalize(strings_column_view const& input, + string_scalar const& delimiter, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::capitalize(input, rmm::cuda_stream_default, mr); + return detail::capitalize(input, delimiter, rmm::cuda_stream_default, mr); } std::unique_ptr title(strings_column_view const& input, diff --git 
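The capitalize/title rewrite above shares one `operator()` loop through a CRTP base class; each derived functor only supplies `capitalize_next` to say when the following character becomes eligible for upper-casing. A minimal host-side sketch of that pattern (hypothetical names, plain `std::string` standing in for device columns):

```cpp
#include <cctype>
#include <iostream>
#include <string>

// Minimal CRTP sketch (hypothetical names): the base class owns the
// per-character loop, while each derived class supplies capitalize_next()
// to decide when the next character should be upper-cased.
template <typename Derived>
struct base_capitalizer {
  std::string operator()(std::string const& input) const
  {
    auto const& derived = static_cast<Derived const&>(*this);
    std::string out;
    bool capitalize = true;  // the first character is always eligible
    for (char const chr : input) {
      auto const uc = static_cast<unsigned char>(chr);
      out += static_cast<char>(capitalize ? std::toupper(uc) : std::tolower(uc));
      capitalize = derived.capitalize_next(chr);
    }
    return out;
  }
};

// capitalize: only delimiter characters re-enable capitalization.
struct capitalizer : base_capitalizer<capitalizer> {
  std::string delimiters;
  explicit capitalizer(std::string d) : delimiters(std::move(d)) {}
  bool capitalize_next(char chr) const { return delimiters.find(chr) != std::string::npos; }
};

// title: any non-alphabetic character starts a new word.
struct titler : base_capitalizer<titler> {
  bool capitalize_next(char chr) const
  {
    return !std::isalpha(static_cast<unsigned char>(chr));
  }
};

int main()
{
  std::cout << capitalizer{" "}("hello WORLD") << "\n";  // Hello World
  std::cout << titler{}("one-two three") << "\n";        // One-Two Three
}
```

CRTP keeps the dispatch static rather than virtual, so the per-character hook can be inlined, which matters when the same loop runs as device code.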
a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 5a69ac7b3d5..ccbedf99bc2 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -94,9 +94,9 @@ std::unique_ptr join_strings(strings_column_view const& strings, // only one entry so it is either all valid or all null auto const null_count = static_cast(strings.null_count() == strings_count && !narep.is_valid()); - auto null_mask = null_count - ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr) - : rmm::device_buffer{0, stream, mr}; + auto null_mask = null_count + ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr) + : rmm::device_buffer{0, stream, mr}; auto chars_column = create_chars_child_column(bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index c012663794b..2ef27759124 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -135,8 +135,8 @@ struct compute_size_and_concatenate_fn { struct scalar_separator_fn { string_scalar_device_view const d_separator; - __device__ bool is_null_list(column_device_view const& lists_dv, size_type const idx) const - noexcept + __device__ bool is_null_list(column_device_view const& lists_dv, + size_type const idx) const noexcept { return lists_dv.is_null(idx); } @@ -202,8 +202,8 @@ struct column_separators_fn { column_device_view const separators_dv; string_scalar_device_view const sep_narep_dv; - __device__ bool is_null_list(column_device_view const& lists_dv, size_type const idx) const - noexcept + __device__ bool is_null_list(column_device_view const& lists_dv, + size_type const idx) const noexcept { return lists_dv.is_null(idx) || (separators_dv.is_null(idx) && !sep_narep_dv.is_valid()); } diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index e2188365785..628dbcb8755 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -56,8 +56,8 @@ struct contains_fn { if (d_strings.is_null(idx)) return 0; string_view d_str = d_strings.element(idx); int32_t begin = 0; - int32_t end = bmatch ? 1 // match only the beginning of the string; - : -1; // this handles empty strings too + int32_t end = bmatch ? 
1 // match only the beginning of the string; + : -1; // this handles empty strings too return static_cast(prog.find(idx, d_str, begin, end)); } }; diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 0ec13b3648b..d804ac66961 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -796,7 +796,8 @@ struct datetime_formatter { val = val / 10; } ptr = tmpl + bytes - 1; - while (bytes-- > 0) *str++ = *ptr--; + while (bytes-- > 0) + *str++ = *ptr--; return str; } diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 7e6769a869b..aaee8c45169 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -267,7 +267,8 @@ struct duration_to_string_fn : public duration_to_string_size_fn { } digits_idx = std::max(digits_idx, min_digits); // digits are backwards, reverse the string into the output - while (digits_idx-- > 0) *str++ = digits[digits_idx]; + while (digits_idx-- > 0) + *str++ = digits[digits_idx]; return str; } diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 94c34f92c66..2f57b38249f 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -192,7 +192,7 @@ namespace { * @brief Calculate the size of the each string required for * converting each value in base-10 format. * - * ouput format is [-]integer.fraction + * output format is [-]integer.fraction */ template struct decimal_to_string_size_fn { diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index d4d6974cef5..b0910acb2a2 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -234,7 +234,7 @@ struct ftos_converter { static constexpr double upper_limit = 1000000000; // max is 1x10^9 static constexpr double lower_limit = 0.0001; // printf uses scientific notation below this // Tables for doing normalization: converting to exponent form - // IEEE double float has maximum exponent of 305 so these should cover everthing + // IEEE double float has maximum exponent of 305 so these should cover everything const double upper10[9] = {10, 100, 10000, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256}; const double lower10[9] = {.1, .01, .0001, 1e-8, 1e-16, 1e-32, 1e-64, 1e-128, 1e-256}; const double blower10[9] = {1.0, .1, .001, 1e-7, 1e-15, 1e-31, 1e-63, 1e-127, 1e-255}; @@ -252,7 +252,8 @@ struct ftos_converter { *ptr++ = (char)('0' + (value % 10)); value /= 10; } - while (ptr != buffer) *output++ = *--ptr; // 54321 -> 12345 + while (ptr != buffer) + *output++ = *--ptr; // 54321 -> 12345 return output; } diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 7043174f5bf..c624819999f 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -154,7 +154,9 @@ struct integer_to_hex_fn { // compute the number of output bytes int bytes = sizeof(IntegerType); int byte_index = sizeof(IntegerType); - while ((--byte_index > 0) && (value_bytes[byte_index] & 0xFF) == 0) { --bytes; } + while ((--byte_index > 0) && (value_bytes[byte_index] & 0xFF) == 0) { + --bytes; + } // create output byte_index = bytes - 1; diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index d7b79547f29..4e323b98a2e 100644 --- 
a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -146,7 +146,8 @@ struct integers_to_ipv4_fn { else { char digits[3]; int num_digits = convert(value, digits); - while (num_digits-- > 0) *out_ptr++ = digits[num_digits]; + while (num_digits-- > 0) + *out_ptr++ = digits[num_digits]; } if ((n + 1) < 4) *out_ptr++ = '.'; shift_bits -= 8; diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 33647c7b22f..abf2dc25097 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -335,9 +335,9 @@ std::unique_ptr url_decode( size_type chars_start = (strings.offset() == 0) ? 0 : cudf::detail::get_value( strings.offsets(), strings.offset(), stream); - size_type chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size() - : cudf::detail::get_value( + size_type chars_end = (offset_count == strings.offsets().size()) + ? strings.chars_size() + : cudf::detail::get_value( strings.offsets(), strings.offset() + strings_count, stream); size_type chars_bytes = chars_end - chars_start; diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh index 75ae7b3af6c..746923526a1 100644 --- a/cpp/src/strings/convert/utilities.cuh +++ b/cpp/src/strings/convert/utilities.cuh @@ -81,7 +81,8 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer) char* ptr = d_buffer; if (is_negative) *ptr++ = '-'; // digits are backwards, reverse the string into the output - while (digits_idx-- > 0) *ptr++ = digits[digits_idx]; + while (digits_idx-- > 0) + *ptr++ = digits[digits_idx]; return bytes; } diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu new file mode 100644 index 00000000000..3545ec6d259 --- /dev/null +++ b/cpp/src/strings/copying/shift.cu @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf::strings::detail { + +namespace { + +struct adjust_offsets_fn { + column_device_view const d_column; + string_view const d_filler; + size_type const offset; + + __device__ offset_type operator()(size_type idx) + { + if (offset < 0) { + auto const first = d_column.element(-offset); + auto const last_index = d_column.size() + offset; + if (idx < last_index) { + return d_column.element(idx - offset) - first; + } else { + auto const last = d_column.element(d_column.size() - 1); + return (last - first) + ((idx - last_index + 1) * d_filler.size_bytes()); + } + } else { + if (idx < offset) { + return idx * d_filler.size_bytes(); + } else { + auto const total_filler = d_filler.size_bytes() * offset; + return total_filler + d_column.element(idx - offset); + } + } + } +}; + +struct shift_chars_fn { + column_device_view const d_column; + string_view const d_filler; + size_type const offset; + + __device__ char operator()(size_type idx) + { + if (offset < 0) { + auto const last_index = -offset; + if (idx < last_index) { + auto const first_index = d_column.size() + offset; + return d_column.element(idx + first_index); + } else { + auto const char_index = idx - last_index; + return d_filler.data()[char_index % d_filler.size_bytes()]; + } + } else { + if (idx < offset) { + return d_filler.data()[idx % d_filler.size_bytes()]; + } else { + return d_column.element(idx - offset); + } + } + } +}; + +} // namespace + +std::unique_ptr shift(strings_column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto d_fill_str = static_cast(fill_value).value(stream); + + // output offsets column is the same size as the input + auto const input_offsets = + cudf::slice(input.offsets(), {input.offset(), input.offset() + input.size() + 1}).front(); + auto const offsets_size = input_offsets.size(); + auto offsets_column = cudf::detail::allocate_like( + input_offsets, offsets_size, mask_allocation_policy::NEVER, stream, mr); + + // run kernel to simultaneously shift and adjust the values in the output offsets column + auto d_offsets = mutable_column_device_view::create(offsets_column->mutable_view(), stream); + auto const d_input_offsets = column_device_view::create(input_offsets, stream); + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(offsets_size), + d_offsets->data(), + adjust_offsets_fn{*d_input_offsets, d_fill_str, offset}); + + // compute the shift-offset for the output characters child column + auto const shift_offset = [&] { + auto const index = (offset >= 0) ? offset : offsets_size - 1 + offset; + return (offset < 0 ? 
-1 : 1) * + cudf::detail::get_value(offsets_column->view(), index, stream); + }(); + + // create output chars child column + auto const chars_size = + cudf::detail::get_value(offsets_column->view(), offsets_size - 1, stream); + auto chars_column = create_chars_child_column(chars_size, stream, mr); + auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); + auto const d_input_chars = column_device_view::create(input.chars(), stream); + + // run kernel to shift the characters + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(chars_size), + d_chars->data(), + shift_chars_fn{*d_input_chars, d_fill_str, shift_offset}); + + // caller sets the null-mask + return make_strings_column(input.size(), + std::move(offsets_column), + std::move(chars_column), + 0, + rmm::device_buffer{}, + stream, + mr); +} + +} // namespace cudf::strings::detail diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index dfdd3226844..409e1892c91 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -99,6 +99,44 @@ class parser { return false; } + CUDA_HOST_DEVICE_CALLABLE bool is_hex_digit(char c) + { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); + } + + CUDA_HOST_DEVICE_CALLABLE int64_t chars_left() { return input_len - ((pos - input) + 1); } + + /** + * @brief Parse an escape sequence. + * + * Must be a valid sequence as specified by the JSON format + * https://www.json.org/json-en.html + * + * @returns True on success or false on fail. + */ + CUDA_HOST_DEVICE_CALLABLE bool parse_escape_seq() + { + if (*pos != '\\') { return false; } + char c = *++pos; + + // simple case + if (c == '\"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || c == 'r' || + c == 't') { + pos++; + return true; + } + + // hex digits: must be of the form uXXXX where each X is a valid hex digit + if (c == 'u' && chars_left() >= 4 && is_hex_digit(pos[1]) && is_hex_digit(pos[2]) && + is_hex_digit(pos[3]) && is_hex_digit(pos[4])) { + pos += 5; + return true; + } + + // an illegal escape sequence. + return false; + } + /** * @brief Parse a quote-enclosed JSON string. * @@ -123,12 +161,16 @@ class parser { const char* start = ++pos; while (!eof()) { - if (*pos == quote) { + // handle escaped characters + if (*pos == '\\') { + if (!parse_escape_seq()) { return parse_result::ERROR; } + } else if (*pos == quote) { str = string_view(start, pos - start); pos++; return parse_result::SUCCESS; + } else { + pos++; } - pos++; } } } @@ -230,15 +272,22 @@ class json_state : private parser { int arr_count = 0; while (!eof(end)) { - // could do some additional checks here. we know our current - // element type, so we could be more strict on what kinds of - // characters we expect to see. 
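`parse_escape_seq` above accepts the simple JSON escapes plus `\uXXXX` with exactly four hex digits, and rejects anything else. A standalone host-side sketch of that validation rule (hypothetical helper names):

```cpp
#include <iostream>

// Hex-digit check mirroring the is_hex_digit helper above.
bool is_hex_digit(char c)
{
  return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}

// Returns the length of a valid JSON escape sequence starting at s
// (including the backslash), or 0 if the sequence is illegal.
// chars_left counts the characters available starting at s.
int escape_seq_length(char const* s, int chars_left)
{
  if (chars_left < 2 || s[0] != '\\') { return 0; }
  char const c = s[1];

  // simple one-character escapes
  if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || c == 'r' ||
      c == 't') {
    return 2;
  }

  // \uXXXX: 'u' must be followed by exactly four hex digits
  if (c == 'u' && chars_left >= 6 && is_hex_digit(s[2]) && is_hex_digit(s[3]) &&
      is_hex_digit(s[4]) && is_hex_digit(s[5])) {
    return 6;
  }

  return 0;  // illegal escape sequence
}

int main()
{
  std::cout << escape_seq_length("\\n", 2) << "\n";      // 2
  std::cout << escape_seq_length("\\u00e9", 6) << "\n";  // 6
  std::cout << escape_seq_length("\\x41", 4) << "\n";    // 0 (illegal)
}
```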
- switch (*end++) { - case '{': obj_count++; break; - case '}': obj_count--; break; - case '[': arr_count++; break; - case ']': arr_count--; break; - default: break; + // parse strings explicitly so we handle all interesting corner cases (such as strings + // containing {, }, [ or ] + if (is_quote(*end)) { + string_view str; + pos = end; + if (parse_string(str, false, *end) == parse_result::ERROR) { return parse_result::ERROR; } + end = pos; + } else { + char const c = *end++; + switch (c) { + case '{': obj_count++; break; + case '}': obj_count--; break; + case '[': arr_count++; break; + case ']': arr_count--; break; + default: break; + } } if (obj_count == 0 && arr_count == 0) { break; } } @@ -620,7 +669,7 @@ std::pair>, int> build_comma if (op.type == path_operator_type::ROOT) { CUDF_EXPECTS(h_operators.size() == 0, "Root operator ($) can only exist at the root"); } - // if we havent' gotten a root operator to start, and we're not empty, quietly push a + // if we have not gotten a root operator to start, and we're not empty, quietly push a // root operator now. if (h_operators.size() == 0 && op.type != path_operator_type::ROOT && op.type != path_operator_type::END) { diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 253bf846993..6fee47ea225 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -95,7 +95,8 @@ std::unique_ptr pad( string_view d_str = d_strings.element(idx); auto length = d_str.length(); char* ptr = d_chars + d_offsets[idx]; - while (length++ < width) ptr += from_char_utf8(d_fill_char, ptr); + while (length++ < width) + ptr += from_char_utf8(d_fill_char, ptr); copy_string(ptr, d_str); }); } else if (side == pad_side::RIGHT) { @@ -109,7 +110,8 @@ std::unique_ptr pad( auto length = d_str.length(); char* ptr = d_chars + d_offsets[idx]; ptr = copy_string(ptr, d_str); - while (length++ < width) ptr += from_char_utf8(d_fill_char, ptr); + while (length++ < width) + ptr += from_char_utf8(d_fill_char, ptr); }); } else if (side == pad_side::BOTH) { thrust::for_each_n( @@ -124,9 +126,11 @@ std::unique_ptr pad( auto right_pad = (width & 1) ? pad / 2 : (pad - pad / 2); // odd width = right-justify auto left_pad = pad - right_pad; // e.g. 
width=7 gives "++foxx+" while width=6 gives "+fox++" - while (left_pad-- > 0) ptr += from_char_utf8(d_fill_char, ptr); + while (left_pad-- > 0) + ptr += from_char_utf8(d_fill_char, ptr); ptr = copy_string(ptr, d_str); - while (right_pad-- > 0) ptr += from_char_utf8(d_fill_char, ptr); + while (right_pad-- > 0) + ptr += from_char_utf8(d_fill_char, ptr); }); } @@ -181,7 +185,8 @@ std::unique_ptr zfill( string_view d_str = d_strings.element(idx); auto length = d_str.length(); char* out_ptr = d_chars + d_offsets[idx]; - while (length++ < width) *out_ptr++ = '0'; // prepend zero char + while (length++ < width) + *out_ptr++ = '0'; // prepend zero char copy_string(out_ptr, d_str); }); diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 6cac49d3c26..0e00221dabf 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -701,11 +701,13 @@ class regex_compiler { regex_parser::Item item = in[i]; if (item.d.yycount.n <= 0) { // need to erase - for (std::size_t j = 0; j < i - rep_start; j++) out.pop_back(); + for (std::size_t j = 0; j < i - rep_start; j++) + out.pop_back(); } else { // repeat for (int j = 1; j < item.d.yycount.n; j++) - for (std::size_t k = rep_start; k < i; k++) out.push_back(in[k]); + for (std::size_t k = rep_start; k < i; k++) + out.push_back(in[k]); } // optional repeats @@ -715,7 +717,8 @@ class regex_compiler { o_item.t = LBRA_NC; o_item.d.yy = 0; out.push_back(o_item); - for (std::size_t k = rep_start; k < i; k++) out.push_back(in[k]); + for (std::size_t k = rep_start; k < i; k++) + out.push_back(in[k]); } for (int j = item.d.yycount.n; j < item.d.yycount.m; j++) { regex_parser::Item o_item; @@ -746,7 +749,8 @@ class regex_compiler { } } else // copy it once then put '*' { - for (std::size_t k = rep_start; k < i; k++) out.push_back(in[k]); + for (std::size_t k = rep_start; k < i; k++) + out.push_back(in[k]); if (item.t == COUNTED) { o_item.t = STAR; @@ -841,12 +845,14 @@ void reprog::optimize1() if (_insts[i].type != NOP) { { int target_id = _insts[i].u2.next_id; - while (_insts[target_id].type == NOP) target_id = _insts[target_id].u2.next_id; + while (_insts[target_id].type == NOP) + target_id = _insts[target_id].u2.next_id; _insts[i].u2.next_id = target_id; } if (_insts[i].type == OR) { int target_id = _insts[i].u1.right_id; - while (_insts[target_id].type == NOP) target_id = _insts[target_id].u2.next_id; + while (_insts[target_id].type == NOP) + target_id = _insts[target_id].u2.next_id; _insts[i].u1.right_id = target_id; } } @@ -854,7 +860,8 @@ void reprog::optimize1() // skip NOPs from the beginning { int target_id = _startinst_id; - while (_insts[target_id].type == NOP) target_id = _insts[target_id].u2.next_id; + while (_insts[target_id].type == NOP) + target_id = _insts[target_id].u2.next_id; _startinst_id = target_id; } // actually remove the no-ops @@ -950,7 +957,8 @@ void reprog::print() printf("startinst_id=%d\n", _startinst_id); if (_startinst_ids.size() > 0) { printf("startinst_ids:"); - for (size_t i = 0; i < _startinst_ids.size(); i++) printf(" %d", _startinst_ids[i]); + for (size_t i = 0; i < _startinst_ids.size(); i++) + printf(" %d", _startinst_ids[i]); printf("\n"); } diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index eddda3fe0eb..854fce15fd4 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -231,7 +231,8 @@ __device__ inline int32_t reprog_device::regexec( if (((eos < 0) || (pos < eos)) && match == 0) { int32_t i = 0; auto 
ids = startinst_ids(); - while (ids[i] >= 0) jnk.list1->activate(ids[i++], (group_id == 0 ? pos : -1), -1); + while (ids[i] >= 0) + jnk.list1->activate(ids[i++], (group_id == 0 ? pos : -1), -1); } c = static_cast(pos >= txtlen ? 0 : *itr); diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 2d9d40e2d68..5b058d7b696 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -56,7 +56,7 @@ struct replace_multi_regex_fn { reprog_device* progs; // array of regex progs size_type number_of_patterns; found_range* d_found_ranges; // working array matched (begin,end) values - column_device_view const d_repls; // replacment strings + column_device_view const d_repls; // replacement strings int32_t* d_offsets{}; // these are null when char* d_chars{}; // only computing size @@ -105,8 +105,8 @@ struct replace_multi_regex_fn { size_type end = d_ranges[ptn_idx].second; string_view d_repl = d_repls.size() > 1 ? d_repls.element(ptn_idx) : d_repls.element(0); - auto spos = d_str.byte_offset(begin); - auto epos = d_str.byte_offset(end); + auto spos = d_str.byte_offset(begin); + auto epos = d_str.byte_offset(end); nbytes += d_repl.size_bytes() - (epos - spos); if (out_ptr) { // copy unmodified content plus new replacement string out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 4185e6db685..979974a2fdb 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -568,9 +568,9 @@ std::unique_ptr replace(strings_column_view con (strings.offset() == 0) ? 0 : cudf::detail::get_value(strings.offsets(), strings.offset(), stream); - size_type const chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size() - : cudf::detail::get_value( + size_type const chars_end = (offset_count == strings.offsets().size()) + ? strings.chars_size() + : cudf::detail::get_value( strings.offsets(), strings.offset() + strings_count, stream); size_type const chars_bytes = chars_end - chars_start; @@ -604,11 +604,11 @@ std::unique_ptr replace( auto const offset_count = strings_count + 1; auto const d_offsets = strings.offsets().data() + strings.offset(); size_type chars_start = (strings.offset() == 0) ? 0 - : cudf::detail::get_value( + : cudf::detail::get_value( strings.offsets(), strings.offset(), stream); - size_type chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size() - : cudf::detail::get_value( + size_type chars_end = (offset_count == strings.offsets().size()) + ? strings.chars_size() + : cudf::detail::get_value( strings.offsets(), strings.offset() + strings_count, stream); return replace_char_parallel( strings, chars_start, chars_end, d_target, d_repl, maxrepl, stream, mr); diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index ae0ea4b90e6..9c5be1c9ca3 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -154,10 +154,10 @@ struct split_tokenizer_fn : base_split_tokenizer { auto next_delim = ((idx + col) < positions_count) // boundary check for delims in last string ? (base_ptr + d_positions[idx + col]) // start of next delimiter : str_end_ptr; // or end of this string - auto eptr = (next_delim < str_end_ptr) // make sure delimiter is inside this string + auto eptr = (next_delim < str_end_ptr) // make sure delimiter is inside this string && (col + 1 < token_count) // and this is not the last token - ? 
next_delim - : str_end_ptr; + ? next_delim + : str_end_ptr; // store the token into the output vector d_tokens[col * d_strings.size()] = string_index_pair{str_ptr, static_cast(eptr - str_ptr)}; @@ -281,10 +281,10 @@ struct rsplit_tokenizer_fn : base_split_tokenizer { auto prev_delim = (idx >= col) // boundary check for delims in first string ? (base_ptr + d_positions[idx - col] + 1) // end of prev delimiter : str_begin_ptr; // or the start of this string - auto sptr = (prev_delim > str_begin_ptr) // make sure delimiter is inside the string + auto sptr = (prev_delim > str_begin_ptr) // make sure delimiter is inside the string && (col + 1 < token_count) // and this is not the last token - ? prev_delim - : str_begin_ptr; + ? prev_delim + : str_begin_ptr; // store the token into the output -- building the array backwards d_tokens[d_strings.size() * (token_count - 1 - col)] = string_index_pair{sptr, static_cast(str_ptr - sptr)}; diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp index 4cd85fc5e7e..904ce5470ce 100644 --- a/cpp/src/table/table.cpp +++ b/cpp/src/table/table.cpp @@ -28,7 +28,9 @@ table::table(table const& other) : _num_rows{other.num_rows()} { CUDF_FUNC_RANGE(); _columns.reserve(other._columns.size()); - for (auto const& c : other._columns) { _columns.emplace_back(std::make_unique(*c)); } + for (auto const& c : other._columns) { + _columns.emplace_back(std::make_unique(*c)); + } } // Move the contents of a vector `unique_ptr` @@ -53,7 +55,9 @@ table::table(table_view view, rmm::cuda_stream_view stream, rmm::mr::device_memo { CUDF_FUNC_RANGE(); _columns.reserve(view.num_columns()); - for (auto const& c : view) { _columns.emplace_back(std::make_unique(c, stream, mr)); } + for (auto const& c : view) { + _columns.emplace_back(std::make_unique(c, stream, mr)); + } } // Create immutable view @@ -61,7 +65,9 @@ table_view table::view() const { std::vector views; views.reserve(_columns.size()); - for (auto const& c : _columns) { views.push_back(c->view()); } + for (auto const& c : _columns) { + views.push_back(c->view()); + } return table_view{views}; } @@ -70,7 +76,9 @@ mutable_table_view table::mutable_view() { std::vector views; views.reserve(_columns.size()); - for (auto const& c : _columns) { views.push_back(c->mutable_view()); } + for (auto const& c : _columns) { + views.push_back(c->mutable_view()); + } return mutable_table_view{views}; } diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index 62daeed6d79..859a6be3bb0 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -55,7 +55,7 @@ template class table_device_view_base; template class table_device_view_base; namespace { -struct is_relationally_comparable_impl { +struct is_relationally_comparable_functor { template constexpr bool operator()() { @@ -74,7 +74,7 @@ bool is_relationally_comparable(TableView const& lhs, TableView const& rhs) // TODO: possible to implement without double type dispatcher. 
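The `table` copy constructor and view builders above repeat the same reserve-then-emplace loop over owned columns. A generic sketch of that deep-copy idiom (illustrative only, not the libcudf code):

```cpp
#include <iostream>
#include <memory>
#include <vector>

// Generic sketch of the deep-copy idiom used by the table copy constructor
// above: clone each element behind a unique_ptr into a new owning vector.
template <typename T>
std::vector<std::unique_ptr<T>> deep_copy(std::vector<std::unique_ptr<T>> const& src)
{
  std::vector<std::unique_ptr<T>> dst;
  dst.reserve(src.size());  // reserve first, as in table::table(table const&)
  for (auto const& p : src) {
    dst.emplace_back(std::make_unique<T>(*p));  // invokes T's copy constructor
  }
  return dst;
}

int main()
{
  std::vector<std::unique_ptr<int>> a;
  a.emplace_back(std::make_unique<int>(42));
  auto b = deep_copy(a);
  *b[0]  = 7;  // the copy is independent of the original
  std::cout << *a[0] << " " << *b[0] << "\n";  // prints: 42 7
}
```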
       return lhs.column(i).type() == rhs.column(i).type() and
              type_dispatcher(lhs.column(i).type(),
-                             is_relationally_comparable_impl{});
+                             is_relationally_comparable_functor{});
     });
 }
diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp
index c64bf5b2823..abd909f8cfc 100644
--- a/cpp/src/table/table_view.cpp
+++ b/cpp/src/table/table_view.cpp
@@ -43,7 +43,9 @@ auto concatenate_column_views(std::vector const& views)
 {
   using ColumnView = typename ViewType::ColumnView;
   std::vector concat_cols;
-  for (auto& view : views) { concat_cols.insert(concat_cols.end(), view.begin(), view.end()); }
+  for (auto& view : views) {
+    concat_cols.insert(concat_cols.end(), view.begin(), view.end());
+  }
   return concat_cols;
 }
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index cab5a54a57d..f9b2355b2ff 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -41,7 +41,7 @@ namespace {
 /**
  * @brief Generate ngrams from strings column.
  *
- * Adjacent strings are concatented with the provided separator.
+ * Adjacent strings are concatenated with the provided separator.
  * The number of adjacent strings join depends on the specified ngrams value.
  * For example: for bigrams (ngrams=2), pairs of strings are concatenated.
  */
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index f99c831e745..e20c7120571 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -380,7 +380,7 @@ __device__ size_type row_size_functor::operator()(column_device_vie
 /**
  * @brief Kernel for computing per-row sizes in bits.
  *
- * @param cols An span of column_device_views represeting a column hierarcy
+ * @param cols A span of column_device_views representing a column hierarchy
  * @param info An span of column_info structs corresponding the elements in `cols`
  * @param output Output span of size (# rows) where per-row bit sizes are stored
  * @param max_branch_depth Maximum depth of the span stack needed per-thread
diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp
new file mode 100644
index 00000000000..d297148de45
--- /dev/null
+++ b/cpp/src/utilities/type_checks.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+namespace cudf {
+namespace {
+
+struct columns_equal_fn {
+  template
+  bool operator()(column_view const&, column_view const&)
+  {
+    return true;
+  }
+};
+
+template <>
+bool columns_equal_fn::operator()(column_view const& lhs, column_view const& rhs)
+{
+  auto const kidx = dictionary_column_view::keys_column_index;
+  return lhs.num_children() > 0 and rhs.num_children() > 0
+           ?
lhs.child(kidx).type() == rhs.child(kidx).type() + : lhs.is_empty() and rhs.is_empty(); +} + +template <> +bool columns_equal_fn::operator()(column_view const& lhs, column_view const& rhs) +{ + auto const& ci = lists_column_view::child_column_index; + return column_types_equal(lhs.child(ci), rhs.child(ci)); +} + +template <> +bool columns_equal_fn::operator()(column_view const& lhs, column_view const& rhs) +{ + return lhs.num_children() == rhs.num_children() and + std::all_of(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lhs.num_children()), + [&](auto i) { return column_types_equal(lhs.child(i), rhs.child(i)); }); +} + +}; // namespace + +// Implementation note: avoid using double dispatch for this function +// as it increases code paths to NxN for N types. +bool column_types_equal(column_view const& lhs, column_view const& rhs) +{ + if (lhs.type() != rhs.type()) { return false; } + return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs); +} + +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4360b418e95..ddb5d88f2d0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -62,11 +62,13 @@ ConfigureTest(GROUPBY_TEST groupby/count_tests.cpp groupby/groups_tests.cpp groupby/keys_tests.cpp + groupby/m2_tests.cpp groupby/min_tests.cpp groupby/max_scan_tests.cpp groupby/max_tests.cpp groupby/mean_tests.cpp groupby/median_tests.cpp + groupby/merge_m2_tests.cpp groupby/merge_lists_tests.cpp groupby/merge_sets_tests.cpp groupby/min_scan_tests.cpp @@ -86,6 +88,7 @@ ConfigureTest(GROUPBY_TEST # - join tests ------------------------------------------------------------------------------------ ConfigureTest(JOIN_TEST join/join_tests.cpp + join/conditional_join_tests.cu join/cross_join_tests.cpp join/semi_anti_join_tests.cpp) @@ -161,6 +164,8 @@ ConfigureTest(BINARY_TEST binaryop/binop-verify-input-test.cpp binaryop/binop-null-test.cpp binaryop/binop-integration-test.cpp + binaryop/binop-compiled-test.cpp + binaryop/binop-compiled-fixed_point-test.cpp binaryop/binop-generic-ptx-test.cpp ) @@ -219,6 +224,7 @@ ConfigureTest(COPYING_TEST copying/scatter_list_tests.cpp copying/scatter_list_scalar_tests.cpp copying/scatter_struct_tests.cpp + copying/scatter_struct_scalar_tests.cpp copying/segmented_gather_list_tests.cpp copying/shift_tests.cpp copying/slice_tests.cpp @@ -233,7 +239,8 @@ ConfigureTest(UTILITIES_TEST utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp utilities_tests/lists_column_wrapper_tests.cpp - utilities_tests/default_stream_tests.cpp) + utilities_tests/default_stream_tests.cpp + utilities_tests/type_check_tests.cpp) ################################################################################################### # - span tests ------------------------------------------------------------------------------- diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 74937d4deea..48e19a2f587 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -31,10 +31,13 @@ #include #include -#include +#include +#include #include +#include #include +#include template using column_wrapper = cudf::test::fixed_width_column_wrapper; @@ -409,4 +412,46 @@ TEST_F(TransformTest, PyMod) cudf::test::expect_columns_equal(expected, result->view(), true); } +TEST_F(TransformTest, BasicAdditionNulls) +{ + auto c_0 = column_wrapper{{3, 20, 1, 50}, {0, 0, 1, 1}}; + auto c_1 = column_wrapper{{10, 7, 20, 0}, {0, 1, 0, 1}}; + auto table 
= cudf::table_view{{c_0, c_1}};
+
+  auto col_ref_0 = cudf::ast::column_reference(0);
+  auto col_ref_1 = cudf::ast::column_reference(1);
+  auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);
+
+  auto expected = column_wrapper{{0, 0, 0, 50}, {0, 0, 0, 1}};
+  auto result = cudf::ast::compute_column(table, expression);
+
+  cudf::test::expect_columns_equal(expected, result->view(), true);
+}
+
+TEST_F(TransformTest, BasicAdditionLargeNulls)
+{
+  auto N = 2000;
+  auto a = thrust::make_counting_iterator(0);
+
+  auto validities = std::vector(N);
+  std::fill(validities.begin(), validities.begin() + N / 2, 0);
+  std::fill(validities.begin() + (N / 2), validities.end(), 1);
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::shuffle(validities.begin(), validities.end(), gen);
+
+  auto col = column_wrapper(a, a + N, validities.begin());
+  auto table = cudf::table_view{{col}};
+
+  auto col_ref = cudf::ast::column_reference(0);
+  auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, col_ref);
+
+  auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; });
+  auto expected = column_wrapper(b, b + N, validities.begin());
+  auto result = cudf::ast::compute_column(table, expression);
+
+  cudf::test::expect_columns_equal(expected, result->view(), true);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/binaryop/assert-binops.h b/cpp/tests/binaryop/assert-binops.h
index 9e762a1c987..65859251e42 100644
--- a/cpp/tests/binaryop/assert-binops.h
+++ b/cpp/tests/binaryop/assert-binops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Copyright 2018-2019 BlazingDB, Inc.
  * Copyright 2018 Christian Noboa Mardini
  *
@@ -36,28 +36,21 @@ namespace binop {
 // result returned by the binop operation into string, which is then used for display purposes
 // when the values do not match.
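The `BasicAdditionNulls` test above encodes AND-semantics for validity: an output row is valid only when both operands are valid. A host-side model of those semantics (illustrative sketch, not the libcudf implementation):

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Host-side model (not the libcudf implementation) of the null semantics the
// tests above expect: an output row is valid only when both input rows are
// valid, and null output rows carry an unspecified value (0 here).
struct nullable_ints {
  std::vector<int32_t> values;
  std::vector<bool> valid;  // true = valid, false = null
};

nullable_ints add(nullable_ints const& lhs, nullable_ints const& rhs)
{
  nullable_ints out{std::vector<int32_t>(lhs.values.size(), 0),
                    std::vector<bool>(lhs.values.size(), false)};
  for (std::size_t i = 0; i < lhs.values.size(); ++i) {
    out.valid[i] = lhs.valid[i] && rhs.valid[i];  // nulls propagate through AND
    if (out.valid[i]) { out.values[i] = lhs.values[i] + rhs.values[i]; }
  }
  return out;
}

int main()
{
  // same data as the BasicAdditionNulls test above
  nullable_ints c0{{3, 20, 1, 50}, {false, false, true, true}};
  nullable_ints c1{{10, 7, 20, 0}, {false, true, false, true}};
  auto const r = add(c0, c1);
  for (std::size_t i = 0; i < r.values.size(); ++i) {
    std::cout << (r.valid[i] ? std::to_string(r.values[i]) : "null") << " ";
  }
  std::cout << "\n";  // prints: null null null 50
}
```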
struct stringify_out_values { - template ()>* = nullptr> - std::string operator()(TypeOut lhs, TypeOut rhs) const + template + std::string operator()(size_type i, TypeOut lhs, TypeOut rhs) const { std::stringstream out_str; - out_str << "lhs: " << lhs << "\nrhs: " << rhs; - return out_str.str(); - } - - template ()>* = nullptr> - std::string operator()(TypeOut lhs, TypeOut rhs) const - { - std::stringstream out_str; - out_str << "lhs: " << lhs.time_since_epoch().count() - << "\nrhs: " << rhs.time_since_epoch().count(); - return out_str.str(); - } - - template ()>* = nullptr> - std::string operator()(TypeOut lhs, TypeOut rhs) const - { - std::stringstream out_str; - out_str << "lhs: " << lhs.count() << "\nrhs: " << rhs.count(); + out_str << "[" << i << "]:\n"; + if constexpr (is_fixed_point()) { + out_str << "lhs: " << std::string(lhs) << "\nrhs: " << std::string(rhs); + } else if constexpr (is_timestamp()) { + out_str << "lhs: " << lhs.time_since_epoch().count() + << "\nrhs: " << rhs.time_since_epoch().count(); + } else if constexpr (is_duration()) { + out_str << "lhs: " << lhs.count() << "\nrhs: " << rhs.count(); + } else { + out_str << "lhs: " << lhs << "\nrhs: " << rhs; + } return out_str.str(); } }; @@ -101,7 +94,7 @@ void ASSERT_BINOP(column_view const& out, for (size_t i = 0; i < out_data.size(); ++i) { auto lhs = out_data[i]; auto rhs = (TypeOut)(op(lhs_h, rhs_data[i])); - ASSERT_TRUE(value_comparator(lhs, rhs)) << stringify_out_values{}(lhs, rhs); + ASSERT_TRUE(value_comparator(lhs, rhs)) << stringify_out_values{}(i, lhs, rhs); } if (rhs.nullable()) { @@ -148,7 +141,7 @@ void ASSERT_BINOP(column_view const& out, for (size_t i = 0; i < out_data.size(); ++i) { auto lhs = out_data[i]; auto rhs = (TypeOut)(op(lhs_data[i], rhs_h)); - ASSERT_TRUE(value_comparator(lhs, rhs)) << stringify_out_values{}(lhs, rhs); + ASSERT_TRUE(value_comparator(lhs, rhs)) << stringify_out_values{}(i, lhs, rhs); } if (lhs.nullable()) { @@ -196,7 +189,7 @@ void ASSERT_BINOP(column_view const& out, for (size_t i = 0; i < out_data.size(); ++i) { auto lhs = out_data[i]; auto rhs = (TypeOut)(op(lhs_data[i], rhs_data[i])); - ASSERT_TRUE(value_comparator(lhs, rhs)) << stringify_out_values{}(lhs, rhs); + ASSERT_TRUE(value_comparator(lhs, rhs)) << stringify_out_values{}(i, lhs, rhs); } if (lhs.nullable() and rhs.nullable()) { diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp new file mode 100644 index 00000000000..feb75cc3f09 --- /dev/null +++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp @@ -0,0 +1,721 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include "cudf/utilities/error.hpp" + +namespace cudf::test::binop { + +template +struct FixedPointCompiledTestBothReps : public cudf::test::BaseFixture { +}; + +template +using wrapper = cudf::test::fixed_width_column_wrapper; +TYPED_TEST_CASE(FixedPointCompiledTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd) +{ + using namespace numeric; + using decimalXX = TypeParam; + + auto const sz = std::size_t{1000}; + + auto begin = cudf::detail::make_counting_transform_iterator(1, [](auto i) { + return decimalXX{i, scale_type{0}}; + }); + auto const vec1 = std::vector(begin, begin + sz); + auto const vec2 = std::vector(sz, decimalXX{2, scale_type{0}}); + auto expected = std::vector(sz); + + std::transform(std::cbegin(vec1), + std::cend(vec1), + std::cbegin(vec2), + std::begin(expected), + std::plus()); + + auto const lhs = wrapper(vec1.begin(), vec1.end()); + auto const rhs = wrapper(vec2.begin(), vec2.end()); + auto const expected_col = wrapper(expected.begin(), expected.end()); + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpMultiply) +{ + using namespace numeric; + using decimalXX = TypeParam; + + auto const sz = std::size_t{1000}; + + auto begin = cudf::detail::make_counting_transform_iterator(1, [](auto i) { + return decimalXX{i, scale_type{0}}; + }); + auto const vec1 = std::vector(begin, begin + sz); + auto const vec2 = std::vector(sz, decimalXX{2, scale_type{0}}); + auto expected = std::vector(sz); + + std::transform(std::cbegin(vec1), + std::cend(vec1), + std::cbegin(vec2), + std::begin(expected), + std::multiplies()); + + auto const lhs = wrapper(vec1.begin(), vec1.end()); + auto const rhs = wrapper(vec2.begin(), vec2.end()); + auto const expected_col = wrapper(expected.begin(), expected.end()); + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, result->view()); +} + +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpMultiply2) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; + auto const rhs = fp_wrapper{{10, 10, 10, 10, 10}, scale_type{0}}; + auto const expected = fp_wrapper{{110, 220, 330, 440, 550}, scale_type{-1}}; + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::MUL, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::MUL, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = 
device_storage_type_t; + + auto const lhs = fp_wrapper{{10, 30, 50, 70}, scale_type{-1}}; + auto const rhs = fp_wrapper{{4, 4, 4, 4}, scale_type{0}}; + auto const expected = fp_wrapper{{2, 7, 12, 17}, scale_type{-1}}; + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv2) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{10, 30, 50, 70}, scale_type{-1}}; + auto const rhs = fp_wrapper{{4, 4, 4, 4}, scale_type{-2}}; + auto const expected = fp_wrapper{{2, 7, 12, 17}, scale_type{1}}; + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::DIV, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv3) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{10, 30, 50, 70}, scale_type{-1}}; + auto const rhs = make_fixed_point_scalar(12, scale_type{-1}); + auto const expected = fp_wrapper{{0, 2, 4, 5}, scale_type{0}}; + + auto const type = cudf::binary_operation_fixed_point_output_type( + cudf::binary_operator::DIV, static_cast(lhs).type(), rhs->type()); + auto const result = + cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpDiv4) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto begin = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 11; }); + auto result_begin = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i * 11) / 12; }); + auto const lhs = fp_wrapper(begin, begin + 1000, scale_type{-1}); + auto const rhs = make_fixed_point_scalar(12, scale_type{-1}); + auto const expected = fp_wrapper(result_begin, result_begin + 1000, scale_type{0}); + + auto const type = cudf::binary_operation_fixed_point_output_type( + cudf::binary_operator::DIV, static_cast(lhs).type(), rhs->type()); + auto const result = + cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd2) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; + auto const rhs = fp_wrapper{{100, 200, 300, 400, 500}, scale_type{-2}}; + auto const expected = fp_wrapper{{210, 420, 630, 840, 1050}, scale_type{-2}}; + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + 
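The addition tests above depend on the decimal scale rule in which the result adopts the smaller (more precise) scale, e.g. scale -1 plus scale -2 yields scale -2. A host-side sketch of that rescale-then-add behavior on scaled integers (assumed semantics, not the libcudf kernel):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Host-side sketch (not the libcudf kernel) of the assumed scale rule for
// fixed-point addition: rescale both operands to the smaller (more precise)
// scale, add the unscaled values, and keep that scale.
// The represented value is unscaled * 10^scale.
struct decimal {
  int64_t unscaled;
  int32_t scale;
};

int64_t pow10(int32_t n)
{
  int64_t r = 1;
  while (n-- > 0) { r *= 10; }
  return r;
}

decimal add(decimal a, decimal b)
{
  auto const scale = std::min(a.scale, b.scale);
  auto const av    = a.unscaled * pow10(a.scale - scale);  // exponent is >= 0
  auto const bv    = b.unscaled * pow10(b.scale - scale);
  return {av + bv, scale};
}

int main()
{
  // 1.1 (11 at scale -1) + 1.00 (100 at scale -2) == 2.10 (210 at scale -2),
  // matching the FixedPointBinaryOpAdd2 expectation above
  auto const r = add(decimal{11, -1}, decimal{100, -2});
  std::cout << r.unscaled << " * 10^" << r.scale << "\n";  // 210 * 10^-2
}
```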
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd3) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{1100, 2200, 3300, 4400, 5500}, scale_type{-3}}; + auto const rhs = fp_wrapper{{100, 200, 300, 400, 500}, scale_type{-2}}; + auto const expected = fp_wrapper{{2100, 4200, 6300, 8400, 10500}, scale_type{-3}}; + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd4) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; + auto const rhs = make_fixed_point_scalar(100, scale_type{-2}); + auto const expected = fp_wrapper{{210, 320, 430, 540, 650}, scale_type{-2}}; + + auto const type = cudf::binary_operation_fixed_point_output_type( + cudf::binary_operator::ADD, static_cast(lhs).type(), rhs->type()); + auto const result = + cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::ADD, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd5) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = make_fixed_point_scalar(100, scale_type{-2}); + auto const rhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; + auto const expected = fp_wrapper{{210, 320, 430, 540, 650}, scale_type{-2}}; + + auto const type = cudf::binary_operation_fixed_point_output_type( + cudf::binary_operator::ADD, lhs->type(), static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(*lhs, rhs, cudf::binary_operator::ADD, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd6) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const col = fp_wrapper{{30, 4, 5, 6, 7, 8}, scale_type{0}}; + + auto const expected1 = fp_wrapper{{60, 8, 10, 12, 14, 16}, scale_type{0}}; + auto const expected2 = fp_wrapper{{6, 0, 1, 1, 1, 1}, scale_type{1}}; + auto const type1 = cudf::data_type{cudf::type_to_id(), 0}; + auto const type2 = cudf::data_type{cudf::type_to_id(), 1}; + auto const result1 = + cudf::experimental::binary_operation(col, col, cudf::binary_operator::ADD, type1); + auto const result2 = + cudf::experimental::binary_operation(col, col, cudf::binary_operator::ADD, type2); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, result2->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, result1->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointCast) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const col = fp_wrapper{{6, 8, 10, 12, 14, 16}, scale_type{0}}; + auto const expected = fp_wrapper{{0, 0, 1, 1, 1, 1}, scale_type{1}}; + auto const type = cudf::data_type{cudf::type_to_id(), 1}; + auto const result = cudf::cast(col, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpMultiplyScalar) +{ + 
using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{11, 22, 33, 44, 55}, scale_type{-1}}; + auto const rhs = make_fixed_point_scalar(100, scale_type{-1}); + auto const expected = fp_wrapper{{1100, 2200, 3300, 4400, 5500}, scale_type{-2}}; + + auto const type = cudf::binary_operation_fixed_point_output_type( + cudf::binary_operator::MUL, static_cast(lhs).type(), rhs->type()); + auto const result = + cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::MUL, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpSimplePlus) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const lhs = fp_wrapper{{150, 200}, scale_type{-2}}; + auto const rhs = fp_wrapper{{2250, 1005}, scale_type{-3}}; + auto const expected = fp_wrapper{{3750, 3005}, scale_type{-3}}; + + auto const type = + cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD, + static_cast(lhs).type(), + static_cast(rhs).type()); + auto const result = + cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::ADD, type); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimple) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const trues = std::vector(4, true); + auto const col1 = fp_wrapper{{1, 2, 3, 4}, scale_type{0}}; + auto const col2 = fp_wrapper{{100, 200, 300, 400}, scale_type{-2}}; + auto const expected = wrapper(trues.begin(), trues.end()); + + auto const result = cudf::experimental::binary_operation( + col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale0) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const trues = std::vector(4, true); + auto const col = fp_wrapper{{1, 2, 3, 4}, scale_type{0}}; + auto const expected = wrapper(trues.begin(), trues.end()); + + auto const result = cudf::experimental::binary_operation( + col, col, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale0Null) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const col1 = fp_wrapper{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{0}}; + auto const col2 = fp_wrapper{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}}; + auto const expected = wrapper{{0, 1, 0, 1}, {0, 0, 0, 0}}; + + auto const result = cudf::experimental::binary_operation( + col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + +TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale2Null) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = device_storage_type_t; + + auto const col1 = fp_wrapper{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{-2}}; + auto const col2 = fp_wrapper{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}}; + auto const expected = wrapper{{0, 1, 0, 1}, {0, 0, 0, 0}}; + + auto const result = cudf::experimental::binary_operation( + 
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualSimpleScale2Null)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const col1     = fp_wrapper<RepType>{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{-2}};
+  auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
+  auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
+
+  auto const result = cudf::experimental::binary_operation(
+    col1, col2, binary_operator::EQUAL, cudf::data_type{type_id::BOOL8});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpEqualLessGreater)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const sz = std::size_t{1000};
+
+  // TESTING binary op ADD
+
+  auto begin = cudf::detail::make_counting_transform_iterator(1, [](auto e) { return e * 1000; });
+  auto const vec1 = std::vector<RepType>(begin, begin + sz);
+  auto const vec2 = std::vector<RepType>(sz, 0);
+
+  auto const iota_3  = fp_wrapper<RepType>(vec1.begin(), vec1.end(), scale_type{-3});
+  auto const zeros_3 = fp_wrapper<RepType>(vec2.begin(), vec2.end(), scale_type{-1});
+
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::ADD,
+                                                   static_cast<cudf::column_view>(iota_3).type(),
+                                                   static_cast<cudf::column_view>(zeros_3).type());
+  auto const iota_3_after_add =
+    cudf::experimental::binary_operation(zeros_3, iota_3, binary_operator::ADD, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(iota_3, iota_3_after_add->view());
+
+  // TESTING binary op EQUAL, LESS, GREATER
+
+  auto const trues    = std::vector<bool>(sz, true);
+  auto const true_col = wrapper<bool>(trues.begin(), trues.end());
+
+  auto const btype        = cudf::data_type{type_id::BOOL8};
+  auto const equal_result = cudf::experimental::binary_operation(
+    iota_3, iota_3_after_add->view(), binary_operator::EQUAL, btype);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, equal_result->view());
+
+  auto const less_result = cudf::experimental::binary_operation(
+    zeros_3, iota_3_after_add->view(), binary_operator::LESS, btype);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, less_result->view());
+
+  auto const greater_result = cudf::experimental::binary_operation(
+    iota_3_after_add->view(), zeros_3, binary_operator::GREATER, btype);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(true_col, greater_result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpNullMaxSimple)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const trues    = std::vector<bool>(4, true);
+  auto const col1     = fp_wrapper<RepType>{{40, 30, 20, 10, 0}, {1, 0, 1, 1, 0}, scale_type{-2}};
+  auto const col2     = fp_wrapper<RepType>{{10, 20, 30, 40, 0}, {1, 1, 1, 0, 0}, scale_type{-2}};
+  auto const expected = fp_wrapper<RepType>{{40, 20, 30, 10, 0}, {1, 1, 1, 1, 0}, scale_type{-2}};
+
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MAX,
+                                                   static_cast<cudf::column_view>(col1).type(),
+                                                   static_cast<cudf::column_view>(col2).type());
+  auto const result =
+    cudf::experimental::binary_operation(col1, col2, binary_operator::NULL_MAX, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpNullMinSimple)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const trues    = std::vector<bool>(4, true);
+  auto const col1     = fp_wrapper<RepType>{{40, 30, 20, 10, 0}, {1, 1, 1, 0, 0}, scale_type{-1}};
+  auto const col2     = fp_wrapper<RepType>{{10, 20, 30, 40, 0}, {1, 0, 1, 1, 0}, scale_type{-1}};
+  auto const expected = fp_wrapper<RepType>{{10, 30, 20, 40, 0}, {1, 1, 1, 1, 0}, scale_type{-1}};
+
+  auto const type =
+    cudf::binary_operation_fixed_point_output_type(cudf::binary_operator::NULL_MIN,
+                                                   static_cast<cudf::column_view>(col1).type(),
+                                                   static_cast<cudf::column_view>(col2).type());
+  auto const result =
+    cudf::experimental::binary_operation(col1, col2, binary_operator::NULL_MIN, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
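+// By contrast with EQUAL, the NULL_MAX/NULL_MIN tests above expect null-aware
+// semantics: a lone null operand is ignored in favor of the non-null value, and
+// the result is null only when both inputs are null (the final row of each
+// `expected`). NULL_EQUALS, tested next, never yields nulls and compares two
+// nulls as equal.
+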
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpNullEqualsSimple)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const trues    = std::vector<bool>(4, true);
+  auto const col1     = fp_wrapper<RepType>{{400, 300, 300, 100}, {1, 1, 1, 0}, scale_type{-2}};
+  auto const col2     = fp_wrapper<RepType>{{40, 200, 20, 400}, {1, 0, 1, 0}, scale_type{-1}};
+  auto const expected = wrapper<bool>{{1, 0, 0, 1}, {1, 1, 1, 1}};
+
+  auto const result = cudf::experimental::binary_operation(
+    col1, col2, binary_operator::NULL_EQUALS, cudf::data_type{type_id::BOOL8});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
+  auto const rhs      = fp_wrapper<RepType>{{4, 4, 4, 4}, scale_type{0}};
+  auto const expected = fp_wrapper<RepType>{{25, 75, 125, 175}, scale_type{-2}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), -2};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div2)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{100000, 300000, 500000, 700000}, scale_type{-3}};
+  auto const rhs      = fp_wrapper<RepType>{{20, 20, 20, 20}, scale_type{-1}};
+  auto const expected = fp_wrapper<RepType>{{5000, 15000, 25000, 35000}, scale_type{-2}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), -2};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div3)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{10000, 30000, 50000, 70000}, scale_type{-2}};
+  auto const rhs      = fp_wrapper<RepType>{{3, 9, 3, 3}, scale_type{0}};
+  auto const expected = fp_wrapper<RepType>{{3333, 3333, 16666, 23333}, scale_type{-2}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), -2};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div4)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{1}};
+  auto const rhs      = make_fixed_point_scalar<decimalXX>(3, scale_type{0});
+  auto const expected = fp_wrapper<RepType>{{3, 10, 16, 23}, scale_type{1}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), 1};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
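+// A worked example for Div6 below: lhs is 3000 * 10^-3 = 3.000 and rhs is
+// {1.0, 3.0, 5.0, 7.0}. Fixed-point division yields scale
+// lhs_scale - rhs_scale = -3 - (-1) = -2, so e.g. 3 / 7 = 0.428... truncates to
+// rep 42 at scale -2, matching the requested output type.
+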
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div6)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs = make_fixed_point_scalar<decimalXX>(3000, scale_type{-3});
+  auto const rhs = fp_wrapper<RepType>{{10, 30, 50, 70}, scale_type{-1}};
+
+  auto const expected = fp_wrapper<RepType>{{300, 100, 60, 42}, scale_type{-2}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), -2};
+  auto const result =
+    cudf::experimental::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div7)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs = make_fixed_point_scalar<decimalXX>(1200, scale_type{0});
+  auto const rhs = fp_wrapper<RepType>{{100, 200, 300, 500, 600, 800, 1200, 1300}, scale_type{-2}};
+
+  auto const expected = fp_wrapper<RepType>{{12, 6, 4, 2, 2, 1, 1, 0}, scale_type{2}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), 2};
+  auto const result =
+    cudf::experimental::binary_operation(*lhs, rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div8)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{4000, 6000, 80000}, scale_type{-1}};
+  auto const rhs      = make_fixed_point_scalar<decimalXX>(5000, scale_type{-3});
+  auto const expected = fp_wrapper<RepType>{{0, 1, 16}, scale_type{2}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), 2};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div9)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{10, 20, 30}, scale_type{2}};
+  auto const rhs      = make_fixed_point_scalar<decimalXX>(7, scale_type{1});
+  auto const expected = fp_wrapper<RepType>{{1, 2, 4}, scale_type{1}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), 1};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div10)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{100, 200, 300}, scale_type{1}};
+  auto const rhs      = make_fixed_point_scalar<decimalXX>(7, scale_type{0});
+  auto const expected = fp_wrapper<RepType>{{14, 28, 42}, scale_type{1}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), 1};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, *rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOp_Div11)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const lhs      = fp_wrapper<RepType>{{1000, 2000, 3000}, scale_type{1}};
+  auto const rhs      = fp_wrapper<RepType>{{7, 7, 7}, scale_type{0}};
+  auto const expected = fp_wrapper<RepType>{{142, 285, 428}, scale_type{1}};
+
+  auto const type = data_type{type_to_id<decimalXX>(), 1};
+  auto const result =
+    cudf::experimental::binary_operation(lhs, rhs, cudf::binary_operator::DIV, type);
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
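+// The throw test below exercises validation rather than arithmetic: comparison
+// operators such as LESS must produce a BOOL8 output column, so requesting a
+// decimal output type should raise cudf::logic_error.
+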
+TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpThrows)
+{
+  using namespace numeric;
+  using decimalXX = TypeParam;
+  using RepType   = device_storage_type_t<decimalXX>;
+
+  auto const col           = fp_wrapper<RepType>{{100, 300, 500, 700}, scale_type{-2}};
+  auto const non_bool_type = data_type{type_to_id<decimalXX>(), -2};
+  auto const float_type    = data_type{type_id::FLOAT32};
+  EXPECT_THROW(
+    cudf::experimental::binary_operation(col, col, cudf::binary_operator::LESS, non_bool_type),
+    cudf::logic_error);
+  // Allowed in the compiled path now, but still disallowed in the jit path:
+  // EXPECT_THROW(cudf::experimental::binary_operation(col, col, cudf::binary_operator::MUL,
+  //                                                   float_type),
+  //              cudf::logic_error);
+}
+
+}  // namespace cudf::test::binop
diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
new file mode 100644
index 00000000000..081ae41fef1
--- /dev/null
+++ b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -0,0 +1,610 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/type_list_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+#include <tests/binaryop/assert-binops.h>
+#include <tests/binaryop/binop-fixture.hpp>
+
+#include <cudf/binaryop.hpp>
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/types.hpp>
+
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+#include "cudf/utilities/error.hpp"
+
+#include <type_traits>
+
+namespace cudf::test::binop {
+
+template <typename TypeLhs>
+auto lhs_random_column(size_type size)
+{
+  return BinaryOperationTest::make_random_wrapped_column<TypeLhs>(size);
+}
+
+template <>
+auto lhs_random_column<std::string>(size_type size)
+{
+  return cudf::test::strings_column_wrapper({"eee", "bb", "<null>", "", "aa", "bbb", "ééé"},
+                                            {1, 1, 0, 1, 1, 1, 1});
+}
+template <typename TypeRhs>
+auto rhs_random_column(size_type size)
+{
+  return BinaryOperationTest::make_random_wrapped_column<TypeRhs>(size);
+}
+template <>
+auto rhs_random_column<std::string>(size_type size)
+{
+  return cudf::test::strings_column_wrapper({"ééé", "bbb", "aa", "", "<null>", "bb", "eee"},
+                                            {1, 1, 1, 1, 0, 1, 1});
+}
+
+// combinations to test
+//     n      t     d
+// n   n.n    n.t   n.d
+// t   t.n    t.t   t.d
+// d   d.n    d.t   d.d
+
+constexpr size_type col_size = 10000;
+template <typename T>
+struct BinaryOperationCompiledTest : public BinaryOperationTest {
+  using TypeOut = cudf::test::GetType<T, 0>;
+  using TypeLhs = cudf::test::GetType<T, 1>;
+  using TypeRhs = cudf::test::GetType<T, 2>;
+
+  template