From 88e6a293384224ec3f5564d02cbddfe1a8f3b45b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 11 Jan 2022 13:59:53 -0800 Subject: [PATCH 01/23] Wrap CI script shell variables in quotes to fix local testing. (#10018) This is a tiny PR that wraps shell script variables in quotes. This fixes an issue I saw in the upload script when `${BUILD_MODE}` was not set during local testing of the CI environment. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) URL: https://github.com/rapidsai/cudf/pull/10018 --- ci/benchmark/build.sh | 2 +- ci/cpu/upload.sh | 2 +- ci/gpu/build.sh | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 979db1b5034..59bd908d151 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -98,7 +98,7 @@ conda list --show-channel-urls ################################################################################ logger "Build libcudf..." -if [[ ${BUILD_MODE} == "pull-request" ]]; then +if [[ "${BUILD_MODE}" == "pull-request" ]]; then "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds else "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests -l --ptds diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 40e80def8ae..e6ef72d930c 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -12,7 +12,7 @@ export GPUCI_RETRY_SLEEP=30 export LABEL_OPTION=${LABEL_OPTION:-"--label main"} # Skip uploads unless BUILD_MODE == "branch" -if [ ${BUILD_MODE} != "branch" ]; then +if [ "${BUILD_MODE}" != "branch" ]; then echo "Skipping upload" return 0 fi diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 059e359e4e9..a8afc03af94 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -124,7 +124,7 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then ################################################################################ gpuci_logger "Build from source" - if [[ ${BUILD_MODE} == "pull-request" ]]; then + if [[ "${BUILD_MODE}" == "pull-request" ]]; then "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds else "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests -l --ptds @@ -222,7 +222,7 @@ else install_dask gpuci_logger "Build python libs from source" - if [[ ${BUILD_MODE} == "pull-request" ]]; then + if [[ "${BUILD_MODE}" == "pull-request" ]]; then "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds else "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka -l --ptds From 25a7485eb752c66e042012e78f0832199ab20aeb Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 11 Jan 2022 17:28:30 -0500 Subject: [PATCH 02/23] Fix regex doc describing hexadecimal escape characters (#10009) Fixes a documentation error found while diagnosing a hex regex pattern question. The hex escape sequence only specifies a single character (not a single byte). So this means it can only be used to match ASCII characters (code-points 0-127) and not all UTF-8 characters. This is the same as for octal escape sequences. Also, the example provided for hex in the documentation has been corrected to use a valid ASCII character. 
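For reference, the corrected rule is easy to check from the Python layer, which drives this same regex engine through `Series.str.contains`. A minimal sketch, assuming a working cudf build (the sample strings are made up for illustration):

```python
import cudf

s = cudf.Series(["user@host", "plain text"])
# \x40 is ASCII 0x40 ('@'), so only the first row matches. Code points
# above 127 (such as the '©' used in the old, incorrect example) cannot
# be reached with a two-digit hex escape.
print(s.str.contains(r"\x40"))
# 0     True
# 1    False
# dtype: bool
```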
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/10009 --- cpp/doxygen/regex.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index b721448b45a..76ebb48d195 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -30,7 +30,7 @@ The details are based on features documented at https://www.regular-expressions. | Literal character | Any character except `[\^$.⎮?*+()` | All characters except the listed special characters match a single instance of themselves | `a` matches `a` | | Literal curly braces | `{` and `}` | `{` and `}` are literal characters, unless they are part of a valid regular expression token such as a quantifier `{3}` | `{` matches `{` | | Backslash escapes a metacharacter | `\` followed by any of `[\^$.⎮?*+(){}` | A backslash escapes special characters to suppress their special meaning | `\*` matches `*` | -| Hexadecimal escape | `\xFF` where `FF` are 2 hexadecimal digits | Matches the character at the specified position in the code page | `\xA9` matches `©` | +| Hexadecimal escape | `\xFF` where `FF` are 2 hexadecimal digits | Matches the character at the specified position in the ASCII table | `\x40` matches `@` | | Character escape | `\n`, `\r` and `\t` | Match an line-feed (LF) character, carriage return (CR) character and a tab character respectively | `\r\n` matches a Windows CRLF line break | | Character escape | `\a` | Match the "alert" or "bell" control character (ASCII 0x07) | | | Character escape | `\f` | Match the form-feed control character (ASCII 0x0C) | | From 3216342f01d198cfbe2ef9e2ac861674414dc493 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 11 Jan 2022 17:04:59 -0600 Subject: [PATCH 03/23] Raise in `query` if dtype is not supported (#9921) Closes https://github.com/rapidsai/cudf/issues/9894 Authors: - https://github.com/brandon-b-miller Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9921 --- python/cudf/cudf/tests/test_query.py | 23 ++++++++++++++++++++ python/cudf/cudf/utils/queryutils.py | 32 ++++++++++++++++++++++------ 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index 9a02d5145bb..3de38b2cf6f 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -209,3 +209,26 @@ def test_query_with_index_keyword(query, a_val, b_val, c_val): expect = pdf.query(query) assert_eq(out, expect) + + +@pytest.mark.parametrize( + "data, query", + [ + # Only need to test the dtypes that pandas + # supports but that we do not + (["a", "b", "c"], "data == 'a'"), + ], +) +def test_query_unsupported_dtypes(data, query): + gdf = cudf.DataFrame({"data": data}) + + # make sure the query works in pandas + pdf = gdf.to_pandas() + pdf_result = pdf.query(query) + + expect = pd.DataFrame({"data": ["a"]}) + assert_eq(expect, pdf_result) + + # but fails in cuDF + with pytest.raises(TypeError): + gdf.query(query) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 217466a5a1b..d9153c2b1d2 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -10,9 +10,20 @@ import cudf from cudf.core.column import column_empty from cudf.utils import 
applyutils +from cudf.utils.dtypes import ( + BOOL_TYPES, + DATETIME_TYPES, + NUMERIC_TYPES, + TIMEDELTA_TYPES, +) ENVREF_PREFIX = "__CUDF_ENVREF__" +SUPPORTED_QUERY_TYPES = { + np.dtype(dt) + for dt in NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES +} + class QuerySyntaxError(ValueError): pass @@ -197,6 +208,20 @@ def query_execute(df, expr, callenv): # compile compiled = query_compile(expr) + columns = compiled["colnames"] + + # prepare col args + colarrays = [cudf.core.dataframe.extract_col(df, col) for col in columns] + + # wait to check the types until we know which cols are used + if any(col.dtype not in SUPPORTED_QUERY_TYPES for col in colarrays): + raise TypeError( + "query only supports numeric, datetime, timedelta, " + "or bool dtypes." + ) + + colarrays = [col.data_array_view for col in colarrays] + kernel = compiled["kernel"] # process env args envargs = [] @@ -214,13 +239,6 @@ def query_execute(df, expr, callenv): raise NameError(msg.format(name)) else: envargs.append(val) - columns = compiled["colnames"] - # prepare col args - - colarrays = [ - cudf.core.dataframe.extract_col(df, col).data_array_view - for col in columns - ] # allocate output buffer nrows = len(df) From 813ac97b2143c8d1d8ca95435863f5234408a681 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 11 Jan 2022 15:16:25 -0800 Subject: [PATCH 04/23] Use list of column inputs for `apply_boolean_mask` (#9832) This PR brings changes from #9558 to `apply_boolean_mask` and removes the `as_frame` -> `as_column` round trip. Benchmark the column method: ``` ------------------------------------- benchmark 'col0': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col0] (afte) 87.0090 (1.0) 132.8980 (1.0) 95.8815 (1.0) column_apply_boolean_mask[col0] (befo) 210.4580 (2.42) 307.8270 (2.32) 225.4821 (2.35) ----------------------------------------------------------------------------------------------------- ------------------------------------- benchmark 'col1': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col1] (afte) 74.2240 (1.0) 110.0600 (1.0) 75.6356 (1.0) column_apply_boolean_mask[col1] (befo) 172.5240 (2.32) 278.5250 (2.53) 176.5672 (2.33) ----------------------------------------------------------------------------------------------------- ------------------------------------- benchmark 'col2': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col2] (afte) 101.5740 (1.0) 141.8850 (1.0) 110.2334 (1.0) column_apply_boolean_mask[col2] (befo) 234.1140 (2.30) 312.7140 (2.20) 245.5453 (2.23) ----------------------------------------------------------------------------------------------------- ------------------------------------- benchmark 'col3': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col3] (afte) 88.7710 (1.0) 142.7500 (1.0) 90.5082 (1.0) column_apply_boolean_mask[col3] (befo) 195.0980 (2.20) 303.1020 (2.12) 199.8368 (2.21) 
----------------------------------------------------------------------------------------------------- ``` Dataframe benchmark ``` ----------------------------------- benchmark '100': 2 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------ df_apply_boolean_mask[100] (afte) 380.6770 (1.05) 654.7080 (1.18) 389.3374 (1.03) df_apply_boolean_mask[100] (befo) 362.3220 (1.0) 554.6130 (1.0) 378.7087 (1.0) ------------------------------------------------------------------------------------------------ ----------------------------------- benchmark '10000': 2 tests ----------------------------------- Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------- df_apply_boolean_mask[10000] (afte) 399.5240 (1.05) 461.6310 (1.0) 405.1225 (1.04) df_apply_boolean_mask[10000] (befo) 379.4080 (1.0) 564.5770 (1.22) 389.6990 (1.0) -------------------------------------------------------------------------------------------------- ``` Authors: - Michael Wang (https://github.com/isVoid) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9832 --- python/cudf/cudf/_lib/stream_compaction.pyx | 18 +++++---------- python/cudf/cudf/core/_base_index.py | 25 ++++++++++++++++++++- python/cudf/cudf/core/algorithms.py | 4 ++-- python/cudf/cudf/core/column/column.py | 10 ++++++--- python/cudf/cudf/core/frame.py | 13 ----------- python/cudf/cudf/core/indexed_frame.py | 20 +++++++++++++++++ 6 files changed, 58 insertions(+), 32 deletions(-) diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index ef47e843723..4330c565982 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -75,24 +75,22 @@ def drop_nulls(columns: list, how="any", keys=None, thresh=None): return columns_from_unique_ptr(move(c_result)) -def apply_boolean_mask(source_table, Column boolean_mask): +def apply_boolean_mask(columns: list, Column boolean_mask): """ Drops the rows which correspond to False in boolean_mask. 
Parameters ---------- - source_table : source table whose rows are dropped as per boolean_mask + columns : list of columns whose rows are dropped as per boolean_mask boolean_mask : a boolean column of same size as source_table Returns ------- - Frame obtained from applying mask + columns obtained from applying mask """ - assert pd.api.types.is_bool_dtype(boolean_mask.dtype) - cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table(source_table) + cdef table_view source_table_view = table_view_from_columns(columns) cdef column_view boolean_mask_view = boolean_mask.view() with nogil: @@ -103,13 +101,7 @@ def apply_boolean_mask(source_table, Column boolean_mask): ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if source_table._index - is None else source_table._index_names) - ) + return columns_from_unique_ptr(move(c_result)) def drop_duplicates(columns: list, diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index aa89b8f849f..683f3fefe1c 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -9,8 +9,15 @@ import pandas as pd import cudf +from cudf._lib.stream_compaction import apply_boolean_mask from cudf._typing import DtypeObj -from cudf.api.types import is_dtype_equal, is_integer, is_list_like, is_scalar +from cudf.api.types import ( + is_bool_dtype, + is_dtype_equal, + is_integer, + is_list_like, + is_scalar, +) from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column from cudf.core.column_accessor import ColumnAccessor @@ -1414,6 +1421,22 @@ def from_pandas(cls, index, nan_as_null=None): def _constructor_expanddim(self): return cudf.MultiIndex + def _apply_boolean_mask(self, boolean_mask): + """Apply boolean mask to each row of `self`. + + Rows corresponding to `False` is dropped. 
+ """ + boolean_mask = cudf.core.column.as_column(boolean_mask) + if not is_bool_dtype(boolean_mask.dtype): + raise ValueError("boolean_mask is not boolean type.") + + result = self.__class__._from_columns( + apply_boolean_mask(list(self._columns), boolean_mask), + column_names=self._column_names, + ) + result._copy_type_metadata(self) + return result + def _split_columns_by_levels(self, levels): if isinstance(levels, int) and levels > 0: raise ValueError(f"Out of bound level: {levels}") diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 18c86f82f9c..a2a909968dc 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -5,8 +5,8 @@ import numpy as np from cudf.core.column import as_column -from cudf.core.frame import Frame from cudf.core.index import Index, RangeIndex +from cudf.core.indexed_frame import IndexedFrame from cudf.core.series import Series @@ -92,7 +92,7 @@ def _index_or_values_interpolation(column, index=None): if num_nan == 0 or num_nan == len(column): return column - to_interp = Frame(data={None: column}, index=index) + to_interp = IndexedFrame(data={None: column}, index=index) known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask)) known_x = known_x_and_y._index._column.values diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c1e037499fc..a966276842f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -35,6 +35,7 @@ ) from cudf._lib.scalar import as_device_scalar from cudf._lib.stream_compaction import ( + apply_boolean_mask, distinct_count as cpp_distinct_count, drop_duplicates, drop_nulls, @@ -997,9 +998,12 @@ def as_decimal32_column( raise NotImplementedError def apply_boolean_mask(self, mask) -> ColumnBase: - mask = as_column(mask, dtype="bool") - return ( - self.as_frame()._apply_boolean_mask(boolean_mask=mask)._as_column() + mask = as_column(mask) + if not is_bool_dtype(mask.dtype): + raise ValueError("boolean_mask is not boolean type.") + + return apply_boolean_mask([self], mask)[0]._with_type_metadata( + self.dtype ) def argsort( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0345966d6bd..6e47c0f41cf 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1461,19 +1461,6 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): return self[out_cols] - def _apply_boolean_mask(self, boolean_mask): - """ - Applies boolean mask to each row of `self`, - rows corresponding to `False` is dropped - """ - result = self.__class__._from_data( - *libcudf.stream_compaction.apply_boolean_mask( - self, as_column(boolean_mask) - ) - ) - result._copy_type_metadata(self) - return result - def interpolate( self, method="linear", diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2f4d4a88195..7c5783bf637 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -19,6 +19,7 @@ from cudf._typing import ColumnLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, + is_bool_dtype, is_categorical_dtype, is_integer_dtype, is_list_like, @@ -1197,6 +1198,25 @@ def resample( else cudf.core.resample.DataFrameResampler(self, by=by) ) + def _apply_boolean_mask(self, boolean_mask): + """Apply boolean mask to each row of `self`. + + Rows corresponding to `False` is dropped. 
+        """
+        boolean_mask = cudf.core.column.as_column(boolean_mask)
+        if not is_bool_dtype(boolean_mask.dtype):
+            raise ValueError("boolean_mask is not boolean type.")
+
+        result = self.__class__._from_columns(
+            libcudf.stream_compaction.apply_boolean_mask(
+                list(self._index._columns + self._columns), boolean_mask
+            ),
+            column_names=self._column_names,
+            index_names=self._index.names,
+        )
+        result._copy_type_metadata(self)
+        return result
+
     def _reset_index(self, level, drop, col_level=0, col_fill=""):
         """Shared path for DataFrame.reset_index and Series.reset_index."""
         if level is not None and not isinstance(level, (tuple, list)):

From a43682e99ab618ec2028cd224abcfc56e2b2fabb Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Tue, 11 Jan 2022 19:20:48 -0500
Subject: [PATCH 05/23] cudftestutil no longer propagates compiler flags to external users (#10017)

Fixes #9952

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Keith Kraus (https://github.com/kkraus14)

URL: https://github.com/rapidsai/cudf/pull/10017
---
 cpp/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 84e486c7e18..a8100fb3f92 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -618,7 +618,7 @@ set_target_properties(
 )

 target_compile_options(
-  cudftestutil PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
+  cudftestutil PUBLIC "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>>"
                       "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>>"
 )

From 093b0ad62ba44b21df2f6f4d23949ef49469d824 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 12 Jan 2022 08:45:06 -0500
Subject: [PATCH 06/23] Add strings tests to transpose_test.cpp (#9985)

This is a follow on to PR #9937. Adds to the gtests in `transpose_test.cpp` to include strings as supported by `cudf::transpose`. No function has changed -- only additional tests have been added. The utility functions in `transpose_test.cpp` were enhanced to include string types by accepting a column-wrapper type template parameter as required.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/9985
---
 cpp/include/cudf_test/type_lists.hpp   |  9 +++-
 cpp/tests/transpose/transpose_test.cpp | 58 ++++++++++++++++----------
 2 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp
index 097d072a5b4..3c46b912639 100644
--- a/cpp/include/cudf_test/type_lists.hpp
+++ b/cpp/include/cudf_test/type_lists.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -158,6 +158,13 @@ std::enable_if_t<cudf::is_timestamp_t<TypeParam>::value, TypeParam> make_type_pa
   return TypeParam{typename TypeParam::duration(init_value)};
 }

+template <typename TypeParam, typename T>
+std::enable_if_t<std::is_same_v<TypeParam, std::string>, TypeParam> make_type_param_scalar(
+  T const init_value)
+{
+  return std::to_string(init_value);
+}
+
 /**
  * @brief Type list for all integral types except type bool.
 */
diff --git a/cpp/tests/transpose/transpose_test.cpp b/cpp/tests/transpose/transpose_test.cpp
index 7b7b7d8a4a9..e3d9808b211 100644
--- a/cpp/tests/transpose/transpose_test.cpp
+++ b/cpp/tests/transpose/transpose_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,9 +22,9 @@
 #include 
 #include 
 #include 
+#include 

 namespace {
-using cudf::test::fixed_width_column_wrapper;

 template <typename T, typename F>
 auto generate_vectors(size_t ncols, size_t nrows, F generator)
@@ -59,10 +59,10 @@ auto transpose_vectors(std::vector<std::vector<T>> const& input)
   return transposed;
 }

-template <typename T>
+template <typename ColumnWrapper, typename T>
 auto make_columns(std::vector<std::vector<T>> const& values)
 {
-  std::vector<fixed_width_column_wrapper<T>> columns;
+  std::vector<ColumnWrapper> columns;
   columns.reserve(values.size());

   for (auto const& value_col : values) {
@@ -72,11 +72,11 @@ auto make_columns(std::vector<std::vector<T>> const& values)
   return columns;
 }

-template <typename T>
+template <typename ColumnWrapper, typename T>
 auto make_columns(std::vector<std::vector<T>> const& values,
                   std::vector<std::vector<bool>> const& valids)
 {
-  std::vector<fixed_width_column_wrapper<T>> columns;
+  std::vector<ColumnWrapper> columns;
   columns.reserve(values.size());

   for (size_t col = 0; col < values.size(); ++col) {
@@ -86,15 +86,14 @@ auto make_columns(std::vector<std::vector<T>> const& values,
   return columns;
 }

-template <typename T>
-auto make_table_view(std::vector<fixed_width_column_wrapper<T>> const& cols)
+template <typename ColumnWrapper>
+auto make_table_view(std::vector<ColumnWrapper> const& cols)
 {
   std::vector<cudf::column_view> views(cols.size());

-  std::transform(
-    cols.begin(), cols.end(), views.begin(), [](fixed_width_column_wrapper<T> const& col) {
-      return static_cast<cudf::column_view>(col);
-    });
+  std::transform(cols.begin(), cols.end(), views.begin(), [](auto const& col) {
+    return static_cast<cudf::column_view>(col);
+  });

   return cudf::table_view(views);
 }
@@ -102,6 +101,10 @@ auto make_table_view(std::vector<fixed_width_column_wrapper<T>> const& cols)
 template <typename T>
 void run_test(size_t ncols, size_t nrows, bool add_nulls)
 {
+  using ColumnWrapper = std::conditional_t<std::is_same_v<T, std::string>,
+                                           cudf::test::strings_column_wrapper,
+                                           cudf::test::fixed_width_column_wrapper<T>>;
+
   std::mt19937 rng(1);

   // Generate values as vector of vectors
@@ -109,8 +112,8 @@ void run_test(size_t ncols, size_t nrows, bool add_nulls)
     ncols, nrows, [&rng]() { return cudf::test::make_type_param_scalar<T>(rng()); });
   auto const valuesT = transpose_vectors(values);

-  std::vector<fixed_width_column_wrapper<T>> input_cols;
-  std::vector<fixed_width_column_wrapper<T>> expected_cols;
+  std::vector<ColumnWrapper> input_cols;
+  std::vector<ColumnWrapper> expected_cols;
   std::vector expected_nulls(nrows);

   if (add_nulls) {
@@ -129,11 +132,11 @@ void run_test(size_t ncols, size_t nrows, bool add_nulls)
     });

     // Create column wrappers from vector of vectors
-    input_cols    = make_columns(values, valids);
-    expected_cols = make_columns(valuesT, validsT);
+    input_cols    = make_columns<ColumnWrapper>(values, valids);
+    expected_cols = make_columns<ColumnWrapper>(valuesT, validsT);
   } else {
-    input_cols    = make_columns(values);
-    expected_cols = make_columns(valuesT);
+    input_cols    = make_columns<ColumnWrapper>(values);
+    expected_cols = make_columns<ColumnWrapper>(valuesT);
   }

   // Create table views from column wrappers
@@ -158,7 +161,13 @@ template <typename T>
 class TransposeTest : public cudf::test::BaseFixture {
 };

-TYPED_TEST_SUITE(TransposeTest, cudf::test::FixedWidthTypes);
+// Using std::string here instead of cudf::test::StringTypes allows us to
+// use std::vector utilities in this file just like the fixed-width types.
+// Should consider changing cudf::test::StringTypes to std::string instead of cudf::string_view.
+using StdStringType = cudf::test::Types; +using TransposeTypes = cudf::test::Concat; + +TYPED_TEST_SUITE(TransposeTest, TransposeTypes); // cudf::test::FixedWidthTypes); TYPED_TEST(TransposeTest, SingleValue) { run_test(1, 1, false); } @@ -182,11 +191,14 @@ TYPED_TEST(TransposeTest, EmptyTable) { run_test(0, 0, false); } TYPED_TEST(TransposeTest, EmptyColumns) { run_test(10, 0, false); } -TYPED_TEST(TransposeTest, MismatchedColumns) +class TransposeTestError : public cudf::test::BaseFixture { +}; + +TEST_F(TransposeTestError, MismatchedColumns) { - fixed_width_column_wrapper col1({1, 2, 3}); - fixed_width_column_wrapper col2{{4, 5, 6}}; - fixed_width_column_wrapper col3{{7, 8, 9}}; + cudf::test::fixed_width_column_wrapper col1({1, 2, 3}); + cudf::test::fixed_width_column_wrapper col2{{4, 5, 6}}; + cudf::test::fixed_width_column_wrapper col3{{7, 8, 9}}; cudf::table_view input{{col1, col2, col3}}; EXPECT_THROW(cudf::transpose(input), cudf::logic_error); } From 76f89db80a64a2aa49b618aad80fe80e34e0332f Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 12 Jan 2022 12:57:00 -0800 Subject: [PATCH 07/23] Update JNI to use new arena mr constructor (#10027) And fix a failing test. Authors: - Rong Ou (https://github.com/rongou) Approvers: - Jason Lowe (https://github.com/jlowe) - Robert (Bobby) Evans (https://github.com/revans2) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/10027 --- java/src/main/native/src/RmmJni.cpp | 6 +++--- java/src/test/java/ai/rapids/cudf/RmmTest.java | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index d07b754c8db..769e8d2f356 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -348,10 +348,10 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j } else if (use_arena_alloc) { if (use_managed_mem) { Initialized_resource = rmm::mr::make_owning_wrapper( - std::make_shared(), pool_size, pool_size); + std::make_shared(), pool_size); } else { Initialized_resource = rmm::mr::make_owning_wrapper( - std::make_shared(), pool_size, pool_size); + std::make_shared(), pool_size); } } else if (use_cuda_async_alloc) { // Use `limiting_resource_adaptor` to set a hard limit on the max pool size since diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java index f9d097158b6..c56b131de86 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -414,7 +414,7 @@ public void testCudaAsyncIsIncompatibleWithManaged() { @Test public void testCudaMemoryBuffer() { - Rmm.initialize(RmmAllocationMode.ARENA, Rmm.logToStderr(), 1024); + Rmm.initialize(RmmAllocationMode.ARENA, Rmm.logToStderr(), 8 * 1024 * 1024); try (CudaMemoryBuffer one = CudaMemoryBuffer.allocate(512); CudaMemoryBuffer two = CudaMemoryBuffer.allocate(1024)) { assertEquals(512, one.length); From b8c4816d2ce5205e7b88e5f9be74bf4ea75dfbf5 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 12 Jan 2022 15:11:17 -0600 Subject: [PATCH 08/23] Unpin `dask` and `distributed` in CI (#10028) This PR unpins dask and distributed in CI. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/10028 --- ci/benchmark/build.sh | 2 +- ci/gpu/build.sh | 2 +- conda/environments/cudf_dev_cuda11.5.yml | 4 ++-- conda/recipes/dask-cudf/meta.yaml | 8 ++++---- python/dask_cudf/dev_requirements.txt | 4 ++-- python/dask_cudf/setup.py | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 59bd908d151..534ac19ee98 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" # Dask & Distributed git tag -export DASK_DISTRIBUTED_GIT_TAG='2021.11.2' +export DASK_DISTRIBUTED_GIT_TAG='main' function remove_libcudf_kernel_cache_dir { EXITCODE=$? diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index a8afc03af94..39a39c46eff 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # Dask & Distributed git tag -export DASK_DISTRIBUTED_GIT_TAG='2021.11.2' +export DASK_DISTRIBUTED_GIT_TAG='main' # ucx-py version export UCX_PY_VERSION='0.24.*' diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index cc8d50a1717..c258a5caabb 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -41,8 +41,8 @@ dependencies: - pydocstyle=6.1.1 - typing_extensions - pre-commit - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 + - dask>=2021.11.1 + - distributed>=2021.11.1 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index da8bcea430a..fd34ff4112d 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -27,14 +27,14 @@ requirements: host: - python - cudf {{ version }} - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 + - dask>=2021.11.1 + - distributed>=2021.11.1 - cudatoolkit {{ cuda_version }} run: - python - cudf {{ version }} - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 + - dask>=2021.11.1 + - distributed>=2021.11.1 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} test: # [linux64] diff --git a/python/dask_cudf/dev_requirements.txt b/python/dask_cudf/dev_requirements.txt index db85515f379..d8b0745be79 100644 --- a/python/dask_cudf/dev_requirements.txt +++ b/python/dask_cudf/dev_requirements.txt @@ -1,7 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
-dask>=2021.11.1,<=2021.11.2 -distributed>=2021.11.1,<=2021.11.2 +dask>=2021.11.1 +distributed>=2021.11.1 fsspec>=0.6.0 numba>=0.53.1 numpy diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index b52c2ea37d6..425839772eb 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -10,8 +10,8 @@ install_requires = [ "cudf", - "dask>=2021.11.1,<=2021.11.2", - "distributed>=2021.11.1,<=2021.11.2", + "dask>=2021.11.1", + "distributed>=2021.11.1", "fsspec>=0.6.0", "numpy", "pandas>=1.0,<1.4.0dev0", From 3176258bb2f1cdd03d80be54fe52208885fc44da Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 12 Jan 2022 15:41:32 -0800 Subject: [PATCH 09/23] Return null count from inplace_bitmask_and. (#9904) This PR updates the function `cudf::detail::inplace_bitmask_and` to return the null count of the result. This change aligns `inplace_bitmask_and` with behavior changes introduced in #9616 to return null counts from functions acting on bitmasks. This will be helpful for #9621. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Nghia Truong (https://github.com/ttnghia) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/9904 --- cpp/include/cudf/detail/null_mask.hpp | 4 ++-- cpp/src/bitmask/null_mask.cu | 14 +++++++------- cpp/src/structs/utilities.cpp | 5 +++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 6ee406de5ef..83ef78a8250 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -268,9 +268,9 @@ std::pair bitmask_or( * @param mask_size_bits The number of bits to be ANDed in each mask * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer - * @return rmm::device_buffer Output bitmask + * @return Count of set bits */ -void inplace_bitmask_and( +cudf::size_type inplace_bitmask_and( device_span dest_mask, host_span masks, host_span masks_begin_bits, diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index ec3776fb6d5..d1107ad3cfd 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -404,14 +404,14 @@ std::vector segmented_null_count(const bitmask_type* bitmask, } // Inplace Bitwise AND of the masks -void inplace_bitmask_and(device_span dest_mask, - host_span masks, - host_span begin_bits, - size_type mask_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +cudf::size_type inplace_bitmask_and(device_span dest_mask, + host_span masks, + host_span begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - inplace_bitmask_binop( + return inplace_bitmask_binop( [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, dest_mask, masks, diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 43a32c8405a..afea8a55b16 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -322,14 +322,15 @@ void superimpose_parent_nulls(bitmask_type const* parent_null_mask, reinterpret_cast(parent_null_mask), reinterpret_cast(current_child_mask)}; std::vector begin_bits{0, 0}; - cudf::detail::inplace_bitmask_and( + auto const valid_count = cudf::detail::inplace_bitmask_and( device_span(current_child_mask, num_bitmask_words(child.size())), masks, begin_bits, child.size(), stream, mr); - 
child.set_null_count(UNKNOWN_NULL_COUNT); + auto const null_count = child.size() - valid_count; + child.set_null_count(null_count); } // If the child is also a struct, repeat for all grandchildren. From 4950a7ae376200d086d4108edb572d164f4e81c8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 12 Jan 2022 20:16:34 -0800 Subject: [PATCH 10/23] Remove deprecated `method` parameter from `merge` and `join`. (#9944) This PR removes the deprecated `method` parameter from `DataFrame.merge` and `DataFrame.join`. This resolves #9353 and follows up on #9291. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/9944 --- python/cudf/cudf/core/dataframe.py | 30 +------------------------- python/cudf/cudf/tests/test_joining.py | 4 ++-- 2 files changed, 3 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fe6ac8e1529..123f86cc200 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3438,7 +3438,6 @@ def merge( sort=False, lsuffix=None, rsuffix=None, - method=None, indicator=False, suffixes=("_x", "_y"), ): @@ -3490,9 +3489,6 @@ def merge( suffixes: Tuple[str, str], defaults to ('_x', '_y') Suffixes applied to overlapping column names on the left and right sides - method : - This parameter is unused. It is deprecated and will be removed in a - future version. Returns ------- @@ -3554,13 +3550,6 @@ def merge( else: lsuffix, rsuffix = suffixes - if method is not None: - warnings.warn( - "The 'method' argument is deprecated and will be removed " - "in a future version of cudf.", - FutureWarning, - ) - # Compute merge gdf_result = super()._merge( right, @@ -3578,14 +3567,7 @@ def merge( @annotate("JOIN", color="blue", domain="cudf_python") def join( - self, - other, - on=None, - how="left", - lsuffix="", - rsuffix="", - sort=False, - method=None, + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False, ): """Join columns with other DataFrame on index or on a key column. @@ -3599,9 +3581,6 @@ def join( column names when avoiding conflicts. sort : bool Set to True to ensure sorted ordering. - method : - This parameter is unused. It is deprecated and will be removed in a - future version. Returns ------- @@ -3615,13 +3594,6 @@ def join( - *on* is not supported yet due to lack of multi-index support. 
""" - if method is not None: - warnings.warn( - "The 'method' argument is deprecated and will be removed " - "in a future version of cudf.", - FutureWarning, - ) - df = self.merge( other, left_index=True, diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index d25c6130bfb..2fb7393f5b4 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -256,7 +256,7 @@ def test_dataframe_join_mismatch_cats(how): pdf1 = pdf1.set_index("join_col") pdf2 = pdf2.set_index("join_col") - join_gdf = gdf1.join(gdf2, how=how, sort=True, method="hash") + join_gdf = gdf1.join(gdf2, how=how, sort=True) join_pdf = pdf1.join(pdf2, how=how) got = join_gdf.fillna(-1).to_pandas() @@ -403,7 +403,7 @@ def test_dataframe_merge_order(): gdf2["id"] = [4, 5] gdf2["a"] = [7, 8] - gdf = gdf1.merge(gdf2, how="left", on=["id", "a"], method="hash") + gdf = gdf1.merge(gdf2, how="left", on=["id", "a"]) df1 = pd.DataFrame() df2 = pd.DataFrame() From fe71baba07f4b582b5ec2e36ad50301f1186ca34 Mon Sep 17 00:00:00 2001 From: MithunR Date: Thu, 13 Jan 2022 06:11:42 -0800 Subject: [PATCH 11/23] Fix memory leaks in JNI native code. (#10029) This commit fixes a couple of minor, host-side memory leaks in the JNI native code. The objects in question did not need to go on the heap. They have, in this commit, been switched to stack objects. Authors: - MithunR (https://github.com/mythrocks) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/10029 --- java/src/main/native/src/ColumnVectorJni.cpp | 4 ++-- java/src/main/native/src/ColumnViewJni.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index e61ab8444d1..b0286f9ac27 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -359,10 +359,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_hash(JNIEnv *env, jobje std::transform(n_cudf_columns.data(), n_cudf_columns.data() + n_cudf_columns.size(), std::back_inserter(column_views), [](auto const &p_column) { return *p_column; }); - cudf::table_view *input_table = new cudf::table_view(column_views); + cudf::table_view input_table{column_views}; std::unique_ptr result = - cudf::hash(*input_table, static_cast(hash_function_id), seed); + cudf::hash(input_table, static_cast(hash_function_id), seed); return reinterpret_cast(result.release()); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 73ea49c18d9..d2cc2ab7d1c 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -1604,17 +1604,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit std::transform(n_cudf_columns.data(), n_cudf_columns.data() + n_cudf_columns.size(), std::back_inserter(column_views), [](auto const &p_column) { return *p_column; }); - cudf::table_view *input_table = new cudf::table_view(column_views); + cudf::table_view input_table{column_views}; cudf::binary_operator op = static_cast(bin_op); switch (op) { case cudf::binary_operator::BITWISE_AND: { - auto [new_bitmask, null_count] = cudf::bitmask_and(*input_table); + auto [new_bitmask, null_count] = cudf::bitmask_and(input_table); copy->set_null_mask(std::move(new_bitmask), null_count); break; } case cudf::binary_operator::BITWISE_OR: { - auto [new_bitmask, null_count] = cudf::bitmask_or(*input_table); + auto [new_bitmask, null_count] = cudf::bitmask_or(input_table); copy->set_null_mask(std::move(new_bitmask), null_count); break; } From d0c85e152db772b37d713b0f86ae787311d673ac Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 13 Jan 2022 12:21:56 -0500 Subject: [PATCH 12/23] build.sh respects the `--build_metrics` and `--incl_cache_stats` flags (#10035) Previously the script would do a comparison check like "$V"=="ON" which isn't a comparison in bash but a joining of strings ( "OFF==ON"). To do a comparison of two strings you need to have white space around `==`. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10035 --- build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index f5a59b6edcf..45074a6645f 100755 --- a/build.sh +++ b/build.sh @@ -186,7 +186,7 @@ if buildAll || hasArg libcudf; then # get the current count before the compile starts FILES_IN_CCACHE="" - if [[ "$BUILD_REPORT_INCL_CACHE_STATS"=="ON" && -x "$(command -v ccache)" ]]; then + if [[ "$BUILD_REPORT_INCL_CACHE_STATS" == "ON" && -x "$(command -v ccache)" ]]; then FILES_IN_CCACHE=$(ccache -s | grep "files in cache") echo "$FILES_IN_CCACHE" # zero the ccache statistics @@ -212,7 +212,7 @@ if buildAll || hasArg libcudf; then compile_total=$(( compile_end - compile_start )) # Record build times - if [[ "$BUILD_REPORT_METRICS"=="ON" && -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then + if [[ "$BUILD_REPORT_METRICS" == "ON" && -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then echo "Formatting build metrics" python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt xml > ${LIB_BUILD_DIR}/ninja_log.xml MSG="

" From dbe65f1b977d76c93932a42ec8047690e84f0267 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 13 Jan 2022 11:09:02 -0700 Subject: [PATCH 13/23] Fix null check when comparing structs in `arg_min` operation of reduction/groupby (#10026) This is another fix for https://github.com/NVIDIA/spark-rapids/pull/4434, when the null order is wrongly handled if the input structs column does not have nulls at the top level but only has null at the children levels. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/10026 --- cpp/src/reductions/struct_minmax_util.cuh | 15 +++---- cpp/tests/groupby/max_tests.cpp | 51 ++++++++++++++++------- cpp/tests/groupby/min_tests.cpp | 51 ++++++++++++++++------- 3 files changed, 80 insertions(+), 37 deletions(-) diff --git a/cpp/src/reductions/struct_minmax_util.cuh b/cpp/src/reductions/struct_minmax_util.cuh index e5832b849bd..1de48ef482d 100644 --- a/cpp/src/reductions/struct_minmax_util.cuh +++ b/cpp/src/reductions/struct_minmax_util.cuh @@ -103,13 +103,14 @@ class comparison_binop_generator { { if (is_min_op) { null_orders = flattened_input.null_orders(); - // Null structs are excluded from the operations, and that is equivalent to considering - // nulls as larger than all other non-null STRUCT elements (if finding for ARGMIN), or - // smaller than all other non-null STRUCT elements (if finding for ARGMAX). - // Thus, we need to set a separate null order for the top level structs column (which is - // stored at the first position in the null_orders array) to achieve this purpose. - null_orders.front() = cudf::null_order::AFTER; - null_orders_dvec = cudf::detail::make_device_uvector_async(null_orders, stream); + // If the input column has nulls (at the top level), null structs are excluded from the + // operations, and that is equivalent to considering top-level nulls as larger than all other + // non-null STRUCT elements (if finding for ARGMIN), or smaller than all other non-null STRUCT + // elements (if finding for ARGMAX). Thus, we need to set a separate null order for the top + // level structs column (which is stored at the first position in the null_orders array) to + // achieve this purpose. + if (input.has_nulls()) { null_orders.front() = cudf::null_order::AFTER; } + null_orders_dvec = cudf::detail::make_device_uvector_async(null_orders, stream); } // else: Don't need to generate nulls order to copy to device memory if we have all null orders // are BEFORE (that happens when we have is_min_op == false). 
diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp index 47bed11df30..266312d16a2 100644 --- a/cpp/tests/groupby/max_tests.cpp +++ b/cpp/tests/groupby/max_tests.cpp @@ -391,22 +391,43 @@ TEST_F(groupby_max_struct_test, null_keys_and_values) TEST_F(groupby_max_struct_test, values_with_null_child) { constexpr int32_t null{0}; - auto const keys = fixed_width_column_wrapper{1, 1}; - auto const vals = [] { - auto child1 = fixed_width_column_wrapper{1, 1}; - auto child2 = fixed_width_column_wrapper{{-1, null}, null_at(1)}; - return structs_column_wrapper{child1, child2}; - }(); - - auto const expect_keys = fixed_width_column_wrapper{1}; - auto const expect_vals = [] { - auto child1 = fixed_width_column_wrapper{1}; - auto child2 = fixed_width_column_wrapper{-1}; - return structs_column_wrapper{child1, child2}; - }(); + { + auto const keys = fixed_width_column_wrapper{1, 1}; + auto const vals = [] { + auto child1 = fixed_width_column_wrapper{1, 1}; + auto child2 = fixed_width_column_wrapper{{-1, null}, null_at(1)}; + return structs_column_wrapper{child1, child2}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{1}; + auto const expect_vals = [] { + auto child1 = fixed_width_column_wrapper{1}; + auto child2 = fixed_width_column_wrapper{-1}; + return structs_column_wrapper{child1, child2}; + }(); + + auto agg = cudf::make_max_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + } - auto agg = cudf::make_max_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + { + auto const keys = fixed_width_column_wrapper{1, 1}; + auto const vals = [] { + auto child1 = fixed_width_column_wrapper{{-1, null}, null_at(1)}; + auto child2 = fixed_width_column_wrapper{{null, null}, nulls_at({0, 1})}; + return structs_column_wrapper{child1, child2}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{1}; + auto const expect_vals = [] { + auto child1 = fixed_width_column_wrapper{-1}; + auto child2 = fixed_width_column_wrapper{{null}, null_at(0)}; + return structs_column_wrapper{child1, child2}; + }(); + + auto agg = cudf::make_max_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + } } } // namespace test diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp index 64bffe1c883..00fa282cee4 100644 --- a/cpp/tests/groupby/min_tests.cpp +++ b/cpp/tests/groupby/min_tests.cpp @@ -390,22 +390,43 @@ TEST_F(groupby_min_struct_test, null_keys_and_values) TEST_F(groupby_min_struct_test, values_with_null_child) { constexpr int32_t null{0}; - auto const keys = fixed_width_column_wrapper{1, 1}; - auto const vals = [] { - auto child1 = fixed_width_column_wrapper{1, 1}; - auto child2 = fixed_width_column_wrapper{{-1, null}, null_at(1)}; - return structs_column_wrapper{child1, child2}; - }(); - - auto const expect_keys = fixed_width_column_wrapper{1}; - auto const expect_vals = [] { - auto child1 = fixed_width_column_wrapper{1}; - auto child2 = fixed_width_column_wrapper{{null}, null_at(0)}; - return structs_column_wrapper{child1, child2}; - }(); + { + auto const keys = fixed_width_column_wrapper{1, 1}; + auto const vals = [] { + auto child1 = fixed_width_column_wrapper{1, 1}; + auto child2 = fixed_width_column_wrapper{{-1, null}, null_at(1)}; + return structs_column_wrapper{child1, child2}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{1}; + auto const expect_vals = [] { + auto child1 = fixed_width_column_wrapper{1}; + auto child2 = 
fixed_width_column_wrapper{{null}, null_at(0)}; + return structs_column_wrapper{child1, child2}; + }(); + + auto agg = cudf::make_min_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + } - auto agg = cudf::make_min_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + { + auto const keys = fixed_width_column_wrapper{1, 1}; + auto const vals = [] { + auto child1 = fixed_width_column_wrapper{{-1, null}, null_at(1)}; + auto child2 = fixed_width_column_wrapper{{null, null}, nulls_at({0, 1})}; + return structs_column_wrapper{child1, child2}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{1}; + auto const expect_vals = [] { + auto child1 = fixed_width_column_wrapper{{null}, null_at(0)}; + auto child2 = fixed_width_column_wrapper{{null}, null_at(0)}; + return structs_column_wrapper{child1, child2}; + }(); + + auto agg = cudf::make_min_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + } } } // namespace test From c07fdabb491815eab8e4eb7655b2946218ee8a42 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Thu, 13 Jan 2022 12:16:36 -0600 Subject: [PATCH 14/23] Load balance optimization for contiguous_split (#9755) The existing `contiguous_split` implementation was vulnerable to situations where `number of columns N * number of splits M` was < the number of SMs on the gpu. This PR implements a postprocessing step which attempts to distribute the amount of bytes to be copied as evenly as possible across all available SMs. PR has been updated to repartition using a constant chunk size of 1 MB. This yields better results than the initial approach. Before/after benchmarks for some particularly degenerate cases (T4) ``` Before (4 partitions) 4GB, 4 columns, no splits 43.3 ms 43.3 ms 8 bytes_per_second=46.1738G/s After 4GB, 4 columns, no splits 10.1 ms 10.1 ms 8 bytes_per_second=198.642G/s ``` ``` Before (2 partitions) 1GB, 1 column + validity, no splits 114 ms 114 ms 8 bytes_per_second=17.5212G/s After 1GB, 1 column + validity, no splits 10.5 ms 10.6 ms 8 bytes_per_second=189.784G/s ``` Authors: - https://github.com/nvdbaranec Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Vyas Ramasubramani (https://github.com/vyasr) - Elias Stehle (https://github.com/elstehle) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/9755 --- .../copying/contiguous_split_benchmark.cu | 65 +++-- cpp/src/copying/contiguous_split.cu | 248 +++++++++++++++--- cpp/tests/copying/split_tests.cpp | 27 ++ 3 files changed, 277 insertions(+), 63 deletions(-) diff --git a/cpp/benchmarks/copying/contiguous_split_benchmark.cu b/cpp/benchmarks/copying/contiguous_split_benchmark.cu index 506d676d196..55e1360efc8 100644 --- a/cpp/benchmarks/copying/contiguous_split_benchmark.cu +++ b/cpp/benchmarks/copying/contiguous_split_benchmark.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,10 +34,18 @@ void BM_contiguous_split_common(benchmark::State& state, int64_t bytes_total) { // generate splits - cudf::size_type split_stride = num_rows / num_splits; std::vector splits; - for (int idx = 0; idx < num_rows; idx += split_stride) { - splits.push_back(std::min(idx + split_stride, static_cast(num_rows))); + if (num_splits > 0) { + cudf::size_type const split_stride = num_rows / num_splits; + // start after the first element. + auto iter = thrust::make_counting_iterator(1); + splits.reserve(num_splits); + std::transform(iter, + iter + num_splits, + std::back_inserter(splits), + [split_stride, num_rows](cudf::size_type i) { + return std::min(i * split_stride, static_cast(num_rows)); + }); } std::vector> columns(src_cols.size()); @@ -53,7 +61,8 @@ void BM_contiguous_split_common(benchmark::State& state, auto result = cudf::contiguous_split(src_table, splits); } - state.SetBytesProcessed(static_cast(state.iterations()) * bytes_total); + // it's 2x bytes_total because we're both reading and writing. + state.SetBytesProcessed(static_cast(state.iterations()) * bytes_total * 2); } class ContiguousSplit : public cudf::benchmark { @@ -61,13 +70,13 @@ class ContiguousSplit : public cudf::benchmark { void BM_contiguous_split(benchmark::State& state) { - int64_t total_desired_bytes = state.range(0); - cudf::size_type num_cols = state.range(1); - cudf::size_type num_splits = state.range(2); - bool include_validity = state.range(3) == 0 ? false : true; + int64_t const total_desired_bytes = state.range(0); + cudf::size_type const num_cols = state.range(1); + cudf::size_type const num_splits = state.range(2); + bool const include_validity = state.range(3) == 0 ? false : true; cudf::size_type el_size = 4; // ints and floats - int64_t num_rows = total_desired_bytes / (num_cols * el_size); + int64_t const num_rows = total_desired_bytes / (num_cols * el_size); // generate input table srand(31337); @@ -85,8 +94,10 @@ void BM_contiguous_split(benchmark::State& state) } } - size_t total_bytes = total_desired_bytes; - if (include_validity) { total_bytes += num_rows / (sizeof(cudf::bitmask_type) * 8); } + int64_t const total_bytes = + total_desired_bytes + + (include_validity ? (max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols) + : 0); BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes); } @@ -102,17 +113,17 @@ int rand_range(int r) void BM_contiguous_split_strings(benchmark::State& state) { - int64_t total_desired_bytes = state.range(0); - cudf::size_type num_cols = state.range(1); - cudf::size_type num_splits = state.range(2); - bool include_validity = state.range(3) == 0 ? false : true; + int64_t const total_desired_bytes = state.range(0); + cudf::size_type const num_cols = state.range(1); + cudf::size_type const num_splits = state.range(2); + bool const include_validity = state.range(3) == 0 ? 
false : true; - const int64_t string_len = 8; + constexpr int64_t string_len = 8; std::vector h_strings{ "aaaaaaaa", "bbbbbbbb", "cccccccc", "dddddddd", "eeeeeeee", "ffffffff", "gggggggg", "hhhhhhhh"}; - int64_t col_len_bytes = total_desired_bytes / num_cols; - int64_t num_rows = col_len_bytes / string_len; + int64_t const col_len_bytes = total_desired_bytes / num_cols; + int64_t const num_rows = col_len_bytes / string_len; // generate input table srand(31337); @@ -133,8 +144,10 @@ void BM_contiguous_split_strings(benchmark::State& state) } } - size_t total_bytes = total_desired_bytes + (num_rows * sizeof(cudf::size_type)); - if (include_validity) { total_bytes += num_rows / (sizeof(cudf::bitmask_type) * 8); } + int64_t const total_bytes = + total_desired_bytes + ((num_rows + 1) * sizeof(cudf::offset_type)) + + (include_validity ? (max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols) + : 0); BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes); } @@ -157,12 +170,16 @@ CSBM_BENCHMARK_DEFINE(6Gb10ColsValidity, (int64_t)6 * 1024 * 1024 * 1024, 10, 25 CSBM_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 256, 0); CSBM_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 256, 1); CSBM_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 0); -CSBM_BENCHMARK_DEFINE(46b10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 1); +CSBM_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 1); +CSBM_BENCHMARK_DEFINE(4Gb4ColsNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1); +CSBM_BENCHMARK_DEFINE(4Gb4ColsValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1); CSBM_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 256, 0); CSBM_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 256, 1); CSBM_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 0); CSBM_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 1); +CSBM_BENCHMARK_DEFINE(1Gb1ColNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1); +CSBM_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1); #define CSBM_STRINGS_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \ BENCHMARK_DEFINE_F(ContiguousSplitStrings, name)(::benchmark::State & state) \ @@ -179,8 +196,12 @@ CSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1 CSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 256, 1); CSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 0); CSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 1); +CSBM_STRINGS_BENCHMARK_DEFINE(4Gb4ColsNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 0); +CSBM_STRINGS_BENCHMARK_DEFINE(4Gb4ColsValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1); CSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 256, 0); CSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 256, 1); CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 0); CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 1); +CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 0); +CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, 
(int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1); diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 8dc93bc1de3..f8c0006ed45 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -82,16 +83,21 @@ struct src_buf_info { * M partitions, then we have N*M destination buffers. */ struct dst_buf_info { + // constant across all copy commands for this buffer std::size_t buf_size; // total size of buffer, including padding int num_elements; // # of elements to be copied int element_size; // size of each element in bytes - int num_rows; // # of rows (which may be different from num_elements in the case of validity or - // offset buffers) - int src_row_index; // row index to start reading from from my associated source buffer + int num_rows; // # of rows to be copied(which may be different from num_elements in the case of + // validity or offset buffers) + + int src_element_index; // element index to start reading from from my associated source buffer std::size_t dst_offset; // my offset into the per-partition allocation int value_shift; // amount to shift values down by (for offset buffers) int bit_shift; // # of bits to shift right by (for validity buffers) - size_type valid_count; + size_type valid_count; // validity count for this block of work + + int src_buf_index; // source buffer index + int dst_buf_index; // destination buffer index }; /** @@ -116,7 +122,7 @@ struct dst_buf_info { * @param t Thread index * @param num_elements Number of elements to copy * @param element_size Size of each element in bytes - * @param src_row_index Row index to start copying at + * @param src_element_index Element index to start copying at * @param stride Size of the kernel block * @param value_shift Shift incoming 4-byte offset values down by this amount * @param bit_shift Shift incoming data right by this many bits @@ -129,14 +135,14 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, int t, std::size_t num_elements, std::size_t element_size, - std::size_t src_row_index, + std::size_t src_element_index, uint32_t stride, int value_shift, int bit_shift, std::size_t num_rows, size_type* valid_count) { - src += (src_row_index * element_size); + src += (src_element_index * element_size); size_type thread_valid_count = 0; @@ -240,38 +246,36 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, } /** - * @brief Kernel which copies a single buffer from a set of partitioned - * column buffers. + * @brief Kernel which copies data from multiple source buffers to multiple + * destination buffers. * * When doing a contiguous_split on X columns comprising N total internal buffers * with M splits, we end up having to copy N*M source/destination buffer pairs. + * These logical copies are further subdivided to distribute the amount of work + * to be done as evenly as possible across the multiprocessors on the device. * This kernel is arranged such that each block copies 1 source/destination pair. - * This function retrieves the relevant buffers and then calls copy_buffer to perform - * the actual copy. 
* - * @param num_src_bufs Total number of source buffers (N) - * @param src_bufs Input source buffers (N) - * @param dst_bufs Destination buffers (N*M) + * @param src_bufs Input source buffers + * @param dst_bufs Destination buffers * @param buf_info Information on the range of values to be copied for each destination buffer. */ template -__global__ void copy_partition(int num_src_bufs, - uint8_t const** src_bufs, - uint8_t** dst_bufs, - dst_buf_info* buf_info) +__global__ void copy_partitions(uint8_t const** src_bufs, + uint8_t** dst_bufs, + dst_buf_info* buf_info) { - int const partition_index = blockIdx.x / num_src_bufs; - int const src_buf_index = blockIdx.x % num_src_bufs; - std::size_t const buf_index = (partition_index * num_src_bufs) + src_buf_index; + auto const buf_index = blockIdx.x; + auto const src_buf_index = buf_info[buf_index].src_buf_index; + auto const dst_buf_index = buf_info[buf_index].dst_buf_index; // copy, shifting offsets and validity bits as needed copy_buffer( - dst_bufs[partition_index] + buf_info[buf_index].dst_offset, + dst_bufs[dst_buf_index] + buf_info[buf_index].dst_offset, src_bufs[src_buf_index], threadIdx.x, buf_info[buf_index].num_elements, buf_info[buf_index].element_size, - buf_info[buf_index].src_row_index, + buf_info[buf_index].src_element_index, blockDim.x, buf_info[buf_index].value_shift, buf_info[buf_index].bit_shift, @@ -728,9 +732,32 @@ struct dst_offset_output_iterator { using reference = std::size_t&; using iterator_category = thrust::output_device_iterator_tag; - dst_offset_output_iterator operator+ __host__ __device__(int i) + dst_offset_output_iterator operator+ __host__ __device__(int i) { return {c + i}; } + + void operator++ __host__ __device__() { c++; } + + reference operator[] __device__(int i) { return dereference(c + i); } + reference operator* __device__() { return dereference(c); } + + private: + reference __device__ dereference(dst_buf_info* c) { return c->dst_offset; } +}; + +/** + * @brief Output iterator for writing values to the valid_count field of the + * dst_buf_info struct + */ +struct dst_valid_count_output_iterator { + dst_buf_info* c; + using value_type = size_type; + using difference_type = size_type; + using pointer = size_type*; + using reference = size_type&; + using iterator_category = thrust::output_device_iterator_tag; + + dst_valid_count_output_iterator operator+ __host__ __device__(int i) { - return dst_offset_output_iterator{c + i}; + return dst_valid_count_output_iterator{c + i}; } void operator++ __host__ __device__() { c++; } @@ -739,7 +766,7 @@ struct dst_offset_output_iterator { reference operator* __device__() { return dereference(c); } private: - reference __device__ dereference(dst_buf_info* c) { return c->dst_offset; } + reference __device__ dereference(dst_buf_info* c) { return c->valid_count; } }; /** @@ -762,6 +789,148 @@ struct size_of_helper { } }; +/** + * @brief Functor for returning the number of chunks an input buffer is being + * subdivided into during the repartitioning step. + * + * Note: columns types which themselves inherently have no data (strings, lists, + * structs) return 0. 
+ */
+struct num_chunks_func {
+  thrust::pair<std::size_t, std::size_t> const* chunks;
+  __device__ size_t operator()(size_type i) const { return thrust::get<0>(chunks[i]); }
+};
+
+void copy_data(int num_bufs,
+               int num_src_bufs,
+               uint8_t const** d_src_bufs,
+               uint8_t** d_dst_bufs,
+               dst_buf_info* _d_dst_buf_info,
+               rmm::cuda_stream_view stream)
+{
+  // Since we parallelize at one block per copy, we are vulnerable to situations where we
+  // have small numbers of copies to do (a combination of small numbers of splits and/or columns),
+  // so we will take the actual set of outgoing source/destination buffers and further partition
+  // them into much smaller chunks in order to drive up the number of blocks and overall occupancy.
+  auto const desired_chunk_size = size_t{1 * 1024 * 1024};
+  rmm::device_uvector<thrust::pair<std::size_t, std::size_t>> chunks(num_bufs, stream);
+  thrust::transform(
+    rmm::exec_policy(stream),
+    _d_dst_buf_info,
+    _d_dst_buf_info + num_bufs,
+    chunks.begin(),
+    [desired_chunk_size] __device__(dst_buf_info const& buf) -> thrust::pair<std::size_t, std::size_t> {
+      // Total bytes for this incoming partition
+      size_t const bytes = buf.num_elements * buf.element_size;
+
+      // This clause handles nested data types (e.g. list or string) that store no data in the row
+      // columns, only in their children.
+      if (bytes == 0) { return {1, 0}; }
+
+      // The number of chunks we want to subdivide this buffer into
+      size_t const num_chunks =
+        max(size_t{1}, util::round_up_unsafe(bytes, desired_chunk_size) / desired_chunk_size);
+
+      // NOTE: leaving chunk size as a separate parameter for future tuning
+      // possibilities, even though in the current implementation it will be a
+      // constant.
+      return {num_chunks, desired_chunk_size};
+    });
+
+  rmm::device_uvector<size_type> chunk_offsets(num_bufs + 1, stream);
+  auto buf_count_iter = cudf::detail::make_counting_transform_iterator(
+    0, [num_bufs, num_chunks = num_chunks_func{chunks.begin()}] __device__(size_type i) {
+      return i == num_bufs ? 0 : num_chunks(i);
+    });
+  thrust::exclusive_scan(rmm::exec_policy(stream),
+                         buf_count_iter,
+                         buf_count_iter + num_bufs + 1,
+                         chunk_offsets.begin(),
+                         0);
+
+  auto out_to_in_index = [chunk_offsets = chunk_offsets.begin(), num_bufs] __device__(size_type i) {
+    return static_cast<size_type>(
+             thrust::upper_bound(thrust::seq, chunk_offsets, chunk_offsets + num_bufs + 1, i) -
+             chunk_offsets) -
+           1;
+  };
+
+  // apply the chunking.
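+  // (the reduce below totals the per-buffer chunk counts to get the number of
+  // post-subdivision copy commands; the for_each then expands each original
+  // dst_buf_info into one dst_buf_info per chunk, offsetting
+  // src_element_index and dst_offset for each piece)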
+  auto const num_chunks =
+    cudf::detail::make_counting_transform_iterator(0, num_chunks_func{chunks.begin()});
+  size_type const new_buf_count =
+    thrust::reduce(rmm::exec_policy(stream), num_chunks, num_chunks + chunks.size());
+  rmm::device_uvector<dst_buf_info> d_dst_buf_info(new_buf_count, stream);
+  auto iter = thrust::make_counting_iterator(0);
+  thrust::for_each(
+    rmm::exec_policy(stream),
+    iter,
+    iter + new_buf_count,
+    [_d_dst_buf_info,
+     d_dst_buf_info = d_dst_buf_info.begin(),
+     chunks         = chunks.begin(),
+     chunk_offsets  = chunk_offsets.begin(),
+     num_bufs,
+     num_src_bufs,
+     out_to_in_index] __device__(size_type i) {
+      size_type const in_buf_index = out_to_in_index(i);
+      size_type const chunk_index  = i - chunk_offsets[in_buf_index];
+      auto const chunk_size        = thrust::get<1>(chunks[in_buf_index]);
+      dst_buf_info const& in       = _d_dst_buf_info[in_buf_index];
+
+      // adjust info
+      dst_buf_info& out = d_dst_buf_info[i];
+      out.element_size  = in.element_size;
+      out.value_shift   = in.value_shift;
+      out.bit_shift     = in.bit_shift;
+      out.valid_count =
+        in.valid_count;  // valid count will be set to 1 if this is a validity buffer
+      out.src_buf_index = in.src_buf_index;
+      out.dst_buf_index = in.dst_buf_index;
+
+      size_type const elements_per_chunk =
+        out.element_size == 0 ? 0 : chunk_size / out.element_size;
+      out.num_elements = ((chunk_index + 1) * elements_per_chunk) > in.num_elements
+                           ? in.num_elements - (chunk_index * elements_per_chunk)
+                           : elements_per_chunk;
+
+      size_type const rows_per_chunk =
+        // if this is a validity buffer, each element is a bitmask_type, which
+        // corresponds to 32 rows.
+        out.valid_count > 0
+          ? elements_per_chunk * static_cast<size_type>(detail::size_in_bits<bitmask_type>())
+          : elements_per_chunk;
+      out.num_rows = ((chunk_index + 1) * rows_per_chunk) > in.num_rows
+                       ? in.num_rows - (chunk_index * rows_per_chunk)
+                       : rows_per_chunk;
+
+      out.src_element_index = in.src_element_index + (chunk_index * elements_per_chunk);
+      out.dst_offset        = in.dst_offset + (chunk_index * chunk_size);
+
+      // out.bytes and out.buf_size are unneeded here because they are only used to
+      // calculate real output buffer sizes. the data we are generating here is
+      // purely intermediate for the purposes of doing more uniform copying of data
+      // underneath the final structure of the output
+    });
+
+  // perform the copy
+  constexpr size_type block_size = 256;
+  copy_partitions<block_size><<<new_buf_count, block_size, 0, stream.value()>>>(
+    d_src_bufs, d_dst_bufs, d_dst_buf_info.data());
+
+  // postprocess valid_counts
+  auto keys = cudf::detail::make_counting_transform_iterator(
+    0, [out_to_in_index] __device__(size_type i) { return out_to_in_index(i); });
+  auto values = thrust::make_transform_iterator(
+    d_dst_buf_info.begin(), [] __device__(dst_buf_info const& info) { return info.valid_count; });
+  thrust::reduce_by_key(rmm::exec_policy(stream),
+                        keys,
+                        keys + new_buf_count,
+                        values,
+                        thrust::make_discard_iterator(),
+                        dst_valid_count_output_iterator{_d_dst_buf_info});
+}
+
 };  // anonymous namespace
 
 namespace detail {
@@ -933,9 +1102,9 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
       }
     }
 
-    // final row indices and row count
-    int const out_row_index = src_info.is_validity ? row_start / 32 : row_start;
-    int const num_rows      = row_end - row_start;
+    // final element indices and row count
+    int const out_element_index = src_info.is_validity ? row_start / 32 : row_start;
+    int const num_rows          = row_end - row_start;
 
     // if I am an offsets column, all my values need to be shifted
     int const value_shift = src_info.offsets == nullptr ?
0 : src_info.offsets[row_start];
     // if I am a validity column, we may need to shift bits
@@ -953,15 +1122,17 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
           std::size_t const bytes =
             static_cast<std::size_t>(num_elements) * static_cast<std::size_t>(element_size);
 
-          return dst_buf_info{util::round_up_unsafe(bytes, 64ul),
+          return dst_buf_info{util::round_up_unsafe(bytes, split_align),
                               num_elements,
                               element_size,
                               num_rows,
-                              out_row_index,
+                              out_element_index,
                               0,
                               value_shift,
                               bit_shift,
-                              src_info.is_validity ? 1 : 0};
+                              src_info.is_validity ? 1 : 0,
+                              src_buf_index,
+                              split_index};
         });
 
   // compute total size of each partition
@@ -1043,12 +1214,8 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   CUDA_TRY(cudaMemcpyAsync(
     d_src_bufs, h_src_bufs, src_bufs_size + dst_bufs_size, cudaMemcpyHostToDevice, stream.value()));
 
-  // copy. 1 block per buffer
-  {
-    constexpr size_type block_size = 256;
-    copy_partition<block_size><<<num_bufs, block_size, 0, stream.value()>>>(
-      num_src_bufs, d_src_bufs, d_dst_bufs, d_dst_buf_info);
-  }
+  // perform the copy.
+  copy_data(num_bufs, num_src_bufs, d_src_bufs, d_dst_bufs, d_dst_buf_info, stream);
 
   // DtoH dst info (to retrieve null counts)
   CUDA_TRY(cudaMemcpyAsync(
@@ -1078,7 +1245,6 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
     cols.clear();
   }
 
-
   return result;
 }
 
@@ -1092,4 +1258,4 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   return cudf::detail::contiguous_split(input, splits, rmm::cuda_stream_default, mr);
 }
 
-};  // namespace cudf
+};  // namespace cudf
\ No newline at end of file
diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp
index b5a793ecd1c..1ee732b8a59 100644
--- a/cpp/tests/copying/split_tests.cpp
+++ b/cpp/tests/copying/split_tests.cpp
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <cudf/detail/iterator.cuh>
 #include
 #include
 
@@ -1315,6 +1316,32 @@ TEST_F(ContiguousSplitUntypedTest, ProgressiveSizes)
   }
 }
 
+TEST_F(ContiguousSplitUntypedTest, ValidityRepartition)
+{
+  // It is tricky to actually get the internal repartitioning/load-balancing code to add new splits
+  // inside a validity buffer. Under almost all situations, the fraction of bytes that validity
+  // represents is so small compared to the bytes for all other data that those buffers end up not
+  // getting subdivided. This test forces it to happen by using a small, single column of int8's,
+  // which keeps the overall fraction that validity takes up large enough to cause a repartition.
+  srand(0);
+  auto rvalids = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
+    return static_cast<float>(rand()) / static_cast<float>(RAND_MAX) < 0.5f ?
0 : 1;
+  });
+  cudf::size_type const num_rows = 2000000;
+  auto col = cudf::sequence(num_rows, cudf::numeric_scalar<int8_t>{0});
+  col->set_null_mask(cudf::test::detail::make_null_mask(rvalids, rvalids + num_rows));
+
+  cudf::table_view t({*col});
+  auto result   = cudf::contiguous_split(t, {num_rows / 2});
+  auto expected = cudf::split(t, {num_rows / 2});
+  CUDF_EXPECTS(result.size() == expected.size(),
+               "Mismatch in split results in ValidityRepartition test");
+
+  for (size_t idx = 0; idx < result.size(); idx++) {
+    CUDF_TEST_EXPECT_TABLES_EQUAL(result[idx].table, expected[idx]);
+  }
+}
+
 TEST_F(ContiguousSplitUntypedTest, ValidityEdgeCase)
 {
   // tests an edge case where the splits cause the final validity data to be copied

From 1eceaed26d8242401f2be12b50eb635872fe1bf6 Mon Sep 17 00:00:00 2001
From: Devavret Makkar
Date: Fri, 14 Jan 2022 06:29:14 +0530
Subject: [PATCH 15/23] Add partitioning support to Parquet chunked writer (#10000)

Chunked writer (`class ParquetWriter`) now takes an argument `partition_cols`. For each call to `write_table(df)`, the `df` is partitioned and the parts are appended to the same corresponding file in the dataset directory. This can be used when partitioning is desired but one wants to avoid making many small files in each subdirectory. For example, instead of repeated calls to `write_to_dataset` like so:

```python
write_to_dataset(df1, root_path, partition_cols=['group'])
write_to_dataset(df2, root_path, partition_cols=['group'])
...
```

which will yield the following structure:

```
root_dir/
  group=value1/
    <filename>.parquet
    <filename>.parquet
    ...
  group=value2/
    <filename>.parquet
    <filename>.parquet
    ...
  ...
```

one can write with

```python
pw = ParquetWriter(root_path, partition_cols=['group'])
pw.write_table(df1)
pw.write_table(df2)
pw.close()
```

to get the structure

```
root_dir/
  group=value1/
    <filename>.parquet
  group=value2/
    <filename>.parquet
  ...
``` Closes #7196 Also workaround fixes fixes #9216 fixes #7011 TODO: - [x] Tests Authors: - Devavret Makkar (https://github.com/devavret) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10000 --- python/cudf/cudf/_lib/cpp/io/parquet.pxd | 4 + python/cudf/cudf/_lib/parquet.pyx | 52 +++-- python/cudf/cudf/io/parquet.py | 257 ++++++++++++++++++++--- python/cudf/cudf/tests/test_parquet.py | 83 +++++++- 4 files changed, 350 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 07b312361f2..d02fffe9c0d 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -207,6 +207,10 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_chunked_writer& write( cudf_table_view.table_view table_, ) except+ + parquet_chunked_writer& write( + const cudf_table_view.table_view& table_, + const vector[cudf_io_types.partition_info]& partitions, + ) except+ unique_ptr[vector[uint8_t]] close( vector[string] column_chunks_file_paths, ) except+ diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 36099b03ef6..16873435e1d 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -411,23 +411,31 @@ cdef class ParquetWriter: cdef unique_ptr[cpp_parquet_chunked_writer] writer cdef unique_ptr[table_input_metadata] tbl_meta cdef cudf_io_types.sink_info sink - cdef unique_ptr[cudf_io_types.data_sink] _data_sink + cdef vector[unique_ptr[cudf_io_types.data_sink]] _data_sink cdef cudf_io_types.statistics_freq stat_freq cdef cudf_io_types.compression_type comp_type cdef object index - def __cinit__(self, object path, object index=None, + def __cinit__(self, object filepaths_or_buffers, object index=None, object compression=None, str statistics="ROWGROUP"): - self.sink = make_sink_info(path, self._data_sink) + filepaths_or_buffers = ( + list(filepaths_or_buffers) + if is_list_like(filepaths_or_buffers) + else [filepaths_or_buffers] + ) + self.sink = make_sinks_info(filepaths_or_buffers, self._data_sink) self.stat_freq = _get_stat_freq(statistics) self.comp_type = _get_comp_type(compression) self.index = index self.initialized = False - def write_table(self, table): + def write_table(self, table, object partitions_info=None): """ Writes a single table to the file """ if not self.initialized: - self._initialize_chunked_state(table) + self._initialize_chunked_state( + table, + num_partitions=len(partitions_info) if partitions_info else 1 + ) cdef table_view tv if self.index is not False and ( @@ -437,8 +445,15 @@ cdef class ParquetWriter: else: tv = table_view_from_table(table, ignore_index=True) + cdef vector[cudf_io_types.partition_info] partitions + if partitions_info is not None: + for part in partitions_info: + partitions.push_back( + cudf_io_types.partition_info(part[0], part[1]) + ) + with nogil: - self.writer.get()[0].write(tv) + self.writer.get()[0].write(tv, partitions) def close(self, object metadata_file_path=None): cdef unique_ptr[vector[uint8_t]] out_metadata_c @@ -449,7 +464,13 @@ cdef class ParquetWriter: # Update metadata-collection options if metadata_file_path is not None: - column_chunks_file_paths.push_back(str.encode(metadata_file_path)) + if is_list_like(metadata_file_path): + for path in metadata_file_path: + column_chunks_file_paths.push_back(str.encode(path)) + else: + 
column_chunks_file_paths.push_back( + str.encode(metadata_file_path) + ) with nogil: out_metadata_c = move( @@ -463,10 +484,13 @@ cdef class ParquetWriter: return np.asarray(out_metadata_py) return None - def __dealloc__(self): + def __enter__(self): + return self + + def __exit__(self, *args): self.close() - def _initialize_chunked_state(self, table): + def _initialize_chunked_state(self, table, num_partitions=1): """ Prepares all the values required to build the chunked_parquet_writer_options and creates a writer""" cdef table_view tv @@ -499,10 +523,14 @@ cdef class ParquetWriter: table[name]._column, self.tbl_meta.get().column_metadata[i] ) - pandas_metadata = generate_pandas_metadata(table, self.index) + index = ( + False if isinstance(table._index, cudf.RangeIndex) else self.index + ) + pandas_metadata = generate_pandas_metadata(table, index) + cdef map[string, string] tmp_user_data + tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) cdef vector[map[string, string]] user_data - user_data.resize(1) - user_data.back()[str.encode("pandas")] = str.encode(pandas_metadata) + user_data = vector[map[string, string]](num_partitions, tmp_user_data) cdef chunked_parquet_writer_options args with nogil: diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index ca03e40e2a6..9694d19e159 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -5,9 +5,11 @@ import warnings from collections import defaultdict from contextlib import ExitStack +from typing import Dict, List, Tuple from uuid import uuid4 import fsspec +import numpy as np import pyarrow as pa from pyarrow import dataset as ds, parquet as pq @@ -126,32 +128,21 @@ def write_to_dataset( if partition_cols is not None and len(partition_cols) > 0: - data_cols = df.columns.drop(partition_cols) - if len(data_cols) == 0: - raise ValueError("No data left to save outside partition columns") - - part_names, part_offsets, _, grouped_df = df.groupby( - partition_cols - )._grouped() - if not preserve_index: - grouped_df.reset_index(drop=True, inplace=True) - grouped_df.drop(columns=partition_cols, inplace=True) - # Copy the entire keys df in one operation rather than using iloc - part_names = part_names.to_pandas().to_frame(index=False) - - full_paths = [] - metadata_file_paths = [] - for keys in part_names.itertuples(index=False): - subdir = fs.sep.join( - [f"{name}={val}" for name, val in zip(partition_cols, keys)] - ) - prefix = fs.sep.join([root_path, subdir]) - fs.mkdirs(prefix, exist_ok=True) - filename = filename or uuid4().hex + ".parquet" - full_path = fs.sep.join([prefix, filename]) - full_paths.append(full_path) - if return_metadata: - metadata_file_paths.append(fs.sep.join([subdir, filename])) + ( + full_paths, + metadata_file_paths, + grouped_df, + part_offsets, + _, + ) = _get_partitioned( + df, + root_path, + partition_cols, + filename, + fs, + preserve_index, + **kwargs, + ) if return_metadata: kwargs["metadata_file_path"] = metadata_file_paths @@ -164,7 +155,7 @@ def write_to_dataset( ) else: - filename = filename or uuid4().hex + ".parquet" + filename = filename or _generate_filename() full_path = fs.sep.join([root_path, filename]) if return_metadata: kwargs["metadata_file_path"] = filename @@ -737,13 +728,12 @@ def to_parquet( ) if partition_offsets: - kwargs["partitions_info"] = [ - ( - partition_offsets[i], - partition_offsets[i + 1] - partition_offsets[i], + kwargs["partitions_info"] = list( + zip( + partition_offsets, + np.roll(partition_offsets, -1) - 
partition_offsets, ) - for i in range(0, len(partition_offsets) - 1) - ] + )[:-1] return _write_parquet( df, @@ -790,9 +780,210 @@ def merge_parquet_filemetadata(filemetadata_list): return libparquet.merge_filemetadata(filemetadata_list) +def _generate_filename(): + return uuid4().hex + ".parquet" + + +def _get_partitioned( + df, + root_path, + partition_cols, + filename=None, + fs=None, + preserve_index=False, + **kwargs, +): + fs = ioutils._ensure_filesystem(fs, root_path, **kwargs) + fs.mkdirs(root_path, exist_ok=True) + if not (set(df._data) - set(partition_cols)): + raise ValueError("No data left to save outside partition columns") + + part_names, part_offsets, _, grouped_df = df.groupby( + partition_cols + )._grouped() + if not preserve_index: + grouped_df.reset_index(drop=True, inplace=True) + grouped_df.drop(columns=partition_cols, inplace=True) + # Copy the entire keys df in one operation rather than using iloc + part_names = part_names.to_pandas().to_frame(index=False) + + full_paths = [] + metadata_file_paths = [] + for keys in part_names.itertuples(index=False): + subdir = fs.sep.join( + [f"{name}={val}" for name, val in zip(partition_cols, keys)] + ) + prefix = fs.sep.join([root_path, subdir]) + fs.mkdirs(prefix, exist_ok=True) + filename = filename or _generate_filename() + full_path = fs.sep.join([prefix, filename]) + full_paths.append(full_path) + metadata_file_paths.append(fs.sep.join([subdir, filename])) + + return full_paths, metadata_file_paths, grouped_df, part_offsets, filename + + ParquetWriter = libparquet.ParquetWriter +class ParquetDatasetWriter: + def __init__( + self, + path, + partition_cols, + index=None, + compression=None, + statistics="ROWGROUP", + ) -> None: + """ + Write a parquet file or dataset incrementally + + Parameters + ---------- + path : str + File path or Root Directory path. Will be used as Root Directory + path while writing a partitioned dataset. + partition_cols : list + Column names by which to partition the dataset + Columns are partitioned in the order they are given + index : bool, default None + If ``True``, include the dataframe’s index(es) in the file output. + If ``False``, they will not be written to the file. If ``None``, + index(es) other than RangeIndex will be saved as columns. + compression : {'snappy', None}, default 'snappy' + Name of the compression to use. Use ``None`` for no compression. + statistics : {'ROWGROUP', 'PAGE', 'NONE'}, default 'ROWGROUP' + Level at which column statistics should be included in file. + + + Examples + ________ + Using a context + + >>> df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) + >>> df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) + >>> with ParquetDatasetWriter("./dataset", partition_cols=["a"]) as cw: + ... cw.write_table(df1) + ... cw.write_table(df2) + + By manually calling ``close()`` + + >>> cw = ParquetDatasetWriter("./dataset", partition_cols=["a"]) + >>> cw.write_table(df1) + >>> cw.write_table(df2) + >>> cw.close() + + Both the methods will generate the same directory structure + + .. 
code-block:: bash + + dataset/ + a=1 + .parquet + a=2 + .parquet + a=3 + .parquet + + """ + self.path = path + self.common_args = { + "index": index, + "compression": compression, + "statistics": statistics, + } + self.partition_cols = partition_cols + # Collection of `ParquetWriter`s, and the corresponding + # partition_col values they're responsible for + self._chunked_writers: List[ + Tuple[libparquet.ParquetWriter, List[str], str] + ] = [] + # Map of partition_col values to their ParquetWriter's index + # in self._chunked_writers for reverse lookup + self.path_cw_map: Dict[str, int] = {} + self.filename = None + + def write_table(self, df): + """ + Write a dataframe to the file/dataset + """ + ( + paths, + metadata_file_paths, + grouped_df, + offsets, + self.filename, + ) = _get_partitioned( + df, + self.path, + self.partition_cols, + preserve_index=self.common_args["index"], + filename=self.filename, + ) + + existing_cw_batch = defaultdict(dict) + new_cw_paths = [] + + for path, part_info, meta_path in zip( + paths, + zip(offsets, np.roll(offsets, -1) - offsets), + metadata_file_paths, + ): + if path in self.path_cw_map: # path is a currently open file + cw_idx = self.path_cw_map[path] + existing_cw_batch[cw_idx][path] = part_info + else: # path not currently handled by any chunked writer + new_cw_paths.append((path, part_info, meta_path)) + + # Write out the parts of grouped_df currently handled by existing cw's + for cw_idx, path_to_part_info_map in existing_cw_batch.items(): + cw = self._chunked_writers[cw_idx][0] + # match found paths with this cw's paths and nullify partition info + # for partition_col values not in this batch + this_cw_part_info = [ + path_to_part_info_map.get(path, (0, 0)) + for path in self._chunked_writers[cw_idx][1] + ] + cw.write_table(grouped_df, this_cw_part_info) + + # Create new cw for unhandled paths encountered in this write_table + new_paths, part_info, meta_paths = zip(*new_cw_paths) + self._chunked_writers.append( + ( + ParquetWriter(new_paths, **self.common_args), + new_paths, + meta_paths, + ) + ) + new_cw_idx = len(self._chunked_writers) - 1 + self.path_cw_map.update({k: new_cw_idx for k in new_paths}) + self._chunked_writers[-1][0].write_table(grouped_df, part_info) + + def close(self, return_metadata=False): + """ + Close all open files and optionally return footer metadata as a binary + blob + """ + + metadata = [ + cw.close(metadata_file_path=meta_path if return_metadata else None) + for cw, _, meta_path in self._chunked_writers + ] + + if return_metadata: + return ( + merge_parquet_filemetadata(metadata) + if len(metadata) > 1 + else metadata[0] + ) + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def _check_decimal128_type(arrow_type): if isinstance(arrow_type, pa.Decimal128Type): if arrow_type.precision > cudf.Decimal64Dtype.MAX_PRECISION: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 9a66de8a3a6..016ed1229f1 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -18,7 +18,11 @@ from pyarrow import fs as pa_fs, parquet as pq import cudf -from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata +from cudf.io.parquet import ( + ParquetDatasetWriter, + ParquetWriter, + merge_parquet_filemetadata, +) from cudf.testing import dataset_generator as dg from cudf.testing._utils import ( TIMEDELTA_TYPES, @@ -1573,6 +1577,16 @@ def test_parquet_writer_gpu_chunked(tmpdir, simple_pdf, simple_gdf): 
assert_eq(pd.read_parquet(gdf_fname), pd.concat([simple_pdf, simple_pdf])) +def test_parquet_writer_gpu_chunked_context(tmpdir, simple_pdf, simple_gdf): + gdf_fname = tmpdir.join("gdf.parquet") + + with ParquetWriter(gdf_fname) as writer: + writer.write_table(simple_gdf) + writer.write_table(simple_gdf) + + assert_eq(pd.read_parquet(gdf_fname), pd.concat([simple_pdf, simple_pdf])) + + def test_parquet_write_bytes_io(simple_gdf): output = BytesIO() simple_gdf.to_parquet(output) @@ -1627,6 +1641,73 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): assert fn == filename +@pytest.mark.parametrize("return_meta", [True, False]) +def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta): + pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) + gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) + + df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) + df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) + + cw = ParquetDatasetWriter(gdf_dir, partition_cols=["a"], index=False) + cw.write_table(df1) + cw.write_table(df2) + meta_byte_array = cw.close(return_metadata=return_meta) + pdf = cudf.concat([df1, df2]).to_pandas() + pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"]) + + if return_meta: + fmd = pq.ParquetFile(BytesIO(meta_byte_array)).metadata + assert fmd.num_rows == len(pdf) + assert fmd.num_row_groups == 4 + files = { + os.path.join(directory, files[0]) + for directory, _, files in os.walk(gdf_dir) + if files + } + meta_files = { + os.path.join(gdf_dir, fmd.row_group(i).column(c).file_path) + for i in range(fmd.num_row_groups) + for c in range(fmd.row_group(i).num_columns) + } + assert files == meta_files + + # Read back with pandas to compare + expect_pd = pd.read_parquet(pdf_dir) + got_pd = pd.read_parquet(gdf_dir) + assert_eq(expect_pd, got_pd) + + # Check that cudf and pd return the same read + got_cudf = cudf.read_parquet(gdf_dir) + assert_eq(got_pd, got_cudf) + + +def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): + pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) + gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) + + df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) + df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) + + with ParquetDatasetWriter( + gdf_dir, partition_cols=["a"], index=False + ) as cw: + cw.write_table(df1) + cw.write_table(df2) + + pdf = cudf.concat([df1, df2]).to_pandas() + pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"]) + + # Read back with pandas to compare + expect_pd = pd.read_parquet(pdf_dir) + got_pd = pd.read_parquet(gdf_dir) + assert_eq(expect_pd, got_pd) + + # Check that cudf and pd return the same read + got_cudf = cudf.read_parquet(gdf_dir) + assert_eq(got_pd, got_cudf) + + @pytest.mark.parametrize("cols", [None, ["b"]]) def test_parquet_write_to_dataset(tmpdir_factory, cols): dir1 = tmpdir_factory.mktemp("dir1") From ca77542cab1fc0bcf0d1c8cc67f79ef69fb02536 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 14 Jan 2022 10:11:47 -0500 Subject: [PATCH 16/23] Allow custom sort functions for dask-cudf `sort_values` (#9789) Similar to https://github.com/dask/dask/pull/8345, this PR allows the sorting function called on each partition in last step of dask-cudf's `sort_values` to be generalized, along with the kwargs that are supplied to it. This allows `sort_values` to be extended to support more complex ascending / null position handling. 
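As a rough sketch of the new keywords in use (the toy frame and the `sort_with_nulls_first` helper are invented for this illustration; only the `sort_function` / `sort_function_kwargs` arguments come from this PR):

```python
import cudf
import dask_cudf

df = cudf.DataFrame({"a": [3, None, 1] * 20, "b": [4, 5, 6, 7] * 15})
ddf = dask_cudf.from_cudf(df, npartitions=4)

# Custom per-partition sort function. It receives the standard by/ascending/
# na_position kwargs plus anything passed through sort_function_kwargs.
def sort_with_nulls_first(partition, by_columns, **kwargs):
    return partition.sort_values(by_columns, na_position="first")

result = ddf.sort_values(
    by="a",
    sort_function=sort_with_nulls_first,
    sort_function_kwargs={"by_columns": ["a", "b"]},
)
```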
The context for this PR is a desire to simplify the [sorting algorithm](https://github.com/dask-contrib/dask-sql/blob/main/dask_sql/physical/utils/sort.py) used by dask-sql; since it only really differs from dask-cudf's sorting algorithm in that it uses a custom sorting function, it seems like it would be easier to allow for that extension upstream rather than duplicate code in dask-sql. Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/9789 --- python/dask_cudf/dask_cudf/core.py | 29 +++++++++---------- python/dask_cudf/dask_cudf/sorting.py | 21 ++++++++++++-- python/dask_cudf/dask_cudf/tests/test_sort.py | 19 ++++++++++++ 3 files changed, 51 insertions(+), 18 deletions(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index bf063918c89..e191873f82b 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -235,6 +235,8 @@ def sort_values( set_divisions=False, ascending=True, na_position="last", + sort_function=None, + sort_function_kwargs=None, **kwargs, ): if kwargs: @@ -242,21 +244,18 @@ def sort_values( f"Unsupported input arguments passed : {list(kwargs.keys())}" ) - if self.npartitions == 1: - df = self.map_partitions( - M.sort_values, by, ascending=ascending, na_position=na_position - ) - else: - df = sorting.sort_values( - self, - by, - max_branch=max_branch, - divisions=divisions, - set_divisions=set_divisions, - ignore_index=ignore_index, - ascending=ascending, - na_position=na_position, - ) + df = sorting.sort_values( + self, + by, + max_branch=max_branch, + divisions=divisions, + set_divisions=set_divisions, + ignore_index=ignore_index, + ascending=ascending, + na_position=na_position, + sort_function=sort_function, + sort_function_kwargs=sort_function_kwargs, + ) if ignore_index: return df.reset_index(drop=True) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index e8551493bb1..af40d9ca41b 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -222,6 +222,8 @@ def sort_values( ignore_index=False, ascending=True, na_position="last", + sort_function=None, + sort_function_kwargs=None, ): """Sort by the given list/tuple of column names.""" if not isinstance(ascending, bool): @@ -235,6 +237,21 @@ def sort_values( elif not isinstance(by, list): by = [by] + # parse custom sort function / kwargs if provided + sort_kwargs = { + "by": by, + "ascending": ascending, + "na_position": na_position, + } + if sort_function is None: + sort_function = M.sort_values + if sort_function_kwargs is not None: + sort_kwargs.update(sort_function_kwargs) + + # handle single partition case + if npartitions == 1: + return df.map_partitions(sort_function, **sort_kwargs) + # Step 1 - Calculate new divisions (if necessary) if divisions is None: divisions = quantile_divisions(df, by, npartitions) @@ -265,9 +282,7 @@ def sort_values( df3.divisions = (None,) * (df3.npartitions + 1) # Step 3 - Return final sorted df - df4 = df3.map_partitions( - M.sort_values, by, ascending=ascending, na_position=na_position - ) + df4 = df3.map_partitions(sort_function, **sort_kwargs) if not isinstance(divisions, gd.DataFrame) and set_divisions: # Can't have multi-column divisions elsewhere in dask (yet) df4.divisions = methods.tolist(divisions) diff --git 
a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py
index f4ae83245cb..0b258dd33e7 100644
--- a/python/dask_cudf/dask_cudf/tests/test_sort.py
+++ b/python/dask_cudf/dask_cudf/tests/test_sort.py
@@ -83,3 +83,22 @@
 
     # cudf ordering for nulls is non-deterministic
     dd.assert_eq(got[by], expect[by], check_index=False)
+
+
+@pytest.mark.parametrize("by", [["a", "b"], ["b", "a"]])
+@pytest.mark.parametrize("nparts", [1, 10])
+def test_sort_values_custom_function(by, nparts):
+    df = cudf.DataFrame({"a": [1, 2, 3] * 20, "b": [4, 5, 6, 7] * 15})
+    ddf = dd.from_pandas(df, npartitions=nparts)
+
+    def f(partition, by_columns, ascending, na_position, **kwargs):
+        return partition.sort_values(
+            by_columns, ascending=ascending, na_position=na_position
+        )
+
+    with dask.config.set(scheduler="single-threaded"):
+        got = ddf.sort_values(
+            by=by[0], sort_function=f, sort_function_kwargs={"by_columns": by}
+        )
+        expect = df.sort_values(by=by)
+        dd.assert_eq(got, expect, check_index=False)

From ce31d7d3ad765c88bef9f5e860abe3e5488a1fbd Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 14 Jan 2022 10:55:47 -0500
Subject: [PATCH 17/23] Fix octal pattern matching in regex string (#9993)

Closes #9946

Fixes the decoding logic in the regex pattern compile step to consume only up to the last octal character. The original logic was incorrectly discarding the next pattern character, and if the octal characters were specified at the end of the pattern, invalid bytes were read past the end of the pattern. This is what caused the intermittent failure: sometimes the invalid bytes happened to be 0, which masked the issue.

This PR also includes tests for octal patterns in various positions in the regex pattern.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Conor Hoekstra (https://github.com/codereport)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/9993
---
 cpp/src/strings/regex/regcomp.cpp    |  8 ++++----
 cpp/tests/strings/contains_tests.cpp | 15 ++++++++++++++-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 7da4915d668..8fbd82b8dc7 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -258,10 +258,10 @@ class regex_parser { // treating all quoted numbers as Octal, since we are not supporting backreferences if (yy >= '0' && yy <= '7') { yy = yy - '0'; - char32_t c = *exprp++; + char32_t c = *exprp; while (c >= '0' && c <= '7') { yy = (yy << 3) | (c - '0'); - c = *exprp++; + c = *(++exprp); } return CHAR; } else { @@ -926,7 +926,7 @@ void reprog::optimize2() _startinst_ids.push_back(-1); // terminator mark } -#ifndef NDBUG +#ifndef NDEBUG void reprog::print(regex_flags const flags) { printf("Flags = 0x%08x\n", static_cast(flags)); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index f95b282171f..48c4aac9e8a 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -237,6 +237,19 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) } } +TEST_F(StringsContainsTests, OctalTest) +{ + cudf::test::strings_column_wrapper strings({"AZ", "B", "CDAZEY", ""}); + auto strings_view = cudf::strings_column_view(strings); + cudf::test::fixed_width_column_wrapper expected({1, 0, 1, 0}); + auto results = cudf::strings::contains_re(strings_view, "\\101"); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::contains_re(strings_view, "\\101Z"); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::contains_re(strings_view, "D*\\101\\132"); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, EmbeddedNullCharacter) { std::vector data(10); From b01c8464b2e1412263a7368f01242f6cdd58e89b Mon Sep 17 00:00:00 2001 From: jakirkham Date: Fri, 14 Jan 2022 08:05:54 -0800 Subject: [PATCH 18/23] Allow CuPy 10 (#10048) Relaxes version constraints to allow CuPy 10. xref: https://github.com/rapidsai/integration/pull/413 Authors: - https://github.com/jakirkham Approvers: - Ashwin Srinath (https://github.com/shwina) - Jordan Jacobelli (https://github.com/Ethyling) URL: https://github.com/rapidsai/cudf/pull/10048 --- conda/environments/cudf_dev_cuda11.5.yml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index c258a5caabb..bbbc754e850 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -9,7 +9,7 @@ channels: dependencies: - clang=11.1.0 - clang-tools=11.1.0 - - cupy>=9.5.0,<10.0.0a0 + - cupy>=9.5.0,<11.0.0a0 - rmm=22.02.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 2600ab358cc..a20749bc8c9 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -40,7 +40,7 @@ requirements: - python - typing_extensions - pandas >=1.0,<1.4.0dev0 - - cupy >=9.5.0,<10.0.0a0 + - cupy >=9.5.0,<11.0.0a0 - numba >=0.54 - numpy - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} *cuda From 12adb8a93dc51e14c106b12c4a68f3a2e1fe3207 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 14 Jan 2022 14:59:40 -0600 Subject: [PATCH 19/23] Fix repr and concat of `StructColumn` (#10042) Fixes: #8963 This PR fixes a trivial issue in `concat` where the assumption was that `_with_type_metadata` is an in-place operation, but it isn't. 
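In other words, a minimal sketch of the pattern being corrected (the `col`, `out`, `name`, and `dtype` names here are illustrative):

```python
# buggy: the returned column (which carries the struct type metadata) is discarded
col._with_type_metadata(dtype)

# fixed: _with_type_metadata returns a new column, so the result must be reassigned
out._data[name] = col._with_type_metadata(dtype)
```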
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10042 --- python/cudf/cudf/core/dataframe.py | 14 +++++++++++--- python/cudf/cudf/core/series.py | 9 +++++---- python/cudf/cudf/tests/test_repr.py | 30 +++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 123f86cc200..8fb9b84d96b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1548,10 +1548,18 @@ def _concat( cudf.core.index.as_index(out.index._values) ) - # Reassign precision for any decimal cols + # Reassign precision for decimal cols & type schema for struct cols for name, col in out._data.items(): - if isinstance(col, cudf.core.column.Decimal64Column): - col = col._with_type_metadata(tables[0]._data[name].dtype) + if isinstance( + col, + ( + cudf.core.column.Decimal64Column, + cudf.core.column.StructColumn, + ), + ): + out._data[name] = col._with_type_metadata( + tables[0]._data[name].dtype + ) # Reassign index and column names if isinstance(objs[0].columns, pd.MultiIndex): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 11166320760..7da3bdbb31e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1448,10 +1448,11 @@ def _concat(cls, objs, axis=0, index=True): col = concat_columns([o._column for o in objs]) - if isinstance(col, cudf.core.column.Decimal64Column): - col = col._with_type_metadata(objs[0]._column.dtype) - - if isinstance(col, cudf.core.column.StructColumn): + # Reassign precision for decimal cols & type schema for struct cols + if isinstance( + col, + (cudf.core.column.Decimal64Column, cudf.core.column.StructColumn), + ): col = col._with_type_metadata(objs[0].dtype) return cls(data=col, index=index, name=name) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index f8c136b8c2d..82020f30f7c 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1475,3 +1475,33 @@ def test_empty_series_name(): gs = cudf.from_pandas(ps) assert ps.__repr__() == gs.__repr__() + + +def test_repr_struct_after_concat(): + df = cudf.DataFrame( + { + "a": cudf.Series( + [ + {"sa": 2056831253}, + {"sa": -1463792165}, + {"sa": 1735783038}, + {"sa": 103774433}, + {"sa": -1413247520}, + ] + * 13 + ), + "b": cudf.Series( + [ + {"sa": {"ssa": 1140062029}}, + None, + {"sa": {"ssa": 1998862860}}, + {"sa": None}, + {"sa": {"ssa": -395088502}}, + ] + * 13 + ), + } + ) + pdf = df.to_pandas() + + assert df.__repr__() == pdf.__repr__() From 8c8d6ef7fdc8f17159df63182ee9e9b0cf8df3b1 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 14 Jan 2022 17:22:07 -0600 Subject: [PATCH 20/23] Fix dataframe setitem with `ndarray` types (#10056) Fixes: #9928 This PR fixes 2d array assignment in `setitem` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/10056 --- python/cudf/cudf/core/dataframe.py | 18 ++++++++++++++++-- python/cudf/cudf/tests/test_dataframe.py | 11 +++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py 
b/python/cudf/cudf/core/dataframe.py index 8fb9b84d96b..6bbb2fca77c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1123,7 +1123,15 @@ def __setitem__(self, arg, value): for col_name in self._data: self._data[col_name][mask] = value else: - if isinstance(value, DataFrame): + if isinstance(value, (cupy.ndarray, np.ndarray)): + _setitem_with_dataframe( + input_df=self, + replace_df=cudf.DataFrame(value), + input_cols=arg, + mask=None, + ignore_index=True, + ) + elif isinstance(value, DataFrame): _setitem_with_dataframe( input_df=self, replace_df=value, @@ -6401,6 +6409,7 @@ def _setitem_with_dataframe( replace_df: DataFrame, input_cols: Any = None, mask: Optional[cudf.core.column.ColumnBase] = None, + ignore_index: bool = False, ): """ This function sets item dataframes relevant columns with replacement df @@ -6408,6 +6417,7 @@ def _setitem_with_dataframe( :param replace_df: Replacement DataFrame to replace values with :param input_cols: columns to replace in the input dataframe :param mask: boolean mask in case of masked replacing + :param ignore_index: Whether to conduct index equality and reindex """ if input_cols is None: @@ -6418,7 +6428,11 @@ def _setitem_with_dataframe( "Number of Input Columns must be same replacement Dataframe" ) - if len(input_df) != 0 and not input_df.index.equals(replace_df.index): + if ( + not ignore_index + and len(input_df) != 0 + and not input_df.index.equals(replace_df.index) + ): replace_df = replace_df.reindex(input_df.index) for col_1, col_2 in zip(input_cols, replace_df.columns): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e5b298a8448..372587ba677 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9030,3 +9030,14 @@ def test_dataframe_add_suffix(): expected = pdf.add_suffix("_item") assert_eq(got, expected) + + +def test_dataframe_assign_cp_np_array(): + m, n = 5, 3 + cp_ndarray = cupy.random.randn(m, n) + pdf = pd.DataFrame({f"f_{i}": range(m) for i in range(n)}) + gdf = cudf.DataFrame({f"f_{i}": range(m) for i in range(n)}) + pdf[[f"f_{i}" for i in range(n)]] = cupy.asnumpy(cp_ndarray) + gdf[[f"f_{i}" for i in range(n)]] = cp_ndarray + + assert_eq(pdf, gdf) From e24fa8f0b0cca2c9a441002623fdbc40631ed369 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 14 Jan 2022 16:35:43 -0800 Subject: [PATCH 21/23] Run doctests. (#9815) This PR adds doctests and resolves #9513. 
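For context, a minimal sketch of how such a suite can be wired up with `pytest` (an illustrative approach, not necessarily the exact contents of the new `test_doctests.py`):

```python
import doctest
import importlib

import pytest

# A couple of representative modules; the real test walks the whole package.
MODULES = ["cudf.core.dataframe", "cudf.core.series"]


@pytest.mark.parametrize("name", MODULES)
def test_docstring_examples_run(name):
    # Any example whose printed output no longer matches its docstring fails.
    results = doctest.testmod(importlib.import_module(name), verbose=False)
    assert results.failed == 0
```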
Several issues were found by running doctests that have now been resolved: - [x] #9821 - [x] #9822 - [x] #9823 - [x] #9824 - [x] #9825 - [x] #9826 - [x] #9827 - [x] #9828 (workaround by deleting doctests) - [x] #9829 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ashwin Srinath (https://github.com/shwina) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9815 --- python/cudf/cudf/__init__.py | 63 ++++++++++ python/cudf/cudf/api/__init__.py | 4 +- python/cudf/cudf/api/extensions/__init__.py | 6 + python/cudf/cudf/core/_base_index.py | 14 ++- python/cudf/cudf/core/column/categorical.py | 1 - python/cudf/cudf/core/dataframe.py | 122 ++++++++++++-------- python/cudf/cudf/core/groupby/groupby.py | 11 +- python/cudf/cudf/core/index.py | 72 ++++++------ python/cudf/cudf/core/multiindex.py | 16 +-- python/cudf/cudf/core/reshape.py | 3 +- python/cudf/cudf/core/scalar.py | 2 +- python/cudf/cudf/core/series.py | 42 ++++--- python/cudf/cudf/core/tools/datetimes.py | 8 +- python/cudf/cudf/tests/test_dataframe.py | 81 ++++++++----- python/cudf/cudf/tests/test_doctests.py | 102 ++++++++++++++++ python/cudf/cudf/utils/docutils.py | 14 +-- python/cudf/cudf/utils/ioutils.py | 6 +- 17 files changed, 402 insertions(+), 165 deletions(-) create mode 100644 python/cudf/cudf/tests/test_doctests.py diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 2461e7b09bc..4dadf6a1869 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -126,3 +126,66 @@ __version__ = get_versions()["version"] del get_versions + +__all__ = [ + "BaseIndex", + "CategoricalDtype", + "CategoricalIndex", + "DataFrame", + "DateOffset", + "DatetimeIndex", + "Decimal32Dtype", + "Decimal64Dtype", + "Float32Index", + "Float64Index", + "GenericIndex", + "Grouper", + "Index", + "Int16Index", + "Int32Index", + "Int64Index", + "Int8Index", + "IntervalDtype", + "IntervalIndex", + "ListDtype", + "MultiIndex", + "NA", + "RangeIndex", + "Scalar", + "Series", + "StringIndex", + "StructDtype", + "TimedeltaIndex", + "UInt16Index", + "UInt32Index", + "UInt64Index", + "UInt8Index", + "api", + "concat", + "cut", + "date_range", + "factorize", + "from_dataframe", + "from_dlpack", + "from_pandas", + "get_dummies", + "interval_range", + "isclose", + "melt", + "merge", + "merge_sorted", + "pivot", + "read_avro", + "read_csv", + "read_feather", + "read_hdf", + "read_json", + "read_orc", + "read_parquet", + "read_text", + "set_allocator", + "testing", + "to_datetime", + "to_numeric", + "unstack", +] diff --git a/python/cudf/cudf/api/__init__.py b/python/cudf/cudf/api/__init__.py index 21c24015e41..c66bfb4efeb 100644 --- a/python/cudf/cudf/api/__init__.py +++ b/python/cudf/cudf/api/__init__.py @@ -1,3 +1,5 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
-from cudf.api import types +from cudf.api import extensions, types + +__all__ = ["extensions", "types"] diff --git a/python/cudf/cudf/api/extensions/__init__.py b/python/cudf/cudf/api/extensions/__init__.py index c971e6f7731..eeb5dcdb32a 100644 --- a/python/cudf/cudf/api/extensions/__init__.py +++ b/python/cudf/cudf/api/extensions/__init__.py @@ -5,3 +5,9 @@ register_index_accessor, register_series_accessor, ) + +__all__ = [ + "register_dataframe_accessor", + "register_index_accessor", + "register_series_accessor", +] diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 683f3fefe1c..4f2614e843f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -495,7 +495,7 @@ def fillna(self, value, downcast=None): >>> import cudf >>> index = cudf.Index([1, 2, None, 4]) >>> index - Int64Index([1, 2, null, 4], dtype='int64') + Int64Index([1, 2, , 4], dtype='int64') >>> index.fillna(3) Int64Index([1, 2, 3, 4], dtype='int64') """ @@ -553,7 +553,7 @@ def to_pandas(self): >>> type(idx.to_pandas()) >>> type(idx) - + """ return pd.Index(self._values.to_pandas(), name=self.name) @@ -942,6 +942,7 @@ def is_interval(self): Examples -------- >>> import cudf + >>> import pandas as pd >>> idx = cudf.from_pandas( ... pd.Index([pd.Interval(left=0, right=5), ... pd.Interval(left=5, right=10)]) @@ -1105,15 +1106,16 @@ def join( Examples -------- >>> import cudf - >>> lhs = cudf.DataFrame( - ... {"a":[2, 3, 1], "b":[3, 4, 2]}).set_index(['a', 'b'] - ... ).index + >>> lhs = cudf.DataFrame({ + ... "a": [2, 3, 1], + ... "b": [3, 4, 2], + ... }).set_index(['a', 'b']).index >>> lhs MultiIndex([(2, 3), (3, 4), (1, 2)], names=['a', 'b']) - >>> rhs = cudf.DataFrame({"a":[1, 4, 3]}).set_index('a').index + >>> rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index('a').index >>> rhs Int64Index([1, 4, 3], dtype='int64', name='a') >>> lhs.join(rhs, how='inner') diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 4be7a422de0..de06e62cbb1 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -61,7 +61,6 @@ class CategoricalAccessor(ColumnMethods): -------- >>> s = cudf.Series([1,2,3], dtype='category') >>> s - >>> s 0 1 1 2 2 3 diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6bbb2fca77c..69600426ec0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -463,12 +463,12 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): ... [(t0+ timedelta(seconds=x)) for x in range(n)]) ... }) >>> df - id datetimes - 0 0 2018-10-07T12:00:00.000 - 1 1 2018-10-07T12:00:01.000 - 2 2 2018-10-07T12:00:02.000 - 3 3 2018-10-07T12:00:03.000 - 4 4 2018-10-07T12:00:04.000 + id datetimes + 0 0 2018-10-07 12:00:00 + 1 1 2018-10-07 12:00:01 + 2 2 2018-10-07 12:00:02 + 3 3 2018-10-07 12:00:03 + 4 4 2018-10-07 12:00:04 Build DataFrame via list of rows as tuples: @@ -984,23 +984,34 @@ def __getitem__(self, arg): Examples -------- - >>> df = DataFrame([('a', list(range(20))), - ... ('b', list(range(20))), - ... ('c', list(range(20)))]) - >>> df[:4] # get first 4 rows of all columns + >>> df = cudf.DataFrame({ + ... 'a': list(range(10)), + ... 'b': list(range(10)), + ... 'c': list(range(10)), + ... }) + + Get first 4 rows of all columns. 
+ + >>> df[:4] a b c 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 - >>> df[-5:] # get last 5 rows of all columns - a b c - 15 15 15 15 - 16 16 16 16 - 17 17 17 17 - 18 18 18 18 - 19 19 19 19 - >>> df[['a', 'c']] # get columns a and c + + Get last 5 rows of all columns. + + >>> df[-5:] + a b c + 5 5 5 5 + 6 6 6 6 + 7 7 7 7 + 8 8 8 8 + 9 9 9 9 + + Get columns a and c. + + >>> df[['a', 'c']] a c 0 0 0 1 1 1 @@ -1012,8 +1023,17 @@ def __getitem__(self, arg): 7 7 7 8 8 8 9 9 9 - >>> df[[True, False, True, False]] # mask the entire dataframe, - # returning the rows specified in the boolean mask + + Return the rows specified in the boolean mask. + + >>> df[[True, False, True, False, True, + ... False, True, False, True, False]] + a b c + 0 0 0 0 + 2 2 2 2 + 4 4 4 4 + 6 6 6 6 + 8 8 8 8 """ if _is_scalar_or_zero_d_array(arg) or isinstance(arg, tuple): return self._get_columns_by_label(arg, downcast=True) @@ -1261,10 +1281,12 @@ def memory_usage(self, index=True, deep=False): object 40000 bool 5000 dtype: int64 + Use a Categorical for efficient storage of an object-dtype column with many repeated values. + >>> df['object'].astype('category').memory_usage(deep=True) - 5048 + 5008 """ if deep: warnings.warn( @@ -2225,11 +2247,11 @@ def reindex( 3 3 13.0 4 4 14.0 >>> df_new - key val sum - 0 0 10.0 NaN - 3 3 13.0 NaN - 4 4 14.0 NaN - 5 -1 NaN NaN + key val sum + 0 0 10.0 + 3 3 13.0 + 4 4 14.0 + 5 """ if labels is None and index is None and columns is None: @@ -3701,10 +3723,10 @@ def query(self, expr, local_dict=None): Examples -------- - >>> import cudf - >>> a = ('a', [1, 2, 2]) - >>> b = ('b', [3, 4, 5]) - >>> df = cudf.DataFrame([a, b]) + >>> df = cudf.DataFrame({ + ... "a": [1, 2, 2], + ... "b": [3, 4, 5], + ... }) >>> expr = "(a == 2 and b == 4) or (b == 3)" >>> df.query(expr) a b @@ -3720,8 +3742,8 @@ def query(self, expr, local_dict=None): >>> df['datetimes'] = data >>> search_date = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') >>> df.query('datetimes==@search_date') - datetimes - 1 2018-10-08T00:00:00.000 + datetimes + 1 2018-10-08 Using local_dict: @@ -3732,9 +3754,9 @@ def query(self, expr, local_dict=None): >>> df['datetimes'] = data >>> search_date2 = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') >>> df.query('datetimes==@search_date', - ... local_dict={'search_date':search_date2}) - datetimes - 1 2018-10-08T00:00:00.000 + ... local_dict={'search_date': search_date2}) + datetimes + 1 2018-10-08 """ # can't use `annotate` decorator here as we inspect the calling # environment. @@ -4189,18 +4211,23 @@ def info( dtypes: float64(1), int64(1), object(1) memory usage: 130.0+ bytes - Pipe output of DataFrame.info to buffer instead of sys.stdout, - get buffer content and writes to a text file: + Pipe output of DataFrame.info to a buffer instead of sys.stdout and + print buffer contents: >>> import io >>> buffer = io.StringIO() >>> df.info(buf=buffer) - >>> s = buffer.getvalue() - >>> with open("df_info.txt", "w", - ... encoding="utf-8") as f: - ... f.write(s) - ... 
- 369 + >>> print(buffer.getvalue()) + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 130.0+ bytes The `memory_usage` parameter allows deep introspection mode, specially useful for big DataFrames and fine-tune memory optimization: @@ -5761,7 +5788,7 @@ def stack(self, level=-1, dropna=True): Examples -------- >>> import cudf - >>> df = cudf.DataFrame({'a':[0,1,3], 'b':[1,2,4]}) + >>> df = cudf.DataFrame({'a': [0, 1, 3], 'b': [1, 2, 4]}) >>> df.stack() 0 a 0 b 1 @@ -6084,8 +6111,11 @@ def explode(self, column, ignore_index=False): Examples -------- >>> import cudf - >>> cudf.DataFrame( - {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]}) + >>> df = cudf.DataFrame({ + ... "a": [[1, 2, 3], [], None, [4, 5]], + ... "b": [11, 22, 33, 44], + ... }) + >>> df a b 0 [1, 2, 3] 11 1 [] 22 diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 08ef3f07776..5b041ba53b9 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1256,9 +1256,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): -------- >>> import cudf >>> import pandas as pd - >>> df = cudf.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed': [380., 370., 24., 26.]}) + >>> df = cudf.DataFrame({ + ... 'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.], + ... }) >>> df Animal Max Speed 0 Falcon 380.0 @@ -1272,10 +1273,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): Parrot 25.0 >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] + ... ['Captive', 'Wild', 'Captive', 'Wild']] >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) >>> df = cudf.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - index=index) + ... index=index) >>> df Max Speed Animal Type diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 859a81bc5f4..1e493708415 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1535,9 +1535,11 @@ class DatetimeIndex(GenericIndex): -------- >>> import cudf >>> cudf.DatetimeIndex([1, 2, 3, 4], name="a") - DatetimeIndex(['1970-01-01 00:00:00.001000', '1970-01-01 00:00:00.002000', - '1970-01-01 00:00:00.003000', '1970-01-01 00:00:00.004000'], - dtype='datetime64[ms]', name='a') + DatetimeIndex(['1970-01-01 00:00:00.000000001', + '1970-01-01 00:00:00.000000002', + '1970-01-01 00:00:00.000000003', + '1970-01-01 00:00:00.000000004'], + dtype='datetime64[ns]', name='a') """ def __init__( @@ -1899,12 +1901,13 @@ def ceil(self, freq): Examples -------- >>> import cudf - >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", - ... "1999-12-31 18:40:00"]) + >>> gIndex = cudf.DatetimeIndex([ + ... "2020-05-31 08:05:42", + ... "1999-12-31 18:40:30", + ... 
]) >>> gIndex.ceil("T") - DatetimeIndex(['2020-05-31 08:00:00', '1999-12-31 18:40:00'], - dtype='datetime64[ns]', freq=None) - """ + DatetimeIndex(['2020-05-31 08:06:00', '1999-12-31 18:41:00'], dtype='datetime64[ns]') + """ # noqa: E501 out_column = self._values.ceil(freq) return self.__class__._from_data({self.name: out_column}) @@ -1930,12 +1933,13 @@ def floor(self, freq): Examples -------- >>> import cudf - >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:59:59" - ... ,"1999-12-31 18:44:59"]) + >>> gIndex = cudf.DatetimeIndex([ + ... "2020-05-31 08:59:59", + ... "1999-12-31 18:44:59", + ... ]) >>> gIndex.floor("T") - DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], - dtype='datetime64[ns]', freq=None) - """ + DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], dtype='datetime64[ns]') + """ # noqa: E501 out_column = self._values.floor(freq) return self.__class__._from_data({self.name: out_column}) @@ -1967,21 +1971,14 @@ def round(self, freq): ... "2001-01-01 00:05:04", ... ], dtype="datetime64[ns]") >>> dt_idx - DatetimeIndex(['2001-01-01 00:04:45', - '2001-01-01 00:05:04', - '2001-01-01 00:04:58'], - dtype='datetime64[ns]', freq=None) + DatetimeIndex(['2001-01-01 00:04:45', '2001-01-01 00:04:58', + '2001-01-01 00:05:04'], + dtype='datetime64[ns]') >>> dt_idx.round('H') - DatetimeIndex(['2001-01-01', - '2001-01-01', - '2001-01-01'], - dtype='datetime64[ns]', freq=None) + DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01'], dtype='datetime64[ns]') >>> dt_idx.round('T') - DatetimeIndex(['2001-01-01 00:05:00', - '2001-01-01 00:05:00', - '2001-01-01 00:05:00'], - dtype='datetime64[ns]', freq=None) - """ + DatetimeIndex(['2001-01-01 00:05:00', '2001-01-01 00:05:00', '2001-01-01 00:05:00'], dtype='datetime64[ns]') + """ # noqa: E501 out_column = self._values.round(freq) return self.__class__._from_data({self.name: out_column}) @@ -2018,14 +2015,15 @@ class TimedeltaIndex(GenericIndex): -------- >>> import cudf >>> cudf.TimedeltaIndex([1132223, 2023232, 342234324, 4234324], - ... dtype='timedelta64[ns]') - TimedeltaIndex(['00:00:00.001132', '00:00:00.002023', '00:00:00.342234', - '00:00:00.004234'], - dtype='timedelta64[ns]') - >>> cudf.TimedeltaIndex([1, 2, 3, 4], dtype='timedelta64[s]', + ... dtype="timedelta64[ns]") + TimedeltaIndex(['0 days 00:00:00.001132223', '0 days 00:00:00.002023232', + '0 days 00:00:00.342234324', '0 days 00:00:00.004234324'], + dtype='timedelta64[ns]') + >>> cudf.TimedeltaIndex([1, 2, 3, 4], dtype="timedelta64[s]", ... name="delta-index") - TimedeltaIndex(['00:00:01', '00:00:02', '00:00:03', '00:00:04'], - dtype='timedelta64[s]', name='delta-index') + TimedeltaIndex(['0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03', + '0 days 00:00:04'], + dtype='timedelta64[s]', name='delta-index') """ def __init__( @@ -2154,11 +2152,11 @@ class CategoricalIndex(GenericIndex): >>> import pandas as pd >>> cudf.CategoricalIndex( ... data=[1, 2, 3, 4], categories=[1, 2], ordered=False, name="a") - CategoricalIndex([1, 2, , ], categories=[1, 2], ordered=False, name='a', dtype='category', name='a') + CategoricalIndex([1, 2, , ], categories=[1, 2], ordered=False, dtype='category', name='a') >>> cudf.CategoricalIndex( ... 
data=[1, 2, 3, 4], dtype=pd.CategoricalDtype([1, 2, 3]), name="a") - CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, name='a', dtype='category', name='a') + CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, dtype='category', name='a') """ # noqa: E501 def __init__( @@ -2449,9 +2447,7 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): >>> import cudf >>> import pandas as pd >>> cudf.IntervalIndex.from_breaks([0, 1, 2, 3]) - IntervalIndex([(0, 1], (1, 2], (2, 3]], - closed='right', - dtype='interval[int64]') + IntervalIndex([(0, 1], (1, 2], (2, 3]], dtype='interval') """ if copy: breaks = column.as_column(breaks, dtype=dtype).copy() diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b333c862f21..3acc947c649 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -190,7 +190,7 @@ def rename(self, names, inplace=False): Renaming each levels of a MultiIndex to specified name: >>> midx = cudf.MultiIndex.from_product( - [('A', 'B'), (2020, 2021)], names=['c1', 'c2']) + ... [('A', 'B'), (2020, 2021)], names=['c1', 'c2']) >>> midx.rename(['lv1', 'lv2']) MultiIndex([('A', 2020), ('A', 2021), @@ -1086,7 +1086,7 @@ def values(self): [4, 2], [5, 1]]) >>> type(midx.values) - + """ return self.to_frame(index=False).values @@ -1587,13 +1587,13 @@ def get_loc(self, key, method=None, tolerance=None): -------- >>> import cudf >>> mi = cudf.MultiIndex.from_tuples( - [('a', 'd'), ('b', 'e'), ('b', 'f')]) + ... [('a', 'd'), ('b', 'e'), ('b', 'f')]) >>> mi.get_loc('b') slice(1, 3, None) >>> mi.get_loc(('b', 'e')) 1 >>> non_monotonic_non_unique_idx = cudf.MultiIndex.from_tuples( - [('c', 'd'), ('b', 'e'), ('a', 'f'), ('b', 'e')]) + ... [('c', 'd'), ('b', 'e'), ('a', 'f'), ('b', 'e')]) >>> non_monotonic_non_unique_idx.get_loc('b') # differ from pandas slice(1, 4, 2) @@ -1609,10 +1609,10 @@ def get_loc(self, key, method=None, tolerance=None): >>> import pandas as pd >>> import cudf - >>> x = pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), - (1, 1, 1), (1, 1, 1), (2, 2, 1)] - ) + >>> x = pd.MultiIndex.from_tuples([ + ... (2, 1, 1), (1, 2, 3), (1, 2, 1), + ... (1, 1, 1), (1, 1, 1), (2, 2, 1), + ... 
]) >>> x.get_loc(1) array([False, True, True, True, True, False]) >>> cudf.from_pandas(x).get_loc(1) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 1733a6c0b9a..68113cfdca9 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -891,7 +891,7 @@ def pivot(data, index=None, columns=None, values=None): Examples -------- >>> a = cudf.DataFrame() - >>> a['a'] = [1, 1, 2, 2], + >>> a['a'] = [1, 1, 2, 2] >>> a['b'] = ['a', 'b', 'a', 'b'] >>> a['c'] = [1, 2, 3, 4] >>> a.pivot(index='a', columns='b') @@ -973,6 +973,7 @@ def unstack(df, level, fill_value=None): Examples -------- + >>> df = cudf.DataFrame() >>> df['a'] = [1, 1, 1, 2, 2] >>> df['b'] = [1, 2, 3, 1, 2] >>> df['c'] = [5, 6, 7, 8, 9] diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 787b28e213c..37bb8e32c5a 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -32,7 +32,7 @@ class Scalar(object): >>> cudf.Scalar(42, dtype='int64') + np.int8(21) Scalar(63, dtype=int64) >>> x = cudf.Scalar(42, dtype='datetime64[s]') - >>> y = cudf.Scalar(21, dtype='timedelta64[ns]) + >>> y = cudf.Scalar(21, dtype='timedelta64[ns]') >>> x - y Scalar(1970-01-01T00:00:41.999999979, dtype=datetime64[ns]) >>> cudf.Series([1,2,3]) + cudf.Scalar(1) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7da3bdbb31e..6842a05a505 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -512,13 +512,26 @@ def from_pandas(cls, s, nan_as_null=None): @property def dt(self): """ - Accessor object for datetimelike properties of the Series values. + Accessor object for datetime-like properties of the Series values. Examples -------- + >>> s = cudf.Series(cudf.date_range( + ... start='2001-02-03 12:00:00', + ... end='2001-02-03 14:00:00', + ... freq='1H')) >>> s.dt.hour + 0 12 + 1 13 + dtype: int16 >>> s.dt.second + 0 0 + 1 0 + dtype: int16 >>> s.dt.day + 0 3 + 1 3 + dtype: int16 Returns ------- @@ -674,10 +687,12 @@ def drop( y 3 2 x 4 y 5 + dtype: int64 >>> s.drop(labels='y', level=1) 0 x 0 1 x 2 2 x 4 + Name: 2, dtype: int64 """ if labels is not None: if index is not None or columns is not None: @@ -1032,7 +1047,7 @@ def memory_usage(self, index=True, deep=False): -------- >>> s = cudf.Series(range(3), index=['a','b','c']) >>> s.memory_usage() - 48 + 43 Not including the index gives the size of the rest of the data, which is necessarily smaller: @@ -1539,7 +1554,7 @@ def dropna(self, axis=0, inplace=False, how=None): >>> ser 0 1 1 2 - 2 null + 2 dtype: int64 Drop null values from a Series. 
@@ -1800,7 +1815,7 @@ def data(self): 3 4 dtype: int64 >>> series.data - + >>> series.data.to_host_array() array([1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) @@ -1824,14 +1839,7 @@ def as_mask(self): >>> import cudf >>> s = cudf.Series([True, False, True]) >>> s.as_mask() - - >>> s.as_mask().to_host_array() - array([ 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 181, 164, - 188, 1, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, - 127, 253, 214, 62, 241, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - dtype=uint8) + """ if not is_bool_dtype(self.dtype): raise TypeError( @@ -2805,11 +2813,11 @@ def autocorr(self, lag=1): Examples -------- >>> import cudf - >>> s = cudf.Series([0.25, 0.5, 0.2, -0.05]) + >>> s = cudf.Series([0.25, 0.5, 0.2, -0.05, 0.17]) >>> s.autocorr() - 0.10355263309024071 + 0.1438853844... >>> s.autocorr(lag=2) - -0.9999999999999999 + -0.9647548490... """ return self.corr(self.shift(lag)) @@ -3584,7 +3592,7 @@ def keys(self): dtype: int64 >>> sr.keys() - RangeIndex(start=0, stop=6) + RangeIndex(start=0, stop=6, step=1) >>> sr = cudf.Series(['a', 'b', 'c']) >>> sr 0 a @@ -3592,7 +3600,7 @@ def keys(self): 2 c dtype: object >>> sr.keys() - RangeIndex(start=0, stop=3) + RangeIndex(start=0, stop=3, step=1) >>> sr = cudf.Series([1, 2, 3], index=['a', 'b', 'c']) >>> sr a 1 diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 15426d0173a..62c31691ac1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -397,10 +397,10 @@ class DateOffset: -------- >>> from cudf import DateOffset >>> ts = cudf.Series([ - "2000-01-01 00:00:00.012345678", - "2000-01-31 00:00:00.012345678", - "2000-02-29 00:00:00.012345678", - ], dtype='datetime64[ns]) + ... "2000-01-01 00:00:00.012345678", + ... "2000-01-31 00:00:00.012345678", + ... "2000-02-29 00:00:00.012345678", + ... ], dtype='datetime64[ns]') >>> ts + DateOffset(months=3) 0 2000-04-01 00:00:00.012345678 1 2000-04-30 00:00:00.012345678 diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 372587ba677..6171f20929d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -746,18 +746,31 @@ def test_index_astype(nelem): np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) -def test_dataframe_to_string(): - pd.options.display.max_rows = 5 - pd.options.display.max_columns = 8 - # Test basic +def test_dataframe_to_string_with_skipped_rows(): + # Test skipped rows df = cudf.DataFrame( {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} ) - string = str(df) - assert string.splitlines()[-1] == "[6 rows x 2 columns]" + with pd.option_context("display.max_rows", 5): + got = df.to_string() + + expect = textwrap.dedent( + """\ + a b + 0 1 11 + 1 2 12 + .. .. .. + 4 5 15 + 5 6 16 + + [6 rows x 2 columns]""" + ) + assert got == expect - # Test skipped columns + +def test_dataframe_to_string_with_skipped_rows_and_columns(): + # Test skipped rows and skipped columns df = cudf.DataFrame( { "a": [1, 2, 3, 4, 5, 6], @@ -766,11 +779,26 @@ def test_dataframe_to_string(): "d": [11, 12, 13, 14, 15, 16], } ) - string = df.to_string() - assert string.splitlines()[-1] == "[6 rows x 4 columns]" + with pd.option_context("display.max_rows", 5, "display.max_columns", 3): + got = df.to_string() + + expect = textwrap.dedent( + """\ + a ... d + 0 1 ... 
11 + 1 2 ... 12 + .. .. ... .. + 4 5 ... 15 + 5 6 ... 16 + + [6 rows x 4 columns]""" + ) + assert got == expect + - # Test masked +def test_dataframe_to_string_with_masked_data(): + # Test masked data df = cudf.DataFrame( {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} ) @@ -783,34 +811,33 @@ def test_dataframe_to_string(): assert masked.null_count == 2 df["c"] = masked - # check data + # Check data values = masked.copy() validids = [0, 2, 3, 5] densearray = masked.dropna().to_numpy() np.testing.assert_equal(data[validids], densearray) - # valid position is correct - + # Valid position is correct for i in validids: assert data[i] == values[i] - # null position is correct + # Null position is correct for i in range(len(values)): if i not in validids: assert values[i] is cudf.NA - pd.options.display.max_rows = 10 - got = df.to_string() + with pd.option_context("display.max_rows", 10): + got = df.to_string() - expect = """ -a b c -0 1 11 0 -1 2 12 -2 3 13 2 -3 4 14 3 -4 5 15 -5 6 16 5 -""" - # values should match despite whitespace difference - assert got.split() == expect.split() + expect = textwrap.dedent( + """\ + a b c + 0 1 11 0 + 1 2 12 + 2 3 13 2 + 3 4 14 3 + 4 5 15 + 5 6 16 5""" + ) + assert got == expect def test_dataframe_to_string_wide(monkeypatch): diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py new file mode 100644 index 00000000000..05d6886c297 --- /dev/null +++ b/python/cudf/cudf/tests/test_doctests.py @@ -0,0 +1,102 @@ +import contextlib +import doctest +import inspect +import io +import os + +import numpy as np +import pytest + +import cudf + + +def _name_in_all(parent, name): + return name in getattr(parent, "__all__", []) + + +def _is_public_name(parent, name): + return not name.startswith("_") + + +def _find_doctests_in_obj(obj, finder=None, criteria=None): + """Find all doctests in an object. + + Parameters + ---------- + obj : module or class + The object to search for docstring examples. + finder : doctest.DocTestFinder, optional + The DocTestFinder object to use. If not provided, a DocTestFinder is + constructed. + criteria : callable, optional + Callable indicating whether to recurse over members of the provided + object. If not provided, names not defined in the object's ``__all__`` + property are ignored. + + Yields + ------ + doctest.DocTest + The next doctest found in the object. + """ + if finder is None: + finder = doctest.DocTestFinder() + if criteria is None: + criteria = _name_in_all + for docstring in finder.find(obj): + if docstring.examples: + yield docstring + for name, member in inspect.getmembers(obj): + # Only recurse over members matching the criteria + if not criteria(obj, name): + continue + # Recurse over the public API of modules (objects defined in the + # module's __all__) + if inspect.ismodule(member): + yield from _find_doctests_in_obj( + member, finder, criteria=_name_in_all + ) + # Recurse over the public API of classes (attributes not prefixed with + # an underscore) + if inspect.isclass(member): + yield from _find_doctests_in_obj( + member, finder, criteria=_is_public_name + ) + + +class TestDoctests: + @pytest.fixture(autouse=True) + def chdir_to_tmp_path(cls, tmp_path): + # Some doctests generate files, so this fixture runs the tests in a + # temporary directory. 
+ original_directory = os.getcwd() + os.chdir(tmp_path) + yield + os.chdir(original_directory) + + @pytest.mark.parametrize( + "docstring", + _find_doctests_in_obj(cudf), + ids=lambda docstring: docstring.name, + ) + def test_docstring(self, docstring): + # We ignore differences in whitespace in the doctest output, and enable + # the use of an ellipsis "..." to match any string in the doctest + # output. An ellipsis is useful for, e.g., memory addresses or + # imprecise floating point values. + optionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE + runner = doctest.DocTestRunner(optionflags=optionflags) + + # These global names are pre-defined and can be used in doctests + # without first importing them. + globals = dict(cudf=cudf, np=np,) + docstring.globs = globals + + # Capture stdout and include failing outputs in the traceback. + doctest_stdout = io.StringIO() + with contextlib.redirect_stdout(doctest_stdout): + runner.run(docstring) + results = runner.summarize() + assert not results.failed, ( + f"{results.failed} of {results.attempted} doctests failed for " + f"{docstring.name}:\n{doctest_stdout.getvalue()}" + ) diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 7a4a2673f9b..2fcf996b641 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -225,13 +225,13 @@ def wrapper(func): 2 2010-01-01 dtype: datetime64[s] >>> s.describe() - count 3 - mean 2006-09-01 08:00:00.000000000 - min 2000-01-01 00:00:00.000000000 - 25% 2004-12-31 12:00:00.000000000 - 50% 2010-01-01 00:00:00.000000000 - 75% 2010-01-01 00:00:00.000000000 - max 2010-01-01 00:00:00.000000000 + count 3 + mean 2006-09-01 08:00:00 + min 2000-01-01 00:00:00 + 25% 2004-12-31 12:00:00 + 50% 2010-01-01 00:00:00 + 75% 2010-01-01 00:00:00 + max 2010-01-01 00:00:00 dtype: object Describing a ``DataFrame``. By default only numeric fields are diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index b881f9372bc..6f958860dad 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -972,9 +972,9 @@ >>> import cudf >>> filename = 'foo.csv' >>> df = cudf.DataFrame({'x': [0, 1, 2, 3], - 'y': [1.0, 3.3, 2.2, 4.4], - 'z': ['a', 'b', 'c', 'd']}) ->>> df = df.set_index([3, 2, 1, 0]) +... 'y': [1.0, 3.3, 2.2, 4.4], +... 'z': ['a', 'b', 'c', 'd']}) +>>> df = df.set_index(cudf.Series([3, 2, 1, 0])) >>> df.to_csv(filename) """ From 7ff5f128bec185c40017bab20c08f1342fa6b74e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 15 Jan 2022 14:12:12 -0700 Subject: [PATCH 22/23] Support structs for `cudf::contains` with column/scalar input (#9929) This PR adds support for `cudf::contains` so we can check whether a structs column contains a scalar struct element. Partially addresses #8965. This does not support checking if structs given in a structs column exist in another structs column. Such cases will be supported when the new data structure mentioned in https://github.com/rapidsai/cudf/issues/9413 is merged into cudf. 
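As a quick orientation before the diff: a minimal sketch of the new capability,
adapted from the tests added in this patch. The header paths and test-utility
wrapper types are the ones the patch itself uses; the function name is
hypothetical and the snippet is illustrative, not part of the PR.

    #include <cudf/scalar/scalar.hpp>
    #include <cudf/search.hpp>
    #include <cudf_test/column_wrapper.hpp>

    #include <vector>

    bool structs_column_contains_example()
    {
      using int_col = cudf::test::fixed_width_column_wrapper<int32_t>;
      using str_col = cudf::test::strings_column_wrapper;

      // A structs column with three rows: {1, "x"}, {2, "y"}, {3, "z"}.
      auto child1    = int_col{1, 2, 3};
      auto child2    = str_col{"x", "y", "z"};
      auto const col = cudf::test::structs_column_wrapper{{child1, child2}};

      // A struct scalar holding the single element {2, "y"}; its children must
      // match the column's children in number and type.
      auto key1      = int_col{2};
      auto key2      = str_col{"y"};
      auto const val = cudf::struct_scalar(std::vector<cudf::column_view>{key1, key2});

      return cudf::contains(col, val);  // expected: true
    }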
Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/9929
---
 cpp/src/search/search.cu                |  66 +++++--
 cpp/tests/search/search_struct_test.cpp | 241 +++++++++++++++++++++++-
 2 files changed, 288 insertions(+), 19 deletions(-)

diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu
index 241b3c595f1..81ed3cfbd51 100644
--- a/cpp/src/search/search.cu
+++ b/cpp/src/search/search.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -173,11 +173,56 @@ bool contains_scalar_dispatch::operator()(column_view const&,
 }
 
 template <>
-bool contains_scalar_dispatch::operator()<cudf::struct_view>(column_view const&,
-                                                             scalar const&,
-                                                             rmm::cuda_stream_view)
+bool contains_scalar_dispatch::operator()<cudf::struct_view>(column_view const& col,
+                                                             scalar const& value,
+                                                             rmm::cuda_stream_view stream)
 {
-  CUDF_FAIL("struct_view type not supported yet");
+  CUDF_EXPECTS(col.type() == value.type(), "scalar and column types must match");
+
+  auto const scalar_table = static_cast<struct_scalar const*>(&value)->view();
+  CUDF_EXPECTS(col.num_children() == scalar_table.num_columns(),
+               "struct scalar and structs column must have the same number of children");
+  for (size_type i = 0; i < col.num_children(); ++i) {
+    CUDF_EXPECTS(col.child(i).type() == scalar_table.column(i).type(),
+                 "scalar and column children types must match");
+  }
+
+  // Prepare to flatten the structs column and scalar.
+  auto const has_null_elements =
+    has_nested_nulls(table_view{std::vector<column_view>{col.child_begin(), col.child_end()}}) ||
+    has_nested_nulls(scalar_table);
+  auto const flatten_nullability = has_null_elements
+                                     ? structs::detail::column_nullability::FORCE
+                                     : structs::detail::column_nullability::MATCH_INCOMING;
+
+  // Flatten the input structs column, only materialize the bitmask if there is null in the input.
+  auto const col_flattened =
+    structs::detail::flatten_nested_columns(table_view{{col}}, {}, {}, flatten_nullability);
+  auto const val_flattened =
+    structs::detail::flatten_nested_columns(scalar_table, {}, {}, flatten_nullability);
+
+  // The struct scalar only contains the struct member columns.
+  // Thus, if there is any null in the input, we must exclude the first column in the flattened
+  // table of the input column from searching because that column is the materialized bitmask of
+  // the input structs column.
+  auto const col_flattened_content  = col_flattened.flattened_columns();
+  auto const col_flattened_children = table_view{std::vector<column_view>{
+    col_flattened_content.begin() + static_cast<size_type>(has_null_elements),
+    col_flattened_content.end()}};
+
+  auto const d_col_children_ptr = table_device_view::create(col_flattened_children, stream);
+  auto const d_val_ptr          = table_device_view::create(val_flattened, stream);
+
+  auto const start_iter = thrust::make_counting_iterator<size_type>(0);
+  auto const end_iter   = start_iter + col.size();
+  auto const comp       = row_equality_comparator(
+    nullate::DYNAMIC{has_null_elements}, *d_col_children_ptr, *d_val_ptr, null_equality::EQUAL);
+  auto const found_iter = thrust::find_if(
+    rmm::exec_policy(stream), start_iter, end_iter, [comp] __device__(auto const idx) {
+      return comp(idx, 0);  // compare col[idx] == val[0].
+ }); + + return found_iter != end_iter; } template <> @@ -203,7 +248,6 @@ namespace detail { bool contains(column_view const& col, scalar const& value, rmm::cuda_stream_view stream) { if (col.is_empty()) { return false; } - if (not value.is_valid(stream)) { return col.has_nulls(); } return cudf::type_dispatcher(col.type(), contains_scalar_dispatch{}, col, value, stream); @@ -264,20 +308,14 @@ struct multi_contains_dispatch { template <> std::unique_ptr multi_contains_dispatch::operator()( - column_view const& haystack, - column_view const& needles, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + column_view const&, column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) { CUDF_FAIL("list_view type not supported"); } template <> std::unique_ptr multi_contains_dispatch::operator()( - column_view const& haystack, - column_view const& needles, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + column_view const&, column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) { CUDF_FAIL("struct_view type not supported"); } diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index db2ecb89d6a..a1f0b1d81cf 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -35,15 +36,14 @@ constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_leve constexpr int32_t null{0}; // Mark for null child elements constexpr int32_t XXX{0}; // Mark for null struct elements -template -struct TypedStructSearchTest : public cudf::test::BaseFixture { -}; - using TestTypes = cudf::test::Concat; +template +struct TypedStructSearchTest : public cudf::test::BaseFixture { +}; TYPED_TEST_SUITE(TypedStructSearchTest, TestTypes); namespace { @@ -353,3 +353,234 @@ TYPED_TEST(TypedStructSearchTest, ComplexStructTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), verbosity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), verbosity); } + +template +struct TypedScalarStructContainTest : public cudf::test::BaseFixture { +}; +TYPED_TEST_SUITE(TypedScalarStructContainTest, TestTypes); + +TYPED_TEST(TypedScalarStructContainTest, EmptyInputTest) +{ + using col_wrapper = cudf::test::fixed_width_column_wrapper; + + auto const col = [] { + auto child = col_wrapper{}; + return structs_col{{child}}; + }(); + + auto const val = [] { + auto child = col_wrapper{1}; + return cudf::struct_scalar(std::vector{child}); + }(); + + EXPECT_EQ(false, cudf::contains(col, val)); +} + +TYPED_TEST(TypedScalarStructContainTest, TrivialInputTests) +{ + using col_wrapper = cudf::test::fixed_width_column_wrapper; + + auto const col = [] { + auto child1 = col_wrapper{1, 2, 3}; + auto child2 = col_wrapper{4, 5, 6}; + auto child3 = strings_col{"x", "y", "z"}; + return structs_col{{child1, child2, child3}}; + }(); + + auto const val1 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{"x"}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + auto const val2 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = 
+    return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+  }();
+
+  EXPECT_EQ(true, cudf::contains(col, val1));
+  EXPECT_EQ(false, cudf::contains(col, val2));
+}
+
+TYPED_TEST(TypedScalarStructContainTest, SlicedColumnInputTests)
+{
+  using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+
+  constexpr int32_t dont_care{0};
+
+  auto const col_original = [] {
+    auto child1 = col_wrapper{dont_care, dont_care, 1, 2, 3, dont_care};
+    auto child2 = col_wrapper{dont_care, dont_care, 4, 5, 6, dont_care};
+    auto child3 = strings_col{"dont_care", "dont_care", "x", "y", "z", "dont_care"};
+    return structs_col{{child1, child2, child3}};
+  }();
+  auto const col = cudf::slice(col_original, {2, 5})[0];
+
+  auto const val1 = [] {
+    auto child1 = col_wrapper{1};
+    auto child2 = col_wrapper{4};
+    auto child3 = strings_col{"x"};
+    return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+  }();
+  auto const val2 = [] {
+    auto child1 = col_wrapper{dont_care};
+    auto child2 = col_wrapper{dont_care};
+    auto child3 = strings_col{"dont_care"};
+    return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+  }();
+
+  EXPECT_EQ(true, cudf::contains(col, val1));
+  EXPECT_EQ(false, cudf::contains(col, val2));
+}
+
+TYPED_TEST(TypedScalarStructContainTest, SimpleInputWithNullsTests)
+{
+  using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+
+  constexpr int32_t null{0};
+
+  // Test with nulls at the top level.
+  {
+    auto const col = [] {
+      auto child1 = col_wrapper{1, null, 3};
+      auto child2 = col_wrapper{4, null, 6};
+      auto child3 = strings_col{"x", "" /*NULL*/, "z"};
+      return structs_col{{child1, child2, child3}, null_at(1)};
+    }();
+
+    auto const val1 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{"x"};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+    auto const val2 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{"a"};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+
+    EXPECT_EQ(true, cudf::contains(col, val1));
+    EXPECT_EQ(false, cudf::contains(col, val2));
+  }
+
+  // Test with nulls at the children level.
+  {
+    auto const col = [] {
+      auto child1 = col_wrapper{{1, null, 3}, null_at(1)};
+      auto child2 = col_wrapper{{4, null, 6}, null_at(1)};
+      auto child3 = strings_col{{"" /*NULL*/, "y", "z"}, null_at(0)};
+      return structs_col{{child1, child2, child3}};
+    }();
+
+    auto const val1 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{{"" /*NULL*/}, null_at(0)};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+    auto const val2 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{""};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+
+    EXPECT_EQ(true, cudf::contains(col, val1));
+    EXPECT_EQ(false, cudf::contains(col, val2));
+  }
+
+  // Test with nulls in the input scalar.
+  {
+    auto const col = [] {
+      auto child1 = col_wrapper{1, 2, 3};
+      auto child2 = col_wrapper{4, 5, 6};
+      auto child3 = strings_col{"x", "y", "z"};
+      return structs_col{{child1, child2, child3}};
+    }();
+
+    auto const val1 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{"x"};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+    auto const val2 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{{"" /*NULL*/}, null_at(0)};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+
+    EXPECT_EQ(true, cudf::contains(col, val1));
+    EXPECT_EQ(false, cudf::contains(col, val2));
+  }
+}
+
+TYPED_TEST(TypedScalarStructContainTest, SlicedInputWithNullsTests)
+{
+  using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+
+  constexpr int32_t dont_care{0};
+  constexpr int32_t null{0};
+
+  // Test with nulls at the top level.
+  {
+    auto const col_original = [] {
+      auto child1 = col_wrapper{dont_care, dont_care, 1, null, 3, dont_care};
+      auto child2 = col_wrapper{dont_care, dont_care, 4, null, 6, dont_care};
+      auto child3 = strings_col{"dont_care", "dont_care", "x", "" /*NULL*/, "z", "dont_care"};
+      return structs_col{{child1, child2, child3}, null_at(3)};
+    }();
+    auto const col = cudf::slice(col_original, {2, 5})[0];
+
+    auto const val1 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{"x"};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+    auto const val2 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{"a"};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+
+    EXPECT_EQ(true, cudf::contains(col, val1));
+    EXPECT_EQ(false, cudf::contains(col, val2));
+  }
+
+  // Test with nulls at the children level.
+  {
+    auto const col_original = [] {
+      auto child1 =
+        col_wrapper{{dont_care, dont_care /*also NULL*/, 1, null, 3, dont_care}, null_at(3)};
+      auto child2 =
+        col_wrapper{{dont_care, dont_care /*also NULL*/, 4, null, 6, dont_care}, null_at(3)};
+      auto child3 = strings_col{
+        {"dont_care", "dont_care" /*also NULL*/, "" /*NULL*/, "y", "z", "dont_care"}, null_at(2)};
+      return structs_col{{child1, child2, child3}, null_at(1)};
+    }();
+    auto const col = cudf::slice(col_original, {2, 5})[0];
+
+    auto const val1 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{{"x"}, null_at(0)};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+    auto const val2 = [] {
+      auto child1 = col_wrapper{dont_care};
+      auto child2 = col_wrapper{dont_care};
+      auto child3 = strings_col{"dont_care"};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+
+    EXPECT_EQ(true, cudf::contains(col, val1));
+    EXPECT_EQ(false, cudf::contains(col, val2));
+  }
+}

From e4a16ae2550f5a7481887b28c7a60fc14fea2f5c Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Tue, 18 Jan 2022 06:31:14 -0800
Subject: [PATCH 23/23] Implement mixed equality/conditional joins (#9917)

This PR implements mixed equality/inequality joins for inner, left, and full
joins. This resolves #9696 and contributes to #5401. For the moment, all APIs
are functional only, but an object-oriented API is planned to support caching
of the hash table.
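Before the header changes below, a minimal usage sketch of the new functional
API. The tables, the column index 0, the GREATER operator, and the function
name are arbitrary illustrative choices, not part of the PR; the AST types are
the same ones the existing conditional joins use.

    #include <cudf/ast/expressions.hpp>
    #include <cudf/join.hpp>
    #include <cudf/table/table_view.hpp>
    #include <cudf/types.hpp>

    #include <rmm/device_uvector.hpp>

    #include <memory>
    #include <utility>

    // Joins rows where the (single-column) equality keys match AND
    // left_conditional.col0 > right_conditional.col0 holds. The equality and
    // conditional tables on each side must have the same number of rows.
    std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
              std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
    example_mixed_join(cudf::table_view const& left_equality,
                       cudf::table_view const& right_equality,
                       cudf::table_view const& left_conditional,
                       cudf::table_view const& right_conditional)
    {
      auto const left_ref  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
      auto const right_ref = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
      auto const predicate =
        cudf::ast::operation(cudf::ast::ast_operator::GREATER, left_ref, right_ref);

      return cudf::mixed_inner_join(
        left_equality, right_equality, left_conditional, right_conditional, predicate);
    }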
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Yunsong Wang (https://github.com/PointKernel) - Jason Lowe (https://github.com/jlowe) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/9917 --- cpp/CMakeLists.txt | 1 + .../cudf/ast/detail/expression_evaluator.cuh | 4 +- cpp/include/cudf/join.hpp | 267 +++++++- cpp/include/cudf/table/row_operators.cuh | 7 +- cpp/src/join/conditional_join.cu | 21 +- cpp/src/join/conditional_join_kernels.cuh | 6 +- cpp/src/join/hash_join.cu | 43 -- cpp/src/join/hash_join.cuh | 44 +- cpp/src/join/join_common_utils.cuh | 11 + cpp/src/join/join_common_utils.hpp | 9 + cpp/src/join/mixed_join.cu | 557 +++++++++++++++ cpp/src/join/mixed_join_kernels.cuh | 322 +++++++++ cpp/tests/CMakeLists.txt | 2 +- cpp/tests/join/conditional_join_tests.cu | 13 +- cpp/tests/join/mixed_join_tests.cu | 643 ++++++++++++++++++ 15 files changed, 1884 insertions(+), 66 deletions(-) create mode 100644 cpp/src/join/mixed_join.cu create mode 100644 cpp/src/join/mixed_join_kernels.cuh create mode 100644 cpp/tests/join/mixed_join_tests.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a8100fb3f92..2f51f582e12 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -322,6 +322,7 @@ add_library( src/jit/parser.cpp src/jit/type.cpp src/join/conditional_join.cu + src/join/mixed_join.cu src/join/cross_join.cu src/join/hash_join.cu src/join/join.cu diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh index 0b739482c4d..ecd46ec2c23 100644 --- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -429,7 +429,7 @@ struct expression_evaluator { __device__ __forceinline__ void evaluate( expression_result& output_object, cudf::size_type const row_index, - IntermediateDataType* thread_intermediate_storage) + IntermediateDataType* thread_intermediate_storage) const { evaluate(output_object, row_index, row_index, row_index, thread_intermediate_storage); } @@ -452,7 +452,7 @@ struct expression_evaluator { cudf::size_type const left_row_index, cudf::size_type const right_row_index, cudf::size_type const output_row_index, - IntermediateDataType* thread_intermediate_storage) + IntermediateDataType* thread_intermediate_storage) const { cudf::size_type operator_source_index{0}; for (cudf::size_type operator_index = 0; operator_index < plan.operators.size(); diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 8ea6bd1a6cc..30400074c50 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -701,7 +702,7 @@ conditional_inner_join( * The first returned vector contains all the row indices from the left * table (in unspecified order). The corresponding value in the * second returned vector is either (1) the row index of the matched row - * from the right table, if there is a match or (2) an unspecified + * from the right table, if there is a match or (2) an unspecified * out-of-bounds value. 
* * If the provided predicate returns NULL for a pair of rows @@ -858,6 +859,270 @@ std::unique_ptr> conditional_left_anti_join( std::optional output_size = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a pair of row index vectors corresponding to all pairs of + * rows between the specified tables where the columns of the equality table + * are equal and the predicate evaluates to true on the conditional tables. + * + * The first returned vector contains the row indices from the left + * table that have a match in the right table (in unspecified order). + * The corresponding values in the second returned vector are + * the matched row indices from the right table. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. It is the user's + * responsiblity to choose a suitable compare_nulls value AND use appropriate + * null-safe operators in the expression. + * + * If the provided output size or per-row counts are incorrect, behavior is undefined. + * + * @code{.pseudo} + * left_equality: {{0, 1, 2}} + * right_equality: {{1, 2, 3}} + * left_conditional: {{4, 4, 4}} + * right_conditional: {{3, 4, 5}} + * Expression: Left.Column_0 > Right.Column_0 + * Result: {{1}, {0}} + * @endcode + * + * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not + * match. + * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not + * match. + * + * @param left_equality The left table used for the equality join. + * @param right_equality The right table used for the equality join. + * @param left_conditional The left table used for the conditional join. + * @param right_conditional The right table used for the conditional join. + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether or not null values join to each other or not. + * @param output_size_data An optional pair of values indicating the exact output size and the + * number of matches for each row in the larger of the two input tables, left or right (may be + * precomputed using the corresponding mixed_inner_join_size API). + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a mixed inner join between the four input tables. + */ +std::pair>, + std::unique_ptr>> +mixed_inner_join( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional>> output_size_data = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a pair of row index vectors corresponding to all pairs of + * rows between the specified tables where the columns of the equality table + * are equal and the predicate evaluates to true on the conditional tables, + * or null matches for rows in left that have no match in right. + * + * The first returned vector contains the row indices from the left + * tables that have a match in the right tables (in unspecified order). 
+ * The corresponding value in the second returned vector is either (1)
+ * the row index of the matched row from the right tables, or (2) an
+ * unspecified out-of-bounds value.
+ *
+ * If the provided predicate returns NULL for a pair of rows
+ * (left, right), that pair is not included in the output. It is the user's
+ * responsibility to choose a suitable compare_nulls value AND use appropriate
+ * null-safe operators in the expression.
+ *
+ * If the provided output size or per-row counts are incorrect, behavior is undefined.
+ *
+ * @code{.pseudo}
+ * left_equality: {{0, 1, 2}}
+ * right_equality: {{1, 2, 3}}
+ * left_conditional: {{4, 4, 4}}
+ * right_conditional: {{3, 4, 5}}
+ * Expression: Left.Column_0 > Right.Column_0
+ * Result: {{0, 1, 2}, {None, 0, None}}
+ * @endcode
+ *
+ * @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
+ * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
+ * match.
+ * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
+ * match.
+ *
+ * @param left_equality The left table used for the equality join.
+ * @param right_equality The right table used for the equality join.
+ * @param left_conditional The left table used for the conditional join.
+ * @param right_conditional The right table used for the conditional join.
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether or not null values join to each other.
+ * @param output_size_data An optional pair of values indicating the exact output size and the
+ * number of matches for each row in the larger of the two input tables, left or right (may be
+ * precomputed using the corresponding mixed_left_join_size API).
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+ * the result of performing a mixed left join between the four input tables.
+ */
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+mixed_left_join(
+  table_view const& left_equality,
+  table_view const& right_equality,
+  table_view const& left_conditional,
+  table_view const& right_conditional,
+  ast::expression const& binary_predicate,
+  null_equality compare_nulls = null_equality::EQUAL,
+  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns a pair of row index vectors corresponding to all pairs of
+ * rows between the specified tables where the columns of the equality table
+ * are equal and the predicate evaluates to true on the conditional tables,
+ * or null matches for rows in either pair of tables that have no matches in
+ * the other pair.
+ *
+ * Taken pairwise, the values from the returned vectors are one of:
+ * (1) row indices corresponding to matching rows from the left and
+ * right tables, (2) a row index and an unspecified out-of-bounds value,
+ * representing a row from one table without a match in the other.
+ *
+ * If the provided predicate returns NULL for a pair of rows
+ * (left, right), that pair is not included in the output. It is the user's
+ * responsibility to choose a suitable compare_nulls value AND use appropriate
+ * null-safe operators in the expression.
+ *
+ * If the provided output size or per-row counts are incorrect, behavior is undefined.
+ *
+ * @code{.pseudo}
+ * left_equality: {{0, 1, 2}}
+ * right_equality: {{1, 2, 3}}
+ * left_conditional: {{4, 4, 4}}
+ * right_conditional: {{3, 4, 5}}
+ * Expression: Left.Column_0 > Right.Column_0
+ * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}}
+ * @endcode
+ *
+ * @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
+ * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
+ * match.
+ * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
+ * match.
+ *
+ * @param left_equality The left table used for the equality join.
+ * @param right_equality The right table used for the equality join.
+ * @param left_conditional The left table used for the conditional join.
+ * @param right_conditional The right table used for the conditional join.
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether or not null values join to each other.
+ * @param output_size_data An optional pair of values indicating the exact output size and the
+ * number of matches for each row in the larger of the two input tables, left or right (may be
+ * precomputed using the corresponding mixed_full_join_size API).
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+ * the result of performing a mixed full join between the four input tables.
+ */
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+mixed_full_join(
+  table_view const& left_equality,
+  table_view const& right_equality,
+  table_view const& left_conditional,
+  table_view const& right_conditional,
+  ast::expression const& binary_predicate,
+  null_equality compare_nulls = null_equality::EQUAL,
+  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns the exact number of matches (rows) when performing a
+ * mixed inner join between the specified tables where the columns of the
+ * equality table are equal and the predicate evaluates to true on the
+ * conditional tables.
+ *
+ * If the provided predicate returns NULL for a pair of rows (left, right),
+ * that pair is not included in the output. It is the user's responsibility to
+ * choose a suitable compare_nulls value AND use appropriate null-safe
+ * operators in the expression.
+ *
+ * @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
+ * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
+ * match.
+ * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
+ * match.
+ *
+ * @param left_equality The left table used for the equality join.
+ * @param right_equality The right table used for the equality join.
+ * @param left_conditional The left table used for the conditional join.
+ * @param right_conditional The right table used for the conditional join.
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether or not null values join to each other.
+ * @param output_size An optional pair of values indicating the exact output size and the number of
+ * matches for each row in the larger of the two input tables, left or right (may be precomputed
+ * using the corresponding mixed_inner_join_size API).
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair containing the size that would result from performing the
+ * requested join and the number of matches for each row in one of the two
+ * tables. Which of the two tables is an implementation detail and should not
+ * be relied upon; simply pass it to the corresponding `mixed_inner_join` API
+ * as is.
+ */
+std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_inner_join_size(
+  table_view const& left_equality,
+  table_view const& right_equality,
+  table_view const& left_conditional,
+  table_view const& right_conditional,
+  ast::expression const& binary_predicate,
+  null_equality compare_nulls = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns the exact number of matches (rows) when performing a
+ * mixed left join between the specified tables where the columns of the
+ * equality table are equal and the predicate evaluates to true on the
+ * conditional tables.
+ *
+ * If the provided predicate returns NULL for a pair of rows (left, right),
+ * that pair is not included in the output. It is the user's responsibility to
+ * choose a suitable compare_nulls value AND use appropriate null-safe
+ * operators in the expression.
+ *
+ * @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
+ * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
+ * match.
+ * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
+ * match.
+ *
+ * @param left_equality The left table used for the equality join.
+ * @param right_equality The right table used for the equality join.
+ * @param left_conditional The left table used for the conditional join.
+ * @param right_conditional The right table used for the conditional join.
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether or not null values join to each other.
+ * @param output_size An optional pair of values indicating the exact output size and the number of
+ * matches for each row in the larger of the two input tables, left or right (may be precomputed
+ * using the corresponding mixed_inner_join_size API).
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair containing the size that would result from performing the
+ * requested join and the number of matches for each row in one of the two
+ * tables. Which of the two tables is an implementation detail and should not
+ * be relied upon; simply pass it to the corresponding `mixed_left_join` API
+ * as is.
+ */
+std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_join_size(
+  table_view const& left_equality,
+  table_view const& right_equality,
+  table_view const& left_conditional,
+  table_view const& right_conditional,
+  ast::expression const& binary_predicate,
+  null_equality compare_nulls = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Returns the exact number of matches (rows) when performing a
  * conditional inner join between the specified tables where the predicate
diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index 32ddd1ef49a..a3b08fda15d 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -492,8 +492,11 @@ template