From e37bddb84be7a3a6915511702739f5b7e87386d7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 10 Mar 2023 10:24:04 -0500 Subject: [PATCH 1/4] Remove return type from @return doxygen tags (#12908) The doxygen `@return` does not require a type but only the description. Moreover, the doxygen output looks cleaner without the type. There were only a few places in the public header files that needed to be corrected. This is technically a documentation-only change. Found when working on #12904 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12908 --- cpp/include/cudf/column/column.hpp | 14 +++++----- .../cudf/column/column_device_view.cuh | 18 ++++++------- cpp/include/cudf/column/column_view.hpp | 26 +++++++++---------- cpp/include/cudf/concatenate.hpp | 3 +-- cpp/include/cudf/copying.hpp | 16 ++++++------ cpp/include/cudf/io/data_sink.hpp | 6 ++--- cpp/include/cudf/io/datasource.hpp | 4 +-- cpp/include/cudf/lists/contains.hpp | 10 +++---- cpp/include/cudf/lists/lists_column_view.hpp | 6 ++--- cpp/include/cudf/null_mask.hpp | 14 +++++----- cpp/include/cudf/round.hpp | 4 +-- cpp/include/cudf/sorting.hpp | 9 +++---- .../cudf/table/experimental/row_operators.cuh | 10 +++---- cpp/include/cudf/transform.hpp | 4 +-- cpp/include/cudf/types.hpp | 4 +-- 15 files changed, 72 insertions(+), 76 deletions(-) diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index c02991051d9..178fc92b399 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -259,7 +259,7 @@ class column { * @brief Returns a reference to the specified child * * @param child_index Index of the desired child - * @return column& Reference to the desired child + * @return Reference to the desired child */ column& child(size_type child_index) noexcept { return *_children[child_index]; }; @@ -267,7 +267,7 @@ class column { * @brief Returns a const reference to the specified child * * @param child_index Index of the desired child - * @return column const& Const reference to the desired child + * @return Const reference to the desired child */ [[nodiscard]] column const& child(size_type child_index) const noexcept { @@ -306,7 +306,7 @@ class column { * @brief Creates an immutable, non-owning view of the column's data and * children. * - * @return column_view The immutable, non-owning view + * @return The immutable, non-owning view */ [[nodiscard]] column_view view() const; @@ -316,7 +316,7 @@ class column { * This allows passing a `column` object directly into a function that * requires a `column_view`. The conversion is automatic. * - * @return column_view Immutable, non-owning `column_view` + * @return Immutable, non-owning `column_view` */ operator column_view() const { return this->view(); }; @@ -330,7 +330,7 @@ class column { * if not, the null count will be recomputed on the next invocation of *`null_count()`. * - * @return mutable_column_view The mutable, non-owning view + * @return The mutable, non-owning view */ mutable_column_view mutable_view(); @@ -346,7 +346,7 @@ class column { * Otherwise, the null count will be recomputed on the next invocation of * `null_count()`. 
* - * @return mutable_column_view Mutable, non-owning `mutable_column_view` + * @return Mutable, non-owning `mutable_column_view` */ operator mutable_column_view() { return this->mutable_view(); }; diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 1361866d0aa..b3e6ad0b99f 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -129,7 +129,7 @@ class alignas(16) column_device_view_base { * or `std::is_same_v` are true. * * @tparam The type to cast to - * @return T const* Typed pointer to underlying data + * @return Typed pointer to underlying data */ template or is_rep_layout_compatible())> @@ -151,7 +151,7 @@ class alignas(16) column_device_view_base { * false. * * @tparam T The type to cast to - * @return T const* Typed pointer to underlying data, including the offset + * @return Typed pointer to underlying data, including the offset */ template ())> [[nodiscard]] CUDF_HOST_DEVICE T const* data() const noexcept @@ -990,7 +990,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * `data()`. 
* * @tparam The type to cast to - * @return T* Typed pointer to underlying data + * @return Typed pointer to underlying data */ template or is_rep_layout_compatible())> @@ -1009,7 +1009,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @note If `offset() == 0`, then `head() == data()` * * @tparam T The type to cast to - * @return T* Typed pointer to underlying data, including the offset + * @return Typed pointer to underlying data, including the offset */ template ())> CUDF_HOST_DEVICE T* data() const noexcept @@ -1078,7 +1078,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * `mutable_column_device_view::has_element_accessor()` is false. * * @tparam T The desired type - * @return T* Pointer to the first element after casting + * @return Pointer to the first element after casting */ template ())> iterator begin() @@ -1094,7 +1094,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * `mutable_column_device_view::has_element_accessor()` is false. * * @tparam T The desired type - * @return T const* Pointer to one past the last element after casting + * @return Pointer to one past the last element after casting */ template ())> iterator end() @@ -1106,7 +1106,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @brief Returns the specified child * * @param child_index The index of the desired child - * @return column_view The requested child `column_view` + * @return The requested child `column_view` */ [[nodiscard]] __device__ mutable_column_device_view child(size_type child_index) const noexcept { @@ -1173,7 +1173,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * device view of the specified column and it's children. * * @param source_view The `column_view` to use for this calculation. 
- * @return size_t The size in bytes of the amount of memory needed to hold a + * @return The size in bytes of the amount of memory needed to hold a * device view of the specified column and it's children */ static std::size_t extent(mutable_column_view source_view); diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 217f88e67f9..4889a62bbe4 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,7 +66,7 @@ class column_view_base { * or `std::is_same_v` are true. * * @tparam The type to cast to - * @return T const* Typed pointer to underlying data + * @return Typed pointer to underlying data */ template or is_rep_layout_compatible())> @@ -85,7 +85,7 @@ class column_view_base { * false. * * @tparam T The type to cast to - * @return T const* Typed pointer to underlying data, including the offset + * @return Typed pointer to underlying data, including the offset */ template ())> T const* data() const noexcept @@ -101,7 +101,7 @@ class column_view_base { * false. * * @tparam T The desired type - * @return T const* Pointer to the first element after casting + * @return Pointer to the first element after casting */ template ())> T const* begin() const noexcept @@ -117,7 +117,7 @@ class column_view_base { * false. 
* * @tparam T The desired type - * @return T const* Pointer to one past the last element after casting + * @return Pointer to one past the last element after casting */ template ())> T const* end() const noexcept @@ -389,7 +389,7 @@ class column_view : public detail::column_view_base { * @brief Returns the specified child * * @param child_index The index of the desired child - * @return column_view The requested child `column_view` + * @return The requested child `column_view` */ [[nodiscard]] column_view child(size_type child_index) const noexcept { @@ -553,7 +553,7 @@ class mutable_column_view : public detail::column_view_base { * column, and instead, accessing the elements should be done via `data()`. * * @tparam The type to cast to - * @return T* Typed pointer to underlying data + * @return Typed pointer to underlying data */ template or is_rep_layout_compatible())> @@ -572,7 +572,7 @@ class mutable_column_view : public detail::column_view_base { * @note If `offset() == 0`, then `head() == data()` * * @tparam T The type to cast to - * @return T* Typed pointer to underlying data, including the offset + * @return Typed pointer to underlying data, including the offset */ template ())> T* data() const noexcept @@ -588,7 +588,7 @@ class mutable_column_view : public detail::column_view_base { * false. * * @tparam T The desired type - * @return T* Pointer to the first element after casting + * @return Pointer to the first element after casting */ template ())> T* begin() const noexcept @@ -604,7 +604,7 @@ class mutable_column_view : public detail::column_view_base { * false. 
* * @tparam T The desired type - * @return T* Pointer to one past the last element after casting + * @return Pointer to one past the last element after casting */ template ())> T* end() const noexcept @@ -639,7 +639,7 @@ class mutable_column_view : public detail::column_view_base { * @brief Returns a reference to the specified child * * @param child_index The index of the desired child - * @return mutable_column_view The requested child `mutable_column_view` + * @return The requested child `mutable_column_view` */ [[nodiscard]] mutable_column_view child(size_type child_index) const noexcept { @@ -670,7 +670,7 @@ class mutable_column_view : public detail::column_view_base { /** * @brief Converts a mutable view into an immutable view * - * @return column_view An immutable view of the mutable view's elements + * @return An immutable view of the mutable view's elements */ operator column_view() const; @@ -684,7 +684,7 @@ class mutable_column_view : public detail::column_view_base { * @brief Counts the number of descendants of the specified parent. 
* * @param parent The parent whose descendants will be counted - * @return size_type The number of descendants of the parent + * @return The number of descendants of the parent */ size_type count_descendants(column_view parent); diff --git a/cpp/include/cudf/concatenate.hpp b/cpp/include/cudf/concatenate.hpp index b20c97b3c31..2b4eee607e2 100644 --- a/cpp/include/cudf/concatenate.hpp +++ b/cpp/include/cudf/concatenate.hpp @@ -40,8 +40,7 @@ namespace cudf { * * @param views host_span of column views whose bitmasks will be concatenated * @param mr Device memory resource used for allocating the new device_buffer - * @return rmm::device_buffer A `device_buffer` containing the bitmasks of all - * the column views in the views vector + * @return A `device_buffer` containing the bitmasks of all the column views in the views vector */ rmm::device_buffer concatenate_masks( host_span views, diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 63c66335d2d..d5a3c930853 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,7 +78,7 @@ enum class out_of_bounds_policy : bool { * better performance. If `policy` is set to `DONT_CHECK` and there are out-of-bounds indices * in the gather map, the behavior is undefined. Defaults to `DONT_CHECK`. * @param[in] mr Device memory resource used to allocate the returned table's device memory - * @return std::unique_ptr Result of the gather + * @return Result of the gather */ std::unique_ptr
gather( table_view const& source_table, @@ -211,7 +211,7 @@ enum class mask_allocation_policy { * @brief Initializes and returns an empty column of the same type as the `input`. * * @param[in] input Immutable view of input column to emulate - * @return std::unique_ptr An empty column of same type as `input` + * @return An empty column of same type as `input` */ std::unique_ptr empty_like(column_view const& input); @@ -219,7 +219,7 @@ std::unique_ptr empty_like(column_view const& input); * @brief Initializes and returns an empty column of the same type as the `input`. * * @param[in] input Scalar to emulate - * @return std::unique_ptr An empty column of same type as `input` + * @return An empty column of same type as `input` */ std::unique_ptr empty_like(scalar const& input); @@ -264,7 +264,7 @@ std::unique_ptr allocate_like( * memory for the column's data or bitmask. * * @param[in] input_table Immutable view of input table to emulate - * @return std::unique_ptr
A table of empty columns with the same types as the columns in + * @return A table of empty columns with the same types as the columns in * `input_table` */ std::unique_ptr
empty_like(table_view const& input_table); @@ -333,7 +333,7 @@ void copy_range_in_place(column_view const& source, * (exclusive) * @param target_begin The starting index of the target range (inclusive) * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr The result target column + * @return The result target column */ std::unique_ptr copy_range( column_view const& source, @@ -920,7 +920,7 @@ std::unique_ptr
boolean_mask_scatter( * @param input Column view to get the element from * @param index Index into `input` to get the element at * @param mr Device memory resource used to allocate the returned scalar's device memory - * @return std::unique_ptr Scalar containing the single value + * @return Scalar containing the single value */ std::unique_ptr get_element( column_view const& input, @@ -960,7 +960,7 @@ enum class sample_with_replacement : bool { * @param seed Seed value to initiate random number generator * @param mr Device memory resource used to allocate the returned table's device memory * - * @return std::unique_ptr
Table containing samples from `input` + * @return Table containing samples from `input` */ std::unique_ptr
sample( table_view const& input, diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index cf3e94029be..0be2935b84c 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -123,7 +123,7 @@ class data_sink { * instead of write() when possible. However, it is still possible to receive * write() calls as well. * - * @return bool If this writer supports device_write() calls + * @return If this writer supports device_write() calls */ [[nodiscard]] virtual bool supports_device_write() const { return false; } @@ -194,7 +194,7 @@ class data_sink { /** * @pure @brief Returns the total number of bytes written into this sink * - * @return size_t Total number of bytes written into this sink + * @return Total number of bytes written into this sink */ virtual size_t bytes_written() = 0; }; diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index a0ef2155f7d..12b8377bff2 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -296,14 +296,14 @@ class datasource { /** * @brief Returns the size of the data in the source. * - * @return size_t The size of the source data in bytes + * @return The size of the source data in bytes */ [[nodiscard]] virtual size_t size() const = 0; /** * @brief Returns whether the source contains any data. 
* - * @return bool True if there is data, False otherwise + * @return True if there is data, False otherwise */ [[nodiscard]] virtual bool is_empty() const { return size() == 0; } diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp index a9f06bf399c..fbe931f945d 100644 --- a/cpp/include/cudf/lists/contains.hpp +++ b/cpp/include/cudf/lists/contains.hpp @@ -43,7 +43,7 @@ namespace lists { * @param lists Lists column whose `n` rows are to be searched * @param search_key The scalar key to be looked up in each list row * @param mr Device memory resource used to allocate the returned column's device memory. - * @return std::unique_ptr BOOL8 column of `n` rows with the result of the lookup + * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, @@ -65,7 +65,7 @@ std::unique_ptr contains( * @param lists Lists column whose `n` rows are to be searched * @param search_keys Column of elements to be looked up in each list row * @param mr Device memory resource used to allocate the returned column's device memory. - * @return std::unique_ptr BOOL8 column of `n` rows with the result of the lookup + * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, @@ -86,7 +86,7 @@ std::unique_ptr contains( * * @param lists Lists column whose `n` rows are to be searched * @param mr Device memory resource used to allocate the returned column's device memory. 
- * @return std::unique_ptr BOOL8 column of `n` rows with the result of the lookup + * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains_nulls( cudf::lists_column_view const& lists, @@ -124,7 +124,7 @@ enum class duplicate_find_option : int32_t { * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) * @param mr Device memory resource used to allocate the returned column's device memory. - * @return std::unique_ptr INT32 column of `n` rows with the location of the `search_key` + * @return INT32 column of `n` rows with the location of the `search_key` * * @throw cudf::data_type_error If `search_keys` type does not match the element type in `lists` */ @@ -158,7 +158,7 @@ std::unique_ptr index_of( * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) * @param mr Device memory resource used to allocate the returned column's device memory. - * @return std::unique_ptr INT32 column of `n` rows with the location of the `search_key` + * @return INT32 column of `n` rows with the location of the `search_key` * * @throw cudf::logic_error If `search_keys` does not match `lists` in its number of rows * @throw cudf::data_type_error If `search_keys` type does not match the element type in `lists` diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index 6b74a0e600a..336214e3934 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -115,7 +115,7 @@ class lists_column_view : private column_view { /** * @brief Return first offset (accounting for column offset) * - * @return int32_t const* Pointer to the first offset + * @return Pointer to the first offset */ [[nodiscard]] offset_iterator offsets_begin() const noexcept { @@ -130,7 +130,7 @@ class lists_column_view : private column_view { * be computed using the size of the offsets() child column, which is also the offsets of the * entire original (non-sliced) lists column. * - * @return int32_t const* Pointer to one past the last offset + * @return Pointer to one past the last offset */ [[nodiscard]] offset_iterator offsets_end() const noexcept { diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index bd4ce28a2ef..360006c1eea 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,7 +38,7 @@ namespace cudf { * * @param state The state of the null mask * @param size The number of elements represented by the mask - * @return size_type The count of null elements + * @return The count of null elements */ size_type state_null_count(mask_state state, size_type size); @@ -52,7 +52,7 @@ size_type state_null_count(mask_state state, size_type size); * @param number_of_bits The number of bits that need to be represented * @param padding_boundary The value returned will be rounded up to a multiple * of this value - * @return std::size_t The necessary number of bytes + * @return The necessary number of bytes */ std::size_t bitmask_allocation_size_bytes(size_type number_of_bits, std::size_t padding_boundary = 64); @@ -68,7 +68,7 @@ std::size_t bitmask_allocation_size_bytes(size_type number_of_bits, * in a bitmask and ignore the padding/slack bits. 
* * @param number_of_bits The number of bits that need to be represented - * @return size_type The necessary number of `bitmask_type` elements + * @return The necessary number of `bitmask_type` elements */ size_type num_bitmask_words(size_type number_of_bits); @@ -79,7 +79,7 @@ size_type num_bitmask_words(size_type number_of_bits); * @param size The number of elements to be represented by the mask * @param state The desired state of the mask * @param mr Device memory resource used to allocate the returned device_buffer - * @return rmm::device_buffer A `device_buffer` for use as a null bitmask + * @return A `device_buffer` for use as a null bitmask * satisfying the desired size and state */ rmm::device_buffer create_null_mask( @@ -114,7 +114,7 @@ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit * @param begin_bit Index of the first bit to be copied (inclusive) * @param end_bit Index of the last bit to be copied (exclusive) * @param mr Device memory resource used to allocate the returned device_buffer - * @return rmm::device_buffer A `device_buffer` containing the bits + * @return A `device_buffer` containing the bits * `[begin_bit, end_bit)` from `mask`. */ rmm::device_buffer copy_bitmask( @@ -131,7 +131,7 @@ rmm::device_buffer copy_bitmask( * * @param view Column view whose bitmask needs to be copied * @param mr Device memory resource used to allocate the returned device_buffer - * @return rmm::device_buffer A `device_buffer` containing the bits + * @return A `device_buffer` containing the bits * `[view.offset(), view.offset() + view.size())` from `view`'s bitmask. */ rmm::device_buffer copy_bitmask( diff --git a/cpp/include/cudf/round.hpp b/cpp/include/cudf/round.hpp index 29e5c1ab808..030d3d42773 100644 --- a/cpp/include/cudf/round.hpp +++ b/cpp/include/cudf/round.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,7 +67,7 @@ enum class rounding_method : int32_t { HALF_UP, HALF_EVEN }; * @param method Rounding method * @param mr Device memory resource used to allocate the returned column's device memory * - * @return std::unique_ptr Column with each of the values rounded + * @return Column with each of the values rounded */ std::unique_ptr round( column_view const& input, diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index f43089210fd..922bed3b1ea 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -190,10 +190,9 @@ std::unique_ptr
stable_sort_by_key( * for column * @param percentage flag to convert ranks to percentage in range (0,1] * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr A column of containing the rank of the each - * element of the column of `input`. The output column type will be `size_type` - * column by default or else `double` when `method=rank_method::AVERAGE` or - *`percentage=True` + * @return A column of containing the rank of the each element of the column of `input`. The output + * column type will be `size_type`column by default or else `double` when + * `method=rank_method::AVERAGE` or `percentage=True` */ std::unique_ptr rank( column_view const& input, diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 2a207d2a5c4..58f20adb923 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -756,9 +756,8 @@ struct preprocessed_table { /** * @brief Get a device array containing the desired order of each column in the preprocessed table * - * @return std::optional> Device array containing respective column - * orders. If no explicit column orders were specified during the creation of this object then - * this will be `nullopt`. + * @return Device array containing respective column orders. If no explicit column orders were + * specified during the creation of this object then this will be `nullopt`. */ [[nodiscard]] std::optional> column_order() const { @@ -770,9 +769,8 @@ struct preprocessed_table { * @brief Get a device array containing the desired null precedence of each column in the * preprocessed table * - * @return std::optional> Device array containing respective column - * null precedence. If no explicit column null precedences were specified during the creation of - * this object then this will be `nullopt`. 
+ * @return Device array containing respective column null precedence. If no explicit column null + * precedences were specified during the creation of this object then this will be `nullopt`. */ [[nodiscard]] std::optional> null_precedence() const { diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index 969bec84716..412fe17ef26 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,7 +82,7 @@ std::pair, size_type> nans_to_nulls( * @param table The table used for expression evaluation * @param expr The root of the expression tree * @param mr Device memory resource - * @return std::unique_ptr Output column + * @return Output column */ std::unique_ptr compute_column( table_view const& table, diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 8a1e4c9aee7..3bc1f9d6da7 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -89,7 +89,7 @@ using thread_index_type = int64_t; ///< Thread index type in kernels * @tparam T Iterator type * @param f "first" iterator * @param l "last" iterator - * @return size_type The distance between first and last + * @return The distance between first and last */ template size_type distance(T f, T l) From 4da6b19f86b87cf8453fa4a2b54caea276d49706 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Fri, 10 Mar 2023 11:31:49 -0600 Subject: [PATCH 2/4] Fix null hive-partition behavior in dask-cudf parquet (#12866) This PR includes a few simple changes to fix the handling of null hive partitions in `dask_cudf`. ~Depends on https://github.com/dask/dask/pull/10007~ Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/12866 --- python/cudf/cudf/io/parquet.py | 52 ++++++++++++++++--- python/dask_cudf/dask_cudf/io/parquet.py | 17 +++--- .../dask_cudf/io/tests/test_parquet.py | 22 ++++++++ 3 files changed, 77 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 1329c06b18d..ca4fb103ee8 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -9,6 +9,7 @@ from typing import Dict, List, Optional, Tuple from uuid import uuid4 +import pandas as pd from pyarrow import dataset as ds, parquet as pq import cudf @@ -269,6 +270,7 @@ def _process_dataset( filters=None, row_groups=None, categorical_partitions=True, + dataset_kwargs=None, ): # Returns: # file_list - Expanded/filtered list of paths @@ -296,8 +298,13 @@ def _process_dataset( dataset = ds.dataset( source=paths[0] if len(paths) == 1 else paths, filesystem=fs, - format="parquet", - partitioning="hive", + **( + dataset_kwargs + or { + "format": "parquet", + "partitioning": "hive", + } + ), ) file_list = dataset.files @@ -423,6 +430,7 @@ def read_parquet( 
categorical_partitions=True, open_file_options=None, bytes_per_thread=None, + dataset_kwargs=None, *args, **kwargs, ): @@ -483,6 +491,7 @@ def read_parquet( filters=filters, row_groups=row_groups, categorical_partitions=categorical_partitions, + dataset_kwargs=dataset_kwargs, ) elif filters is not None: raise ValueError("cudf cannot apply filters to open file objects.") @@ -540,6 +549,7 @@ def read_parquet( use_pandas_metadata=use_pandas_metadata, partition_keys=partition_keys, partition_categories=partition_categories, + dataset_kwargs=dataset_kwargs, **kwargs, ) @@ -551,6 +561,7 @@ def _parquet_to_frame( row_groups=None, partition_keys=None, partition_categories=None, + dataset_kwargs=None, **kwargs, ): @@ -564,6 +575,13 @@ def _parquet_to_frame( **kwargs, ) + partition_meta = None + partitioning = (dataset_kwargs or {}).get("partitioning", None) + if hasattr(partitioning, "schema"): + partition_meta = cudf.DataFrame.from_arrow( + partitioning.schema.empty_table() + ) + # For partitioned data, we need a distinct read for each # unique set of partition keys. Therefore, we start by # aggregating all paths with matching keys using a dict @@ -607,7 +625,14 @@ def _parquet_to_frame( else: # Not building categorical columns, so # `value` is already what we want - dfs[-1][name] = as_column(value, length=len(dfs[-1])) + if partition_meta is not None: + dfs[-1][name] = as_column( + value, + length=len(dfs[-1]), + dtype=partition_meta[name].dtype, + ) + else: + dfs[-1][name] = as_column(value, length=len(dfs[-1])) # Concatenate dfs and return. # Assume we can ignore the index if it has no name. 
@@ -827,7 +852,10 @@ def _get_partitioned( metadata_file_paths = [] for keys in part_names.itertuples(index=False): subdir = fs.sep.join( - [f"{name}={val}" for name, val in zip(partition_cols, keys)] + [ + _hive_dirname(name, val) + for name, val in zip(partition_cols, keys) + ] ) prefix = fs.sep.join([root_path, subdir]) fs.mkdirs(prefix, exist_ok=True) @@ -848,16 +876,17 @@ def _get_groups_and_offsets( ): if not (set(df._data) - set(partition_cols)): - raise ValueError("No data left to save outside partition columns") + warnings.warn("No data left to save outside partition columns") - part_names, part_offsets, _, grouped_df = df.groupby( - partition_cols + _, part_offsets, part_keys, grouped_df = df.groupby( + partition_cols, + dropna=False, )._grouped() if not preserve_index: grouped_df.reset_index(drop=True, inplace=True) grouped_df.drop(columns=partition_cols, inplace=True) # Copy the entire keys df in one operation rather than using iloc - part_names = part_names.to_pandas().to_frame(index=False) + part_names = part_keys.to_pandas().unique().to_frame(index=False) return part_names, grouped_df, part_offsets @@ -1251,3 +1280,10 @@ def _default_open_file_options( ) open_file_options["precache_options"] = precache_options return open_file_options + + +def _hive_dirname(name, val): + # Simple utility to produce hive directory name + if pd.isna(val): + val = "__HIVE_DEFAULT_PARTITION__" + return f"{name}={val}" diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 962662061b5..452f2f8914a 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -71,6 +71,7 @@ def _read_paths( partitioning=None, partition_keys=None, open_file_options=None, + dataset_kwargs=None, **kwargs, ): @@ -78,6 +79,8 @@ def _read_paths( if row_groups == [None for path in paths]: row_groups = None + dataset_kwargs = dataset_kwargs or {} + dataset_kwargs["partitioning"] = partitioning or "hive" with 
ExitStack() as stack: # Non-local filesystem handling @@ -100,6 +103,8 @@ def _read_paths( columns=columns, row_groups=row_groups if row_groups else None, strings_to_categorical=strings_to_categorical, + dataset_kwargs=dataset_kwargs, + categorical_partitions=False, **kwargs, ) except RuntimeError as err: @@ -127,15 +132,10 @@ def _read_paths( if partitions and partition_keys is None: # Use `HivePartitioning` by default - partitioning = partitioning or {"obj": pa_ds.HivePartitioning} ds = pa_ds.dataset( paths, filesystem=fs, - format="parquet", - partitioning=partitioning["obj"].discover( - *partitioning.get("args", []), - **partitioning.get("kwargs", {}), - ), + **dataset_kwargs, ) frag = next(ds.get_fragments()) if frag: @@ -189,6 +189,9 @@ def read_partition( if isinstance(index, list): columns += index + dataset_kwargs = kwargs.get("dataset", {}) + partitioning = partitioning or dataset_kwargs.get("partitioning", None) + # Check if we are actually selecting any columns read_columns = columns if schema and columns: @@ -249,6 +252,7 @@ def read_partition( partitions=partitions, partitioning=partitioning, partition_keys=last_partition_keys, + dataset_kwargs=dataset_kwargs, **read_kwargs, ) ) @@ -274,6 +278,7 @@ def read_partition( partitions=partitions, partitioning=partitioning, partition_keys=last_partition_keys, + dataset_kwargs=dataset_kwargs, **read_kwargs, ) ) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 355a1a5d73a..8fb6e591660 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -504,6 +504,28 @@ def test_check_file_size(tmpdir): dask_cudf.read_parquet(fn, check_file_size=1).compute() +def test_null_partition(tmpdir): + import pyarrow as pa + from pyarrow.dataset import HivePartitioning + + df = pd.DataFrame({"id": [0, 1, None], "x": [1, 2, 3]}) + ddf = dd.from_pandas(df, npartitions=1).to_backend("cudf") 
+ ddf.to_parquet(str(tmpdir), partition_on="id") + fns = glob.glob(os.path.join(tmpdir, "id" + "=*/*.parquet")) + assert len(fns) == 3 + + partitioning = HivePartitioning(pa.schema([("id", pa.float64())])) + ddf_read = dask_cudf.read_parquet( + str(tmpdir), + dataset={"partitioning": partitioning}, + ) + dd.assert_eq( + ddf[["x", "id"]], + ddf_read[["x", "id"]], + check_divisions=False, + ) + + def test_nullable_schema_mismatch(tmpdir): # See: https://github.com/rapidsai/cudf/issues/12702 path0 = str(tmpdir.join("test.0.parquet")) From e591f6826877114987e7992bd7e955a579649a59 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 10 Mar 2023 13:45:16 -0500 Subject: [PATCH 3/4] Fix libcudf gtests to pass null-count=0 for empty validity masks (#12923) Removing some unneeded usages of `cudf::UNKNOWN_NULL_COUNT` specifically in libcudf gtests source files. These were found when working on other PRs. Reducing the usage like this hopefully will move us closer to not using it at all. There are few places in the gtest source files where we still use it even though the number of nulls is technically known but these may be fixed by adding and using some new test utilities in a future PR. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/12923 --- cpp/benchmarks/common/generate_input.cu | 2 +- cpp/tests/copying/gather_struct_tests.cpp | 20 ++++------- cpp/tests/io/parquet_test.cpp | 35 ++++++------------- cpp/tests/lists/explode_tests.cpp | 10 +++--- cpp/tests/reductions/tdigest_tests.cu | 13 ++++--- cpp/tests/sort/sort_test.cpp | 12 +++---- .../apply_boolean_mask_tests.cpp | 6 ++-- cpp/tests/structs/structs_column_tests.cpp | 22 ++++-------- cpp/tests/transform/row_bit_count_test.cu | 8 ++--- .../lists_column_wrapper_tests.cpp | 25 +++++-------- 10 files changed, 56 insertions(+), 97 deletions(-) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 2829d14070c..edb19b7b0ca 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -716,7 +716,7 @@ std::unique_ptr create_random_column(data_profile num_rows, std::move(offsets_column), std::move(current_child_column), - profile.get_null_probability().has_value() ? null_count : 0, // cudf::UNKNOWN_NULL_COUNT, + profile.get_null_probability().has_value() ? null_count : 0, profile.get_null_probability().has_value() ? std::move(null_mask) : rmm::device_buffer{}); } return list_column; // return the top-level column diff --git a/cpp/tests/copying/gather_struct_tests.cpp b/cpp/tests/copying/gather_struct_tests.cpp index cc84d558d44..f6fc640c957 100644 --- a/cpp/tests/copying/gather_struct_tests.cpp +++ b/cpp/tests/copying/gather_struct_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -388,13 +388,9 @@ TYPED_TEST(TypedStructGatherTest, TestGatherStructOfListOfStructs) auto const struct_of_list_of_structs = [&] { auto numeric_column = numerics{{5, 10, 15, 20, 25, 30, 35, 45, 50, 55, 60, 65, 70, 75}}; - auto structs_column = structs{{numeric_column}}.release(); - auto list_of_structs_column = - cudf::make_lists_column(7, - offsets{0, 2, 4, 6, 8, 10, 12, 14}.release(), - std::move(structs_column), - cudf::UNKNOWN_NULL_COUNT, - {}); + auto structs_column = structs{{numeric_column}}.release(); + auto list_of_structs_column = cudf::make_lists_column( + 7, offsets{0, 2, 4, 6, 8, 10, 12, 14}.release(), std::move(structs_column), 0, {}); std::vector> vector_of_columns; vector_of_columns.push_back(std::move(list_of_structs_column)); @@ -410,12 +406,8 @@ TYPED_TEST(TypedStructGatherTest, TestGatherStructOfListOfStructs) auto expected_gather_result = [&] { auto expected_numeric_col = numerics{{70, 75, 50, 55, 35, 45, 25, 30, 15, 20}}; auto expected_struct_col = structs{{expected_numeric_col}}.release(); - auto expected_list_of_structs_column = - cudf::make_lists_column(5, - offsets{0, 2, 4, 6, 8, 10}.release(), - std::move(expected_struct_col), - cudf::UNKNOWN_NULL_COUNT, - {}); + auto expected_list_of_structs_column = cudf::make_lists_column( + 5, offsets{0, 2, 4, 6, 8, 10}.release(), std::move(expected_struct_col), 0, {}); std::vector> expected_vector_of_columns; expected_vector_of_columns.push_back(std::move(expected_list_of_structs_column)); return structs{std::move(expected_vector_of_columns), {0, 1, 1, 1, 1}}; diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 141c06733a6..b682ecbbae9 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -1313,11 +1313,8 @@ TEST_F(ParquetWriterTest, ListOfStruct) cudf::test::fixed_width_column_wrapper{0, 2, 5, 5, 6}.release(); auto num_list_rows = list_offsets_column->size() - 1; - auto list_col = cudf::make_lists_column(num_list_rows, - 
std::move(list_offsets_column), - std::move(struct_2), - cudf::UNKNOWN_NULL_COUNT, - {}); + auto list_col = cudf::make_lists_column( + num_list_rows, std::move(list_offsets_column), std::move(struct_2), 0, {}); auto expected = table_view({*list_col}); @@ -1779,11 +1776,8 @@ TEST_F(ParquetChunkedWriterTest, ListOfStruct) cudf::test::fixed_width_column_wrapper{0, 2, 3, 3}.release(); auto num_list_rows_1 = list_offsets_column_1->size() - 1; - auto list_col_1 = cudf::make_lists_column(num_list_rows_1, - std::move(list_offsets_column_1), - struct_2_1.release(), - cudf::UNKNOWN_NULL_COUNT, - {}); + auto list_col_1 = cudf::make_lists_column( + num_list_rows_1, std::move(list_offsets_column_1), struct_2_1.release(), 0, {}); auto table_1 = table_view({*list_col_1}); @@ -1798,11 +1792,8 @@ TEST_F(ParquetChunkedWriterTest, ListOfStruct) cudf::test::fixed_width_column_wrapper{0, 1, 2, 3}.release(); auto num_list_rows_2 = list_offsets_column_2->size() - 1; - auto list_col_2 = cudf::make_lists_column(num_list_rows_2, - std::move(list_offsets_column_2), - struct_2_2.release(), - cudf::UNKNOWN_NULL_COUNT, - {}); + auto list_col_2 = cudf::make_lists_column( + num_list_rows_2, std::move(list_offsets_column_2), struct_2_2.release(), 0, {}); auto table_2 = table_view({*list_col_2}); @@ -1861,11 +1852,8 @@ TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) cudf::test::fixed_width_column_wrapper{0, 2, 3, 4}.release(); auto num_list_rows_1 = list_offsets_column_1->size() - 1; - auto list_col_1 = cudf::make_lists_column(num_list_rows_1, - std::move(list_offsets_column_1), - struct_2_1.release(), - cudf::UNKNOWN_NULL_COUNT, - {}); + auto list_col_1 = cudf::make_lists_column( + num_list_rows_1, std::move(list_offsets_column_1), struct_2_1.release(), 0, {}); auto table_1 = table_view({*list_col_1}); @@ -1889,11 +1877,8 @@ TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) cudf::test::fixed_width_column_wrapper{0, 1, 2}.release(); auto num_list_rows_2 = 
list_offsets_column_2->size() - 1; - auto list_col_2 = cudf::make_lists_column(num_list_rows_2, - std::move(list_offsets_column_2), - struct_2_2.release(), - cudf::UNKNOWN_NULL_COUNT, - {}); + auto list_col_2 = cudf::make_lists_column( + num_list_rows_2, std::move(list_offsets_column_2), struct_2_2.release(), 0, {}); auto table_2 = table_view({*list_col_2}); diff --git a/cpp/tests/lists/explode_tests.cpp b/cpp/tests/lists/explode_tests.cpp index 12b981e3611..7b2719196f9 100644 --- a/cpp/tests/lists/explode_tests.cpp +++ b/cpp/tests/lists/explode_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -469,8 +469,8 @@ TYPED_TEST(ExplodeTypedTest, ListOfStructs) cudf::test::strings_column_wrapper string_col{ "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; auto struct_col = cudf::test::structs_column_wrapper{{numeric_col, string_col}}.release(); - auto a = cudf::make_lists_column( - 5, FCW{0, 2, 4, 6, 8, 10}.release(), std::move(struct_col), cudf::UNKNOWN_NULL_COUNT, {}); + auto a = + cudf::make_lists_column(5, FCW{0, 2, 4, 6, 8, 10}.release(), std::move(struct_col), 0, {}); FCW b{100, 200, 300, 400, 500}; @@ -1118,8 +1118,8 @@ TYPED_TEST(ExplodeOuterTypedTest, ListOfStructs) cudf::test::strings_column_wrapper string_col{ "70", "75", "50", "55", "35", "45", "25", "30", "15", "20"}; auto struct_col = cudf::test::structs_column_wrapper{{numeric_col, string_col}}.release(); - auto a = cudf::make_lists_column( - 5, FCW{0, 2, 4, 6, 8, 10}.release(), std::move(struct_col), cudf::UNKNOWN_NULL_COUNT, {}); + auto a = + cudf::make_lists_column(5, FCW{0, 2, 4, 6, 8, 10}.release(), std::move(struct_col), 0, {}); FCW b{100, 200, 300, 400, 500}; diff --git a/cpp/tests/reductions/tdigest_tests.cu b/cpp/tests/reductions/tdigest_tests.cu index 
ca25c074ada..19112f2b713 100644 --- a/cpp/tests/reductions/tdigest_tests.cu +++ b/cpp/tests/reductions/tdigest_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,8 +97,8 @@ TEST_F(ReductionTDigestMerge, FewHeavyCentroids) cudf::test::fixed_width_column_wrapper c0w{100.0, 50.0}; cudf::test::structs_column_wrapper c0s({c0c, c0w}); cudf::test::fixed_width_column_wrapper c0_offsets{0, 2}; - auto c0l = cudf::make_lists_column( - 1, c0_offsets.release(), c0s.release(), cudf::UNKNOWN_NULL_COUNT, rmm::device_buffer{}); + auto c0l = + cudf::make_lists_column(1, c0_offsets.release(), c0s.release(), 0, rmm::device_buffer{}); cudf::test::fixed_width_column_wrapper c0min{1.0}; cudf::test::fixed_width_column_wrapper c0max{2.0}; std::vector> c0_children; @@ -114,8 +114,8 @@ TEST_F(ReductionTDigestMerge, FewHeavyCentroids) cudf::test::fixed_width_column_wrapper c1w{200.0, 50.0}; cudf::test::structs_column_wrapper c1s({c1c, c1w}); cudf::test::fixed_width_column_wrapper c1_offsets{0, 2}; - auto c1l = cudf::make_lists_column( - 1, c1_offsets.release(), c1s.release(), cudf::UNKNOWN_NULL_COUNT, rmm::device_buffer{}); + auto c1l = + cudf::make_lists_column(1, c1_offsets.release(), c1s.release(), 0, rmm::device_buffer{}); cudf::test::fixed_width_column_wrapper c1min{3.0}; cudf::test::fixed_width_column_wrapper c1max{4.0}; std::vector> c1_children; @@ -150,8 +150,7 @@ TEST_F(ReductionTDigestMerge, FewHeavyCentroids) cudf::test::fixed_width_column_wrapper ew{100.0, 50.0, 200.0, 50.0}; cudf::test::structs_column_wrapper es({ec, ew}); cudf::test::fixed_width_column_wrapper e_offsets{0, 4}; - auto el = cudf::make_lists_column( - 1, e_offsets.release(), es.release(), cudf::UNKNOWN_NULL_COUNT, rmm::device_buffer{}); + auto el = cudf::make_lists_column(1, e_offsets.release(), 
es.release(), 0, rmm::device_buffer{}); cudf::test::fixed_width_column_wrapper emin{1.0}; cudf::test::fixed_width_column_wrapper emax{4.0}; std::vector> e_children; diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp index 68099ad41d8..7c37523fae2 100644 --- a/cpp/tests/sort/sort_test.cpp +++ b/cpp/tests/sort/sort_test.cpp @@ -1053,9 +1053,9 @@ TEST_F(SortCornerTest, WithEmptyStructColumn) // struct{}, int, int int_col col_for_mask{{0, 0, 0, 0, 0, 0}, {1, 0, 1, 1, 1, 1}}; - auto null_mask = cudf::copy_bitmask(col_for_mask.release()->view()); - auto struct_col = - cudf::make_structs_column(6, {}, cudf::UNKNOWN_NULL_COUNT, std::move(null_mask)); + auto null_mask = cudf::copy_bitmask(col_for_mask); + auto struct_col = cudf::make_structs_column( + 6, {}, cudf::column_view(col_for_mask).null_count(), std::move(null_mask)); int_col col1{{1, 2, 3, 1, 2, 3}}; int_col col2{{1, 1, 1, 2, 2, 2}}; @@ -1082,10 +1082,10 @@ TEST_F(SortCornerTest, WithEmptyStructColumn) // struct{struct{}, struct{int}} int_col col_for_mask2{{0, 0, 0, 0, 0, 0}, {1, 0, 1, 1, 0, 1}}; - auto null_mask2 = cudf::copy_bitmask(col_for_mask2.release()->view()); + auto null_mask2 = cudf::copy_bitmask(col_for_mask2); std::vector> child_columns2; - auto child_col_1 = - cudf::make_structs_column(6, {}, cudf::UNKNOWN_NULL_COUNT, std::move(null_mask2)); + auto child_col_1 = cudf::make_structs_column( + 6, {}, cudf::column_view(col_for_mask2).null_count(), std::move(null_mask2)); child_columns2.push_back(std::move(child_col_1)); int_col col4{{5, 4, 3, 2, 1, 0}}; std::vector> grand_child; diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index 2f8bfa847fa..78e9f8d01b3 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -318,7 +318,7 @@ TEST_F(ApplyBooleanMask, ListOfStructsFiltering) cudf::make_lists_column(5, fixed_width_column_wrapper{0, 2, 4, 6, 8, 10}.release(), struct_column.release(), - cudf::UNKNOWN_NULL_COUNT, + 0, {}); auto filter_mask = fixed_width_column_wrapper{{1, 0, 1, 0, 1}}; @@ -340,7 +340,7 @@ TEST_F(ApplyBooleanMask, ListOfStructsFiltering) cudf::make_lists_column(3, fixed_width_column_wrapper{0, 2, 4, 6}.release(), expected_struct_column.release(), - cudf::UNKNOWN_NULL_COUNT, + 0, {}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(filtered_list_column, diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index 0daec0d8290..54158e486f3 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -422,11 +422,8 @@ TYPED_TEST(TypedStructColumnWrapperTest, TestListsOfStructs) cudf::test::fixed_width_column_wrapper{0, 2, 3, 5, 6}.release(); auto num_list_rows = list_offsets_column->size() - 1; - auto list_col = cudf::make_lists_column(num_list_rows, - std::move(list_offsets_column), - std::move(struct_col), - cudf::UNKNOWN_NULL_COUNT, - {}); + auto list_col = cudf::make_lists_column( + num_list_rows, std::move(list_offsets_column), std::move(struct_col), 0, {}); // List of structs was constructed successfully. No exceptions. // Verify that child columns is as it was set. 
@@ -552,12 +549,8 @@ TYPED_TEST(TypedStructColumnWrapperTest, EmptyColumnsOfStructs) EXPECT_TRUE(struct_column->size() == 0); EXPECT_TRUE(struct_column->null_count() == 0); - auto empty_list_of_structs = - cudf::make_lists_column(0, - fixed_width_column_wrapper{0}.release(), - std::move(struct_column), - cudf::UNKNOWN_NULL_COUNT, - {}); + auto empty_list_of_structs = cudf::make_lists_column( + 0, fixed_width_column_wrapper{0}.release(), std::move(struct_column), 0, {}); EXPECT_TRUE(empty_list_of_structs->size() == 0); EXPECT_TRUE(empty_list_of_structs->null_count() == 0); @@ -613,11 +606,8 @@ TYPED_TEST(TypedStructColumnWrapperTest, CopyColumnFromView) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(clone_structs_column, structs_column); auto list_of_structs_column = - cudf::make_lists_column(3, - fixed_width_column_wrapper{0, 2, 4, 6}.release(), - structs_column.release(), - cudf::UNKNOWN_NULL_COUNT, - {}) + cudf::make_lists_column( + 3, fixed_width_column_wrapper{0, 2, 4, 6}.release(), structs_column.release(), 0, {}) .release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_of_structs_column->view(), diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 2174eabb376..51a5363c6dd 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -397,7 +397,7 @@ std::unique_ptr build_nested_column1(std::vector const& stru return cudf::make_lists_column(static_cast(size), outer_offsets_col.release(), struct_column.release(), - cudf::UNKNOWN_NULL_COUNT, + 0, rmm::device_buffer{}); } @@ -429,7 +429,7 @@ std::unique_ptr build_nested_column2(std::vector const& stru return make_lists_column(static_cast(size), outer_offsets_col.release(), outer_struct.release(), - cudf::UNKNOWN_NULL_COUNT, + 0, rmm::device_buffer{}); } @@ -514,7 +514,7 @@ TEST_F(RowBitCount, NestedTypes) auto l4 = cudf::make_lists_column(static_cast(l4_size), l4_offsets_col.release(), innermost_struct.release(), - cudf::UNKNOWN_NULL_COUNT, + 0, rmm::device_buffer{}); // inner struct diff --git a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp index a73e2374e02..55861f8f8db 100644 --- a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp @@ -1380,8 +1380,8 @@ TYPED_TEST(ListColumnWrapperTestTyped, ListsOfStructs) auto lists_column_offsets = test::fixed_width_column_wrapper{0, 2, 4, 8}.release(); auto num_lists = lists_column_offsets->size() - 1; - auto lists_column = make_lists_column( - num_lists, std::move(lists_column_offsets), std::move(struct_column), UNKNOWN_NULL_COUNT, {}); + auto lists_column = + make_lists_column(num_lists, std::move(lists_column_offsets), std::move(struct_column), 0, {}); // Check if child column is unchanged. 
@@ -1444,18 +1444,14 @@ TYPED_TEST(ListColumnWrapperTestTyped, ListsOfListsOfStructs) auto lists_column_offsets = test::fixed_width_column_wrapper{0, 2, 4, 8}.release(); auto num_lists = lists_column_offsets->size() - 1; - auto lists_column = make_lists_column( - num_lists, std::move(lists_column_offsets), std::move(struct_column), UNKNOWN_NULL_COUNT, {}); + auto lists_column = + make_lists_column(num_lists, std::move(lists_column_offsets), std::move(struct_column), 0, {}); auto lists_of_lists_column_offsets = test::fixed_width_column_wrapper{0, 2, 3}.release(); - auto num_lists_of_lists = lists_of_lists_column_offsets->size() - 1; - auto lists_of_lists_of_structs_column = - make_lists_column(num_lists_of_lists, - std::move(lists_of_lists_column_offsets), - std::move(lists_column), - UNKNOWN_NULL_COUNT, - {}); + auto num_lists_of_lists = lists_of_lists_column_offsets->size() - 1; + auto lists_of_lists_of_structs_column = make_lists_column( + num_lists_of_lists, std::move(lists_of_lists_column_offsets), std::move(lists_column), 0, {}); // Check if child column is unchanged. @@ -1555,11 +1551,8 @@ TYPED_TEST(ListColumnWrapperTestTyped, LargeListsOfStructsWithValidity) auto list_offset_column = test::fixed_width_column_wrapper( list_offset_iterator, list_offset_iterator + num_list_rows + 1) .release(); - auto lists_column = make_lists_column(num_list_rows, - std::move(list_offset_column), - std::move(struct_column), - cudf::UNKNOWN_NULL_COUNT, - {}); + auto lists_column = make_lists_column( + num_list_rows, std::move(list_offset_column), std::move(struct_column), 0, {}); // List construction succeeded. // Verify that the child is unchanged. From e4557cbcf803865ff9333b0c6fa45c966b530518 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 10 Mar 2023 13:30:36 -0600 Subject: [PATCH 4/4] Update minimum `pandas` and `numpy` pinnings (#12887) This PR: - [x] Increments the minimum pinning for `pandas` version from `1.0` to `1.3`. 
- [x] Sets a minimum pinning for `numpy` as `>=1.21` - [x] Fixes arm conda environment creation by removing `pandoc` version constraint. Resolves #12785. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12887 --- .../all_cuda-118_arch-x86_64.yaml | 6 +- conda/recipes/cudf/meta.yaml | 4 +- dependencies.yaml | 6 +- python/cudf/cudf/core/_compat.py | 4 - python/cudf/cudf/core/column/datetime.py | 6 +- python/cudf/cudf/core/dtypes.py | 12 +- python/cudf/cudf/core/multiindex.py | 4 +- python/cudf/cudf/testing/testing.py | 36 ++--- python/cudf/cudf/tests/test_categorical.py | 3 +- python/cudf/cudf/tests/test_dataframe.py | 25 +-- python/cudf/cudf/tests/test_dtypes.py | 8 +- python/cudf/cudf/tests/test_groupby.py | 13 +- python/cudf/cudf/tests/test_index.py | 33 +--- python/cudf/cudf/tests/test_indexing.py | 23 +-- python/cudf/cudf/tests/test_joining.py | 8 +- python/cudf/cudf/tests/test_json.py | 14 +- python/cudf/cudf/tests/test_multiindex.py | 3 +- python/cudf/cudf/tests/test_repr.py | 7 - python/cudf/cudf/tests/test_reshape.py | 15 +- python/cudf/cudf/tests/test_rolling.py | 38 ++--- python/cudf/cudf/tests/test_series.py | 11 +- python/cudf/cudf/tests/test_setitem.py | 6 +- python/cudf/cudf/tests/test_string.py | 150 ++++++------------ python/cudf/cudf/tests/test_timedelta.py | 49 +----- python/cudf/cudf/utils/dtypes.py | 17 +- python/cudf/pyproject.toml | 4 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 17 +- python/dask_cudf/pyproject.toml | 8 +- 28 files changed, 131 insertions(+), 399 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 67e2dc4720e..6f9734eb314 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ 
-41,13 +41,13 @@ dependencies: - ninja - notebook - numba>=0.56.2 -- numpy +- numpy>=1.21 - numpydoc - nvcc_linux-64=11.8 - nvtx>=0.2.1 - packaging -- pandas>=1.0,<1.6.0dev0 -- pandoc<=2.0.0 +- pandas>=1.3,<1.6.0dev0 +- pandoc - pip - pre-commit - protobuf>=4.21.6,<4.22 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index e0f33ad40c7..6b23c8953d3 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -62,10 +62,10 @@ requirements: - protobuf >=4.21.6,<4.22 - python - typing_extensions - - pandas >=1.0,<1.6.0dev0 + - pandas >=1.3,<1.6.0dev0 - cupy >=9.5.0,<12.0.0a0 - numba >=0.56.2 - - numpy + - numpy >=1.21 - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} - libcudf {{ version }} - fastavro >=0.22.0 diff --git a/dependencies.yaml b/dependencies.yaml index 4bac8148b10..48b5bfe53d4 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -152,7 +152,7 @@ dependencies: - myst-nb - nbsphinx - numpydoc - - pandoc<=2.0.0 # We should check and fix all "<=" pinnings + - pandoc - pydata-sphinx-theme - sphinx - sphinx-autobuild @@ -254,10 +254,10 @@ dependencies: - distributed>=2023.1.1 - fsspec>=0.6.0 - numba>=0.56.2 - - numpy + - numpy>=1.21 - nvtx>=0.2.1 - packaging - - pandas>=1.0,<1.6.0dev0 + - pandas>=1.3,<1.6.0dev0 - python-confluent-kafka=1.7.0 - streamz - typing_extensions diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index 722f9677db0..6ecbe414ebb 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -4,10 +4,6 @@ from packaging import version PANDAS_VERSION = version.parse(pd.__version__) -PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1") -PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2") -PANDAS_LE_122 = PANDAS_VERSION <= version.parse("1.2.2") -PANDAS_GE_130 = PANDAS_VERSION >= version.parse("1.3.0") PANDAS_GE_133 = PANDAS_VERSION >= version.parse("1.3.3") PANDAS_GE_134 = PANDAS_VERSION >= version.parse("1.3.4") PANDAS_LT_140 = 
PANDAS_VERSION < version.parse("1.4.0") diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 0c546168fe3..14aa7bdd84b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -21,16 +21,12 @@ ScalarLike, ) from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype -from cudf.core._compat import PANDAS_GE_120 from cudf.core.buffer import Buffer, cuda_array_interface_wrapper from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.utils import _fillna_natwise -if PANDAS_GE_120: - _guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format -else: - _guess_datetime_format = pd.core.tools.datetimes._guess_datetime_format +_guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format # nanoseconds per time_unit _dtype_to_format_conversion = { diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 963f13acf10..d6edd6af093 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -19,7 +19,7 @@ import cudf from cudf._typing import Dtype -from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150 +from cudf.core._compat import PANDAS_GE_150 from cudf.core.abc import Serializable from cudf.core.buffer import Buffer from cudf.utils.docutils import doc_apply @@ -875,16 +875,10 @@ def to_arrow(self): @classmethod def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype": - if PANDAS_GE_130: - return cls(subtype=pd_dtype.subtype, closed=pd_dtype.closed) - else: - return cls(subtype=pd_dtype.subtype) + return cls(subtype=pd_dtype.subtype, closed=pd_dtype.closed) def to_pandas(self) -> pd.IntervalDtype: - if PANDAS_GE_130: - return pd.IntervalDtype(subtype=self.subtype, closed=self.closed) - else: - return pd.IntervalDtype(subtype=self.subtype) + return 
pd.IntervalDtype(subtype=self.subtype, closed=self.closed) def __eq__(self, other): if isinstance(other, str): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 1ce4cc218f8..4a9bc89fa34 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -21,7 +21,7 @@ from cudf._typing import DataFrameOrSeries from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column -from cudf.core._compat import PANDAS_GE_120, PANDAS_GE_150 +from cudf.core._compat import PANDAS_GE_150 from cudf.core.frame import Frame from cudf.core.index import ( BaseIndex, @@ -495,7 +495,7 @@ def __repr__(self): ) ) - if PANDAS_GE_120 and not PANDAS_GE_150: + if not PANDAS_GE_150: # Need this whole `if` block, # this is a workaround for the following issue: # https://github.com/pandas-dev/pandas/issues/39984 diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index a8428c2647b..484c013f774 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from __future__ import annotations @@ -19,7 +19,6 @@ is_string_dtype, is_struct_dtype, ) -from cudf.core._compat import PANDAS_GE_110 from cudf.core.missing import NA @@ -699,28 +698,17 @@ def assert_frame_equal( obj=f"{obj}.index", ) - if PANDAS_GE_110: - pd.testing.assert_index_equal( - left._data.to_pandas_index(), - right._data.to_pandas_index(), - exact=check_column_type, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=f"{obj}.columns", - ) - else: - pd.testing.assert_index_equal( - left._data.to_pandas_index(), - right._data.to_pandas_index(), - exact=check_column_type, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - obj=f"{obj}.columns", - ) + pd.testing.assert_index_equal( + left._data.to_pandas_index(), + right._data.to_pandas_index(), + exact=check_column_type, + check_names=check_names, + check_exact=check_exact, + check_categorical=check_categorical, + rtol=rtol, + atol=atol, + obj=f"{obj}.columns", + ) for col in left._column_names: assert_column_equal( diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 496039ca2f8..2c8226e4fe5 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -11,7 +11,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_134 +from cudf.core._compat import PANDAS_GE_134 from cudf.testing._utils import ( NUMERIC_TYPES, assert_eq, @@ -81,7 +81,6 @@ def test_categorical_basic(): assert_eq(cat.codes, cudf_cat.codes.to_numpy()) -@pytest.mark.skipif(not PANDAS_GE_110, reason="requires pandas>=1.1.0") def test_categorical_integer(): cat = pd.Categorical(["a", "_", "_", "c", "a"], categories=["a", "b", "c"]) pdsr = pd.Series(cat) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 7ddfa3a7f48..6a79555d43e 100644 --- 
a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -22,13 +22,7 @@ from packaging import version import cudf -from cudf.core._compat import ( - PANDAS_GE_110, - PANDAS_GE_120, - PANDAS_GE_134, - PANDAS_GE_150, - PANDAS_LT_140, -) +from cudf.core._compat import PANDAS_GE_134, PANDAS_GE_150, PANDAS_LT_140 from cudf.core.buffer.spill_manager import get_global_manager from cudf.core.column import column from cudf.testing import _utils as utils @@ -3227,10 +3221,6 @@ def test_dataframe_reindex_fill_value( @pytest.mark.parametrize("copy", [True, False]) def test_dataframe_reindex_change_dtype(copy): - if PANDAS_GE_110: - kwargs = {"check_freq": False} - else: - kwargs = {} index = pd.date_range("12/29/2009", periods=10, freq="D") columns = ["a", "b", "c", "d", "e"] gdf = cudf.datasets.randomdata( @@ -3242,7 +3232,7 @@ def test_dataframe_reindex_change_dtype(copy): assert_eq( pdf.reindex(index=index, columns=columns, copy=True), gdf.reindex(index=index, columns=columns, copy=copy), - **kwargs, + check_freq=False, ) @@ -4632,10 +4622,6 @@ def test_isin_dataframe(data, values): else: try: expected = pdf.isin(values) - except ValueError as e: - if str(e) == "Lengths must match." 
and not PANDAS_GE_110: - # https://github.com/pandas-dev/pandas/issues/34256 - return except TypeError as e: # Can't do isin with different categories if str(e) == ( @@ -5302,12 +5288,7 @@ def test_rowwise_ops_datetime_dtypes_pdbug(data): expected = pdf.max(axis=1, skipna=False) got = gdf.max(axis=1, skipna=False) - if PANDAS_GE_120: - assert_eq(got, expected) - else: - # PANDAS BUG: https://github.com/pandas-dev/pandas/issues/36907 - with pytest.raises(AssertionError, match="numpy array are different"): - assert_eq(got, expected) + assert_eq(got, expected) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 2f8e1ac5c2f..6e24099f1a8 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -6,7 +6,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_150 +from cudf.core._compat import PANDAS_GE_150 from cudf.core.column import ColumnBase from cudf.core.dtypes import ( CategoricalDtype, @@ -187,10 +187,6 @@ def test_interval_dtype_pyarrow_round_trip(subtype, closed): assert expect.equals(got) -@pytest.mark.skipif( - not PANDAS_GE_130, - reason="pandas<1.3.0 doesn't have a closed argument for IntervalDtype", -) def test_interval_dtype_from_pandas(subtype, closed): expect = cudf.IntervalDtype(subtype, closed=closed) pd_type = pd.IntervalDtype(subtype, closed=closed) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 97700779a89..0751ef7ca67 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -15,12 +15,7 @@ import cudf from cudf import DataFrame, Series -from cudf.core._compat import ( - PANDAS_GE_110, - PANDAS_GE_130, - PANDAS_GE_150, - PANDAS_LT_140, -) +from cudf.core._compat 
import PANDAS_GE_150, PANDAS_LT_140 from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES from cudf.testing._utils import ( DATETIME_TYPES, @@ -573,7 +568,7 @@ def test_groupby_2keys_agg(nelem, func): # "func", ["min", "max", "idxmin", "idxmax", "count", "sum"], ) @pytest.mark.xfail( - condition=PANDAS_GE_130 and PANDAS_LT_140, + condition=PANDAS_LT_140, reason="https://github.com/pandas-dev/pandas/issues/43209", ) def test_groupby_agg_decimal(num_groups, nelem_per_group, func): @@ -1507,9 +1502,6 @@ def test_groupby_median(agg, by): @pytest.mark.parametrize("agg", [lambda x: x.nunique(), "nunique"]) @pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) -@pytest.mark.xfail( - condition=not PANDAS_GE_110, reason="pandas >= 1.1 required" -) def test_groupby_nunique(agg, by): pdf = pd.DataFrame( {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]} @@ -1545,7 +1537,6 @@ def test_groupby_nth(n, by): @pytest.mark.xfail( - condition=PANDAS_GE_130, reason="https://github.com/pandas-dev/pandas/issues/43209", ) def test_raise_data_error(): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 29601cbd203..d043b917251 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -11,7 +11,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_133, PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_133, PANDAS_GE_200 from cudf.core.index import ( CategoricalIndex, DatetimeIndex, @@ -811,17 +811,6 @@ def test_index_difference(data, other, sort): gd_data = cudf.core.index.as_index(data) gd_other = cudf.core.index.as_index(other) - if ( - gd_data.dtype.kind == "f" - and gd_other.dtype.kind != "f" - or (gd_data.dtype.kind != "f" and gd_other.dtype.kind == "f") - ): - pytest.mark.xfail( - condition=not PANDAS_GE_110, - reason="Bug in Pandas: " - "https://github.com/pandas-dev/pandas/issues/35217", - ) - expected = 
pd_data.difference(pd_other, sort=sort) actual = gd_data.difference(gd_other, sort=sort) assert_eq(expected, actual) @@ -880,15 +869,6 @@ def test_index_equals(data, other): gd_data = cudf.core.index.as_index(data) gd_other = cudf.core.index.as_index(other) - if ( - gd_data.dtype.kind == "f" or gd_other.dtype.kind == "f" - ) and cudf.utils.dtypes.is_mixed_with_object_dtype(gd_data, gd_other): - pytest.mark.xfail( - condition=not PANDAS_GE_110, - reason="Bug in Pandas: " - "https://github.com/pandas-dev/pandas/issues/35217", - ) - expected = pd_data.equals(pd_other) actual = gd_data.equals(gd_other) assert_eq(expected, actual) @@ -935,17 +915,6 @@ def test_index_categories_equal(data, other): gd_data = cudf.core.index.as_index(data).astype("category") gd_other = cudf.core.index.as_index(other) - if ( - gd_data.dtype.kind == "f" - and gd_other.dtype.kind != "f" - or (gd_data.dtype.kind != "f" and gd_other.dtype.kind == "f") - ): - pytest.mark.xfail( - condition=not PANDAS_GE_110, - reason="Bug in Pandas: " - "https://github.com/pandas-dev/pandas/issues/35217", - ) - expected = pd_data.equals(pd_other) actual = gd_data.equals(gd_other) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 634466e92a3..5012ae0979f 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -8,7 +8,6 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_120 from cudf.testing import _utils as utils from cudf.testing._utils import ( INTEGER_TYPES, @@ -451,10 +450,6 @@ def test_series_loc_string(): def test_series_loc_datetime(): - if PANDAS_GE_110: - kwargs = {"check_freq": False} - else: - kwargs = {} ps = pd.Series( [1, 2, 3, 4, 5], index=pd.date_range("20010101", "20010105") ) @@ -475,11 +470,11 @@ def test_series_loc_datetime(): assert_eq( ps.loc["2001-01-02":"2001-01-05"], gs.loc["2001-01-02":"2001-01-05"], - **kwargs, + check_freq=False, 
) - assert_eq(ps.loc["2001-01-02":], gs.loc["2001-01-02":], **kwargs) - assert_eq(ps.loc[:"2001-01-04"], gs.loc[:"2001-01-04"], **kwargs) - assert_eq(ps.loc[::2], gs.loc[::2], **kwargs) + assert_eq(ps.loc["2001-01-02":], gs.loc["2001-01-02":], check_freq=False) + assert_eq(ps.loc[:"2001-01-04"], gs.loc[:"2001-01-04"], check_freq=False) + assert_eq(ps.loc[::2], gs.loc[::2], check_freq=False) assert_eq( ps.loc[["2001-01-01", "2001-01-04", "2001-01-05"]], @@ -505,13 +500,15 @@ def test_series_loc_datetime(): assert_eq( ps.loc[[True, False, True, False, True]], gs.loc[[True, False, True, False, True]], - **kwargs, + check_freq=False, ) just_less_than_max = ps.index.max() - pd.Timedelta("5m") assert_eq( - ps.loc[:just_less_than_max], gs.loc[:just_less_than_max], **kwargs + ps.loc[:just_less_than_max], + gs.loc[:just_less_than_max], + check_freq=False, ) @@ -1012,10 +1009,6 @@ def test_series_setitem_datetime(): assert_eq(psr, gsr) -@pytest.mark.xfail( - condition=not PANDAS_GE_120, - reason="Pandas will coerce to object datatype here", -) def test_series_setitem_datetime_coerced(): psr = pd.Series(["2001", "2002", "2003"], dtype="datetime64[ns]") gsr = cudf.from_pandas(psr) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 94da7a50c2e..b197e91882a 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
from itertools import combinations, product, repeat @@ -7,7 +7,6 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_120 from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import ( INTEGER_TYPES, @@ -545,11 +544,6 @@ def test_empty_joins(how, left_empty, right_empty): assert len(expected) == len(result) -@pytest.mark.xfail( - condition=not PANDAS_GE_120, - reason="left_on/right_on produces undefined results with 0" - "index and is disabled", -) def test_merge_left_index_zero(): left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5]) right = pd.DataFrame( diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index b778db4465f..8dcab37d20a 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -13,7 +13,6 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_110 from cudf.testing._utils import ( DATETIME_TYPES, NUMERIC_TYPES, @@ -165,18 +164,7 @@ def test_json_writer(tmpdir, pdf, gdf): assert os.path.exists(pdf_series_fname) assert os.path.exists(gdf_series_fname) - try: - # xref 'https://github.com/pandas-dev/pandas/pull/33373' - expect_series = pd.read_json(pdf_series_fname, typ="series") - except TypeError as e: - if ( - not PANDAS_GE_110 - and str(e) == " is not convertible to datetime" - ): - continue - else: - raise e - + expect_series = pd.read_json(pdf_series_fname, typ="series") got_series = pd.read_json(gdf_series_fname, typ="series") assert_eq(expect_series, got_series) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index bd9f36a595d..0f04e8c0f2d 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -16,7 +16,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_130, PANDAS_GE_200 +from cudf.core._compat import PANDAS_GE_200 from cudf.core.column import as_column 
from cudf.core.index import as_index from cudf.testing._utils import ( @@ -1102,7 +1102,6 @@ def test_multicolumn_loc(pdf, pdfIndex): @pytest.mark.xfail( - condition=PANDAS_GE_130, reason="https://github.com/pandas-dev/pandas/issues/43351", ) def test_multicolumn_set_item(pdf, pdfIndex): diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index bae0fde6463..e7fa401f1ec 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -9,7 +9,6 @@ from hypothesis import given, settings, strategies as st import cudf -from cudf.core._compat import PANDAS_GE_110 from cudf.testing import _utils as utils from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes @@ -600,9 +599,6 @@ def test_series_null_index_repr(sr, pandas_special_case): ], ) @pytest.mark.parametrize("dtype", ["timedelta64[s]", "timedelta64[us]"]) -@pytest.mark.xfail( - condition=not PANDAS_GE_110, reason="pandas >= 1.1 required" -) def test_timedelta_series_s_us_repr(data, dtype): sr = cudf.Series(data, dtype=dtype) psr = sr.to_pandas() @@ -1103,9 +1099,6 @@ def test_timedelta_dataframe_repr(df, expected_repr): ), ], ) -@pytest.mark.xfail( - condition=not PANDAS_GE_110, reason="pandas >= 1.1 required" -) def test_timedelta_index_repr(index, expected_repr): actual_repr = repr(index) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 37ffbab1676..78e95fdbd81 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -1,6 +1,7 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
import re +from itertools import chain import numpy as np import pandas as pd @@ -8,7 +9,6 @@ import cudf from cudf import melt as cudf_melt -from cudf.core._compat import PANDAS_GE_120 from cudf.core.buffer.spill_manager import get_global_manager from cudf.testing._utils import ( ALL_TYPES, @@ -86,16 +86,7 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) @pytest.mark.parametrize( - "dtype", - list(NUMERIC_TYPES + DATETIME_TYPES) - + [ - pytest.param( - "str", - marks=pytest_xfail( - condition=not PANDAS_GE_120, reason="pandas bug" - ), - ) - ], + "dtype", list(chain(NUMERIC_TYPES, DATETIME_TYPES, ["str"])) ) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_df_stack(nulls, num_cols, num_rows, dtype): diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 08188c25ffa..62120619d94 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
import math from contextlib import contextmanager @@ -8,12 +8,7 @@ import pytest import cudf -from cudf.core._compat import ( - PANDAS_GE_110, - PANDAS_GE_130, - PANDAS_GE_150, - PANDAS_LT_140, -) +from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 from cudf.testing._utils import _create_pandas_series, assert_eq from cudf.testing.dataset_generator import rand_dataframe @@ -48,10 +43,7 @@ def _hide_pandas_rolling_min_periods_warning(agg): @pytest.mark.parametrize("center", [True, False]) def test_rolling_series_basic(data, index, agg, nulls, center): rng = np.random.default_rng(1) - if PANDAS_GE_110: - kwargs = {"check_freq": False} - else: - kwargs = {} + if len(data) > 0: if nulls == "one": p = rng.integers(0, len(data)) @@ -73,7 +65,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center): got = getattr( gsr.rolling(window_size, min_periods, center), agg )().fillna(-1) - assert_eq(expect, got, check_dtype=False, **kwargs) + assert_eq(expect, got, check_dtype=False, check_freq=False) @pytest.mark.parametrize( @@ -159,10 +151,6 @@ def test_rolling_with_offset(agg): @pytest.mark.parametrize("seed", [100, 2000]) @pytest.mark.parametrize("window_size", [2, 10, 100]) def test_rolling_var_std_large(agg, ddof, center, seed, window_size): - if PANDAS_GE_110: - kwargs = {"check_freq": False} - else: - kwargs = {} iupper_bound = math.sqrt(np.iinfo(np.int64).max / window_size) ilower_bound = -math.sqrt(abs(np.iinfo(np.int64).min) / window_size) @@ -214,15 +202,11 @@ def test_rolling_var_std_large(agg, ddof, center, seed, window_size): mask = (got[col].fillna(-1) != 0).to_pandas() expect[col] = expect[col][mask] got[col] = got[col][mask] - assert_eq(expect[col], got[col], **kwargs) + assert_eq(expect[col], got[col], check_freq=False) else: - assert_eq(expect, got, **kwargs) + assert_eq(expect, got, check_freq=False) -@pytest.mark.xfail( - condition=not PANDAS_GE_130, - reason="https://github.com/pandas-dev/pandas/issues/37051", -) def 
test_rolling_var_uniform_window(): """ Pandas adopts an online variance calculation algorithm. This gives a @@ -310,17 +294,17 @@ def test_rolling_getitem(): def test_rolling_getitem_window(): - if PANDAS_GE_110: - kwargs = {"check_freq": False} - else: - kwargs = {} index = pd.DatetimeIndex( pd.date_range("2000-01-01", "2000-01-02", freq="1h") ) pdf = pd.DataFrame({"x": np.arange(len(index))}, index=index) gdf = cudf.from_pandas(pdf) - assert_eq(pdf.rolling("2h").x.mean(), gdf.rolling("2h").x.mean(), **kwargs) + assert_eq( + pdf.rolling("2h").x.mean(), + gdf.rolling("2h").x.mean(), + check_freq=False, + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ce519a445ba..7123069d5b8 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -13,7 +13,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_120, PANDAS_LT_140 +from cudf.core._compat import PANDAS_LT_140 from cudf.testing._utils import ( NUMERIC_TYPES, TIMEDELTA_TYPES, @@ -1842,14 +1842,7 @@ def test_isin_datetime(data, values): ["this", "is"], [None, None, None], ["12", "14", "19"], - pytest.param( - [12, 14, 19], - marks=pytest.mark.xfail( - not PANDAS_GE_120, - reason="pandas's failure here seems like a bug(in < 1.2) " - "given the reverse succeeds", - ), - ), + [12, 14, 19], ["is", "this", "is", "this", "is"], ], ) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index d59226ee17a..4d9ffc7cd81 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_120, PANDAS_GE_150, PANDAS_LE_122 +from cudf.core._compat import PANDAS_GE_150 from cudf.testing._utils import assert_eq, assert_exceptions_equal @@ -20,10 +20,6 @@ def test_dataframe_setitem_bool_mask_scaler(df, arg, value): assert_eq(df, gdf) -@pytest.mark.xfail( 
- condition=PANDAS_GE_120 and PANDAS_LE_122, - reason="https://github.com/pandas-dev/pandas/issues/40204", -) def test_dataframe_setitem_scaler_bool(): df = pd.DataFrame({"a": [1, 2, 3]}) df[[True, False, True]] = pd.DataFrame({"a": [-1, -2]}) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 0e6ed444c32..10208611f13 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -15,7 +15,7 @@ import cudf from cudf import concat -from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_150 +from cudf.core._compat import PANDAS_GE_150 from cudf.core.column.string import StringColumn from cudf.core.index import StringIndex, as_index from cudf.testing._utils import ( @@ -415,20 +415,8 @@ def _cat_convert_seq_to_cudf(others): ("f", "g", "h", "i", "j"), pd.Series(["f", "g", "h", "i", "j"]), pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pytest.param( - pd.Index(["f", "g", "h", "i", "j"]), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_110, - reason="https://github.com/pandas-dev/pandas/issues/33436", - ), - ), - pytest.param( - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_110, - reason="https://github.com/pandas-dev/pandas/issues/33436", - ), - ), + pd.Index(["f", "g", "h", "i", "j"]), + pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), ( np.array(["f", "g", "h", "i", "j"]), np.array(["f", "g", "h", "i", "j"]), @@ -453,38 +441,26 @@ def _cat_convert_seq_to_cudf(others): pd.Series(["f", "g", "h", "i", "j"]), np.array(["f", "g", "h", "i", "j"]), ), - pytest.param( - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["1", "2", "3", "4", "5"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_110, - 
reason="https://github.com/pandas-dev/pandas/issues/33436", - ), - ), - pytest.param( - [ - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ], - marks=pytest.mark.xfail( - condition=not PANDAS_GE_110, - reason="https://github.com/pandas-dev/pandas/issues/33436", - ), + ( + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["1", "2", "3", "4", "5"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), ), + [ + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ], [ pd.Series(["hello", "world", "abc", "xyz", "pqr"]), pd.Series(["abc", "xyz", "hello", "pqr", "world"]), @@ -582,20 +558,8 @@ def test_string_cat(ps_gs, others, sep, na_rep, index): ("f", "g", "h", "i", "j"), pd.Series(["f", "g", "h", "i", "j"]), pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pytest.param( - pd.Index(["f", "g", "h", "i", "j"]), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_110, - reason="https://github.com/pandas-dev/pandas/issues/33436", - ), - ), - pytest.param( - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_110, - reason="https://github.com/pandas-dev/pandas/issues/33436", - ), - ), + pd.Index(["f", "g", "h", "i", "j"]), + pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), ( np.array(["f", "g", "h", "i", "j"]), np.array(["f", "g", "h", 
"i", "j"]), @@ -608,38 +572,26 @@ def test_string_cat(ps_gs, others, sep, na_rep, index): pd.Series(["f", "g", "h", "i", "j"]), pd.Series(["f", "g", "h", "i", "j"]), ], - pytest.param( - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["1", "2", "3", "4", "5"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_110, - reason="https://github.com/pandas-dev/pandas/issues/33436", - ), - ), - pytest.param( - [ - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ], - marks=pytest.mark.xfail( - condition=not PANDAS_GE_110, - reason="https://github.com/pandas-dev/pandas/issues/33436", - ), + ( + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["1", "2", "3", "4", "5"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), ), + [ + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Series(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + np.array(["f", "a", "b", "f", "a"]), + pd.Index(["f", "g", "h", "i", "j"]), + ], [ pd.Series( ["hello", "world", "abc", "xyz", "pqr"], @@ -701,20 +653,8 @@ def test_string_index_str_cat(data, others, sep, na_rep, name): None, ["f", "g", "h", "i", "j"], pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pytest.param( - pd.Index(["f", "g", 
"h", "i", "j"]), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_110, - reason="https://github.com/pandas-dev/pandas/issues/33436", - ), - ), - pytest.param( - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_110, - reason="https://github.com/pandas-dev/pandas/issues/33436", - ), - ), + pd.Index(["f", "g", "h", "i", "j"]), + pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), [ np.array(["f", "g", "h", "i", "j"]), np.array(["f", "g", "h", "i", "j"]), diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index 468773387c1..4b1e8cf1027 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -9,7 +9,6 @@ import pytest import cudf -from cudf.core._compat import PANDAS_GE_120 from cudf.testing import _utils as utils from cudf.testing._utils import assert_eq, assert_exceptions_equal @@ -414,13 +413,7 @@ def test_timedelta_dataframe_ops(df, op): np.timedelta64(4, "s"), np.timedelta64(456, "D"), np.timedelta64(46, "h"), - pytest.param( - np.timedelta64("nat"), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_120, - reason="https://github.com/pandas-dev/pandas/issues/35529", - ), - ), + np.timedelta64("nat"), np.timedelta64(1, "s"), np.timedelta64(1, "ms"), np.timedelta64(1, "us"), @@ -435,13 +428,7 @@ def test_timedelta_dataframe_ops(df, op): "sub", "truediv", "mod", - pytest.param( - "floordiv", - marks=pytest.mark.xfail( - condition=not PANDAS_GE_120, - reason="https://github.com/pandas-dev/pandas/issues/35529", - ), - ), + "floordiv", ], ) def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op): @@ -541,13 +528,7 @@ def test_timedelta_series_mod_with_scalar_zero(reverse): datetime.timedelta(seconds=768), datetime.timedelta(microseconds=7), np.timedelta64(4, "s"), - pytest.param( - np.timedelta64("nat", "s"), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_120, - 
reason="https://github.com/pandas-dev/pandas/issues/35529", - ), - ), + np.timedelta64("nat", "s"), np.timedelta64(1, "s"), np.timedelta64(1, "ms"), np.timedelta64(1, "us"), @@ -563,13 +544,7 @@ def test_timedelta_series_mod_with_scalar_zero(reverse): "sub", "truediv", "mod", - pytest.param( - "floordiv", - marks=pytest.mark.xfail( - condition=not PANDAS_GE_120, - reason="https://github.com/pandas-dev/pandas/issues/35529", - ), - ), + "floordiv", ], ) def test_timedelta_series_ops_with_cudf_scalars(data, cpu_scalar, dtype, op): @@ -858,13 +833,7 @@ def test_timedelta_datetime_index_ops_misc( "add", "sub", "truediv", - pytest.param( - "floordiv", - marks=pytest.mark.xfail( - condition=not PANDAS_GE_120, - reason="https://github.com/pandas-dev/pandas/issues/35529", - ), - ), + "floordiv", ], ) @pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning:pandas") @@ -938,13 +907,7 @@ def test_timedelta_index_ops_with_scalars( "add", "sub", "truediv", - pytest.param( - "floordiv", - marks=pytest.mark.xfail( - condition=not PANDAS_GE_120, - reason="https://github.com/pandas-dev/pandas/issues/35529", - ), - ), + "floordiv", ], ) def test_timedelta_index_ops_with_cudf_scalars( diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 92c23d8b97b..acf00b3a3d5 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
import datetime from collections import namedtuple @@ -12,7 +12,6 @@ import cudf from cudf.api.types import is_bool, is_float, is_integer -from cudf.core._compat import PANDAS_GE_120 from cudf.core.missing import NA _NA_REP = "" @@ -90,13 +89,13 @@ "boolean": "bool", } -if PANDAS_GE_120: - np_dtypes_to_pandas_dtypes[np.dtype("float32")] = pd.Float32Dtype() - np_dtypes_to_pandas_dtypes[np.dtype("float64")] = pd.Float64Dtype() - pandas_dtypes_to_np_dtypes[pd.Float32Dtype()] = np.dtype("float32") - pandas_dtypes_to_np_dtypes[pd.Float64Dtype()] = np.dtype("float64") - pandas_dtypes_alias_to_cudf_alias["Float32"] = "float32" - pandas_dtypes_alias_to_cudf_alias["Float64"] = "float64" + +np_dtypes_to_pandas_dtypes[np.dtype("float32")] = pd.Float32Dtype() +np_dtypes_to_pandas_dtypes[np.dtype("float64")] = pd.Float64Dtype() +pandas_dtypes_to_np_dtypes[pd.Float32Dtype()] = np.dtype("float32") +pandas_dtypes_to_np_dtypes[pd.Float64Dtype()] = np.dtype("float64") +pandas_dtypes_alias_to_cudf_alias["Float32"] = "float32" +pandas_dtypes_alias_to_cudf_alias["Float64"] = "float64" SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"} UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"} diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index ca14ccfc63e..5b259b1dc66 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -31,10 +31,10 @@ dependencies = [ "cuda-python>=11.7.1,<12.0", "fsspec>=0.6.0", "numba>=0.56.2", - "numpy", + "numpy>=1.21", "nvtx>=0.2.1", "packaging", - "pandas>=1.0,<1.6.0dev0", + "pandas>=1.3,<1.6.0dev0", "protobuf>=4.21.6,<4.22", "typing_extensions", # Allow floating minor versions for Arrow. diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 741dbc28e6c..cfb951901d3 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
+# Copyright (c) 2021-2023, NVIDIA CORPORATION. import contextlib @@ -11,7 +11,6 @@ from dask.utils_test import hlg_layer import cudf -from cudf.core._compat import PANDAS_GE_120 import dask_cudf from dask_cudf.groupby import OPTIMIZED_AGGS, _aggs_optimized @@ -160,18 +159,8 @@ def test_groupby_agg_empty_partition(tmpdir, split_out): @pytest.mark.parametrize( "func", [ - pytest.param( - lambda df: df.groupby(["a", "b"]).x.sum(), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_120, reason="pandas bug" - ), - ), - pytest.param( - lambda df: df.groupby(["a", "b"]).sum(), - marks=pytest.mark.xfail( - condition=not PANDAS_GE_120, reason="pandas bug" - ), - ), + lambda df: df.groupby(["a", "b"]).x.sum(), + lambda df: df.groupby(["a", "b"]).sum(), pytest.param( lambda df: df.groupby(["a", "b"]).agg({"x", "sum"}), marks=pytest.mark.xfail, diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 07b0edb6008..79a9aca9e96 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -21,8 +21,8 @@ dependencies = [ "dask>=2023.1.1", "distributed>=2023.1.1", "fsspec>=0.6.0", - "numpy", - "pandas>=1.0,<1.6.0dev0", + "numpy>=1.21", + "pandas>=1.3,<1.6.0dev0", "cudf==23.4.*", "cupy-cuda11x", ] @@ -40,8 +40,8 @@ dynamic = ["entry-points"] [project.optional-dependencies] test = [ - "numpy", - "pandas>=1.0,<1.6.0dev0", + "numpy>=1.21", + "pandas>=1.3,<1.6.0dev0", "pytest", "pytest-xdist", "numba>=0.56.2",