From 9def28c842334d0da8502cd94841ad91a273389e Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 10 May 2022 07:51:28 +0100 Subject: [PATCH 01/12] Generic serialization of all column types (#10784) Prior to this change, not all column types were serializable, with serialization implemented in an ad-hoc way for each type in turn. The inheritance is such that we can just implement generic serialization on the base column class (no subclass has a specialized constructor), so do that (closes #10766). To support this implement serialization for all dtypes. These must individually implement the `Serializable` interface since in contrast to columns every dtype has its own constructor (closes #10785). Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Ashwin Srinath (https://github.com/shwina) - Mads R. B. Kristensen (https://github.com/madsbk) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10784 --- python/cudf/cudf/core/buffer.py | 8 +- python/cudf/cudf/core/column/categorical.py | 59 +----------- python/cudf/cudf/core/column/column.py | 67 +++++++++++-- python/cudf/cudf/core/column/decimal.py | 14 +-- python/cudf/cudf/core/column/lists.py | 60 ------------ python/cudf/cudf/core/column/string.py | 52 ---------- python/cudf/cudf/core/dtypes.py | 89 +++++++++++++++--- .../stringColumnWithRangeIndex_cudf_0.16.pkl | Bin 1482 -> 1709 bytes python/cudf/cudf/tests/test_serialize.py | 75 ++++++++++++++- 9 files changed, 216 insertions(+), 208 deletions(-) diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 0658927975f..63e99f34803 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations import functools @@ -123,16 +123,20 @@ def serialize(self) -> Tuple[dict, list]: header["constructor-kwargs"] = {} header["desc"] = self.__cuda_array_interface__.copy() header["desc"]["strides"] = (1,) + header["frame_count"] = 1 frames = [self] return header, frames @classmethod def deserialize(cls, header: dict, frames: list) -> Buffer: + assert ( + header["frame_count"] == 1 + ), "Only expecting to deserialize Buffer with a single frame." buf = cls(frames[0], **header["constructor-kwargs"]) if header["desc"]["shape"] != buf.__cuda_array_interface__["shape"]: raise ValueError( - f"Recieved a `Buffer` with the wrong size." + f"Received a `Buffer` with the wrong size." 
f" Expected {header['desc']['shape']}, " f"but got {buf.__cuda_array_interface__['shape']}" ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 777e8ac7463..e28b7d059b7 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -2,19 +2,9 @@ from __future__ import annotations -import pickle from collections import abc from functools import cached_property -from typing import ( - TYPE_CHECKING, - Any, - Dict, - Mapping, - Optional, - Sequence, - Tuple, - cast, -) +from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast import numpy as np import pandas as pd @@ -685,53 +675,6 @@ def __contains__(self, item: ScalarLike) -> bool: return False return self._encode(item) in self.as_numerical - def serialize(self) -> Tuple[dict, list]: - header: Dict[Any, Any] = {} - frames = [] - header["type-serialized"] = pickle.dumps(type(self)) - header["dtype"], dtype_frames = self.dtype.serialize() - header["dtype_frames_count"] = len(dtype_frames) - frames.extend(dtype_frames) - header["data"], data_frames = self.codes.serialize() - header["data_frames_count"] = len(data_frames) - frames.extend(data_frames) - if self.mask is not None: - mask_header, mask_frames = self.mask.serialize() - header["mask"] = mask_header - frames.extend(mask_frames) - header["frame_count"] = len(frames) - return header, frames - - @classmethod - def deserialize(cls, header: dict, frames: list) -> CategoricalColumn: - n_dtype_frames = header["dtype_frames_count"] - dtype = CategoricalDtype.deserialize( - header["dtype"], frames[:n_dtype_frames] - ) - n_data_frames = header["data_frames_count"] - - column_type = pickle.loads(header["data"]["type-serialized"]) - data = column_type.deserialize( - header["data"], - frames[n_dtype_frames : n_dtype_frames + n_data_frames], - ) - mask = None - if "mask" in header: - mask = Buffer.deserialize( - header["mask"], [frames[n_dtype_frames + n_data_frames]] - ) - return cast( - CategoricalColumn, - column.build_column( - data=None, - dtype=dtype, - mask=mask, - children=( - column.build_column(data.base_data, dtype=data.dtype), - ), - ), - ) - def set_base_data(self, value): if value is not None: raise RuntimeError( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 3fb71173178..e1d91e6d0c0 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -5,6 +5,7 @@ import pickle import warnings from functools import cached_property +from itertools import chain from types import SimpleNamespace from typing import ( Any, @@ -1037,10 +1038,29 @@ def unique(self) -> ColumnBase: return drop_duplicates([self], keep="first")[0] def serialize(self) -> Tuple[dict, list]: + # data model: + + # Serialization produces a nested metadata "header" and a flattened + # list of memoryviews/buffers that reference data (frames). Each + # header advertises a frame_count slot which indicates how many + # frames deserialization will consume. The class used to construct + # an object is named under the key "type-serialized" to match with + # Dask's serialization protocol (see + # distributed.protocol.serialize). Since column dtypes may either be + # cudf native or foreign some special-casing is required here for + # serialization. 
+ header: Dict[Any, Any] = {} frames = [] header["type-serialized"] = pickle.dumps(type(self)) - header["dtype"] = self.dtype.str + try: + dtype, dtype_frames = self.dtype.serialize() + header["dtype"] = dtype + frames.extend(dtype_frames) + header["dtype-is-cudf-serialized"] = True + except AttributeError: + header["dtype"] = pickle.dumps(self.dtype) + header["dtype-is-cudf-serialized"] = False if self.data is not None: data_header, data_frames = self.data.serialize() @@ -1051,19 +1071,52 @@ def serialize(self) -> Tuple[dict, list]: mask_header, mask_frames = self.mask.serialize() header["mask"] = mask_header frames.extend(mask_frames) - + if self.children: + child_headers, child_frames = zip( + *(c.serialize() for c in self.children) + ) + header["subheaders"] = list(child_headers) + frames.extend(chain(*child_frames)) + header["size"] = self.size header["frame_count"] = len(frames) return header, frames @classmethod def deserialize(cls, header: dict, frames: list) -> ColumnBase: - dtype = header["dtype"] - data = Buffer.deserialize(header["data"], [frames[0]]) - mask = None + def unpack(header, frames) -> Tuple[Any, list]: + count = header["frame_count"] + klass = pickle.loads(header["type-serialized"]) + obj = klass.deserialize(header, frames[:count]) + return obj, frames[count:] + + assert header["frame_count"] == len(frames), ( + f"Deserialization expected {header['frame_count']} frames, " + f"but received {len(frames)}" + ) + if header["dtype-is-cudf-serialized"]: + dtype, frames = unpack(header["dtype"], frames) + else: + dtype = pickle.loads(header["dtype"]) + if "data" in header: + data, frames = unpack(header["data"], frames) + else: + data = None if "mask" in header: - mask = Buffer.deserialize(header["mask"], [frames[1]]) + mask, frames = unpack(header["mask"], frames) + else: + mask = None + children = [] + if "subheaders" in header: + for h in header["subheaders"]: + child, frames = unpack(h, frames) + children.append(child) + assert len(frames) == 0, "Deserialization did not consume all frames" return build_column( - data=data, dtype=dtype, mask=mask, size=header.get("size", None) + data=data, + dtype=dtype, + mask=mask, + size=header.get("size", None), + children=tuple(children), ) def unary_operator(self, unaryop: str): diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index d8ddb3d8d1a..69009106d15 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -2,7 +2,7 @@ import warnings from decimal import Decimal -from typing import Any, Sequence, Tuple, Union, cast +from typing import Any, Sequence, Union, cast import cupy as cp import numpy as np @@ -321,18 +321,6 @@ def to_arrow(self): buffers=[mask_buf, data_buf], ) - def serialize(self) -> Tuple[dict, list]: - header, frames = super().serialize() - header["dtype"] = self.dtype.serialize() - header["size"] = self.size - return header, frames - - @classmethod - def deserialize(cls, header: dict, frames: list) -> ColumnBase: - dtype = cudf.Decimal64Dtype.deserialize(*header["dtype"]) - header["dtype"] = dtype - return super().deserialize(header, frames) - @property def __cuda_array_interface__(self): raise NotImplementedError( diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 2964378d114..30e418f0825 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,6 +1,5 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. 
-import pickle from functools import cached_property from typing import List, Optional, Sequence, Union @@ -28,7 +27,6 @@ is_list_dtype, is_scalar, ) -from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethods, ParentType from cudf.core.dtypes import ListDtype @@ -166,64 +164,6 @@ def set_base_data(self, value): else: super().set_base_data(value) - def serialize(self): - header = {} - frames = [] - header["type-serialized"] = pickle.dumps(type(self)) - header["null_count"] = self.null_count - header["size"] = self.size - header["dtype"], dtype_frames = self.dtype.serialize() - header["dtype_frames_count"] = len(dtype_frames) - frames.extend(dtype_frames) - - sub_headers = [] - - for item in self.children: - sheader, sframes = item.serialize() - sub_headers.append(sheader) - frames.extend(sframes) - - if self.null_count > 0: - frames.append(self.mask) - - header["subheaders"] = sub_headers - header["frame_count"] = len(frames) - - return header, frames - - @classmethod - def deserialize(cls, header, frames): - - # Get null mask - if header["null_count"] > 0: - mask = Buffer(frames[-1]) - else: - mask = None - - # Deserialize dtype - dtype = pickle.loads(header["dtype"]["type-serialized"]).deserialize( - header["dtype"], frames[: header["dtype_frames_count"]] - ) - - # Deserialize child columns - children = [] - f = header["dtype_frames_count"] - for h in header["subheaders"]: - fcount = h["frame_count"] - child_frames = frames[f : f + fcount] - column_type = pickle.loads(h["type-serialized"]) - children.append(column_type.deserialize(h, child_frames)) - f += fcount - - # Materialize list column - return column.build_column( - data=None, - dtype=dtype, - mask=mask, - children=tuple(children), - size=header["size"], - ) - @property def __cuda_array_interface__(self): raise NotImplementedError( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 0db7e7d9a27..70097f15372 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2,14 +2,12 @@ from __future__ import annotations -import pickle import re import warnings from functools import cached_property from typing import ( TYPE_CHECKING, Any, - Dict, Optional, Sequence, Tuple, @@ -5336,56 +5334,6 @@ def to_pandas( pd_series.index = index return pd_series - def serialize(self) -> Tuple[dict, list]: - header: Dict[Any, Any] = {"null_count": self.null_count} - header["type-serialized"] = pickle.dumps(type(self)) - header["size"] = self.size - - frames = [] - sub_headers = [] - - for item in self.children: - sheader, sframes = item.serialize() - sub_headers.append(sheader) - frames.extend(sframes) - - if self.null_count > 0: - frames.append(self.mask) - - header["subheaders"] = sub_headers - header["frame_count"] = len(frames) - return header, frames - - @classmethod - def deserialize(cls, header: dict, frames: list) -> StringColumn: - size = header["size"] - if not isinstance(size, int): - size = pickle.loads(size) - - # Deserialize the mask, value, and offset frames - buffers = [Buffer(each_frame) for each_frame in frames] - - nbuf = None - if header["null_count"] > 0: - nbuf = buffers[2] - - children = [] - for h, b in zip(header["subheaders"], buffers[:2]): - column_type = pickle.loads(h["type-serialized"]) - children.append(column_type.deserialize(h, [b])) - - col = cast( - StringColumn, - column.build_column( - data=None, - dtype="str", - mask=nbuf, - 
children=tuple(children), - size=size, - ), - ) - return col - def can_cast_safely(self, to_dtype: Dtype) -> bool: to_dtype = cudf.dtype(to_dtype) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 585e8b94e80..9991bad5a9e 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1,8 +1,9 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. import decimal +import operator import pickle -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import numpy as np import pandas as pd @@ -68,6 +69,50 @@ def dtype(arbitrary): ) +def _decode_type( + cls: Type, + header: dict, + frames: list, + is_valid_class: Callable[[Type, Type], bool] = operator.is_, +) -> Tuple[dict, list, Type]: + """Decode metadata-encoded type and check validity + + Parameters + ---------- + cls : type + class performing deserialization + header : dict + metadata for deserialization + frames : list + buffers containing data for deserialization + is_valid_class : Callable + function to call to check if the encoded class type is valid for + serialization by `cls` (default is to check type equality), called + as `is_valid_class(decoded_class, cls)`. + + Returns + ------- + tuple + Tuple of validated headers, frames, and the decoded class + constructor. + + Raises + ------ + AssertionError + if the number of frames doesn't match the count encoded in the + headers, or `is_valid_class` is not true. + """ + assert header["frame_count"] == len(frames), ( + f"Deserialization expected {header['frame_count']} frames, " + f"but received {len(frames)}." + ) + klass = pickle.loads(header["type-serialized"]) + assert is_valid_class( + klass, cls + ), f"Header-encoded {klass=} does not match decoding {cls=}." 
+ return header, frames, klass + + class _BaseDtype(ExtensionDtype, Serializable): # Base type for all cudf-specific dtypes pass @@ -169,11 +214,12 @@ def serialize(self): categories_header, categories_frames = self.categories.serialize() header["categories"] = categories_header frames.extend(categories_frames) - + header["frame_count"] = len(frames) return header, frames @classmethod def deserialize(cls, header, frames): + header, frames, klass = _decode_type(cls, header, frames) ordered = header["ordered"] categories_header = header["categories"] categories_frames = frames @@ -181,7 +227,7 @@ def deserialize(cls, header, frames): categories = categories_type.deserialize( categories_header, categories_frames ) - return cls(categories=categories, ordered=ordered) + return klass(categories=categories, ordered=ordered) class ListDtype(_BaseDtype): @@ -254,19 +300,19 @@ def serialize(self) -> Tuple[dict, list]: header["element-type"], frames = self.element_type.serialize() else: header["element-type"] = self.element_type - + header["frame_count"] = len(frames) return header, frames @classmethod def deserialize(cls, header: dict, frames: list): + header, frames, klass = _decode_type(cls, header, frames) if isinstance(header["element-type"], dict): element_type = pickle.loads( header["element-type"]["type-serialized"] ).deserialize(header["element-type"], frames) else: element_type = header["element-type"] - - return cls(element_type=element_type) + return klass(element_type=element_type) class StructDtype(_BaseDtype): @@ -325,7 +371,7 @@ def serialize(self) -> Tuple[dict, list]: frames: List[Buffer] = [] - fields = {} + fields: Dict[str, Union[bytes, Tuple[Any, Tuple[int, int]]]] = {} for k, dtype in self.fields.items(): if isinstance(dtype, _BaseDtype): @@ -336,13 +382,14 @@ def serialize(self) -> Tuple[dict, list]: ) frames.extend(dtype_frames) else: - fields[k] = dtype + fields[k] = pickle.dumps(dtype) header["fields"] = fields - + header["frame_count"] = len(frames) return header, frames @classmethod def deserialize(cls, header: dict, frames: list): + header, frames, klass = _decode_type(cls, header, frames) fields = {} for k, dtype in header["fields"].items(): if isinstance(dtype, tuple): @@ -354,7 +401,7 @@ def deserialize(cls, header: dict, frames: list): frames[start:stop], ) else: - fields[k] = dtype + fields[k] = pickle.loads(dtype) return cls(fields) @@ -452,13 +499,18 @@ def serialize(self) -> Tuple[dict, list]: "type-serialized": pickle.dumps(type(self)), "precision": self.precision, "scale": self.scale, + "frame_count": 0, }, [], ) @classmethod def deserialize(cls, header: dict, frames: list): - return cls(header["precision"], header["scale"]) + header, frames, klass = _decode_type( + cls, header, frames, is_valid_class=issubclass + ) + klass = pickle.loads(header["type-serialized"]) + return klass(header["precision"], header["scale"]) def __eq__(self, other: Dtype) -> bool: if other is self: @@ -531,6 +583,21 @@ def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype": subtype=pd_dtype.subtype ) # TODO: needs `closed` when we upgrade Pandas + def serialize(self) -> Tuple[dict, list]: + header = { + "type-serialized": pickle.dumps(type(self)), + "fields": pickle.dumps((self.subtype, self.closed)), + "frame_count": 0, + } + return header, [] + + @classmethod + def deserialize(cls, header: dict, frames: list): + header, frames, klass = _decode_type(cls, header, frames) + klass = pickle.loads(header["type-serialized"]) + subtype, closed = pickle.loads(header["fields"]) + 
return klass(subtype, closed=closed) + def is_categorical_dtype(obj): """Check whether an array-like or dtype is of the Categorical dtype.
diff --git a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_0.16.pkl b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_0.16.pkl index 30e31487a825d179c8f2ec617b280b33c817d5c5..97c745c1dd0c214b12defd0baf07b763d7f5296c 100644 GIT binary patch literal 1709 [base85-encoded binary payload omitted] literal 1482 [base85-encoded binary payload omitted; the truncated hunk that follows is from python/cudf/cudf/tests/test_serialize.py] = len(df._data) else: - assert ndevice > 0 + # If there are frames, something should be on the device + assert ndevice > 0 or not frames typ = type(a) b = typ.deserialize(header, frames) assert_eq(a, b) +def test_serialize_dtype_error_checking(): + dtype = cudf.IntervalDtype("float", "right") + header, frames = dtype.serialize() + with pytest.raises(AssertionError): + # Invalid number of frames + type(dtype).deserialize(header, [None] * (header["frame_count"] + 1)) + with pytest.raises(AssertionError): + # mismatching class + cudf.StructDtype.deserialize(header, frames) + + def test_serialize_dataframe(): df = cudf.DataFrame() df["a"] = np.arange(100)
From 1519108adcdb520ff4b41849d7dc6cf61330c2fc Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 10 May 2022 07:45:56 -0400 Subject: [PATCH 02/12] Mention 2 cpp-reviewer requirement in pull request template (#10768) Although 2 cpp reviewers are automatically assigned to a PR, there is no documentation that indicates 2 are required. This PR adds to the list of guidelines in the [Pull Request Template](https://github.com/rapidsai/cudf/blob/branch-22.06/.github/PULL_REQUEST_TEMPLATE.md) that appears when a PR is created through the github web page. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/10768 --- .github/PULL_REQUEST_TEMPLATE.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 2c5ecf68690..8dac8083f31 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -47,6 +47,10 @@ Here are some guidelines to help the review process go smoothly.
If conflicts occur against the target branch they should be resolved by merging the target branch into the branch used for making the pull request. +8. Pull requests that modify cpp source that are marked ready for review + will automatically be assigned two cudf-cpp-codeowners reviewers. + Ensure at least two approvals from cudf-cpp-codeowners before merging. + Many thanks in advance for your cooperation! --> From 19c5bad020559bd34427d1b30cbf0c46c8c87d82 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 10 May 2022 09:39:33 -0500 Subject: [PATCH 03/12] Rework `Scalar` imports (#10791) This PR changes the way things are imported in our scalar code such that it's less prone to circular import issues. For instance before this we can't make `NA` the default value of a kwarg for, say, anything in `column.py`. Authors: - https://github.com/brandon-b-miller Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10791 --- python/cudf/cudf/__init__.py | 7 ++-- python/cudf/cudf/core/missing.py | 9 +++++ python/cudf/cudf/core/scalar.py | 40 +++++++------------ python/cudf/cudf/core/udf/typing.py | 4 +- python/cudf/cudf/tests/test_udf_masked_ops.py | 2 +- 5 files changed, 29 insertions(+), 33 deletions(-) create mode 100644 python/cudf/cudf/core/missing.py diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 273ab147241..27aaa07f940 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -17,10 +17,8 @@ register_index_accessor, register_series_accessor, ) -from cudf.core.scalar import ( - NA, - Scalar, -) +from cudf.core.scalar import Scalar + from cudf.core.index import ( BaseIndex, CategoricalIndex, @@ -45,6 +43,7 @@ ) from cudf.core.dataframe import DataFrame, from_pandas, merge, from_dataframe from cudf.core.series import Series +from cudf.core.missing import NA from cudf.core.multiindex import MultiIndex from cudf.core.cut import cut from cudf.core.algorithms import factorize diff --git a/python/cudf/cudf/core/missing.py b/python/cudf/cudf/core/missing.py new file mode 100644 index 00000000000..02bcb7636f4 --- /dev/null +++ b/python/cudf/cudf/core/missing.py @@ -0,0 +1,9 @@ +# Copyright (c) 2018-2022, NVIDIA CORPORATION. + + +# Pandas NAType enforces a single instance exists at a time +# instantiating this class will yield the existing instance +# of pandas._libs.missing.NAType, id(cudf.NA) == id(pd.NA). 
+from pandas import NA + +__all__ = ["NA"] diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 1c81803ed98..189418f651f 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -4,14 +4,12 @@ import numpy as np import pyarrow as pa -from pandas._libs.missing import NAType as pd_NAType import cudf -from cudf.core.column.column import ColumnBase +from cudf.api.types import is_scalar from cudf.core.dtypes import ListDtype, StructDtype -from cudf.core.index import BaseIndex +from cudf.core.missing import NA from cudf.core.mixins import BinaryOperand -from cudf.core.series import Series from cudf.utils.dtypes import ( get_allowed_combinations_for_operator, to_cudf_compatible_scalar, @@ -273,19 +271,19 @@ def _binop_result_dtype_or_error(self, other, op): return cudf.dtype(out_dtype) def _binaryop(self, other, op: str): - if isinstance(other, (ColumnBase, Series, BaseIndex, np.ndarray)): - # dispatch to column implementation - return NotImplemented - other = to_cudf_compatible_scalar(other) - out_dtype = self._binop_result_dtype_or_error(other, op) - valid = self.is_valid and ( - isinstance(other, np.generic) or other.is_valid - ) - if not valid: - return Scalar(None, dtype=out_dtype) + if is_scalar(other): + other = to_cudf_compatible_scalar(other) + out_dtype = self._binop_result_dtype_or_error(other, op) + valid = self.is_valid and ( + isinstance(other, np.generic) or other.is_valid + ) + if not valid: + return Scalar(None, dtype=out_dtype) + else: + result = self._dispatch_scalar_binop(other, op) + return Scalar(result, dtype=out_dtype) else: - result = self._dispatch_scalar_binop(other, op) - return Scalar(result, dtype=out_dtype) + return NotImplemented def _dispatch_scalar_binop(self, other, op): if isinstance(other, Scalar): @@ -323,13 +321,3 @@ def _dispatch_scalar_unaop(self, op): def astype(self, dtype): return Scalar(self.value, dtype) - - -class _NAType(pd_NAType): - # Pandas NAType enforces a single instance exists at a time - # instantiating this class will yield the existing instance - # of pandas._libs.missing.NAType, id(cudf.NA) == id(pd.NA). - pass - - -NA = _NAType() diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py index ed5fc1d6d23..37e2712f71c 100644 --- a/python/cudf/cudf/core/udf/typing.py +++ b/python/cudf/cudf/core/udf/typing.py @@ -17,8 +17,8 @@ ) from numba.core.typing.typeof import typeof from numba.cuda.cudadecl import registry as cuda_decl_registry -from pandas._libs.missing import NAType as _NAType +from cudf.core.missing import NA from cudf.core.udf import api from cudf.core.udf._ops import ( arith_ops, @@ -214,7 +214,7 @@ def unify(self, context, other): na_type = NAType() -@typeof_impl.register(_NAType) +@typeof_impl.register(type(NA)) def typeof_na(val, c): """ Tie instances of _NAType (cudf.NA) to our NAType. 
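As a quick illustration of what this import rework enables (an editorial sketch, not part of the patch series; it assumes only that cudf and pandas are installed): after this change `NA` lives in `cudf.core.missing`, is re-exported as `cudf.NA`, and is pandas' own `NA` singleton, so identity comparisons hold across both libraries and low-level modules can import it without circular-import problems.

    # Editorial sketch (not part of this patch): the new home of NA.
    import pandas as pd

    import cudf
    from cudf.core.missing import NA

    assert NA is cudf.NA  # re-exported at the package top level
    assert NA is pd.NA    # the same singleton instance as pandas.NA

    # A module-level default argument like this no longer risks a
    # circular import, which is the motivation given in the PR text.
    def fill_value_or_default(fill_value=NA):
        return fill_value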
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 438f46d4266..73bf5160347 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -7,7 +7,7 @@ from numba import cuda import cudf -from cudf.core.scalar import NA +from cudf.core.missing import NA from cudf.core.udf._ops import ( arith_ops, bitwise_ops, From ee8cd59e297ff3d6e462247b35c46e39f761db68 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 10 May 2022 13:13:46 -0400 Subject: [PATCH 04/12] Improve coverage of dask-cudf's groupby aggregation, add tests for `dropna` support (#10449) This PR does the following: - Make sure that all of dask-cudf's `SUPPORTED_AGGS` have an overriding method for upstream Dask's series / dataframe groupby methods - Add tests comparing dask-cudf's `dropna` support to upstream Dask's, as at the moment we are only comparing against cuDF - Fix the resulting failures of these changes (by properly parsing `self.dropna` in dask-cudf's groupby code) As a side note, I think that a larger rethinking of dask-cudf's groupby would pay off well, as currently it seems like we have some "duplicate" tests and aren't really able to discern if `groupby_agg` was called for a supported aggregation Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/10449 --- python/dask_cudf/dask_cudf/groupby.py | 281 ++++++++++++++++-- .../dask_cudf/dask_cudf/tests/test_groupby.py | 101 ++++++- 2 files changed, 352 insertions(+), 30 deletions(-) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 684b1f71099..d137fac5fe3 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -35,6 +35,25 @@ ) +def _check_groupby_supported(func): + """ + Decorator for dask-cudf's groupby methods that returns the dask-cudf + method if the groupby object is supported, otherwise reverting to the + upstream Dask method + """ + + def wrapper(*args, **kwargs): + gb = args[0] + if _groupby_supported(gb): + return func(*args, **kwargs) + # note that we use upstream Dask's default kwargs for this call if + # none are specified; this shouldn't be an issue as those defaults are + # consistent with dask-cudf + return getattr(super(type(gb), gb), func.__name__)(*args[1:], **kwargs) + + return wrapper + + class CudfDataFrameGroupBy(DataFrameGroupBy): @_dask_cudf_nvtx_annotate def __init__(self, *args, **kwargs): @@ -65,6 +84,22 @@ def __getitem__(self, key): return g @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def count(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {c: "count" for c in self.obj.columns if c not in self.by}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, + ) + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -72,13 +107,89 @@ def mean(self, split_every=None, split_out=1): {c: "mean" for c in self.obj.columns if c not in self.by}, split_every=split_every, split_out=split_out, - dropna=self.dropna, sep=self.sep, sort=self.sort, as_index=self.as_index, + **self.dropna, + ) + + @_dask_cudf_nvtx_annotate 
+ @_check_groupby_supported + def std(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {c: "std" for c in self.obj.columns if c not in self.by}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, + ) + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def var(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {c: "var" for c in self.obj.columns if c not in self.by}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, + ) + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def sum(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {c: "sum" for c in self.obj.columns if c not in self.by}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, + ) + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def min(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {c: "min" for c in self.obj.columns if c not in self.by}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, ) @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def max(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {c: "max" for c in self.obj.columns if c not in self.by}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, + ) + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -86,10 +197,40 @@ def collect(self, split_every=None, split_out=1): {c: "collect" for c in self.obj.columns if c not in self.by}, split_every=split_every, split_out=split_out, - dropna=self.dropna, sep=self.sep, sort=self.sort, as_index=self.as_index, + **self.dropna, + ) + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def first(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {c: "first" for c in self.obj.columns if c not in self.by}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, + ) + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def last(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {c: "last" for c in self.obj.columns if c not in self.by}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, ) @_dask_cudf_nvtx_annotate @@ -98,17 +239,7 @@ def aggregate(self, arg, split_every=None, split_out=1): return self.size() arg = _redirect_aggs(arg) - if ( - isinstance(self.obj, DaskDataFrame) - and ( - isinstance(self.by, str) - or ( - isinstance(self.by, list) - and all(isinstance(x, str) for x in self.by) - ) - ) - and _is_supported(arg, SUPPORTED_AGGS) - ): + if _groupby_supported(self) and _aggs_supported(arg, SUPPORTED_AGGS): if isinstance(self._meta.grouping.keys, cudf.MultiIndex): keys = self._meta.grouping.keys.names else: @@ -120,10 +251,10 @@ def aggregate(self, arg, split_every=None, split_out=1): arg, split_every=split_every, split_out=split_out, - dropna=self.dropna, sep=self.sep, sort=self.sort, as_index=self.as_index, + **self.dropna, ) return 
super().aggregate( @@ -139,6 +270,22 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def count(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {self._slice: "count"}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, + )[self._slice] + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -146,13 +293,14 @@ def mean(self, split_every=None, split_out=1): {self._slice: "mean"}, split_every=split_every, split_out=split_out, - dropna=self.dropna, sep=self.sep, sort=self.sort, as_index=self.as_index, + **self.dropna, )[self._slice] @_dask_cudf_nvtx_annotate + @_check_groupby_supported def std(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -160,13 +308,14 @@ def std(self, split_every=None, split_out=1): {self._slice: "std"}, split_every=split_every, split_out=split_out, - dropna=self.dropna, sep=self.sep, sort=self.sort, as_index=self.as_index, + **self.dropna, )[self._slice] @_dask_cudf_nvtx_annotate + @_check_groupby_supported def var(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -174,13 +323,59 @@ def var(self, split_every=None, split_out=1): {self._slice: "var"}, split_every=split_every, split_out=split_out, - dropna=self.dropna, sep=self.sep, sort=self.sort, as_index=self.as_index, + **self.dropna, )[self._slice] @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def sum(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {self._slice: "sum"}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, + )[self._slice] + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def min(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {self._slice: "min"}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, + )[self._slice] + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def max(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {self._slice: "max"}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, + )[self._slice] + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -188,10 +383,40 @@ def collect(self, split_every=None, split_out=1): {self._slice: "collect"}, split_every=split_every, split_out=split_out, - dropna=self.dropna, sep=self.sep, sort=self.sort, as_index=self.as_index, + **self.dropna, + )[self._slice] + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def first(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {self._slice: "first"}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, + )[self._slice] + + @_dask_cudf_nvtx_annotate + @_check_groupby_supported + def last(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.by, + {self._slice: "last"}, + split_every=split_every, + split_out=split_out, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + **self.dropna, )[self._slice] 
@_dask_cudf_nvtx_annotate @@ -203,21 +428,17 @@ def aggregate(self, arg, split_every=None, split_out=1): if not isinstance(arg, dict): arg = {self._slice: arg} - if ( - isinstance(self.obj, DaskDataFrame) - and isinstance(self.by, (str, list)) - and _is_supported(arg, SUPPORTED_AGGS) - ): + if _groupby_supported(self) and _aggs_supported(arg, SUPPORTED_AGGS): return groupby_agg( self.obj, self.by, arg, split_every=split_every, split_out=split_out, - dropna=self.dropna, sep=self.sep, sort=self.sort, as_index=self.as_index, + **self.dropna, )[self._slice] return super().aggregate( @@ -258,7 +479,7 @@ def groupby_agg( """ # Assert that aggregations are supported aggs = _redirect_aggs(aggs_in) - if not _is_supported(aggs, SUPPORTED_AGGS): + if not _aggs_supported(aggs, SUPPORTED_AGGS): raise ValueError( f"Supported aggs include {SUPPORTED_AGGS} for groupby_agg API. " f"Aggregations must be specified with dict or list syntax." @@ -420,7 +641,7 @@ def _redirect_aggs(arg): @_dask_cudf_nvtx_annotate -def _is_supported(arg, supported: set): +def _aggs_supported(arg, supported: set): """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): if isinstance(arg, dict): @@ -439,6 +660,14 @@ def _is_supported(arg, supported: set): return False +def _groupby_supported(gb): + """Check that groupby input is supported by dask-cudf""" + return isinstance(gb.obj, DaskDataFrame) and ( + isinstance(gb.by, str) + or (isinstance(gb.by, list) and all(isinstance(x, str) for x in gb.by)) + ) + + def _make_name(*args, sep="_"): """Combine elements of `args` into a new string""" _args = (arg for arg in args if arg != "") diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index e3545149c24..d2c9ecd0293 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -11,7 +11,7 @@ from cudf.core._compat import PANDAS_GE_120 import dask_cudf -from dask_cudf.groupby import SUPPORTED_AGGS, _is_supported +from dask_cudf.groupby import SUPPORTED_AGGS, _aggs_supported @pytest.mark.parametrize("aggregation", SUPPORTED_AGGS) @@ -235,8 +235,7 @@ def test_groupby_split_out(split_out, column): @pytest.mark.parametrize( "by", ["a", "b", "c", "d", ["a", "b"], ["a", "c"], ["a", "d"]] ) -def test_groupby_dropna(dropna, by): - +def test_groupby_dropna_cudf(dropna, by): # NOTE: This test is borrowed from upstream dask # (dask/dask/dataframe/tests/test_groupby.py) df = cudf.DataFrame( @@ -265,6 +264,100 @@ def test_groupby_dropna(dropna, by): dd.assert_eq(dask_result, cudf_result) +@pytest.mark.parametrize( + "dropna,by", + [ + (False, "a"), + (False, "b"), + (False, "c"), + pytest.param( + False, + "d", + marks=pytest.mark.xfail( + reason="dropna=False is broken in Dask CPU for groupbys on " + "categorical columns" + ), + ), + pytest.param( + False, + ["a", "b"], + marks=pytest.mark.xfail( + reason="https://github.com/dask/dask/issues/8817" + ), + ), + pytest.param( + False, + ["a", "c"], + marks=pytest.mark.xfail( + reason="https://github.com/dask/dask/issues/8817" + ), + ), + pytest.param( + False, + ["a", "d"], + marks=pytest.mark.xfail( + reason="multi-col groupbys on categorical columns are broken " + "in Dask CPU" + ), + ), + (True, "a"), + (True, "b"), + (True, "c"), + (True, "d"), + (True, ["a", "b"]), + (True, ["a", "c"]), + pytest.param( + True, + ["a", "d"], + marks=pytest.mark.xfail( + reason="multi-col groupbys on categorical columns are broken " + "in Dask CPU" 
+ ), + ), + (None, "a"), + (None, "b"), + (None, "c"), + (None, "d"), + (None, ["a", "b"]), + (None, ["a", "c"]), + pytest.param( + None, + ["a", "d"], + marks=pytest.mark.xfail( + reason="multi-col groupbys on categorical columns are broken " + "in Dask CPU" + ), + ), + ], +) +def test_groupby_dropna_dask(dropna, by): + # NOTE: This test is borrowed from upstream dask + # (dask/dask/dataframe/tests/test_groupby.py) + df = pd.DataFrame( + { + "a": [1, 2, 3, 4, None, None, 7, 8], + "b": [1, None, 1, 3, None, 3, 1, 3], + "c": ["a", "b", None, None, "e", "f", "g", "h"], + "e": [4, 5, 6, 3, 2, 1, 0, 0], + } + ) + df["b"] = df["b"].astype("datetime64[ns]") + df["d"] = df["c"].astype("category") + + gdf = cudf.from_pandas(df) + ddf = dd.from_pandas(df, npartitions=3) + gddf = dask_cudf.from_cudf(gdf, npartitions=3) + + if dropna is None: + dask_cudf_result = gddf.groupby(by).e.sum() + dask_result = ddf.groupby(by).e.sum() + else: + dask_cudf_result = gddf.groupby(by, dropna=dropna).e.sum() + dask_result = ddf.groupby(by, dropna=dropna).e.sum() + + dd.assert_eq(dask_cudf_result, dask_result) + + @pytest.mark.parametrize("myindex", [[1, 2] * 4, ["s1", "s2"] * 4]) def test_groupby_string_index_name(myindex): # GH-Issue #3420 @@ -570,7 +663,7 @@ def test_groupby_agg_redirect(aggregations): [["not_supported"], {"a": "not_supported"}, {"a": ["not_supported"]}], ) def test_is_supported(arg): - assert _is_supported(arg, {"supported"}) is False + assert _aggs_supported(arg, {"supported"}) is False def test_groupby_unique_lists(): From 0fcd364e00c2ec84a0128bf9b2688985e99e8e1f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 10 May 2022 13:29:37 -0500 Subject: [PATCH 05/12] Enable writing to `s3` storage in chunked parquet writer (#10769) Resolves: #10522 This PR: - [x] Enables `s3` writing support in `ParquetDatasetWriter` - [x] Add's a work-around to reading an `s3` directory in `cudf.read_parquet`. Issue here: https://issues.apache.org/jira/browse/ARROW-16438 - [x] Introduces all the required `s3` python library combinations that will work together with such that `test_s3.py` can be run locally on dev environments. 
- [x] Improved the default `s3fs` error logs by changing the log level to `DEBUG` in pytests.(`S3FS_LOGGING_LEVEL`) Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Richard (Rick) Zamora (https://github.com/rjzamora) - Ayush Dattagupta (https://github.com/ayushdg) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10769 --- conda/environments/cudf_dev_cuda11.5.yml | 5 + docs/cudf/source/api_docs/io.rst | 4 + python/cudf/cudf/io/__init__.py | 3 +- python/cudf/cudf/io/parquet.py | 168 +++++++++------- python/cudf/cudf/tests/test_s3.py | 185 ++++++++++++------ .../dask_cudf/dask_cudf/io/tests/test_s3.py | 29 +-- 6 files changed, 244 insertions(+), 150 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 1b79bdb763f..d6d05926099 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -67,6 +67,11 @@ dependencies: - pydata-sphinx-theme - librdkafka=1.7.0 - python-confluent-kafka=1.7.0 + - moto>=3.1.6 + - boto3>=1.21.21 + - botocore>=1.24.21 + - aiobotocore>=2.2.0 + - s3fs>=2022.3.0 - pip: - git+https://github.com/python-streamz/streamz.git@master - pyorc diff --git a/docs/cudf/source/api_docs/io.rst b/docs/cudf/source/api_docs/io.rst index 7e4d1b48c93..a52667cd3e4 100644 --- a/docs/cudf/source/api_docs/io.rst +++ b/docs/cudf/source/api_docs/io.rst @@ -36,6 +36,10 @@ Parquet read_parquet DataFrame.to_parquet cudf.io.parquet.read_parquet_metadata + :template: autosummary/class_with_autosummary.rst + + cudf.io.parquet.ParquetDatasetWriter + ORC ~~~ diff --git a/python/cudf/cudf/io/__init__.py b/python/cudf/cudf/io/__init__.py index 15404b26042..4ec84ecbc74 100644 --- a/python/cudf/cudf/io/__init__.py +++ b/python/cudf/cudf/io/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. from cudf.io.avro import read_avro from cudf.io.csv import read_csv, to_csv from cudf.io.dlpack import from_dlpack @@ -9,6 +9,7 @@ from cudf.io.parquet import ( merge_parquet_filemetadata, read_parquet, + ParquetDatasetWriter, read_parquet_metadata, write_to_dataset, ) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index d7e85d72ba0..a9398a3139f 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,6 +1,8 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. import math +import shutil +import tempfile import warnings from collections import defaultdict from contextlib import ExitStack @@ -232,12 +234,15 @@ def _process_dataset( filters = pq._filters_to_expression(filters) # Initialize ds.FilesystemDataset + # TODO: Remove the if len(paths) workaround after following bug is fixed: + # https://issues.apache.org/jira/browse/ARROW-16438 dataset = ds.dataset( - paths, + source=paths[0] if len(paths) == 1 else paths, filesystem=fs, format="parquet", partitioning="hive", ) + file_list = dataset.files if len(file_list) == 0: raise FileNotFoundError(f"{paths} could not be resolved to any files") @@ -837,6 +842,67 @@ def _parse_bytes(s): class ParquetDatasetWriter: + """ + Write a parquet file or dataset incrementally + + Parameters + ---------- + path : str + A local directory path or S3 URL. Will be used as root directory + path while writing a partitioned dataset. 
+ partition_cols : list + Column names by which to partition the dataset + Columns are partitioned in the order they are given + index : bool, default None + If ``True``, include the dataframe's index(es) in the file output. + If ``False``, they will not be written to the file. If ``None``, + index(es) other than RangeIndex will be saved as columns. + compression : {'snappy', None}, default 'snappy' + Name of the compression to use. Use ``None`` for no compression. + statistics : {'ROWGROUP', 'PAGE', 'NONE'}, default 'ROWGROUP' + Level at which column statistics should be included in file. + max_file_size : int or str, default None + A file size that cannot be exceeded by the writer. + It is in bytes, if the input is int. + Size can also be a str in form or "10 MB", "1 GB", etc. + If this parameter is used, it is mandatory to pass + `file_name_prefix`. + file_name_prefix : str + This is a prefix to file names generated only when + `max_file_size` is specified. + + + Examples + -------- + Using a context + + >>> df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) + >>> df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) + >>> with ParquetDatasetWriter("./dataset", partition_cols=["a"]) as cw: + ... cw.write_table(df1) + ... cw.write_table(df2) + + By manually calling ``close()`` + + >>> cw = ParquetDatasetWriter("./dataset", partition_cols=["a"]) + >>> cw.write_table(df1) + >>> cw.write_table(df2) + >>> cw.close() + + Both the methods will generate the same directory structure + + .. code-block:: none + + dataset/ + a=1 + .parquet + a=2 + .parquet + a=3 + .parquet + + """ + @_cudf_nvtx_annotate def __init__( self, @@ -847,68 +913,15 @@ def __init__( statistics="ROWGROUP", max_file_size=None, file_name_prefix=None, + **kwargs, ) -> None: - """ - Write a parquet file or dataset incrementally - - Parameters - ---------- - path : str - File path or Root Directory path. Will be used as Root Directory - path while writing a partitioned dataset. - partition_cols : list - Column names by which to partition the dataset - Columns are partitioned in the order they are given - index : bool, default None - If ``True``, include the dataframe’s index(es) in the file output. - If ``False``, they will not be written to the file. If ``None``, - index(es) other than RangeIndex will be saved as columns. - compression : {'snappy', None}, default 'snappy' - Name of the compression to use. Use ``None`` for no compression. - statistics : {'ROWGROUP', 'PAGE', 'NONE'}, default 'ROWGROUP' - Level at which column statistics should be included in file. - max_file_size : int or str, default None - A file size that cannot be exceeded by the writer. - It is in bytes, if the input is int. - Size can also be a str in form or "10 MB", "1 GB", etc. - If this parameter is used, it is mandatory to pass - `file_name_prefix`. - file_name_prefix : str - This is a prefix to file names generated only when - `max_file_size` is specified. - - - Examples - ________ - Using a context - - >>> df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) - >>> df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) - >>> with ParquetDatasetWriter("./dataset", partition_cols=["a"]) as cw: - ... cw.write_table(df1) - ... cw.write_table(df2) - - By manually calling ``close()`` - - >>> cw = ParquetDatasetWriter("./dataset", partition_cols=["a"]) - >>> cw.write_table(df1) - >>> cw.write_table(df2) - >>> cw.close() - - Both the methods will generate the same directory structure - - .. 
code-block:: bash - - dataset/ - a=1 - .parquet - a=2 - .parquet - a=3 - .parquet + if isinstance(path, str) and path.startswith("s3://"): + self.fs_meta = {"is_s3": True, "actual_path": path} + self.path = tempfile.TemporaryDirectory().name + else: + self.fs_meta = {} + self.path = path - """ - self.path = path self.common_args = { "index": index, "compression": compression, @@ -923,6 +936,7 @@ def __init__( # Map of partition_col values to their ParquetWriter's index # in self._chunked_writers for reverse lookup self.path_cw_map: Dict[str, int] = {} + self.kwargs = kwargs self.filename = file_name_prefix self.max_file_size = max_file_size if max_file_size is not None: @@ -1051,18 +1065,19 @@ def write_table(self, df): ] cw.write_table(grouped_df, this_cw_part_info) - # Create new cw for unhandled paths encountered in this write_table - new_paths, part_info, meta_paths = zip(*new_cw_paths) - self._chunked_writers.append( - ( - ParquetWriter(new_paths, **self.common_args), - new_paths, - meta_paths, + if new_cw_paths: + # Create new cw for unhandled paths encountered in this write_table + new_paths, part_info, meta_paths = zip(*new_cw_paths) + self._chunked_writers.append( + ( + ParquetWriter(new_paths, **self.common_args), + new_paths, + meta_paths, + ) ) - ) - new_cw_idx = len(self._chunked_writers) - 1 - self.path_cw_map.update({k: new_cw_idx for k in new_paths}) - self._chunked_writers[-1][0].write_table(grouped_df, part_info) + new_cw_idx = len(self._chunked_writers) - 1 + self.path_cw_map.update({k: new_cw_idx for k in new_paths}) + self._chunked_writers[-1][0].write_table(grouped_df, part_info) @_cudf_nvtx_annotate def close(self, return_metadata=False): @@ -1076,6 +1091,15 @@ def close(self, return_metadata=False): for cw, _, meta_path in self._chunked_writers ] + if self.fs_meta.get("is_s3", False): + local_path = self.path + s3_path = self.fs_meta["actual_path"] + s3_file, _ = ioutils._get_filesystem_and_paths( + s3_path, **self.kwargs + ) + s3_file.put(local_path, s3_path, recursive=True) + shutil.rmtree(self.path) + if return_metadata: return ( merge_parquet_filemetadata(metadata) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index e8d93caaf55..0966bee93fd 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -2,6 +2,7 @@ import os import shlex +import socket import subprocess import time from contextlib import contextmanager @@ -18,12 +19,22 @@ import cudf from cudf.testing._utils import assert_eq -moto = pytest.importorskip("moto", minversion="1.3.14") +moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") requests = pytest.importorskip("requests") s3fs = pytest.importorskip("s3fs") +@pytest.fixture(scope="session") +def endpoint_port(): + # Return a free port per worker session. 
+ sock = socket.socket() + sock.bind(("127.0.0.1", 0)) + port = sock.getsockname()[1] + sock.close() + return port + + @contextmanager def ensure_safe_environment_variables(): """ @@ -40,7 +51,7 @@ def ensure_safe_environment_variables(): @pytest.fixture(scope="session") -def s3_base(worker_id): +def s3_base(endpoint_port): """ Fixture to set up moto server in separate process """ @@ -49,15 +60,11 @@ def s3_base(worker_id): # system aws credentials, https://github.com/spulec/moto/issues/1793 os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") + os.environ.setdefault("S3FS_LOGGING_LEVEL", "DEBUG") # Launching moto in server mode, i.e., as a separate process # with an S3 endpoint on localhost - endpoint_port = ( - 5000 - if worker_id == "master" - else 5550 + int(worker_id.lstrip("gw")) - ) endpoint_uri = f"http://127.0.0.1:{endpoint_port}/" proc = subprocess.Popen( @@ -82,13 +89,10 @@ def s3_base(worker_id): @pytest.fixture() -def s3so(worker_id): +def s3so(endpoint_port): """ Returns s3 storage options to pass to fsspec """ - endpoint_port = ( - 5000 if worker_id == "master" else 5550 + int(worker_id.lstrip("gw")) - ) endpoint_uri = f"http://127.0.0.1:{endpoint_port}/" return {"client_kwargs": {"endpoint_url": endpoint_uri}} @@ -141,13 +145,13 @@ def pdf_ext(scope="module"): def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): # Write to buffer fname = "test_csv_reader.csv" - bname = "csv" + bucket = "csv" buffer = pdf.to_csv(index=False) # Use fsspec file object - with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): got = cudf.read_csv( - f"s3://{bname}/{fname}", + f"s3://{bucket}/{fname}", storage_options=s3so, bytes_per_thread=bytes_per_thread, use_python_file_object=False, @@ -155,9 +159,9 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): assert_eq(pdf, got) # Use Arrow PythonFile object - with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): got = cudf.read_csv( - f"s3://{bname}/{fname}", + f"s3://{bucket}/{fname}", storage_options=s3so, bytes_per_thread=bytes_per_thread, use_python_file_object=True, @@ -168,13 +172,13 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): def test_read_csv_arrow_nativefile(s3_base, s3so, pdf): # Write to buffer fname = "test_csv_reader_arrow_nativefile.csv" - bname = "csv" + bucket = "csv" buffer = pdf.to_csv(index=False) - with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): fs = pa_fs.S3FileSystem( endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bname}/{fname}") as fil: + with fs.open_input_file(f"{bucket}/{fname}") as fil: got = cudf.read_csv(fil) assert_eq(pdf, got) @@ -187,13 +191,13 @@ def test_read_csv_byte_range( ): # Write to buffer fname = "test_csv_reader_byte_range.csv" - bname = "csv" + bucket = "csv" buffer = pdf.to_csv(index=False) # Use fsspec file object - with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): got = cudf.read_csv( - f"s3://{bname}/{fname}", + f"s3://{bucket}/{fname}", storage_options=s3so, byte_range=(74, 73), bytes_per_thread=bytes_per_thread, @@ -209,19 +213,19 @@ def test_read_csv_byte_range( def test_write_csv(s3_base, s3so, 
pdf, chunksize): # Write to buffer fname = "test_csv_writer.csv" - bname = "csv" + bucket = "csv" gdf = cudf.from_pandas(pdf) - with s3_context(s3_base=s3_base, bucket=bname) as s3fs: + with s3_context(s3_base=s3_base, bucket=bucket) as s3fs: gdf.to_csv( - f"s3://{bname}/{fname}", + f"s3://{bucket}/{fname}", index=False, chunksize=chunksize, storage_options=s3so, ) - assert s3fs.exists(f"s3://{bname}/{fname}") + assert s3fs.exists(f"s3://{bucket}/{fname}") # TODO: Update to use `storage_options` from pandas v1.2.0 - got = pd.read_csv(s3fs.open(f"s3://{bname}/{fname}")) + got = pd.read_csv(s3fs.open(f"s3://{bucket}/{fname}")) assert_eq(pdf, got) @@ -240,15 +244,15 @@ def test_read_parquet( use_python_file_object, ): fname = "test_parquet_reader.parquet" - bname = "parquet" + bucket = "parquet" buffer = BytesIO() pdf.to_parquet(path=buffer) # Check direct path handling buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): got1 = cudf.read_parquet( - f"s3://{bname}/{fname}", + f"s3://{bucket}/{fname}", open_file_options=( {"precache_options": {"method": precache}} if use_python_file_object @@ -264,11 +268,11 @@ def test_read_parquet( # Check fsspec file-object handling buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): - fs = get_fs_token_paths(f"s3://{bname}/{fname}", storage_options=s3so)[ - 0 - ] - with fs.open(f"s3://{bname}/{fname}", mode="rb") as f: + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + fs = get_fs_token_paths( + f"s3://{bucket}/{fname}", storage_options=s3so + )[0] + with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f: got2 = cudf.read_parquet( f, bytes_per_thread=bytes_per_thread, @@ -290,7 +294,7 @@ def test_read_parquet_ext( index, ): fname = "test_parquet_reader_ext.parquet" - bname = "parquet" + bucket = "parquet" buffer = BytesIO() if index: @@ -300,9 +304,9 @@ def test_read_parquet_ext( # Check direct path handling buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): got1 = cudf.read_parquet( - f"s3://{bname}/{fname}", + f"s3://{bucket}/{fname}", storage_options=s3so, bytes_per_thread=bytes_per_thread, footer_sample_size=3200, @@ -323,15 +327,15 @@ def test_read_parquet_ext( def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): # Write to buffer fname = "test_parquet_reader_arrow_nativefile.parquet" - bname = "parquet" + bucket = "parquet" buffer = BytesIO() pdf.to_parquet(path=buffer) buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): fs = pa_fs.S3FileSystem( endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bname}/{fname}") as fil: + with fs.open_input_file(f"{bucket}/{fname}") as fil: got = cudf.read_parquet(fil, columns=columns) expect = pdf[columns] if columns else pdf @@ -341,14 +345,14 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): @pytest.mark.parametrize("precache", [None, "parquet"]) def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache): fname = "test_parquet_reader_filters.parquet" - bname = "parquet" + bucket = "parquet" buffer = BytesIO() pdf_ext.to_parquet(path=buffer) buffer.seek(0) filters = [("String", "==", "Omega")] - with s3_context(s3_base=s3_base, bucket=bname, 
files={fname: buffer}): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): got = cudf.read_parquet( - f"s3://{bname}/{fname}", + f"s3://{bucket}/{fname}", storage_options=s3so, filters=filters, open_file_options={"precache_options": {"method": precache}}, @@ -360,25 +364,38 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache): @pytest.mark.parametrize("partition_cols", [None, ["String"]]) def test_write_parquet(s3_base, s3so, pdf, partition_cols): - fname = "test_parquet_writer.parquet" - bname = "parquet" + fname_cudf = "test_parquet_writer_cudf" + fname_pandas = "test_parquet_writer_pandas" + bucket = "parquet" gdf = cudf.from_pandas(pdf) - with s3_context(s3_base=s3_base, bucket=bname) as s3fs: + + with s3_context(s3_base=s3_base, bucket=bucket) as s3fs: gdf.to_parquet( - f"s3://{bname}/{fname}", + f"s3://{bucket}/{fname_cudf}", + partition_cols=partition_cols, + storage_options=s3so, + ) + assert s3fs.exists(f"s3://{bucket}/{fname_cudf}") + pdf.to_parquet( + f"s3://{bucket}/{fname_pandas}", partition_cols=partition_cols, storage_options=s3so, ) - assert s3fs.exists(f"s3://{bname}/{fname}") + assert s3fs.exists(f"s3://{bucket}/{fname_pandas}") - got = pd.read_parquet(s3fs.open(f"s3://{bname}/{fname}")) + got = pd.read_parquet( + f"s3://{bucket}/{fname_pandas}", storage_options=s3so + ) + expect = cudf.read_parquet( + f"s3://{bucket}/{fname_cudf}", storage_options=s3so + ) - assert_eq(pdf, got) + assert_eq(expect, got) def test_read_json(s3_base, s3so): fname = "test_json_reader.json" - bname = "json" + bucket = "json" # TODO: After following bug is fixed switch # back to using bytes: # https://github.com/pandas-dev/pandas/issues/46935 @@ -396,9 +413,9 @@ def test_read_json(s3_base, s3so): '{"amount": 400, "name": "Dennis"}\n' ) - with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): got = cudf.read_json( - f"s3://{bname}/{fname}", + f"s3://{bucket}/{fname}", engine="cudf", orient="records", lines=True, @@ -414,15 +431,15 @@ def test_read_json(s3_base, s3so): def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") fname = "test_orc_reader.orc" - bname = "orc" + bucket = "orc" expect = pa.orc.ORCFile(source_file).read().to_pandas() with open(source_file, "rb") as f: buffer = f.read() - with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): got = cudf.read_orc( - f"s3://{bname}/{fname}", + f"s3://{bucket}/{fname}", columns=columns, storage_options=s3so, use_python_file_object=use_python_file_object, @@ -437,17 +454,17 @@ def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") fname = "test_orc_reader.orc" - bname = "orc" + bucket = "orc" expect = pa.orc.ORCFile(source_file).read().to_pandas() with open(source_file, "rb") as f: buffer = f.read() - with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): fs = pa_fs.S3FileSystem( endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bname}/{fname}") as fil: + with fs.open_input_file(f"{bucket}/{fname}") as fil: got = cudf.read_orc(fil, columns=columns) if columns: @@ 
-457,13 +474,51 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): def test_write_orc(s3_base, s3so, pdf): fname = "test_orc_writer.orc" - bname = "orc" + bucket = "orc" gdf = cudf.from_pandas(pdf) - with s3_context(s3_base=s3_base, bucket=bname) as s3fs: - gdf.to_orc(f"s3://{bname}/{fname}", storage_options=s3so) - assert s3fs.exists(f"s3://{bname}/{fname}") + with s3_context(s3_base=s3_base, bucket=bucket) as s3fs: + gdf.to_orc(f"s3://{bucket}/{fname}", storage_options=s3so) + assert s3fs.exists(f"s3://{bucket}/{fname}") - with s3fs.open(f"s3://{bname}/{fname}") as f: + with s3fs.open(f"s3://{bucket}/{fname}") as f: got = pa.orc.ORCFile(f).read().to_pandas() assert_eq(pdf, got) + + +def test_write_chunked_parquet(s3_base, s3so): + df1 = cudf.DataFrame({"b": [10, 11, 12], "a": [1, 2, 3]}) + df2 = cudf.DataFrame({"b": [20, 30, 50], "a": [3, 2, 1]}) + dirname = "chunked_writer_directory" + bucket = "parquet" + from cudf.io.parquet import ParquetDatasetWriter + + with s3_context( + s3_base=s3_base, bucket=bucket, files={dirname: BytesIO()} + ) as s3fs: + with ParquetDatasetWriter( + f"s3://{bucket}/{dirname}", + partition_cols=["a"], + storage_options=s3so, + ) as cw: + cw.write_table(df1) + cw.write_table(df2) + + # TODO: Replace following workaround with: + # expect = cudf.read_parquet(f"s3://{bucket}/{dirname}/", + # storage_options=s3so) + # after the following bug is fixed: + # https://issues.apache.org/jira/browse/ARROW-16438 + + dfs = [] + for folder in {"a=1", "a=2", "a=3"}: + assert s3fs.exists(f"s3://{bucket}/{dirname}/{folder}") + for file in s3fs.ls(f"s3://{bucket}/{dirname}/{folder}"): + df = cudf.read_parquet("s3://" + file, storage_options=s3so) + dfs.append(df) + + actual = cudf.concat(dfs).astype("int64") + assert_eq( + actual.sort_values(["b"]).reset_index(drop=True), + cudf.concat([df1, df2]).sort_values(["b"]).reset_index(drop=True), + ) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index 83ff1273b36..9283380296c 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -1,5 +1,8 @@ +# Copyright (c) 2020-2022, NVIDIA CORPORATION. + import os import shlex +import socket import subprocess import time from contextlib import contextmanager @@ -11,12 +14,22 @@ import dask_cudf -moto = pytest.importorskip("moto", minversion="1.3.14") +moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") requests = pytest.importorskip("requests") s3fs = pytest.importorskip("s3fs") +@pytest.fixture(scope="session") +def endpoint_port(): + # Return a free port per worker session. 
+ sock = socket.socket() + sock.bind(("127.0.0.1", 0)) + port = sock.getsockname()[1] + sock.close() + return port + + @contextmanager def ensure_safe_environment_variables(): """ @@ -33,7 +46,7 @@ def ensure_safe_environment_variables(): @pytest.fixture(scope="session") -def s3_base(worker_id): +def s3_base(endpoint_port): """ Fixture to set up moto server in separate process """ @@ -42,15 +55,10 @@ def s3_base(worker_id): # system aws credentials, https://github.com/spulec/moto/issues/1793 os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") + os.environ.setdefault("S3FS_LOGGING_LEVEL", "DEBUG") # Launching moto in server mode, i.e., as a separate process # with an S3 endpoint on localhost - - endpoint_port = ( - 5000 - if worker_id == "master" - else 5550 + int(worker_id.lstrip("gw")) - ) endpoint_uri = f"http://127.0.0.1:{endpoint_port}/" proc = subprocess.Popen( @@ -75,13 +83,10 @@ def s3_base(worker_id): @pytest.fixture() -def s3so(worker_id): +def s3so(endpoint_port): """ Returns s3 storage options to pass to fsspec """ - endpoint_port = ( - 5000 if worker_id == "master" else 5550 + int(worker_id.lstrip("gw")) - ) endpoint_uri = f"http://127.0.0.1:{endpoint_port}/" return {"client_kwargs": {"endpoint_url": endpoint_uri}} From 4539e5e60d2bc2c81338a5d646f5a9c3ac5ef7cf Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 10 May 2022 11:42:34 -0700 Subject: [PATCH 06/12] Refactor `cudf::contains`, renaming and switching parameters role (#10802) This PR does the following: * Renaming parameters of `cudf::contains`, changing from `t`/`values`, `col`/`value`, etc... into `haystack`/`needle` in a consistent way. * Switching the role of `haystack` and `needles` parameters of the overload `cudf::contains(column_view, column_view)`, which incorrectly searches for `haystack` inside `needles`. * Update the internal usage of that overloads in cudf. * Update unit tests. * Rewriting all `cudf::contains` doxygen. * And some minor code cleanup/refactor. Since the role of parameters is switched, this causes breaking changes. Closes https://github.com/rapidsai/cudf/issues/10725. In addition, this is also a foundation for more changes in `search.cu` to support nested types in https://github.com/rapidsai/cudf/pull/10656. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Jason Lowe (https://github.com/jlowe) - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10802 --- cpp/include/cudf/detail/search.hpp | 26 +- cpp/include/cudf/search.hpp | 141 +++++------ cpp/src/dictionary/remove_keys.cu | 4 +- cpp/src/dictionary/set_keys.cu | 2 +- cpp/src/search/search.cu | 228 ++++++++---------- cpp/tests/search/search_dictionary_test.cpp | 6 +- cpp/tests/search/search_test.cpp | 24 +- .../main/java/ai/rapids/cudf/ColumnView.java | 19 +- java/src/main/native/src/ColumnViewJni.cpp | 17 +- python/cudf/cudf/_lib/cpp/search.pxd | 10 +- python/cudf/cudf/core/column/string.py | 2 +- 11 files changed, 227 insertions(+), 252 deletions(-) diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp index c986418c790..44067ff87c0 100644 --- a/cpp/include/cudf/detail/search.hpp +++ b/cpp/include/cudf/detail/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,11 +33,11 @@ namespace detail { * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr lower_bound( - table_view const& t, - table_view const& values, + table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -46,33 +46,29 @@ std::unique_ptr lower_bound( * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr upper_bound( - table_view const& t, - table_view const& values, + table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::contains(column_view const&, scalar const&, - * rmm::mr::device_memory_resource*) + * @copydoc cudf::contains(column_view const&, scalar const&, rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -bool contains(column_view const& col, - scalar const& value, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); +bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream); /** - * @copydoc cudf::contains(column_view const&, column_view const&, - * rmm::mr::device_memory_resource*) + * @copydoc cudf::contains(column_view const&, column_view const&, rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr contains( column_view const& haystack, column_view const& needles, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/search.hpp b/cpp/include/cudf/search.hpp index 56a31891e27..3b68923ee93 100644 --- a/cpp/include/cudf/search.hpp +++ b/cpp/include/cudf/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,134 +32,123 @@ namespace cudf { */ /** - * @brief Find smallest indices in a sorted table where values should be - * inserted to maintain order + * @brief Find smallest indices in a sorted table where values should be inserted to maintain order. * - * For each row v in @p values, find the first index in @p t where - * inserting the row will maintain the sort order of @p t + * For each row in `needles`, find the first index in `haystack` where inserting the row still + * maintains its sort order. 
* * @code{.pseudo} * Example: * * Single column: - * idx 0 1 2 3 4 - * column = { 10, 20, 20, 30, 50 } - * values = { 20 } - * result = { 1 } + * idx 0 1 2 3 4 + * haystack = { 10, 20, 20, 30, 50 } + * needles = { 20 } + * result = { 1 } * * Multi Column: - * idx 0 1 2 3 4 - * t = {{ 10, 20, 20, 20, 20 }, - * { 5.0, .5, .5, .7, .7 }, - * { 90, 77, 78, 61, 61 }} - * values = {{ 20 }, - * { .7 }, - * { 61 }} - * result = { 3 } + * idx 0 1 2 3 4 + * haystack = {{ 10, 20, 20, 20, 20 }, + * { 5.0, .5, .5, .7, .7 }, + * { 90, 77, 78, 61, 61 }} + * needles = {{ 20 }, + * { .7 }, + * { 61 }} + * result = { 3 } * @endcode * - * @param t Table to search - * @param values Find insert locations for these values - * @param column_order Vector of column sort order - * @param null_precedence Vector of null_precedence enums values - * @param mr Device memory resource used to allocate the returned column's device - * memory + * @param haystack The table containing search space. + * @param needles Values for which to find the insert locations in the search space. + * @param column_order Vector of column sort order. + * @param null_precedence Vector of null_precedence enums needles. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return A non-nullable column of cudf::size_type elements containing the insertion points. */ std::unique_ptr lower_bound( - table_view const& t, - table_view const& values, + table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Find largest indices in a sorted table where values should be - * inserted to maintain order + * @brief Find largest indices in a sorted table where values should be inserted to maintain order. * - * For each row v in @p values, find the last index in @p t where - * inserting the row will maintain the sort order of @p t + * For each row in `needles`, find the last index in `haystack` where inserting the row still + * maintains its sort order. * * @code{.pseudo} * Example: * * Single Column: - * idx 0 1 2 3 4 - * column = { 10, 20, 20, 30, 50 } - * values = { 20 } - * result = { 3 } + * idx 0 1 2 3 4 + * haystack = { 10, 20, 20, 30, 50 } + * needles = { 20 } + * result = { 3 } * * Multi Column: - * idx 0 1 2 3 4 - * t = {{ 10, 20, 20, 20, 20 }, - * { 5.0, .5, .5, .7, .7 }, - * { 90, 77, 78, 61, 61 }} - * values = {{ 20 }, - * { .7 }, - * { 61 }} - * result = { 5 } + * idx 0 1 2 3 4 + * haystack = {{ 10, 20, 20, 20, 20 }, + * { 5.0, .5, .5, .7, .7 }, + * { 90, 77, 78, 61, 61 }} + * needles = {{ 20 }, + * { .7 }, + * { 61 }} + * result = { 5 } * @endcode * - * @param search_table Table to search - * @param values Find insert locations for these values - * @param column_order Vector of column sort order - * @param null_precedence Vector of null_precedence enums values - * @param mr Device memory resource used to allocate the returned column's device - * memory + * @param haystack The table containing search space. + * @param needles Values for which to find the insert locations in the search space. + * @param column_order Vector of column sort order. + * @param null_precedence Vector of null_precedence enums needles. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return A non-nullable column of cudf::size_type elements containing the insertion points. 
*/ std::unique_ptr upper_bound( - table_view const& search_table, - table_view const& values, + table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Find if the `value` is present in the `col` + * @brief Check if the given `needle` value exists in the `haystack` column. * - * @throws cudf::logic_error - * If `col.type() != values.type()` + * @throws cudf::logic_error If `haystack.type() != needle.type()`. * * @code{.pseudo} * Single Column: - * idx 0 1 2 3 4 - * col = { 10, 20, 20, 30, 50 } - * Scalar: - * value = { 20 } - * result = true + * idx 0 1 2 3 4 + * haystack = { 10, 20, 20, 30, 50 } + * needle = { 20 } + * result = true * @endcode * - * @param col A column object - * @param value A scalar value to search for in `col` - * - * @return bool If `value` is found in `column` true, else false. + * @param haystack The column containing search space. + * @param needle A scalar value to check for existence in the search space. + * @return true if the given `needle` value exists in the `haystack` column. */ -bool contains(column_view const& col, scalar const& value); +bool contains(column_view const& haystack, scalar const& needle); /** - * @brief Returns a new column of type bool identifying for each element of @p haystack column, - * if that element is contained in @p needles column. + * @brief Check if the given `needles` values exists in the `haystack` column. * - * The new column will have the same dimension and null status as the @p haystack column. That is, - * any element that is invalid in the @p haystack column will be invalid in the returned column. + * The new column will have type BOOL and have the same size and null mask as the input `needles` + * column. That is, any null row in the `needles` column will result in a nul row in the output + * column. * - * @throws cudf::logic_error - * If `haystack.type() != needles.type()` + * @throws cudf::logic_error If `haystack.type() != needles.type()` * * @code{.pseudo} * haystack = { 10, 20, 30, 40, 50 } * needles = { 20, 40, 60, 80 } - * - * result = { false, true, false, true, false } + * result = { true, true, false, false } * @endcode * - * @param haystack A column object - * @param needles A column of values to search for in `col` - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @return A column of bool elements containing true if the corresponding entry in haystack - * appears in needles and false if it does not. + * @param haystack The column containing search space. + * @param needles A column of values to check for existence in the search space. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return A BOOL column indicating if each element in `needles` exists in the search space. 
*/ std::unique_ptr contains( column_view const& haystack, diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index c4b3bbc00e4..a98e69149af 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -158,7 +158,7 @@ std::unique_ptr remove_keys( CUDF_EXPECTS(keys_view.type() == keys_to_remove.type(), "keys types must match"); // locate keys to remove by searching the keys column - auto const matches = cudf::detail::contains(keys_view, keys_to_remove, stream, mr); + auto const matches = cudf::detail::contains(keys_to_remove, keys_view, stream, mr); auto d_matches = matches->view().data(); // call common utility method to keep the keys not matched to keys_to_remove auto key_matcher = [d_matches] __device__(size_type idx) { return !d_matches[idx]; }; @@ -181,7 +181,7 @@ std::unique_ptr remove_unused_keys( thrust::sequence(rmm::exec_policy(stream), keys_positions.begin(), keys_positions.end()); // wrap the indices for comparison in contains() column_view keys_positions_view(data_type{type_id::UINT32}, keys_size, keys_positions.data()); - return cudf::detail::contains(keys_positions_view, indices_view, stream, mr); + return cudf::detail::contains(indices_view, keys_positions_view, stream, mr); }(); auto d_matches = matches->view().data(); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index dfc6cbb78cc..25c46837e9f 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -138,7 +138,7 @@ std::unique_ptr set_keys( std::unique_ptr keys_column(std::move(sorted_keys.front())); // compute the new nulls - auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr); + auto matches = cudf::detail::contains(keys_column->view(), keys, stream, mr); auto d_matches = matches->view().data(); auto indices_itr = cudf::detail::indexalator_factory::make_input_iterator(dictionary_column.indices()); diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index 29eddf703df..491ad49e020 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -43,40 +43,9 @@ namespace cudf { namespace { -template -void launch_search(DataIterator it_data, - ValuesIterator it_vals, - size_type data_size, - size_type values_size, - OutputIterator it_output, - Comparator comp, - bool find_first, - rmm::cuda_stream_view stream) -{ - if (find_first) { - thrust::lower_bound(rmm::exec_policy(stream), - it_data, - it_data + data_size, - it_vals, - it_vals + values_size, - it_output, - comp); - } else { - thrust::upper_bound(rmm::exec_policy(stream), - it_data, - it_data + data_size, - it_vals, - it_vals + values_size, - it_output, - comp); - } -} -std::unique_ptr search_ordered(table_view const& t, - table_view const& values, +std::unique_ptr search_ordered(table_view const& haystack, + table_view const& needles, bool find_first, std::vector const& column_order, std::vector const& null_precedence, @@ -84,30 +53,30 @@ std::unique_ptr search_ordered(table_view const& t, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS( - column_order.empty() or static_cast(t.num_columns()) == column_order.size(), + column_order.empty() or static_cast(haystack.num_columns()) == column_order.size(), "Mismatch between number of columns and column order."); - CUDF_EXPECTS( - null_precedence.empty() or static_cast(t.num_columns()) == null_precedence.size(), - "Mismatch between number of columns and null precedence."); + CUDF_EXPECTS(null_precedence.empty() or + static_cast(haystack.num_columns()) == 
null_precedence.size(), + "Mismatch between number of columns and null precedence."); // Allocate result column auto result = make_numeric_column( - data_type{type_to_id()}, values.num_rows(), mask_state::UNALLOCATED, stream, mr); - auto const result_out = result->mutable_view().data(); + data_type{type_to_id()}, needles.num_rows(), mask_state::UNALLOCATED, stream, mr); + auto const out_it = result->mutable_view().data(); // Handle empty inputs - if (t.num_rows() == 0) { + if (haystack.num_rows() == 0) { CUDF_CUDA_TRY( - cudaMemsetAsync(result_out, 0, values.num_rows() * sizeof(size_type), stream.value())); + cudaMemsetAsync(out_it, 0, needles.num_rows() * sizeof(size_type), stream.value())); return result; } // This utility will ensure all corresponding dictionary columns have matching keys. // It will return any new dictionary columns created as well as updated table_views. - auto const matched = dictionary::detail::match_dictionaries({t, values}, stream); + auto const matched = dictionary::detail::match_dictionaries({haystack, needles}, stream); // Prepare to flatten the structs column - auto const has_null_elements = has_nested_nulls(t) or has_nested_nulls(values); + auto const has_null_elements = has_nested_nulls(haystack) or has_nested_nulls(needles); auto const flatten_nullability = has_null_elements ? structs::detail::column_nullability::FORCE : structs::detail::column_nullability::MATCH_INCOMING; @@ -135,37 +104,50 @@ std::unique_ptr search_ordered(table_view const& t, rhs, column_order_dv.data(), null_precedence_dv.data()); - launch_search( - count_it, count_it, t.num_rows(), values.num_rows(), result_out, comp, find_first, stream); + + auto const do_search = [find_first](auto&&... args) { + if (find_first) { + thrust::lower_bound(std::forward(args)...); + } else { + thrust::upper_bound(std::forward(args)...); + } + }; + do_search(rmm::exec_policy(stream), + count_it, + count_it + haystack.num_rows(), + count_it, + count_it + needles.num_rows(), + out_it, + comp); return result; } struct contains_scalar_dispatch { template - bool operator()(column_view const& col, scalar const& value, rmm::cuda_stream_view stream) + bool operator()(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(col.type() == value.type(), "scalar and column types must match"); + CUDF_EXPECTS(haystack.type() == needle.type(), "scalar and column types must match"); using Type = device_storage_type_t; using ScalarType = cudf::scalar_type_t; - auto d_col = column_device_view::create(col, stream); - auto s = static_cast(&value); + auto d_haystack = column_device_view::create(haystack, stream); + auto s = static_cast(&needle); - if (col.has_nulls()) { + if (haystack.has_nulls()) { auto found_iter = thrust::find(rmm::exec_policy(stream), - d_col->pair_begin(), - d_col->pair_end(), + d_haystack->pair_begin(), + d_haystack->pair_end(), thrust::make_pair(s->value(stream), true)); - return found_iter != d_col->pair_end(); + return found_iter != d_haystack->pair_end(); } else { auto found_iter = thrust::find(rmm::exec_policy(stream), // - d_col->begin(), - d_col->end(), + d_haystack->begin(), + d_haystack->end(), s->value(stream)); - return found_iter != d_col->end(); + return found_iter != d_haystack->end(); } } }; @@ -179,66 +161,69 @@ bool contains_scalar_dispatch::operator()(column_view const&, } template <> -bool contains_scalar_dispatch::operator()(column_view const& col, - scalar const& value, +bool contains_scalar_dispatch::operator()(column_view const& haystack, + 
scalar const& needle, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(col.type() == value.type(), "scalar and column types must match"); + CUDF_EXPECTS(haystack.type() == needle.type(), "scalar and column types must match"); - auto const scalar_table = static_cast(&value)->view(); - CUDF_EXPECTS(col.num_children() == scalar_table.num_columns(), + auto const scalar_table = static_cast(&needle)->view(); + CUDF_EXPECTS(haystack.num_children() == scalar_table.num_columns(), "struct scalar and structs column must have the same number of children"); - for (size_type i = 0; i < col.num_children(); ++i) { - CUDF_EXPECTS(col.child(i).type() == scalar_table.column(i).type(), + for (size_type i = 0; i < haystack.num_children(); ++i) { + CUDF_EXPECTS(haystack.child(i).type() == scalar_table.column(i).type(), "scalar and column children types must match"); } // Prepare to flatten the structs column and scalar. - auto const has_null_elements = - has_nested_nulls(table_view{std::vector{col.child_begin(), col.child_end()}}) || - has_nested_nulls(scalar_table); + auto const has_null_elements = has_nested_nulls(table_view{std::vector{ + haystack.child_begin(), haystack.child_end()}}) || + has_nested_nulls(scalar_table); auto const flatten_nullability = has_null_elements ? structs::detail::column_nullability::FORCE : structs::detail::column_nullability::MATCH_INCOMING; // Flatten the input structs column, only materialize the bitmask if there is null in the input. - auto const col_flattened = - structs::detail::flatten_nested_columns(table_view{{col}}, {}, {}, flatten_nullability); - auto const val_flattened = + auto const haystack_flattened = + structs::detail::flatten_nested_columns(table_view{{haystack}}, {}, {}, flatten_nullability); + auto const needle_flattened = structs::detail::flatten_nested_columns(scalar_table, {}, {}, flatten_nullability); // The struct scalar only contains the struct member columns. // Thus, if there is any null in the input, we must exclude the first column in the flattened // table of the input column from searching because that column is the materialized bitmask of // the input structs column. 
- auto const col_flattened_content = col_flattened.flattened_columns(); - auto const col_flattened_children = table_view{std::vector{ - col_flattened_content.begin() + static_cast(has_null_elements), - col_flattened_content.end()}}; + auto const haystack_flattened_content = haystack_flattened.flattened_columns(); + auto const haystack_flattened_children = table_view{std::vector{ + haystack_flattened_content.begin() + static_cast(has_null_elements), + haystack_flattened_content.end()}}; - auto const d_col_children_ptr = table_device_view::create(col_flattened_children, stream); - auto const d_val_ptr = table_device_view::create(val_flattened, stream); + auto const d_haystack_children_ptr = + table_device_view::create(haystack_flattened_children, stream); + auto const d_needle_ptr = table_device_view::create(needle_flattened, stream); auto const start_iter = thrust::make_counting_iterator(0); - auto const end_iter = start_iter + col.size(); - auto const comp = row_equality_comparator( - nullate::DYNAMIC{has_null_elements}, *d_col_children_ptr, *d_val_ptr, null_equality::EQUAL); + auto const end_iter = start_iter + haystack.size(); + auto const comp = row_equality_comparator(nullate::DYNAMIC{has_null_elements}, + *d_haystack_children_ptr, + *d_needle_ptr, + null_equality::EQUAL); auto const found_iter = thrust::find_if( rmm::exec_policy(stream), start_iter, end_iter, [comp] __device__(auto const idx) { - return comp(idx, 0); // compare col[idx] == val[0]. + return comp(idx, 0); // compare haystack[idx] == val[0]. }); return found_iter != end_iter; } template <> -bool contains_scalar_dispatch::operator()(column_view const& col, - scalar const& value, +bool contains_scalar_dispatch::operator()(column_view const& haystack, + scalar const& needle, rmm::cuda_stream_view stream) { - auto dict_col = cudf::dictionary_column_view(col); - // first, find the value in the dictionary's key set - auto index = cudf::dictionary::detail::get_index(dict_col, value, stream); + auto dict_col = cudf::dictionary_column_view(haystack); + // first, find the needle in the dictionary's key set + auto index = cudf::dictionary::detail::get_index(dict_col, needle, stream); // if found, check the index is actually in the indices column return index->is_valid(stream) ? 
cudf::type_dispatcher(dict_col.indices().type(), contains_scalar_dispatch{}, @@ -251,12 +236,13 @@ bool contains_scalar_dispatch::operator()(column_view const& } // namespace namespace detail { -bool contains(column_view const& col, scalar const& value, rmm::cuda_stream_view stream) +bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream) { - if (col.is_empty()) { return false; } - if (not value.is_valid(stream)) { return col.has_nulls(); } + if (haystack.is_empty()) { return false; } + if (not needle.is_valid(stream)) { return haystack.has_nulls(); } - return cudf::type_dispatcher(col.type(), contains_scalar_dispatch{}, col, value, stream); + return cudf::type_dispatcher( + haystack.type(), contains_scalar_dispatch{}, haystack, needle, stream); } struct multi_contains_dispatch { @@ -267,44 +253,44 @@ struct multi_contains_dispatch { rmm::mr::device_memory_resource* mr) { std::unique_ptr result = make_numeric_column(data_type{type_to_id()}, - haystack.size(), - copy_bitmask(haystack), - haystack.null_count(), + needles.size(), + copy_bitmask(needles), + needles.null_count(), stream, mr); - if (haystack.is_empty()) { return result; } + if (needles.is_empty()) { return result; } mutable_column_view result_view = result.get()->mutable_view(); - if (needles.is_empty()) { + if (haystack.is_empty()) { thrust::fill( rmm::exec_policy(stream), result_view.begin(), result_view.end(), false); return result; } - auto hash_set = cudf::detail::unordered_multiset::create(needles, stream); + auto hash_set = cudf::detail::unordered_multiset::create(haystack, stream); auto device_hash_set = hash_set.to_device(); - auto d_haystack_ptr = column_device_view::create(haystack, stream); - auto d_haystack = *d_haystack_ptr; + auto d_needles_ptr = column_device_view::create(needles, stream); + auto d_needles = *d_needles_ptr; - if (haystack.has_nulls()) { + if (needles.has_nulls()) { thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(haystack.size()), + thrust::make_counting_iterator(needles.size()), result_view.begin(), - [device_hash_set, d_haystack] __device__(size_t index) { - return d_haystack.is_null_nocheck(index) || - device_hash_set.contains(d_haystack.element(index)); + [device_hash_set, d_needles] __device__(size_t index) { + return d_needles.is_null_nocheck(index) || + device_hash_set.contains(d_needles.element(index)); }); } else { thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(haystack.size()), + thrust::make_counting_iterator(needles.size()), result_view.begin(), - [device_hash_set, d_haystack] __device__(size_t index) { - return device_hash_set.contains(d_haystack.element(index)); + [device_hash_set, d_needles] __device__(size_t index) { + return device_hash_set.contains(d_needles.element(index)); }); } @@ -336,10 +322,10 @@ std::unique_ptr multi_contains_dispatch::operator()( dictionary_column_view const haystack(haystack_in); dictionary_column_view const needles(needles_in); // first combine keys so both dictionaries have the same set - auto haystack_matched = dictionary::detail::add_keys(haystack, needles.keys(), stream); - auto const haystack_view = dictionary_column_view(haystack_matched->view()); - auto needles_matched = dictionary::detail::set_keys(needles, haystack_view.keys(), stream); + auto needles_matched = dictionary::detail::add_keys(needles, haystack.keys(), stream); auto const needles_view = 
dictionary_column_view(needles_matched->view()); + auto haystack_matched = dictionary::detail::set_keys(haystack, needles_view.keys(), stream); + auto const haystack_view = dictionary_column_view(haystack_matched->view()); // now just use the indices for the contains column_view const haystack_indices = haystack_view.get_indices_annotated(); @@ -363,56 +349,56 @@ std::unique_ptr contains(column_view const& haystack, haystack.type(), multi_contains_dispatch{}, haystack, needles, stream, mr); } -std::unique_ptr lower_bound(table_view const& t, - table_view const& values, +std::unique_ptr lower_bound(table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return search_ordered(t, values, true, column_order, null_precedence, stream, mr); + return search_ordered(haystack, needles, true, column_order, null_precedence, stream, mr); } -std::unique_ptr upper_bound(table_view const& t, - table_view const& values, +std::unique_ptr upper_bound(table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return search_ordered(t, values, false, column_order, null_precedence, stream, mr); + return search_ordered(haystack, needles, false, column_order, null_precedence, stream, mr); } } // namespace detail // external APIs -std::unique_ptr lower_bound(table_view const& t, - table_view const& values, +std::unique_ptr lower_bound(table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::lower_bound( - t, values, column_order, null_precedence, rmm::cuda_stream_default, mr); + haystack, needles, column_order, null_precedence, rmm::cuda_stream_default, mr); } -std::unique_ptr upper_bound(table_view const& t, - table_view const& values, +std::unique_ptr upper_bound(table_view const& haystack, + table_view const& needles, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::upper_bound( - t, values, column_order, null_precedence, rmm::cuda_stream_default, mr); + haystack, needles, column_order, null_precedence, rmm::cuda_stream_default, mr); } -bool contains(column_view const& col, scalar const& value) +bool contains(column_view const& haystack, scalar const& needle) { CUDF_FUNC_RANGE(); - return detail::contains(col, value, rmm::cuda_stream_default); + return detail::contains(haystack, needle, rmm::cuda_stream_default); } std::unique_ptr contains(column_view const& haystack, diff --git a/cpp/tests/search/search_dictionary_test.cpp b/cpp/tests/search/search_dictionary_test.cpp index 6b1caa5ed6f..9eba259ee39 100644 --- a/cpp/tests/search/search_dictionary_test.cpp +++ b/cpp/tests/search/search_dictionary_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -88,7 +88,7 @@ TEST_F(DictionarySearchTest, contains_dictionary) EXPECT_FALSE(cudf::contains(column, string_scalar{"28"})); cudf::test::dictionary_column_wrapper needles({"00", "17", "23", "27"}); - fixed_width_column_wrapper expect{1, 1, 1, 1, 1, 1, 0}; + fixed_width_column_wrapper expect{1, 1, 1, 0}; auto result = cudf::contains(column, needles); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expect); } @@ -101,7 +101,7 @@ TEST_F(DictionarySearchTest, contains_nullable_dictionary) EXPECT_FALSE(cudf::contains(column, numeric_scalar{28})); cudf::test::dictionary_column_wrapper needles({0, 17, 23, 27}); - fixed_width_column_wrapper expect({1, 0, 1, 1, 1, 1, 0}, {1, 0, 1, 1, 1, 1, 1}); + fixed_width_column_wrapper expect{1, 1, 1, 0}; auto result = cudf::contains(column, needles); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expect); } diff --git a/cpp/tests/search/search_test.cpp b/cpp/tests/search/search_test.cpp index 0a2533cd5f3..169eaffa41a 100644 --- a/cpp/tests/search/search_test.cpp +++ b/cpp/tests/search/search_test.cpp @@ -1627,7 +1627,7 @@ TEST_F(SearchTest, multi_contains_some) fixed_width_column_wrapper haystack{0, 1, 17, 19, 23, 29, 71}; fixed_width_column_wrapper needles{17, 19, 45, 72}; - fixed_width_column_wrapper expect{0, 0, 1, 1, 0, 0, 0}; + fixed_width_column_wrapper expect{1, 1, 0, 0}; auto result = cudf::contains(haystack, needles); @@ -1641,7 +1641,7 @@ TEST_F(SearchTest, multi_contains_none) fixed_width_column_wrapper haystack{0, 1, 17, 19, 23, 29, 71}; fixed_width_column_wrapper needles{2, 3}; - fixed_width_column_wrapper expect{0, 0, 0, 0, 0, 0, 0}; + fixed_width_column_wrapper expect{0, 0}; auto result = cudf::contains(haystack, needles); @@ -1657,7 +1657,7 @@ TEST_F(SearchTest, multi_contains_some_string) cudf::test::strings_column_wrapper needles(h_needles_strings.begin(), h_needles_strings.end()); - fixed_width_column_wrapper expect{0, 0, 1, 1, 0, 0, 0}; + fixed_width_column_wrapper expect{1, 1, 0, 0}; auto result = cudf::contains(haystack, needles); @@ -1673,7 +1673,7 @@ TEST_F(SearchTest, multi_contains_none_string) cudf::test::strings_column_wrapper needles(h_needles_strings.begin(), h_needles_strings.end()); - fixed_width_column_wrapper expect{0, 0, 0, 0, 0, 0, 0}; + fixed_width_column_wrapper expect{0, 0}; auto result = cudf::contains(haystack, needles); @@ -1688,7 +1688,7 @@ TEST_F(SearchTest, multi_contains_some_with_nulls) {1, 1, 0, 1, 1, 1, 1}}; fixed_width_column_wrapper needles{{17, 19, 23, 72}, {1, 0, 1, 1}}; - fixed_width_column_wrapper expect{{0, 0, 0, 0, 1, 0, 0}, {1, 1, 0, 1, 1, 1, 1}}; + fixed_width_column_wrapper expect{{0, 0, 1, 0}, {1, 0, 1, 1}}; auto result = cudf::contains(haystack, needles); @@ -1703,7 +1703,7 @@ TEST_F(SearchTest, multi_contains_none_with_nulls) {1, 1, 0, 1, 1, 1, 1}}; fixed_width_column_wrapper needles{{17, 19, 24, 72}, {1, 0, 1, 1}}; - fixed_width_column_wrapper expect{{0, 0, 0, 0, 0, 0, 0}, {1, 1, 0, 1, 1, 1, 1}}; + fixed_width_column_wrapper expect{{0, 0, 0, 0}, {1, 0, 1, 1}}; auto result = cudf::contains(haystack, needles); @@ -1715,7 +1715,7 @@ TEST_F(SearchTest, multi_contains_some_string_with_nulls) std::vector h_haystack_strings{"0", "1", nullptr, "19", "23", "29", "71"}; std::vector h_needles_strings{"17", "23", nullptr, "72"}; - fixed_width_column_wrapper expect{{0, 0, 0, 0, 1, 0, 0}, {1, 1, 0, 1, 1, 1, 1}}; + fixed_width_column_wrapper expect{{0, 1, 0, 0}, {1, 1, 0, 1}}; cudf::test::strings_column_wrapper haystack( h_haystack_strings.begin(), @@ -1739,7 +1739,7 @@ TEST_F(SearchTest, 
multi_contains_none_string_with_nulls) std::vector h_haystack_strings{"0", "1", nullptr, "19", "23", "29", "71"}; std::vector h_needles_strings{"2", nullptr}; - fixed_width_column_wrapper expect{{0, 0, 0, 0, 0, 0, 0}, {1, 1, 0, 1, 1, 1, 1}}; + fixed_width_column_wrapper expect{{0, 0}, {1, 0}}; cudf::test::strings_column_wrapper haystack( h_haystack_strings.begin(), @@ -1765,7 +1765,7 @@ TEST_F(SearchTest, multi_contains_empty_column) fixed_width_column_wrapper haystack{}; fixed_width_column_wrapper needles{2, 3}; - fixed_width_column_wrapper expect{}; + fixed_width_column_wrapper expect{0, 0}; auto result = cudf::contains(haystack, needles); @@ -1781,7 +1781,7 @@ TEST_F(SearchTest, multi_contains_empty_column_string) cudf::test::strings_column_wrapper needles(h_needles_strings.begin(), h_needles_strings.end()); - fixed_width_column_wrapper expect{}; + fixed_width_column_wrapper expect{0, 0, 0, 0}; auto result = cudf::contains(haystack, needles); @@ -1795,7 +1795,7 @@ TEST_F(SearchTest, multi_contains_empty_input_set) fixed_width_column_wrapper haystack{0, 1, 17, 19, 23, 29, 71}; fixed_width_column_wrapper needles{}; - fixed_width_column_wrapper expect{0, 0, 0, 0, 0, 0, 0}; + fixed_width_column_wrapper expect{}; auto result = cudf::contains(haystack, needles); @@ -1811,7 +1811,7 @@ TEST_F(SearchTest, multi_contains_empty_input_set_string) cudf::test::strings_column_wrapper needles(h_needles_strings.begin(), h_needles_strings.end()); - fixed_width_column_wrapper expect{0, 0, 0, 0, 0, 0, 0}; + fixed_width_column_wrapper expect{}; auto result = cudf::contains(haystack, needles); diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index e871da18966..9f07b130a83 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -1769,22 +1769,23 @@ public boolean contains(Scalar needle) { } /** - * Returns a new ColumnVector of {@link DType#BOOL8} elements containing true if the corresponding - * entry in haystack is contained in needles and false if it is not. The caller will be responsible - * for the lifecycle of the new vector. + * Returns a new column of {@link DType#BOOL8} elements having the same size as this column, + * each row value is true if the corresponding entry in this column is contained in the + * given searchSpace column and false if it is not. + * The caller will be responsible for the lifecycle of the new vector. 
* * example: * - * haystack = { 10, 20, 30, 40, 50 } - * needles = { 20, 40, 60, 80 } + * col = { 10, 20, 30, 40, 50 } + * searchSpace = { 20, 40, 60, 80 } * * result = { false, true, false, true, false } * - * @param needles + * @param searchSpace * @return A new ColumnVector of type {@link DType#BOOL8} */ - public final ColumnVector contains(ColumnView needles) { - return new ColumnVector(containsVector(getNativeView(), needles.getNativeView())); + public final ColumnVector contains(ColumnView searchSpace) { + return new ColumnVector(containsVector(getNativeView(), searchSpace.getNativeView())); } /** @@ -4080,7 +4081,7 @@ private static native long segmentedGather(long sourceColumnHandle, long gatherM private static native boolean containsScalar(long columnViewHaystack, long scalarHandle) throws CudfException; - private static native long containsVector(long columnViewHaystack, long columnViewNeedles) throws CudfException; + private static native long containsVector(long valuesHandle, long searchSpaceHandle) throws CudfException; private static native long transform(long viewHandle, String udf, boolean isPtx); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index e074180c312..b33769bdc1b 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1166,15 +1166,18 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ColumnView_containsScalar(JNIEnv } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsVector(JNIEnv *env, jobject j_object, - jlong j_haystack_handle, - jlong j_needle_handle) { - JNI_NULL_CHECK(env, j_haystack_handle, "haystack vector is null", false); - JNI_NULL_CHECK(env, j_needle_handle, "needle vector is null", false); + jlong j_values_handle, + jlong j_search_space_handle) { + JNI_NULL_CHECK(env, j_values_handle, "values vector is null", false); + JNI_NULL_CHECK(env, j_search_space_handle, "search_space vector is null", false); try { cudf::jni::auto_set_device(env); - cudf::column_view *haystack = reinterpret_cast(j_haystack_handle); - cudf::column_view *needle = reinterpret_cast(j_needle_handle); - return release_as_jlong(cudf::contains(*haystack, *needle)); + auto const search_space_ptr = + reinterpret_cast(j_search_space_handle); + auto const values_ptr = reinterpret_cast(j_values_handle); + + // The C++ API `cudf::contains` requires that the search space is the first parameter. + return release_as_jlong(cudf::contains(*search_space_ptr, *values_ptr)); } CATCH_STD(env, 0); } diff --git a/python/cudf/cudf/_lib/cpp/search.pxd b/python/cudf/cudf/_lib/cpp/search.pxd index 4df73881ea5..8baef0aa1b9 100644 --- a/python/cudf/cudf/_lib/cpp/search.pxd +++ b/python/cudf/cudf/_lib/cpp/search.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -12,15 +12,15 @@ from cudf._lib.cpp.table.table_view cimport table_view cdef extern from "cudf/search.hpp" namespace "cudf" nogil: cdef unique_ptr[column] lower_bound( - table_view t, - table_view values, + table_view haystack, + table_view needles, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, ) except + cdef unique_ptr[column] upper_bound( - table_view t, - table_view values, + table_view haystack, + table_view needles, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, ) except + diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 70097f15372..09a4754f519 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5407,7 +5407,7 @@ def fillna( def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: found_indices = libcudf.search.contains( - self, column.as_column([value], dtype=self.dtype) + column.as_column([value], dtype=self.dtype), self ) found_indices = libcudf.unary.cast(found_indices, dtype=np.int32) first = column.as_column(found_indices).find_first_value(np.int32(1)) From dc0c3cd0cdca4a5cf2d47a79099fb846de4d604c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 10 May 2022 16:58:42 -0500 Subject: [PATCH 07/12] Use `ThreadedMotoServer` instead of `subprocess` in spinning up `s3` server (#10822) This is a follow-up PR to address review comments from here: https://github.com/rapidsai/cudf/pull/10769#pullrequestreview-968047334 This PR: - [x] Uses `ThreadedMotoServer` instead of using `subprocess.open` to create a new server, this way it is guaranteed to close the server upon exit. - [x] Add's IP address fixture instead of having it hard-coded at multiple places. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/10822 --- python/cudf/cudf/tests/test_s3.py | 40 +++++++------------ .../dask_cudf/dask_cudf/io/tests/test_s3.py | 38 ++++++------------ 2 files changed, 27 insertions(+), 51 deletions(-) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 0966bee93fd..b754429555d 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -1,10 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import os -import shlex import socket -import subprocess -import time from contextlib import contextmanager from io import BytesIO @@ -21,9 +18,15 @@ moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") -requests = pytest.importorskip("requests") s3fs = pytest.importorskip("s3fs") +ThreadedMotoServer = pytest.importorskip("moto.server").ThreadedMotoServer + + +@pytest.fixture(scope="session") +def endpoint_ip(): + return "127.0.0.1" + @pytest.fixture(scope="session") def endpoint_port(): @@ -51,7 +54,7 @@ def ensure_safe_environment_variables(): @pytest.fixture(scope="session") -def s3_base(endpoint_port): +def s3_base(endpoint_ip, endpoint_port): """ Fixture to set up moto server in separate process """ @@ -65,35 +68,20 @@ def s3_base(endpoint_port): # Launching moto in server mode, i.e., as a separate process # with an S3 endpoint on localhost - endpoint_uri = f"http://127.0.0.1:{endpoint_port}/" - - proc = subprocess.Popen( - shlex.split(f"moto_server s3 -p {endpoint_port}"), - ) + endpoint_uri = f"http://{endpoint_ip}:{endpoint_port}/" - timeout = 5 - while timeout > 0: - try: - # OK to go once server is accepting connections - r = requests.get(endpoint_uri) - if r.ok: - break - except Exception: - pass - timeout -= 0.1 - time.sleep(0.1) + server = ThreadedMotoServer(ip_address=endpoint_ip, port=endpoint_port) + server.start() yield endpoint_uri - - proc.terminate() - proc.wait() + server.stop() @pytest.fixture() -def s3so(endpoint_port): +def s3so(endpoint_ip, endpoint_port): """ Returns s3 storage options to pass to fsspec """ - endpoint_uri = f"http://127.0.0.1:{endpoint_port}/" + endpoint_uri = f"http://{endpoint_ip}:{endpoint_port}/" return {"client_kwargs": {"endpoint_url": endpoint_uri}} diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index 9283380296c..5be0cf7c887 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -1,10 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. 
import os -import shlex import socket -import subprocess -import time from contextlib import contextmanager from io import BytesIO @@ -16,8 +13,13 @@ moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") -requests = pytest.importorskip("requests") s3fs = pytest.importorskip("s3fs") +ThreadedMotoServer = pytest.importorskip("moto.server").ThreadedMotoServer + + +@pytest.fixture(scope="session") +def endpoint_ip(): + return "127.0.0.1" @pytest.fixture(scope="session") @@ -46,7 +48,7 @@ def ensure_safe_environment_variables(): @pytest.fixture(scope="session") -def s3_base(endpoint_port): +def s3_base(endpoint_ip, endpoint_port): """ Fixture to set up moto server in separate process """ @@ -59,35 +61,21 @@ def s3_base(endpoint_port): # Launching moto in server mode, i.e., as a separate process # with an S3 endpoint on localhost - endpoint_uri = f"http://127.0.0.1:{endpoint_port}/" - proc = subprocess.Popen( - shlex.split(f"moto_server s3 -p {endpoint_port}"), - ) + endpoint_uri = f"http://{endpoint_ip}:{endpoint_port}/" - timeout = 5 - while timeout > 0: - try: - # OK to go once server is accepting connections - r = requests.get(endpoint_uri) - if r.ok: - break - except Exception: - pass - timeout -= 0.1 - time.sleep(0.1) + server = ThreadedMotoServer(ip_address=endpoint_ip, port=endpoint_port) + server.start() yield endpoint_uri - - proc.terminate() - proc.wait() + server.stop() @pytest.fixture() -def s3so(endpoint_port): +def s3so(endpoint_ip, endpoint_port): """ Returns s3 storage options to pass to fsspec """ - endpoint_uri = f"http://127.0.0.1:{endpoint_port}/" + endpoint_uri = f"http://{endpoint_ip}:{endpoint_port}/" return {"client_kwargs": {"endpoint_url": endpoint_uri}} From 366206d4a04e77bc3fbc9b41948ddb816d4f38e3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 10 May 2022 17:05:18 -0500 Subject: [PATCH 08/12] Import `NA` from `missing` rather than using `cudf.NA` everywhere (#10821) This PR changes cuDF so `NA` isn't used around the codebase from the top level `cudf` namespace and rather is imported directly from `missing`. 
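
As a quick illustration of the convention (a minimal sketch, not taken from the patch itself): inside the codebase, `NA` is bound once from `cudf.core.missing` — the module the diff below adds as an import — instead of evaluating `cudf.NA` at every use site. The helper function name here is hypothetical and exists only to show the pattern; running it assumes a cudf installation that provides `cudf.core.missing`.

```python
# Minimal sketch of the import convention adopted by this change.
# `first_or_na` is a hypothetical helper used only for illustration.
from cudf.core.missing import NA  # bind the NA singleton once at import time


def first_or_na(values):
    """Return the first element of ``values``, or cudf's NA singleton if empty."""
    return values[0] if len(values) else NA
```
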
This is part of https://github.com/rapidsai/cudf/issues/10820 and comes as a follow up to https://github.com/rapidsai/cudf/pull/10791#discussion_r867206392 Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10821 --- python/cudf/cudf/_lib/scalar.pyx | 23 ++++++++++--------- python/cudf/cudf/core/_internals/where.py | 5 ++-- python/cudf/cudf/core/column/column.py | 3 ++- python/cudf/cudf/core/column/lists.py | 5 ++-- .../cudf/cudf/core/column/numerical_base.py | 3 ++- python/cudf/cudf/core/column/struct.py | 3 ++- python/cudf/cudf/core/dataframe.py | 11 ++++----- python/cudf/cudf/testing/testing.py | 3 ++- python/cudf/cudf/utils/dtypes.py | 3 ++- 9 files changed, 32 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 71ac022ba2d..6309720706b 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -27,6 +27,7 @@ from cudf._lib.types import ( duration_unit_map, ) from cudf.core.dtypes import ListDtype, StructDtype +from cudf.core.missing import NA from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view @@ -170,7 +171,7 @@ cdef class DeviceScalar: return self.get_raw_ptr()[0].is_valid() def __repr__(self): - if self.value is cudf.NA: + if self.value is NA: return ( f"{self.__class__.__name__}" f"({self.value}, {repr(self.dtype)})" @@ -356,7 +357,7 @@ cdef _set_struct_from_pydict(unique_ptr[scalar]& s, else: pyarrow_table = pa.Table.from_arrays( [ - pa.array([cudf.NA], from_pandas=True, type=f.type) + pa.array([NA], from_pandas=True, type=f.type) for f in arrow_schema ], names=columns @@ -371,7 +372,7 @@ cdef _set_struct_from_pydict(unique_ptr[scalar]& s, cdef _get_py_dict_from_struct(unique_ptr[scalar]& s): if not s.get()[0].is_valid(): - return cudf.NA + return NA cdef table_view struct_table_view = (s.get()).view() column_names = [str(i) for i in range(struct_table_view.num_columns())] @@ -386,7 +387,7 @@ cdef _set_list_from_pylist(unique_ptr[scalar]& s, object dtype, bool valid=True): - value = value if valid else [cudf.NA] + value = value if valid else [NA] cdef Column col if isinstance(dtype.element_type, ListDtype): pa_type = dtype.element_type.to_arrow() @@ -404,7 +405,7 @@ cdef _set_list_from_pylist(unique_ptr[scalar]& s, cdef _get_py_list_from_list(unique_ptr[scalar]& s): if not s.get()[0].is_valid(): - return cudf.NA + return NA cdef column_view list_col_view = (s.get()).view() cdef Column list_col = Column.from_column_view(list_col_view, None) @@ -416,14 +417,14 @@ cdef _get_py_list_from_list(unique_ptr[scalar]& s): cdef _get_py_string_from_string(unique_ptr[scalar]& s): if not s.get()[0].is_valid(): - return cudf.NA + return NA return (s.get())[0].to_string().decode() cdef _get_np_scalar_from_numeric(unique_ptr[scalar]& s): cdef scalar* s_ptr = s.get() if not s_ptr[0].is_valid(): - return cudf.NA + return NA cdef libcudf_types.data_type cdtype = s_ptr[0].type() @@ -456,7 +457,7 @@ cdef _get_np_scalar_from_numeric(unique_ptr[scalar]& s): cdef _get_py_decimal_from_fixed_point(unique_ptr[scalar]& s): cdef scalar* s_ptr = s.get() if not s_ptr[0].is_valid(): - return cudf.NA + return NA cdef libcudf_types.data_type cdtype = s_ptr[0].type() @@ -480,7 +481,7 @@ cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s): cdef scalar* s_ptr = s.get() if not s_ptr[0].is_valid(): - return cudf.NA + return NA cdef libcudf_types.data_type cdtype = 
s_ptr[0].type() @@ -571,7 +572,7 @@ def as_device_scalar(val, dtype=None): def _is_null_host_scalar(slr): - if slr is None or slr is cudf.NA: + if slr is None or slr is NA: return True elif isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr): return True @@ -603,5 +604,5 @@ def _nested_na_replace(input_list): if isinstance(value, list): _nested_na_replace(value) elif value is None: - input_list[idx] = cudf.NA + input_list[idx] = NA return input_list diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 59e7d629092..bc01752a2b4 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -12,6 +12,7 @@ from cudf.core.dataframe import DataFrame from cudf.core.frame import Frame from cudf.core.index import Index +from cudf.core.missing import NA from cudf.core.series import Series from cudf.core.single_column_frame import SingleColumnFrame @@ -28,9 +29,7 @@ def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> ScalarLike: f"{type(other).__name__} to {col.dtype.name}" ) - return cudf.Scalar( - other, dtype=col.dtype if other in {None, cudf.NA} else None - ) + return cudf.Scalar(other, dtype=col.dtype if other in {None, NA} else None) def _check_and_cast_columns_with_other( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e1d91e6d0c0..47a2e3489e8 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -68,6 +68,7 @@ ListDtype, StructDtype, ) +from cudf.core.missing import NA from cudf.core.mixins import BinaryOperand, Reducible from cudf.utils.dtypes import ( cudf_dtype_from_pa_type, @@ -499,7 +500,7 @@ def __setitem__(self, key: Any, value: Any): self._mimic_inplace(out, inplace=True) def _wrap_binop_normalization(self, other): - if other is cudf.NA or other is None: + if other is NA or other is None: return cudf.Scalar(other, dtype=self.dtype) if isinstance(other, np.ndarray) and other.ndim == 0: other = other.item() diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 30e418f0825..e8a5638f07a 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -30,6 +30,7 @@ from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethods, ParentType from cudf.core.dtypes import ListDtype +from cudf.core.missing import NA class ListColumn(ColumnBase): @@ -91,7 +92,7 @@ def __setitem__(self, key, value): if isinstance(value, cudf.Scalar): if value.dtype != self.dtype: raise TypeError("list nesting level mismatch") - elif value is cudf.NA: + elif value is NA: value = cudf.Scalar(value, dtype=self.dtype) else: raise ValueError(f"Can not set {value} into ListColumn") @@ -354,7 +355,7 @@ def get( index = as_column(index) out = extract_element_column(self._column, as_column(index)) - if not (default is None or default is cudf.NA): + if not (default is None or default is NA): # determine rows for which `index` is out-of-bounds lengths = count_elements(self._column) out_of_bounds_mask = (np.negative(index) > lengths) | ( diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 659bb58d790..bb7711a3ead 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -11,6 +11,7 @@ from cudf import _lib as libcudf from cudf._typing import ScalarLike from 
cudf.core.column import ColumnBase +from cudf.core.missing import NA from cudf.core.mixins import Scannable @@ -116,7 +117,7 @@ def quantile( scalar_result = result.element_indexing(0) return ( cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - if scalar_result is cudf.NA + if scalar_result is NA else scalar_result ) return result diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index ed5e1c9450d..fa834ae8a5a 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -10,6 +10,7 @@ from cudf.core.column import ColumnBase, build_struct_column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import StructDtype +from cudf.core.missing import NA class StructColumn(ColumnBase): @@ -102,7 +103,7 @@ def __setitem__(self, key, value): if isinstance(value, dict): # filling in fields not in dict for field in self.dtype.fields: - value[field] = value.get(field, cudf.NA) + value[field] = value.get(field, NA) value = cudf.Scalar(value, self.dtype) super().__setitem__(key, value) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a3e2f40b28e..0c3dc82719e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -76,6 +76,7 @@ _indices_from_labels, doc_reset_index_template, ) +from cudf.core.missing import NA from cudf.core.multiindex import MultiIndex from cudf.core.resample import DataFrameResampler from cudf.core.series import Series @@ -364,9 +365,7 @@ def _setitem_tuple_arg(self, key, value): scatter_map = _indices_from_labels(self._frame, key[0]) for col in columns_df._column_names: columns_df[col][scatter_map] = ( - value._data[col] - if col in value_column_names - else cudf.NA + value._data[col] if col in value_column_names else NA ) else: @@ -479,7 +478,7 @@ def _setitem_tuple_arg(self, key, value): value_column_names = set(value._column_names) for col in columns_df._column_names: columns_df[col][key[0]] = ( - value._data[col] if col in value_column_names else cudf.NA + value._data[col] if col in value_column_names else NA ) else: @@ -3867,8 +3866,8 @@ def applymap( # bytecode to generate the equivalent PTX # as a null-ignoring version of the function def _func(x): # pragma: no cover - if x is cudf.NA: - return cudf.NA + if x is NA: + return NA else: return devfunc(x) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index b134d2b26e9..070e4649c7b 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -20,6 +20,7 @@ is_struct_dtype, ) from cudf.core._compat import PANDAS_GE_110 +from cudf.core.missing import NA def dtype_can_compare_equal_to_other(dtype): @@ -290,7 +291,7 @@ def assert_column_equal( def null_safe_scalar_equals(left, right): - if left in {cudf.NA, np.nan} or right in {cudf.NA, np.nan}: + if left in {NA, np.nan} or right in {NA, np.nan}: return left is right return left == right diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 35c6fdc73f8..c2d9a57b72f 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -12,6 +12,7 @@ import cudf from cudf.core._compat import PANDAS_GE_120 +from cudf.core.missing import NA _NA_REP = "" @@ -591,7 +592,7 @@ def _can_cast(from_dtype, to_dtype): `np.can_cast` but with some special handling around cudf specific dtypes. 
""" - if from_dtype in {None, cudf.NA}: + if from_dtype in {None, NA}: return True if isinstance(from_dtype, type): from_dtype = cudf.dtype(from_dtype) From 2aaa863ee78f4cdc8d7a0180abe82bdad4cd7923 Mon Sep 17 00:00:00 2001 From: Peixin Date: Wed, 11 May 2022 07:36:34 +0800 Subject: [PATCH 09/12] Add cudf JNI docker build github action (#10806) Signed-off-by: Peixin Li related to https://github.com/NVIDIA/spark-rapids-jni/issues/203 I don't know if any restriction of using github actions in rapids org, so this PR is just a prototype and waiting for feedback :) Example in my forked repo, 1. Manual trigger the build ![image](https://user-images.githubusercontent.com/8086184/167101308-b3e4376e-4c42-4603-aca1-2b0f40ace16f.png) 2. example logs https://github.com/pxLi/cudf/runs/6319840419?check_suite_focus=true 3. pushed images https://hub.docker.com/r/pxli/cudf-jni-build/tags ![image](https://user-images.githubusercontent.com/8086184/167102403-a5b49a99-d8de-46d6-b5df-a1ea50254448.png) Some open questions: 1. Trigger strategy? Manual, crontab, merge trigger, or others? 2. which org in docker hub should we push the image to? gpuci? 3. which account we should use to push? 4. do we want to keep the commit tags or we just overwrite the branch-XY tag only? mostly about space :) Thanks! Authors: - Peixin (https://github.com/pxLi) Approvers: - Jason Lowe (https://github.com/jlowe) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/10806 --- .github/workflows/jni-docker-build.yml | 53 ++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 .github/workflows/jni-docker-build.yml diff --git a/.github/workflows/jni-docker-build.yml b/.github/workflows/jni-docker-build.yml new file mode 100644 index 00000000000..0bdc409d0ab --- /dev/null +++ b/.github/workflows/jni-docker-build.yml @@ -0,0 +1,53 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: JNI Docker Build + +on: + workflow_dispatch: # manual trigger only + +concurrency: + group: jni-docker-build-${{ github.ref }} + cancel-in-progress: true + +jobs: + docker-build: + if: github.repository == 'rapidsai/cudf' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }} + password: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }} + + - name: Set ENVs + run: | + echo "IMAGE_NAME=rapidsai/cudf-jni-build" >> $GITHUB_ENV + echo "IMAGE_REF=${GITHUB_REF_NAME}" >> $GITHUB_ENV + + - name: Build and Push + uses: docker/build-push-action@v3 + with: + push: true + file: java/ci/Dockerfile.centos7 + tags: "${{ env.IMAGE_NAME }}:${{ env.IMAGE_REF }}" From efd2c3946bae0979f257f521021f6a22412af903 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 11 May 2022 08:27:05 -0400 Subject: [PATCH 10/12] Refactor regex builtin character-class identifiers (#10814) Refactors the builtin regex class integer ids into a common header for the compiler and executor. The builtin regex character classes like `\s, \d, \W` have integer identifiers (bit values that can be combined) but were defined in separate source files. This PR refactors the declarations to the common header file `regcomp.h` to ensure the same value is used when parsing/compiling the instructions in `regcomp.cpp` and when evaluating the instructions in `regex.inl`. This is just a cleanup of the code and does not affect behavior or performance. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/10814 --- cpp/src/strings/regex/regcomp.cpp | 14 +++++++------- cpp/src/strings/regex/regcomp.h | 31 +++++++++++++++++++++---------- cpp/src/strings/regex/regex.inl | 12 ++++++------ 3 files changed, 34 insertions(+), 23 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 829230d0842..f99acc3448a 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -44,17 +44,17 @@ enum OperatorType { NOP = 0302, // No operation, internal use only }; -static reclass ccls_w(1); // [a-z], [A-Z], [0-9], and '_' -static reclass ccls_W(8); // now ccls_w plus '\n' -static reclass ccls_s(2); // all spaces or ctrl characters -static reclass ccls_S(16); // not ccls_s -static reclass ccls_d(4); // digits [0-9] -static reclass ccls_D(32); // not ccls_d plus '\n' +static reclass ccls_w(CCLASS_W); // \w +static reclass ccls_s(CCLASS_S); // \s +static reclass ccls_d(CCLASS_D); // \d +static reclass ccls_W(NCCLASS_W); // \W +static reclass ccls_S(NCCLASS_S); // \S +static reclass ccls_D(NCCLASS_D); // \D // Tables for analyzing quantifiers const std::array valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}}; const std::array quantifiers{{'*', '?', '+', '{', '|'}}; -// Valid regex characters that can be escaping to be used as literals +// Valid regex characters that can be escaped and used as literals const std::array escapable_chars{ {'.', '-', '+', '*', '\\', '?', '^', '$', '|', '{', '}', '(', ')', '[', ']', '<', '>', '"', '~', '\'', '`', '_', '@', '=', ';', ':', '!', '#', '%', '&', ',', '/', ' '}}; diff --git
a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 798b43830b4..162a2090268 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -41,9 +41,9 @@ enum InstType { BOL = 0303, // Beginning of line, ^ EOL = 0304, // End of line, $ CCLASS = 0305, // Character class, [] - NCCLASS = 0306, // Negated character class, [] - BOW = 0307, // Boundary of word, /b - NBOW = 0310, // Not boundary of word, /b + NCCLASS = 0306, // Negated character class, [^ ] + BOW = 0307, // Boundary of word, \b + NBOW = 0310, // Not boundary of word, \B END = 0377 // Terminate: match found }; @@ -57,6 +57,13 @@ struct reclass { reclass(int m) : builtins(m) {} }; +constexpr int32_t CCLASS_W{1 << 0}; // [a-z], [A-Z], [0-9], and '_' +constexpr int32_t CCLASS_S{1 << 1}; // all spaces or ctrl characters +constexpr int32_t CCLASS_D{1 << 2}; // digits [0-9] +constexpr int32_t NCCLASS_W{1 << 3}; // not CCLASS_W or '\n' +constexpr int32_t NCCLASS_S{1 << 4}; // not CCLASS_S +constexpr int32_t NCCLASS_D{1 << 5}; // not CCLASS_D or '\n' + /** * @brief Structure of an encoded regex instruction */ @@ -76,12 +83,11 @@ struct reinst { }; /** - * @brief Regex program handles parsing a pattern in to individual set + * @brief Regex program handles parsing a pattern into a vector * of chained instructions. */ class reprog { public: - reprog() = default; reprog(const reprog&) = default; reprog(reprog&&) = default; ~reprog() = default; @@ -89,8 +95,12 @@ class reprog { reprog& operator=(reprog&&) = default; /** - * @brief Parses the given regex pattern and compiles - * into a list of chained instructions. + * @brief Parses the given regex pattern and produces an instance + * of this object + * + * @param pattern Regex pattern encoded as UTF-8 + * @param flags For interpretting certain `pattern` characters + * @return Instance of reprog */ static reprog create_from(std::string_view pattern, regex_flags const flags); @@ -122,12 +132,13 @@ class reprog { #endif private: - std::vector _insts; - std::vector _classes; - int32_t _startinst_id; + std::vector _insts; // instructions + std::vector _classes; // data for CCLASS instructions + int32_t _startinst_id{}; // id of first instruction std::vector _startinst_ids; // short-cut to speed-up ORs int32_t _num_capturing_groups{}; + reprog() = default; void check_for_errors(int32_t id, int32_t next_id); }; diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index bae6fb275f6..8e2194f2094 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -148,17 +148,17 @@ __device__ __forceinline__ bool reclass_device::is_match(char32_t const ch, uint32_t codept = utf8_to_codepoint(ch); if (codept > 0x00FFFF) return false; int8_t fl = codepoint_flags[codept]; - if ((builtins & 1) && ((ch == '_') || IS_ALPHANUM(fl))) // \w + if ((builtins & CCLASS_W) && ((ch == '_') || IS_ALPHANUM(fl))) // \w return true; - if ((builtins & 2) && IS_SPACE(fl)) // \s + if ((builtins & CCLASS_S) && IS_SPACE(fl)) // \s return true; - if ((builtins & 4) && IS_DIGIT(fl)) // \d + if ((builtins & CCLASS_D) && IS_DIGIT(fl)) // \d return true; - if ((builtins & 8) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W + if ((builtins & NCCLASS_W) && ((ch != '\n') && (ch != '_') && !IS_ALPHANUM(fl))) // \W return true; - if ((builtins & 16) && !IS_SPACE(fl)) // \S + if ((builtins & NCCLASS_S) && !IS_SPACE(fl)) // \S return true; - if ((builtins & 32) && ((ch != '\n') && !IS_DIGIT(fl))) // \D + if ((builtins & NCCLASS_D) && ((ch 
!= '\n') && !IS_DIGIT(fl))) // \D return true; // return false; From 2b204d043322cd1c21fa28a4e8b92e9406504f20 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 11 May 2022 11:39:45 -0400 Subject: [PATCH 11/12] Fix compile warning in search.cu (#10827) Compile warning introduced with merge of PR #10802 ``` 10 warnings like this: 12:43:23 $SRC_DIR/cpp/src/search/search.cu(108): warning #177-D: parameter "args" was declared but never referenced 12:43:23 detected during instantiation of function "lambda [](auto &&...)->auto [with = &, thrust::counting_iterator, const thrust::counting_iterator &, thrust::counting_iterator, cudf::size_type *const &, const cudf::row_lexicographic_comparator &>]" 12:43:23 (121): here ``` Line 108 has a lambda refactoring that seems to confuse the compiler. ``` auto const do_search = [find_first](auto&&... args) { if (find_first) { thrust::lower_bound(std::forward<decltype(args)>(args)...); } else { thrust::upper_bound(std::forward<decltype(args)>(args)...); } }; do_search(rmm::exec_policy(stream), count_it, count_it + haystack.num_rows(), count_it, count_it + needles.num_rows(), out_it, comp); ``` The warning is wrong and the compiler generates the correct code, so this is likely a compiler bug. This PR fixes the warning by replacing the lambda with an if statement. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/10827 --- cpp/src/search/search.cu | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index 491ad49e020..49d0959ecc4 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -105,21 +105,23 @@ std::unique_ptr search_ordered(table_view const& haystack, column_order_dv.data(), null_precedence_dv.data()); - auto const do_search = [find_first](auto&&... args) { - if (find_first) { - thrust::lower_bound(std::forward(args)...); - } else { - thrust::upper_bound(std::forward(args)...); - } - }; - do_search(rmm::exec_policy(stream), - count_it, - count_it + haystack.num_rows(), - count_it, - count_it + needles.num_rows(), - out_it, - comp); - + if (find_first) { + thrust::lower_bound(rmm::exec_policy(stream), + count_it, + count_it + haystack.num_rows(), + count_it, + count_it + needles.num_rows(), + out_it, + comp); + } else { + thrust::upper_bound(rmm::exec_policy(stream), + count_it, + count_it + haystack.num_rows(), + count_it, + count_it + needles.num_rows(), + out_it, + comp); + } return result; } From 0cc29a0a0128c5632bd53fd41c0a568929b847ac Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 11 May 2022 13:32:04 -0500 Subject: [PATCH 12/12] Refactor binaryop/compiled/util.cpp (#10756) This PR reduces the complexity of compile-time dispatches to resolve long compile times and massive memory usage in `binaryop/compiled/util.cpp`. The file `binaryop/compiled/util.cpp` exposes two functions: `is_supported_operation(out, lhs, rhs, op)` and `get_common_type(out, lhs, rhs)`. I refactored both of them, since they were both expensive to compile. In `is_supported_operation`, I replaced a quadruple dispatch (!!!!) on (LHS type, RHS type, binary operation, output type) with a triple dispatch (LHS type, RHS type, BinaryOp) and some runtime single-dispatches to handle the output type.
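To see why dropping one compile-time-dispatched dimension helps so much, here is a back-of-the-envelope model (a Python sketch, not cudf code; the type and operator counts are made-up placeholders, not the real cudf numbers): each fully dispatched combination corresponds to a template instantiation the compiler must build, so removing the output type from the dispatch divides the instantiation count by roughly the number of dispatched types.

```python
# Rough combinatorial model of the refactor (illustrative numbers only).
n_types = 30  # placeholder for the number of types covered by the dispatcher
n_ops = 20    # placeholder for the number of binary operators handled here

# Before: (LHS, RHS, output) x operator all resolved at compile time.
quadruple = n_types**3 * n_ops         # 540,000 instantiations
# After: (LHS, RHS) x operator at compile time; the output type is handled
# by cheap runtime single dispatches instead.
triple = n_types**2 * n_ops + n_types  # 18,030

print(f"quadruple dispatch: {quadruple:,}")
print(f"triple dispatch + runtime checks: {triple:,}")
```

The compile-time and peak-memory reductions reported under **Impact** below are consistent with shaving one factor of the type count off this product.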
In `get_common_type`, I replaced a triple type dispatch on (output type, LHS type, RHS type) with a few double type dispatches. I used the definition of `std::common_type` to simplify `std::common_type_t` into `std::common_type_t, C>`, which means we can double-dispatch twice and use runtime `data_type` values in between. **Impact:** Peak memory usage (max resident set size) when compiling this file drops from 14.6 GB (280acdfd65b12b4ac953c193c7d7fd35809e41be) to 2.4 GB (ee2c26a92a2e4ac6b0b3066c4a5dcbfd5805e944), and the time to compile drops from 2:52.48 minutes (280acdfd65b12b4ac953c193c7d7fd35809e41be) to 57.91 seconds (ee2c26a92a2e4ac6b0b3066c4a5dcbfd5805e944). Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/10756 --- cpp/src/binaryop/compiled/util.cpp | 171 +++++++++++++++++------------ 1 file changed, 98 insertions(+), 73 deletions(-) diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp index d8f1eb03a16..91fa04be6e2 100644 --- a/cpp/src/binaryop/compiled/util.cpp +++ b/cpp/src/binaryop/compiled/util.cpp @@ -26,39 +26,52 @@ namespace cudf::binops::compiled { namespace { -/** - * @brief Functor that returns optional common type of 2 or 3 given types. - * - */ + struct common_type_functor { template - struct nested_common_type_functor { - template - std::optional operator()() - { - // If common_type exists - if constexpr (cudf::has_common_type_v) { - using TypeCommon = typename std::common_type::type; - return data_type{type_to_id()}; - } else if constexpr (cudf::has_common_type_v) { - using TypeCommon = typename std::common_type::type; - // Eg. d=t-t - return data_type{type_to_id()}; - } - - // A compiler bug may cause a compilation error when using empty initializer list to construct - // an std::optional object containing no `data_type` value. Therefore, we should explicitly - // return `std::nullopt` instead. - return std::nullopt; + std::optional operator()() const + { + if constexpr (cudf::has_common_type_v) { + using TypeCommon = std::common_type_t; + return data_type{type_to_id()}; } - }; - template - std::optional operator()(data_type out) + + // A compiler bug may cause a compilation error when using empty + // initializer list to construct an std::optional object containing no + // `data_type` value. Therefore, we explicitly return `std::nullopt` + // instead. + return std::nullopt; + } +}; + +struct has_mutable_element_accessor_functor { + template + bool operator()() const + { + return mutable_column_device_view::has_element_accessor(); + } +}; + +bool has_mutable_element_accessor(data_type t) +{ + return type_dispatcher(t, has_mutable_element_accessor_functor{}); +} + +template +struct is_constructible_functor { + template + bool operator()() const { - return type_dispatcher(out, nested_common_type_functor{}); + return std::is_constructible_v; } }; +template +bool is_constructible(data_type target_type) +{ + return type_dispatcher(target_type, is_constructible_functor{}); +} + /** * @brief Functor that return true if BinaryOperator supports given input and output types. * @@ -66,9 +79,9 @@ struct common_type_functor { */ template struct is_binary_operation_supported { - // For types where Out type is fixed. (eg. comparison types) + // For types where Out type is fixed. (e.g. 
comparison types) template - inline constexpr bool operator()() + inline constexpr bool operator()() const { if constexpr (column_device_view::has_element_accessor() and column_device_view::has_element_accessor()) { @@ -83,24 +96,22 @@ struct is_binary_operation_supported { } } - template - inline constexpr bool operator()() + template + inline constexpr bool operator()(data_type out_type) const { if constexpr (column_device_view::has_element_accessor() and - column_device_view::has_element_accessor() and - (mutable_column_device_view::has_element_accessor() or - is_fixed_point())) { - if constexpr (has_common_type_v) { - using common_t = std::common_type_t; - if constexpr (std::is_invocable_v) { - using ReturnType = std::invoke_result_t; - return std::is_constructible_v or - (is_fixed_point() and is_fixed_point()); - } - } else { - if constexpr (std::is_invocable_v) { + column_device_view::has_element_accessor()) { + if (has_mutable_element_accessor(out_type) or is_fixed_point(out_type)) { + if constexpr (has_common_type_v) { + using common_t = std::common_type_t; + if constexpr (std::is_invocable_v) { + using ReturnType = std::invoke_result_t; + return is_constructible(out_type) or + (is_fixed_point() and is_fixed_point(out_type)); + } + } else if constexpr (std::is_invocable_v) { using ReturnType = std::invoke_result_t; - return std::is_constructible_v; + return is_constructible(out_type); } } } @@ -111,37 +122,36 @@ struct is_binary_operation_supported { struct is_supported_operation_functor { template struct nested_support_functor { - template - inline constexpr bool call() + template + inline constexpr bool call(data_type out_type) const { - return is_binary_operation_supported{} - .template operator()(); + return is_binary_operation_supported{}.template operator()( + out_type); } - template - inline constexpr bool operator()(binary_operator op) + inline constexpr bool operator()(binary_operator op, data_type out_type) const { switch (op) { // clang-format off - case binary_operator::ADD: return call(); - case binary_operator::SUB: return call(); - case binary_operator::MUL: return call(); - case binary_operator::DIV: return call(); - case binary_operator::TRUE_DIV: return call(); - case binary_operator::FLOOR_DIV: return call(); - case binary_operator::MOD: return call(); - case binary_operator::PYMOD: return call(); - case binary_operator::POW: return call(); - case binary_operator::BITWISE_AND: return call(); - case binary_operator::BITWISE_OR: return call(); - case binary_operator::BITWISE_XOR: return call(); - case binary_operator::SHIFT_LEFT: return call(); - case binary_operator::SHIFT_RIGHT: return call(); - case binary_operator::SHIFT_RIGHT_UNSIGNED: return call(); - case binary_operator::LOG_BASE: return call(); - case binary_operator::ATAN2: return call(); - case binary_operator::PMOD: return call(); - case binary_operator::NULL_MAX: return call(); - case binary_operator::NULL_MIN: return call(); + case binary_operator::ADD: return call(out_type); + case binary_operator::SUB: return call(out_type); + case binary_operator::MUL: return call(out_type); + case binary_operator::DIV: return call(out_type); + case binary_operator::TRUE_DIV: return call(out_type); + case binary_operator::FLOOR_DIV: return call(out_type); + case binary_operator::MOD: return call(out_type); + case binary_operator::PYMOD: return call(out_type); + case binary_operator::POW: return call(out_type); + case binary_operator::BITWISE_AND: return call(out_type); + case binary_operator::BITWISE_OR: return 
call(out_type); + case binary_operator::BITWISE_XOR: return call(out_type); + case binary_operator::SHIFT_LEFT: return call(out_type); + case binary_operator::SHIFT_RIGHT: return call(out_type); + case binary_operator::SHIFT_RIGHT_UNSIGNED: return call(out_type); + case binary_operator::LOG_BASE: return call(out_type); + case binary_operator::ATAN2: return call(out_type); + case binary_operator::PMOD: return call(out_type); + case binary_operator::NULL_MAX: return call(out_type); + case binary_operator::NULL_MIN: return call(out_type); /* case binary_operator::GENERIC_BINARY: // defined in jit only. */ @@ -152,13 +162,13 @@ struct is_supported_operation_functor { }; template - inline constexpr bool bool_op(data_type out) + inline constexpr bool bool_op(data_type out) const { return out.id() == type_id::BOOL8 and is_binary_operation_supported{}.template operator()(); } template - inline constexpr bool operator()(data_type out, binary_operator op) + inline constexpr bool operator()(data_type out, binary_operator op) const { switch (op) { // output type should be bool type. @@ -175,7 +185,7 @@ struct is_supported_operation_functor { return bool_op(out); case binary_operator::NULL_LOGICAL_OR: return bool_op(out); - default: return type_dispatcher(out, nested_support_functor{}, op); + default: return nested_support_functor{}(op, out); } return false; } @@ -185,7 +195,22 @@ struct is_supported_operation_functor { std::optional get_common_type(data_type out, data_type lhs, data_type rhs) { - return double_type_dispatcher(lhs, rhs, common_type_functor{}, out); + // Compute the common type of (out, lhs, rhs) if it exists, or the common + // type of (lhs, rhs) if it exists, else return a null optional. + // We can avoid a triple type dispatch by using the definition of + // std::common_type to compute this with double type dispatches. + // Specifically, std::common_type_t is the same as + // std::common_type_t, TypeRhs>. + auto common_type = double_type_dispatcher(out, lhs, common_type_functor{}); + if (common_type.has_value()) { + common_type = double_type_dispatcher(common_type.value(), rhs, common_type_functor{}); + } + // If no common type of (out, lhs, rhs) exists, fall back to the common type + // of (lhs, rhs). + if (!common_type.has_value()) { + common_type = double_type_dispatcher(lhs, rhs, common_type_functor{}); + } + return common_type; } bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op)