From 88e6a293384224ec3f5564d02cbddfe1a8f3b45b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 11 Jan 2022 13:59:53 -0800 Subject: [PATCH 01/23] Wrap CI script shell variables in quotes to fix local testing. (#10018) This is a tiny PR that wraps shell script variables in quotes. This fixes an issue I saw in the upload script when `${BUILD_MODE}` was not set during local testing of the CI environment. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) URL: https://github.com/rapidsai/cudf/pull/10018 --- ci/benchmark/build.sh | 2 +- ci/cpu/upload.sh | 2 +- ci/gpu/build.sh | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 979db1b5034..59bd908d151 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -98,7 +98,7 @@ conda list --show-channel-urls ################################################################################ logger "Build libcudf..." -if [[ ${BUILD_MODE} == "pull-request" ]]; then +if [[ "${BUILD_MODE}" == "pull-request" ]]; then "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds else "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests -l --ptds diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 40e80def8ae..e6ef72d930c 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -12,7 +12,7 @@ export GPUCI_RETRY_SLEEP=30 export LABEL_OPTION=${LABEL_OPTION:-"--label main"} # Skip uploads unless BUILD_MODE == "branch" -if [ ${BUILD_MODE} != "branch" ]; then +if [ "${BUILD_MODE}" != "branch" ]; then echo "Skipping upload" return 0 fi diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 059e359e4e9..a8afc03af94 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -124,7 +124,7 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then ################################################################################ gpuci_logger "Build from source" - if [[ ${BUILD_MODE} == "pull-request" ]]; then + if [[ "${BUILD_MODE}" == "pull-request" ]]; then "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds else "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests -l --ptds @@ -222,7 +222,7 @@ else install_dask gpuci_logger "Build python libs from source" - if [[ ${BUILD_MODE} == "pull-request" ]]; then + if [[ "${BUILD_MODE}" == "pull-request" ]]; then "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds else "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka -l --ptds From 25a7485eb752c66e042012e78f0832199ab20aeb Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 11 Jan 2022 17:28:30 -0500 Subject: [PATCH 02/23] Fix regex doc describing hexadecimal escape characters (#10009) Fixes a documentation error found while diagnosing a hex regex pattern question. The hex escape sequence only specifies a single character (not a single byte). So this means it can only be used to match ASCII characters (code-points 0-127) and not all UTF-8 characters. This is the same as for octal escape sequences. Also, the example provided for hex in the documentation has been corrected to use a valid ASCII character. 
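For reference, the corrected rule is easy to check from the Python layer, which drives this same regex engine through `Series.str.contains`. A minimal sketch, assuming a working cudf build (the sample strings are made up for illustration):

```python
import cudf

s = cudf.Series(["user@host", "plain text"])
# \x40 is ASCII 0x40 ('@'), so only the first row matches. Code points
# above 127 (such as the '©' used in the old, incorrect example) cannot
# be reached with a two-digit hex escape.
print(s.str.contains(r"\x40"))
# 0     True
# 1    False
# dtype: bool
```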
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/10009 --- cpp/doxygen/regex.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index b721448b45a..76ebb48d195 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -30,7 +30,7 @@ The details are based on features documented at https://www.regular-expressions. | Literal character | Any character except `[\^$.⎮?*+()` | All characters except the listed special characters match a single instance of themselves | `a` matches `a` | | Literal curly braces | `{` and `}` | `{` and `}` are literal characters, unless they are part of a valid regular expression token such as a quantifier `{3}` | `{` matches `{` | | Backslash escapes a metacharacter | `\` followed by any of `[\^$.⎮?*+(){}` | A backslash escapes special characters to suppress their special meaning | `\*` matches `*` | -| Hexadecimal escape | `\xFF` where `FF` are 2 hexadecimal digits | Matches the character at the specified position in the code page | `\xA9` matches `©` | +| Hexadecimal escape | `\xFF` where `FF` are 2 hexadecimal digits | Matches the character at the specified position in the ASCII table | `\x40` matches `@` | | Character escape | `\n`, `\r` and `\t` | Match an line-feed (LF) character, carriage return (CR) character and a tab character respectively | `\r\n` matches a Windows CRLF line break | | Character escape | `\a` | Match the "alert" or "bell" control character (ASCII 0x07) | | | Character escape | `\f` | Match the form-feed control character (ASCII 0x0C) | | From 3216342f01d198cfbe2ef9e2ac861674414dc493 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 11 Jan 2022 17:04:59 -0600 Subject: [PATCH 03/23] Raise in `query` if dtype is not supported (#9921) Closes https://github.com/rapidsai/cudf/issues/9894 Authors: - https://github.com/brandon-b-miller Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9921 --- python/cudf/cudf/tests/test_query.py | 23 ++++++++++++++++++++ python/cudf/cudf/utils/queryutils.py | 32 ++++++++++++++++++++++------ 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index 9a02d5145bb..3de38b2cf6f 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -209,3 +209,26 @@ def test_query_with_index_keyword(query, a_val, b_val, c_val): expect = pdf.query(query) assert_eq(out, expect) + + +@pytest.mark.parametrize( + "data, query", + [ + # Only need to test the dtypes that pandas + # supports but that we do not + (["a", "b", "c"], "data == 'a'"), + ], +) +def test_query_unsupported_dtypes(data, query): + gdf = cudf.DataFrame({"data": data}) + + # make sure the query works in pandas + pdf = gdf.to_pandas() + pdf_result = pdf.query(query) + + expect = pd.DataFrame({"data": ["a"]}) + assert_eq(expect, pdf_result) + + # but fails in cuDF + with pytest.raises(TypeError): + gdf.query(query) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 217466a5a1b..d9153c2b1d2 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -10,9 +10,20 @@ import cudf from cudf.core.column import column_empty from cudf.utils import 
applyutils +from cudf.utils.dtypes import ( + BOOL_TYPES, + DATETIME_TYPES, + NUMERIC_TYPES, + TIMEDELTA_TYPES, +) ENVREF_PREFIX = "__CUDF_ENVREF__" +SUPPORTED_QUERY_TYPES = { + np.dtype(dt) + for dt in NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES +} + class QuerySyntaxError(ValueError): pass @@ -197,6 +208,20 @@ def query_execute(df, expr, callenv): # compile compiled = query_compile(expr) + columns = compiled["colnames"] + + # prepare col args + colarrays = [cudf.core.dataframe.extract_col(df, col) for col in columns] + + # wait to check the types until we know which cols are used + if any(col.dtype not in SUPPORTED_QUERY_TYPES for col in colarrays): + raise TypeError( + "query only supports numeric, datetime, timedelta, " + "or bool dtypes." + ) + + colarrays = [col.data_array_view for col in colarrays] + kernel = compiled["kernel"] # process env args envargs = [] @@ -214,13 +239,6 @@ def query_execute(df, expr, callenv): raise NameError(msg.format(name)) else: envargs.append(val) - columns = compiled["colnames"] - # prepare col args - - colarrays = [ - cudf.core.dataframe.extract_col(df, col).data_array_view - for col in columns - ] # allocate output buffer nrows = len(df) From 813ac97b2143c8d1d8ca95435863f5234408a681 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Tue, 11 Jan 2022 15:16:25 -0800 Subject: [PATCH 04/23] Use list of column inputs for `apply_boolean_mask` (#9832) This PR brings changes from #9558 to `apply_boolean_mask` and removes the `as_frame` -> `as_column` round trip. Benchmark the column method: ``` ------------------------------------- benchmark 'col0': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col0] (afte) 87.0090 (1.0) 132.8980 (1.0) 95.8815 (1.0) column_apply_boolean_mask[col0] (befo) 210.4580 (2.42) 307.8270 (2.32) 225.4821 (2.35) ----------------------------------------------------------------------------------------------------- ------------------------------------- benchmark 'col1': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col1] (afte) 74.2240 (1.0) 110.0600 (1.0) 75.6356 (1.0) column_apply_boolean_mask[col1] (befo) 172.5240 (2.32) 278.5250 (2.53) 176.5672 (2.33) ----------------------------------------------------------------------------------------------------- ------------------------------------- benchmark 'col2': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col2] (afte) 101.5740 (1.0) 141.8850 (1.0) 110.2334 (1.0) column_apply_boolean_mask[col2] (befo) 234.1140 (2.30) 312.7140 (2.20) 245.5453 (2.23) ----------------------------------------------------------------------------------------------------- ------------------------------------- benchmark 'col3': 2 tests ------------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------- column_apply_boolean_mask[col3] (afte) 88.7710 (1.0) 142.7500 (1.0) 90.5082 (1.0) column_apply_boolean_mask[col3] (befo) 195.0980 (2.20) 303.1020 (2.12) 199.8368 (2.21) 
----------------------------------------------------------------------------------------------------- ``` Dataframe benchmark ``` ----------------------------------- benchmark '100': 2 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------ df_apply_boolean_mask[100] (afte) 380.6770 (1.05) 654.7080 (1.18) 389.3374 (1.03) df_apply_boolean_mask[100] (befo) 362.3220 (1.0) 554.6130 (1.0) 378.7087 (1.0) ------------------------------------------------------------------------------------------------ ----------------------------------- benchmark '10000': 2 tests ----------------------------------- Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------- df_apply_boolean_mask[10000] (afte) 399.5240 (1.05) 461.6310 (1.0) 405.1225 (1.04) df_apply_boolean_mask[10000] (befo) 379.4080 (1.0) 564.5770 (1.22) 389.6990 (1.0) -------------------------------------------------------------------------------------------------- ``` Authors: - Michael Wang (https://github.com/isVoid) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9832 --- python/cudf/cudf/_lib/stream_compaction.pyx | 18 +++++---------- python/cudf/cudf/core/_base_index.py | 25 ++++++++++++++++++++- python/cudf/cudf/core/algorithms.py | 4 ++-- python/cudf/cudf/core/column/column.py | 10 ++++++--- python/cudf/cudf/core/frame.py | 13 ----------- python/cudf/cudf/core/indexed_frame.py | 20 +++++++++++++++++ 6 files changed, 58 insertions(+), 32 deletions(-) diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index ef47e843723..4330c565982 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -75,24 +75,22 @@ def drop_nulls(columns: list, how="any", keys=None, thresh=None): return columns_from_unique_ptr(move(c_result)) -def apply_boolean_mask(source_table, Column boolean_mask): +def apply_boolean_mask(columns: list, Column boolean_mask): """ Drops the rows which correspond to False in boolean_mask. 
Parameters ---------- - source_table : source table whose rows are dropped as per boolean_mask + columns : list of columns whose rows are dropped as per boolean_mask boolean_mask : a boolean column of same size as source_table Returns ------- - Frame obtained from applying mask + columns obtained from applying mask """ - assert pd.api.types.is_bool_dtype(boolean_mask.dtype) - cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table(source_table) + cdef table_view source_table_view = table_view_from_columns(columns) cdef column_view boolean_mask_view = boolean_mask.view() with nogil: @@ -103,13 +101,7 @@ def apply_boolean_mask(source_table, Column boolean_mask): ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if source_table._index - is None else source_table._index_names) - ) + return columns_from_unique_ptr(move(c_result)) def drop_duplicates(columns: list, diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index aa89b8f849f..683f3fefe1c 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -9,8 +9,15 @@ import pandas as pd import cudf +from cudf._lib.stream_compaction import apply_boolean_mask from cudf._typing import DtypeObj -from cudf.api.types import is_dtype_equal, is_integer, is_list_like, is_scalar +from cudf.api.types import ( + is_bool_dtype, + is_dtype_equal, + is_integer, + is_list_like, + is_scalar, +) from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column from cudf.core.column_accessor import ColumnAccessor @@ -1414,6 +1421,22 @@ def from_pandas(cls, index, nan_as_null=None): def _constructor_expanddim(self): return cudf.MultiIndex + def _apply_boolean_mask(self, boolean_mask): + """Apply boolean mask to each row of `self`. + + Rows corresponding to `False` is dropped. 
+ """ + boolean_mask = cudf.core.column.as_column(boolean_mask) + if not is_bool_dtype(boolean_mask.dtype): + raise ValueError("boolean_mask is not boolean type.") + + result = self.__class__._from_columns( + apply_boolean_mask(list(self._columns), boolean_mask), + column_names=self._column_names, + ) + result._copy_type_metadata(self) + return result + def _split_columns_by_levels(self, levels): if isinstance(levels, int) and levels > 0: raise ValueError(f"Out of bound level: {levels}") diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 18c86f82f9c..a2a909968dc 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -5,8 +5,8 @@ import numpy as np from cudf.core.column import as_column -from cudf.core.frame import Frame from cudf.core.index import Index, RangeIndex +from cudf.core.indexed_frame import IndexedFrame from cudf.core.series import Series @@ -92,7 +92,7 @@ def _index_or_values_interpolation(column, index=None): if num_nan == 0 or num_nan == len(column): return column - to_interp = Frame(data={None: column}, index=index) + to_interp = IndexedFrame(data={None: column}, index=index) known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask)) known_x = known_x_and_y._index._column.values diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c1e037499fc..a966276842f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -35,6 +35,7 @@ ) from cudf._lib.scalar import as_device_scalar from cudf._lib.stream_compaction import ( + apply_boolean_mask, distinct_count as cpp_distinct_count, drop_duplicates, drop_nulls, @@ -997,9 +998,12 @@ def as_decimal32_column( raise NotImplementedError def apply_boolean_mask(self, mask) -> ColumnBase: - mask = as_column(mask, dtype="bool") - return ( - self.as_frame()._apply_boolean_mask(boolean_mask=mask)._as_column() + mask = as_column(mask) + if not is_bool_dtype(mask.dtype): + raise ValueError("boolean_mask is not boolean type.") + + return apply_boolean_mask([self], mask)[0]._with_type_metadata( + self.dtype ) def argsort( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0345966d6bd..6e47c0f41cf 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1461,19 +1461,6 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): return self[out_cols] - def _apply_boolean_mask(self, boolean_mask): - """ - Applies boolean mask to each row of `self`, - rows corresponding to `False` is dropped - """ - result = self.__class__._from_data( - *libcudf.stream_compaction.apply_boolean_mask( - self, as_column(boolean_mask) - ) - ) - result._copy_type_metadata(self) - return result - def interpolate( self, method="linear", diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2f4d4a88195..7c5783bf637 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -19,6 +19,7 @@ from cudf._typing import ColumnLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, + is_bool_dtype, is_categorical_dtype, is_integer_dtype, is_list_like, @@ -1197,6 +1198,25 @@ def resample( else cudf.core.resample.DataFrameResampler(self, by=by) ) + def _apply_boolean_mask(self, boolean_mask): + """Apply boolean mask to each row of `self`. + + Rows corresponding to `False` is dropped. 
+        """
+        boolean_mask = cudf.core.column.as_column(boolean_mask)
+        if not is_bool_dtype(boolean_mask.dtype):
+            raise ValueError("boolean_mask is not boolean type.")
+
+        result = self.__class__._from_columns(
+            libcudf.stream_compaction.apply_boolean_mask(
+                list(self._index._columns + self._columns), boolean_mask
+            ),
+            column_names=self._column_names,
+            index_names=self._index.names,
+        )
+        result._copy_type_metadata(self)
+        return result
+
     def _reset_index(self, level, drop, col_level=0, col_fill=""):
         """Shared path for DataFrame.reset_index and Series.reset_index."""
         if level is not None and not isinstance(level, (tuple, list)):

From a43682e99ab618ec2028cd224abcfc56e2b2fabb Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Tue, 11 Jan 2022 19:20:48 -0500
Subject: [PATCH 05/23] cudftestutil no longer propagates compiler flags to external users (#10017)

Fixes #9952

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Keith Kraus (https://github.com/kkraus14)

URL: https://github.com/rapidsai/cudf/pull/10017
---
 cpp/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 84e486c7e18..a8100fb3f92 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -618,7 +618,7 @@ set_target_properties(
 )

 target_compile_options(
-  cudftestutil PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
+  cudftestutil PUBLIC "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>>"
                       "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>>"
 )

From 093b0ad62ba44b21df2f6f4d23949ef49469d824 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 12 Jan 2022 08:45:06 -0500
Subject: [PATCH 06/23] Add strings tests to transpose_test.cpp (#9985)

This is a follow on to PR #9937. Adds to the gtests in `transpose_test.cpp` to include strings as supported by `cudf::transpose`. No function has changed -- only additional tests have been added. The utility functions in `transpose_test.cpp` were enhanced to include string types by accepting a column-wrapper type template parameter as required.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/9985
---
 cpp/include/cudf_test/type_lists.hpp   |  9 +++-
 cpp/tests/transpose/transpose_test.cpp | 58 ++++++++++++++++----------
 2 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp
index 097d072a5b4..3c46b912639 100644
--- a/cpp/include/cudf_test/type_lists.hpp
+++ b/cpp/include/cudf_test/type_lists.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -158,6 +158,13 @@ std::enable_if_t<cudf::is_timestamp_t<TypeParam>::value, TypeParam> make_type_pa
   return TypeParam{typename TypeParam::duration(init_value)};
 }

+template <typename TypeParam, typename T>
+std::enable_if_t<std::is_same_v<TypeParam, std::string>, TypeParam> make_type_param_scalar(
+  T const init_value)
+{
+  return std::to_string(init_value);
+}
+
 /**
  * @brief Type list for all integral types except type bool.
 */
diff --git a/cpp/tests/transpose/transpose_test.cpp b/cpp/tests/transpose/transpose_test.cpp
index 7b7b7d8a4a9..e3d9808b211 100644
--- a/cpp/tests/transpose/transpose_test.cpp
+++ b/cpp/tests/transpose/transpose_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,9 +22,9 @@
 #include 
 #include 
 #include 
+#include 

 namespace {
-using cudf::test::fixed_width_column_wrapper;

 template <typename T, typename F>
 auto generate_vectors(size_t ncols, size_t nrows, F generator)
@@ -59,10 +59,10 @@ auto transpose_vectors(std::vector<std::vector<T>> const& input)
   return transposed;
 }

-template <typename T>
+template <typename ColumnWrapper, typename T>
 auto make_columns(std::vector<std::vector<T>> const& values)
 {
-  std::vector<fixed_width_column_wrapper<T>> columns;
+  std::vector<ColumnWrapper> columns;
   columns.reserve(values.size());

   for (auto const& value_col : values) {
@@ -72,11 +72,11 @@ auto make_columns(std::vector<std::vector<T>> const& values)
   return columns;
 }

-template <typename T>
+template <typename ColumnWrapper, typename T>
 auto make_columns(std::vector<std::vector<T>> const& values,
                   std::vector<std::vector<bool>> const& valids)
 {
-  std::vector<fixed_width_column_wrapper<T>> columns;
+  std::vector<ColumnWrapper> columns;
   columns.reserve(values.size());

   for (size_t col = 0; col < values.size(); ++col) {
@@ -86,15 +86,14 @@ auto make_columns(std::vector<std::vector<T>> const& values,
   return columns;
 }

-template <typename T>
-auto make_table_view(std::vector<fixed_width_column_wrapper<T>> const& cols)
+template <typename ColumnWrapper>
+auto make_table_view(std::vector<ColumnWrapper> const& cols)
 {
   std::vector<cudf::column_view> views(cols.size());

-  std::transform(
-    cols.begin(), cols.end(), views.begin(), [](fixed_width_column_wrapper<T> const& col) {
-      return static_cast<cudf::column_view>(col);
-    });
+  std::transform(cols.begin(), cols.end(), views.begin(), [](auto const& col) {
+    return static_cast<cudf::column_view>(col);
+  });

   return cudf::table_view(views);
 }
@@ -102,6 +101,10 @@ auto make_table_view(std::vector<fixed_width_column_wrapper<T>> const& cols)
 template <typename T>
 void run_test(size_t ncols, size_t nrows, bool add_nulls)
 {
+  using ColumnWrapper = std::conditional_t<std::is_same_v<T, std::string>,
+                                           cudf::test::strings_column_wrapper,
+                                           cudf::test::fixed_width_column_wrapper<T>>;
+
   std::mt19937 rng(1);

   // Generate values as vector of vectors
@@ -109,8 +112,8 @@ void run_test(size_t ncols, size_t nrows, bool add_nulls)
     ncols, nrows, [&rng]() { return cudf::test::make_type_param_scalar<T>(rng()); });
   auto const valuesT = transpose_vectors(values);

-  std::vector<fixed_width_column_wrapper<T>> input_cols;
-  std::vector<fixed_width_column_wrapper<T>> expected_cols;
+  std::vector<ColumnWrapper> input_cols;
+  std::vector<ColumnWrapper> expected_cols;
   std::vector expected_nulls(nrows);

   if (add_nulls) {
@@ -129,11 +132,11 @@ void run_test(size_t ncols, size_t nrows, bool add_nulls)
     });

     // Create column wrappers from vector of vectors
-    input_cols    = make_columns(values, valids);
-    expected_cols = make_columns(valuesT, validsT);
+    input_cols    = make_columns<ColumnWrapper>(values, valids);
+    expected_cols = make_columns<ColumnWrapper>(valuesT, validsT);
   } else {
-    input_cols    = make_columns(values);
-    expected_cols = make_columns(valuesT);
+    input_cols    = make_columns<ColumnWrapper>(values);
+    expected_cols = make_columns<ColumnWrapper>(valuesT);
   }

   // Create table views from column wrappers
@@ -158,7 +161,13 @@ template <typename T>
 class TransposeTest : public cudf::test::BaseFixture {
 };

-TYPED_TEST_SUITE(TransposeTest, cudf::test::FixedWidthTypes);
+// Using std::string here instead of cudf::test::StringTypes allows us to
+// use std::vector utilities in this file just like the fixed-width types.
+// Should consider changing cudf::test::StringTypes to std::string instead of cudf::string_view.
+using StdStringType = cudf::test::Types; +using TransposeTypes = cudf::test::Concat; + +TYPED_TEST_SUITE(TransposeTest, TransposeTypes); // cudf::test::FixedWidthTypes); TYPED_TEST(TransposeTest, SingleValue) { run_test(1, 1, false); } @@ -182,11 +191,14 @@ TYPED_TEST(TransposeTest, EmptyTable) { run_test(0, 0, false); } TYPED_TEST(TransposeTest, EmptyColumns) { run_test(10, 0, false); } -TYPED_TEST(TransposeTest, MismatchedColumns) +class TransposeTestError : public cudf::test::BaseFixture { +}; + +TEST_F(TransposeTestError, MismatchedColumns) { - fixed_width_column_wrapper col1({1, 2, 3}); - fixed_width_column_wrapper col2{{4, 5, 6}}; - fixed_width_column_wrapper col3{{7, 8, 9}}; + cudf::test::fixed_width_column_wrapper col1({1, 2, 3}); + cudf::test::fixed_width_column_wrapper col2{{4, 5, 6}}; + cudf::test::fixed_width_column_wrapper col3{{7, 8, 9}}; cudf::table_view input{{col1, col2, col3}}; EXPECT_THROW(cudf::transpose(input), cudf::logic_error); } From 76f89db80a64a2aa49b618aad80fe80e34e0332f Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 12 Jan 2022 12:57:00 -0800 Subject: [PATCH 07/23] Update JNI to use new arena mr constructor (#10027) And fix a failing test. Authors: - Rong Ou (https://github.com/rongou) Approvers: - Jason Lowe (https://github.com/jlowe) - Robert (Bobby) Evans (https://github.com/revans2) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/10027 --- java/src/main/native/src/RmmJni.cpp | 6 +++--- java/src/test/java/ai/rapids/cudf/RmmTest.java | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index d07b754c8db..769e8d2f356 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -348,10 +348,10 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j } else if (use_arena_alloc) { if (use_managed_mem) { Initialized_resource = rmm::mr::make_owning_wrapper( - std::make_shared(), pool_size, pool_size); + std::make_shared(), pool_size); } else { Initialized_resource = rmm::mr::make_owning_wrapper( - std::make_shared(), pool_size, pool_size); + std::make_shared(), pool_size); } } else if (use_cuda_async_alloc) { // Use `limiting_resource_adaptor` to set a hard limit on the max pool size since diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java index f9d097158b6..c56b131de86 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -414,7 +414,7 @@ public void testCudaAsyncIsIncompatibleWithManaged() { @Test public void testCudaMemoryBuffer() { - Rmm.initialize(RmmAllocationMode.ARENA, Rmm.logToStderr(), 1024); + Rmm.initialize(RmmAllocationMode.ARENA, Rmm.logToStderr(), 8 * 1024 * 1024); try (CudaMemoryBuffer one = CudaMemoryBuffer.allocate(512); CudaMemoryBuffer two = CudaMemoryBuffer.allocate(1024)) { assertEquals(512, one.length); From b8c4816d2ce5205e7b88e5f9be74bf4ea75dfbf5 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 12 Jan 2022 15:11:17 -0600 Subject: [PATCH 08/23] Unpin `dask` and `distributed` in CI (#10028) This PR unpins dask and distributed in CI. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/10028 --- ci/benchmark/build.sh | 2 +- ci/gpu/build.sh | 2 +- conda/environments/cudf_dev_cuda11.5.yml | 4 ++-- conda/recipes/dask-cudf/meta.yaml | 8 ++++---- python/dask_cudf/dev_requirements.txt | 4 ++-- python/dask_cudf/setup.py | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 59bd908d151..534ac19ee98 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -37,7 +37,7 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" # Dask & Distributed git tag -export DASK_DISTRIBUTED_GIT_TAG='2021.11.2' +export DASK_DISTRIBUTED_GIT_TAG='main' function remove_libcudf_kernel_cache_dir { EXITCODE=$? diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index a8afc03af94..39a39c46eff 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # Dask & Distributed git tag -export DASK_DISTRIBUTED_GIT_TAG='2021.11.2' +export DASK_DISTRIBUTED_GIT_TAG='main' # ucx-py version export UCX_PY_VERSION='0.24.*' diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index cc8d50a1717..c258a5caabb 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -41,8 +41,8 @@ dependencies: - pydocstyle=6.1.1 - typing_extensions - pre-commit - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 + - dask>=2021.11.1 + - distributed>=2021.11.1 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index da8bcea430a..fd34ff4112d 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -27,14 +27,14 @@ requirements: host: - python - cudf {{ version }} - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 + - dask>=2021.11.1 + - distributed>=2021.11.1 - cudatoolkit {{ cuda_version }} run: - python - cudf {{ version }} - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 + - dask>=2021.11.1 + - distributed>=2021.11.1 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} test: # [linux64] diff --git a/python/dask_cudf/dev_requirements.txt b/python/dask_cudf/dev_requirements.txt index db85515f379..d8b0745be79 100644 --- a/python/dask_cudf/dev_requirements.txt +++ b/python/dask_cudf/dev_requirements.txt @@ -1,7 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
-dask>=2021.11.1,<=2021.11.2 -distributed>=2021.11.1,<=2021.11.2 +dask>=2021.11.1 +distributed>=2021.11.1 fsspec>=0.6.0 numba>=0.53.1 numpy diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index b52c2ea37d6..425839772eb 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -10,8 +10,8 @@ install_requires = [ "cudf", - "dask>=2021.11.1,<=2021.11.2", - "distributed>=2021.11.1,<=2021.11.2", + "dask>=2021.11.1", + "distributed>=2021.11.1", "fsspec>=0.6.0", "numpy", "pandas>=1.0,<1.4.0dev0", From 3176258bb2f1cdd03d80be54fe52208885fc44da Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 12 Jan 2022 15:41:32 -0800 Subject: [PATCH 09/23] Return null count from inplace_bitmask_and. (#9904) This PR updates the function `cudf::detail::inplace_bitmask_and` to return the null count of the result. This change aligns `inplace_bitmask_and` with behavior changes introduced in #9616 to return null counts from functions acting on bitmasks. This will be helpful for #9621. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Nghia Truong (https://github.com/ttnghia) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/9904 --- cpp/include/cudf/detail/null_mask.hpp | 4 ++-- cpp/src/bitmask/null_mask.cu | 14 +++++++------- cpp/src/structs/utilities.cpp | 5 +++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 6ee406de5ef..83ef78a8250 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -268,9 +268,9 @@ std::pair bitmask_or( * @param mask_size_bits The number of bits to be ANDed in each mask * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer - * @return rmm::device_buffer Output bitmask + * @return Count of set bits */ -void inplace_bitmask_and( +cudf::size_type inplace_bitmask_and( device_span dest_mask, host_span masks, host_span masks_begin_bits, diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index ec3776fb6d5..d1107ad3cfd 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -404,14 +404,14 @@ std::vector segmented_null_count(const bitmask_type* bitmask, } // Inplace Bitwise AND of the masks -void inplace_bitmask_and(device_span dest_mask, - host_span masks, - host_span begin_bits, - size_type mask_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +cudf::size_type inplace_bitmask_and(device_span dest_mask, + host_span masks, + host_span begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - inplace_bitmask_binop( + return inplace_bitmask_binop( [] __device__(bitmask_type left, bitmask_type right) { return left & right; }, dest_mask, masks, diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 43a32c8405a..afea8a55b16 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -322,14 +322,15 @@ void superimpose_parent_nulls(bitmask_type const* parent_null_mask, reinterpret_cast(parent_null_mask), reinterpret_cast(current_child_mask)}; std::vector begin_bits{0, 0}; - cudf::detail::inplace_bitmask_and( + auto const valid_count = cudf::detail::inplace_bitmask_and( device_span(current_child_mask, num_bitmask_words(child.size())), masks, begin_bits, child.size(), stream, mr); - 
child.set_null_count(UNKNOWN_NULL_COUNT); + auto const null_count = child.size() - valid_count; + child.set_null_count(null_count); } // If the child is also a struct, repeat for all grandchildren. From 4950a7ae376200d086d4108edb572d164f4e81c8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 12 Jan 2022 20:16:34 -0800 Subject: [PATCH 10/23] Remove deprecated `method` parameter from `merge` and `join`. (#9944) This PR removes the deprecated `method` parameter from `DataFrame.merge` and `DataFrame.join`. This resolves #9353 and follows up on #9291. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/9944 --- python/cudf/cudf/core/dataframe.py | 30 +------------------------- python/cudf/cudf/tests/test_joining.py | 4 ++-- 2 files changed, 3 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fe6ac8e1529..123f86cc200 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3438,7 +3438,6 @@ def merge( sort=False, lsuffix=None, rsuffix=None, - method=None, indicator=False, suffixes=("_x", "_y"), ): @@ -3490,9 +3489,6 @@ def merge( suffixes: Tuple[str, str], defaults to ('_x', '_y') Suffixes applied to overlapping column names on the left and right sides - method : - This parameter is unused. It is deprecated and will be removed in a - future version. Returns ------- @@ -3554,13 +3550,6 @@ def merge( else: lsuffix, rsuffix = suffixes - if method is not None: - warnings.warn( - "The 'method' argument is deprecated and will be removed " - "in a future version of cudf.", - FutureWarning, - ) - # Compute merge gdf_result = super()._merge( right, @@ -3578,14 +3567,7 @@ def merge( @annotate("JOIN", color="blue", domain="cudf_python") def join( - self, - other, - on=None, - how="left", - lsuffix="", - rsuffix="", - sort=False, - method=None, + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False, ): """Join columns with other DataFrame on index or on a key column. @@ -3599,9 +3581,6 @@ def join( column names when avoiding conflicts. sort : bool Set to True to ensure sorted ordering. - method : - This parameter is unused. It is deprecated and will be removed in a - future version. Returns ------- @@ -3615,13 +3594,6 @@ def join( - *on* is not supported yet due to lack of multi-index support. 
""" - if method is not None: - warnings.warn( - "The 'method' argument is deprecated and will be removed " - "in a future version of cudf.", - FutureWarning, - ) - df = self.merge( other, left_index=True, diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index d25c6130bfb..2fb7393f5b4 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -256,7 +256,7 @@ def test_dataframe_join_mismatch_cats(how): pdf1 = pdf1.set_index("join_col") pdf2 = pdf2.set_index("join_col") - join_gdf = gdf1.join(gdf2, how=how, sort=True, method="hash") + join_gdf = gdf1.join(gdf2, how=how, sort=True) join_pdf = pdf1.join(pdf2, how=how) got = join_gdf.fillna(-1).to_pandas() @@ -403,7 +403,7 @@ def test_dataframe_merge_order(): gdf2["id"] = [4, 5] gdf2["a"] = [7, 8] - gdf = gdf1.merge(gdf2, how="left", on=["id", "a"], method="hash") + gdf = gdf1.merge(gdf2, how="left", on=["id", "a"]) df1 = pd.DataFrame() df2 = pd.DataFrame() From fe71baba07f4b582b5ec2e36ad50301f1186ca34 Mon Sep 17 00:00:00 2001 From: MithunR Date: Thu, 13 Jan 2022 06:11:42 -0800 Subject: [PATCH 11/23] Fix memory leaks in JNI native code. (#10029) This commit fixes a couple of minor, host-side memory leaks in the JNI native code. The objects in question did not need to go on the heap. They have, in this commit, been switched to stack objects. Authors: - MithunR (https://github.com/mythrocks) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/10029 --- java/src/main/native/src/ColumnVectorJni.cpp | 4 ++-- java/src/main/native/src/ColumnViewJni.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index e61ab8444d1..b0286f9ac27 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -359,10 +359,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_hash(JNIEnv *env, jobje std::transform(n_cudf_columns.data(), n_cudf_columns.data() + n_cudf_columns.size(), std::back_inserter(column_views), [](auto const &p_column) { return *p_column; }); - cudf::table_view *input_table = new cudf::table_view(column_views); + cudf::table_view input_table{column_views}; std::unique_ptr result = - cudf::hash(*input_table, static_cast(hash_function_id), seed); + cudf::hash(input_table, static_cast(hash_function_id), seed); return reinterpret_cast(result.release()); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 73ea49c18d9..d2cc2ab7d1c 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -1604,17 +1604,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit std::transform(n_cudf_columns.data(), n_cudf_columns.data() + n_cudf_columns.size(), std::back_inserter(column_views), [](auto const &p_column) { return *p_column; }); - cudf::table_view *input_table = new cudf::table_view(column_views); + cudf::table_view input_table{column_views}; cudf::binary_operator op = static_cast(bin_op); switch (op) { case cudf::binary_operator::BITWISE_AND: { - auto [new_bitmask, null_count] = cudf::bitmask_and(*input_table); + auto [new_bitmask, null_count] = cudf::bitmask_and(input_table); copy->set_null_mask(std::move(new_bitmask), null_count); break; } case cudf::binary_operator::BITWISE_OR: { - auto [new_bitmask, null_count] = cudf::bitmask_or(*input_table); + auto [new_bitmask, null_count] = cudf::bitmask_or(input_table); copy->set_null_mask(std::move(new_bitmask), null_count); break; } From d0c85e152db772b37d713b0f86ae787311d673ac Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 13 Jan 2022 12:21:56 -0500 Subject: [PATCH 12/23] build.sh respects the `--build_metrics` and `--incl_cache_stats` flags (#10035) Previously the script would do a comparison check like "$V"=="ON" which isn't a comparison in bash but a joining of strings ( "OFF==ON"). To do a comparison of two strings you need to have white space around `==`. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10035 --- build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index f5a59b6edcf..45074a6645f 100755 --- a/build.sh +++ b/build.sh @@ -186,7 +186,7 @@ if buildAll || hasArg libcudf; then # get the current count before the compile starts FILES_IN_CCACHE="" - if [[ "$BUILD_REPORT_INCL_CACHE_STATS"=="ON" && -x "$(command -v ccache)" ]]; then + if [[ "$BUILD_REPORT_INCL_CACHE_STATS" == "ON" && -x "$(command -v ccache)" ]]; then FILES_IN_CCACHE=$(ccache -s | grep "files in cache") echo "$FILES_IN_CCACHE" # zero the ccache statistics @@ -212,7 +212,7 @@ if buildAll || hasArg libcudf; then compile_total=$(( compile_end - compile_start )) # Record build times - if [[ "$BUILD_REPORT_METRICS"=="ON" && -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then + if [[ "$BUILD_REPORT_METRICS" == "ON" && -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then echo "Formatting build metrics" python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt xml > ${LIB_BUILD_DIR}/ninja_log.xml MSG="

" From dbe65f1b977d76c93932a42ec8047690e84f0267 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 13 Jan 2022 11:09:02 -0700 Subject: [PATCH 13/23] Fix null check when comparing structs in `arg_min` operation of reduction/groupby (#10026) This is another fix for https://github.com/NVIDIA/spark-rapids/pull/4434, when the null order is wrongly handled if the input structs column does not have nulls at the top level but only has null at the children levels. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/10026 --- cpp/src/reductions/struct_minmax_util.cuh | 15 +++---- cpp/tests/groupby/max_tests.cpp | 51 ++++++++++++++++------- cpp/tests/groupby/min_tests.cpp | 51 ++++++++++++++++------- 3 files changed, 80 insertions(+), 37 deletions(-) diff --git a/cpp/src/reductions/struct_minmax_util.cuh b/cpp/src/reductions/struct_minmax_util.cuh index e5832b849bd..1de48ef482d 100644 --- a/cpp/src/reductions/struct_minmax_util.cuh +++ b/cpp/src/reductions/struct_minmax_util.cuh @@ -103,13 +103,14 @@ class comparison_binop_generator { { if (is_min_op) { null_orders = flattened_input.null_orders(); - // Null structs are excluded from the operations, and that is equivalent to considering - // nulls as larger than all other non-null STRUCT elements (if finding for ARGMIN), or - // smaller than all other non-null STRUCT elements (if finding for ARGMAX). - // Thus, we need to set a separate null order for the top level structs column (which is - // stored at the first position in the null_orders array) to achieve this purpose. - null_orders.front() = cudf::null_order::AFTER; - null_orders_dvec = cudf::detail::make_device_uvector_async(null_orders, stream); + // If the input column has nulls (at the top level), null structs are excluded from the + // operations, and that is equivalent to considering top-level nulls as larger than all other + // non-null STRUCT elements (if finding for ARGMIN), or smaller than all other non-null STRUCT + // elements (if finding for ARGMAX). Thus, we need to set a separate null order for the top + // level structs column (which is stored at the first position in the null_orders array) to + // achieve this purpose. + if (input.has_nulls()) { null_orders.front() = cudf::null_order::AFTER; } + null_orders_dvec = cudf::detail::make_device_uvector_async(null_orders, stream); } // else: Don't need to generate nulls order to copy to device memory if we have all null orders // are BEFORE (that happens when we have is_min_op == false). 
diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp index 47bed11df30..266312d16a2 100644 --- a/cpp/tests/groupby/max_tests.cpp +++ b/cpp/tests/groupby/max_tests.cpp @@ -391,22 +391,43 @@ TEST_F(groupby_max_struct_test, null_keys_and_values) TEST_F(groupby_max_struct_test, values_with_null_child) { constexpr int32_t null{0}; - auto const keys = fixed_width_column_wrapper{1, 1}; - auto const vals = [] { - auto child1 = fixed_width_column_wrapper{1, 1}; - auto child2 = fixed_width_column_wrapper{{-1, null}, null_at(1)}; - return structs_column_wrapper{child1, child2}; - }(); - - auto const expect_keys = fixed_width_column_wrapper{1}; - auto const expect_vals = [] { - auto child1 = fixed_width_column_wrapper{1}; - auto child2 = fixed_width_column_wrapper{-1}; - return structs_column_wrapper{child1, child2}; - }(); + { + auto const keys = fixed_width_column_wrapper{1, 1}; + auto const vals = [] { + auto child1 = fixed_width_column_wrapper{1, 1}; + auto child2 = fixed_width_column_wrapper{{-1, null}, null_at(1)}; + return structs_column_wrapper{child1, child2}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{1}; + auto const expect_vals = [] { + auto child1 = fixed_width_column_wrapper{1}; + auto child2 = fixed_width_column_wrapper{-1}; + return structs_column_wrapper{child1, child2}; + }(); + + auto agg = cudf::make_max_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + } - auto agg = cudf::make_max_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + { + auto const keys = fixed_width_column_wrapper{1, 1}; + auto const vals = [] { + auto child1 = fixed_width_column_wrapper{{-1, null}, null_at(1)}; + auto child2 = fixed_width_column_wrapper{{null, null}, nulls_at({0, 1})}; + return structs_column_wrapper{child1, child2}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{1}; + auto const expect_vals = [] { + auto child1 = fixed_width_column_wrapper{-1}; + auto child2 = fixed_width_column_wrapper{{null}, null_at(0)}; + return structs_column_wrapper{child1, child2}; + }(); + + auto agg = cudf::make_max_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + } } } // namespace test diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp index 64bffe1c883..00fa282cee4 100644 --- a/cpp/tests/groupby/min_tests.cpp +++ b/cpp/tests/groupby/min_tests.cpp @@ -390,22 +390,43 @@ TEST_F(groupby_min_struct_test, null_keys_and_values) TEST_F(groupby_min_struct_test, values_with_null_child) { constexpr int32_t null{0}; - auto const keys = fixed_width_column_wrapper{1, 1}; - auto const vals = [] { - auto child1 = fixed_width_column_wrapper{1, 1}; - auto child2 = fixed_width_column_wrapper{{-1, null}, null_at(1)}; - return structs_column_wrapper{child1, child2}; - }(); - - auto const expect_keys = fixed_width_column_wrapper{1}; - auto const expect_vals = [] { - auto child1 = fixed_width_column_wrapper{1}; - auto child2 = fixed_width_column_wrapper{{null}, null_at(0)}; - return structs_column_wrapper{child1, child2}; - }(); + { + auto const keys = fixed_width_column_wrapper{1, 1}; + auto const vals = [] { + auto child1 = fixed_width_column_wrapper{1, 1}; + auto child2 = fixed_width_column_wrapper{{-1, null}, null_at(1)}; + return structs_column_wrapper{child1, child2}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{1}; + auto const expect_vals = [] { + auto child1 = fixed_width_column_wrapper{1}; + auto child2 = 
fixed_width_column_wrapper{{null}, null_at(0)}; + return structs_column_wrapper{child1, child2}; + }(); + + auto agg = cudf::make_min_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + } - auto agg = cudf::make_min_aggregation(); - test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + { + auto const keys = fixed_width_column_wrapper{1, 1}; + auto const vals = [] { + auto child1 = fixed_width_column_wrapper{{-1, null}, null_at(1)}; + auto child2 = fixed_width_column_wrapper{{null, null}, nulls_at({0, 1})}; + return structs_column_wrapper{child1, child2}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{1}; + auto const expect_vals = [] { + auto child1 = fixed_width_column_wrapper{{null}, null_at(0)}; + auto child2 = fixed_width_column_wrapper{{null}, null_at(0)}; + return structs_column_wrapper{child1, child2}; + }(); + + auto agg = cudf::make_min_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); + } } } // namespace test From c07fdabb491815eab8e4eb7655b2946218ee8a42 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Thu, 13 Jan 2022 12:16:36 -0600 Subject: [PATCH 14/23] Load balance optimization for contiguous_split (#9755) The existing `contiguous_split` implementation was vulnerable to situations where `number of columns N * number of splits M` was < the number of SMs on the gpu. This PR implements a postprocessing step which attempts to distribute the amount of bytes to be copied as evenly as possible across all available SMs. PR has been updated to repartition using a constant chunk size of 1 MB. This yields better results than the initial approach. Before/after benchmarks for some particularly degenerate cases (T4) ``` Before (4 partitions) 4GB, 4 columns, no splits 43.3 ms 43.3 ms 8 bytes_per_second=46.1738G/s After 4GB, 4 columns, no splits 10.1 ms 10.1 ms 8 bytes_per_second=198.642G/s ``` ``` Before (2 partitions) 1GB, 1 column + validity, no splits 114 ms 114 ms 8 bytes_per_second=17.5212G/s After 1GB, 1 column + validity, no splits 10.5 ms 10.6 ms 8 bytes_per_second=189.784G/s ``` Authors: - https://github.com/nvdbaranec Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Vyas Ramasubramani (https://github.com/vyasr) - Elias Stehle (https://github.com/elstehle) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/9755 --- .../copying/contiguous_split_benchmark.cu | 65 +++-- cpp/src/copying/contiguous_split.cu | 248 +++++++++++++++--- cpp/tests/copying/split_tests.cpp | 27 ++ 3 files changed, 277 insertions(+), 63 deletions(-) diff --git a/cpp/benchmarks/copying/contiguous_split_benchmark.cu b/cpp/benchmarks/copying/contiguous_split_benchmark.cu index 506d676d196..55e1360efc8 100644 --- a/cpp/benchmarks/copying/contiguous_split_benchmark.cu +++ b/cpp/benchmarks/copying/contiguous_split_benchmark.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,10 +34,18 @@ void BM_contiguous_split_common(benchmark::State& state, int64_t bytes_total) { // generate splits - cudf::size_type split_stride = num_rows / num_splits; std::vector splits; - for (int idx = 0; idx < num_rows; idx += split_stride) { - splits.push_back(std::min(idx + split_stride, static_cast(num_rows))); + if (num_splits > 0) { + cudf::size_type const split_stride = num_rows / num_splits; + // start after the first element. + auto iter = thrust::make_counting_iterator(1); + splits.reserve(num_splits); + std::transform(iter, + iter + num_splits, + std::back_inserter(splits), + [split_stride, num_rows](cudf::size_type i) { + return std::min(i * split_stride, static_cast(num_rows)); + }); } std::vector> columns(src_cols.size()); @@ -53,7 +61,8 @@ void BM_contiguous_split_common(benchmark::State& state, auto result = cudf::contiguous_split(src_table, splits); } - state.SetBytesProcessed(static_cast(state.iterations()) * bytes_total); + // it's 2x bytes_total because we're both reading and writing. + state.SetBytesProcessed(static_cast(state.iterations()) * bytes_total * 2); } class ContiguousSplit : public cudf::benchmark { @@ -61,13 +70,13 @@ class ContiguousSplit : public cudf::benchmark { void BM_contiguous_split(benchmark::State& state) { - int64_t total_desired_bytes = state.range(0); - cudf::size_type num_cols = state.range(1); - cudf::size_type num_splits = state.range(2); - bool include_validity = state.range(3) == 0 ? false : true; + int64_t const total_desired_bytes = state.range(0); + cudf::size_type const num_cols = state.range(1); + cudf::size_type const num_splits = state.range(2); + bool const include_validity = state.range(3) == 0 ? false : true; cudf::size_type el_size = 4; // ints and floats - int64_t num_rows = total_desired_bytes / (num_cols * el_size); + int64_t const num_rows = total_desired_bytes / (num_cols * el_size); // generate input table srand(31337); @@ -85,8 +94,10 @@ void BM_contiguous_split(benchmark::State& state) } } - size_t total_bytes = total_desired_bytes; - if (include_validity) { total_bytes += num_rows / (sizeof(cudf::bitmask_type) * 8); } + int64_t const total_bytes = + total_desired_bytes + + (include_validity ? (max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols) + : 0); BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes); } @@ -102,17 +113,17 @@ int rand_range(int r) void BM_contiguous_split_strings(benchmark::State& state) { - int64_t total_desired_bytes = state.range(0); - cudf::size_type num_cols = state.range(1); - cudf::size_type num_splits = state.range(2); - bool include_validity = state.range(3) == 0 ? false : true; + int64_t const total_desired_bytes = state.range(0); + cudf::size_type const num_cols = state.range(1); + cudf::size_type const num_splits = state.range(2); + bool const include_validity = state.range(3) == 0 ? 
false : true; - const int64_t string_len = 8; + constexpr int64_t string_len = 8; std::vector h_strings{ "aaaaaaaa", "bbbbbbbb", "cccccccc", "dddddddd", "eeeeeeee", "ffffffff", "gggggggg", "hhhhhhhh"}; - int64_t col_len_bytes = total_desired_bytes / num_cols; - int64_t num_rows = col_len_bytes / string_len; + int64_t const col_len_bytes = total_desired_bytes / num_cols; + int64_t const num_rows = col_len_bytes / string_len; // generate input table srand(31337); @@ -133,8 +144,10 @@ void BM_contiguous_split_strings(benchmark::State& state) } } - size_t total_bytes = total_desired_bytes + (num_rows * sizeof(cudf::size_type)); - if (include_validity) { total_bytes += num_rows / (sizeof(cudf::bitmask_type) * 8); } + int64_t const total_bytes = + total_desired_bytes + ((num_rows + 1) * sizeof(cudf::offset_type)) + + (include_validity ? (max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols) + : 0); BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes); } @@ -157,12 +170,16 @@ CSBM_BENCHMARK_DEFINE(6Gb10ColsValidity, (int64_t)6 * 1024 * 1024 * 1024, 10, 25 CSBM_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 256, 0); CSBM_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 256, 1); CSBM_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 0); -CSBM_BENCHMARK_DEFINE(46b10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 1); +CSBM_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 1); +CSBM_BENCHMARK_DEFINE(4Gb4ColsNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1); +CSBM_BENCHMARK_DEFINE(4Gb4ColsValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1); CSBM_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 256, 0); CSBM_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 256, 1); CSBM_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 0); CSBM_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 1); +CSBM_BENCHMARK_DEFINE(1Gb1ColNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1); +CSBM_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1); #define CSBM_STRINGS_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \ BENCHMARK_DEFINE_F(ContiguousSplitStrings, name)(::benchmark::State & state) \ @@ -179,8 +196,12 @@ CSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1 CSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 256, 1); CSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 0); CSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 256, 1); +CSBM_STRINGS_BENCHMARK_DEFINE(4Gb4ColsNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 0); +CSBM_STRINGS_BENCHMARK_DEFINE(4Gb4ColsValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 4, 0, 1); CSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 256, 0); CSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 256, 1); CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 0); CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 1); +CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 0); +CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, 
(int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1); diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 8dc93bc1de3..f8c0006ed45 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -82,16 +83,21 @@ struct src_buf_info { * M partitions, then we have N*M destination buffers. */ struct dst_buf_info { + // constant across all copy commands for this buffer std::size_t buf_size; // total size of buffer, including padding int num_elements; // # of elements to be copied int element_size; // size of each element in bytes - int num_rows; // # of rows (which may be different from num_elements in the case of validity or - // offset buffers) - int src_row_index; // row index to start reading from from my associated source buffer + int num_rows; // # of rows to be copied(which may be different from num_elements in the case of + // validity or offset buffers) + + int src_element_index; // element index to start reading from from my associated source buffer std::size_t dst_offset; // my offset into the per-partition allocation int value_shift; // amount to shift values down by (for offset buffers) int bit_shift; // # of bits to shift right by (for validity buffers) - size_type valid_count; + size_type valid_count; // validity count for this block of work + + int src_buf_index; // source buffer index + int dst_buf_index; // destination buffer index }; /** @@ -116,7 +122,7 @@ struct dst_buf_info { * @param t Thread index * @param num_elements Number of elements to copy * @param element_size Size of each element in bytes - * @param src_row_index Row index to start copying at + * @param src_element_index Element index to start copying at * @param stride Size of the kernel block * @param value_shift Shift incoming 4-byte offset values down by this amount * @param bit_shift Shift incoming data right by this many bits @@ -129,14 +135,14 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, int t, std::size_t num_elements, std::size_t element_size, - std::size_t src_row_index, + std::size_t src_element_index, uint32_t stride, int value_shift, int bit_shift, std::size_t num_rows, size_type* valid_count) { - src += (src_row_index * element_size); + src += (src_element_index * element_size); size_type thread_valid_count = 0; @@ -240,38 +246,36 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, } /** - * @brief Kernel which copies a single buffer from a set of partitioned - * column buffers. + * @brief Kernel which copies data from multiple source buffers to multiple + * destination buffers. * * When doing a contiguous_split on X columns comprising N total internal buffers * with M splits, we end up having to copy N*M source/destination buffer pairs. + * These logical copies are further subdivided to distribute the amount of work + * to be done as evenly as possible across the multiprocessors on the device. * This kernel is arranged such that each block copies 1 source/destination pair. - * This function retrieves the relevant buffers and then calls copy_buffer to perform - * the actual copy. 
* - * @param num_src_bufs Total number of source buffers (N) - * @param src_bufs Input source buffers (N) - * @param dst_bufs Destination buffers (N*M) + * @param src_bufs Input source buffers + * @param dst_bufs Destination buffers * @param buf_info Information on the range of values to be copied for each destination buffer. */ template -__global__ void copy_partition(int num_src_bufs, - uint8_t const** src_bufs, - uint8_t** dst_bufs, - dst_buf_info* buf_info) +__global__ void copy_partitions(uint8_t const** src_bufs, + uint8_t** dst_bufs, + dst_buf_info* buf_info) { - int const partition_index = blockIdx.x / num_src_bufs; - int const src_buf_index = blockIdx.x % num_src_bufs; - std::size_t const buf_index = (partition_index * num_src_bufs) + src_buf_index; + auto const buf_index = blockIdx.x; + auto const src_buf_index = buf_info[buf_index].src_buf_index; + auto const dst_buf_index = buf_info[buf_index].dst_buf_index; // copy, shifting offsets and validity bits as needed copy_buffer( - dst_bufs[partition_index] + buf_info[buf_index].dst_offset, + dst_bufs[dst_buf_index] + buf_info[buf_index].dst_offset, src_bufs[src_buf_index], threadIdx.x, buf_info[buf_index].num_elements, buf_info[buf_index].element_size, - buf_info[buf_index].src_row_index, + buf_info[buf_index].src_element_index, blockDim.x, buf_info[buf_index].value_shift, buf_info[buf_index].bit_shift, @@ -728,9 +732,32 @@ struct dst_offset_output_iterator { using reference = std::size_t&; using iterator_category = thrust::output_device_iterator_tag; - dst_offset_output_iterator operator+ __host__ __device__(int i) + dst_offset_output_iterator operator+ __host__ __device__(int i) { return {c + i}; } + + void operator++ __host__ __device__() { c++; } + + reference operator[] __device__(int i) { return dereference(c + i); } + reference operator* __device__() { return dereference(c); } + + private: + reference __device__ dereference(dst_buf_info* c) { return c->dst_offset; } +}; + +/** + * @brief Output iterator for writing values to the valid_count field of the + * dst_buf_info struct + */ +struct dst_valid_count_output_iterator { + dst_buf_info* c; + using value_type = size_type; + using difference_type = size_type; + using pointer = size_type*; + using reference = size_type&; + using iterator_category = thrust::output_device_iterator_tag; + + dst_valid_count_output_iterator operator+ __host__ __device__(int i) { - return dst_offset_output_iterator{c + i}; + return dst_valid_count_output_iterator{c + i}; } void operator++ __host__ __device__() { c++; } @@ -739,7 +766,7 @@ struct dst_offset_output_iterator { reference operator* __device__() { return dereference(c); } private: - reference __device__ dereference(dst_buf_info* c) { return c->dst_offset; } + reference __device__ dereference(dst_buf_info* c) { return c->valid_count; } }; /** @@ -762,6 +789,148 @@ struct size_of_helper { } }; +/** + * @brief Functor for returning the number of chunks an input buffer is being + * subdivided into during the repartitioning step. + * + * Note: columns types which themselves inherently have no data (strings, lists, + * structs) return 0. 
+ */
+struct num_chunks_func {
+  thrust::pair<std::size_t, std::size_t> const* chunks;
+  __device__ size_t operator()(size_type i) const { return thrust::get<0>(chunks[i]); }
+};
+
+void copy_data(int num_bufs,
+               int num_src_bufs,
+               uint8_t const** d_src_bufs,
+               uint8_t** d_dst_bufs,
+               dst_buf_info* _d_dst_buf_info,
+               rmm::cuda_stream_view stream)
+{
+  // Since we parallelize at one block per copy, we are vulnerable to situations where we
+  // have small numbers of copies to do (a combination of small numbers of splits and/or columns),
+  // so we will take the actual set of outgoing source/destination buffers and further partition
+  // them into much smaller chunks in order to drive up the number of blocks and overall occupancy.
+  auto const desired_chunk_size = size_t{1 * 1024 * 1024};
+  rmm::device_uvector<thrust::pair<std::size_t, std::size_t>> chunks(num_bufs, stream);
+  thrust::transform(
+    rmm::exec_policy(stream),
+    _d_dst_buf_info,
+    _d_dst_buf_info + num_bufs,
+    chunks.begin(),
+    [desired_chunk_size] __device__(dst_buf_info const& buf) -> thrust::pair<std::size_t, std::size_t> {
+      // Total bytes for this incoming partition
+      size_t const bytes = buf.num_elements * buf.element_size;
+
+      // This clause handles nested data types (e.g. list or string) that store no data in the row
+      // columns, only in their children.
+      if (bytes == 0) { return {1, 0}; }
+
+      // The number of chunks we want to subdivide this buffer into
+      size_t const num_chunks =
+        max(size_t{1}, util::round_up_unsafe(bytes, desired_chunk_size) / desired_chunk_size);
+
+      // NOTE: leaving chunk size as a separate parameter for future tuning
+      // possibilities, even though in the current implementation it will be a
+      // constant.
+      return {num_chunks, desired_chunk_size};
+    });
+
+  rmm::device_uvector<size_type> chunk_offsets(num_bufs + 1, stream);
+  auto buf_count_iter = cudf::detail::make_counting_transform_iterator(
+    0, [num_bufs, num_chunks = num_chunks_func{chunks.begin()}] __device__(size_type i) {
+      return i == num_bufs ? 0 : num_chunks(i);
+    });
+  thrust::exclusive_scan(rmm::exec_policy(stream),
+                         buf_count_iter,
+                         buf_count_iter + num_bufs + 1,
+                         chunk_offsets.begin(),
+                         0);
+
+  auto out_to_in_index = [chunk_offsets = chunk_offsets.begin(), num_bufs] __device__(size_type i) {
+    return static_cast<size_type>(
+             thrust::upper_bound(thrust::seq, chunk_offsets, chunk_offsets + num_bufs + 1, i) -
+             chunk_offsets) -
+           1;
+  };
+
+  // apply the chunking.
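+  // (the reduce below totals the per-buffer chunk counts to get the number of
+  // post-subdivision copy commands; the for_each then expands each original
+  // dst_buf_info into one dst_buf_info per chunk, offsetting
+  // src_element_index and dst_offset for each piece)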
+  auto const num_chunks =
+    cudf::detail::make_counting_transform_iterator(0, num_chunks_func{chunks.begin()});
+  size_type const new_buf_count =
+    thrust::reduce(rmm::exec_policy(stream), num_chunks, num_chunks + chunks.size());
+  rmm::device_uvector<dst_buf_info> d_dst_buf_info(new_buf_count, stream);
+  auto iter = thrust::make_counting_iterator(0);
+  thrust::for_each(
+    rmm::exec_policy(stream),
+    iter,
+    iter + new_buf_count,
+    [_d_dst_buf_info,
+     d_dst_buf_info = d_dst_buf_info.begin(),
+     chunks         = chunks.begin(),
+     chunk_offsets  = chunk_offsets.begin(),
+     num_bufs,
+     num_src_bufs,
+     out_to_in_index] __device__(size_type i) {
+      size_type const in_buf_index = out_to_in_index(i);
+      size_type const chunk_index  = i - chunk_offsets[in_buf_index];
+      auto const chunk_size        = thrust::get<1>(chunks[in_buf_index]);
+      dst_buf_info const& in       = _d_dst_buf_info[in_buf_index];
+
+      // adjust info
+      dst_buf_info& out = d_dst_buf_info[i];
+      out.element_size  = in.element_size;
+      out.value_shift   = in.value_shift;
+      out.bit_shift     = in.bit_shift;
+      out.valid_count =
+        in.valid_count;  // valid count will be set to 1 if this is a validity buffer
+      out.src_buf_index = in.src_buf_index;
+      out.dst_buf_index = in.dst_buf_index;
+
+      size_type const elements_per_chunk =
+        out.element_size == 0 ? 0 : chunk_size / out.element_size;
+      out.num_elements = ((chunk_index + 1) * elements_per_chunk) > in.num_elements
+                           ? in.num_elements - (chunk_index * elements_per_chunk)
+                           : elements_per_chunk;
+
+      size_type const rows_per_chunk =
+        // if this is a validity buffer, each element is a bitmask_type, which
+        // corresponds to 32 rows.
+        out.valid_count > 0
+          ? elements_per_chunk * static_cast<size_type>(detail::size_in_bits<bitmask_type>())
+          : elements_per_chunk;
+      out.num_rows = ((chunk_index + 1) * rows_per_chunk) > in.num_rows
+                       ? in.num_rows - (chunk_index * rows_per_chunk)
+                       : rows_per_chunk;
+
+      out.src_element_index = in.src_element_index + (chunk_index * elements_per_chunk);
+      out.dst_offset        = in.dst_offset + (chunk_index * chunk_size);
+
+      // out.bytes and out.buf_size are unneeded here because they are only used to
+      // calculate real output buffer sizes. the data we are generating here is
+      // purely intermediate for the purposes of doing more uniform copying of data
+      // underneath the final structure of the output
+    });
+
+  // perform the copy
+  constexpr size_type block_size = 256;
+  copy_partitions<block_size><<<new_buf_count, block_size, 0, stream.value()>>>(
+    d_src_bufs, d_dst_bufs, d_dst_buf_info.data());
+
+  // postprocess valid_counts
+  auto keys = cudf::detail::make_counting_transform_iterator(
+    0, [out_to_in_index] __device__(size_type i) { return out_to_in_index(i); });
+  auto values = thrust::make_transform_iterator(
+    d_dst_buf_info.begin(), [] __device__(dst_buf_info const& info) { return info.valid_count; });
+  thrust::reduce_by_key(rmm::exec_policy(stream),
+                        keys,
+                        keys + new_buf_count,
+                        values,
+                        thrust::make_discard_iterator(),
+                        dst_valid_count_output_iterator{_d_dst_buf_info});
+}
+
 };  // anonymous namespace
 
 namespace detail {
@@ -933,9 +1102,9 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
       }
     }
 
-    // final row indices and row count
-    int const out_row_index = src_info.is_validity ? row_start / 32 : row_start;
-    int const num_rows      = row_end - row_start;
+    // final element indices and row count
+    int const out_element_index = src_info.is_validity ? row_start / 32 : row_start;
+    int const num_rows          = row_end - row_start;
 
     // if I am an offsets column, all my values need to be shifted
     int const value_shift = src_info.offsets == nullptr ?
0 : src_info.offsets[row_start];
     // if I am a validity column, we may need to shift bits
@@ -953,15 +1122,17 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
           std::size_t const bytes =
             static_cast<std::size_t>(num_elements) * static_cast<std::size_t>(element_size);
 
-          return dst_buf_info{util::round_up_unsafe(bytes, 64ul),
+          return dst_buf_info{util::round_up_unsafe(bytes, split_align),
                               num_elements,
                               element_size,
                               num_rows,
-                              out_row_index,
+                              out_element_index,
                               0,
                               value_shift,
                               bit_shift,
-                              src_info.is_validity ? 1 : 0};
+                              src_info.is_validity ? 1 : 0,
+                              src_buf_index,
+                              split_index};
         });
 
   // compute total size of each partition
@@ -1043,12 +1214,8 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   CUDA_TRY(cudaMemcpyAsync(
     d_src_bufs, h_src_bufs, src_bufs_size + dst_bufs_size, cudaMemcpyHostToDevice, stream.value()));
 
-  // copy. 1 block per buffer
-  {
-    constexpr size_type block_size = 256;
-    copy_partition<block_size><<<num_bufs, block_size, 0, stream.value()>>>(
-      num_src_bufs, d_src_bufs, d_dst_bufs, d_dst_buf_info);
-  }
+  // perform the copy.
+  copy_data(num_bufs, num_src_bufs, d_src_bufs, d_dst_bufs, d_dst_buf_info, stream);
 
   // DtoH dst info (to retrieve null counts)
   CUDA_TRY(cudaMemcpyAsync(
@@ -1078,7 +1245,6 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
     cols.clear();
   }
 
-
   return result;
 }
 
@@ -1092,4 +1258,4 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
   return cudf::detail::contiguous_split(input, splits, rmm::cuda_stream_default, mr);
 }
 
-};  // namespace cudf
+};  // namespace cudf
\ No newline at end of file
diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp
index b5a793ecd1c..1ee732b8a59 100644
--- a/cpp/tests/copying/split_tests.cpp
+++ b/cpp/tests/copying/split_tests.cpp
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <cudf/detail/iterator.cuh>
 #include
 #include
 
@@ -1315,6 +1316,32 @@ TEST_F(ContiguousSplitUntypedTest, ProgressiveSizes)
   }
 }
 
+TEST_F(ContiguousSplitUntypedTest, ValidityRepartition)
+{
+  // It is tricky to actually get the internal repartitioning/load-balancing code to add new splits
+  // inside a validity buffer. Under almost all situations, the fraction of bytes that validity
+  // represents is so small compared to the bytes for all other data that those buffers end up not
+  // getting subdivided. This test forces it to happen by using a small, single column of int8's,
+  // which keeps the overall fraction that validity takes up large enough to cause a repartition.
+  srand(0);
+  auto rvalids = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
+    return static_cast<float>(rand()) / static_cast<float>(RAND_MAX) < 0.5f ?
0 : 1;
+  });
+  cudf::size_type const num_rows = 2000000;
+  auto col = cudf::sequence(num_rows, cudf::numeric_scalar<int8_t>{0});
+  col->set_null_mask(cudf::test::detail::make_null_mask(rvalids, rvalids + num_rows));
+
+  cudf::table_view t({*col});
+  auto result   = cudf::contiguous_split(t, {num_rows / 2});
+  auto expected = cudf::split(t, {num_rows / 2});
+  CUDF_EXPECTS(result.size() == expected.size(),
+               "Mismatch in split results in ValidityRepartition test");
+
+  for (size_t idx = 0; idx < result.size(); idx++) {
+    CUDF_TEST_EXPECT_TABLES_EQUAL(result[idx].table, expected[idx]);
+  }
+}
+
 TEST_F(ContiguousSplitUntypedTest, ValidityEdgeCase)
 {
   // tests an edge case where the splits cause the final validity data to be copied

From 1eceaed26d8242401f2be12b50eb635872fe1bf6 Mon Sep 17 00:00:00 2001
From: Devavret Makkar
Date: Fri, 14 Jan 2022 06:29:14 +0530
Subject: [PATCH 15/23] Add partitioning support to Parquet chunked writer (#10000)

Chunked writer (`class ParquetWriter`) now takes an argument `partition_cols`. For each call to `write_table(df)`, the `df` is partitioned and the parts are appended to the same corresponding file in the dataset directory. This can be used when partitioning is desired but one wants to avoid making many small files in each subdirectory. For example, instead of repeated calls to `write_to_dataset` like so:

```python
write_to_dataset(df1, root_path, partition_cols=['group'])
write_to_dataset(df2, root_path, partition_cols=['group'])
...
```

which will yield the following structure:

```
root_dir/
  group=value1/
    <filename>.parquet
    <filename>.parquet
    ...
  group=value2/
    <filename>.parquet
    <filename>.parquet
    ...
  ...
```

one can write with

```python
pw = ParquetWriter(root_path, partition_cols=['group'])
pw.write_table(df1)
pw.write_table(df2)
pw.close()
```

to get the structure

```
root_dir/
  group=value1/
    <filename>.parquet
  group=value2/
    <filename>.parquet
  ...
``` Closes #7196 Also workaround fixes fixes #9216 fixes #7011 TODO: - [x] Tests Authors: - Devavret Makkar (https://github.com/devavret) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/10000 --- python/cudf/cudf/_lib/cpp/io/parquet.pxd | 4 + python/cudf/cudf/_lib/parquet.pyx | 52 +++-- python/cudf/cudf/io/parquet.py | 257 ++++++++++++++++++++--- python/cudf/cudf/tests/test_parquet.py | 83 +++++++- 4 files changed, 350 insertions(+), 46 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 07b312361f2..d02fffe9c0d 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -207,6 +207,10 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_chunked_writer& write( cudf_table_view.table_view table_, ) except+ + parquet_chunked_writer& write( + const cudf_table_view.table_view& table_, + const vector[cudf_io_types.partition_info]& partitions, + ) except+ unique_ptr[vector[uint8_t]] close( vector[string] column_chunks_file_paths, ) except+ diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 36099b03ef6..16873435e1d 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -411,23 +411,31 @@ cdef class ParquetWriter: cdef unique_ptr[cpp_parquet_chunked_writer] writer cdef unique_ptr[table_input_metadata] tbl_meta cdef cudf_io_types.sink_info sink - cdef unique_ptr[cudf_io_types.data_sink] _data_sink + cdef vector[unique_ptr[cudf_io_types.data_sink]] _data_sink cdef cudf_io_types.statistics_freq stat_freq cdef cudf_io_types.compression_type comp_type cdef object index - def __cinit__(self, object path, object index=None, + def __cinit__(self, object filepaths_or_buffers, object index=None, object compression=None, str statistics="ROWGROUP"): - self.sink = make_sink_info(path, self._data_sink) + filepaths_or_buffers = ( + list(filepaths_or_buffers) + if is_list_like(filepaths_or_buffers) + else [filepaths_or_buffers] + ) + self.sink = make_sinks_info(filepaths_or_buffers, self._data_sink) self.stat_freq = _get_stat_freq(statistics) self.comp_type = _get_comp_type(compression) self.index = index self.initialized = False - def write_table(self, table): + def write_table(self, table, object partitions_info=None): """ Writes a single table to the file """ if not self.initialized: - self._initialize_chunked_state(table) + self._initialize_chunked_state( + table, + num_partitions=len(partitions_info) if partitions_info else 1 + ) cdef table_view tv if self.index is not False and ( @@ -437,8 +445,15 @@ cdef class ParquetWriter: else: tv = table_view_from_table(table, ignore_index=True) + cdef vector[cudf_io_types.partition_info] partitions + if partitions_info is not None: + for part in partitions_info: + partitions.push_back( + cudf_io_types.partition_info(part[0], part[1]) + ) + with nogil: - self.writer.get()[0].write(tv) + self.writer.get()[0].write(tv, partitions) def close(self, object metadata_file_path=None): cdef unique_ptr[vector[uint8_t]] out_metadata_c @@ -449,7 +464,13 @@ cdef class ParquetWriter: # Update metadata-collection options if metadata_file_path is not None: - column_chunks_file_paths.push_back(str.encode(metadata_file_path)) + if is_list_like(metadata_file_path): + for path in metadata_file_path: + column_chunks_file_paths.push_back(str.encode(path)) + else: + 
column_chunks_file_paths.push_back( + str.encode(metadata_file_path) + ) with nogil: out_metadata_c = move( @@ -463,10 +484,13 @@ cdef class ParquetWriter: return np.asarray(out_metadata_py) return None - def __dealloc__(self): + def __enter__(self): + return self + + def __exit__(self, *args): self.close() - def _initialize_chunked_state(self, table): + def _initialize_chunked_state(self, table, num_partitions=1): """ Prepares all the values required to build the chunked_parquet_writer_options and creates a writer""" cdef table_view tv @@ -499,10 +523,14 @@ cdef class ParquetWriter: table[name]._column, self.tbl_meta.get().column_metadata[i] ) - pandas_metadata = generate_pandas_metadata(table, self.index) + index = ( + False if isinstance(table._index, cudf.RangeIndex) else self.index + ) + pandas_metadata = generate_pandas_metadata(table, index) + cdef map[string, string] tmp_user_data + tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) cdef vector[map[string, string]] user_data - user_data.resize(1) - user_data.back()[str.encode("pandas")] = str.encode(pandas_metadata) + user_data = vector[map[string, string]](num_partitions, tmp_user_data) cdef chunked_parquet_writer_options args with nogil: diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index ca03e40e2a6..9694d19e159 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -5,9 +5,11 @@ import warnings from collections import defaultdict from contextlib import ExitStack +from typing import Dict, List, Tuple from uuid import uuid4 import fsspec +import numpy as np import pyarrow as pa from pyarrow import dataset as ds, parquet as pq @@ -126,32 +128,21 @@ def write_to_dataset( if partition_cols is not None and len(partition_cols) > 0: - data_cols = df.columns.drop(partition_cols) - if len(data_cols) == 0: - raise ValueError("No data left to save outside partition columns") - - part_names, part_offsets, _, grouped_df = df.groupby( - partition_cols - )._grouped() - if not preserve_index: - grouped_df.reset_index(drop=True, inplace=True) - grouped_df.drop(columns=partition_cols, inplace=True) - # Copy the entire keys df in one operation rather than using iloc - part_names = part_names.to_pandas().to_frame(index=False) - - full_paths = [] - metadata_file_paths = [] - for keys in part_names.itertuples(index=False): - subdir = fs.sep.join( - [f"{name}={val}" for name, val in zip(partition_cols, keys)] - ) - prefix = fs.sep.join([root_path, subdir]) - fs.mkdirs(prefix, exist_ok=True) - filename = filename or uuid4().hex + ".parquet" - full_path = fs.sep.join([prefix, filename]) - full_paths.append(full_path) - if return_metadata: - metadata_file_paths.append(fs.sep.join([subdir, filename])) + ( + full_paths, + metadata_file_paths, + grouped_df, + part_offsets, + _, + ) = _get_partitioned( + df, + root_path, + partition_cols, + filename, + fs, + preserve_index, + **kwargs, + ) if return_metadata: kwargs["metadata_file_path"] = metadata_file_paths @@ -164,7 +155,7 @@ def write_to_dataset( ) else: - filename = filename or uuid4().hex + ".parquet" + filename = filename or _generate_filename() full_path = fs.sep.join([root_path, filename]) if return_metadata: kwargs["metadata_file_path"] = filename @@ -737,13 +728,12 @@ def to_parquet( ) if partition_offsets: - kwargs["partitions_info"] = [ - ( - partition_offsets[i], - partition_offsets[i + 1] - partition_offsets[i], + kwargs["partitions_info"] = list( + zip( + partition_offsets, + np.roll(partition_offsets, -1) - 
partition_offsets, ) - for i in range(0, len(partition_offsets) - 1) - ] + )[:-1] return _write_parquet( df, @@ -790,9 +780,210 @@ def merge_parquet_filemetadata(filemetadata_list): return libparquet.merge_filemetadata(filemetadata_list) +def _generate_filename(): + return uuid4().hex + ".parquet" + + +def _get_partitioned( + df, + root_path, + partition_cols, + filename=None, + fs=None, + preserve_index=False, + **kwargs, +): + fs = ioutils._ensure_filesystem(fs, root_path, **kwargs) + fs.mkdirs(root_path, exist_ok=True) + if not (set(df._data) - set(partition_cols)): + raise ValueError("No data left to save outside partition columns") + + part_names, part_offsets, _, grouped_df = df.groupby( + partition_cols + )._grouped() + if not preserve_index: + grouped_df.reset_index(drop=True, inplace=True) + grouped_df.drop(columns=partition_cols, inplace=True) + # Copy the entire keys df in one operation rather than using iloc + part_names = part_names.to_pandas().to_frame(index=False) + + full_paths = [] + metadata_file_paths = [] + for keys in part_names.itertuples(index=False): + subdir = fs.sep.join( + [f"{name}={val}" for name, val in zip(partition_cols, keys)] + ) + prefix = fs.sep.join([root_path, subdir]) + fs.mkdirs(prefix, exist_ok=True) + filename = filename or _generate_filename() + full_path = fs.sep.join([prefix, filename]) + full_paths.append(full_path) + metadata_file_paths.append(fs.sep.join([subdir, filename])) + + return full_paths, metadata_file_paths, grouped_df, part_offsets, filename + + ParquetWriter = libparquet.ParquetWriter +class ParquetDatasetWriter: + def __init__( + self, + path, + partition_cols, + index=None, + compression=None, + statistics="ROWGROUP", + ) -> None: + """ + Write a parquet file or dataset incrementally + + Parameters + ---------- + path : str + File path or Root Directory path. Will be used as Root Directory + path while writing a partitioned dataset. + partition_cols : list + Column names by which to partition the dataset + Columns are partitioned in the order they are given + index : bool, default None + If ``True``, include the dataframe’s index(es) in the file output. + If ``False``, they will not be written to the file. If ``None``, + index(es) other than RangeIndex will be saved as columns. + compression : {'snappy', None}, default 'snappy' + Name of the compression to use. Use ``None`` for no compression. + statistics : {'ROWGROUP', 'PAGE', 'NONE'}, default 'ROWGROUP' + Level at which column statistics should be included in file. + + + Examples + ________ + Using a context + + >>> df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) + >>> df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) + >>> with ParquetDatasetWriter("./dataset", partition_cols=["a"]) as cw: + ... cw.write_table(df1) + ... cw.write_table(df2) + + By manually calling ``close()`` + + >>> cw = ParquetDatasetWriter("./dataset", partition_cols=["a"]) + >>> cw.write_table(df1) + >>> cw.write_table(df2) + >>> cw.close() + + Both the methods will generate the same directory structure + + .. 
code-block:: bash + + dataset/ + a=1 + .parquet + a=2 + .parquet + a=3 + .parquet + + """ + self.path = path + self.common_args = { + "index": index, + "compression": compression, + "statistics": statistics, + } + self.partition_cols = partition_cols + # Collection of `ParquetWriter`s, and the corresponding + # partition_col values they're responsible for + self._chunked_writers: List[ + Tuple[libparquet.ParquetWriter, List[str], str] + ] = [] + # Map of partition_col values to their ParquetWriter's index + # in self._chunked_writers for reverse lookup + self.path_cw_map: Dict[str, int] = {} + self.filename = None + + def write_table(self, df): + """ + Write a dataframe to the file/dataset + """ + ( + paths, + metadata_file_paths, + grouped_df, + offsets, + self.filename, + ) = _get_partitioned( + df, + self.path, + self.partition_cols, + preserve_index=self.common_args["index"], + filename=self.filename, + ) + + existing_cw_batch = defaultdict(dict) + new_cw_paths = [] + + for path, part_info, meta_path in zip( + paths, + zip(offsets, np.roll(offsets, -1) - offsets), + metadata_file_paths, + ): + if path in self.path_cw_map: # path is a currently open file + cw_idx = self.path_cw_map[path] + existing_cw_batch[cw_idx][path] = part_info + else: # path not currently handled by any chunked writer + new_cw_paths.append((path, part_info, meta_path)) + + # Write out the parts of grouped_df currently handled by existing cw's + for cw_idx, path_to_part_info_map in existing_cw_batch.items(): + cw = self._chunked_writers[cw_idx][0] + # match found paths with this cw's paths and nullify partition info + # for partition_col values not in this batch + this_cw_part_info = [ + path_to_part_info_map.get(path, (0, 0)) + for path in self._chunked_writers[cw_idx][1] + ] + cw.write_table(grouped_df, this_cw_part_info) + + # Create new cw for unhandled paths encountered in this write_table + new_paths, part_info, meta_paths = zip(*new_cw_paths) + self._chunked_writers.append( + ( + ParquetWriter(new_paths, **self.common_args), + new_paths, + meta_paths, + ) + ) + new_cw_idx = len(self._chunked_writers) - 1 + self.path_cw_map.update({k: new_cw_idx for k in new_paths}) + self._chunked_writers[-1][0].write_table(grouped_df, part_info) + + def close(self, return_metadata=False): + """ + Close all open files and optionally return footer metadata as a binary + blob + """ + + metadata = [ + cw.close(metadata_file_path=meta_path if return_metadata else None) + for cw, _, meta_path in self._chunked_writers + ] + + if return_metadata: + return ( + merge_parquet_filemetadata(metadata) + if len(metadata) > 1 + else metadata[0] + ) + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def _check_decimal128_type(arrow_type): if isinstance(arrow_type, pa.Decimal128Type): if arrow_type.precision > cudf.Decimal64Dtype.MAX_PRECISION: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 9a66de8a3a6..016ed1229f1 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -18,7 +18,11 @@ from pyarrow import fs as pa_fs, parquet as pq import cudf -from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata +from cudf.io.parquet import ( + ParquetDatasetWriter, + ParquetWriter, + merge_parquet_filemetadata, +) from cudf.testing import dataset_generator as dg from cudf.testing._utils import ( TIMEDELTA_TYPES, @@ -1573,6 +1577,16 @@ def test_parquet_writer_gpu_chunked(tmpdir, simple_pdf, simple_gdf): 
assert_eq(pd.read_parquet(gdf_fname), pd.concat([simple_pdf, simple_pdf])) +def test_parquet_writer_gpu_chunked_context(tmpdir, simple_pdf, simple_gdf): + gdf_fname = tmpdir.join("gdf.parquet") + + with ParquetWriter(gdf_fname) as writer: + writer.write_table(simple_gdf) + writer.write_table(simple_gdf) + + assert_eq(pd.read_parquet(gdf_fname), pd.concat([simple_pdf, simple_pdf])) + + def test_parquet_write_bytes_io(simple_gdf): output = BytesIO() simple_gdf.to_parquet(output) @@ -1627,6 +1641,73 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): assert fn == filename +@pytest.mark.parametrize("return_meta", [True, False]) +def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta): + pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) + gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) + + df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) + df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) + + cw = ParquetDatasetWriter(gdf_dir, partition_cols=["a"], index=False) + cw.write_table(df1) + cw.write_table(df2) + meta_byte_array = cw.close(return_metadata=return_meta) + pdf = cudf.concat([df1, df2]).to_pandas() + pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"]) + + if return_meta: + fmd = pq.ParquetFile(BytesIO(meta_byte_array)).metadata + assert fmd.num_rows == len(pdf) + assert fmd.num_row_groups == 4 + files = { + os.path.join(directory, files[0]) + for directory, _, files in os.walk(gdf_dir) + if files + } + meta_files = { + os.path.join(gdf_dir, fmd.row_group(i).column(c).file_path) + for i in range(fmd.num_row_groups) + for c in range(fmd.row_group(i).num_columns) + } + assert files == meta_files + + # Read back with pandas to compare + expect_pd = pd.read_parquet(pdf_dir) + got_pd = pd.read_parquet(gdf_dir) + assert_eq(expect_pd, got_pd) + + # Check that cudf and pd return the same read + got_cudf = cudf.read_parquet(gdf_dir) + assert_eq(got_pd, got_cudf) + + +def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): + pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) + gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) + + df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) + df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) + + with ParquetDatasetWriter( + gdf_dir, partition_cols=["a"], index=False + ) as cw: + cw.write_table(df1) + cw.write_table(df2) + + pdf = cudf.concat([df1, df2]).to_pandas() + pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"]) + + # Read back with pandas to compare + expect_pd = pd.read_parquet(pdf_dir) + got_pd = pd.read_parquet(gdf_dir) + assert_eq(expect_pd, got_pd) + + # Check that cudf and pd return the same read + got_cudf = cudf.read_parquet(gdf_dir) + assert_eq(got_pd, got_cudf) + + @pytest.mark.parametrize("cols", [None, ["b"]]) def test_parquet_write_to_dataset(tmpdir_factory, cols): dir1 = tmpdir_factory.mktemp("dir1") From ca77542cab1fc0bcf0d1c8cc67f79ef69fb02536 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 14 Jan 2022 10:11:47 -0500 Subject: [PATCH 16/23] Allow custom sort functions for dask-cudf `sort_values` (#9789) Similar to https://github.com/dask/dask/pull/8345, this PR allows the sorting function called on each partition in last step of dask-cudf's `sort_values` to be generalized, along with the kwargs that are supplied to it. This allows `sort_values` to be extended to support more complex ascending / null position handling. 
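As a rough sketch of the new keywords in use (the toy frame and the `sort_with_nulls_first` helper are invented for this illustration; only the `sort_function` / `sort_function_kwargs` arguments come from this PR):

```python
import cudf
import dask_cudf

df = cudf.DataFrame({"a": [3, None, 1] * 20, "b": [4, 5, 6, 7] * 15})
ddf = dask_cudf.from_cudf(df, npartitions=4)

# Custom per-partition sort function. It receives the standard by/ascending/
# na_position kwargs plus anything passed through sort_function_kwargs.
def sort_with_nulls_first(partition, by_columns, **kwargs):
    return partition.sort_values(by_columns, na_position="first")

result = ddf.sort_values(
    by="a",
    sort_function=sort_with_nulls_first,
    sort_function_kwargs={"by_columns": ["a", "b"]},
)
```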
The context for this PR is a desire to simplify the [sorting algorithm](https://github.com/dask-contrib/dask-sql/blob/main/dask_sql/physical/utils/sort.py) used by dask-sql; since it only really differs from dask-cudf's sorting algorithm in that it uses a custom sorting function, it seems like it would be easier to allow for that extension upstream rather than duplicate code in dask-sql. Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/9789 --- python/dask_cudf/dask_cudf/core.py | 29 +++++++++---------- python/dask_cudf/dask_cudf/sorting.py | 21 ++++++++++++-- python/dask_cudf/dask_cudf/tests/test_sort.py | 19 ++++++++++++ 3 files changed, 51 insertions(+), 18 deletions(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index bf063918c89..e191873f82b 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -235,6 +235,8 @@ def sort_values( set_divisions=False, ascending=True, na_position="last", + sort_function=None, + sort_function_kwargs=None, **kwargs, ): if kwargs: @@ -242,21 +244,18 @@ def sort_values( f"Unsupported input arguments passed : {list(kwargs.keys())}" ) - if self.npartitions == 1: - df = self.map_partitions( - M.sort_values, by, ascending=ascending, na_position=na_position - ) - else: - df = sorting.sort_values( - self, - by, - max_branch=max_branch, - divisions=divisions, - set_divisions=set_divisions, - ignore_index=ignore_index, - ascending=ascending, - na_position=na_position, - ) + df = sorting.sort_values( + self, + by, + max_branch=max_branch, + divisions=divisions, + set_divisions=set_divisions, + ignore_index=ignore_index, + ascending=ascending, + na_position=na_position, + sort_function=sort_function, + sort_function_kwargs=sort_function_kwargs, + ) if ignore_index: return df.reset_index(drop=True) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index e8551493bb1..af40d9ca41b 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -222,6 +222,8 @@ def sort_values( ignore_index=False, ascending=True, na_position="last", + sort_function=None, + sort_function_kwargs=None, ): """Sort by the given list/tuple of column names.""" if not isinstance(ascending, bool): @@ -235,6 +237,21 @@ def sort_values( elif not isinstance(by, list): by = [by] + # parse custom sort function / kwargs if provided + sort_kwargs = { + "by": by, + "ascending": ascending, + "na_position": na_position, + } + if sort_function is None: + sort_function = M.sort_values + if sort_function_kwargs is not None: + sort_kwargs.update(sort_function_kwargs) + + # handle single partition case + if npartitions == 1: + return df.map_partitions(sort_function, **sort_kwargs) + # Step 1 - Calculate new divisions (if necessary) if divisions is None: divisions = quantile_divisions(df, by, npartitions) @@ -265,9 +282,7 @@ def sort_values( df3.divisions = (None,) * (df3.npartitions + 1) # Step 3 - Return final sorted df - df4 = df3.map_partitions( - M.sort_values, by, ascending=ascending, na_position=na_position - ) + df4 = df3.map_partitions(sort_function, **sort_kwargs) if not isinstance(divisions, gd.DataFrame) and set_divisions: # Can't have multi-column divisions elsewhere in dask (yet) df4.divisions = methods.tolist(divisions) diff --git 
a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py
index f4ae83245cb..0b258dd33e7 100644
--- a/python/dask_cudf/dask_cudf/tests/test_sort.py
+++ b/python/dask_cudf/dask_cudf/tests/test_sort.py
@@ -83,3 +83,22 @@
 
     # cudf ordering for nulls is non-deterministic
     dd.assert_eq(got[by], expect[by], check_index=False)
+
+
+@pytest.mark.parametrize("by", [["a", "b"], ["b", "a"]])
+@pytest.mark.parametrize("nparts", [1, 10])
+def test_sort_values_custom_function(by, nparts):
+    df = cudf.DataFrame({"a": [1, 2, 3] * 20, "b": [4, 5, 6, 7] * 15})
+    ddf = dd.from_pandas(df, npartitions=nparts)
+
+    def f(partition, by_columns, ascending, na_position, **kwargs):
+        return partition.sort_values(
+            by_columns, ascending=ascending, na_position=na_position
+        )
+
+    with dask.config.set(scheduler="single-threaded"):
+        got = ddf.sort_values(
+            by=by[0], sort_function=f, sort_function_kwargs={"by_columns": by}
+        )
+        expect = df.sort_values(by=by)
+        dd.assert_eq(got, expect, check_index=False)

From ce31d7d3ad765c88bef9f5e860abe3e5488a1fbd Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 14 Jan 2022 10:55:47 -0500
Subject: [PATCH 17/23] Fix octal pattern matching in regex string (#9993)

Closes #9946

Fixes the decoding logic in the regex pattern compile step to consume only up to the last octal character. The original logic was incorrectly discarding the next pattern character, and if the octal characters were specified at the end of the pattern, invalid bytes were read past the end of the pattern. This is what caused the intermittent failure: sometimes the invalid bytes happened to be 0, which masked the issue.

This PR also includes tests for octal patterns in various positions in the regex pattern.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Conor Hoekstra (https://github.com/codereport)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/9993
---
 cpp/src/strings/regex/regcomp.cpp    |  8 ++++----
 cpp/tests/strings/contains_tests.cpp | 15 ++++++++++++++-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 7da4915d668..8fbd82b8dc7 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -258,10 +258,10 @@ class regex_parser { // treating all quoted numbers as Octal, since we are not supporting backreferences if (yy >= '0' && yy <= '7') { yy = yy - '0'; - char32_t c = *exprp++; + char32_t c = *exprp; while (c >= '0' && c <= '7') { yy = (yy << 3) | (c - '0'); - c = *exprp++; + c = *(++exprp); } return CHAR; } else { @@ -926,7 +926,7 @@ void reprog::optimize2() _startinst_ids.push_back(-1); // terminator mark } -#ifndef NDBUG +#ifndef NDEBUG void reprog::print(regex_flags const flags) { printf("Flags = 0x%08x\n", static_cast(flags)); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index f95b282171f..48c4aac9e8a 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -237,6 +237,19 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) } } +TEST_F(StringsContainsTests, OctalTest) +{ + cudf::test::strings_column_wrapper strings({"AZ", "B", "CDAZEY", ""}); + auto strings_view = cudf::strings_column_view(strings); + cudf::test::fixed_width_column_wrapper expected({1, 0, 1, 0}); + auto results = cudf::strings::contains_re(strings_view, "\\101"); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::contains_re(strings_view, "\\101Z"); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::contains_re(strings_view, "D*\\101\\132"); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, EmbeddedNullCharacter) { std::vector data(10); From b01c8464b2e1412263a7368f01242f6cdd58e89b Mon Sep 17 00:00:00 2001 From: jakirkham Date: Fri, 14 Jan 2022 08:05:54 -0800 Subject: [PATCH 18/23] Allow CuPy 10 (#10048) Relaxes version constraints to allow CuPy 10. xref: https://github.com/rapidsai/integration/pull/413 Authors: - https://github.com/jakirkham Approvers: - Ashwin Srinath (https://github.com/shwina) - Jordan Jacobelli (https://github.com/Ethyling) URL: https://github.com/rapidsai/cudf/pull/10048 --- conda/environments/cudf_dev_cuda11.5.yml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index c258a5caabb..bbbc754e850 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -9,7 +9,7 @@ channels: dependencies: - clang=11.1.0 - clang-tools=11.1.0 - - cupy>=9.5.0,<10.0.0a0 + - cupy>=9.5.0,<11.0.0a0 - rmm=22.02.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 2600ab358cc..a20749bc8c9 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -40,7 +40,7 @@ requirements: - python - typing_extensions - pandas >=1.0,<1.4.0dev0 - - cupy >=9.5.0,<10.0.0a0 + - cupy >=9.5.0,<11.0.0a0 - numba >=0.54 - numpy - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} *cuda From 12adb8a93dc51e14c106b12c4a68f3a2e1fe3207 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 14 Jan 2022 14:59:40 -0600 Subject: [PATCH 19/23] Fix repr and concat of `StructColumn` (#10042) Fixes: #8963 This PR fixes a trivial issue in `concat` where the assumption was that `_with_type_metadata` is an in-place operation, but it isn't. 
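In other words, a minimal sketch of the pattern being corrected (the `col`, `out`, `name`, and `dtype` names here are illustrative):

```python
# buggy: the returned column (which carries the struct type metadata) is discarded
col._with_type_metadata(dtype)

# fixed: _with_type_metadata returns a new column, so the result must be reassigned
out._data[name] = col._with_type_metadata(dtype)
```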
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10042 --- python/cudf/cudf/core/dataframe.py | 14 +++++++++++--- python/cudf/cudf/core/series.py | 9 +++++---- python/cudf/cudf/tests/test_repr.py | 30 +++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 123f86cc200..8fb9b84d96b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1548,10 +1548,18 @@ def _concat( cudf.core.index.as_index(out.index._values) ) - # Reassign precision for any decimal cols + # Reassign precision for decimal cols & type schema for struct cols for name, col in out._data.items(): - if isinstance(col, cudf.core.column.Decimal64Column): - col = col._with_type_metadata(tables[0]._data[name].dtype) + if isinstance( + col, + ( + cudf.core.column.Decimal64Column, + cudf.core.column.StructColumn, + ), + ): + out._data[name] = col._with_type_metadata( + tables[0]._data[name].dtype + ) # Reassign index and column names if isinstance(objs[0].columns, pd.MultiIndex): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 11166320760..7da3bdbb31e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1448,10 +1448,11 @@ def _concat(cls, objs, axis=0, index=True): col = concat_columns([o._column for o in objs]) - if isinstance(col, cudf.core.column.Decimal64Column): - col = col._with_type_metadata(objs[0]._column.dtype) - - if isinstance(col, cudf.core.column.StructColumn): + # Reassign precision for decimal cols & type schema for struct cols + if isinstance( + col, + (cudf.core.column.Decimal64Column, cudf.core.column.StructColumn), + ): col = col._with_type_metadata(objs[0].dtype) return cls(data=col, index=index, name=name) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index f8c136b8c2d..82020f30f7c 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1475,3 +1475,33 @@ def test_empty_series_name(): gs = cudf.from_pandas(ps) assert ps.__repr__() == gs.__repr__() + + +def test_repr_struct_after_concat(): + df = cudf.DataFrame( + { + "a": cudf.Series( + [ + {"sa": 2056831253}, + {"sa": -1463792165}, + {"sa": 1735783038}, + {"sa": 103774433}, + {"sa": -1413247520}, + ] + * 13 + ), + "b": cudf.Series( + [ + {"sa": {"ssa": 1140062029}}, + None, + {"sa": {"ssa": 1998862860}}, + {"sa": None}, + {"sa": {"ssa": -395088502}}, + ] + * 13 + ), + } + ) + pdf = df.to_pandas() + + assert df.__repr__() == pdf.__repr__() From 8c8d6ef7fdc8f17159df63182ee9e9b0cf8df3b1 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 14 Jan 2022 17:22:07 -0600 Subject: [PATCH 20/23] Fix dataframe setitem with `ndarray` types (#10056) Fixes: #9928 This PR fixes 2d array assignment in `setitem` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/10056 --- python/cudf/cudf/core/dataframe.py | 18 ++++++++++++++++-- python/cudf/cudf/tests/test_dataframe.py | 11 +++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py 
b/python/cudf/cudf/core/dataframe.py index 8fb9b84d96b..6bbb2fca77c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1123,7 +1123,15 @@ def __setitem__(self, arg, value): for col_name in self._data: self._data[col_name][mask] = value else: - if isinstance(value, DataFrame): + if isinstance(value, (cupy.ndarray, np.ndarray)): + _setitem_with_dataframe( + input_df=self, + replace_df=cudf.DataFrame(value), + input_cols=arg, + mask=None, + ignore_index=True, + ) + elif isinstance(value, DataFrame): _setitem_with_dataframe( input_df=self, replace_df=value, @@ -6401,6 +6409,7 @@ def _setitem_with_dataframe( replace_df: DataFrame, input_cols: Any = None, mask: Optional[cudf.core.column.ColumnBase] = None, + ignore_index: bool = False, ): """ This function sets item dataframes relevant columns with replacement df @@ -6408,6 +6417,7 @@ def _setitem_with_dataframe( :param replace_df: Replacement DataFrame to replace values with :param input_cols: columns to replace in the input dataframe :param mask: boolean mask in case of masked replacing + :param ignore_index: Whether to conduct index equality and reindex """ if input_cols is None: @@ -6418,7 +6428,11 @@ def _setitem_with_dataframe( "Number of Input Columns must be same replacement Dataframe" ) - if len(input_df) != 0 and not input_df.index.equals(replace_df.index): + if ( + not ignore_index + and len(input_df) != 0 + and not input_df.index.equals(replace_df.index) + ): replace_df = replace_df.reindex(input_df.index) for col_1, col_2 in zip(input_cols, replace_df.columns): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e5b298a8448..372587ba677 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9030,3 +9030,14 @@ def test_dataframe_add_suffix(): expected = pdf.add_suffix("_item") assert_eq(got, expected) + + +def test_dataframe_assign_cp_np_array(): + m, n = 5, 3 + cp_ndarray = cupy.random.randn(m, n) + pdf = pd.DataFrame({f"f_{i}": range(m) for i in range(n)}) + gdf = cudf.DataFrame({f"f_{i}": range(m) for i in range(n)}) + pdf[[f"f_{i}" for i in range(n)]] = cupy.asnumpy(cp_ndarray) + gdf[[f"f_{i}" for i in range(n)]] = cp_ndarray + + assert_eq(pdf, gdf) From e24fa8f0b0cca2c9a441002623fdbc40631ed369 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 14 Jan 2022 16:35:43 -0800 Subject: [PATCH 21/23] Run doctests. (#9815) This PR adds doctests and resolves #9513. 
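For context, a minimal sketch of how such a suite can be wired up with `pytest` (an illustrative approach, not necessarily the exact contents of the new `test_doctests.py`):

```python
import doctest
import importlib

import pytest

# A couple of representative modules; the real test walks the whole package.
MODULES = ["cudf.core.dataframe", "cudf.core.series"]


@pytest.mark.parametrize("name", MODULES)
def test_docstring_examples_run(name):
    # Any example whose printed output no longer matches its docstring fails.
    results = doctest.testmod(importlib.import_module(name), verbose=False)
    assert results.failed == 0
```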
Several issues were found by running doctests that have now been resolved: - [x] #9821 - [x] #9822 - [x] #9823 - [x] #9824 - [x] #9825 - [x] #9826 - [x] #9827 - [x] #9828 (workaround by deleting doctests) - [x] #9829 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ashwin Srinath (https://github.com/shwina) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9815 --- python/cudf/cudf/__init__.py | 63 ++++++++++ python/cudf/cudf/api/__init__.py | 4 +- python/cudf/cudf/api/extensions/__init__.py | 6 + python/cudf/cudf/core/_base_index.py | 14 ++- python/cudf/cudf/core/column/categorical.py | 1 - python/cudf/cudf/core/dataframe.py | 122 ++++++++++++-------- python/cudf/cudf/core/groupby/groupby.py | 11 +- python/cudf/cudf/core/index.py | 72 ++++++------ python/cudf/cudf/core/multiindex.py | 16 +-- python/cudf/cudf/core/reshape.py | 3 +- python/cudf/cudf/core/scalar.py | 2 +- python/cudf/cudf/core/series.py | 42 ++++--- python/cudf/cudf/core/tools/datetimes.py | 8 +- python/cudf/cudf/tests/test_dataframe.py | 81 ++++++++----- python/cudf/cudf/tests/test_doctests.py | 102 ++++++++++++++++ python/cudf/cudf/utils/docutils.py | 14 +-- python/cudf/cudf/utils/ioutils.py | 6 +- 17 files changed, 402 insertions(+), 165 deletions(-) create mode 100644 python/cudf/cudf/tests/test_doctests.py diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 2461e7b09bc..4dadf6a1869 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -126,3 +126,66 @@ __version__ = get_versions()["version"] del get_versions + +__all__ = [ + "BaseIndex", + "CategoricalDtype", + "CategoricalIndex", + "DataFrame", + "DateOffset", + "DatetimeIndex", + "Decimal32Dtype", + "Decimal64Dtype", + "Float32Index", + "Float64Index", + "GenericIndex", + "Grouper", + "Index", + "Int16Index", + "Int32Index", + "Int64Index", + "Int8Index", + "IntervalDtype", + "IntervalIndex", + "ListDtype", + "MultiIndex", + "NA", + "RangeIndex", + "Scalar", + "Series", + "StringIndex", + "StructDtype", + "TimedeltaIndex", + "UInt16Index", + "UInt32Index", + "UInt64Index", + "UInt8Index", + "api", + "concat", + "cut", + "date_range", + "factorize", + "from_dataframe", + "from_dlpack", + "from_pandas", + "get_dummies", + "interval_range", + "isclose", + "melt", + "merge", + "merge_sorted", + "pivot", + "read_avro", + "read_csv", + "read_feather", + "read_hdf", + "read_json", + "read_orc", + "read_parquet", + "read_text", + "set_allocator", + "testing", + "to_datetime", + "to_numeric", + "unstack", +] diff --git a/python/cudf/cudf/api/__init__.py b/python/cudf/cudf/api/__init__.py index 21c24015e41..c66bfb4efeb 100644 --- a/python/cudf/cudf/api/__init__.py +++ b/python/cudf/cudf/api/__init__.py @@ -1,3 +1,5 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
-from cudf.api import types +from cudf.api import extensions, types + +__all__ = ["extensions", "types"] diff --git a/python/cudf/cudf/api/extensions/__init__.py b/python/cudf/cudf/api/extensions/__init__.py index c971e6f7731..eeb5dcdb32a 100644 --- a/python/cudf/cudf/api/extensions/__init__.py +++ b/python/cudf/cudf/api/extensions/__init__.py @@ -5,3 +5,9 @@ register_index_accessor, register_series_accessor, ) + +__all__ = [ + "register_dataframe_accessor", + "register_index_accessor", + "register_series_accessor", +] diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 683f3fefe1c..4f2614e843f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -495,7 +495,7 @@ def fillna(self, value, downcast=None): >>> import cudf >>> index = cudf.Index([1, 2, None, 4]) >>> index - Int64Index([1, 2, null, 4], dtype='int64') + Int64Index([1, 2, , 4], dtype='int64') >>> index.fillna(3) Int64Index([1, 2, 3, 4], dtype='int64') """ @@ -553,7 +553,7 @@ def to_pandas(self): >>> type(idx.to_pandas()) >>> type(idx) - + """ return pd.Index(self._values.to_pandas(), name=self.name) @@ -942,6 +942,7 @@ def is_interval(self): Examples -------- >>> import cudf + >>> import pandas as pd >>> idx = cudf.from_pandas( ... pd.Index([pd.Interval(left=0, right=5), ... pd.Interval(left=5, right=10)]) @@ -1105,15 +1106,16 @@ def join( Examples -------- >>> import cudf - >>> lhs = cudf.DataFrame( - ... {"a":[2, 3, 1], "b":[3, 4, 2]}).set_index(['a', 'b'] - ... ).index + >>> lhs = cudf.DataFrame({ + ... "a": [2, 3, 1], + ... "b": [3, 4, 2], + ... }).set_index(['a', 'b']).index >>> lhs MultiIndex([(2, 3), (3, 4), (1, 2)], names=['a', 'b']) - >>> rhs = cudf.DataFrame({"a":[1, 4, 3]}).set_index('a').index + >>> rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index('a').index >>> rhs Int64Index([1, 4, 3], dtype='int64', name='a') >>> lhs.join(rhs, how='inner') diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 4be7a422de0..de06e62cbb1 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -61,7 +61,6 @@ class CategoricalAccessor(ColumnMethods): -------- >>> s = cudf.Series([1,2,3], dtype='category') >>> s - >>> s 0 1 1 2 2 3 diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6bbb2fca77c..69600426ec0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -463,12 +463,12 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): ... [(t0+ timedelta(seconds=x)) for x in range(n)]) ... }) >>> df - id datetimes - 0 0 2018-10-07T12:00:00.000 - 1 1 2018-10-07T12:00:01.000 - 2 2 2018-10-07T12:00:02.000 - 3 3 2018-10-07T12:00:03.000 - 4 4 2018-10-07T12:00:04.000 + id datetimes + 0 0 2018-10-07 12:00:00 + 1 1 2018-10-07 12:00:01 + 2 2 2018-10-07 12:00:02 + 3 3 2018-10-07 12:00:03 + 4 4 2018-10-07 12:00:04 Build DataFrame via list of rows as tuples: @@ -984,23 +984,34 @@ def __getitem__(self, arg): Examples -------- - >>> df = DataFrame([('a', list(range(20))), - ... ('b', list(range(20))), - ... ('c', list(range(20)))]) - >>> df[:4] # get first 4 rows of all columns + >>> df = cudf.DataFrame({ + ... 'a': list(range(10)), + ... 'b': list(range(10)), + ... 'c': list(range(10)), + ... }) + + Get first 4 rows of all columns. 
+ + >>> df[:4] a b c 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 - >>> df[-5:] # get last 5 rows of all columns - a b c - 15 15 15 15 - 16 16 16 16 - 17 17 17 17 - 18 18 18 18 - 19 19 19 19 - >>> df[['a', 'c']] # get columns a and c + + Get last 5 rows of all columns. + + >>> df[-5:] + a b c + 5 5 5 5 + 6 6 6 6 + 7 7 7 7 + 8 8 8 8 + 9 9 9 9 + + Get columns a and c. + + >>> df[['a', 'c']] a c 0 0 0 1 1 1 @@ -1012,8 +1023,17 @@ def __getitem__(self, arg): 7 7 7 8 8 8 9 9 9 - >>> df[[True, False, True, False]] # mask the entire dataframe, - # returning the rows specified in the boolean mask + + Return the rows specified in the boolean mask. + + >>> df[[True, False, True, False, True, + ... False, True, False, True, False]] + a b c + 0 0 0 0 + 2 2 2 2 + 4 4 4 4 + 6 6 6 6 + 8 8 8 8 """ if _is_scalar_or_zero_d_array(arg) or isinstance(arg, tuple): return self._get_columns_by_label(arg, downcast=True) @@ -1261,10 +1281,12 @@ def memory_usage(self, index=True, deep=False): object 40000 bool 5000 dtype: int64 + Use a Categorical for efficient storage of an object-dtype column with many repeated values. + >>> df['object'].astype('category').memory_usage(deep=True) - 5048 + 5008 """ if deep: warnings.warn( @@ -2225,11 +2247,11 @@ def reindex( 3 3 13.0 4 4 14.0 >>> df_new - key val sum - 0 0 10.0 NaN - 3 3 13.0 NaN - 4 4 14.0 NaN - 5 -1 NaN NaN + key val sum + 0 0 10.0 + 3 3 13.0 + 4 4 14.0 + 5 """ if labels is None and index is None and columns is None: @@ -3701,10 +3723,10 @@ def query(self, expr, local_dict=None): Examples -------- - >>> import cudf - >>> a = ('a', [1, 2, 2]) - >>> b = ('b', [3, 4, 5]) - >>> df = cudf.DataFrame([a, b]) + >>> df = cudf.DataFrame({ + ... "a": [1, 2, 2], + ... "b": [3, 4, 5], + ... }) >>> expr = "(a == 2 and b == 4) or (b == 3)" >>> df.query(expr) a b @@ -3720,8 +3742,8 @@ def query(self, expr, local_dict=None): >>> df['datetimes'] = data >>> search_date = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') >>> df.query('datetimes==@search_date') - datetimes - 1 2018-10-08T00:00:00.000 + datetimes + 1 2018-10-08 Using local_dict: @@ -3732,9 +3754,9 @@ def query(self, expr, local_dict=None): >>> df['datetimes'] = data >>> search_date2 = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') >>> df.query('datetimes==@search_date', - ... local_dict={'search_date':search_date2}) - datetimes - 1 2018-10-08T00:00:00.000 + ... local_dict={'search_date': search_date2}) + datetimes + 1 2018-10-08 """ # can't use `annotate` decorator here as we inspect the calling # environment. @@ -4189,18 +4211,23 @@ def info( dtypes: float64(1), int64(1), object(1) memory usage: 130.0+ bytes - Pipe output of DataFrame.info to buffer instead of sys.stdout, - get buffer content and writes to a text file: + Pipe output of DataFrame.info to a buffer instead of sys.stdout and + print buffer contents: >>> import io >>> buffer = io.StringIO() >>> df.info(buf=buffer) - >>> s = buffer.getvalue() - >>> with open("df_info.txt", "w", - ... encoding="utf-8") as f: - ... f.write(s) - ... 
- 369 + >>> print(buffer.getvalue()) + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 130.0+ bytes The `memory_usage` parameter allows deep introspection mode, specially useful for big DataFrames and fine-tune memory optimization: @@ -5761,7 +5788,7 @@ def stack(self, level=-1, dropna=True): Examples -------- >>> import cudf - >>> df = cudf.DataFrame({'a':[0,1,3], 'b':[1,2,4]}) + >>> df = cudf.DataFrame({'a': [0, 1, 3], 'b': [1, 2, 4]}) >>> df.stack() 0 a 0 b 1 @@ -6084,8 +6111,11 @@ def explode(self, column, ignore_index=False): Examples -------- >>> import cudf - >>> cudf.DataFrame( - {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]}) + >>> df = cudf.DataFrame({ + ... "a": [[1, 2, 3], [], None, [4, 5]], + ... "b": [11, 22, 33, 44], + ... }) + >>> df a b 0 [1, 2, 3] 11 1 [] 22 diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 08ef3f07776..5b041ba53b9 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1256,9 +1256,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): -------- >>> import cudf >>> import pandas as pd - >>> df = cudf.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed': [380., 370., 24., 26.]}) + >>> df = cudf.DataFrame({ + ... 'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.], + ... }) >>> df Animal Max Speed 0 Falcon 380.0 @@ -1272,10 +1273,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): Parrot 25.0 >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] + ... ['Captive', 'Wild', 'Captive', 'Wild']] >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) >>> df = cudf.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - index=index) + ... index=index) >>> df Max Speed Animal Type diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 859a81bc5f4..1e493708415 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1535,9 +1535,11 @@ class DatetimeIndex(GenericIndex): -------- >>> import cudf >>> cudf.DatetimeIndex([1, 2, 3, 4], name="a") - DatetimeIndex(['1970-01-01 00:00:00.001000', '1970-01-01 00:00:00.002000', - '1970-01-01 00:00:00.003000', '1970-01-01 00:00:00.004000'], - dtype='datetime64[ms]', name='a') + DatetimeIndex(['1970-01-01 00:00:00.000000001', + '1970-01-01 00:00:00.000000002', + '1970-01-01 00:00:00.000000003', + '1970-01-01 00:00:00.000000004'], + dtype='datetime64[ns]', name='a') """ def __init__( @@ -1899,12 +1901,13 @@ def ceil(self, freq): Examples -------- >>> import cudf - >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", - ... "1999-12-31 18:40:00"]) + >>> gIndex = cudf.DatetimeIndex([ + ... "2020-05-31 08:05:42", + ... "1999-12-31 18:40:30", + ... 
]) >>> gIndex.ceil("T") - DatetimeIndex(['2020-05-31 08:00:00', '1999-12-31 18:40:00'], - dtype='datetime64[ns]', freq=None) - """ + DatetimeIndex(['2020-05-31 08:06:00', '1999-12-31 18:41:00'], dtype='datetime64[ns]') + """ # noqa: E501 out_column = self._values.ceil(freq) return self.__class__._from_data({self.name: out_column}) @@ -1930,12 +1933,13 @@ def floor(self, freq): Examples -------- >>> import cudf - >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:59:59" - ... ,"1999-12-31 18:44:59"]) + >>> gIndex = cudf.DatetimeIndex([ + ... "2020-05-31 08:59:59", + ... "1999-12-31 18:44:59", + ... ]) >>> gIndex.floor("T") - DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], - dtype='datetime64[ns]', freq=None) - """ + DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], dtype='datetime64[ns]') + """ # noqa: E501 out_column = self._values.floor(freq) return self.__class__._from_data({self.name: out_column}) @@ -1967,21 +1971,14 @@ def round(self, freq): ... "2001-01-01 00:05:04", ... ], dtype="datetime64[ns]") >>> dt_idx - DatetimeIndex(['2001-01-01 00:04:45', - '2001-01-01 00:05:04', - '2001-01-01 00:04:58'], - dtype='datetime64[ns]', freq=None) + DatetimeIndex(['2001-01-01 00:04:45', '2001-01-01 00:04:58', + '2001-01-01 00:05:04'], + dtype='datetime64[ns]') >>> dt_idx.round('H') - DatetimeIndex(['2001-01-01', - '2001-01-01', - '2001-01-01'], - dtype='datetime64[ns]', freq=None) + DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01'], dtype='datetime64[ns]') >>> dt_idx.round('T') - DatetimeIndex(['2001-01-01 00:05:00', - '2001-01-01 00:05:00', - '2001-01-01 00:05:00'], - dtype='datetime64[ns]', freq=None) - """ + DatetimeIndex(['2001-01-01 00:05:00', '2001-01-01 00:05:00', '2001-01-01 00:05:00'], dtype='datetime64[ns]') + """ # noqa: E501 out_column = self._values.round(freq) return self.__class__._from_data({self.name: out_column}) @@ -2018,14 +2015,15 @@ class TimedeltaIndex(GenericIndex): -------- >>> import cudf >>> cudf.TimedeltaIndex([1132223, 2023232, 342234324, 4234324], - ... dtype='timedelta64[ns]') - TimedeltaIndex(['00:00:00.001132', '00:00:00.002023', '00:00:00.342234', - '00:00:00.004234'], - dtype='timedelta64[ns]') - >>> cudf.TimedeltaIndex([1, 2, 3, 4], dtype='timedelta64[s]', + ... dtype="timedelta64[ns]") + TimedeltaIndex(['0 days 00:00:00.001132223', '0 days 00:00:00.002023232', + '0 days 00:00:00.342234324', '0 days 00:00:00.004234324'], + dtype='timedelta64[ns]') + >>> cudf.TimedeltaIndex([1, 2, 3, 4], dtype="timedelta64[s]", ... name="delta-index") - TimedeltaIndex(['00:00:01', '00:00:02', '00:00:03', '00:00:04'], - dtype='timedelta64[s]', name='delta-index') + TimedeltaIndex(['0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03', + '0 days 00:00:04'], + dtype='timedelta64[s]', name='delta-index') """ def __init__( @@ -2154,11 +2152,11 @@ class CategoricalIndex(GenericIndex): >>> import pandas as pd >>> cudf.CategoricalIndex( ... data=[1, 2, 3, 4], categories=[1, 2], ordered=False, name="a") - CategoricalIndex([1, 2, , ], categories=[1, 2], ordered=False, name='a', dtype='category', name='a') + CategoricalIndex([1, 2, , ], categories=[1, 2], ordered=False, dtype='category', name='a') >>> cudf.CategoricalIndex( ... 
data=[1, 2, 3, 4], dtype=pd.CategoricalDtype([1, 2, 3]), name="a") - CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, name='a', dtype='category', name='a') + CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, dtype='category', name='a') """ # noqa: E501 def __init__( @@ -2449,9 +2447,7 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): >>> import cudf >>> import pandas as pd >>> cudf.IntervalIndex.from_breaks([0, 1, 2, 3]) - IntervalIndex([(0, 1], (1, 2], (2, 3]], - closed='right', - dtype='interval[int64]') + IntervalIndex([(0, 1], (1, 2], (2, 3]], dtype='interval') """ if copy: breaks = column.as_column(breaks, dtype=dtype).copy() diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b333c862f21..3acc947c649 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -190,7 +190,7 @@ def rename(self, names, inplace=False): Renaming each levels of a MultiIndex to specified name: >>> midx = cudf.MultiIndex.from_product( - [('A', 'B'), (2020, 2021)], names=['c1', 'c2']) + ... [('A', 'B'), (2020, 2021)], names=['c1', 'c2']) >>> midx.rename(['lv1', 'lv2']) MultiIndex([('A', 2020), ('A', 2021), @@ -1086,7 +1086,7 @@ def values(self): [4, 2], [5, 1]]) >>> type(midx.values) - + """ return self.to_frame(index=False).values @@ -1587,13 +1587,13 @@ def get_loc(self, key, method=None, tolerance=None): -------- >>> import cudf >>> mi = cudf.MultiIndex.from_tuples( - [('a', 'd'), ('b', 'e'), ('b', 'f')]) + ... [('a', 'd'), ('b', 'e'), ('b', 'f')]) >>> mi.get_loc('b') slice(1, 3, None) >>> mi.get_loc(('b', 'e')) 1 >>> non_monotonic_non_unique_idx = cudf.MultiIndex.from_tuples( - [('c', 'd'), ('b', 'e'), ('a', 'f'), ('b', 'e')]) + ... [('c', 'd'), ('b', 'e'), ('a', 'f'), ('b', 'e')]) >>> non_monotonic_non_unique_idx.get_loc('b') # differ from pandas slice(1, 4, 2) @@ -1609,10 +1609,10 @@ def get_loc(self, key, method=None, tolerance=None): >>> import pandas as pd >>> import cudf - >>> x = pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), - (1, 1, 1), (1, 1, 1), (2, 2, 1)] - ) + >>> x = pd.MultiIndex.from_tuples([ + ... (2, 1, 1), (1, 2, 3), (1, 2, 1), + ... (1, 1, 1), (1, 1, 1), (2, 2, 1), + ... 
]) >>> x.get_loc(1) array([False, True, True, True, True, False]) >>> cudf.from_pandas(x).get_loc(1) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 1733a6c0b9a..68113cfdca9 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -891,7 +891,7 @@ def pivot(data, index=None, columns=None, values=None): Examples -------- >>> a = cudf.DataFrame() - >>> a['a'] = [1, 1, 2, 2], + >>> a['a'] = [1, 1, 2, 2] >>> a['b'] = ['a', 'b', 'a', 'b'] >>> a['c'] = [1, 2, 3, 4] >>> a.pivot(index='a', columns='b') @@ -973,6 +973,7 @@ def unstack(df, level, fill_value=None): Examples -------- + >>> df = cudf.DataFrame() >>> df['a'] = [1, 1, 1, 2, 2] >>> df['b'] = [1, 2, 3, 1, 2] >>> df['c'] = [5, 6, 7, 8, 9] diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 787b28e213c..37bb8e32c5a 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -32,7 +32,7 @@ class Scalar(object): >>> cudf.Scalar(42, dtype='int64') + np.int8(21) Scalar(63, dtype=int64) >>> x = cudf.Scalar(42, dtype='datetime64[s]') - >>> y = cudf.Scalar(21, dtype='timedelta64[ns]) + >>> y = cudf.Scalar(21, dtype='timedelta64[ns]') >>> x - y Scalar(1970-01-01T00:00:41.999999979, dtype=datetime64[ns]) >>> cudf.Series([1,2,3]) + cudf.Scalar(1) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 7da3bdbb31e..6842a05a505 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -512,13 +512,26 @@ def from_pandas(cls, s, nan_as_null=None): @property def dt(self): """ - Accessor object for datetimelike properties of the Series values. + Accessor object for datetime-like properties of the Series values. Examples -------- + >>> s = cudf.Series(cudf.date_range( + ... start='2001-02-03 12:00:00', + ... end='2001-02-03 14:00:00', + ... freq='1H')) >>> s.dt.hour + 0 12 + 1 13 + dtype: int16 >>> s.dt.second + 0 0 + 1 0 + dtype: int16 >>> s.dt.day + 0 3 + 1 3 + dtype: int16 Returns ------- @@ -674,10 +687,12 @@ def drop( y 3 2 x 4 y 5 + dtype: int64 >>> s.drop(labels='y', level=1) 0 x 0 1 x 2 2 x 4 + Name: 2, dtype: int64 """ if labels is not None: if index is not None or columns is not None: @@ -1032,7 +1047,7 @@ def memory_usage(self, index=True, deep=False): -------- >>> s = cudf.Series(range(3), index=['a','b','c']) >>> s.memory_usage() - 48 + 43 Not including the index gives the size of the rest of the data, which is necessarily smaller: @@ -1539,7 +1554,7 @@ def dropna(self, axis=0, inplace=False, how=None): >>> ser 0 1 1 2 - 2 null + 2 dtype: int64 Drop null values from a Series. 
@@ -1800,7 +1815,7 @@ def data(self): 3 4 dtype: int64 >>> series.data - + >>> series.data.to_host_array() array([1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) @@ -1824,14 +1839,7 @@ def as_mask(self): >>> import cudf >>> s = cudf.Series([True, False, True]) >>> s.as_mask() - - >>> s.as_mask().to_host_array() - array([ 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 181, 164, - 188, 1, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, - 127, 253, 214, 62, 241, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - dtype=uint8) + """ if not is_bool_dtype(self.dtype): raise TypeError( @@ -2805,11 +2813,11 @@ def autocorr(self, lag=1): Examples -------- >>> import cudf - >>> s = cudf.Series([0.25, 0.5, 0.2, -0.05]) + >>> s = cudf.Series([0.25, 0.5, 0.2, -0.05, 0.17]) >>> s.autocorr() - 0.10355263309024071 + 0.1438853844... >>> s.autocorr(lag=2) - -0.9999999999999999 + -0.9647548490... """ return self.corr(self.shift(lag)) @@ -3584,7 +3592,7 @@ def keys(self): dtype: int64 >>> sr.keys() - RangeIndex(start=0, stop=6) + RangeIndex(start=0, stop=6, step=1) >>> sr = cudf.Series(['a', 'b', 'c']) >>> sr 0 a @@ -3592,7 +3600,7 @@ def keys(self): 2 c dtype: object >>> sr.keys() - RangeIndex(start=0, stop=3) + RangeIndex(start=0, stop=3, step=1) >>> sr = cudf.Series([1, 2, 3], index=['a', 'b', 'c']) >>> sr a 1 diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 15426d0173a..62c31691ac1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -397,10 +397,10 @@ class DateOffset: -------- >>> from cudf import DateOffset >>> ts = cudf.Series([ - "2000-01-01 00:00:00.012345678", - "2000-01-31 00:00:00.012345678", - "2000-02-29 00:00:00.012345678", - ], dtype='datetime64[ns]) + ... "2000-01-01 00:00:00.012345678", + ... "2000-01-31 00:00:00.012345678", + ... "2000-02-29 00:00:00.012345678", + ... ], dtype='datetime64[ns]') >>> ts + DateOffset(months=3) 0 2000-04-01 00:00:00.012345678 1 2000-04-30 00:00:00.012345678 diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 372587ba677..6171f20929d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -746,18 +746,31 @@ def test_index_astype(nelem): np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) -def test_dataframe_to_string(): - pd.options.display.max_rows = 5 - pd.options.display.max_columns = 8 - # Test basic +def test_dataframe_to_string_with_skipped_rows(): + # Test skipped rows df = cudf.DataFrame( {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} ) - string = str(df) - assert string.splitlines()[-1] == "[6 rows x 2 columns]" + with pd.option_context("display.max_rows", 5): + got = df.to_string() + + expect = textwrap.dedent( + """\ + a b + 0 1 11 + 1 2 12 + .. .. .. + 4 5 15 + 5 6 16 + + [6 rows x 2 columns]""" + ) + assert got == expect - # Test skipped columns + +def test_dataframe_to_string_with_skipped_rows_and_columns(): + # Test skipped rows and skipped columns df = cudf.DataFrame( { "a": [1, 2, 3, 4, 5, 6], @@ -766,11 +779,26 @@ def test_dataframe_to_string(): "d": [11, 12, 13, 14, 15, 16], } ) - string = df.to_string() - assert string.splitlines()[-1] == "[6 rows x 4 columns]" + with pd.option_context("display.max_rows", 5, "display.max_columns", 3): + got = df.to_string() + + expect = textwrap.dedent( + """\ + a ... d + 0 1 ... 
11 + 1 2 ... 12 + .. .. ... .. + 4 5 ... 15 + 5 6 ... 16 + + [6 rows x 4 columns]""" + ) + assert got == expect + - # Test masked +def test_dataframe_to_string_with_masked_data(): + # Test masked data df = cudf.DataFrame( {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} ) @@ -783,34 +811,33 @@ def test_dataframe_to_string(): assert masked.null_count == 2 df["c"] = masked - # check data + # Check data values = masked.copy() validids = [0, 2, 3, 5] densearray = masked.dropna().to_numpy() np.testing.assert_equal(data[validids], densearray) - # valid position is correct - + # Valid position is correct for i in validids: assert data[i] == values[i] - # null position is correct + # Null position is correct for i in range(len(values)): if i not in validids: assert values[i] is cudf.NA - pd.options.display.max_rows = 10 - got = df.to_string() + with pd.option_context("display.max_rows", 10): + got = df.to_string() - expect = """ -a b c -0 1 11 0 -1 2 12 -2 3 13 2 -3 4 14 3 -4 5 15 -5 6 16 5 -""" - # values should match despite whitespace difference - assert got.split() == expect.split() + expect = textwrap.dedent( + """\ + a b c + 0 1 11 0 + 1 2 12 + 2 3 13 2 + 3 4 14 3 + 4 5 15 + 5 6 16 5""" + ) + assert got == expect def test_dataframe_to_string_wide(monkeypatch): diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py new file mode 100644 index 00000000000..05d6886c297 --- /dev/null +++ b/python/cudf/cudf/tests/test_doctests.py @@ -0,0 +1,102 @@ +import contextlib +import doctest +import inspect +import io +import os + +import numpy as np +import pytest + +import cudf + + +def _name_in_all(parent, name): + return name in getattr(parent, "__all__", []) + + +def _is_public_name(parent, name): + return not name.startswith("_") + + +def _find_doctests_in_obj(obj, finder=None, criteria=None): + """Find all doctests in an object. + + Parameters + ---------- + obj : module or class + The object to search for docstring examples. + finder : doctest.DocTestFinder, optional + The DocTestFinder object to use. If not provided, a DocTestFinder is + constructed. + criteria : callable, optional + Callable indicating whether to recurse over members of the provided + object. If not provided, names not defined in the object's ``__all__`` + property are ignored. + + Yields + ------ + doctest.DocTest + The next doctest found in the object. + """ + if finder is None: + finder = doctest.DocTestFinder() + if criteria is None: + criteria = _name_in_all + for docstring in finder.find(obj): + if docstring.examples: + yield docstring + for name, member in inspect.getmembers(obj): + # Only recurse over members matching the criteria + if not criteria(obj, name): + continue + # Recurse over the public API of modules (objects defined in the + # module's __all__) + if inspect.ismodule(member): + yield from _find_doctests_in_obj( + member, finder, criteria=_name_in_all + ) + # Recurse over the public API of classes (attributes not prefixed with + # an underscore) + if inspect.isclass(member): + yield from _find_doctests_in_obj( + member, finder, criteria=_is_public_name + ) + + +class TestDoctests: + @pytest.fixture(autouse=True) + def chdir_to_tmp_path(cls, tmp_path): + # Some doctests generate files, so this fixture runs the tests in a + # temporary directory. 
+ original_directory = os.getcwd() + os.chdir(tmp_path) + yield + os.chdir(original_directory) + + @pytest.mark.parametrize( + "docstring", + _find_doctests_in_obj(cudf), + ids=lambda docstring: docstring.name, + ) + def test_docstring(self, docstring): + # We ignore differences in whitespace in the doctest output, and enable + # the use of an ellipsis "..." to match any string in the doctest + # output. An ellipsis is useful for, e.g., memory addresses or + # imprecise floating point values. + optionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE + runner = doctest.DocTestRunner(optionflags=optionflags) + + # These global names are pre-defined and can be used in doctests + # without first importing them. + globals = dict(cudf=cudf, np=np,) + docstring.globs = globals + + # Capture stdout and include failing outputs in the traceback. + doctest_stdout = io.StringIO() + with contextlib.redirect_stdout(doctest_stdout): + runner.run(docstring) + results = runner.summarize() + assert not results.failed, ( + f"{results.failed} of {results.attempted} doctests failed for " + f"{docstring.name}:\n{doctest_stdout.getvalue()}" + ) diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 7a4a2673f9b..2fcf996b641 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -225,13 +225,13 @@ def wrapper(func): 2 2010-01-01 dtype: datetime64[s] >>> s.describe() - count 3 - mean 2006-09-01 08:00:00.000000000 - min 2000-01-01 00:00:00.000000000 - 25% 2004-12-31 12:00:00.000000000 - 50% 2010-01-01 00:00:00.000000000 - 75% 2010-01-01 00:00:00.000000000 - max 2010-01-01 00:00:00.000000000 + count 3 + mean 2006-09-01 08:00:00 + min 2000-01-01 00:00:00 + 25% 2004-12-31 12:00:00 + 50% 2010-01-01 00:00:00 + 75% 2010-01-01 00:00:00 + max 2010-01-01 00:00:00 dtype: object Describing a ``DataFrame``. By default only numeric fields are diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index b881f9372bc..6f958860dad 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -972,9 +972,9 @@ >>> import cudf >>> filename = 'foo.csv' >>> df = cudf.DataFrame({'x': [0, 1, 2, 3], - 'y': [1.0, 3.3, 2.2, 4.4], - 'z': ['a', 'b', 'c', 'd']}) ->>> df = df.set_index([3, 2, 1, 0]) +... 'y': [1.0, 3.3, 2.2, 4.4], +... 'z': ['a', 'b', 'c', 'd']}) +>>> df = df.set_index(cudf.Series([3, 2, 1, 0])) >>> df.to_csv(filename) """ From 7ff5f128bec185c40017bab20c08f1342fa6b74e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sat, 15 Jan 2022 14:12:12 -0700 Subject: [PATCH 22/23] Support structs for `cudf::contains` with column/scalar input (#9929) This PR adds support for `cudf::contains` so we can check whether a structs column contains a scalar struct element. Partially addresses #8965. This does not support checking if structs given in a structs column exist in another structs column. Such cases will be supported when the new data structure mentioned in https://github.com/rapidsai/cudf/issues/9413 is merged into cudf. 
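As a quick orientation before the diff: a minimal sketch of the new capability,
adapted from the tests added in this patch. The header paths and test-utility
wrapper types are the ones the patch itself uses; the function name is
hypothetical and the snippet is illustrative, not part of the PR.

    #include <cudf/scalar/scalar.hpp>
    #include <cudf/search.hpp>
    #include <cudf_test/column_wrapper.hpp>

    #include <vector>

    bool structs_column_contains_example()
    {
      using int_col = cudf::test::fixed_width_column_wrapper<int32_t>;
      using str_col = cudf::test::strings_column_wrapper;

      // A structs column with three rows: {1, "x"}, {2, "y"}, {3, "z"}.
      auto child1    = int_col{1, 2, 3};
      auto child2    = str_col{"x", "y", "z"};
      auto const col = cudf::test::structs_column_wrapper{{child1, child2}};

      // A struct scalar holding the single element {2, "y"}; its children must
      // match the column's children in number and type.
      auto key1      = int_col{2};
      auto key2      = str_col{"y"};
      auto const val = cudf::struct_scalar(std::vector<cudf::column_view>{key1, key2});

      return cudf::contains(col, val);  // expected: true
    }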
Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/9929
---
 cpp/src/search/search.cu                |  66 +++++--
 cpp/tests/search/search_struct_test.cpp | 241 +++++++++++++++++++++++-
 2 files changed, 288 insertions(+), 19 deletions(-)

diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu
index 241b3c595f1..81ed3cfbd51 100644
--- a/cpp/src/search/search.cu
+++ b/cpp/src/search/search.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -173,11 +173,56 @@ bool contains_scalar_dispatch::operator()(column_view const&,
 }
 
 template <>
-bool contains_scalar_dispatch::operator()<cudf::struct_view>(column_view const&,
-                                                             scalar const&,
-                                                             rmm::cuda_stream_view)
+bool contains_scalar_dispatch::operator()<cudf::struct_view>(column_view const& col,
+                                                             scalar const& value,
+                                                             rmm::cuda_stream_view stream)
 {
-  CUDF_FAIL("struct_view type not supported yet");
+  CUDF_EXPECTS(col.type() == value.type(), "scalar and column types must match");
+
+  auto const scalar_table = static_cast<struct_scalar const*>(&value)->view();
+  CUDF_EXPECTS(col.num_children() == scalar_table.num_columns(),
+               "struct scalar and structs column must have the same number of children");
+  for (size_type i = 0; i < col.num_children(); ++i) {
+    CUDF_EXPECTS(col.child(i).type() == scalar_table.column(i).type(),
+                 "scalar and column children types must match");
+  }
+
+  // Prepare to flatten the structs column and scalar.
+  auto const has_null_elements =
+    has_nested_nulls(table_view{std::vector<column_view>{col.child_begin(), col.child_end()}}) ||
+    has_nested_nulls(scalar_table);
+  auto const flatten_nullability = has_null_elements
+                                     ? structs::detail::column_nullability::FORCE
+                                     : structs::detail::column_nullability::MATCH_INCOMING;
+
+  // Flatten the input structs column, only materialize the bitmask if there is null in the input.
+  auto const col_flattened =
+    structs::detail::flatten_nested_columns(table_view{{col}}, {}, {}, flatten_nullability);
+  auto const val_flattened =
+    structs::detail::flatten_nested_columns(scalar_table, {}, {}, flatten_nullability);
+
+  // The struct scalar only contains the struct member columns.
+  // Thus, if there is any null in the input, we must exclude the first column in the flattened
+  // table of the input column from searching because that column is the materialized bitmask of
+  // the input structs column.
+  auto const col_flattened_content  = col_flattened.flattened_columns();
+  auto const col_flattened_children = table_view{std::vector<column_view>{
+    col_flattened_content.begin() + static_cast<size_type>(has_null_elements),
+    col_flattened_content.end()}};
+
+  auto const d_col_children_ptr = table_device_view::create(col_flattened_children, stream);
+  auto const d_val_ptr          = table_device_view::create(val_flattened, stream);
+
+  auto const start_iter = thrust::make_counting_iterator<size_type>(0);
+  auto const end_iter   = start_iter + col.size();
+  auto const comp       = row_equality_comparator(
+    nullate::DYNAMIC{has_null_elements}, *d_col_children_ptr, *d_val_ptr, null_equality::EQUAL);
+  auto const found_iter = thrust::find_if(
+    rmm::exec_policy(stream), start_iter, end_iter, [comp] __device__(auto const idx) {
+      return comp(idx, 0);  // compare col[idx] == val[0].
+ }); + + return found_iter != end_iter; } template <> @@ -203,7 +248,6 @@ namespace detail { bool contains(column_view const& col, scalar const& value, rmm::cuda_stream_view stream) { if (col.is_empty()) { return false; } - if (not value.is_valid(stream)) { return col.has_nulls(); } return cudf::type_dispatcher(col.type(), contains_scalar_dispatch{}, col, value, stream); @@ -264,20 +308,14 @@ struct multi_contains_dispatch { template <> std::unique_ptr multi_contains_dispatch::operator()( - column_view const& haystack, - column_view const& needles, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + column_view const&, column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) { CUDF_FAIL("list_view type not supported"); } template <> std::unique_ptr multi_contains_dispatch::operator()( - column_view const& haystack, - column_view const& needles, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + column_view const&, column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) { CUDF_FAIL("struct_view type not supported"); } diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index db2ecb89d6a..a1f0b1d81cf 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -35,15 +36,14 @@ constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_leve constexpr int32_t null{0}; // Mark for null child elements constexpr int32_t XXX{0}; // Mark for null struct elements -template -struct TypedStructSearchTest : public cudf::test::BaseFixture { -}; - using TestTypes = cudf::test::Concat; +template +struct TypedStructSearchTest : public cudf::test::BaseFixture { +}; TYPED_TEST_SUITE(TypedStructSearchTest, TestTypes); namespace { @@ -353,3 +353,234 @@ TYPED_TEST(TypedStructSearchTest, ComplexStructTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), verbosity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), verbosity); } + +template +struct TypedScalarStructContainTest : public cudf::test::BaseFixture { +}; +TYPED_TEST_SUITE(TypedScalarStructContainTest, TestTypes); + +TYPED_TEST(TypedScalarStructContainTest, EmptyInputTest) +{ + using col_wrapper = cudf::test::fixed_width_column_wrapper; + + auto const col = [] { + auto child = col_wrapper{}; + return structs_col{{child}}; + }(); + + auto const val = [] { + auto child = col_wrapper{1}; + return cudf::struct_scalar(std::vector{child}); + }(); + + EXPECT_EQ(false, cudf::contains(col, val)); +} + +TYPED_TEST(TypedScalarStructContainTest, TrivialInputTests) +{ + using col_wrapper = cudf::test::fixed_width_column_wrapper; + + auto const col = [] { + auto child1 = col_wrapper{1, 2, 3}; + auto child2 = col_wrapper{4, 5, 6}; + auto child3 = strings_col{"x", "y", "z"}; + return structs_col{{child1, child2, child3}}; + }(); + + auto const val1 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{"x"}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + auto const val2 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = 
+    return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+  }();
+
+  EXPECT_EQ(true, cudf::contains(col, val1));
+  EXPECT_EQ(false, cudf::contains(col, val2));
+}
+
+TYPED_TEST(TypedScalarStructContainTest, SlicedColumnInputTests)
+{
+  using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+
+  constexpr int32_t dont_care{0};
+
+  auto const col_original = [] {
+    auto child1 = col_wrapper{dont_care, dont_care, 1, 2, 3, dont_care};
+    auto child2 = col_wrapper{dont_care, dont_care, 4, 5, 6, dont_care};
+    auto child3 = strings_col{"dont_care", "dont_care", "x", "y", "z", "dont_care"};
+    return structs_col{{child1, child2, child3}};
+  }();
+  auto const col = cudf::slice(col_original, {2, 5})[0];
+
+  auto const val1 = [] {
+    auto child1 = col_wrapper{1};
+    auto child2 = col_wrapper{4};
+    auto child3 = strings_col{"x"};
+    return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+  }();
+  auto const val2 = [] {
+    auto child1 = col_wrapper{dont_care};
+    auto child2 = col_wrapper{dont_care};
+    auto child3 = strings_col{"dont_care"};
+    return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+  }();
+
+  EXPECT_EQ(true, cudf::contains(col, val1));
+  EXPECT_EQ(false, cudf::contains(col, val2));
+}
+
+TYPED_TEST(TypedScalarStructContainTest, SimpleInputWithNullsTests)
+{
+  using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+
+  constexpr int32_t null{0};
+
+  // Test with nulls at the top level.
+  {
+    auto const col = [] {
+      auto child1 = col_wrapper{1, null, 3};
+      auto child2 = col_wrapper{4, null, 6};
+      auto child3 = strings_col{"x", "" /*NULL*/, "z"};
+      return structs_col{{child1, child2, child3}, null_at(1)};
+    }();
+
+    auto const val1 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{"x"};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+    auto const val2 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{"a"};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+
+    EXPECT_EQ(true, cudf::contains(col, val1));
+    EXPECT_EQ(false, cudf::contains(col, val2));
+  }
+
+  // Test with nulls at the children level.
+  {
+    auto const col = [] {
+      auto child1 = col_wrapper{{1, null, 3}, null_at(1)};
+      auto child2 = col_wrapper{{4, null, 6}, null_at(1)};
+      auto child3 = strings_col{{"" /*NULL*/, "y", "z"}, null_at(0)};
+      return structs_col{{child1, child2, child3}};
+    }();
+
+    auto const val1 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{{"" /*NULL*/}, null_at(0)};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+    auto const val2 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{""};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+
+    EXPECT_EQ(true, cudf::contains(col, val1));
+    EXPECT_EQ(false, cudf::contains(col, val2));
+  }
+
+  // Test with nulls in the input scalar.
+  {
+    auto const col = [] {
+      auto child1 = col_wrapper{1, 2, 3};
+      auto child2 = col_wrapper{4, 5, 6};
+      auto child3 = strings_col{"x", "y", "z"};
+      return structs_col{{child1, child2, child3}};
+    }();
+
+    auto const val1 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{"x"};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+    auto const val2 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{{"" /*NULL*/}, null_at(0)};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+
+    EXPECT_EQ(true, cudf::contains(col, val1));
+    EXPECT_EQ(false, cudf::contains(col, val2));
+  }
+}
+
+TYPED_TEST(TypedScalarStructContainTest, SlicedInputWithNullsTests)
+{
+  using col_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+
+  constexpr int32_t dont_care{0};
+  constexpr int32_t null{0};
+
+  // Test with nulls at the top level.
+  {
+    auto const col_original = [] {
+      auto child1 = col_wrapper{dont_care, dont_care, 1, null, 3, dont_care};
+      auto child2 = col_wrapper{dont_care, dont_care, 4, null, 6, dont_care};
+      auto child3 = strings_col{"dont_care", "dont_care", "x", "" /*NULL*/, "z", "dont_care"};
+      return structs_col{{child1, child2, child3}, null_at(3)};
+    }();
+    auto const col = cudf::slice(col_original, {2, 5})[0];
+
+    auto const val1 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{"x"};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+    auto const val2 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{"a"};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+
+    EXPECT_EQ(true, cudf::contains(col, val1));
+    EXPECT_EQ(false, cudf::contains(col, val2));
+  }
+
+  // Test with nulls at the children level.
+  {
+    auto const col_original = [] {
+      auto child1 =
+        col_wrapper{{dont_care, dont_care /*also NULL*/, 1, null, 3, dont_care}, null_at(3)};
+      auto child2 =
+        col_wrapper{{dont_care, dont_care /*also NULL*/, 4, null, 6, dont_care}, null_at(3)};
+      auto child3 = strings_col{
+        {"dont_care", "dont_care" /*also NULL*/, "" /*NULL*/, "y", "z", "dont_care"}, null_at(2)};
+      return structs_col{{child1, child2, child3}, null_at(1)};
+    }();
+    auto const col = cudf::slice(col_original, {2, 5})[0];
+
+    auto const val1 = [] {
+      auto child1 = col_wrapper{1};
+      auto child2 = col_wrapper{4};
+      auto child3 = strings_col{{"x"}, null_at(0)};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+    auto const val2 = [] {
+      auto child1 = col_wrapper{dont_care};
+      auto child2 = col_wrapper{dont_care};
+      auto child3 = strings_col{"dont_care"};
+      return cudf::struct_scalar(std::vector<cudf::column_view>{child1, child2, child3});
+    }();
+
+    EXPECT_EQ(true, cudf::contains(col, val1));
+    EXPECT_EQ(false, cudf::contains(col, val2));
+  }
+}

From e4a16ae2550f5a7481887b28c7a60fc14fea2f5c Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Tue, 18 Jan 2022 06:31:14 -0800
Subject: [PATCH 23/23] Implement mixed equality/conditional joins (#9917)

This PR implements mixed equality/inequality joins for inner, left, and full
joins. This resolves #9696 and contributes to #5401. For the moment, all APIs
are functional only, but an object-oriented API is planned to support caching
of the hash table.
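Before the header changes below, a minimal usage sketch of the new functional
API. The tables, the column index 0, the GREATER operator, and the function
name are arbitrary illustrative choices, not part of the PR; the AST types are
the same ones the existing conditional joins use.

    #include <cudf/ast/expressions.hpp>
    #include <cudf/join.hpp>
    #include <cudf/table/table_view.hpp>
    #include <cudf/types.hpp>

    #include <rmm/device_uvector.hpp>

    #include <memory>
    #include <utility>

    // Joins rows where the (single-column) equality keys match AND
    // left_conditional.col0 > right_conditional.col0 holds. The equality and
    // conditional tables on each side must have the same number of rows.
    std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
              std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
    example_mixed_join(cudf::table_view const& left_equality,
                       cudf::table_view const& right_equality,
                       cudf::table_view const& left_conditional,
                       cudf::table_view const& right_conditional)
    {
      auto const left_ref  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
      auto const right_ref = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
      auto const predicate =
        cudf::ast::operation(cudf::ast::ast_operator::GREATER, left_ref, right_ref);

      return cudf::mixed_inner_join(
        left_equality, right_equality, left_conditional, right_conditional, predicate);
    }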
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Yunsong Wang (https://github.com/PointKernel) - Jason Lowe (https://github.com/jlowe) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/9917 --- cpp/CMakeLists.txt | 1 + .../cudf/ast/detail/expression_evaluator.cuh | 4 +- cpp/include/cudf/join.hpp | 267 +++++++- cpp/include/cudf/table/row_operators.cuh | 7 +- cpp/src/join/conditional_join.cu | 21 +- cpp/src/join/conditional_join_kernels.cuh | 6 +- cpp/src/join/hash_join.cu | 43 -- cpp/src/join/hash_join.cuh | 44 +- cpp/src/join/join_common_utils.cuh | 11 + cpp/src/join/join_common_utils.hpp | 9 + cpp/src/join/mixed_join.cu | 557 +++++++++++++++ cpp/src/join/mixed_join_kernels.cuh | 322 +++++++++ cpp/tests/CMakeLists.txt | 2 +- cpp/tests/join/conditional_join_tests.cu | 13 +- cpp/tests/join/mixed_join_tests.cu | 643 ++++++++++++++++++ 15 files changed, 1884 insertions(+), 66 deletions(-) create mode 100644 cpp/src/join/mixed_join.cu create mode 100644 cpp/src/join/mixed_join_kernels.cuh create mode 100644 cpp/tests/join/mixed_join_tests.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a8100fb3f92..2f51f582e12 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -322,6 +322,7 @@ add_library( src/jit/parser.cpp src/jit/type.cpp src/join/conditional_join.cu + src/join/mixed_join.cu src/join/cross_join.cu src/join/hash_join.cu src/join/join.cu diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh index 0b739482c4d..ecd46ec2c23 100644 --- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -429,7 +429,7 @@ struct expression_evaluator { __device__ __forceinline__ void evaluate( expression_result& output_object, cudf::size_type const row_index, - IntermediateDataType* thread_intermediate_storage) + IntermediateDataType* thread_intermediate_storage) const { evaluate(output_object, row_index, row_index, row_index, thread_intermediate_storage); } @@ -452,7 +452,7 @@ struct expression_evaluator { cudf::size_type const left_row_index, cudf::size_type const right_row_index, cudf::size_type const output_row_index, - IntermediateDataType* thread_intermediate_storage) + IntermediateDataType* thread_intermediate_storage) const { cudf::size_type operator_source_index{0}; for (cudf::size_type operator_index = 0; operator_index < plan.operators.size(); diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 8ea6bd1a6cc..30400074c50 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -701,7 +702,7 @@ conditional_inner_join( * The first returned vector contains all the row indices from the left * table (in unspecified order). The corresponding value in the * second returned vector is either (1) the row index of the matched row - * from the right table, if there is a match or (2) an unspecified + * from the right table, if there is a match or (2) an unspecified * out-of-bounds value. 
* * If the provided predicate returns NULL for a pair of rows @@ -858,6 +859,270 @@ std::unique_ptr> conditional_left_anti_join( std::optional output_size = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a pair of row index vectors corresponding to all pairs of + * rows between the specified tables where the columns of the equality table + * are equal and the predicate evaluates to true on the conditional tables. + * + * The first returned vector contains the row indices from the left + * table that have a match in the right table (in unspecified order). + * The corresponding values in the second returned vector are + * the matched row indices from the right table. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. It is the user's + * responsiblity to choose a suitable compare_nulls value AND use appropriate + * null-safe operators in the expression. + * + * If the provided output size or per-row counts are incorrect, behavior is undefined. + * + * @code{.pseudo} + * left_equality: {{0, 1, 2}} + * right_equality: {{1, 2, 3}} + * left_conditional: {{4, 4, 4}} + * right_conditional: {{3, 4, 5}} + * Expression: Left.Column_0 > Right.Column_0 + * Result: {{1}, {0}} + * @endcode + * + * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not + * match. + * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not + * match. + * + * @param left_equality The left table used for the equality join. + * @param right_equality The right table used for the equality join. + * @param left_conditional The left table used for the conditional join. + * @param right_conditional The right table used for the conditional join. + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether or not null values join to each other or not. + * @param output_size_data An optional pair of values indicating the exact output size and the + * number of matches for each row in the larger of the two input tables, left or right (may be + * precomputed using the corresponding mixed_inner_join_size API). + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a mixed inner join between the four input tables. + */ +std::pair>, + std::unique_ptr>> +mixed_inner_join( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional>> output_size_data = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a pair of row index vectors corresponding to all pairs of + * rows between the specified tables where the columns of the equality table + * are equal and the predicate evaluates to true on the conditional tables, + * or null matches for rows in left that have no match in right. + * + * The first returned vector contains the row indices from the left + * tables that have a match in the right tables (in unspecified order). 
+ * The corresponding value in the second returned vector is either (1)
+ * the row index of the matched row from the right tables, or (2) an
+ * unspecified out-of-bounds value.
+ *
+ * If the provided predicate returns NULL for a pair of rows
+ * (left, right), that pair is not included in the output. It is the user's
+ * responsibility to choose a suitable compare_nulls value AND use appropriate
+ * null-safe operators in the expression.
+ *
+ * If the provided output size or per-row counts are incorrect, behavior is undefined.
+ *
+ * @code{.pseudo}
+ * left_equality: {{0, 1, 2}}
+ * right_equality: {{1, 2, 3}}
+ * left_conditional: {{4, 4, 4}}
+ * right_conditional: {{3, 4, 5}}
+ * Expression: Left.Column_0 > Right.Column_0
+ * Result: {{0, 1, 2}, {None, 0, None}}
+ * @endcode
+ *
+ * @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
+ * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
+ * match.
+ * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
+ * match.
+ *
+ * @param left_equality The left table used for the equality join.
+ * @param right_equality The right table used for the equality join.
+ * @param left_conditional The left table used for the conditional join.
+ * @param right_conditional The right table used for the conditional join.
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether or not null values join to each other.
+ * @param output_size_data An optional pair of values indicating the exact output size and the
+ * number of matches for each row in the larger of the two input tables, left or right (may be
+ * precomputed using the corresponding mixed_left_join_size API).
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+ * the result of performing a mixed left join between the four input tables.
+ */
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+mixed_left_join(
+  table_view const& left_equality,
+  table_view const& right_equality,
+  table_view const& left_conditional,
+  table_view const& right_conditional,
+  ast::expression const& binary_predicate,
+  null_equality compare_nulls = null_equality::EQUAL,
+  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns a pair of row index vectors corresponding to all pairs of
+ * rows between the specified tables where the columns of the equality table
+ * are equal and the predicate evaluates to true on the conditional tables,
+ * or null matches for rows in either pair of tables that have no matches in
+ * the other pair.
+ *
+ * Taken pairwise, the values from the returned vectors are one of:
+ * (1) row indices corresponding to matching rows from the left and
+ * right tables, (2) a row index and an unspecified out-of-bounds value,
+ * representing a row from one table without a match in the other.
+ *
+ * If the provided predicate returns NULL for a pair of rows
+ * (left, right), that pair is not included in the output. It is the user's
+ * responsibility to choose a suitable compare_nulls value AND use appropriate
+ * null-safe operators in the expression.
+ *
+ * If the provided output size or per-row counts are incorrect, behavior is undefined.
+ *
+ * @code{.pseudo}
+ * left_equality: {{0, 1, 2}}
+ * right_equality: {{1, 2, 3}}
+ * left_conditional: {{4, 4, 4}}
+ * right_conditional: {{3, 4, 5}}
+ * Expression: Left.Column_0 > Right.Column_0
+ * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}}
+ * @endcode
+ *
+ * @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
+ * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
+ * match.
+ * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
+ * match.
+ *
+ * @param left_equality The left table used for the equality join.
+ * @param right_equality The right table used for the equality join.
+ * @param left_conditional The left table used for the conditional join.
+ * @param right_conditional The right table used for the conditional join.
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether or not null values join to each other.
+ * @param output_size_data An optional pair of values indicating the exact output size and the
+ * number of matches for each row in the larger of the two input tables, left or right (may be
+ * precomputed using the corresponding mixed_full_join_size API).
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
+ * the result of performing a mixed full join between the four input tables.
+ */
+std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+          std::unique_ptr<rmm::device_uvector<size_type>>>
+mixed_full_join(
+  table_view const& left_equality,
+  table_view const& right_equality,
+  table_view const& left_conditional,
+  table_view const& right_conditional,
+  ast::expression const& binary_predicate,
+  null_equality compare_nulls = null_equality::EQUAL,
+  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns the exact number of matches (rows) when performing a
+ * mixed inner join between the specified tables where the columns of the
+ * equality table are equal and the predicate evaluates to true on the
+ * conditional tables.
+ *
+ * If the provided predicate returns NULL for a pair of rows (left, right),
+ * that pair is not included in the output. It is the user's responsibility to
+ * choose a suitable compare_nulls value AND use appropriate null-safe
+ * operators in the expression.
+ *
+ * @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
+ * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
+ * match.
+ * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
+ * match.
+ *
+ * @param left_equality The left table used for the equality join.
+ * @param right_equality The right table used for the equality join.
+ * @param left_conditional The left table used for the conditional join.
+ * @param right_conditional The right table used for the conditional join.
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether or not null values join to each other.
+ * @param output_size An optional pair of values indicating the exact output size and the number of
+ * matches for each row in the larger of the two input tables, left or right (may be precomputed
+ * using the corresponding mixed_inner_join_size API).
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair containing the size that would result from performing the
+ * requested join and the number of matches for each row in one of the two
+ * tables. Which of the two tables is an implementation detail and should not
+ * be relied upon; simply pass it to the corresponding `mixed_inner_join` API
+ * as is.
+ */
+std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_inner_join_size(
+  table_view const& left_equality,
+  table_view const& right_equality,
+  table_view const& left_conditional,
+  table_view const& right_conditional,
+  ast::expression const& binary_predicate,
+  null_equality compare_nulls = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Returns the exact number of matches (rows) when performing a
+ * mixed left join between the specified tables where the columns of the
+ * equality table are equal and the predicate evaluates to true on the
+ * conditional tables.
+ *
+ * If the provided predicate returns NULL for a pair of rows (left, right),
+ * that pair is not included in the output. It is the user's responsibility to
+ * choose a suitable compare_nulls value AND use appropriate null-safe
+ * operators in the expression.
+ *
+ * @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
+ * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
+ * match.
+ * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
+ * match.
+ *
+ * @param left_equality The left table used for the equality join.
+ * @param right_equality The right table used for the equality join.
+ * @param left_conditional The left table used for the conditional join.
+ * @param right_conditional The right table used for the conditional join.
+ * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether or not null values join to each other.
+ * @param output_size An optional pair of values indicating the exact output size and the number of
+ * matches for each row in the larger of the two input tables, left or right (may be precomputed
+ * using the corresponding mixed_inner_join_size API).
+ * @param mr Device memory resource used to allocate the returned table and columns' device memory
+ *
+ * @return A pair containing the size that would result from performing the
+ * requested join and the number of matches for each row in one of the two
+ * tables. Which of the two tables is an implementation detail and should not
+ * be relied upon; simply pass it to the corresponding `mixed_left_join` API
+ * as is.
+ */
+std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_join_size(
+  table_view const& left_equality,
+  table_view const& right_equality,
+  table_view const& left_conditional,
+  table_view const& right_conditional,
+  ast::expression const& binary_predicate,
+  null_equality compare_nulls = null_equality::EQUAL,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Returns the exact number of matches (rows) when performing a
  * conditional inner join between the specified tables where the predicate
diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index 32ddd1ef49a..a3b08fda15d 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -492,8 +492,11 @@ template