From 784814759aae431ad0696967d81fbeba51705a7b Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 22 Nov 2023 18:29:27 +0000
Subject: [PATCH 1/2] Improve memory footprint of isin by using contains

Previously, isin was implemented using an inner join between the
column we are searching (the haystack) and the values we are searching
for (the needles). This had a large memory footprint when there were
repeated needles (since that blows up the cardinality of the merge).

To fix this, note that we don't need to do a merge at all, since
libcudf provides a primitive (contains) to search for many needles in
a haystack. The only thing we must bear in mind is that
left.isin(right) is asking for the locations in left that match an
entry in right, whereas contains(haystack, needles) provides a bool
mask that selects needles that are in the haystack. To get the
behaviour we want, we therefore need to do contains(right, left) and
treat the values to search for as the haystack.

As well as having a much better memory footprint, this hash-based
search is significantly faster than the previous merge-based
approach.

While we are here, lower the memory footprint of MultiIndex.isin by
using a left-semi join (the implementation is separate from the isin
implementation on columns and looks a little more complicated to
unpick).

- Closes #14298
---
 python/cudf/cudf/core/column/column.py | 12 +++++-------
 python/cudf/cudf/core/multiindex.py    |  2 +-
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index b4f65693d85..cd537dbe9d3 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -916,13 +916,11 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
         Helper function for `isin` which merges `self` & `rhs`
         to determine what values of `rhs` exist in `self`.
         """
-        ldf = cudf.DataFrame({"x": self, "orig_order": arange(len(self))})
-        rdf = cudf.DataFrame(
-            {"x": rhs, "bool": full(len(rhs), True, dtype="bool")}
-        )
-        res = ldf.merge(rdf, on="x", how="left").sort_values(by="orig_order")
-        res = res.drop_duplicates(subset="orig_order", ignore_index=True)
-        return res._data["bool"].fillna(False)
+        # We've already matched dtypes by now
+        result = libcudf.search.contains(rhs, self)
+        if result.null_count:
+            return result.fillna(False)
+        return result
 
     def as_mask(self) -> Buffer:
         """Convert booleans to bitmask
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index d0c8a513686..fdf806d5a02 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -746,7 +746,7 @@ def isin(self, values, level=None):
                 )
             self_df = self.to_frame(index=False).reset_index()
             values_df = values_idx.to_frame(index=False)
-            idx = self_df.merge(values_df)._data["index"]
+            idx = self_df.merge(values_df, how="leftsemi")._data["index"]
             res = cudf.core.column.full(size=len(self), fill_value=False)
             res[idx] = True
             result = res.values

From 884f9ceed35dc865daf7f320a4ee2d818a6eba85 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 23 Nov 2023 10:48:37 +0000
Subject: [PATCH 2/2] Fix null handling

The result returned from libcudf is masked by the null mask of the
needles. If the result has any nulls, we must replace them with a
boolean indicating whether the haystack contains nulls, to match the
semantics we need for isin.
---
 python/cudf/cudf/core/column/column.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index cd537dbe9d3..e8c381cfa7b 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -917,9 +917,19 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
         to determine what values of `rhs` exist in `self`.
         """
         # We've already matched dtypes by now
+        # self.isin(other) asks "which values of self are in other"
+        # contains(haystack, needles) asks "which needles are in haystack"
+        # hence this argument ordering.
         result = libcudf.search.contains(rhs, self)
-        if result.null_count:
-            return result.fillna(False)
+        if self.null_count > 0:
+            # If one of the needles is null, then the result contains
+            # nulls, these nulls should be replaced by whether or not the
+            # haystack contains a null.
+            # TODO: this is unnecessary if we resolve
+            # https://github.com/rapidsai/cudf/issues/14515 by
+            # providing a mode in which cudf::contains does not mask
+            # the result.
+            result = result.fillna(rhs.null_count > 0, dtype=bool)
         return result
 
     def as_mask(self) -> Buffer: