From ac35d1950a8e34866706db10e627ef1a5a471e5d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 30 Nov 2023 17:00:21 +0000 Subject: [PATCH] Improve memory footprint of isin by using contains (#14478) Previously, isin was implemented using an inner join between the column we are searching (the haystack) and the values we are searching for (the needles). This had a large memory footprint when there were repeated needles (since that blows up the cardinality of the merge). To fix this, note that we don't need to do a merge at all, since libcudf provides a primitive (contains) to search for many needles in a haystack. The only thing we must bear in mind is that left.isin(right) is asking for the locations in left that match an entry in right, whereas contains(haystack, needles) provides a bool mask that selects needles that are in the haystack. To get the behaviour we want, we therefore need to do contains(right, left) and treat the values to search for as the haystack. As well as having a much better memory footprint, this hash-based search is significantly faster than the previous merge-based one. While we are here, lower the memory footprint of MultiIndex.isin by using a left-semi join (the implementation is separate from the isin implementation on columns and looks a little more complicated to unpick). 
- Closes #14298 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14478 --- python/cudf/cudf/core/column/column.py | 22 +++++++++++++++------- python/cudf/cudf/core/multiindex.py | 2 +- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a76f4d7383c..ba83e3985c5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -919,13 +919,21 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: Helper function for `isin` which merges `self` & `rhs` to determine what values of `rhs` exist in `self`. """ - ldf = cudf.DataFrame({"x": self, "orig_order": arange(len(self))}) - rdf = cudf.DataFrame( - {"x": rhs, "bool": full(len(rhs), True, dtype="bool")} - ) - res = ldf.merge(rdf, on="x", how="left").sort_values(by="orig_order") - res = res.drop_duplicates(subset="orig_order", ignore_index=True) - return res._data["bool"].fillna(False) + # We've already matched dtypes by now + # self.isin(other) asks "which values of self are in other" + # contains(haystack, needles) asks "which needles are in haystack" + # hence this argument ordering. + result = libcudf.search.contains(rhs, self) + if self.null_count > 0: + # If one of the needles is null, then the result contains + # nulls, these nulls should be replaced by whether or not the + # haystack contains a null. + # TODO: this is unnecessary if we resolve + # https://github.com/rapidsai/cudf/issues/14515 by + # providing a mode in which cudf::contains does not mask + # the result. 
+ result = result.fillna(rhs.null_count > 0, dtype=bool) + return result def as_mask(self) -> Buffer: """Convert booleans to bitmask diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 2f5066cc94b..4f98a878792 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -747,7 +747,7 @@ def isin(self, values, level=None): ) self_df = self.to_frame(index=False).reset_index() values_df = values_idx.to_frame(index=False) - idx = self_df.merge(values_df)._data["index"] + idx = self_df.merge(values_df, how="leftsemi")._data["index"] res = cudf.core.column.full(size=len(self), fill_value=False) res[idx] = True result = res.values