Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement joins in pylibcudf #14972

Merged
merged 4 commits into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ This page provides API documentation for pylibcudf.
copying
gpumemoryview
groupby
join
scalar
table
types
Expand Down
6 changes: 6 additions & 0 deletions docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
====
join
====

.. automodule:: cudf._lib.pylibcudf.join
:members:
9 changes: 5 additions & 4 deletions python/cudf/cudf/_lib/cpp/join.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
Expand All @@ -13,19 +13,20 @@ from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type

ctypedef unique_ptr[device_uvector[size_type]] gather_map_type
ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type

cdef extern from "cudf/join.hpp" namespace "cudf" nogil:
cdef pair[gather_map_type, gather_map_type] inner_join(
cdef gather_map_pair_type inner_join(
const table_view left_keys,
const table_view right_keys,
) except +

cdef pair[gather_map_type, gather_map_type] left_join(
cdef gather_map_pair_type left_join(
const table_view left_keys,
const table_view right_keys,
) except +

cdef pair[gather_map_type, gather_map_type] full_join(
cdef gather_map_pair_type full_join(
const table_view left_keys,
const table_view right_keys,
) except +
Expand Down
76 changes: 22 additions & 54 deletions python/cudf/cudf/_lib/join.pyx
Original file line number Diff line number Diff line change
@@ -1,73 +1,41 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport make_unique, unique_ptr
from libcpp.pair cimport pair
from libcpp.utility cimport move

from rmm._lib.device_buffer cimport device_buffer

cimport cudf._lib.cpp.join as cpp_join
from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport data_type, size_type, type_id
from cudf._lib.utils cimport table_view_from_columns

from cudf._lib import pylibcudf

# The functions below return the *gathermaps* that represent
# the join result when joining on the keys `lhs` and `rhs`.


@acquire_spill_lock()
def join(list lhs, list rhs, how=None):
cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result
cdef table_view c_lhs = table_view_from_columns(lhs)
cdef table_view c_rhs = table_view_from_columns(rhs)

if how == "inner":
with nogil:
c_result = move(cpp_join.inner_join(c_lhs, c_rhs))
elif how == "left":
with nogil:
c_result = move(cpp_join.left_join(c_lhs, c_rhs))
elif how == "outer":
with nogil:
c_result = move(cpp_join.full_join(c_lhs, c_rhs))
else:
if how == "outer":
how = "full"
if (join_func := getattr(pylibcudf.join, f"{how}_join", None)) is None:
raise ValueError(f"Invalid join type {how}")

cdef Column left_rows = _gather_map_as_column(move(c_result.first))
cdef Column right_rows = _gather_map_as_column(move(c_result.second))
return left_rows, right_rows
left_rows, right_rows = join_func(
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in lhs]),
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in rhs]),
)
return Column.from_pylibcudf(left_rows), Column.from_pylibcudf(right_rows)


@acquire_spill_lock()
def semi_join(list lhs, list rhs, how=None):
# left-semi and left-anti joins
cdef cpp_join.gather_map_type c_result
cdef table_view c_lhs = table_view_from_columns(lhs)
cdef table_view c_rhs = table_view_from_columns(rhs)

if how == "leftsemi":
with nogil:
c_result = move(cpp_join.left_semi_join(c_lhs, c_rhs))
elif how == "leftanti":
with nogil:
c_result = move(cpp_join.left_anti_join(c_lhs, c_rhs))
else:
if (
join_func := getattr(
pylibcudf.join, f"{how.replace('left', 'left_')}_join", None
)
) is None:
raise ValueError(f"Invalid join type {how}")

cdef Column left_rows = _gather_map_as_column(move(c_result))
return left_rows, None


cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map):
# help to convert a gather map to a Column
cdef device_buffer c_empty
cdef size_type size = gather_map.get()[0].size()
cdef unique_ptr[column] c_col = move(make_unique[column](
data_type(type_id.INT32),
size,
gather_map.get()[0].release(), move(c_empty), 0))
return Column.from_unique_ptr(move(c_col))
return Column.from_pylibcudf(
join_func(
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in lhs]),
pylibcudf.Table([c.to_pylibcudf(mode="read") for c in rhs]),
)
), None
5 changes: 3 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
# the License.
# =============================================================================

set(cython_sources aggregation.pyx binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx
groupby.pyx interop.pyx scalar.pyx table.pyx types.pyx unary.pyx utils.pyx
set(cython_sources
aggregation.pyx binaryop.pyx column.pyx copying.pyx gpumemoryview.pyx groupby.pyx interop.pyx
join.pyx scalar.pyx table.pyx types.pyx unary.pyx utils.pyx
)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

# TODO: Verify consistent usage of relative/absolute imports in pylibcudf.
from . cimport aggregation, binaryop, copying, groupby, interop, unary
from . cimport aggregation, binaryop, copying, groupby, interop, join, unary
from .column cimport Column
from .gpumemoryview cimport gpumemoryview
from .scalar cimport Scalar
Expand All @@ -21,6 +21,7 @@ __all__ = [
"gpumemoryview",
"groupby",
"interop",
"join",
"unary",
"types",
]
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.

from . import aggregation, binaryop, copying, groupby, interop, unary
from . import aggregation, binaryop, copying, groupby, interop, join, unary
from .column import Column
from .gpumemoryview import gpumemoryview
from .scalar import Scalar
Expand All @@ -19,6 +19,7 @@
"gpumemoryview",
"groupby",
"interop",
"join",
"unary",
"types",
]
3 changes: 3 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ cdef class GroupBy:
c_requests.push_back(move(request._to_libcudf_agg_request()))

cdef pair[unique_ptr[table], vector[aggregation_result]] c_res
# TODO: Need to capture C++ exceptions indicating that an invalid type was used.
# We rely on libcudf to tell us this rather than checking the types beforehand
# ourselves.
with nogil:
c_res = move(dereference(self.c_obj).aggregate(c_requests))
return GroupBy._parse_outputs(move(c_res))
Expand Down
15 changes: 15 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/join.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from .column cimport Column
from .table cimport Table


# Joins whose result is a tuple of two Columns: the row indices from the left
# and right tables after the join.
cpdef tuple inner_join(Table left_keys, Table right_keys)

cpdef tuple left_join(Table left_keys, Table right_keys)

cpdef tuple full_join(Table left_keys, Table right_keys)

# Joins whose result is a single Column: the row indices from the left table
# after the join.
cpdef Column left_semi_join(Table left_keys, Table right_keys)

cpdef Column left_anti_join(Table left_keys, Table right_keys)
159 changes: 159 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/join.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from cython.operator import dereference

from libcpp.memory cimport make_unique
from libcpp.utility cimport move

from rmm._lib.device_buffer cimport device_buffer

from cudf._lib.cpp cimport join as cpp_join
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.types cimport data_type, size_type, type_id

from .column cimport Column
from .table cimport Table


cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map):
    # Turn a libcudf gather map into a pylibcudf Column of INT32 values.
    # The size is read before release() hands the map's storage over to the
    # newly constructed column.
    cdef size_type num_rows = gather_map.get()[0].size()
    # Default-constructed buffer passed as the fourth constructor argument,
    # followed by a literal 0 (presumably the null mask and null count --
    # confirm against the cudf::column constructor).
    cdef device_buffer c_no_mask
    return Column.from_libcudf(
        move(
            make_unique[column](
                data_type(type_id.INT32),
                num_rows,
                gather_map.get()[0].release(),
                move(c_no_mask),
                0
            )
        )
    )


cpdef tuple inner_join(Table left_keys, Table right_keys):
    """Compute the gather maps for an inner join of two tables.

    For details, see :cpp:func:`inner_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Tuple[Column, Column]
        The row indices from the left table and from the right table, in
        that order, after the join.
    """
    cdef cpp_join.gather_map_pair_type c_maps
    # The libcudf call runs with the GIL released.
    with nogil:
        c_maps = cpp_join.inner_join(left_keys.view(), right_keys.view())
    cdef Column left_rows = _column_from_gather_map(move(c_maps.first))
    cdef Column right_rows = _column_from_gather_map(move(c_maps.second))
    return left_rows, right_rows


cpdef tuple left_join(Table left_keys, Table right_keys):
    """Compute the gather maps for a left join of two tables.

    For details, see :cpp:func:`left_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Tuple[Column, Column]
        The row indices from the left table and from the right table, in
        that order, after the join.
    """
    cdef cpp_join.gather_map_pair_type c_maps
    # The libcudf call runs with the GIL released.
    with nogil:
        c_maps = cpp_join.left_join(left_keys.view(), right_keys.view())
    cdef Column left_rows = _column_from_gather_map(move(c_maps.first))
    cdef Column right_rows = _column_from_gather_map(move(c_maps.second))
    return left_rows, right_rows


cpdef tuple full_join(Table left_keys, Table right_keys):
    """Compute the gather maps for a full join of two tables.

    For details, see :cpp:func:`full_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Tuple[Column, Column]
        The row indices from the left table and from the right table, in
        that order, after the join.
    """
    cdef cpp_join.gather_map_pair_type c_maps
    # The libcudf call runs with the GIL released.
    with nogil:
        c_maps = cpp_join.full_join(left_keys.view(), right_keys.view())
    cdef Column left_rows = _column_from_gather_map(move(c_maps.first))
    cdef Column right_rows = _column_from_gather_map(move(c_maps.second))
    return left_rows, right_rows


cpdef Column left_semi_join(Table left_keys, Table right_keys):
    """Compute the gather map for a left semi join of two tables.

    For details, see :cpp:func:`left_semi_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Column
        The row indices from the left table after the join.
    """
    cdef cpp_join.gather_map_type c_map
    # The libcudf call runs with the GIL released.
    with nogil:
        c_map = cpp_join.left_semi_join(left_keys.view(), right_keys.view())
    cdef Column left_rows = _column_from_gather_map(move(c_map))
    return left_rows


cpdef Column left_anti_join(Table left_keys, Table right_keys):
    """Compute the gather map for a left anti join of two tables.

    For details, see :cpp:func:`left_anti_join`.

    Parameters
    ----------
    left_keys : Table
        The left table to join.
    right_keys : Table
        The right table to join.

    Returns
    -------
    Column
        The row indices from the left table after the join.
    """
    cdef cpp_join.gather_map_type c_map
    # The libcudf call runs with the GIL released.
    with nogil:
        c_map = cpp_join.left_anti_join(left_keys.view(), right_keys.view())
    cdef Column left_rows = _column_from_gather_map(move(c_map))
    return left_rows
Loading