Use Lists of Columns for Various Files (NVIDIA#10463)

This PR covers many low hanging fruits for rapidsai/cudf#10153. All API accepting Frames now accepts a list of columns in the following files: - hash.pyx - interop.pyx - join.pyx - partitioning.pyx - quantiles.pyx - reshape.pyx - search.pyx - transform.pyx - lists.pyx - string/combine.pyx This PR covers point 5 in the follow-ups to rapidsai/cudf#9889. Also, in `join.pyx`, gil was not released when dispatching workload to libcudf. This PR fixes that. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: rapidsai/cudf#10463
tgravescs · Apr 19, 2022 · 9dc728a · 9dc728a
1 parent 17d49fa
commit 9dc728a
Show file tree

Hide file tree

Showing 22 changed files with 299 additions and 371 deletions.
diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
@@ -14,16 +14,14 @@ from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id
 from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 
-def hash_partition(source_table, object columns_to_hash,
-                   int num_partitions, bool keep_index=True):
+def hash_partition(list source_columns, object columns_to_hash,
+                   int num_partitions):
     cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash
     cdef int c_num_partitions = num_partitions
-    cdef table_view c_source_view = table_view_from_table(
-        source_table, not keep_index
-    )
+    cdef table_view c_source_view = table_view_from_columns(source_columns)
 
     cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result
     with nogil:
@@ -36,27 +34,17 @@ def hash_partition(source_table, object columns_to_hash,
         )
 
     # Note that the offsets (`c_result.second`) may be empty when
-    # the original table (`source_table`) is empty. We need to
+    # the original table (`source_columns`) is empty. We need to
     # return a list of zeros in this case.
     return (
-        *data_from_unique_ptr(
-            move(c_result.first),
-            column_names=source_table._column_names,
-            index_names=(
-                source_table._index_names
-                if keep_index is True
-                else None
-            )
-
-        ),
-        list(c_result.second) if c_result.second.size()
-        else [0] * num_partitions
+        columns_from_unique_ptr(move(c_result.first)),
+        list(c_result.second)
+        if c_result.second.size() else [0] * num_partitions
     )
 
 
-def hash(source_table, str method, int seed=0):
-    cdef table_view c_source_view = table_view_from_table(
-        source_table, ignore_index=True)
+def hash(list source_columns, str method, int seed=0):
+    cdef table_view c_source_view = table_view_from_columns(source_columns)
     cdef unique_ptr[column] c_result
     cdef cpp_hash_id c_hash_function
     if method == "murmur3":

diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 import cudf
 
@@ -20,12 +20,12 @@ from cudf._lib.cpp.interop cimport (
 )
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 
 def from_dlpack(dlpack_capsule):
     """
-    Converts a DLPack Tensor PyCapsule into a cudf Frame object.
+    Converts a DLPack Tensor PyCapsule into a list of columns.
 
     DLPack Tensor PyCapsule is expected to have the name "dltensor".
     """
@@ -40,31 +40,25 @@ def from_dlpack(dlpack_capsule):
             cpp_from_dlpack(dlpack_tensor)
         )
 
-    res = data_from_unique_ptr(
-        move(c_result),
-        column_names=range(0, c_result.get()[0].num_columns())
-    )
+    res = columns_from_unique_ptr(move(c_result))
     dlpack_tensor.deleter(dlpack_tensor)
     return res
 
 
-def to_dlpack(source_table):
+def to_dlpack(list source_columns):
     """
-    Converts a cudf Frame into a DLPack Tensor PyCapsule.
+    Converts a list of columns into a DLPack Tensor PyCapsule.
 
     DLPack Tensor PyCapsule will have the name "dltensor".
     """
-    for column in source_table._columns:
-        if column.null_count:
-            raise ValueError(
-                "Cannot create a DLPack tensor with null values. \
-                    Input is required to have null count as zero."
-            )
+    if any(column.null_count for column in source_columns):
+        raise ValueError(
+            "Cannot create a DLPack tensor with null values. \
+                Input is required to have null count as zero."
+        )
 
     cdef DLManagedTensor *dlpack_tensor
-    cdef table_view source_table_view = table_view_from_table(
-        source_table, ignore_index=True
-    )
+    cdef table_view source_table_view = table_view_from_columns(source_columns)
 
     with nogil:
         dlpack_tensor = cpp_to_dlpack(
@@ -110,27 +104,22 @@ cdef vector[column_metadata] gather_metadata(object metadata) except *:
         raise ValueError("Malformed metadata has been encountered")
 
 
-def to_arrow(input_table,
-             object metadata,
-             bool keep_index=True):
-    """Convert from cudf Frame to PyArrow Table.
+def to_arrow(list source_columns, object metadata):
+    """Convert a list of columns from
+    cudf Frame to a PyArrow Table.
 
     Parameters
     ----------
-    input_table : cudf table
-    column_names : names for the pyarrow arrays
-    field_names : field names for nested type arrays
-    keep_index : whether index needs to be part of arrow table
+    source_columns : a list of columns to convert
+    metadata : a list of metadata, see `gather_metadata` for layout
 
     Returns
     -------
     pyarrow table
     """
 
     cdef vector[column_metadata] cpp_metadata = gather_metadata(metadata)
-    cdef table_view input_table_view = (
-        table_view_from_table(input_table, not keep_index)
-    )
+    cdef table_view input_table_view = table_view_from_columns(source_columns)
 
     cdef shared_ptr[CTable] cpp_arrow_table
     with nogil:
@@ -141,22 +130,16 @@ def to_arrow(input_table,
     return pyarrow_wrap_table(cpp_arrow_table)
 
 
-def from_arrow(
-    object input_table,
-    object column_names=None,
-    object index_names=None
-):
-    """Convert from PyArrow Table to cudf Frame.
+def from_arrow(object input_table):
+    """Convert from PyArrow Table to a list of columns.
 
     Parameters
     ----------
     input_table : PyArrow table
-    column_names : names for the cudf table data columns
-    index_names : names for the cudf table index columns
 
     Returns
     -------
-    cudf Frame
+    A list of columns to construct Frame object
     """
     cdef shared_ptr[CTable] cpp_arrow_table = (
         pyarrow_unwrap_table(input_table)
@@ -166,8 +149,4 @@ def from_arrow(
     with nogil:
         c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0]))
 
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=column_names,
-        index_names=index_names
-    )
+    return columns_from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from itertools import chain
 
@@ -16,31 +16,25 @@ from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport data_type, size_type, type_id
-from cudf._lib.utils cimport table_view_from_table
+from cudf._lib.utils cimport table_view_from_columns
 
 # The functions below return the *gathermaps* that represent
 # the join result when joining on the keys `lhs` and `rhs`.
 
-cpdef join(lhs, rhs, how=None):
+cpdef join(list lhs, list rhs, how=None):
     cdef pair[cpp_join.gather_map_type, cpp_join.gather_map_type] c_result
-    cdef table_view c_lhs = table_view_from_table(lhs)
-    cdef table_view c_rhs = table_view_from_table(rhs)
+    cdef table_view c_lhs = table_view_from_columns(lhs)
+    cdef table_view c_rhs = table_view_from_columns(rhs)
 
     if how == "inner":
-        c_result = move(cpp_join.inner_join(
-            c_lhs,
-            c_rhs
-        ))
+        with nogil:
+            c_result = move(cpp_join.inner_join(c_lhs, c_rhs))
     elif how == "left":
-        c_result = move(cpp_join.left_join(
-            c_lhs,
-            c_rhs
-        ))
+        with nogil:
+            c_result = move(cpp_join.left_join(c_lhs, c_rhs))
     elif how == "outer":
-        c_result = move(cpp_join.full_join(
-            c_lhs,
-            c_rhs
-        ))
+        with nogil:
+            c_result = move(cpp_join.full_join(c_lhs, c_rhs))
     else:
         raise ValueError(f"Invalid join type {how}")
 
@@ -49,30 +43,23 @@ cpdef join(lhs, rhs, how=None):
     return left_rows, right_rows
 
 
-cpdef semi_join(lhs, rhs, how=None):
+cpdef semi_join(list lhs, list rhs, how=None):
     # left-semi and left-anti joins
     cdef cpp_join.gather_map_type c_result
-    cdef table_view c_lhs = table_view_from_table(lhs)
-    cdef table_view c_rhs = table_view_from_table(rhs)
+    cdef table_view c_lhs = table_view_from_columns(lhs)
+    cdef table_view c_rhs = table_view_from_columns(rhs)
 
     if how == "leftsemi":
-        c_result = move(cpp_join.left_semi_join(
-            c_lhs,
-            c_rhs
-        ))
+        with nogil:
+            c_result = move(cpp_join.left_semi_join(c_lhs, c_rhs))
     elif how == "leftanti":
-        c_result = move(cpp_join.left_anti_join(
-            c_lhs,
-            c_rhs
-        ))
+        with nogil:
+            c_result = move(cpp_join.left_anti_join(c_lhs, c_rhs))
     else:
         raise ValueError(f"Invalid join type {how}")
 
     cdef Column left_rows = _gather_map_as_column(move(c_result))
-    return (
-        left_rows,
-        None
-    )
+    return left_rows, None
 
 
 cdef Column _gather_map_as_column(cpp_join.gather_map_type gather_map):

diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
@@ -42,7 +42,7 @@ from cudf.core.dtypes import ListDtype
 
 from cudf._lib.cpp.lists.contains cimport contains, index_of as cpp_index_of
 from cudf._lib.cpp.lists.extract cimport extract_list_element
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 
 def count_elements(Column col):
@@ -61,20 +61,18 @@ def count_elements(Column col):
     return result
 
 
-def explode_outer(tbl, int explode_column_idx, bool ignore_index=False):
-    cdef table_view c_table_view = table_view_from_table(tbl, ignore_index)
+def explode_outer(
+    list source_columns, int explode_column_idx
+):
+    cdef table_view c_table_view = table_view_from_columns(source_columns)
     cdef size_type c_explode_column_idx = explode_column_idx
 
     cdef unique_ptr[table] c_result
 
     with nogil:
         c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx))
 
-    return data_from_unique_ptr(
-        move(c_result),
-        column_names=tbl._column_names,
-        index_names=None if ignore_index else tbl._index_names
-    )
+    return columns_from_unique_ptr(move(c_result))
 
 
 def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal):
@@ -197,18 +195,17 @@ def index_of(Column col, object py_search_key):
     return Column.from_unique_ptr(move(c_result))
 
 
-def concatenate_rows(tbl):
+def concatenate_rows(list source_columns):
     cdef unique_ptr[column] c_result
 
-    cdef table_view c_table_view = table_view_from_table(tbl)
+    cdef table_view c_table_view = table_view_from_columns(source_columns)
 
     with nogil:
         c_result = move(cpp_concatenate_rows(
             c_table_view,
         ))
 
-    result = Column.from_unique_ptr(move(c_result))
-    return result
+    return Column.from_unique_ptr(move(c_result))
 
 
 def concatenate_list_elements(Column input_column, dropna=False):

diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -11,21 +11,19 @@ from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.partitioning cimport partition as cpp_partition
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count
 
 cimport cudf._lib.cpp.types as libcudf_types
 
 
-def partition(source_table, Column partition_map,
-              object num_partitions, bool keep_index=True):
+def partition(list source_columns, Column partition_map,
+              object num_partitions):
 
     if num_partitions is None:
         num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True)
     cdef int c_num_partitions = num_partitions
-    cdef table_view c_source_view = table_view_from_table(
-        source_table, not keep_index
-    )
+    cdef table_view c_source_view = table_view_from_columns(source_columns)
 
     cdef column_view c_partition_map_view = partition_map.view()
 
@@ -40,13 +38,5 @@ def partition(source_table, Column partition_map,
         )
 
     return (
-        *data_from_unique_ptr(
-            move(c_result.first),
-            column_names=source_table._column_names,
-            index_names=source_table._index_names if(
-                keep_index is True)
-            else None
-
-        ),
-        list(c_result.second)
+        columns_from_unique_ptr(move(c_result.first)), list(c_result.second)
     )