From d39dcfb987de6f06267bb10fe861641498aa8dab Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Apr 2022 12:17:25 -0700 Subject: [PATCH 1/7] Update `merge_sorted` cython interface --- python/cudf/cudf/_lib/merge.pyx | 84 ++++++++------------------------- 1 file changed, 20 insertions(+), 64 deletions(-) diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx index 915b46c5691..9bc9085878c 100644 --- a/python/cudf/cudf/_lib/merge.pyx +++ b/python/cudf/cudf/_lib/merge.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -10,79 +10,39 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.merge cimport merge as cpp_merge from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def merge_sorted( - object tables, - object keys=None, - bool by_index=False, - bool ignore_index=False, + list list_of_columns, + list key_columns_idx, bool ascending=True, - object na_position="last", + str na_position="last", ): - cdef vector[libcudf_types.size_type] c_column_keys + cdef vector[libcudf_types.size_type] c_column_keys = key_columns_idx cdef vector[table_view] c_input_tables cdef vector[libcudf_types.order] c_column_order cdef vector[libcudf_types.null_order] c_null_precedence - cdef libcudf_types.order column_order - cdef libcudf_types.null_order null_precedence - cdef source_table - # Create vector of tables - # Use metadata from 0th table for names, etc - c_input_tables.reserve(len(tables)) - for source_table in tables: + c_input_tables.reserve(len(list_of_columns)) + for source_columns in list_of_columns: c_input_tables.push_back( - table_view_from_table(source_table, ignore_index)) - source_table = tables[0] + table_view_from_columns(source_columns)) - # Define sorting order and null precedence - column_order = (libcudf_types.order.ASCENDING - if ascending - else libcudf_types.order.DESCENDING) + num_keys = len(key_columns_idx) - if ascending is False: - if na_position == "last": - na_position = "first" - else: - na_position = "last" - null_precedence = ( + cdef libcudf_types.order column_order = ( + libcudf_types.order.ASCENDING if ascending + else libcudf_types.order.DESCENDING + ) + c_column_order = vector[libcudf_types.order](num_keys, column_order) + + if not ascending: + na_position = "last" if na_position == "first" else "first" + cdef libcudf_types.null_order null_precedence = ( libcudf_types.null_order.BEFORE if na_position == "first" else libcudf_types.null_order.AFTER ) - - # Determine index-column offset and index names - if ignore_index: - num_index_columns = 0 - index_names = None - else: - num_index_columns = ( - 0 if source_table._index is None - else source_table._index._num_columns - ) - index_names = source_table._index_names - - # Define C vectors for each key column - if not by_index and keys is not None: - num_keys = len(keys) - c_column_keys.reserve(num_keys) - for name in keys: - c_column_keys.push_back( - num_index_columns + source_table._column_names.index(name) - ) - else: - if by_index: - start = 0 - stop = num_index_columns - else: - start = num_index_columns - stop = num_index_columns + source_table._num_columns - num_keys = stop - start - c_column_keys.reserve(num_keys) - for key in range(start, stop): - c_column_keys.push_back(key) - c_column_order = vector[libcudf_types.order](num_keys, column_order) c_null_precedence = vector[libcudf_types.null_order]( num_keys, null_precedence @@ -100,8 +60,4 @@ def merge_sorted( ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=index_names, - ) + return columns_from_unique_ptr(move(c_result)) From a69f08ca815961b6a4651675433a47f8c5581032 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Apr 2022 13:21:55 -0700 Subject: [PATCH 2/7] Rename argument --- python/cudf/cudf/_lib/merge.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx index 9bc9085878c..3bba3f3eec6 100644 --- a/python/cudf/cudf/_lib/merge.pyx +++ b/python/cudf/cudf/_lib/merge.pyx @@ -15,11 +15,11 @@ from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def merge_sorted( list list_of_columns, - list key_columns_idx, + list key_columns_indices, bool ascending=True, str na_position="last", ): - cdef vector[libcudf_types.size_type] c_column_keys = key_columns_idx + cdef vector[libcudf_types.size_type] c_column_keys = key_columns_indices cdef vector[table_view] c_input_tables cdef vector[libcudf_types.order] c_column_order cdef vector[libcudf_types.null_order] c_null_precedence @@ -29,7 +29,7 @@ def merge_sorted( c_input_tables.push_back( table_view_from_columns(source_columns)) - num_keys = len(key_columns_idx) + num_keys = len(key_columns_indices) cdef libcudf_types.order column_order = ( libcudf_types.order.ASCENDING if ascending From 69132206d44468c9dc3085da9e85ec93111899d7 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 20 Apr 2022 13:26:17 -0700 Subject: [PATCH 3/7] Updates `merge_sorted` python API. --- python/cudf/cudf/core/reshape.py | 42 +++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index f58c93aa0dc..54609cfa5b6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -772,10 +772,10 @@ def merge_sorted( Parameters ---------- - objs : list of DataFrame, Series, or Index + objs : list of DataFrame or Series keys : list, default None List of Column names to sort by. If None, all columns used - (Ignored if `index=True`) + (Ignored if `by_index=True`) by_index : bool, default False Use index for sorting. `keys` input will be ignored if True ignore_index : bool, default False @@ -806,18 +806,38 @@ def merge_sorted( if by_index and ignore_index: raise ValueError("`by_index` and `ignore_index` cannot both be True") - result = objs[0].__class__._from_data( - *cudf._lib.merge.merge_sorted( - objs, - keys=keys, - by_index=by_index, - ignore_index=ignore_index, + if by_index: + key_columns_indices = list(range(0, objs[0]._index.nlevels)) + else: + if keys is None: + key_columns_indices = list(range(0, objs[0]._num_columns)) + else: + key_columns_indices = [ + objs[0]._column_names.index(key) for key in keys + ] + if not ignore_index: + key_columns_indices = [ + idx + objs[0]._index.nlevels for idx in key_columns_indices + ] + + columns = [ + [ + *(obj._index._data.columns if not ignore_index else ()), + *obj._columns, + ] + for obj in objs + ] + + return objs[0]._from_columns_like_self( + cudf._lib.merge.merge_sorted( + list_of_columns=columns, + key_columns_indices=key_columns_indices, ascending=ascending, na_position=na_position, - ) + ), + column_names=objs[0]._column_names, + index_names=None if ignore_index else objs[0]._index_names, ) - result._copy_type_metadata(objs[0]) - return result def _pivot(df, index, columns): From 277a2c202863ed5f74c99e3ca902ddb2b6b0b203 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Apr 2022 13:09:35 -0700 Subject: [PATCH 4/7] Update python/cudf/cudf/_lib/merge.pyx Co-authored-by: Ashwin Srinath <3190405+shwina@users.noreply.github.com> --- python/cudf/cudf/_lib/merge.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx index 3bba3f3eec6..093ea3d2c03 100644 --- a/python/cudf/cudf/_lib/merge.pyx +++ b/python/cudf/cudf/_lib/merge.pyx @@ -14,7 +14,7 @@ from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns def merge_sorted( - list list_of_columns, + list input_columns, list key_columns_indices, bool ascending=True, str na_position="last", From d3f5678d2efa24502204c58da384b1705e383d47 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Apr 2022 13:39:58 -0700 Subject: [PATCH 5/7] Add docstrings and fix review commit --- python/cudf/cudf/_lib/merge.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx index 093ea3d2c03..0552fcbce0f 100644 --- a/python/cudf/cudf/_lib/merge.pyx +++ b/python/cudf/cudf/_lib/merge.pyx @@ -19,13 +19,17 @@ def merge_sorted( bool ascending=True, str na_position="last", ): + """Merge multiple lists of lexicographically sorted columns into one list. + + `input_columns` is a list of lists of columns to be merged. + """ cdef vector[libcudf_types.size_type] c_column_keys = key_columns_indices cdef vector[table_view] c_input_tables cdef vector[libcudf_types.order] c_column_order cdef vector[libcudf_types.null_order] c_null_precedence - c_input_tables.reserve(len(list_of_columns)) - for source_columns in list_of_columns: + c_input_tables.reserve(len(input_columns)) + for source_columns in input_columns: c_input_tables.push_back( table_view_from_columns(source_columns)) From 62e8eed6dc29565c431f086ed6563ff94f8c18f0 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Apr 2022 13:41:02 -0700 Subject: [PATCH 6/7] Update docstring in merge --- python/cudf/cudf/_lib/merge.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx index 0552fcbce0f..dae2c466266 100644 --- a/python/cudf/cudf/_lib/merge.pyx +++ b/python/cudf/cudf/_lib/merge.pyx @@ -19,9 +19,9 @@ def merge_sorted( bool ascending=True, str na_position="last", ): - """Merge multiple lists of lexicographically sorted columns into one list. - - `input_columns` is a list of lists of columns to be merged. + """Merge multiple lists of lexicographically sorted columns into one list + of sorted columns. `input_columns` is a list of lists of columns to be + merged. """ cdef vector[libcudf_types.size_type] c_column_keys = key_columns_indices cdef vector[table_view] c_input_tables From d8a61fc211340f0d9f7d0de9748b945fb89189d4 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 21 Apr 2022 18:14:06 -0700 Subject: [PATCH 7/7] Fix call-site --- python/cudf/cudf/core/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 54609cfa5b6..5977b63777f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -830,7 +830,7 @@ def merge_sorted( return objs[0]._from_columns_like_self( cudf._lib.merge.merge_sorted( - list_of_columns=columns, + input_columns=columns, key_columns_indices=key_columns_indices, ascending=ascending, na_position=na_position,