Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement cudf.MultiIndex.from_arrays #14740

Merged
merged 5 commits into from
Jan 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/cudf/source/user_guide/api_docs/index_objects.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ MultiIndex constructors
.. autosummary::
:toctree: api/

MultiIndex.from_arrays
MultiIndex.from_tuples
MultiIndex.from_product
MultiIndex.from_frame
Expand Down
63 changes: 63 additions & 0 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from cudf.core._compat import PANDAS_GE_150
from cudf.core.frame import Frame
from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index
from cudf.utils.dtypes import is_column_like
from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name

Expand Down Expand Up @@ -1226,6 +1227,7 @@ def from_tuples(cls, tuples, names=None):

See Also
--------
MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
MultiIndex.from_product : Make a MultiIndex from cartesian product
of iterables.
MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
Expand Down Expand Up @@ -1335,6 +1337,7 @@ def from_frame(cls, df, names=None):

See Also
--------
MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
MultiIndex.from_product : Make a MultiIndex from cartesian product
of iterables.
Expand Down Expand Up @@ -1429,6 +1432,66 @@ def from_product(cls, arrays, names=None):
pdi = pd.MultiIndex.from_product(arrays, names=names)
return cls.from_pandas(pdi)

@classmethod
@_cudf_nvtx_annotate
def from_arrays(
cls,
arrays,
sortorder=None,
names=None,
) -> MultiIndex:
"""
Convert arrays to MultiIndex.

Parameters
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing sortorder docstring in parameters.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. Added

----------
arrays : list / sequence of array-likes
Each array-like gives one level's value for each data point.
len(arrays) is the number of levels.
sortorder : int, optional
Not yet supported.
names : list / sequence of str, optional
Names for the levels in the index.

Returns
-------
MultiIndex

See Also
--------
MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
MultiIndex.from_product : Make a MultiIndex from cartesian product
of iterables.
MultiIndex.from_frame : Make a MultiIndex from a DataFrame.

Examples
--------
>>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
>>> cudf.MultiIndex.from_arrays(arrays, names=('number', 'color'))
MultiIndex([(1, 'red'),
(1, 'blue'),
(2, 'red'),
(2, 'blue')],
names=['number', 'color'])
"""
# Imported here due to circular import
from cudf.core.algorithms import factorize

error_msg = "Input must be a list / sequence of array-likes."
if not is_list_like(arrays):
raise TypeError(error_msg)
codes = []
levels = []
for array in arrays:
if not (is_list_like(array) or is_column_like(array)):
raise TypeError(error_msg)
code, level = factorize(array, sort=True)
codes.append(code)
levels.append(level)
Comment on lines +1485 to +1490
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a functional exercise, this can be rewritten as a map statement:

code_levels = list(map(functools.partial(factorize, sort=True), arrays))
codes, levels = [x[0] for x in code_levels], [x[1] for x in code_levels]

Error checking is also functional:

if not all(is_list_like(arr) or is_column_like(arr) for arr in arrays):
    raise TypeError(error_msg)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, my idea was just to do the validation + factorization in the same loop

return cls(
codes=codes, levels=levels, sortorder=sortorder, names=names
)

@_cudf_nvtx_annotate
def _poplevels(self, level):
"""
Expand Down
38 changes: 30 additions & 8 deletions python/cudf/cudf/tests/test_multiindex.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.

"""
Test related to MultiIndex
Expand Down Expand Up @@ -2085,12 +2085,7 @@ def test_multiindex_eq_other_multiindex():
params=[
"from_product",
"from_tuples",
pytest.param(
"from_arrays",
marks=pytest.mark.xfail(
reason="TODO: from_arrays is not implemented"
),
),
"from_arrays",
"init",
]
)
Expand All @@ -2100,7 +2095,7 @@ def midx(request):
elif request.param == "from_tuples":
return cudf.MultiIndex.from_tuples([(0, 1), (0, 0), (1, 1), (1, 0)])
elif request.param == "from_arrays":
return cudf.MultiIndex.from_arrays([0, 0, 1, 1], [1, 0, 1, 0])
return cudf.MultiIndex.from_arrays([[0, 0, 1, 1], [1, 0, 1, 0]])
elif request.param == "init":
return cudf.MultiIndex(
levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [1, 0, 1, 0]]
Expand All @@ -2112,3 +2107,30 @@ def midx(request):
def test_multindex_constructor_levels_always_indexes(midx):
    """Each level of the constructed MultiIndex equals ``cudf.Index([0, 1])``."""
    # Check both level positions; a fresh expected Index is built per check.
    for position in (0, 1):
        assert_eq(midx.levels[position], cudf.Index([0, 1]))


@pytest.mark.parametrize(
    "array",
    [
        list,
        tuple,
        np.array,
        cp.array,
        pd.Index,
        cudf.Index,
        pd.Series,
        cudf.Series,
    ],
)
def test_multiindex_from_arrays(array):
    """Compare ``from_arrays`` against pandas for each container type."""
    level_values = [[0, 0, 1, 1], [1, 0, 1, 0]]
    # pandas, fed the plain nested lists, is the reference implementation.
    pandas_midx = pd.MultiIndex.from_arrays(level_values)
    # cudf receives the same values wrapped in the parametrized container.
    wrapped_values = list(map(array, level_values))
    cudf_midx = cudf.MultiIndex.from_arrays(wrapped_values)
    assert_eq(pandas_midx, cudf_midx)


@pytest.mark.parametrize("arg", ["foo", ["foo"]])
def test_multiindex_from_arrays_wrong_arg(arg):
    """``from_arrays`` raises TypeError for these two invalid inputs."""
    # Covers a bare string and a list whose only element is a bare string.
    constructor = cudf.MultiIndex.from_arrays
    with pytest.raises(TypeError):
        constructor(arg)