Skip to content

Commit

Permalink
Refactor ColumnMethods and its subclasses to remove column argument…
Browse files Browse the repository at this point in the history
… and require `parent` argument (#8306)

Prior to this PR, `ColumnMethods` took a `column` argument as well as an optional `parent` argument.

- When passed only a `column` argument, its methods return Columns. We have made use of this internally to do certain operations on Columns.

- When passed a `parent` argument, its methods return objects of the type of the `parent`. This enables us to use the same class to implement accessor methods for both Series and Index.

This PR changes `ColumnMethods` to accept only (and now require) a `parent` argument, simplifying the class and its usage. All internal uses of `ColumnMethods` have been replaced.


Part of the [Array refactor](#8273).

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - https://github.com/brandon-b-miller
  - Michael Wang (https://github.com/isVoid)

URL: #8306
  • Loading branch information
shwina authored Jul 15, 2021
1 parent 6ca7b58 commit 65a38af
Show file tree
Hide file tree
Showing 20 changed files with 668 additions and 618 deletions.
88 changes: 88 additions & 0 deletions python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix
from cudf._lib.nvtext.generate_ngrams import (
generate_character_ngrams,
generate_ngrams,
)
from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize
from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces
from cudf._lib.nvtext.replace import filter_tokens, replace_tokens
from cudf._lib.nvtext.stemmer import (
LetterType,
is_letter,
is_letter_multi,
porter_stemmer_measure,
)
from cudf._lib.nvtext.subword_tokenize import subword_tokenize_vocab_file
from cudf._lib.nvtext.tokenize import (
_count_tokens_column,
_count_tokens_scalar,
_tokenize_column,
_tokenize_scalar,
character_tokenize,
detokenize,
)
from cudf._lib.strings.attributes import (
code_points,
count_bytes,
count_characters,
)
from cudf._lib.strings.capitalize import capitalize, title
from cudf._lib.strings.case import swapcase, to_lower, to_upper
from cudf._lib.strings.char_types import (
filter_alphanum,
is_alnum,
is_alpha,
is_decimal,
is_digit,
is_lower,
is_numeric,
is_space,
is_upper,
)
from cudf._lib.strings.combine import (
concatenate,
join,
join_lists_with_column,
join_lists_with_scalar,
)
from cudf._lib.strings.contains import contains_re, count_re, match_re
from cudf._lib.strings.convert.convert_fixed_point import to_decimal
from cudf._lib.strings.convert.convert_floats import is_float
from cudf._lib.strings.convert.convert_integers import is_integer
from cudf._lib.strings.convert.convert_urls import url_decode, url_encode
from cudf._lib.strings.extract import extract
from cudf._lib.strings.find import (
contains,
contains_multiple,
endswith,
endswith_multiple,
find,
rfind,
startswith,
startswith_multiple,
)
from cudf._lib.strings.findall import findall
from cudf._lib.strings.json import get_json_object
from cudf._lib.strings.padding import PadSide, center, ljust, pad, rjust, zfill
from cudf._lib.strings.replace import (
insert,
replace,
replace_multi,
slice_replace,
)
from cudf._lib.strings.replace_re import (
replace_multi_re,
replace_re,
replace_with_backrefs,
)
from cudf._lib.strings.split.partition import partition, rpartition
from cudf._lib.strings.split.split import (
rsplit,
rsplit_record,
split,
split_record,
)
from cudf._lib.strings.strip import lstrip, rstrip, strip
from cudf._lib.strings.substring import get, slice_from, slice_strings
from cudf._lib.strings.translate import filter_characters, translate
from cudf._lib.strings.wrap import wrap
24 changes: 12 additions & 12 deletions python/cudf/cudf/_lib/strings/combine.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@ from cudf._lib.cpp.strings.combine cimport (


def concatenate(Table source_strings,
object py_separator,
object py_narep):
object sep,
object na_rep):
"""
Returns a Column by concatenating strings column-wise in `source_strings`
with the specified `py_separator` between each column and
`na`/`None` values are replaced by `py_narep`
with the specified `sep` between each column and
`na`/`None` values are replaced by `na_rep`
"""
cdef DeviceScalar separator = py_separator.device_value
cdef DeviceScalar narep = py_narep.device_value
cdef DeviceScalar separator = sep.device_value
cdef DeviceScalar narep = na_rep.device_value

cdef unique_ptr[column] c_result
cdef table_view source_view = source_strings.data_view()
Expand All @@ -53,16 +53,16 @@ def concatenate(Table source_strings,


def join(Column source_strings,
object py_separator,
object py_narep):
object sep,
object na_rep):
"""
Returns a Column by concatenating strings row-wise in `source_strings`
with the specified `py_separator` between each column and
`na`/`None` values are replaced by `py_narep`
with the specified `sep` between each column and
`na`/`None` values are replaced by `na_rep`
"""

cdef DeviceScalar separator = py_separator.device_value
cdef DeviceScalar narep = py_narep.device_value
cdef DeviceScalar separator = sep.device_value
cdef DeviceScalar narep = na_rep.device_value

cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()
Expand Down
7 changes: 3 additions & 4 deletions python/cudf/cudf/_lib/transpose.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,10 @@ def transpose(Table source):
if is_categorical_dtype(dtype):
if any(not is_categorical_dtype(c.dtype) for c in source._columns):
raise ValueError('Columns must all have the same dtype')
cats = list(c.cat().categories for c in source._columns)
cats = cudf.Series(cudf.concat(cats)).drop_duplicates()._column
cats = list(c.categories for c in source._columns)
cats = cudf.core.column.concat_columns(cats).unique()
source = Table(index=source._index, data=[
(name, col.cat()._set_categories(
col.cat().categories, cats, is_unique=True).codes)
(name, col._set_categories(cats, is_unique=True).codes)
for name, col in source._data.items()
])
elif dtype.kind in 'OU':
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,4 @@
BinaryOperand = Union["cudf.Scalar", "cudf.core.column.ColumnBase"]

DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"]
SeriesOrIndex = Union["cudf.Series", "cudf.core.index.BaseIndex"]
2 changes: 1 addition & 1 deletion python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def _union_categoricals(
sorted_categories = result_col.categories.sort_by_values(
ascending=True
)[0]
result_col = result_col.cat().reorder_categories(
result_col = result_col.reorder_categories(
new_categories=sorted_categories
)

Expand Down
6 changes: 6 additions & 0 deletions python/cudf/cudf/core/column/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
"""
isort: skip_file
"""


from cudf.core.column.categorical import CategoricalColumn
from cudf.core.column.column import (
Expand All @@ -12,6 +16,7 @@
column_empty,
column_empty_like,
column_empty_like_same_mask,
concat_columns,
deserialize_columns,
full,
serialize_columns,
Expand All @@ -27,3 +32,4 @@
Decimal32Column,
Decimal64Column,
)
from cudf.core.column.interval import IntervalColumn # noqa: F401
Loading

0 comments on commit 65a38af

Please sign in to comment.