From ab35f7dca36ee456b909a33fccb7fb27a652620d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 29 Apr 2021 10:45:01 -0400 Subject: [PATCH 01/14] Add initial Buffer.copy() --- python/cudf/cudf/core/buffer.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 9fc5570e35a..293fefc380c 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -141,6 +141,13 @@ def empty(cls, size: int) -> Buffer: dbuf = DeviceBuffer(size=size) return Buffer(dbuf) + def copy(self): + from rmm._lib.device_buffer import copy_device_to_ptr + + out = Buffer(DeviceBuffer(size=self.size)) + copy_device_to_ptr(self.ptr, out.ptr, self.size) + return out + def _buffer_data_from_array_interface(array_interface): ptr = array_interface["data"][0] From 72197a2a6da6ba2a95bb848526ffde97bafa39d8 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 29 Apr 2021 11:21:49 -0400 Subject: [PATCH 02/14] Add Buffer copy tests --- python/cudf/cudf/tests/test_buffer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/test_buffer.py index 241d719f09e..4600d932c6f 100644 --- a/python/cudf/cudf/tests/test_buffer.py +++ b/python/cudf/cudf/tests/test_buffer.py @@ -1,5 +1,6 @@ import cupy as cp import pytest +from cupy.testing import assert_array_equal from cudf.core.buffer import Buffer @@ -44,3 +45,14 @@ def test_buffer_from_cuda_iface_dtype(data, dtype): TypeError, match="Buffer data must be of uint8 type" ): buf = Buffer(data=data, size=data.size) # noqa: F841 + + +@pytest.mark.parametrize("size", [0, 1, 10, 100, 1000, 10_000]) +def test_buffer_copy(size): + data = cp.random.randint(low=0, high=100, size=size, dtype="u1") + buf = Buffer(data=data) + got = buf.copy() + assert got.size == buf.size + if size > 0: + assert got.ptr != buf.ptr + assert_array_equal(cp.asarray(buf), cp.asarray(got)) From 9b4611b6b7833ef3496d91d51082477d03c063fa Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 29 Apr 2021 11:22:18 -0400 Subject: [PATCH 03/14] Docstring --- python/cudf/cudf/core/buffer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 293fefc380c..c6875052685 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -142,6 +142,10 @@ def empty(cls, size: int) -> Buffer: return Buffer(dbuf) def copy(self): + """ + Create a new Buffer containing a copy of the data contained + in this Buffer. 
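As an aside, the behaviour pinned down by the new `Buffer.copy()` and its test above can be sketched from the user side roughly as follows. This is illustrative only and assumes a built cudf with CuPy available, mirroring `test_buffer_copy`:

    import cupy as cp
    from cudf.core.buffer import Buffer

    data = cp.arange(8, dtype="u1")     # any uint8 device array works
    buf = Buffer(data=data)
    out = buf.copy()                    # device-to-device copy via RMM

    assert out.size == buf.size
    assert out.ptr != buf.ptr           # separate allocations (size > 0)
    assert (cp.asarray(buf) == cp.asarray(out)).all()   # same bytes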
+ """ from rmm._lib.device_buffer import copy_device_to_ptr out = Buffer(DeviceBuffer(size=self.size)) From 8d64f843b6fb6e4d640d3c61e5e5e56157c88403 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 20 May 2021 11:58:17 -0400 Subject: [PATCH 04/14] Initial refactor of ColumnMethods --- python/cudf/cudf/_lib/strings/__init__.py | 88 ++++ python/cudf/cudf/_lib/strings/combine.pyx | 24 +- python/cudf/cudf/core/column/categorical.py | 440 ++++++++++------ python/cudf/cudf/core/column/lists.py | 33 +- python/cudf/cudf/core/column/methods.py | 17 +- python/cudf/cudf/core/column/string.py | 550 +++++++++----------- python/cudf/cudf/core/column/struct.py | 15 +- python/cudf/cudf/core/frame.py | 9 +- python/cudf/cudf/core/index.py | 16 +- python/cudf/cudf/core/join/_join_helpers.py | 4 +- python/cudf/cudf/core/series.py | 8 +- python/cudf/cudf/core/tools/numeric.py | 22 +- python/cudf/cudf/io/csv.py | 2 +- 13 files changed, 693 insertions(+), 535 deletions(-) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index e69de29bb2d..e942f742c66 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -0,0 +1,88 @@ +from cudf._lib.nvtext.edit_distance import edit_distance +from cudf._lib.nvtext.generate_ngrams import ( + generate_character_ngrams, + generate_ngrams, +) +from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize +from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces +from cudf._lib.nvtext.replace import filter_tokens, replace_tokens +from cudf._lib.nvtext.stemmer import ( + LetterType, + is_letter, + is_letter_multi, + porter_stemmer_measure, +) +from cudf._lib.nvtext.subword_tokenize import subword_tokenize_vocab_file +from cudf._lib.nvtext.tokenize import ( + _count_tokens_column, + _count_tokens_scalar, + _tokenize_column, + _tokenize_scalar, + character_tokenize, + detokenize, +) +from cudf._lib.strings.attributes import ( + code_points, + count_bytes, + count_characters, +) +from cudf._lib.strings.capitalize import capitalize, title +from cudf._lib.strings.case import swapcase, to_lower, to_upper +from cudf._lib.strings.char_types import ( + filter_alphanum, + is_alnum, + is_alpha, + is_decimal, + is_digit, + is_lower, + is_numeric, + is_space, + is_upper, +) +from cudf._lib.strings.combine import ( + concatenate, + join, + join_lists_with_column, + join_lists_with_scalar, +) +from cudf._lib.strings.contains import contains_re, count_re, match_re +from cudf._lib.strings.convert.convert_fixed_point import to_decimal +from cudf._lib.strings.convert.convert_floats import is_float +from cudf._lib.strings.convert.convert_integers import is_integer +from cudf._lib.strings.convert.convert_urls import url_decode, url_encode +from cudf._lib.strings.extract import extract +from cudf._lib.strings.find import ( + contains, + contains_multiple, + endswith, + endswith_multiple, + find, + rfind, + startswith, + startswith_multiple, +) +from cudf._lib.strings.findall import findall +from cudf._lib.strings.json import get_json_object +from cudf._lib.strings.padding import PadSide, center, ljust, pad, rjust, zfill +from cudf._lib.strings.replace import ( + insert, + replace, + replace_multi, + slice_replace, +) +from cudf._lib.strings.replace_re import ( + replace_multi_re, + replace_re, + replace_with_backrefs, +) +from cudf._lib.strings.split.partition import partition, rpartition +from cudf._lib.strings.split.split import ( + rsplit, + rsplit_record, + split, + 
split_record, +) +from cudf._lib.strings.strip import lstrip, rstrip, strip +from cudf._lib.strings.substring import get, slice_from, slice_strings +from cudf._lib.strings.translate import filter_characters, translate +from cudf._lib.strings.wrap import wrap diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 25619de3ed0..4560695f280 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -21,15 +21,15 @@ from cudf._lib.cpp.strings.combine cimport ( def concatenate(Table source_strings, - object py_separator, - object py_narep): + object sep, + object na_rep): """ Returns a Column by concatenating strings column-wise in `source_strings` - with the specified `py_separator` between each column and - `na`/`None` values are replaced by `py_narep` + with the specified `sep` between each column and + `na`/`None` values are replaced by `na_rep` """ - cdef DeviceScalar separator = py_separator.device_value - cdef DeviceScalar narep = py_narep.device_value + cdef DeviceScalar separator = sep.device_value + cdef DeviceScalar narep = na_rep.device_value cdef unique_ptr[column] c_result cdef table_view source_view = source_strings.data_view() @@ -51,16 +51,16 @@ def concatenate(Table source_strings, def join(Column source_strings, - object py_separator, - object py_narep): + object sep, + object na_rep): """ Returns a Column by concatenating strings row-wise in `source_strings` - with the specified `py_separator` between each column and - `na`/`None` values are replaced by `py_narep` + with the specified `sep` between each column and + `na`/`None` values are replaced by `na_rep` """ - cdef DeviceScalar separator = py_separator.device_value - cdef DeviceScalar narep = py_narep.device_value + cdef DeviceScalar separator = sep.device_value + cdef DeviceScalar narep = na_rep.device_value cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c199947d261..aed4425093f 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -53,7 +53,7 @@ class CategoricalAccessor(ColumnMethodsMixin): _column: CategoricalColumn - def __init__(self, column: Any, parent: ParentType = None): + def __init__(self, parent: ParentType): """ Accessor object for categorical properties of the Series values. Be aware that assigning to `categories` is a inplace operation, @@ -107,18 +107,18 @@ def __init__(self, column: Any, parent: ParentType = None): dtype: category Categories (3, int64): [1, 2, 3] """ - if not is_categorical_dtype(column.dtype): + if not is_categorical_dtype(parent.dtype): raise AttributeError( "Can only use .cat accessor with a 'category' dtype" ) - super().__init__(column=column, parent=parent) + super().__init__(parent=parent) @property def categories(self) -> "cudf.Index": """ The categories of this categorical. """ - return cudf.core.index.as_index(self._column.categories) + return cudf.core.index.as_index(self._parent._column.categories) @property def codes(self) -> "cudf.Series": @@ -130,14 +130,14 @@ def codes(self) -> "cudf.Series": if isinstance(self._parent, cudf.Series) else None ) - return cudf.Series(self._column.codes, index=index) + return cudf.Series(self._parent._column.codes, index=index) @property def ordered(self) -> Optional[bool]: """ Whether the categories have an ordered relationship. 
""" - return self._column.ordered + return self._parent._column.ordered def as_ordered(self, inplace: bool = False) -> Optional[ParentType]: """ @@ -192,13 +192,9 @@ def as_ordered(self, inplace: bool = False) -> Optional[ParentType]: dtype: category Categories (3, int64): [1 < 2 < 10] """ - out_col = self._column - if not out_col.ordered: - out_col = self._set_categories( - self._column.categories, self._column.categories, ordered=True, - ) - - return self._return_or_inplace(out_col, inplace=inplace) + return self._return_or_inplace( + self._parent._column.as_ordered(), inplace=inplace + ) def as_unordered(self, inplace: bool = False) -> Optional[ParentType]: """ @@ -264,13 +260,9 @@ def as_unordered(self, inplace: bool = False) -> Optional[ParentType]: dtype: category Categories (3, int64): [1, 2, 10] """ - out_col = self._column - if out_col.ordered: - out_col = self._set_categories( - self._column.categories, self.categories, ordered=False - ) - - return self._return_or_inplace(out_col, inplace=inplace) + return self._return_or_inplace( + self._parent._column.as_unordered(), inplace=inplace + ) def add_categories( self, new_categories: Any, inplace: bool = False @@ -326,7 +318,7 @@ def add_categories( Categories (5, int64): [1, 2, 0, 3, 4] """ - old_categories = self._column.categories + old_categories = self._parent._column.categories new_categories = column.as_column( new_categories, dtype=old_categories.dtype if len(new_categories) == 0 else None, @@ -351,9 +343,9 @@ def add_categories( raise ValueError("new categories must not include old categories") new_categories = old_categories.append(new_categories) - out_col = self._column - if not self._categories_equal(new_categories): - out_col = self._set_categories(old_categories, new_categories) + out_col = self._parent._column + if not out_col._categories_equal(new_categories): + out_col = out_col._set_categories(new_categories) return self._return_or_inplace(out_col, inplace=inplace) @@ -441,11 +433,9 @@ def remove_categories( raise ValueError(f"removals must all be in old categories: {vals}") new_categories = cats[~cats.isin(removals)]._column - out_col = self._column - if not self._categories_equal(new_categories): - out_col = self._set_categories( - self._column.categories, new_categories - ) + out_col = self._parent._column + if not out_col._categories_equal(new_categories): + out_col = out_col._set_categories(new_categories) return self._return_or_inplace(out_col, inplace=inplace) @@ -548,7 +538,7 @@ def set_categories( # categories. if rename: # enforce same length - if len(new_categories) != len(self._column.categories): + if len(new_categories) != len(self._parent._column.categories): raise ValueError( "new_categories must have the same " "number of items as old categories" @@ -556,29 +546,29 @@ def set_categories( out_col = column.build_categorical_column( categories=new_categories, - codes=self._column.base_children[0], - mask=self._column.base_mask, - size=self._column.size, - offset=self._column.offset, + codes=self._parent._column.base_children[0], + mask=self._parent._column.base_mask, + size=self._parent._column.size, + offset=self._parent._column.offset, ordered=ordered, ) else: - out_col = self._column + out_col = self._parent._column if not (type(out_col.categories) is type(new_categories)): # If both categories are of different Column types, # return a column full of Nulls. 
out_col = _create_empty_categorical_column( - self._column, + self._parent._column, CategoricalDtype( categories=new_categories, ordered=ordered ), ) elif ( - not self._categories_equal(new_categories, ordered=ordered) + not out_col._categories_equal(new_categories, ordered=ordered) or not self.ordered == ordered ): - out_col = self._set_categories( - self._column.categories, new_categories, ordered=ordered, + out_col = out_col._set_categories( + new_categories, ordered=ordered, ) return self._return_or_inplace(out_col, inplace=inplace) @@ -658,102 +648,21 @@ def reorder_categories( # Ignore order for comparison because we're only interested # in whether new_categories has all the same values as the # current set of categories. - if not self._categories_equal(new_categories, ordered=False): + if not self._parent._column._categories_equal( + new_categories, ordered=False + ): raise ValueError( "items in new_categories are not the same as in " "old categories" ) - out_col = self._set_categories( - self._column.categories, new_categories, ordered=ordered + out_col = self._parent._column._set_categories( + new_categories, ordered=ordered ) return self._return_or_inplace(out_col, inplace=inplace) - def _categories_equal( - self, new_categories: ColumnBase, ordered=False - ) -> bool: - cur_categories = self._column.categories - if len(new_categories) != len(cur_categories): - return False - if new_categories.dtype != cur_categories.dtype: - return False - # if order doesn't matter, sort before the equals call below - if not ordered: - cur_categories = cudf.Series(cur_categories).sort_values( - ignore_index=True - ) - new_categories = cudf.Series(new_categories).sort_values( - ignore_index=True - ) - return cur_categories.equals(new_categories) - - def _set_categories( - self, - current_categories: Any, - new_categories: Any, - is_unique: bool = False, - ordered: bool = False, - ) -> CategoricalColumn: - """Returns a new CategoricalColumn with the categories set to the - specified *new_categories*. 
- - Notes - ----- - Assumes ``new_categories`` is the same dtype as the current categories - """ - - cur_cats = column.as_column(current_categories) - new_cats = column.as_column(new_categories) - - # Join the old and new categories to build a map from - # old to new codes, inserting na_sentinel for any old - # categories that don't exist in the new categories - - # Ensure new_categories is unique first - if not (is_unique or new_cats.is_unique): - # drop_duplicates() instead of unique() to preserve order - new_cats = ( - cudf.Series(new_cats) - .drop_duplicates(ignore_index=True) - ._column - ) - - cur_codes = self.codes - max_cat_size = ( - len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats) - ) - out_code_dtype = min_unsigned_type(max_cat_size) - - cur_order = column.arange(len(cur_codes)) - old_codes = column.arange(len(cur_cats), dtype=out_code_dtype) - new_codes = column.arange(len(new_cats), dtype=out_code_dtype) - - new_df = cudf.DataFrame({"new_codes": new_codes, "cats": new_cats}) - old_df = cudf.DataFrame({"old_codes": old_codes, "cats": cur_cats}) - cur_df = cudf.DataFrame({"old_codes": cur_codes, "order": cur_order}) - - # Join the old and new categories and line up their codes - df = old_df.merge(new_df, on="cats", how="left") - # Join the old and new codes to "recode" the codes data buffer - df = cur_df.merge(df, on="old_codes", how="left") - df = df.sort_values(by="order") - df.reset_index(drop=True, inplace=True) - - ordered = ordered if ordered is not None else self.ordered - new_codes = df["new_codes"]._column - - # codes can't have masks, so take mask out before moving in - return column.build_categorical_column( - categories=new_cats, - codes=column.as_column(new_codes.base_data, dtype=new_codes.dtype), - mask=new_codes.base_mask, - size=new_codes.size, - offset=new_codes.offset, - ordered=ordered, - ) - def _decategorize(self) -> ColumnBase: - return self._column._get_decategorized_column() + return self._parent._column._get_decategorized_column() class CategoricalColumn(column.ColumnBase): @@ -941,9 +850,6 @@ def ordered(self) -> Optional[bool]: def ordered(self, value: bool): self.dtype.ordered = value - def cat(self, parent: ParentType = None): - return CategoricalAccessor(self, parent=parent) - def unary_operator(self, unaryop: str): raise TypeError( f"Series of dtype `category` cannot perform the operation: " @@ -1091,7 +997,7 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series: col = self signed_dtype = min_signed_type(len(col.categories)) - codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array() + codes = col.codes.astype(signed_dtype).fillna(-1).to_array() categories = col.categories.dropna(drop_nan=True).to_pandas() data = pd.Categorical.from_codes( codes, categories=categories, ordered=col.ordered @@ -1222,13 +1128,11 @@ def find_and_replace( # named 'index', which came from the filtered categories, # contains the new ints that we need to map to to_replace_col = column.as_column(catmap.index).astype( - self.cat().codes.dtype - ) - replacement_col = catmap["index"]._column.astype( - self.cat().codes.dtype + self.codes.dtype ) + replacement_col = catmap["index"]._column.astype(self.codes.dtype) - replaced = column.as_column(self.cat().codes) + replaced = column.as_column(self.codes) output = libcudf.replace.replace( replaced, to_replace_col, replacement_col ) @@ -1306,10 +1210,8 @@ def fillna( ) # TODO: only required if fill_value has a subset of the # categories: - fill_value = fill_value.cat()._set_categories( - 
fill_value.cat().categories, - self.categories, - is_unique=True, + fill_value = fill_value._set_categories( + self.categories, is_unique=True, ) fill_value = column.as_column(fill_value.codes).astype( self.codes.dtype @@ -1377,8 +1279,8 @@ def as_categorical_column( # return a column full of Nulls. return _create_empty_categorical_column(self, dtype) - return self.cat().set_categories( - new_categories=dtype.categories, ordered=dtype.ordered + return self.set_categories( + new_categories=dtype.categories, ordered=bool(dtype.ordered) ) def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: @@ -1402,8 +1304,8 @@ def as_timedelta_column(self, dtype, **kwargs) -> TimeDeltaColumn: def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes - return self.cat().codes._column - gather_map = self.cat().codes.astype("int32").fillna(0)._column + return self.codes + gather_map = self.codes.astype("int32").fillna(0) out = self.categories.take(gather_map) out = out.set_mask(self.mask) return out @@ -1436,19 +1338,14 @@ def copy(self, deep: bool = True) -> CategoricalColumn: ) def __sizeof__(self) -> int: - return ( - self.cat().categories.__sizeof__() + self.cat().codes.__sizeof__() - ) + return self.categories.__sizeof__() + self.codes.__sizeof__() def _memory_usage(self, **kwargs) -> int: deep = kwargs.get("deep", False) if deep: return self.__sizeof__() else: - return ( - self.categories._memory_usage() - + self.cat().codes.memory_usage() - ) + return self.categories._memory_usage() + self.codes._memory_usage() def _mimic_inplace( self, other_col: ColumnBase, inplace: bool = False @@ -1475,14 +1372,9 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: # Combine and de-dupe the categories cats = ( - cudf.concat([o.cat().categories for o in objs]) - .drop_duplicates() - ._column + cudf.concat([o.categories for o in objs]).drop_duplicates()._column ) - objs = [ - o.cat()._set_categories(o.cat().categories, cats, is_unique=True) - for o in objs - ] + objs = [o._set_categories(cats, is_unique=True) for o in objs] codes = [o.codes for o in objs] newsize = sum(map(len, codes)) @@ -1506,6 +1398,238 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: offset=codes_col.offset, ) + def set_categories( + self, new_categories: Any, ordered: bool = False, rename: bool = False, + ) -> CategoricalColumn: + """ + Set the categories to the specified new_categories. + + + `new_categories` can include new categories (which + will result in unused categories) or remove old categories + (which results in values set to null). If `rename==True`, + the categories will simple be renamed (less or more items + than in old categories will result in values set to null or + in unused categories respectively). + + This method can be used to perform more than one action + of adding, removing, and reordering simultaneously and + is therefore faster than performing the individual steps + via the more specialised methods. + + On the other hand this methods does not do checks + (e.g., whether the old categories are included in the + new categories on a reorder), which can result in + surprising changes. + + Parameters + ---------- + + new_categories : list-like + The categories in new order. + + ordered : bool, default None + Whether or not the categorical is treated as + a ordered categorical. If not given, do + not change the ordered information. 
+ + rename : bool, default False + Whether or not the `new_categories` should be + considered as a rename of the old categories + or as reordered categories. + + Returns + ------- + cat + Categorical with reordered categories + or None if inplace. + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([1, 1, 2, 10, 2, 10], dtype='category') + >>> s + 0 1 + 1 1 + 2 2 + 3 10 + 4 2 + 5 10 + dtype: category + Categories (3, int64): [1, 2, 10] + >>> s.cat.set_categories([1, 10]) + 0 1 + 1 1 + 2 + 3 10 + 4 + 5 10 + dtype: category + Categories (2, int64): [1, 10] + >>> s.cat.set_categories([1, 10], inplace=True) + >>> s + 0 1 + 1 1 + 2 + 3 10 + 4 + 5 10 + dtype: category + Categories (2, int64): [1, 10] + """ + ordered = ordered if ordered is not None else self.ordered + new_categories = column.as_column(new_categories) + + if isinstance(new_categories, CategoricalColumn): + new_categories = new_categories.categories + + # when called with rename=True, the pandas behavior is + # to replace the current category values with the new + # categories. + if rename: + # enforce same length + if len(new_categories) != len(self.categories): + raise ValueError( + "new_categories must have the same " + "number of items as old categories" + ) + + out_col = column.build_categorical_column( + categories=new_categories, + codes=self.base_children[0], + mask=self.base_mask, + size=self.size, + offset=self.offset, + ordered=ordered, + ) + else: + out_col = self + if not (type(out_col.categories) is type(new_categories)): + # If both categories are of different Column types, + # return a column full of Nulls. + out_col = _create_empty_categorical_column( + self, + CategoricalDtype( + categories=new_categories, ordered=ordered + ), + ) + elif ( + not out_col._categories_equal(new_categories, ordered=ordered) + or not self.ordered == ordered + ): + out_col = out_col._set_categories( + new_categories, ordered=ordered, + ) + return out_col + + def _categories_equal( + self, new_categories: ColumnBase, ordered=False + ) -> bool: + cur_categories = self.categories + if len(new_categories) != len(cur_categories): + return False + if new_categories.dtype != cur_categories.dtype: + return False + # if order doesn't matter, sort before the equals call below + if not ordered: + cur_categories = cudf.Series(cur_categories).sort_values( + ignore_index=True + ) + new_categories = cudf.Series(new_categories).sort_values( + ignore_index=True + ) + return cur_categories.equals(new_categories) + + def _set_categories( + self, + new_categories: Any, + is_unique: bool = False, + ordered: bool = False, + ) -> CategoricalColumn: + """Returns a new CategoricalColumn with the categories set to the + specified *new_categories*. 
+ + Notes + ----- + Assumes ``new_categories`` is the same dtype as the current categories + """ + + cur_cats = column.as_column(self.categories) + new_cats = column.as_column(new_categories) + + # Join the old and new categories to build a map from + # old to new codes, inserting na_sentinel for any old + # categories that don't exist in the new categories + + # Ensure new_categories is unique first + if not (is_unique or new_cats.is_unique): + # drop_duplicates() instead of unique() to preserve order + new_cats = ( + cudf.Series(new_cats) + .drop_duplicates(ignore_index=True) + ._column + ) + + cur_codes = self.codes + max_cat_size = ( + len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats) + ) + out_code_dtype = min_unsigned_type(max_cat_size) + + cur_order = column.arange(len(cur_codes)) + old_codes = column.arange(len(cur_cats), dtype=out_code_dtype) + new_codes = column.arange(len(new_cats), dtype=out_code_dtype) + + new_df = cudf.DataFrame({"new_codes": new_codes, "cats": new_cats}) + old_df = cudf.DataFrame({"old_codes": old_codes, "cats": cur_cats}) + cur_df = cudf.DataFrame({"old_codes": cur_codes, "order": cur_order}) + + # Join the old and new categories and line up their codes + df = old_df.merge(new_df, on="cats", how="left") + # Join the old and new codes to "recode" the codes data buffer + df = cur_df.merge(df, on="old_codes", how="left") + df = df.sort_values(by="order") + df.reset_index(drop=True, inplace=True) + + ordered = ordered if ordered is not None else self.ordered + new_codes = df["new_codes"]._column + + # codes can't have masks, so take mask out before moving in + return column.build_categorical_column( + categories=new_cats, + codes=column.as_column(new_codes.base_data, dtype=new_codes.dtype), + mask=new_codes.base_mask, + size=new_codes.size, + offset=new_codes.offset, + ordered=ordered, + ) + + def as_ordered(self): + out_col = self + if not out_col.ordered: + out_col = column.build_categorical_column( + categories=self.categories, + codes=self.codes, + mask=self.base_mask, + size=self.base_size, + offset=self.offset, + ordered=True, + ) + return out_col + + def as_unordered(self): + out_col = self + if out_col.ordered: + out_col = column.build_categorical_column( + categories=self.categories, + codes=self.codes, + mask=self.base_mask, + size=self.base_size, + offset=self.offset, + ordered=False, + ) + return out_col + def _create_empty_categorical_column( categorical_column: CategoricalColumn, dtype: "CategoricalDtype" @@ -1516,7 +1640,7 @@ def _create_empty_categorical_column( cudf.utils.utils.scalar_broadcast_to( categorical_column.default_na_value(), categorical_column.size, - np.dtype(categorical_column.cat().codes), + np.dtype(categorical_column.codes), ) ), offset=categorical_column.offset, diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 7ea02c0e878..3d09bafd9ad 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -145,9 +145,6 @@ def offsets(self): """ return self.children[0] - def list(self, parent=None): - return ListMethods(self, parent=parent) - def to_arrow(self): offsets = self.offsets.to_arrow() elements = ( @@ -239,12 +236,12 @@ class ListMethods(ColumnMethodsMixin): List methods for Series """ - def __init__(self, column, parent=None): - if not is_list_dtype(column.dtype): + def __init__(self, parent=None): + if not is_list_dtype(parent.dtype): raise AttributeError( "Can only use .list accessor with a 'list' dtype" ) - 
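The join-based recode inside `_set_categories` above is easiest to see on a toy example. The sketch below uses plain pandas rather than cudf, with made-up categories and codes, purely to illustrate the mapping; the real implementation additionally carries an explicit `order` column so that a final sort restores the original row order:

    import pandas as pd

    cur_cats = pd.Series(["a", "b", "c"])     # current categories
    new_cats = pd.Series(["b", "c", "d"])     # requested categories
    cur_codes = pd.Series([0, 2, 1, 0])       # encoded values: a, c, b, a

    old = pd.DataFrame({"old_codes": range(len(cur_cats)), "cats": cur_cats})
    new = pd.DataFrame({"new_codes": range(len(new_cats)), "cats": new_cats})
    mapping = old.merge(new, on="cats", how="left")   # a -> NaN, b -> 0, c -> 1

    recoded = (
        pd.DataFrame({"old_codes": cur_codes})
        .merge(mapping, on="old_codes", how="left")["new_codes"]
    )
    print(recoded.tolist())   # [nan, 1.0, 0.0, nan]: dropped category 'a' becomes null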
super().__init__(column=column, parent=parent) + super().__init__(parent=parent) def get(self, index): """ @@ -273,7 +270,7 @@ def get(self, index): min_col_list_len = self.len().min() if -min_col_list_len <= index < min_col_list_len: return self._return_or_inplace( - extract_element(self._column, index) + extract_element(self._parent._column, index) ) else: raise IndexError("list index out of range") @@ -302,7 +299,7 @@ def contains(self, search_key): search_key = cudf.Scalar(search_key) try: res = self._return_or_inplace( - contains_scalar(self._column, search_key) + contains_scalar(self._parent._column, search_key) ) except RuntimeError as e: if ( @@ -339,11 +336,11 @@ def leaves(self): 5 6 dtype: int64 """ - if type(self._column.elements) is ListColumn: - return self._column.elements.list(parent=self._parent).leaves + if type(self._parent._column.elements) is ListColumn: + return self._parent._column.elements.elements else: return self._return_or_inplace( - self._column.elements, retain_index=False + self._parent._column.elements, retain_index=False ) def len(self): @@ -368,7 +365,7 @@ def len(self): 2 2 dtype: int32 """ - return self._return_or_inplace(count_elements(self._column)) + return self._return_or_inplace(count_elements(self._parent._column)) def take(self, lists_indices): """ @@ -401,7 +398,7 @@ def take(self, lists_indices): lists_indices_col = as_column(lists_indices) if not isinstance(lists_indices_col, ListColumn): raise ValueError("lists_indices should be list type array.") - if not lists_indices_col.size == self._column.size: + if not lists_indices_col.size == self._parent._column.size: raise ValueError( "lists_indices and list column is of different " "size." ) @@ -416,7 +413,7 @@ def take(self, lists_indices): try: res = self._return_or_inplace( - segmented_gather(self._column, lists_indices_col) + segmented_gather(self._parent._column, lists_indices_col) ) except RuntimeError as e: if "contains nulls" in str(e): @@ -451,12 +448,12 @@ def unique(self): dtype: list """ - if is_list_dtype(self._column.children[1].dtype): + if is_list_dtype(self._parent._column.children[1].dtype): raise NotImplementedError("Nested lists unique is not supported.") return self._return_or_inplace( drop_list_duplicates( - self._column, nulls_equal=True, nans_all_equal=True + self._parent._column, nulls_equal=True, nans_all_equal=True ) ) @@ -506,10 +503,10 @@ def sort_values( raise NotImplementedError("`kind` not currently implemented.") if na_position not in {"first", "last"}: raise ValueError(f"Unknown `na_position` value {na_position}") - if is_list_dtype(self._column.children[1].dtype): + if is_list_dtype(self._parent._column.children[1].dtype): raise NotImplementedError("Nested lists sort is not supported.") return self._return_or_inplace( - sort_lists(self._column, ascending, na_position), + sort_lists(self._parent._column, ascending, na_position), retain_index=not ignore_index, ) diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index eec9c2a7860..e2f4acde8cd 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -2,26 +2,17 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional, Union, overload +from typing import Optional, Union, overload from typing_extensions import Literal import cudf -if TYPE_CHECKING: - from cudf.core.column import ColumnBase - class ColumnMethodsMixin: - _column: ColumnBase - _parent: Optional[Union["cudf.Series", "cudf.Index"]] + _parent: 
Union["cudf.Series", "cudf.Index"] - def __init__( - self, - column: ColumnBase, - parent: Union["cudf.Series", "cudf.Index"] = None, - ): - self._column = column + def __init__(self, parent: Union["cudf.Series", "cudf.Index"]): self._parent = parent @overload @@ -69,7 +60,7 @@ def _return_or_inplace( ) return None else: - self._column._mimic_inplace(new_col, inplace=True) + self._parent._column._mimic_inplace(new_col, inplace=True) return None else: if self._parent is None: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 940b38ef5ff..36a7e159dc3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -15,145 +15,8 @@ import cudf from cudf import _lib as libcudf -from cudf._lib import string_casting as str_cast +from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column -from cudf._lib.nvtext.edit_distance import edit_distance as cpp_edit_distance -from cudf._lib.nvtext.generate_ngrams import ( - generate_character_ngrams as cpp_generate_character_ngrams, - generate_ngrams as cpp_generate_ngrams, -) -from cudf._lib.nvtext.ngrams_tokenize import ( - ngrams_tokenize as cpp_ngrams_tokenize, -) -from cudf._lib.nvtext.normalize import ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) -from cudf._lib.nvtext.replace import ( - filter_tokens as cpp_filter_tokens, - replace_tokens as cpp_replace_tokens, -) -from cudf._lib.nvtext.stemmer import ( - LetterType, - is_letter as cpp_is_letter, - is_letter_multi as cpp_is_letter_multi, - porter_stemmer_measure as cpp_porter_stemmer_measure, -) -from cudf._lib.nvtext.subword_tokenize import ( - subword_tokenize_vocab_file as cpp_subword_tokenize_vocab_file, -) -from cudf._lib.nvtext.tokenize import ( - _count_tokens_column as cpp_count_tokens_column, - _count_tokens_scalar as cpp_count_tokens_scalar, - _tokenize_column as cpp_tokenize_column, - _tokenize_scalar as cpp_tokenize_scalar, - character_tokenize as cpp_character_tokenize, - detokenize as cpp_detokenize, -) -from cudf._lib.strings.attributes import ( - code_points as cpp_code_points, - count_bytes as cpp_count_bytes, - count_characters as cpp_count_characters, -) -from cudf._lib.strings.capitalize import ( - capitalize as cpp_capitalize, - title as cpp_title, -) -from cudf._lib.strings.case import ( - swapcase as cpp_swapcase, - to_lower as cpp_to_lower, - to_upper as cpp_to_upper, -) -from cudf._lib.strings.char_types import ( - filter_alphanum as cpp_filter_alphanum, - is_alnum as cpp_is_alnum, - is_alpha as cpp_is_alpha, - is_decimal as cpp_is_decimal, - is_digit as cpp_is_digit, - is_lower as cpp_is_lower, - is_numeric as cpp_is_numeric, - is_space as cpp_isspace, - is_upper as cpp_is_upper, -) -from cudf._lib.strings.combine import ( - concatenate as cpp_concatenate, - join as cpp_join, - join_lists_with_column as cpp_join_lists_with_column, - join_lists_with_scalar as cpp_join_lists_with_scalar, -) -from cudf._lib.strings.contains import ( - contains_re as cpp_contains_re, - count_re as cpp_count_re, - match_re as cpp_match_re, -) -from cudf._lib.strings.convert.convert_fixed_point import ( - to_decimal as cpp_to_decimal, -) -from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float -from cudf._lib.strings.convert.convert_integers import ( - is_integer as cpp_is_integer, -) -from cudf._lib.strings.convert.convert_urls import ( - url_decode as cpp_url_decode, - url_encode as 
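The intent of the `ColumnMethodsMixin` change above is that an accessor no longer caches a column of its own; everything is reached through the parent, so replacing the parent's column is always visible to the accessor. A minimal stand-alone sketch of the pattern (the `Wrapper`/`Accessor` names here are hypothetical, not cudf classes):

    class Column:
        def __init__(self, values):
            self.values = list(values)

    class Accessor:
        def __init__(self, parent):
            self._parent = parent            # only the parent is stored

        def first(self):
            # always read through the parent, so the accessor never goes stale
            return self._parent._column.values[0]

    class Wrapper:                           # stands in for Series / Index
        def __init__(self, column):
            self._column = column
            self.cat = Accessor(self)

    s = Wrapper(Column([1, 2, 3]))
    assert s.cat.first() == 1
    s._column = Column([9])                  # e.g. an in-place recategorization
    assert s.cat.first() == 9                # no stale cached column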
cpp_url_encode, -) -from cudf._lib.strings.extract import extract as cpp_extract -from cudf._lib.strings.find import ( - contains as cpp_contains, - contains_multiple as cpp_contains_multiple, - endswith as cpp_endswith, - endswith_multiple as cpp_endswith_multiple, - find as cpp_find, - rfind as cpp_rfind, - startswith as cpp_startswith, - startswith_multiple as cpp_startswith_multiple, -) -from cudf._lib.strings.findall import findall as cpp_findall -from cudf._lib.strings.json import get_json_object as cpp_get_json_object -from cudf._lib.strings.padding import ( - PadSide, - center as cpp_center, - ljust as cpp_ljust, - pad as cpp_pad, - rjust as cpp_rjust, - zfill as cpp_zfill, -) -from cudf._lib.strings.replace import ( - insert as cpp_string_insert, - replace as cpp_replace, - replace_multi as cpp_replace_multi, - slice_replace as cpp_slice_replace, -) -from cudf._lib.strings.replace_re import ( - replace_multi_re as cpp_replace_multi_re, - replace_re as cpp_replace_re, - replace_with_backrefs as cpp_replace_with_backrefs, -) -from cudf._lib.strings.split.partition import ( - partition as cpp_partition, - rpartition as cpp_rpartition, -) -from cudf._lib.strings.split.split import ( - rsplit as cpp_rsplit, - rsplit_record as cpp_rsplit_record, - split as cpp_split, - split_record as cpp_split_record, -) -from cudf._lib.strings.strip import ( - lstrip as cpp_lstrip, - rstrip as cpp_rstrip, - strip as cpp_strip, -) -from cudf._lib.strings.substring import ( - get as cpp_string_get, - slice_from as cpp_slice_from, - slice_strings as cpp_slice_strings, -) -from cudf._lib.strings.translate import ( - filter_characters as cpp_filter_characters, - translate as cpp_translate, -) -from cudf._lib.strings.wrap import wrap as cpp_wrap from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column, datetime @@ -216,7 +79,7 @@ class StringMethods(ColumnMethodsMixin): - def __init__(self, column, parent=None): + def __init__(self, parent=None): """ Vectorized string functions for Series and Index. @@ -226,13 +89,15 @@ def __init__(self, column, parent=None): inspiration from R’s stringr package. 
""" value_type = ( - column.dtype.leaf_type if is_list_dtype(column) else column.dtype + parent.dtype.leaf_type + if is_list_dtype(parent.dtype) + else parent.dtype ) if not is_string_dtype(value_type): raise AttributeError( "Can only use .str accessor with string values" ) - super().__init__(column=column, parent=parent) + super().__init__(parent=parent) def htoi(self) -> ParentType: """ @@ -255,7 +120,7 @@ def htoi(self) -> ParentType: dtype: int64 """ - out = str_cast.htoi(self._column) + out = str_cast.htoi(self._parent._column) return self._return_or_inplace(out, inplace=False) @@ -286,7 +151,7 @@ def ip2int(self) -> ParentType: dtype: int64 """ - out = str_cast.ip2int(self._column) + out = str_cast.ip2int(self._parent._column) return self._return_or_inplace(out, inplace=False) @@ -316,7 +181,9 @@ def len(self) -> ParentType: dtype: int32 """ - return self._return_or_inplace(cpp_count_characters(self._column)) + return self._return_or_inplace( + libstrings.count_characters(self._parent._column) + ) def byte_count(self) -> ParentType: """ @@ -343,7 +210,9 @@ def byte_count(self) -> ParentType: 2 11 dtype: int32 """ - return self._return_or_inplace(cpp_count_bytes(self._column),) + return self._return_or_inplace( + libstrings.count_bytes(self._parent._column), + ) @overload def cat(self, sep: str = None, na_rep: str = None) -> str: @@ -443,13 +312,15 @@ def cat(self, others=None, sep=None, na_rep=None): sep = "" if others is None: - data = cpp_join( - self._column, cudf.Scalar(sep), cudf.Scalar(na_rep, "str"), + data = libstrings.join( + self._parent._column, + cudf.Scalar(sep), + cudf.Scalar(na_rep, "str"), ) else: other_cols = _get_cols_list(self._parent, others) - all_cols = [self._column] + other_cols - data = cpp_concatenate( + all_cols = [self._parent._column] + other_cols + data = libstrings.concatenate( cudf.DataFrame( {index: value for index, value in enumerate(all_cols)} ), @@ -595,15 +466,15 @@ def join( f" of type : {type(string_na_rep)}" ) - if isinstance(self._column, cudf.core.column.ListColumn): - strings_column = self._column + if isinstance(self._parent._column, cudf.core.column.ListColumn): + strings_column = self._parent._column else: - # If self._column is not a ListColumn, we will have to + # If self._parent._column is not a ListColumn, we will have to # split each row by character and create a ListColumn out of it. 
strings_column = self._split_by_character() if is_scalar(sep): - data = cpp_join_lists_with_scalar( + data = libstrings.join_lists_with_scalar( strings_column, cudf.Scalar(sep), cudf.Scalar(string_na_rep) ) elif can_convert_to_column(sep): @@ -619,7 +490,7 @@ def join( f"of type: {type(sep_na_rep)}" ) - data = cpp_join_lists_with_column( + data = libstrings.join_lists_with_column( strings_column, sep_column, cudf.Scalar(string_na_rep), @@ -634,16 +505,16 @@ def join( return self._return_or_inplace(data) def _split_by_character(self): - result_col = cpp_character_tokenize(self._column) + result_col = libstrings.character_tokenize(self._parent._column) - offset_col = self._column.children[0] + offset_col = self._parent._column.children[0] res = cudf.core.column.ListColumn( - size=len(self._column), - dtype=cudf.ListDtype(self._column.dtype), - mask=self._column.mask, + size=len(self._parent._column), + dtype=cudf.ListDtype(self._parent._column.dtype), + mask=self._parent._column.mask, offset=0, - null_count=self._column.null_count, + null_count=self._parent._column.null_count, children=(offset_col, result_col), ) return res @@ -708,7 +579,7 @@ def extract( if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") - out = cpp_extract(self._column, pat) + out = libstrings.extract(self._parent._column, pat) if out._num_columns == 1 and expand is False: return self._return_or_inplace(out._columns[0], expand=expand) else: @@ -835,18 +706,18 @@ def contains( if pat is None: result_col = column.column_empty( - len(self._column), dtype="bool", masked=True + len(self._parent._column), dtype="bool", masked=True ) elif is_scalar(pat): if regex is True: - result_col = cpp_contains_re(self._column, pat) + result_col = libstrings.contains_re(self._parent._column, pat) else: - result_col = cpp_contains( - self._column, cudf.Scalar(pat, "str") + result_col = libstrings.contains( + self._parent._column, cudf.Scalar(pat, "str") ) else: - result_col = cpp_contains_multiple( - self._column, column.as_column(pat, dtype="str") + result_col = libstrings.contains_multiple( + self._parent._column, column.as_column(pat, dtype="str") ) return self._return_or_inplace(result_col) @@ -934,12 +805,14 @@ def replace( ) return self._return_or_inplace( - cpp_replace_multi_re( - self._column, pat, column.as_column(repl, dtype="str") + libstrings.replace_multi_re( + self._parent._column, + pat, + column.as_column(repl, dtype="str"), ) if regex - else cpp_replace_multi( - self._column, + else libstrings.replace_multi( + self._parent._column, column.as_column(pat, dtype="str"), column.as_column(repl, dtype="str"), ), @@ -950,10 +823,12 @@ def replace( # Pandas forces non-regex replace when pat is a single-character return self._return_or_inplace( - cpp_replace_re(self._column, pat, cudf.Scalar(repl, "str"), n) + libstrings.replace_re( + self._parent._column, pat, cudf.Scalar(repl, "str"), n + ) if regex is True and len(pat) > 1 - else cpp_replace( - self._column, + else libstrings.replace( + self._parent._column, cudf.Scalar(pat, "str"), cudf.Scalar(repl, "str"), n, @@ -987,7 +862,7 @@ def replace_with_backrefs(self, pat: str, repl: str) -> ParentType: dtype: object """ return self._return_or_inplace( - cpp_replace_with_backrefs(self._column, pat, repl) + libstrings.replace_with_backrefs(self._parent._column, pat, repl) ) def slice( @@ -1058,7 +933,7 @@ def slice( """ return self._return_or_inplace( - cpp_slice_strings(self._column, start, stop, step), + libstrings.slice_strings(self._parent._column, 
start, stop, step), ) def isinteger(self) -> ParentType: @@ -1119,7 +994,9 @@ def isinteger(self) -> ParentType: 2 False dtype: bool """ - return self._return_or_inplace(cpp_is_integer(self._column)) + return self._return_or_inplace( + libstrings.is_integer(self._parent._column) + ) def ishex(self) -> ParentType: """ @@ -1158,7 +1035,7 @@ def ishex(self) -> ParentType: 4 True dtype: bool """ - return self._return_or_inplace(str_cast.is_hex(self._column)) + return self._return_or_inplace(str_cast.is_hex(self._parent._column)) def istimestamp(self, format: str) -> ParentType: """ @@ -1181,7 +1058,7 @@ def istimestamp(self, format: str) -> ParentType: dtype: bool """ return self._return_or_inplace( - str_cast.istimestamp(self._column, format) + str_cast.istimestamp(self._parent._column, format) ) def isfloat(self) -> ParentType: @@ -1245,7 +1122,9 @@ def isfloat(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_float(self._column)) + return self._return_or_inplace( + libstrings.is_float(self._parent._column) + ) def isdecimal(self) -> ParentType: """ @@ -1306,7 +1185,9 @@ def isdecimal(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_decimal(self._column)) + return self._return_or_inplace( + libstrings.is_decimal(self._parent._column) + ) def isalnum(self) -> ParentType: """ @@ -1375,7 +1256,9 @@ def isalnum(self) -> ParentType: 2 False dtype: bool """ - return self._return_or_inplace(cpp_is_alnum(self._column)) + return self._return_or_inplace( + libstrings.is_alnum(self._parent._column) + ) def isalpha(self) -> ParentType: """ @@ -1431,7 +1314,9 @@ def isalpha(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_alpha(self._column)) + return self._return_or_inplace( + libstrings.is_alpha(self._parent._column) + ) def isdigit(self) -> ParentType: """ @@ -1493,7 +1378,9 @@ def isdigit(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_digit(self._column)) + return self._return_or_inplace( + libstrings.is_digit(self._parent._column) + ) def isnumeric(self) -> ParentType: """ @@ -1561,7 +1448,9 @@ def isnumeric(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_numeric(self._column)) + return self._return_or_inplace( + libstrings.is_numeric(self._parent._column) + ) def isupper(self) -> ParentType: """ @@ -1618,7 +1507,9 @@ def isupper(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_upper(self._column)) + return self._return_or_inplace( + libstrings.is_upper(self._parent._column) + ) def islower(self) -> ParentType: """ @@ -1675,7 +1566,9 @@ def islower(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_lower(self._column)) + return self._return_or_inplace( + libstrings.is_lower(self._parent._column) + ) def isipv4(self) -> ParentType: """ @@ -1699,7 +1592,7 @@ def isipv4(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(str_cast.is_ipv4(self._column)) + return self._return_or_inplace(str_cast.is_ipv4(self._parent._column)) def lower(self) -> ParentType: """ @@ -1738,7 +1631,9 @@ def lower(self) -> ParentType: 3 swapcase dtype: object """ - return self._return_or_inplace(cpp_to_lower(self._column)) + return self._return_or_inplace( + libstrings.to_lower(self._parent._column) + ) def upper(self) -> ParentType: """ @@ -1787,7 +1682,9 @@ def upper(self) -> ParentType: 3 SWAPCASE dtype: object """ - return 
self._return_or_inplace(cpp_to_upper(self._column)) + return self._return_or_inplace( + libstrings.to_upper(self._parent._column) + ) def capitalize(self) -> ParentType: """ @@ -1815,7 +1712,9 @@ def capitalize(self) -> ParentType: 1 Goodbye, friend dtype: object """ - return self._return_or_inplace(cpp_capitalize(self._column)) + return self._return_or_inplace( + libstrings.capitalize(self._parent._column) + ) def swapcase(self) -> ParentType: """ @@ -1860,7 +1759,9 @@ def swapcase(self) -> ParentType: 3 sWaPcAsE dtype: object """ - return self._return_or_inplace(cpp_swapcase(self._column)) + return self._return_or_inplace( + libstrings.swapcase(self._parent._column) + ) def title(self) -> ParentType: """ @@ -1905,7 +1806,7 @@ def title(self) -> ParentType: 3 Swapcase dtype: object """ - return self._return_or_inplace(cpp_title(self._column)) + return self._return_or_inplace(libstrings.title(self._parent._column)) def filter_alphanum( self, repl: str = None, keep: bool = True @@ -1941,7 +1842,9 @@ def filter_alphanum( repl = "" return self._return_or_inplace( - cpp_filter_alphanum(self._column, cudf.Scalar(repl), keep), + libstrings.filter_alphanum( + self._parent._column, cudf.Scalar(repl), keep + ), ) def slice_from( @@ -1984,8 +1887,10 @@ def slice_from( """ return self._return_or_inplace( - cpp_slice_from( - self._column, column.as_column(starts), column.as_column(stops) + libstrings.slice_from( + self._parent._column, + column.as_column(starts), + column.as_column(stops), ), ) @@ -2073,7 +1978,9 @@ def slice_replace( repl = "" return self._return_or_inplace( - cpp_slice_replace(self._column, start, stop, cudf.Scalar(repl)), + libstrings.slice_replace( + self._parent._column, start, stop, cudf.Scalar(repl) + ), ) def insert(self, start: int = 0, repl: str = None) -> ParentType: @@ -2123,7 +2030,7 @@ def insert(self, start: int = 0, repl: str = None) -> ParentType: repl = "" return self._return_or_inplace( - cpp_string_insert(self._column, start, cudf.Scalar(repl)), + libstrings.insert(self._parent._column, start, cudf.Scalar(repl)), ) def get(self, i: int = 0) -> ParentType: @@ -2168,7 +2075,7 @@ def get(self, i: int = 0) -> ParentType: dtype: object """ - return self._return_or_inplace(cpp_string_get(self._column, i)) + return self._return_or_inplace(libstrings.get(self._parent._column, i)) def get_json_object(self, json_path): """ @@ -2221,8 +2128,8 @@ def get_json_object(self, json_path): try: res = self._return_or_inplace( - cpp_get_json_object( - self._column, cudf.Scalar(json_path, "str") + libstrings.get_json_object( + self._parent._column, cudf.Scalar(json_path, "str") ) ) except RuntimeError as e: @@ -2366,18 +2273,22 @@ def split( pat = "" if expand: - if self._column.null_count == len(self._column): - result_table = cudf.core.frame.Frame({0: self._column.copy()}) + if self._parent._column.null_count == len(self._parent._column): + result_table = cudf.core.frame.Frame( + {0: self._parent._column.copy()} + ) else: - result_table = cpp_split( - self._column, cudf.Scalar(pat, "str"), n + result_table = libstrings.split( + self._parent._column, cudf.Scalar(pat, "str"), n ) if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._column): + if result_table._data[0].null_count == len( + self._parent._column + ): result_table = cudf.core.frame.Frame({}) else: - result_table = cpp_split_record( - self._column, cudf.Scalar(pat, "str"), n + result_table = libstrings.split_record( + self._parent._column, cudf.Scalar(pat, "str"), n ) return 
self._return_or_inplace(result_table, expand=expand) @@ -2521,15 +2432,23 @@ def rsplit( pat = "" if expand: - if self._column.null_count == len(self._column): - result_table = cudf.core.frame.Frame({0: self._column.copy()}) + if self._parent._column.null_count == len(self._parent._column): + result_table = cudf.core.frame.Frame( + {0: self._parent._column.copy()} + ) else: - result_table = cpp_rsplit(self._column, cudf.Scalar(pat), n) + result_table = libstrings.rsplit( + self._parent._column, cudf.Scalar(pat), n + ) if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._column): + if result_table._data[0].null_count == len( + self._parent._column + ): result_table = cudf.core.frame.Frame({}) else: - result_table = cpp_rsplit_record(self._column, cudf.Scalar(pat), n) + result_table = libstrings.rsplit_record( + self._parent._column, cudf.Scalar(pat), n + ) return self._return_or_inplace(result_table, expand=expand) @@ -2610,7 +2529,8 @@ def partition(self, sep: str = " ", expand: bool = True) -> ParentType: sep = " " return self._return_or_inplace( - cpp_partition(self._column, cudf.Scalar(sep)), expand=expand + libstrings.partition(self._parent._column, cudf.Scalar(sep)), + expand=expand, ) def rpartition(self, sep: str = " ", expand: bool = True) -> ParentType: @@ -2674,7 +2594,8 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> ParentType: sep = " " return self._return_or_inplace( - cpp_rpartition(self._column, cudf.Scalar(sep)), expand=expand + libstrings.rpartition(self._parent._column, cudf.Scalar(sep)), + expand=expand, ) def pad( @@ -2754,14 +2675,14 @@ def pad( raise TypeError(msg) try: - side = PadSide[side.upper()] + side = libstrings.PadSide[side.upper()] except KeyError: raise ValueError( "side has to be either one of {‘left’, ‘right’, ‘both’}" ) return self._return_or_inplace( - cpp_pad(self._column, width, fillchar, side) + libstrings.pad(self._parent._column, width, fillchar, side) ) def zfill(self, width: int) -> ParentType: @@ -2835,7 +2756,9 @@ def zfill(self, width: int) -> ParentType: msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - return self._return_or_inplace(cpp_zfill(self._column, width)) + return self._return_or_inplace( + libstrings.zfill(self._parent._column, width) + ) def center(self, width: int, fillchar: str = " ") -> ParentType: """ @@ -2906,7 +2829,7 @@ def center(self, width: int, fillchar: str = " ") -> ParentType: raise TypeError(msg) return self._return_or_inplace( - cpp_center(self._column, width, fillchar) + libstrings.center(self._parent._column, width, fillchar) ) def ljust(self, width: int, fillchar: str = " ") -> ParentType: @@ -2960,7 +2883,7 @@ def ljust(self, width: int, fillchar: str = " ") -> ParentType: raise TypeError(msg) return self._return_or_inplace( - cpp_ljust(self._column, width, fillchar) + libstrings.ljust(self._parent._column, width, fillchar) ) def rjust(self, width: int, fillchar: str = " ") -> ParentType: @@ -3014,7 +2937,7 @@ def rjust(self, width: int, fillchar: str = " ") -> ParentType: raise TypeError(msg) return self._return_or_inplace( - cpp_rjust(self._column, width, fillchar) + libstrings.rjust(self._parent._column, width, fillchar) ) def strip(self, to_strip: str = None) -> ParentType: @@ -3073,7 +2996,7 @@ def strip(self, to_strip: str = None) -> ParentType: to_strip = "" return self._return_or_inplace( - cpp_strip(self._column, cudf.Scalar(to_strip)) + libstrings.strip(self._parent._column, cudf.Scalar(to_strip)) ) def 
lstrip(self, to_strip: str = None) -> ParentType: @@ -3120,7 +3043,7 @@ def lstrip(self, to_strip: str = None) -> ParentType: to_strip = "" return self._return_or_inplace( - cpp_lstrip(self._column, cudf.Scalar(to_strip)) + libstrings.lstrip(self._parent._column, cudf.Scalar(to_strip)) ) def rstrip(self, to_strip: str = None) -> ParentType: @@ -3175,7 +3098,7 @@ def rstrip(self, to_strip: str = None) -> ParentType: to_strip = "" return self._return_or_inplace( - cpp_rstrip(self._column, cudf.Scalar(to_strip)) + libstrings.rstrip(self._parent._column, cudf.Scalar(to_strip)) ) def wrap(self, width: int, **kwargs) -> ParentType: @@ -3270,7 +3193,9 @@ def wrap(self, width: int, **kwargs) -> ParentType: "`break_on_hyphens`=False" ) - return self._return_or_inplace(cpp_wrap(self._column, width)) + return self._return_or_inplace( + libstrings.wrap(self._parent._column, width) + ) def count(self, pat: str, flags: int = 0) -> ParentType: """ @@ -3330,7 +3255,9 @@ def count(self, pat: str, flags: int = 0) -> ParentType: if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") - return self._return_or_inplace(cpp_count_re(self._column, pat)) + return self._return_or_inplace( + libstrings.count_re(self._parent._column, pat) + ) def findall( self, pat: str, flags: int = 0, expand: bool = True @@ -3399,7 +3326,7 @@ def findall( raise NotImplementedError("`flags` parameter is not yet supported") return self._return_or_inplace( - cpp_findall(self._column, pat), expand=expand + libstrings.findall(self._parent._column, pat), expand=expand ) def isempty(self) -> ParentType: @@ -3422,7 +3349,9 @@ def isempty(self) -> ParentType: 4 False dtype: bool """ - return self._return_or_inplace((self._column == "").fillna(False)) + return self._return_or_inplace( + (self._parent._column == "").fillna(False) + ) def isspace(self) -> ParentType: """ @@ -3478,7 +3407,9 @@ def isspace(self) -> ParentType: 2 False dtype: bool """ - return self._return_or_inplace(cpp_isspace(self._column)) + return self._return_or_inplace( + libstrings.is_space(self._parent._column) + ) def endswith(self, pat: str) -> ParentType: """ @@ -3523,13 +3454,15 @@ def endswith(self, pat: str) -> ParentType: """ if pat is None: result_col = column.column_empty( - len(self._column), dtype="bool", masked=True + len(self._parent._column), dtype="bool", masked=True ) elif is_scalar(pat): - result_col = cpp_endswith(self._column, cudf.Scalar(pat, "str")) + result_col = libstrings.endswith( + self._parent._column, cudf.Scalar(pat, "str") + ) else: - result_col = cpp_endswith_multiple( - self._column, column.as_column(pat, dtype="str") + result_col = libstrings.endswith_multiple( + self._parent._column, column.as_column(pat, dtype="str") ) return self._return_or_inplace(result_col) @@ -3583,13 +3516,15 @@ def startswith(self, pat: Union[str, Sequence]) -> ParentType: """ if pat is None: result_col = column.column_empty( - len(self._column), dtype="bool", masked=True + len(self._parent._column), dtype="bool", masked=True ) elif is_scalar(pat): - result_col = cpp_startswith(self._column, cudf.Scalar(pat, "str")) + result_col = libstrings.startswith( + self._parent._column, cudf.Scalar(pat, "str") + ) else: - result_col = cpp_startswith_multiple( - self._column, column.as_column(pat, dtype="str") + result_col = libstrings.startswith_multiple( + self._parent._column, column.as_column(pat, dtype="str") ) return self._return_or_inplace(result_col) @@ -3643,8 +3578,8 @@ def find(self, sub: str, start: int = 0, end: int = None) -> 
ParentType: if end is None: end = -1 - result_col = cpp_find( - self._column, cudf.Scalar(sub, "str"), start, end + result_col = libstrings.find( + self._parent._column, cudf.Scalar(sub, "str"), start, end ) return self._return_or_inplace(result_col) @@ -3702,8 +3637,8 @@ def rfind(self, sub: str, start: int = 0, end: int = None) -> ParentType: if end is None: end = -1 - result_col = cpp_rfind( - self._column, cudf.Scalar(sub, "str"), start, end + result_col = libstrings.rfind( + self._parent._column, cudf.Scalar(sub, "str"), start, end ) return self._return_or_inplace(result_col) @@ -3757,8 +3692,8 @@ def index(self, sub: str, start: int = 0, end: int = None) -> ParentType: if end is None: end = -1 - result_col = cpp_find( - self._column, cudf.Scalar(sub, "str"), start, end + result_col = libstrings.find( + self._parent._column, cudf.Scalar(sub, "str"), start, end ) result = self._return_or_inplace(result_col) @@ -3817,8 +3752,8 @@ def rindex(self, sub: str, start: int = 0, end: int = None) -> ParentType: if end is None: end = -1 - result_col = cpp_rfind( - self._column, cudf.Scalar(sub, "str"), start, end + result_col = libstrings.rfind( + self._parent._column, cudf.Scalar(sub, "str"), start, end ) result = self._return_or_inplace(result_col) @@ -3871,7 +3806,9 @@ def match(self, pat: str, case: bool = True, flags: int = 0) -> ParentType: if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") - return self._return_or_inplace(cpp_match_re(self._column, pat)) + return self._return_or_inplace( + libstrings.match_re(self._parent._column, pat) + ) def url_decode(self) -> ParentType: """ @@ -3901,7 +3838,9 @@ def url_decode(self) -> ParentType: dtype: object """ - return self._return_or_inplace(cpp_url_decode(self._column)) + return self._return_or_inplace( + libstrings.url_decode(self._parent._column) + ) def url_encode(self) -> ParentType: """ @@ -3932,7 +3871,9 @@ def url_encode(self) -> ParentType: 1 https%3A%2F%2Fmedium.com%2Frapids-ai dtype: object """ - return self._return_or_inplace(cpp_url_encode(self._column)) + return self._return_or_inplace( + libstrings.url_encode(self._parent._column) + ) def code_points(self) -> ParentType: """ @@ -3966,7 +3907,7 @@ def code_points(self) -> ParentType: dtype: int32 """ - new_col = cpp_code_points(self._column) + new_col = libstrings.code_points(self._parent._column) if isinstance(self._parent, cudf.Series): return cudf.Series(new_col, name=self._parent.name) elif isinstance(self._parent, cudf.Index): @@ -4015,7 +3956,9 @@ def translate(self, table: dict) -> ParentType: dtype: object """ table = str.maketrans(table) - return self._return_or_inplace(cpp_translate(self._column, table)) + return self._return_or_inplace( + libstrings.translate(self._parent._column, table) + ) def filter_characters( self, table: dict, keep: bool = True, repl: str = None @@ -4065,8 +4008,8 @@ def filter_characters( repl = "" table = str.maketrans(table) return self._return_or_inplace( - cpp_filter_characters( - self._column, table, keep, cudf.Scalar(repl) + libstrings.filter_characters( + self._parent._column, table, keep, cudf.Scalar(repl) ), ) @@ -4088,7 +4031,9 @@ def normalize_spaces(self) -> ParentType: 1 test string dtype: object """ - return self._return_or_inplace(cpp_normalize_spaces(self._column)) + return self._return_or_inplace( + libstrings.normalize_spaces(self._parent._column) + ) def normalize_characters(self, do_lower: bool = True) -> ParentType: """ @@ -4136,7 +4081,7 @@ def normalize_characters(self, do_lower: bool = True) 
-> ParentType: dtype: object """ return self._return_or_inplace( - cpp_normalize_characters(self._column, do_lower) + libstrings.normalize_characters(self._parent._column, do_lower) ) def tokenize(self, delimiter: str = " ") -> ParentType: @@ -4172,12 +4117,12 @@ def tokenize(self, delimiter: str = " ") -> ParentType: if isinstance(delimiter, Column): return self._return_or_inplace( - cpp_tokenize_column(self._column, delimiter), + libstrings._tokenize_column(self._parent._column, delimiter), retain_index=False, ) elif isinstance(delimiter, cudf.Scalar): return self._return_or_inplace( - cpp_tokenize_scalar(self._column, delimiter), + libstrings._tokenize_scalar(self._parent._column, delimiter), retain_index=False, ) else: @@ -4219,7 +4164,9 @@ def detokenize( """ separator = _massage_string_arg(separator, "separator") return self._return_or_inplace( - cpp_detokenize(self._column, indices._column, separator), + libstrings.detokenize( + self._parent._column, indices._column, separator + ), retain_index=False, ) @@ -4270,7 +4217,7 @@ def character_tokenize(self) -> ParentType: 29 . dtype: object """ - result_col = cpp_character_tokenize(self._column) + result_col = libstrings.character_tokenize(self._parent._column) if isinstance(self._parent, cudf.Series): return cudf.Series(result_col, name=self._parent.name) elif isinstance(self._parent, cudf.Index): @@ -4307,12 +4254,16 @@ def token_count(self, delimiter: str = " ") -> ParentType: delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) if isinstance(delimiter, Column): return self._return_or_inplace( - cpp_count_tokens_column(self._column, delimiter) + libstrings._count_tokens_column( + self._parent._column, delimiter + ) ) elif isinstance(delimiter, cudf.Scalar): return self._return_or_inplace( - cpp_count_tokens_scalar(self._column, delimiter) + libstrings._count_tokens_scalar( + self._parent._column, delimiter + ) ) else: raise TypeError( @@ -4354,7 +4305,8 @@ def ngrams(self, n: int = 2, separator: str = "_") -> ParentType: """ separator = _massage_string_arg(separator, "separator") return self._return_or_inplace( - cpp_generate_ngrams(self._column, n, separator), retain_index=False + libstrings.generate_ngrams(self._parent._column, n, separator), + retain_index=False, ) def character_ngrams(self, n: int = 2) -> ParentType: @@ -4390,7 +4342,8 @@ def character_ngrams(self, n: int = 2) -> ParentType: dtype: object """ return self._return_or_inplace( - cpp_generate_character_ngrams(self._column, n), retain_index=False + libstrings.generate_character_ngrams(self._parent._column, n), + retain_index=False, ) def ngrams_tokenize( @@ -4427,7 +4380,9 @@ def ngrams_tokenize( delimiter = _massage_string_arg(delimiter, "delimiter") separator = _massage_string_arg(separator, "separator") return self._return_or_inplace( - cpp_ngrams_tokenize(self._column, n, delimiter, separator), + libstrings.ngrams_tokenize( + self._parent._column, n, delimiter, separator + ), retain_index=False, ) @@ -4506,8 +4461,8 @@ def replace_tokens( ) return self._return_or_inplace( - cpp_replace_tokens( - self._column, + libstrings.replace_tokens( + self._parent._column, targets_column, replacements_column, cudf.Scalar(delimiter, dtype="str"), @@ -4577,8 +4532,8 @@ def filter_tokens( ) return self._return_or_inplace( - cpp_filter_tokens( - self._column, + libstrings.filter_tokens( + self._parent._column, min_token_length, cudf.Scalar(replacement, dtype="str"), cudf.Scalar(delimiter, dtype="str"), @@ -4673,8 +4628,8 @@ def subword_tokenize( array([[0, 0, 
2], [1, 0, 1]], dtype=uint32) """ - tokens, masks, metadata = cpp_subword_tokenize_vocab_file( - self._column, + tokens, masks, metadata = libstrings.subword_tokenize_vocab_file( + self._parent._column, hash_file, max_length, stride, @@ -4708,7 +4663,7 @@ def porter_stemmer_measure(self) -> ParentType: dtype: int32 """ return self._return_or_inplace( - cpp_porter_stemmer_measure(self._column) + libstrings.porter_stemmer_measure(self._parent._column) ) def is_consonant(self, position) -> ParentType: @@ -4742,17 +4697,17 @@ def is_consonant(self, position) -> ParentType: 1 False dtype: bool """ - ltype = LetterType.CONSONANT + ltype = libstrings.LetterType.CONSONANT if can_convert_to_column(position): return self._return_or_inplace( - cpp_is_letter_multi( - self._column, ltype, column.as_column(position) + libstrings.is_letter_multi( + self._parent._column, ltype, column.as_column(position) ), ) return self._return_or_inplace( - cpp_is_letter(self._column, ltype, position) + libstrings.is_letter(self._parent._column, ltype, position) ) def is_vowel(self, position) -> ParentType: @@ -4786,17 +4741,17 @@ def is_vowel(self, position) -> ParentType: 1 True dtype: bool """ - ltype = LetterType.VOWEL + ltype = libstrings.LetterType.VOWEL if can_convert_to_column(position): return self._return_or_inplace( - cpp_is_letter_multi( - self._column, ltype, column.as_column(position) + libstrings.is_letter_multi( + self._parent._column, ltype, column.as_column(position) ), ) return self._return_or_inplace( - cpp_is_letter(self._column, ltype, position) + libstrings.is_letter(self._parent._column, ltype, position) ) def edit_distance(self, targets) -> ParentType: @@ -4845,7 +4800,7 @@ def edit_distance(self, targets) -> ParentType: ) return self._return_or_inplace( - cpp_edit_distance(self._column, targets_column) + libstrings.edit_distance(self._parent._column, targets_column) ) @@ -5034,7 +4989,11 @@ def sum( skipna=skipna, min_count=min_count ) if isinstance(result_col, type(self)): - return result_col.str().cat() + return libstrings.join( + result_col, + sep=cudf.Scalar(""), + na_rep=cudf.Scalar(None, "str"), + )[0] else: return result_col @@ -5055,10 +5014,7 @@ def set_base_children(self, value: Tuple["column.ColumnBase", ...]): super().set_base_children(value) def __contains__(self, item: ScalarLike) -> bool: - return True in self.str().contains(f"^{item}$") - - def str(self, parent: ParentType = None) -> StringMethods: - return StringMethods(self, parent=parent) + return True in libstrings.contains_re(self, f"^{item}$") def as_numerical_column( self, dtype: Dtype @@ -5066,13 +5022,13 @@ def as_numerical_column( out_dtype = np.dtype(dtype) if out_dtype.kind in {"i", "u"}: - if not cpp_is_integer(self).all(): + if not libstrings.is_integer(self).all(): raise ValueError( "Could not convert strings to integer " "type due to presence of non-integer values." ) elif out_dtype.kind == "f": - if not cpp_is_float(self).all(): + if not libstrings.is_float(self).all(): raise ValueError( "Could not convert strings to float " "type due to presence of non-floating values." 
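The hunks above swap the function-by-function cpp_* imports for the single libstrings namespace, so the same Cython helpers are now reached as attributes of cudf._lib.strings. The short sketch below is illustrative only and not part of the patch; it assumes a build of cudf with this refactor applied, and the sample values are made up.

    # Sketch of the namespace-style calls used in as_numerical_column above.
    import cudf
    from cudf._lib import strings as libstrings

    col = cudf.Series(["1", "2", "x"])._column   # a StringColumn
    libstrings.is_integer(col)                   # boolean column: True, True, False
    libstrings.is_float(col)                     # boolean column: True, True, False
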
@@ -5136,7 +5092,7 @@ def as_timedelta_column( def as_decimal_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.DecimalColumn": - return cpp_to_decimal(self, dtype) + return libstrings.to_decimal(self, dtype) def as_string_column(self, dtype: Dtype, format=None) -> StringColumn: return self @@ -5243,9 +5199,12 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: if self.dtype == to_dtype: return True - elif to_dtype.kind in {"i", "u"} and not cpp_is_integer(self).all(): + elif ( + to_dtype.kind in {"i", "u"} + and not libstrings.is_integer(self).all() + ): return False - elif to_dtype.kind == "f" and not cpp_is_float(self).all(): + elif to_dtype.kind == "f" and not libstrings.is_float(self).all(): return False else: return True @@ -5305,7 +5264,7 @@ def fillna( return super().fillna(method=method) def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: - found_indices = self.str().contains(f"^{value}$") + found_indices = libstrings.contains(self, f"^{value}$") found_indices = libcudf.unary.cast(found_indices, dtype=np.int32) first = column.as_column(found_indices).find_first_value(1) last = column.as_column(found_indices).find_last_value(1) @@ -5349,10 +5308,17 @@ def binary_operator( lhs, rhs = rhs, lhs if isinstance(rhs, (StringColumn, str, cudf.Scalar)): if op == "add": - return cast("column.ColumnBase", lhs.str().cat(others=rhs)) + return cast( + "column.ColumnBase", + libstrings.concatenate( + cudf.DataFrame({0: lhs, 1: rhs}), + sep=cudf.Scalar(""), + na_rep=cudf.Scalar(None, "str"), + ), + ) elif op in ("eq", "ne", "gt", "lt", "ge", "le", "NULL_EQUALS"): return libcudf.binaryop.binaryop( - lhs=self, rhs=rhs, op=op, dtype="bool" + lhs=lhs, rhs=rhs, op=op, dtype="bool" ) raise TypeError( diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index c2b820d0b43..6c47d94d6e7 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -84,9 +84,6 @@ def copy(self, deep=True): result = result._rename_fields(self.dtype.fields.keys()) return result - def struct(self, parent=None): - return StructMethods(self, parent=parent) - def _rename_fields(self, names): """ Return a StructColumn with the same field values as this StructColumn, @@ -117,12 +114,12 @@ class StructMethods(ColumnMethodsMixin): Struct methods for Series """ - def __init__(self, column, parent=None): - if not is_struct_dtype(column.dtype): + def __init__(self, parent=None): + if not is_struct_dtype(parent.dtype): raise AttributeError( "Can only use .struct accessor with a 'struct' dtype" ) - super().__init__(column=column, parent=parent) + super().__init__(parent=parent) def field(self, key): """ @@ -151,9 +148,9 @@ def field(self, key): 1 3 dtype: int64 """ - fields = list(self._column.dtype.fields.keys()) + fields = list(self._parent._column.dtype.fields.keys()) if key in fields: pos = fields.index(key) - return self._return_or_inplace(self._column.children[pos]) + return self._return_or_inplace(self._parent._column.children[pos]) else: - return self._return_or_inplace(self._column.children[key]) + return self._return_or_inplace(self._parent._column.children[key]) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f59954aaf08..3d44d137754 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -4037,7 +4037,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): ): # Combine and de-dupe the categories categories[idx] = ( - 
cudf.concat([col.cat().categories for col in cols]) + cudf.concat([col.categories for col in cols]) .to_series() .drop_duplicates(ignore_index=True) ._column @@ -4079,12 +4079,7 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): if idx in categories: cols[idx] = ( cols[idx] - .cat() - ._set_categories( - cols[idx].cat().categories, - categories[idx], - is_unique=True, - ) + ._set_categories(categories[idx], is_unique=True,) .codes ) cols[idx] = cols[idx].astype(dtype) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3b977a8ced6..bcc77f17b5c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2474,17 +2474,15 @@ def __new__( dtype = None if categories is not None: - data.cat().set_categories( - categories, ordered=ordered, inplace=True - ) + data.set_categories(categories, ordered=ordered, inplace=True) elif isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): - data.cat().set_categories( + data.set_categories( dtype.categories, ordered=ordered, inplace=True ) elif ordered is True and data.ordered is False: - data.cat().as_ordered(inplace=True) + data = data.as_ordered() elif ordered is False and data.ordered is True: - data.cat().as_unordered(inplace=True) + data = data.as_unordered() out._initialize(data, **kwargs) @@ -2495,14 +2493,14 @@ def codes(self): """ The category codes of this categorical. """ - return self._values.cat().codes + return as_index(self._values.codes) @property def categories(self): """ The categories of this categorical. """ - return self._values.cat().categories + return cudf.Series(self._values.categories) def interval_range( @@ -2782,7 +2780,7 @@ def __repr__(self): @copy_docstring(StringMethods.__init__) # type: ignore @property def str(self): - return StringMethods(column=self._values, parent=self) + return StringMethods(parent=self) def _clean_nulls_from_index(self): """ diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 5e15ddfc359..78fc7a863d6 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -188,7 +188,9 @@ def _match_categorical_dtypes_both( if how == "inner": # cast to category types -- we must cast them back later return _match_join_keys( - lcol.cat()._decategorize(), rcol.cat()._decategorize(), how, + lcol._get_decategorized_column(), + rcol._get_decategorized_column(), + how, ) elif how in {"left", "leftanti", "leftsemi"}: # always cast to left type diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d812214caf8..f57c3bc931b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2329,22 +2329,22 @@ def __invert__(self): @copy_docstring(CategoricalAccessor.__init__) # type: ignore @property def cat(self): - return CategoricalAccessor(column=self._column, parent=self) + return CategoricalAccessor(parent=self) @copy_docstring(StringMethods.__init__) # type: ignore @property def str(self): - return StringMethods(column=self._column, parent=self) + return StringMethods(parent=self) @copy_docstring(ListMethods.__init__) # type: ignore @property def list(self): - return ListMethods(column=self._column, parent=self) + return ListMethods(parent=self) @copy_docstring(StructMethods.__init__) # type: ignore @property def struct(self): - return StructMethods(column=self._column, parent=self) + return StructMethods(parent=self) @property def dtype(self): diff --git 
a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 74f7d16e4ff..180ab9ad6b8 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -6,20 +6,20 @@ import pandas as pd import cudf +from cudf import _lib as libcudf +from cudf._lib import strings as libstrings from cudf.core.column import as_column from cudf.utils.dtypes import ( can_convert_to_column, - is_numerical_dtype, - is_datetime_dtype, - is_timedelta_dtype, is_categorical_dtype, - is_string_dtype, + is_datetime_dtype, is_list_dtype, + is_numerical_dtype, + is_string_dtype, is_struct_dtype, + is_timedelta_dtype, ) -import cudf._lib as libcudf - def to_numeric(arg, errors="raise", downcast=None): """ @@ -195,13 +195,13 @@ def _convert_str_col(col, errors, _downcast=None): if not is_string_dtype(col): raise TypeError("col must be string dtype.") - is_integer = col.str().isinteger() + is_integer = libstrings.isinteger(col) if is_integer.all(): return col.as_numerical_column(dtype=np.dtype("i8")) col = _proc_inf_empty_strings(col) - is_float = col.str().isfloat() + is_float = libstrings.isfloat(col) if is_float.all(): if _downcast in {"unsigned", "signed", "integer"}: warnings.warn( @@ -226,7 +226,7 @@ def _convert_str_col(col, errors, _downcast=None): def _proc_inf_empty_strings(col): """Handles empty and infinity strings """ - col = col.str().lower() + col = libstrings.lower(col) col = _proc_empty_strings(col) col = _proc_inf_strings(col) return col @@ -244,7 +244,7 @@ def _proc_inf_strings(col): """Convert "inf/infinity" strings into "Inf", the native string representing infinity in libcudf """ - col = col.str().replace( - ["+", "inf", "inity"], ["", "Inf", ""], regex=False, + col = libstrings.replace.replace_muli( + col, as_column(["+", "inf", "inity"]), as_column(["", "Inf", ""]), ) return col diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index e2c7ca7dca1..582c5324b8f 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -177,7 +177,7 @@ def to_csv( df = df.copy(deep=False) for col_name, col in df._data.items(): if isinstance(col, cudf.core.column.CategoricalColumn): - df._data[col_name] = col.astype(col.cat().categories.dtype) + df._data[col_name] = col.astype(col.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): df.index = df.index.astype(df.index.categories.dtype) From 7d9fcc5ec5214e8ac3973f825a633b30c05c8d46 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 20 May 2021 15:29:43 -0400 Subject: [PATCH 05/14] More refactoring --- python/cudf/cudf/_lib/transpose.pyx | 7 +++---- python/cudf/cudf/core/column/__init__.py | 5 +++-- python/cudf/cudf/core/column/categorical.py | 6 ++---- python/cudf/cudf/core/column/column.py | 4 ++-- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/frame.py | 6 +++--- python/cudf/cudf/core/index.py | 12 +++++------- python/cudf/cudf/core/series.py | 4 ++-- python/cudf/cudf/core/tools/numeric.py | 8 ++++---- python/cudf/cudf/testing/testing.py | 4 ++-- python/cudf/cudf/tests/test_dataframe.py | 4 ++-- 11 files changed, 29 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index d2b053789cd..708f5013cd3 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -36,11 +36,10 @@ def transpose(Table source): if is_categorical_dtype(dtype): if any(not is_categorical_dtype(c.dtype) for c in source._columns): raise ValueError('Columns must all 
have the same dtype') - cats = list(c.cat().categories for c in source._columns) - cats = cudf.Series(cudf.concat(cats)).drop_duplicates()._column + cats = list(c.categories for c in source._columns) + cats = cudf.core.column.concat_columns(cats).unique() source = Table(index=source._index, data=[ - (name, col.cat()._set_categories( - col.cat().categories, cats, is_unique=True).codes) + (name, col._set_categories(cats, is_unique=True).codes) for name, col in source._data.items() ]) elif dtype.kind in 'OU': diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 32cb557548f..841829302a1 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -10,15 +10,16 @@ column_empty, column_empty_like, column_empty_like_same_mask, + concat_columns, deserialize_columns, full, serialize_columns, ) from cudf.core.column.datetime import DatetimeColumn # noqa: F401 +from cudf.core.column.decimal import DecimalColumn # noqa: F401 +from cudf.core.column.interval import IntervalColumn # noqa: F401 from cudf.core.column.lists import ListColumn # noqa: F401 from cudf.core.column.numerical import NumericalColumn # noqa: F401 from cudf.core.column.string import StringColumn # noqa: F401 from cudf.core.column.struct import StructColumn # noqa: F401 from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 -from cudf.core.column.interval import IntervalColumn # noqa: F401 -from cudf.core.column.decimal import DecimalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index aed4425093f..9b01e80a805 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1363,7 +1363,7 @@ def view(self, dtype: Dtype) -> ColumnBase: @staticmethod def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: # TODO: This function currently assumes it is being called from - # column._concat_columns, at least to the extent that all the + # column.concat_columns, at least to the extent that all the # preprocessing in that function has already been done. That should be # improved as the concatenation API is solidified. 
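This patch also promotes the private _concat_columns helper to a public concat_columns, re-exported from cudf.core.column, which is what the updated TODO comment above now points at. The sketch below is illustrative rather than part of the patch; it assumes the refactored build and uses made-up values.

    # Sketch of the renamed public helper.
    from cudf.core.column import as_column, concat_columns

    left = as_column(["a", "b"])
    right = as_column(["c"])
    merged = concat_columns([left, right])   # one column containing a, b, c
    assert len(merged) == 3
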
@@ -1371,9 +1371,7 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: head = next((obj for obj in objs if obj.valid_count), objs[0]) # Combine and de-dupe the categories - cats = ( - cudf.concat([o.categories for o in objs]).drop_duplicates()._column - ) + cats = column.concat_columns([o.categories for o in objs]).unique() objs = [o._set_categories(cats, is_unique=True) for o in objs] codes = [o.codes for o in objs] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 20f302f7e59..815a895fba2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -688,7 +688,7 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: return indices[-1] def append(self, other: ColumnBase) -> ColumnBase: - return _concat_columns([self, as_column(other)]) + return concat_columns([self, as_column(other)]) def quantile( self, @@ -2230,7 +2230,7 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) -def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: +def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: dtype = pd.api.types.pandas_dtype(None) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 36a7e159dc3..c72268d7491 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5264,7 +5264,7 @@ def fillna( return super().fillna(method=method) def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: - found_indices = libstrings.contains(self, f"^{value}$") + found_indices = libstrings.contains_re(self, f"^{value}$") found_indices = libcudf.unary.cast(found_indices, dtype=np.int32) first = column.as_column(found_indices).find_first_value(1) last = column.as_column(found_indices).find_last_value(1) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 3d44d137754..53cbc2d80fb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -23,6 +23,7 @@ as_column, build_categorical_column, column_empty, + concat_columns, ) from cudf.core.join import merge from cudf.utils.dtypes import ( @@ -37,7 +38,7 @@ T = TypeVar("T", bound="Frame") if TYPE_CHECKING: - from cudf.core.column_accessor import ColumnAccessor + from cudf.core.columnn_accessor import ColumnAccessor class Frame(libcudf.table.Table): @@ -4037,8 +4038,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): ): # Combine and de-dupe the categories categories[idx] = ( - cudf.concat([col.categories for col in cols]) - .to_series() + cudf.Series(concat_columns([col.categories for col in cols])) .drop_duplicates(ignore_index=True) ._column ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index bcc77f17b5c..735fa305b53 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -27,7 +27,7 @@ arange, column, ) -from cudf.core.column.column import _concat_columns +from cudf.core.column.column import concat_columns from cudf.core.column.string import StringMethods as StringMethods from cudf.core.dtypes import IntervalDtype from cudf.core.frame import SingleColumnFrame @@ -639,7 +639,7 @@ def sum(self): @classmethod def _concat(cls, objs): - data = _concat_columns([o._values for o in objs]) + data = 
concat_columns([o._values for o in objs]) names = {obj.name for obj in objs} if len(names) == 1: [name] = names @@ -2474,11 +2474,9 @@ def __new__( dtype = None if categories is not None: - data.set_categories(categories, ordered=ordered, inplace=True) + data = data.set_categories(categories, ordered=ordered) elif isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): - data.set_categories( - dtype.categories, ordered=ordered, inplace=True - ) + data = data.set_categories(dtype.categories, ordered=ordered) elif ordered is True and data.ordered is False: data = data.as_ordered() elif ordered is False and data.ordered is True: @@ -2500,7 +2498,7 @@ def categories(self): """ The categories of this categorical. """ - return cudf.Series(self._values.categories) + return as_index(self._values.categories) def interval_range( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f57c3bc931b..205a5370ee3 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -32,7 +32,7 @@ from cudf.core.column.categorical import ( CategoricalAccessor as CategoricalAccessor, ) -from cudf.core.column.column import _concat_columns +from cudf.core.column.column import concat_columns from cudf.core.column.lists import ListMethods from cudf.core.column.string import StringMethods from cudf.core.column.struct import StructMethods @@ -2407,7 +2407,7 @@ def _concat(cls, objs, axis=0, index=True): else: objs = numeric_normalize_types(*objs) - col = _concat_columns([o._column for o in objs]) + col = concat_columns([o._column for o in objs]) if isinstance(col, cudf.core.column.DecimalColumn): col = objs[0]._column._copy_type_metadata(col) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 180ab9ad6b8..068ae46b69c 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -195,13 +195,13 @@ def _convert_str_col(col, errors, _downcast=None): if not is_string_dtype(col): raise TypeError("col must be string dtype.") - is_integer = libstrings.isinteger(col) + is_integer = libstrings.is_integer(col) if is_integer.all(): return col.as_numerical_column(dtype=np.dtype("i8")) col = _proc_inf_empty_strings(col) - is_float = libstrings.isfloat(col) + is_float = libstrings.is_float(col) if is_float.all(): if _downcast in {"unsigned", "signed", "integer"}: warnings.warn( @@ -226,7 +226,7 @@ def _convert_str_col(col, errors, _downcast=None): def _proc_inf_empty_strings(col): """Handles empty and infinity strings """ - col = libstrings.lower(col) + col = libstrings.to_lower(col) col = _proc_empty_strings(col) col = _proc_inf_strings(col) return col @@ -244,7 +244,7 @@ def _proc_inf_strings(col): """Convert "inf/infinity" strings into "Inf", the native string representing infinity in libcudf """ - col = libstrings.replace.replace_muli( + col = libstrings.replace_multi( col, as_column(["+", "inf", "inity"]), as_column(["", "Inf", ""]), ) return col diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index bacab24a6f3..96a4ea31986 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -162,8 +162,8 @@ def assert_column_equal( if check_exact and check_categorical: if is_categorical_dtype(left) and is_categorical_dtype(right): - left_cat = left.cat().categories - right_cat = right.cat().categories + left_cat = left.categories + right_cat = right.categories if check_category_order: assert_index_equal( diff --git 
a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e5e36ba7e21..a5ae5154ec7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5206,8 +5206,8 @@ def test_memory_usage_cat(): gdf = cudf.from_pandas(df) expected = ( - gdf.B._column.cat().categories.__sizeof__() - + gdf.B._column.cat().codes.__sizeof__() + gdf.B._column.categories.__sizeof__() + + gdf.B._column.codes.__sizeof__() ) # Check cat column From 8911feaa17f676219ac45d800a5e12d40d361d1f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 20 May 2021 17:49:19 -0400 Subject: [PATCH 06/14] More refactoring --- python/cudf/cudf/_typing.py | 1 + python/cudf/cudf/core/column/__init__.py | 8 +- python/cudf/cudf/core/column/categorical.py | 60 ++- python/cudf/cudf/core/column/lists.py | 24 +- python/cudf/cudf/core/column/methods.py | 3 +- python/cudf/cudf/core/column/string.py | 470 +++++++++----------- python/cudf/cudf/core/column/struct.py | 6 +- 7 files changed, 261 insertions(+), 311 deletions(-) diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 034b18ec9e0..2025fd1ecf8 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -28,3 +28,4 @@ BinaryOperand = Union["cudf.Scalar", "cudf.core.column.ColumnBase"] DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"] +SeriesOrIndex = Union["cudf.Series", "cudf.Index"] diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 841829302a1..1f35da4c134 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -1,3 +1,7 @@ +""" +isort: skip_file +""" + # Copyright (c) 2020-2021, NVIDIA CORPORATION. from cudf.core.column.categorical import CategoricalColumn @@ -16,10 +20,10 @@ serialize_columns, ) from cudf.core.column.datetime import DatetimeColumn # noqa: F401 -from cudf.core.column.decimal import DecimalColumn # noqa: F401 -from cudf.core.column.interval import IntervalColumn # noqa: F401 from cudf.core.column.lists import ListColumn # noqa: F401 from cudf.core.column.numerical import NumericalColumn # noqa: F401 from cudf.core.column.string import StringColumn # noqa: F401 from cudf.core.column.struct import StructColumn # noqa: F401 from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 +from cudf.core.column.decimal import DecimalColumn # noqa: F401 +from cudf.core.column.interval import IntervalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 9b01e80a805..6b32bf908ef 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -12,7 +12,6 @@ Optional, Sequence, Tuple, - Union, cast, ) @@ -38,6 +37,7 @@ ) if TYPE_CHECKING: + from cudf._typing import SeriesOrIndex from cudf.core.column import ( ColumnBase, DatetimeColumn, @@ -47,13 +47,10 @@ ) -ParentType = Union["cudf.Series", "cudf.Index"] - - class CategoricalAccessor(ColumnMethodsMixin): _column: CategoricalColumn - def __init__(self, parent: ParentType): + def __init__(self, parent: SeriesOrIndex): """ Accessor object for categorical properties of the Series values. Be aware that assigning to `categories` is a inplace operation, @@ -118,7 +115,7 @@ def categories(self) -> "cudf.Index": """ The categories of this categorical. 
""" - return cudf.core.index.as_index(self._parent._column.categories) + return cudf.core.index.as_index(self._column.categories) @property def codes(self) -> "cudf.Series": @@ -130,16 +127,16 @@ def codes(self) -> "cudf.Series": if isinstance(self._parent, cudf.Series) else None ) - return cudf.Series(self._parent._column.codes, index=index) + return cudf.Series(self._column.codes, index=index) @property def ordered(self) -> Optional[bool]: """ Whether the categories have an ordered relationship. """ - return self._parent._column.ordered + return self._column.ordered - def as_ordered(self, inplace: bool = False) -> Optional[ParentType]: + def as_ordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: """ Set the Categorical to be ordered. @@ -193,10 +190,10 @@ def as_ordered(self, inplace: bool = False) -> Optional[ParentType]: Categories (3, int64): [1 < 2 < 10] """ return self._return_or_inplace( - self._parent._column.as_ordered(), inplace=inplace + self._column.as_ordered(), inplace=inplace ) - def as_unordered(self, inplace: bool = False) -> Optional[ParentType]: + def as_unordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: """ Set the Categorical to be unordered. @@ -261,12 +258,12 @@ def as_unordered(self, inplace: bool = False) -> Optional[ParentType]: Categories (3, int64): [1, 2, 10] """ return self._return_or_inplace( - self._parent._column.as_unordered(), inplace=inplace + self._column.as_unordered(), inplace=inplace ) def add_categories( self, new_categories: Any, inplace: bool = False - ) -> Optional[ParentType]: + ) -> Optional[SeriesOrIndex]: """ Add new categories. @@ -318,7 +315,7 @@ def add_categories( Categories (5, int64): [1, 2, 0, 3, 4] """ - old_categories = self._parent._column.categories + old_categories = self._column.categories new_categories = column.as_column( new_categories, dtype=old_categories.dtype if len(new_categories) == 0 else None, @@ -343,7 +340,7 @@ def add_categories( raise ValueError("new categories must not include old categories") new_categories = old_categories.append(new_categories) - out_col = self._parent._column + out_col = self._column if not out_col._categories_equal(new_categories): out_col = out_col._set_categories(new_categories) @@ -351,7 +348,7 @@ def add_categories( def remove_categories( self, removals: Any, inplace: bool = False, - ) -> Optional[ParentType]: + ) -> Optional[SeriesOrIndex]: """ Remove the specified categories. @@ -433,7 +430,7 @@ def remove_categories( raise ValueError(f"removals must all be in old categories: {vals}") new_categories = cats[~cats.isin(removals)]._column - out_col = self._parent._column + out_col = self._column if not out_col._categories_equal(new_categories): out_col = out_col._set_categories(new_categories) @@ -445,7 +442,7 @@ def set_categories( ordered: bool = False, rename: bool = False, inplace: bool = False, - ) -> Optional[ParentType]: + ) -> Optional[SeriesOrIndex]: """ Set the categories to the specified new_categories. @@ -538,7 +535,7 @@ def set_categories( # categories. 
if rename: # enforce same length - if len(new_categories) != len(self._parent._column.categories): + if len(new_categories) != len(self._column.categories): raise ValueError( "new_categories must have the same " "number of items as old categories" @@ -546,19 +543,19 @@ def set_categories( out_col = column.build_categorical_column( categories=new_categories, - codes=self._parent._column.base_children[0], - mask=self._parent._column.base_mask, - size=self._parent._column.size, - offset=self._parent._column.offset, + codes=self._column.base_children[0], + mask=self._column.base_mask, + size=self._column.size, + offset=self._column.offset, ordered=ordered, ) else: - out_col = self._parent._column + out_col = self._column if not (type(out_col.categories) is type(new_categories)): # If both categories are of different Column types, # return a column full of Nulls. out_col = _create_empty_categorical_column( - self._parent._column, + self._column, CategoricalDtype( categories=new_categories, ordered=ordered ), @@ -577,7 +574,7 @@ def reorder_categories( new_categories: Any, ordered: bool = False, inplace: bool = False, - ) -> Optional[ParentType]: + ) -> Optional[SeriesOrIndex]: """ Reorder categories as specified in new_categories. @@ -648,22 +645,15 @@ def reorder_categories( # Ignore order for comparison because we're only interested # in whether new_categories has all the same values as the # current set of categories. - if not self._parent._column._categories_equal( - new_categories, ordered=False - ): + if not self._column._categories_equal(new_categories, ordered=False): raise ValueError( "items in new_categories are not the same as in " "old categories" ) - out_col = self._parent._column._set_categories( - new_categories, ordered=ordered - ) + out_col = self._column._set_categories(new_categories, ordered=ordered) return self._return_or_inplace(out_col, inplace=inplace) - def _decategorize(self) -> ColumnBase: - return self._parent._column._get_decategorized_column() - class CategoricalColumn(column.ColumnBase): """Implements operations for Columns of Categorical type diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 3d09bafd9ad..db180f6f9e1 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -270,7 +270,7 @@ def get(self, index): min_col_list_len = self.len().min() if -min_col_list_len <= index < min_col_list_len: return self._return_or_inplace( - extract_element(self._parent._column, index) + extract_element(self._column, index) ) else: raise IndexError("list index out of range") @@ -299,7 +299,7 @@ def contains(self, search_key): search_key = cudf.Scalar(search_key) try: res = self._return_or_inplace( - contains_scalar(self._parent._column, search_key) + contains_scalar(self._column, search_key) ) except RuntimeError as e: if ( @@ -336,11 +336,11 @@ def leaves(self): 5 6 dtype: int64 """ - if type(self._parent._column.elements) is ListColumn: - return self._parent._column.elements.elements + if type(self._column.elements) is ListColumn: + return self._column.elements.elements else: return self._return_or_inplace( - self._parent._column.elements, retain_index=False + self._column.elements, retain_index=False ) def len(self): @@ -365,7 +365,7 @@ def len(self): 2 2 dtype: int32 """ - return self._return_or_inplace(count_elements(self._parent._column)) + return self._return_or_inplace(count_elements(self._column)) def take(self, lists_indices): """ @@ -398,7 +398,7 @@ def take(self, lists_indices): 
lists_indices_col = as_column(lists_indices) if not isinstance(lists_indices_col, ListColumn): raise ValueError("lists_indices should be list type array.") - if not lists_indices_col.size == self._parent._column.size: + if not lists_indices_col.size == self._column.size: raise ValueError( "lists_indices and list column is of different " "size." ) @@ -413,7 +413,7 @@ def take(self, lists_indices): try: res = self._return_or_inplace( - segmented_gather(self._parent._column, lists_indices_col) + segmented_gather(self._column, lists_indices_col) ) except RuntimeError as e: if "contains nulls" in str(e): @@ -448,12 +448,12 @@ def unique(self): dtype: list """ - if is_list_dtype(self._parent._column.children[1].dtype): + if is_list_dtype(self._column.children[1].dtype): raise NotImplementedError("Nested lists unique is not supported.") return self._return_or_inplace( drop_list_duplicates( - self._parent._column, nulls_equal=True, nans_all_equal=True + self._column, nulls_equal=True, nans_all_equal=True ) ) @@ -503,10 +503,10 @@ def sort_values( raise NotImplementedError("`kind` not currently implemented.") if na_position not in {"first", "last"}: raise ValueError(f"Unknown `na_position` value {na_position}") - if is_list_dtype(self._parent._column.children[1].dtype): + if is_list_dtype(self._column.children[1].dtype): raise NotImplementedError("Nested lists sort is not supported.") return self._return_or_inplace( - sort_lists(self._parent._column, ascending, na_position), + sort_lists(self._column, ascending, na_position), retain_index=not ignore_index, ) diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index e2f4acde8cd..4b8ab7758fa 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -14,6 +14,7 @@ class ColumnMethodsMixin: def __init__(self, parent: Union["cudf.Series", "cudf.Index"]): self._parent = parent + self._column = self._parent._column @overload def _return_or_inplace( @@ -60,7 +61,7 @@ def _return_or_inplace( ) return None else: - self._parent._column._mimic_inplace(new_col, inplace=True) + self._column._mimic_inplace(new_col, inplace=True) return None else: if self._parent is None: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c72268d7491..9c5ae0a31ce 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,7 +5,17 @@ import builtins import pickle import warnings -from typing import Any, Dict, Optional, Sequence, Tuple, Union, cast, overload +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Optional, + Sequence, + Tuple, + Union, + cast, + overload, +) import cupy import numpy as np @@ -17,7 +27,6 @@ from cudf import _lib as libcudf from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column -from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column, datetime from cudf.core.column.methods import ColumnMethodsMixin @@ -30,6 +39,10 @@ is_string_dtype, ) +if TYPE_CHECKING: + from cudf._typing import ColumnLike, Dtype, ScalarLike, SeriesOrIndex + + _str_to_numeric_typecast_functions = { np.dtype("int8"): str_cast.stoi8, np.dtype("int16"): str_cast.stoi16, @@ -75,10 +88,9 @@ } -ParentType = Union["cudf.Series", "cudf.Index"] - - class StringMethods(ColumnMethodsMixin): + _column: StringColumn + def __init__(self, parent=None): """ Vectorized string functions for Series and 
Index. @@ -99,7 +111,7 @@ def __init__(self, parent=None): ) super().__init__(parent=parent) - def htoi(self) -> ParentType: + def htoi(self) -> SeriesOrIndex: """ Returns integer value represented by each hex string. String is interpretted to have hex (base-16) characters. @@ -120,11 +132,11 @@ def htoi(self) -> ParentType: dtype: int64 """ - out = str_cast.htoi(self._parent._column) + out = str_cast.htoi(self._column) return self._return_or_inplace(out, inplace=False) - def ip2int(self) -> ParentType: + def ip2int(self) -> SeriesOrIndex: """ This converts ip strings to integers @@ -151,7 +163,7 @@ def ip2int(self) -> ParentType: dtype: int64 """ - out = str_cast.ip2int(self._parent._column) + out = str_cast.ip2int(self._column) return self._return_or_inplace(out, inplace=False) @@ -161,7 +173,7 @@ def __getitem__(self, key): else: return self.get(key) - def len(self) -> ParentType: + def len(self) -> SeriesOrIndex: """ Computes the length of each element in the Series/Index. @@ -182,10 +194,10 @@ def len(self) -> ParentType: """ return self._return_or_inplace( - libstrings.count_characters(self._parent._column) + libstrings.count_characters(self._column) ) - def byte_count(self) -> ParentType: + def byte_count(self) -> SeriesOrIndex: """ Computes the number of bytes of each string in the Series/Index. @@ -210,9 +222,7 @@ def byte_count(self) -> ParentType: 2 11 dtype: int32 """ - return self._return_or_inplace( - libstrings.count_bytes(self._parent._column), - ) + return self._return_or_inplace(libstrings.count_bytes(self._column),) @overload def cat(self, sep: str = None, na_rep: str = None) -> str: @@ -221,7 +231,7 @@ def cat(self, sep: str = None, na_rep: str = None) -> str: @overload def cat( self, others, sep: str = None, na_rep: str = None - ) -> Union[ParentType, "cudf.core.column.string.StringColumn"]: + ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: ... def cat(self, others=None, sep=None, na_rep=None): @@ -313,13 +323,11 @@ def cat(self, others=None, sep=None, na_rep=None): if others is None: data = libstrings.join( - self._parent._column, - cudf.Scalar(sep), - cudf.Scalar(na_rep, "str"), + self._column, cudf.Scalar(sep), cudf.Scalar(na_rep, "str"), ) else: other_cols = _get_cols_list(self._parent, others) - all_cols = [self._parent._column] + other_cols + all_cols = [self._column] + other_cols data = libstrings.concatenate( cudf.DataFrame( {index: value for index, value in enumerate(all_cols)} @@ -340,7 +348,7 @@ def cat(self, others=None, sep=None, na_rep=None): def join( self, sep=None, string_na_rep=None, sep_na_rep=None - ) -> ParentType: + ) -> SeriesOrIndex: """ Join lists contained as elements in the Series/Index with passed delimiter. @@ -466,10 +474,10 @@ def join( f" of type : {type(string_na_rep)}" ) - if isinstance(self._parent._column, cudf.core.column.ListColumn): - strings_column = self._parent._column + if isinstance(self._column, cudf.core.column.ListColumn): + strings_column = self._column else: - # If self._parent._column is not a ListColumn, we will have to + # If self._column is not a ListColumn, we will have to # split each row by character and create a ListColumn out of it. 
strings_column = self._split_by_character() @@ -505,23 +513,23 @@ def join( return self._return_or_inplace(data) def _split_by_character(self): - result_col = libstrings.character_tokenize(self._parent._column) + result_col = libstrings.character_tokenize(self._column) - offset_col = self._parent._column.children[0] + offset_col = self._column.children[0] res = cudf.core.column.ListColumn( - size=len(self._parent._column), - dtype=cudf.ListDtype(self._parent._column.dtype), - mask=self._parent._column.mask, + size=len(self._column), + dtype=cudf.ListDtype(self._column.dtype), + mask=self._column.mask, offset=0, - null_count=self._parent._column.null_count, + null_count=self._column.null_count, children=(offset_col, result_col), ) return res def extract( self, pat: str, flags: int = 0, expand: bool = True - ) -> ParentType: + ) -> SeriesOrIndex: """ Extract capture groups in the regex `pat` as columns in a DataFrame. @@ -579,7 +587,7 @@ def extract( if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") - out = libstrings.extract(self._parent._column, pat) + out = libstrings.extract(self._column, pat) if out._num_columns == 1 and expand is False: return self._return_or_inplace(out._columns[0], expand=expand) else: @@ -592,7 +600,7 @@ def contains( flags: int = 0, na=np.nan, regex: bool = True, - ) -> ParentType: + ) -> SeriesOrIndex: """ Test if pattern or regex is contained within a string of a Series or Index. @@ -706,18 +714,18 @@ def contains( if pat is None: result_col = column.column_empty( - len(self._parent._column), dtype="bool", masked=True + len(self._column), dtype="bool", masked=True ) elif is_scalar(pat): if regex is True: - result_col = libstrings.contains_re(self._parent._column, pat) + result_col = libstrings.contains_re(self._column, pat) else: result_col = libstrings.contains( - self._parent._column, cudf.Scalar(pat, "str") + self._column, cudf.Scalar(pat, "str") ) else: result_col = libstrings.contains_multiple( - self._parent._column, column.as_column(pat, dtype="str") + self._column, column.as_column(pat, dtype="str") ) return self._return_or_inplace(result_col) @@ -729,7 +737,7 @@ def replace( case=None, flags: int = 0, regex: bool = True, - ) -> ParentType: + ) -> SeriesOrIndex: """ Replace occurrences of pattern/regex in the Series/Index with some other string. Equivalent to `str.replace() @@ -806,13 +814,11 @@ def replace( return self._return_or_inplace( libstrings.replace_multi_re( - self._parent._column, - pat, - column.as_column(repl, dtype="str"), + self._column, pat, column.as_column(repl, dtype="str"), ) if regex else libstrings.replace_multi( - self._parent._column, + self._column, column.as_column(pat, dtype="str"), column.as_column(repl, dtype="str"), ), @@ -824,18 +830,18 @@ def replace( # Pandas forces non-regex replace when pat is a single-character return self._return_or_inplace( libstrings.replace_re( - self._parent._column, pat, cudf.Scalar(repl, "str"), n + self._column, pat, cudf.Scalar(repl, "str"), n ) if regex is True and len(pat) > 1 else libstrings.replace( - self._parent._column, + self._column, cudf.Scalar(pat, "str"), cudf.Scalar(repl, "str"), n, ), ) - def replace_with_backrefs(self, pat: str, repl: str) -> ParentType: + def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: """ Use the ``repl`` back-ref template to create a new string with the extracted elements found using the ``pat`` expression. 
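The StringMethods changes in this patch are mechanical: the accessor is now built from the parent object alone and reads the self._column attribute set up by ColumnMethodsMixin, so the public .str API behaves as before. A hypothetical usage sketch, not part of the patch, with made-up data:

    # Sketch: user-facing accessor calls are unchanged by the refactor.
    import cudf

    s = cudf.Series(["a_b", None, "c_d"])
    s.str.replace("_", "-")    # Series: ["a-b", <NA>, "c-d"]
    s.str.contains("_")        # Series: [True, <NA>, True]
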
@@ -862,12 +868,12 @@ def replace_with_backrefs(self, pat: str, repl: str) -> ParentType: dtype: object """ return self._return_or_inplace( - libstrings.replace_with_backrefs(self._parent._column, pat, repl) + libstrings.replace_with_backrefs(self._column, pat, repl) ) def slice( self, start: int = None, stop: int = None, step: int = None - ) -> ParentType: + ) -> SeriesOrIndex: """ Slice substrings from each element in the Series or Index. @@ -933,10 +939,10 @@ def slice( """ return self._return_or_inplace( - libstrings.slice_strings(self._parent._column, start, stop, step), + libstrings.slice_strings(self._column, start, stop, step), ) - def isinteger(self) -> ParentType: + def isinteger(self) -> SeriesOrIndex: """ Check whether all characters in each string form integer. @@ -994,11 +1000,9 @@ def isinteger(self) -> ParentType: 2 False dtype: bool """ - return self._return_or_inplace( - libstrings.is_integer(self._parent._column) - ) + return self._return_or_inplace(libstrings.is_integer(self._column)) - def ishex(self) -> ParentType: + def ishex(self) -> SeriesOrIndex: """ Check whether all characters in each string form a hex integer. @@ -1035,9 +1039,9 @@ def ishex(self) -> ParentType: 4 True dtype: bool """ - return self._return_or_inplace(str_cast.is_hex(self._parent._column)) + return self._return_or_inplace(str_cast.is_hex(self._column)) - def istimestamp(self, format: str) -> ParentType: + def istimestamp(self, format: str) -> SeriesOrIndex: """ Check whether all characters in each string can be converted to a timestamp using the given format. @@ -1058,10 +1062,10 @@ def istimestamp(self, format: str) -> ParentType: dtype: bool """ return self._return_or_inplace( - str_cast.istimestamp(self._parent._column, format) + str_cast.istimestamp(self._column, format) ) - def isfloat(self) -> ParentType: + def isfloat(self) -> SeriesOrIndex: """ Check whether all characters in each string form floating value. @@ -1122,11 +1126,9 @@ def isfloat(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace( - libstrings.is_float(self._parent._column) - ) + return self._return_or_inplace(libstrings.is_float(self._column)) - def isdecimal(self) -> ParentType: + def isdecimal(self) -> SeriesOrIndex: """ Check whether all characters in each string are decimal. @@ -1185,11 +1187,9 @@ def isdecimal(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace( - libstrings.is_decimal(self._parent._column) - ) + return self._return_or_inplace(libstrings.is_decimal(self._column)) - def isalnum(self) -> ParentType: + def isalnum(self) -> SeriesOrIndex: """ Check whether all characters in each string are alphanumeric. @@ -1256,11 +1256,9 @@ def isalnum(self) -> ParentType: 2 False dtype: bool """ - return self._return_or_inplace( - libstrings.is_alnum(self._parent._column) - ) + return self._return_or_inplace(libstrings.is_alnum(self._column)) - def isalpha(self) -> ParentType: + def isalpha(self) -> SeriesOrIndex: """ Check whether all characters in each string are alphabetic. @@ -1314,11 +1312,9 @@ def isalpha(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace( - libstrings.is_alpha(self._parent._column) - ) + return self._return_or_inplace(libstrings.is_alpha(self._column)) - def isdigit(self) -> ParentType: + def isdigit(self) -> SeriesOrIndex: """ Check whether all characters in each string are digits. 
@@ -1378,11 +1374,9 @@ def isdigit(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace( - libstrings.is_digit(self._parent._column) - ) + return self._return_or_inplace(libstrings.is_digit(self._column)) - def isnumeric(self) -> ParentType: + def isnumeric(self) -> SeriesOrIndex: """ Check whether all characters in each string are numeric. @@ -1448,11 +1442,9 @@ def isnumeric(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace( - libstrings.is_numeric(self._parent._column) - ) + return self._return_or_inplace(libstrings.is_numeric(self._column)) - def isupper(self) -> ParentType: + def isupper(self) -> SeriesOrIndex: """ Check whether all characters in each string are uppercase. @@ -1507,11 +1499,9 @@ def isupper(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace( - libstrings.is_upper(self._parent._column) - ) + return self._return_or_inplace(libstrings.is_upper(self._column)) - def islower(self) -> ParentType: + def islower(self) -> SeriesOrIndex: """ Check whether all characters in each string are lowercase. @@ -1566,11 +1556,9 @@ def islower(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace( - libstrings.is_lower(self._parent._column) - ) + return self._return_or_inplace(libstrings.is_lower(self._column)) - def isipv4(self) -> ParentType: + def isipv4(self) -> SeriesOrIndex: """ Check whether all characters in each string form an IPv4 address. @@ -1592,9 +1580,9 @@ def isipv4(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(str_cast.is_ipv4(self._parent._column)) + return self._return_or_inplace(str_cast.is_ipv4(self._column)) - def lower(self) -> ParentType: + def lower(self) -> SeriesOrIndex: """ Converts all characters to lowercase. @@ -1631,11 +1619,9 @@ def lower(self) -> ParentType: 3 swapcase dtype: object """ - return self._return_or_inplace( - libstrings.to_lower(self._parent._column) - ) + return self._return_or_inplace(libstrings.to_lower(self._column)) - def upper(self) -> ParentType: + def upper(self) -> SeriesOrIndex: """ Convert each string to uppercase. This only applies to ASCII characters at this time. @@ -1682,11 +1668,9 @@ def upper(self) -> ParentType: 3 SWAPCASE dtype: object """ - return self._return_or_inplace( - libstrings.to_upper(self._parent._column) - ) + return self._return_or_inplace(libstrings.to_upper(self._column)) - def capitalize(self) -> ParentType: + def capitalize(self) -> SeriesOrIndex: """ Convert strings in the Series/Index to be capitalized. This only applies to ASCII characters at this time. @@ -1712,11 +1696,9 @@ def capitalize(self) -> ParentType: 1 Goodbye, friend dtype: object """ - return self._return_or_inplace( - libstrings.capitalize(self._parent._column) - ) + return self._return_or_inplace(libstrings.capitalize(self._column)) - def swapcase(self) -> ParentType: + def swapcase(self) -> SeriesOrIndex: """ Change each lowercase character to uppercase and vice versa. This only applies to ASCII characters at this time. @@ -1759,11 +1741,9 @@ def swapcase(self) -> ParentType: 3 sWaPcAsE dtype: object """ - return self._return_or_inplace( - libstrings.swapcase(self._parent._column) - ) + return self._return_or_inplace(libstrings.swapcase(self._column)) - def title(self) -> ParentType: + def title(self) -> SeriesOrIndex: """ Uppercase the first letter of each letter after a space and lowercase the rest. 
@@ -1806,11 +1786,11 @@ def title(self) -> ParentType: 3 Swapcase dtype: object """ - return self._return_or_inplace(libstrings.title(self._parent._column)) + return self._return_or_inplace(libstrings.title(self._column)) def filter_alphanum( self, repl: str = None, keep: bool = True - ) -> ParentType: + ) -> SeriesOrIndex: """ Remove non-alphanumeric characters from strings in this column. @@ -1842,14 +1822,12 @@ def filter_alphanum( repl = "" return self._return_or_inplace( - libstrings.filter_alphanum( - self._parent._column, cudf.Scalar(repl), keep - ), + libstrings.filter_alphanum(self._column, cudf.Scalar(repl), keep), ) def slice_from( self, starts: "cudf.Series", stops: "cudf.Series" - ) -> ParentType: + ) -> SeriesOrIndex: """ Return substring of each string using positions for each string. @@ -1888,7 +1866,7 @@ def slice_from( return self._return_or_inplace( libstrings.slice_from( - self._parent._column, + self._column, column.as_column(starts), column.as_column(stops), ), @@ -1896,7 +1874,7 @@ def slice_from( def slice_replace( self, start: int = None, stop: int = None, repl: str = None - ) -> ParentType: + ) -> SeriesOrIndex: """ Replace the specified section of each string with a new string. @@ -1979,11 +1957,11 @@ def slice_replace( return self._return_or_inplace( libstrings.slice_replace( - self._parent._column, start, stop, cudf.Scalar(repl) + self._column, start, stop, cudf.Scalar(repl) ), ) - def insert(self, start: int = 0, repl: str = None) -> ParentType: + def insert(self, start: int = 0, repl: str = None) -> SeriesOrIndex: """ Insert the specified string into each string in the specified position. @@ -2030,10 +2008,10 @@ def insert(self, start: int = 0, repl: str = None) -> ParentType: repl = "" return self._return_or_inplace( - libstrings.insert(self._parent._column, start, cudf.Scalar(repl)), + libstrings.insert(self._column, start, cudf.Scalar(repl)), ) - def get(self, i: int = 0) -> ParentType: + def get(self, i: int = 0) -> SeriesOrIndex: """ Extract element from each component at specified position. @@ -2075,7 +2053,7 @@ def get(self, i: int = 0) -> ParentType: dtype: object """ - return self._return_or_inplace(libstrings.get(self._parent._column, i)) + return self._return_or_inplace(libstrings.get(self._column, i)) def get_json_object(self, json_path): """ @@ -2129,7 +2107,7 @@ def get_json_object(self, json_path): try: res = self._return_or_inplace( libstrings.get_json_object( - self._parent._column, cudf.Scalar(json_path, "str") + self._column, cudf.Scalar(json_path, "str") ) ) except RuntimeError as e: @@ -2145,7 +2123,7 @@ def get_json_object(self, json_path): def split( self, pat: str = None, n: int = -1, expand: bool = None - ) -> ParentType: + ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. 
@@ -2273,29 +2251,25 @@ def split( pat = "" if expand: - if self._parent._column.null_count == len(self._parent._column): - result_table = cudf.core.frame.Frame( - {0: self._parent._column.copy()} - ) + if self._column.null_count == len(self._column): + result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: result_table = libstrings.split( - self._parent._column, cudf.Scalar(pat, "str"), n + self._column, cudf.Scalar(pat, "str"), n ) if len(result_table._data) == 1: - if result_table._data[0].null_count == len( - self._parent._column - ): + if result_table._data[0].null_count == len(self._column): result_table = cudf.core.frame.Frame({}) else: result_table = libstrings.split_record( - self._parent._column, cudf.Scalar(pat, "str"), n + self._column, cudf.Scalar(pat, "str"), n ) return self._return_or_inplace(result_table, expand=expand) def rsplit( self, pat: str = None, n: int = -1, expand: bool = None - ) -> ParentType: + ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. @@ -2432,27 +2406,23 @@ def rsplit( pat = "" if expand: - if self._parent._column.null_count == len(self._parent._column): - result_table = cudf.core.frame.Frame( - {0: self._parent._column.copy()} - ) + if self._column.null_count == len(self._column): + result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: result_table = libstrings.rsplit( - self._parent._column, cudf.Scalar(pat), n + self._column, cudf.Scalar(pat), n ) if len(result_table._data) == 1: - if result_table._data[0].null_count == len( - self._parent._column - ): + if result_table._data[0].null_count == len(self._column): result_table = cudf.core.frame.Frame({}) else: result_table = libstrings.rsplit_record( - self._parent._column, cudf.Scalar(pat), n + self._column, cudf.Scalar(pat), n ) return self._return_or_inplace(result_table, expand=expand) - def partition(self, sep: str = " ", expand: bool = True) -> ParentType: + def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: """ Split the string at the first occurrence of sep. @@ -2529,11 +2499,11 @@ def partition(self, sep: str = " ", expand: bool = True) -> ParentType: sep = " " return self._return_or_inplace( - libstrings.partition(self._parent._column, cudf.Scalar(sep)), + libstrings.partition(self._column, cudf.Scalar(sep)), expand=expand, ) - def rpartition(self, sep: str = " ", expand: bool = True) -> ParentType: + def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: """ Split the string at the last occurrence of sep. @@ -2594,13 +2564,13 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> ParentType: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._parent._column, cudf.Scalar(sep)), + libstrings.rpartition(self._column, cudf.Scalar(sep)), expand=expand, ) def pad( self, width: int, side: str = "left", fillchar: str = " " - ) -> ParentType: + ) -> SeriesOrIndex: """ Pad strings in the Series/Index up to width. @@ -2682,10 +2652,10 @@ def pad( ) return self._return_or_inplace( - libstrings.pad(self._parent._column, width, fillchar, side) + libstrings.pad(self._column, width, fillchar, side) ) - def zfill(self, width: int) -> ParentType: + def zfill(self, width: int) -> SeriesOrIndex: """ Pad strings in the Series/Index by prepending ‘0’ characters. 
@@ -2756,11 +2726,9 @@ def zfill(self, width: int) -> ParentType: msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - return self._return_or_inplace( - libstrings.zfill(self._parent._column, width) - ) + return self._return_or_inplace(libstrings.zfill(self._column, width)) - def center(self, width: int, fillchar: str = " ") -> ParentType: + def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ Filling left and right side of strings in the Series/Index with an additional character. @@ -2829,10 +2797,10 @@ def center(self, width: int, fillchar: str = " ") -> ParentType: raise TypeError(msg) return self._return_or_inplace( - libstrings.center(self._parent._column, width, fillchar) + libstrings.center(self._column, width, fillchar) ) - def ljust(self, width: int, fillchar: str = " ") -> ParentType: + def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ Filling right side of strings in the Series/Index with an additional character. Equivalent to `str.ljust() @@ -2883,10 +2851,10 @@ def ljust(self, width: int, fillchar: str = " ") -> ParentType: raise TypeError(msg) return self._return_or_inplace( - libstrings.ljust(self._parent._column, width, fillchar) + libstrings.ljust(self._column, width, fillchar) ) - def rjust(self, width: int, fillchar: str = " ") -> ParentType: + def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ Filling left side of strings in the Series/Index with an additional character. Equivalent to `str.rjust() @@ -2937,10 +2905,10 @@ def rjust(self, width: int, fillchar: str = " ") -> ParentType: raise TypeError(msg) return self._return_or_inplace( - libstrings.rjust(self._parent._column, width, fillchar) + libstrings.rjust(self._column, width, fillchar) ) - def strip(self, to_strip: str = None) -> ParentType: + def strip(self, to_strip: str = None) -> SeriesOrIndex: """ Remove leading and trailing characters. @@ -2996,10 +2964,10 @@ def strip(self, to_strip: str = None) -> ParentType: to_strip = "" return self._return_or_inplace( - libstrings.strip(self._parent._column, cudf.Scalar(to_strip)) + libstrings.strip(self._column, cudf.Scalar(to_strip)) ) - def lstrip(self, to_strip: str = None) -> ParentType: + def lstrip(self, to_strip: str = None) -> SeriesOrIndex: """ Remove leading and trailing characters. @@ -3043,10 +3011,10 @@ def lstrip(self, to_strip: str = None) -> ParentType: to_strip = "" return self._return_or_inplace( - libstrings.lstrip(self._parent._column, cudf.Scalar(to_strip)) + libstrings.lstrip(self._column, cudf.Scalar(to_strip)) ) - def rstrip(self, to_strip: str = None) -> ParentType: + def rstrip(self, to_strip: str = None) -> SeriesOrIndex: """ Remove leading and trailing characters. @@ -3098,10 +3066,10 @@ def rstrip(self, to_strip: str = None) -> ParentType: to_strip = "" return self._return_or_inplace( - libstrings.rstrip(self._parent._column, cudf.Scalar(to_strip)) + libstrings.rstrip(self._column, cudf.Scalar(to_strip)) ) - def wrap(self, width: int, **kwargs) -> ParentType: + def wrap(self, width: int, **kwargs) -> SeriesOrIndex: """ Wrap long strings in the Series/Index to be formatted in paragraphs with length less than a given width. 
@@ -3193,11 +3161,9 @@ def wrap(self, width: int, **kwargs) -> ParentType: "`break_on_hyphens`=False" ) - return self._return_or_inplace( - libstrings.wrap(self._parent._column, width) - ) + return self._return_or_inplace(libstrings.wrap(self._column, width)) - def count(self, pat: str, flags: int = 0) -> ParentType: + def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: """ Count occurrences of pattern in each string of the Series/Index. @@ -3255,13 +3221,11 @@ def count(self, pat: str, flags: int = 0) -> ParentType: if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") - return self._return_or_inplace( - libstrings.count_re(self._parent._column, pat) - ) + return self._return_or_inplace(libstrings.count_re(self._column, pat)) def findall( self, pat: str, flags: int = 0, expand: bool = True - ) -> ParentType: + ) -> SeriesOrIndex: """ Find all occurrences of pattern or regular expression in the Series/Index. @@ -3326,10 +3290,10 @@ def findall( raise NotImplementedError("`flags` parameter is not yet supported") return self._return_or_inplace( - libstrings.findall(self._parent._column, pat), expand=expand + libstrings.findall(self._column, pat), expand=expand ) - def isempty(self) -> ParentType: + def isempty(self) -> SeriesOrIndex: """ Check whether each string is an empty string. @@ -3349,11 +3313,9 @@ def isempty(self) -> ParentType: 4 False dtype: bool """ - return self._return_or_inplace( - (self._parent._column == "").fillna(False) - ) + return self._return_or_inplace((self._column == "").fillna(False)) - def isspace(self) -> ParentType: + def isspace(self) -> SeriesOrIndex: """ Check whether all characters in each string are whitespace. @@ -3407,11 +3369,9 @@ def isspace(self) -> ParentType: 2 False dtype: bool """ - return self._return_or_inplace( - libstrings.is_space(self._parent._column) - ) + return self._return_or_inplace(libstrings.is_space(self._column)) - def endswith(self, pat: str) -> ParentType: + def endswith(self, pat: str) -> SeriesOrIndex: """ Test if the end of each string element matches a pattern. @@ -3454,20 +3414,20 @@ def endswith(self, pat: str) -> ParentType: """ if pat is None: result_col = column.column_empty( - len(self._parent._column), dtype="bool", masked=True + len(self._column), dtype="bool", masked=True ) elif is_scalar(pat): result_col = libstrings.endswith( - self._parent._column, cudf.Scalar(pat, "str") + self._column, cudf.Scalar(pat, "str") ) else: result_col = libstrings.endswith_multiple( - self._parent._column, column.as_column(pat, dtype="str") + self._column, column.as_column(pat, dtype="str") ) return self._return_or_inplace(result_col) - def startswith(self, pat: Union[str, Sequence]) -> ParentType: + def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex: """ Test if the start of each string element matches a pattern. 
@@ -3516,20 +3476,20 @@ def startswith(self, pat: Union[str, Sequence]) -> ParentType: """ if pat is None: result_col = column.column_empty( - len(self._parent._column), dtype="bool", masked=True + len(self._column), dtype="bool", masked=True ) elif is_scalar(pat): result_col = libstrings.startswith( - self._parent._column, cudf.Scalar(pat, "str") + self._column, cudf.Scalar(pat, "str") ) else: result_col = libstrings.startswith_multiple( - self._parent._column, column.as_column(pat, dtype="str") + self._column, column.as_column(pat, dtype="str") ) return self._return_or_inplace(result_col) - def find(self, sub: str, start: int = 0, end: int = None) -> ParentType: + def find(self, sub: str, start: int = 0, end: int = None) -> SeriesOrIndex: """ Return lowest indexes in each strings in the Series/Index where the substring is fully contained between ``[start:end]``. @@ -3579,12 +3539,14 @@ def find(self, sub: str, start: int = 0, end: int = None) -> ParentType: end = -1 result_col = libstrings.find( - self._parent._column, cudf.Scalar(sub, "str"), start, end + self._column, cudf.Scalar(sub, "str"), start, end ) return self._return_or_inplace(result_col) - def rfind(self, sub: str, start: int = 0, end: int = None) -> ParentType: + def rfind( + self, sub: str, start: int = 0, end: int = None + ) -> SeriesOrIndex: """ Return highest indexes in each strings in the Series/Index where the substring is fully contained between ``[start:end]``. @@ -3638,12 +3600,14 @@ def rfind(self, sub: str, start: int = 0, end: int = None) -> ParentType: end = -1 result_col = libstrings.rfind( - self._parent._column, cudf.Scalar(sub, "str"), start, end + self._column, cudf.Scalar(sub, "str"), start, end ) return self._return_or_inplace(result_col) - def index(self, sub: str, start: int = 0, end: int = None) -> ParentType: + def index( + self, sub: str, start: int = 0, end: int = None + ) -> SeriesOrIndex: """ Return lowest indexes in each strings where the substring is fully contained between ``[start:end]``. This is the same @@ -3693,7 +3657,7 @@ def index(self, sub: str, start: int = 0, end: int = None) -> ParentType: end = -1 result_col = libstrings.find( - self._parent._column, cudf.Scalar(sub, "str"), start, end + self._column, cudf.Scalar(sub, "str"), start, end ) result = self._return_or_inplace(result_col) @@ -3703,7 +3667,9 @@ def index(self, sub: str, start: int = 0, end: int = None) -> ParentType: else: return result - def rindex(self, sub: str, start: int = 0, end: int = None) -> ParentType: + def rindex( + self, sub: str, start: int = 0, end: int = None + ) -> SeriesOrIndex: """ Return highest indexes in each strings where the substring is fully contained between ``[start:end]``. This is the same @@ -3753,7 +3719,7 @@ def rindex(self, sub: str, start: int = 0, end: int = None) -> ParentType: end = -1 result_col = libstrings.rfind( - self._parent._column, cudf.Scalar(sub, "str"), start, end + self._column, cudf.Scalar(sub, "str"), start, end ) result = self._return_or_inplace(result_col) @@ -3763,7 +3729,9 @@ def rindex(self, sub: str, start: int = 0, end: int = None) -> ParentType: else: return result - def match(self, pat: str, case: bool = True, flags: int = 0) -> ParentType: + def match( + self, pat: str, case: bool = True, flags: int = 0 + ) -> SeriesOrIndex: """ Determine if each string matches a regular expression. 
@@ -3806,11 +3774,9 @@ def match(self, pat: str, case: bool = True, flags: int = 0) -> ParentType: if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") - return self._return_or_inplace( - libstrings.match_re(self._parent._column, pat) - ) + return self._return_or_inplace(libstrings.match_re(self._column, pat)) - def url_decode(self) -> ParentType: + def url_decode(self) -> SeriesOrIndex: """ Returns a URL-decoded format of each string. No format checking is performed. All characters @@ -3838,11 +3804,9 @@ def url_decode(self) -> ParentType: dtype: object """ - return self._return_or_inplace( - libstrings.url_decode(self._parent._column) - ) + return self._return_or_inplace(libstrings.url_decode(self._column)) - def url_encode(self) -> ParentType: + def url_encode(self) -> SeriesOrIndex: """ Returns a URL-encoded format of each string. No format checking is performed. @@ -3871,11 +3835,9 @@ def url_encode(self) -> ParentType: 1 https%3A%2F%2Fmedium.com%2Frapids-ai dtype: object """ - return self._return_or_inplace( - libstrings.url_encode(self._parent._column) - ) + return self._return_or_inplace(libstrings.url_encode(self._column)) - def code_points(self) -> ParentType: + def code_points(self) -> SeriesOrIndex: """ Returns an array by filling it with the UTF-8 code point values for each character of each string. @@ -3907,7 +3869,7 @@ def code_points(self) -> ParentType: dtype: int32 """ - new_col = libstrings.code_points(self._parent._column) + new_col = libstrings.code_points(self._column) if isinstance(self._parent, cudf.Series): return cudf.Series(new_col, name=self._parent.name) elif isinstance(self._parent, cudf.Index): @@ -3915,7 +3877,7 @@ def code_points(self) -> ParentType: else: return new_col - def translate(self, table: dict) -> ParentType: + def translate(self, table: dict) -> SeriesOrIndex: """ Map all characters in the string through the given mapping table. @@ -3957,12 +3919,12 @@ def translate(self, table: dict) -> ParentType: """ table = str.maketrans(table) return self._return_or_inplace( - libstrings.translate(self._parent._column, table) + libstrings.translate(self._column, table) ) def filter_characters( self, table: dict, keep: bool = True, repl: str = None - ) -> ParentType: + ) -> SeriesOrIndex: """ Remove characters from each string using the character ranges in the given mapping table. @@ -4009,11 +3971,11 @@ def filter_characters( table = str.maketrans(table) return self._return_or_inplace( libstrings.filter_characters( - self._parent._column, table, keep, cudf.Scalar(repl) + self._column, table, keep, cudf.Scalar(repl) ), ) - def normalize_spaces(self) -> ParentType: + def normalize_spaces(self) -> SeriesOrIndex: """ Remove extra whitespace between tokens and trim whitespace from the beginning and the end of each string. @@ -4032,10 +3994,10 @@ def normalize_spaces(self) -> ParentType: dtype: object """ return self._return_or_inplace( - libstrings.normalize_spaces(self._parent._column) + libstrings.normalize_spaces(self._column) ) - def normalize_characters(self, do_lower: bool = True) -> ParentType: + def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: """ Normalizes strings characters for tokenizing. 
@@ -4081,10 +4043,10 @@ def normalize_characters(self, do_lower: bool = True) -> ParentType: dtype: object """ return self._return_or_inplace( - libstrings.normalize_characters(self._parent._column, do_lower) + libstrings.normalize_characters(self._column, do_lower) ) - def tokenize(self, delimiter: str = " ") -> ParentType: + def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: """ Each string is split into tokens using the provided delimiter(s). The sequence returned contains the tokens in the order @@ -4117,12 +4079,12 @@ def tokenize(self, delimiter: str = " ") -> ParentType: if isinstance(delimiter, Column): return self._return_or_inplace( - libstrings._tokenize_column(self._parent._column, delimiter), + libstrings._tokenize_column(self._column, delimiter), retain_index=False, ) elif isinstance(delimiter, cudf.Scalar): return self._return_or_inplace( - libstrings._tokenize_scalar(self._parent._column, delimiter), + libstrings._tokenize_scalar(self._column, delimiter), retain_index=False, ) else: @@ -4133,7 +4095,7 @@ def tokenize(self, delimiter: str = " ") -> ParentType: def detokenize( self, indices: "cudf.Series", separator: str = " " - ) -> ParentType: + ) -> SeriesOrIndex: """ Combines tokens into strings by concatenating them in the order in which they appear in the ``indices`` column. The ``separator`` is @@ -4164,13 +4126,11 @@ def detokenize( """ separator = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.detokenize( - self._parent._column, indices._column, separator - ), + libstrings.detokenize(self._column, indices._column, separator), retain_index=False, ) - def character_tokenize(self) -> ParentType: + def character_tokenize(self) -> SeriesOrIndex: """ Each string is split into individual characters. The sequence returned contains each character as an individual string. @@ -4217,7 +4177,7 @@ def character_tokenize(self) -> ParentType: 29 . dtype: object """ - result_col = libstrings.character_tokenize(self._parent._column) + result_col = libstrings.character_tokenize(self._column) if isinstance(self._parent, cudf.Series): return cudf.Series(result_col, name=self._parent.name) elif isinstance(self._parent, cudf.Index): @@ -4225,7 +4185,7 @@ def character_tokenize(self) -> ParentType: else: return result_col - def token_count(self, delimiter: str = " ") -> ParentType: + def token_count(self, delimiter: str = " ") -> SeriesOrIndex: """ Each string is split into tokens using the provided delimiter. The returned integer sequence is the number of tokens in each string. @@ -4254,16 +4214,12 @@ def token_count(self, delimiter: str = " ") -> ParentType: delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) if isinstance(delimiter, Column): return self._return_or_inplace( - libstrings._count_tokens_column( - self._parent._column, delimiter - ) + libstrings._count_tokens_column(self._column, delimiter) ) elif isinstance(delimiter, cudf.Scalar): return self._return_or_inplace( - libstrings._count_tokens_scalar( - self._parent._column, delimiter - ) + libstrings._count_tokens_scalar(self._column, delimiter) ) else: raise TypeError( @@ -4271,7 +4227,7 @@ def token_count(self, delimiter: str = " ") -> ParentType: for delimiters, but got {type(delimiter)}" ) - def ngrams(self, n: int = 2, separator: str = "_") -> ParentType: + def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: """ Generate the n-grams from a set of tokens, each record in series is treated a token. 
@@ -4305,11 +4261,11 @@ def ngrams(self, n: int = 2, separator: str = "_") -> ParentType: """ separator = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.generate_ngrams(self._parent._column, n, separator), + libstrings.generate_ngrams(self._column, n, separator), retain_index=False, ) - def character_ngrams(self, n: int = 2) -> ParentType: + def character_ngrams(self, n: int = 2) -> SeriesOrIndex: """ Generate the n-grams from characters in a column of strings. @@ -4342,13 +4298,13 @@ def character_ngrams(self, n: int = 2) -> ParentType: dtype: object """ return self._return_or_inplace( - libstrings.generate_character_ngrams(self._parent._column, n), + libstrings.generate_character_ngrams(self._column, n), retain_index=False, ) def ngrams_tokenize( self, n: int = 2, delimiter: str = " ", separator: str = "_" - ) -> ParentType: + ) -> SeriesOrIndex: """ Generate the n-grams using tokens from each string. This will tokenize each string and then generate ngrams for each @@ -4380,15 +4336,13 @@ def ngrams_tokenize( delimiter = _massage_string_arg(delimiter, "delimiter") separator = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.ngrams_tokenize( - self._parent._column, n, delimiter, separator - ), + libstrings.ngrams_tokenize(self._column, n, delimiter, separator), retain_index=False, ) def replace_tokens( self, targets, replacements, delimiter: str = None - ) -> ParentType: + ) -> SeriesOrIndex: """ The targets tokens are searched for within each string in the series and replaced with the corresponding replacements if found. @@ -4462,7 +4416,7 @@ def replace_tokens( return self._return_or_inplace( libstrings.replace_tokens( - self._parent._column, + self._column, targets_column, replacements_column, cudf.Scalar(delimiter, dtype="str"), @@ -4474,7 +4428,7 @@ def filter_tokens( min_token_length: int, replacement: str = None, delimiter: str = None, - ) -> ParentType: + ) -> SeriesOrIndex: """ Remove tokens from within each string in the series that are smaller than min_token_length and optionally replace them @@ -4533,7 +4487,7 @@ def filter_tokens( return self._return_or_inplace( libstrings.filter_tokens( - self._parent._column, + self._column, min_token_length, cudf.Scalar(replacement, dtype="str"), cudf.Scalar(delimiter, dtype="str"), @@ -4629,7 +4583,7 @@ def subword_tokenize( [1, 0, 1]], dtype=uint32) """ tokens, masks, metadata = libstrings.subword_tokenize_vocab_file( - self._parent._column, + self._column, hash_file, max_length, stride, @@ -4643,7 +4597,7 @@ def subword_tokenize( cupy.asarray(metadata), ) - def porter_stemmer_measure(self) -> ParentType: + def porter_stemmer_measure(self) -> SeriesOrIndex: """ Compute the Porter Stemmer measure for each string. The Porter Stemmer algorithm is described `here @@ -4663,10 +4617,10 @@ def porter_stemmer_measure(self) -> ParentType: dtype: int32 """ return self._return_or_inplace( - libstrings.porter_stemmer_measure(self._parent._column) + libstrings.porter_stemmer_measure(self._column) ) - def is_consonant(self, position) -> ParentType: + def is_consonant(self, position) -> SeriesOrIndex: """ Return true for strings where the character at ``position`` is a consonant. 
The ``position`` parameter may also be a list of integers @@ -4702,15 +4656,15 @@ def is_consonant(self, position) -> ParentType: if can_convert_to_column(position): return self._return_or_inplace( libstrings.is_letter_multi( - self._parent._column, ltype, column.as_column(position) + self._column, ltype, column.as_column(position) ), ) return self._return_or_inplace( - libstrings.is_letter(self._parent._column, ltype, position) + libstrings.is_letter(self._column, ltype, position) ) - def is_vowel(self, position) -> ParentType: + def is_vowel(self, position) -> SeriesOrIndex: """ Return true for strings where the character at ``position`` is a vowel -- not a consonant. The ``position`` parameter may also be @@ -4746,15 +4700,15 @@ def is_vowel(self, position) -> ParentType: if can_convert_to_column(position): return self._return_or_inplace( libstrings.is_letter_multi( - self._parent._column, ltype, column.as_column(position) + self._column, ltype, column.as_column(position) ), ) return self._return_or_inplace( - libstrings.is_letter(self._parent._column, ltype, position) + libstrings.is_letter(self._column, ltype, position) ) - def edit_distance(self, targets) -> ParentType: + def edit_distance(self, targets) -> SeriesOrIndex: """ The ``targets`` strings are measured against the strings in this instance using the Levenshtein edit distance algorithm. @@ -4800,7 +4754,7 @@ def edit_distance(self, targets) -> ParentType: ) return self._return_or_inplace( - libstrings.edit_distance(self._parent._column, targets_column) + libstrings.edit_distance(self._column, targets_column) ) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 6c47d94d6e7..7a35354ace6 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -148,9 +148,9 @@ def field(self, key): 1 3 dtype: int64 """ - fields = list(self._parent._column.dtype.fields.keys()) + fields = list(self._column.dtype.fields.keys()) if key in fields: pos = fields.index(key) - return self._return_or_inplace(self._parent._column.children[pos]) + return self._return_or_inplace(self._column.children[pos]) else: - return self._return_or_inplace(self._parent._column.children[key]) + return self._return_or_inplace(self._column.children[key]) From 2a32405c453334bf0e3b64b18031228db42ec19a Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 20 May 2021 17:53:47 -0400 Subject: [PATCH 07/14] More refactoring --- python/cudf/cudf/core/column/categorical.py | 4 ++-- python/cudf/cudf/core/column/lists.py | 4 ++-- python/cudf/cudf/core/column/methods.py | 4 +++- python/cudf/cudf/core/column/string.py | 4 ++-- python/cudf/cudf/core/column/struct.py | 4 ++-- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 6b32bf908ef..6412844b73e 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -27,7 +27,7 @@ from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column -from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( is_categorical_dtype, @@ -47,7 +47,7 @@ ) -class CategoricalAccessor(ColumnMethodsMixin): +class CategoricalAccessor(ColumnMethods): _column: CategoricalColumn def __init__(self, parent: SeriesOrIndex): diff 
--git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index db180f6f9e1..f3dbcac4ee2 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -19,7 +19,7 @@ from cudf._typing import BinaryOperand from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column -from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import ListDtype from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype @@ -231,7 +231,7 @@ def __cuda_array_interface__(self): ) -class ListMethods(ColumnMethodsMixin): +class ListMethods(ColumnMethods): """ List methods for Series """ diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 4b8ab7758fa..e6795713853 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -9,7 +9,9 @@ import cudf -class ColumnMethodsMixin: +class ColumnMethods: + # Encapsulates common behaviour for Series/Index accessor classes + _parent: Union["cudf.Series", "cudf.Index"] def __init__(self, parent: Union["cudf.Series", "cudf.Index"]): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9c5ae0a31ce..b9f3950cd5c 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -29,7 +29,7 @@ from cudf._lib.column import Column from cudf.core.buffer import Buffer from cudf.core.column import column, datetime -from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.column.methods import ColumnMethods from cudf.utils import utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -88,7 +88,7 @@ } -class StringMethods(ColumnMethodsMixin): +class StringMethods(ColumnMethods): _column: StringColumn def __init__(self, parent=None): diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 7a35354ace6..dd9bddeed82 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -5,7 +5,7 @@ import cudf from cudf.core.column import ColumnBase -from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.column.methods import ColumnMethods from cudf.utils.dtypes import is_struct_dtype @@ -109,7 +109,7 @@ def __cuda_array_interface__(self): ) -class StructMethods(ColumnMethodsMixin): +class StructMethods(ColumnMethods): """ Struct methods for Series """ From b713e13bce3a8012a2fb62d51e42ced8ceeed223 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 20 May 2021 18:03:14 -0400 Subject: [PATCH 08/14] parent can never be None --- python/cudf/cudf/core/column/methods.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index e6795713853..0afa93d84aa 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -54,20 +54,14 @@ def _return_or_inplace( of the owner (Series or Index) to mimic an inplace operation """ if inplace: - if self._parent is not None: - self._parent._mimic_inplace( - self._parent.__class__._from_table( - cudf._lib.table.Table({self._parent.name: new_col}) - ), - inplace=True, - ) - return None - else: - self._column._mimic_inplace(new_col, inplace=True) - return None + self._parent._mimic_inplace( + self._parent.__class__._from_table( + 
cudf._lib.table.Table({self._parent.name: new_col}) + ), + inplace=True, + ) + return None else: - if self._parent is None: - return new_col if expand or isinstance( self._parent, (cudf.DataFrame, cudf.MultiIndex) ): From 50e6fa3700ad4e0091cefd0a161a57bdd94db386 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 20 May 2021 18:09:23 -0400 Subject: [PATCH 09/14] Redundant docstring --- python/cudf/cudf/core/column/categorical.py | 76 +-------------------- 1 file changed, 1 insertion(+), 75 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 6412844b73e..a02163cd463 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1389,82 +1389,8 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: def set_categories( self, new_categories: Any, ordered: bool = False, rename: bool = False, ) -> CategoricalColumn: - """ - Set the categories to the specified new_categories. - - - `new_categories` can include new categories (which - will result in unused categories) or remove old categories - (which results in values set to null). If `rename==True`, - the categories will simple be renamed (less or more items - than in old categories will result in values set to null or - in unused categories respectively). - - This method can be used to perform more than one action - of adding, removing, and reordering simultaneously and - is therefore faster than performing the individual steps - via the more specialised methods. - - On the other hand this methods does not do checks - (e.g., whether the old categories are included in the - new categories on a reorder), which can result in - surprising changes. - - Parameters - ---------- - - new_categories : list-like - The categories in new order. + # See CategoricalAccessor.set_categories. - ordered : bool, default None - Whether or not the categorical is treated as - a ordered categorical. If not given, do - not change the ordered information. - - rename : bool, default False - Whether or not the `new_categories` should be - considered as a rename of the old categories - or as reordered categories. - - Returns - ------- - cat - Categorical with reordered categories - or None if inplace. 
- - Examples - -------- - >>> import cudf - >>> s = cudf.Series([1, 1, 2, 10, 2, 10], dtype='category') - >>> s - 0 1 - 1 1 - 2 2 - 3 10 - 4 2 - 5 10 - dtype: category - Categories (3, int64): [1, 2, 10] - >>> s.cat.set_categories([1, 10]) - 0 1 - 1 1 - 2 - 3 10 - 4 - 5 10 - dtype: category - Categories (2, int64): [1, 10] - >>> s.cat.set_categories([1, 10], inplace=True) - >>> s - 0 1 - 1 1 - 2 - 3 10 - 4 - 5 10 - dtype: category - Categories (2, int64): [1, 10] - """ ordered = ordered if ordered is not None else self.ordered new_categories = column.as_column(new_categories) From 9d912461fad10885be190a83588f99702c9e7b0d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 20 May 2021 18:22:29 -0400 Subject: [PATCH 10/14] MyPy fix --- python/cudf/cudf/core/column/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a02163cd463..7d9fb7f94dc 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1554,7 +1554,7 @@ def _create_empty_categorical_column( cudf.utils.utils.scalar_broadcast_to( categorical_column.default_na_value(), categorical_column.size, - np.dtype(categorical_column.codes), + categorical_column.codes.dtype, ) ), offset=categorical_column.offset, From a2bd07a3a48924f77b268cff0e541b76b2a7e7c5 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 26 May 2021 08:47:54 -0400 Subject: [PATCH 11/14] Fix leaves method --- python/cudf/cudf/core/column/lists.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index f3dbcac4ee2..0efbbab14b9 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,6 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import pickle +from typing import Optional import numpy as np import pyarrow as pa @@ -16,7 +17,7 @@ sort_lists, ) from cudf._lib.table import Table -from cudf._typing import BinaryOperand +from cudf._typing import BinaryOperand, SeriesOrIndex from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethods @@ -230,6 +231,12 @@ def __cuda_array_interface__(self): "Lists are not yet supported via `__cuda_array_interface__`" ) + def leaves(self): + if isinstance(self.elements, ListColumn): + return self.elements.leaves() + else: + return self.elements + class ListMethods(ColumnMethods): """ @@ -315,7 +322,7 @@ def contains(self, search_key): return res @property - def leaves(self): + def leaves(self) -> Optional[SeriesOrIndex]: """ From a Series of (possibly nested) lists, obtain the elements from the innermost lists as a flat Series (one value per row). 
@@ -336,12 +343,9 @@ def leaves(self): 5 6 dtype: int64 """ - if type(self._column.elements) is ListColumn: - return self._column.elements.elements - else: - return self._return_or_inplace( - self._column.elements, retain_index=False - ) + return self._return_or_inplace( + self._column.leaves(), retain_index=False + ) def len(self): """ From 487378c7ef41915b03a7cc27aa279734951b14a3 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 13 Jul 2021 10:50:03 -0400 Subject: [PATCH 12/14] Move copyright --- python/cudf/cudf/core/column/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index b26daa3fb8d..5a44d7c58a6 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -1,8 +1,8 @@ +# Copyright (c) 2020-2021, NVIDIA CORPORATION. """ isort: skip_file """ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( From 8cafbd63968628e5475e463fd1abf1ab75ac233f Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 13 Jul 2021 17:13:05 -0400 Subject: [PATCH 13/14] Parent is not optional --- python/cudf/cudf/core/column/string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 7b23ffa9f4b..d00fca8b1c5 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -100,7 +100,7 @@ def str_to_boolean(column: StringColumn): class StringMethods(ColumnMethods): _column: StringColumn - def __init__(self, parent=None): + def __init__(self, parent): """ Vectorized string functions for Series and Index. From f38dc491be763c86d3c0dd32f85535473569960c Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 14 Jul 2021 16:52:41 -0400 Subject: [PATCH 14/14] Remove code re-introduced by bad merge --- python/cudf/cudf/core/column/column.py | 55 -------------------------- 1 file changed, 55 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index fa9fe17a65d..7bc036587af 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2299,61 +2299,6 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) -def _copy_type_metadata_from_arrow( - arrow_array: pa.array, cudf_column: ColumnBase -) -> ColumnBase: - """ - Similar to `Column._copy_type_metadata`, except copies type metadata - from arrow array into a cudf column. Recursive for every level. - * When `arrow_array` is struct type and `cudf_column` is StructDtype, copy - field names. - * When `arrow_array` is decimal type and `cudf_column` is - Decimal64Dtype, copy precisions. 
- """ - if pa.types.is_decimal(arrow_array.type) and isinstance( - cudf_column, - (cudf.core.column.Decimal32Column, cudf.core.column.Decimal64Column), - ): - cudf_column.dtype.precision = arrow_array.type.precision - elif pa.types.is_struct(arrow_array.type) and isinstance( - cudf_column, cudf.core.column.StructColumn - ): - base_children = tuple( - _copy_type_metadata_from_arrow(arrow_array.field(i), col_child) - for i, col_child in enumerate(cudf_column.base_children) - ) - cudf_column.set_base_children(base_children) - return cudf.core.column.StructColumn( - data=None, - size=cudf_column.base_size, - dtype=StructDtype.from_arrow(arrow_array.type), - mask=cudf_column.base_mask, - offset=cudf_column.offset, - null_count=cudf_column.null_count, - children=base_children, - ) - elif pa.types.is_list(arrow_array.type) and isinstance( - cudf_column, cudf.core.column.ListColumn - ): - if arrow_array.values and cudf_column.base_children: - base_children = ( - cudf_column.base_children[0], - _copy_type_metadata_from_arrow( - arrow_array.values, cudf_column.base_children[1] - ), - ) - return cudf.core.column.ListColumn( - size=cudf_column.base_size, - dtype=ListDtype.from_arrow(arrow_array.type), - mask=cudf_column.base_mask, - offset=cudf_column.offset, - null_count=cudf_column.null_count, - children=base_children, - ) - - return cudf_column - - def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: