diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index e69de29bb2d..866c2861995 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -0,0 +1,88 @@ +from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix +from cudf._lib.nvtext.generate_ngrams import ( + generate_character_ngrams, + generate_ngrams, +) +from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize +from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces +from cudf._lib.nvtext.replace import filter_tokens, replace_tokens +from cudf._lib.nvtext.stemmer import ( + LetterType, + is_letter, + is_letter_multi, + porter_stemmer_measure, +) +from cudf._lib.nvtext.subword_tokenize import subword_tokenize_vocab_file +from cudf._lib.nvtext.tokenize import ( + _count_tokens_column, + _count_tokens_scalar, + _tokenize_column, + _tokenize_scalar, + character_tokenize, + detokenize, +) +from cudf._lib.strings.attributes import ( + code_points, + count_bytes, + count_characters, +) +from cudf._lib.strings.capitalize import capitalize, title +from cudf._lib.strings.case import swapcase, to_lower, to_upper +from cudf._lib.strings.char_types import ( + filter_alphanum, + is_alnum, + is_alpha, + is_decimal, + is_digit, + is_lower, + is_numeric, + is_space, + is_upper, +) +from cudf._lib.strings.combine import ( + concatenate, + join, + join_lists_with_column, + join_lists_with_scalar, +) +from cudf._lib.strings.contains import contains_re, count_re, match_re +from cudf._lib.strings.convert.convert_fixed_point import to_decimal +from cudf._lib.strings.convert.convert_floats import is_float +from cudf._lib.strings.convert.convert_integers import is_integer +from cudf._lib.strings.convert.convert_urls import url_decode, url_encode +from cudf._lib.strings.extract import extract +from cudf._lib.strings.find import ( + contains, + contains_multiple, + endswith, + 
endswith_multiple, + find, + rfind, + startswith, + startswith_multiple, +) +from cudf._lib.strings.findall import findall +from cudf._lib.strings.json import get_json_object +from cudf._lib.strings.padding import PadSide, center, ljust, pad, rjust, zfill +from cudf._lib.strings.replace import ( + insert, + replace, + replace_multi, + slice_replace, +) +from cudf._lib.strings.replace_re import ( + replace_multi_re, + replace_re, + replace_with_backrefs, +) +from cudf._lib.strings.split.partition import partition, rpartition +from cudf._lib.strings.split.split import ( + rsplit, + rsplit_record, + split, + split_record, +) +from cudf._lib.strings.strip import lstrip, rstrip, strip +from cudf._lib.strings.substring import get, slice_from, slice_strings +from cudf._lib.strings.translate import filter_characters, translate +from cudf._lib.strings.wrap import wrap diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 3d20e5f15b7..cd3a7b2affa 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -23,15 +23,15 @@ from cudf._lib.cpp.strings.combine cimport ( def concatenate(Table source_strings, - object py_separator, - object py_narep): + object sep, + object na_rep): """ Returns a Column by concatenating strings column-wise in `source_strings` - with the specified `py_separator` between each column and - `na`/`None` values are replaced by `py_narep` + with the specified `sep` between each column and + `na`/`None` values are replaced by `na_rep` """ - cdef DeviceScalar separator = py_separator.device_value - cdef DeviceScalar narep = py_narep.device_value + cdef DeviceScalar separator = sep.device_value + cdef DeviceScalar narep = na_rep.device_value cdef unique_ptr[column] c_result cdef table_view source_view = source_strings.data_view() @@ -53,16 +53,16 @@ def concatenate(Table source_strings, def join(Column source_strings, - object py_separator, - object py_narep): + 
object sep, + object na_rep): """ Returns a Column by concatenating strings row-wise in `source_strings` - with the specified `py_separator` between each column and - `na`/`None` values are replaced by `py_narep` + with the specified `sep` between each column and + `na`/`None` values are replaced by `na_rep` """ - cdef DeviceScalar separator = py_separator.device_value - cdef DeviceScalar narep = py_narep.device_value + cdef DeviceScalar separator = sep.device_value + cdef DeviceScalar narep = na_rep.device_value cdef unique_ptr[column] c_result cdef column_view source_view = source_strings.view() diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index d2b053789cd..708f5013cd3 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -36,11 +36,10 @@ def transpose(Table source): if is_categorical_dtype(dtype): if any(not is_categorical_dtype(c.dtype) for c in source._columns): raise ValueError('Columns must all have the same dtype') - cats = list(c.cat().categories for c in source._columns) - cats = cudf.Series(cudf.concat(cats)).drop_duplicates()._column + cats = list(c.categories for c in source._columns) + cats = cudf.core.column.concat_columns(cats).unique() source = Table(index=source._index, data=[ - (name, col.cat()._set_categories( - col.cat().categories, cats, is_unique=True).codes) + (name, col._set_categories(cats, is_unique=True).codes) for name, col in source._data.items() ]) elif dtype.kind in 'OU': diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 034b18ec9e0..7eb0c7bdce4 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -28,3 +28,4 @@ BinaryOperand = Union["cudf.Scalar", "cudf.core.column.ColumnBase"] DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"] +SeriesOrIndex = Union["cudf.Series", "cudf.core.index.BaseIndex"] diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 
56398bd4f13..daf6d11aa9f 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -218,7 +218,7 @@ def _union_categoricals( sorted_categories = result_col.categories.sort_by_values( ascending=True )[0] - result_col = result_col.cat().reorder_categories( + result_col = result_col.reorder_categories( new_categories=sorted_categories ) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 18d48e16480..5a44d7c58a6 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -1,4 +1,8 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. +""" +isort: skip_file +""" + from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( @@ -12,6 +16,7 @@ column_empty, column_empty_like, column_empty_like_same_mask, + concat_columns, deserialize_columns, full, serialize_columns, @@ -27,3 +32,4 @@ Decimal32Column, Decimal64Column, ) +from cudf.core.column.interval import IntervalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index cbcc30d38a7..ec366cceebd 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -27,7 +27,7 @@ from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import column -from cudf.core.column.methods import ColumnMethodsMixin, ParentType +from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( is_categorical_dtype, @@ -38,6 +38,7 @@ ) if TYPE_CHECKING: + from cudf._typing import SeriesOrIndex from cudf.core.column import ( ColumnBase, DatetimeColumn, @@ -47,10 +48,10 @@ ) -class CategoricalAccessor(ColumnMethodsMixin): +class CategoricalAccessor(ColumnMethods): _column: CategoricalColumn - def __init__(self, column: Any, parent: ParentType = None): + def 
__init__(self, parent: SeriesOrIndex): """ Accessor object for categorical properties of the Series values. Be aware that assigning to `categories` is a inplace operation, @@ -104,11 +105,11 @@ def __init__(self, column: Any, parent: ParentType = None): dtype: category Categories (3, int64): [1, 2, 3] """ - if not is_categorical_dtype(column.dtype): + if not is_categorical_dtype(parent.dtype): raise AttributeError( "Can only use .cat accessor with a 'category' dtype" ) - super().__init__(column=column, parent=parent) + super().__init__(parent=parent) @property def categories(self) -> "cudf.core.index.BaseIndex": @@ -136,7 +137,7 @@ def ordered(self) -> Optional[bool]: """ return self._column.ordered - def as_ordered(self, inplace: bool = False) -> Optional[ParentType]: + def as_ordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: """ Set the Categorical to be ordered. @@ -189,15 +190,11 @@ def as_ordered(self, inplace: bool = False) -> Optional[ParentType]: dtype: category Categories (3, int64): [1 < 2 < 10] """ - out_col = self._column - if not out_col.ordered: - out_col = self._set_categories( - self._column.categories, self._column.categories, ordered=True, - ) - - return self._return_or_inplace(out_col, inplace=inplace) + return self._return_or_inplace( + self._column.as_ordered(), inplace=inplace + ) - def as_unordered(self, inplace: bool = False) -> Optional[ParentType]: + def as_unordered(self, inplace: bool = False) -> Optional[SeriesOrIndex]: """ Set the Categorical to be unordered. 
@@ -261,17 +258,13 @@ def as_unordered(self, inplace: bool = False) -> Optional[ParentType]: dtype: category Categories (3, int64): [1, 2, 10] """ - out_col = self._column - if out_col.ordered: - out_col = self._set_categories( - self._column.categories, self.categories, ordered=False - ) - - return self._return_or_inplace(out_col, inplace=inplace) + return self._return_or_inplace( + self._column.as_unordered(), inplace=inplace + ) def add_categories( self, new_categories: Any, inplace: bool = False - ) -> Optional[ParentType]: + ) -> Optional[SeriesOrIndex]: """ Add new categories. @@ -349,14 +342,14 @@ def add_categories( new_categories = old_categories.append(new_categories) out_col = self._column - if not self._categories_equal(new_categories): - out_col = self._set_categories(old_categories, new_categories) + if not out_col._categories_equal(new_categories): + out_col = out_col._set_categories(new_categories) return self._return_or_inplace(out_col, inplace=inplace) def remove_categories( self, removals: Any, inplace: bool = False, - ) -> Optional[ParentType]: + ) -> Optional[SeriesOrIndex]: """ Remove the specified categories. @@ -439,10 +432,8 @@ def remove_categories( new_categories = cats[~cats.isin(removals)]._column out_col = self._column - if not self._categories_equal(new_categories): - out_col = self._set_categories( - self._column.categories, new_categories - ) + if not out_col._categories_equal(new_categories): + out_col = out_col._set_categories(new_categories) return self._return_or_inplace(out_col, inplace=inplace) @@ -452,7 +443,7 @@ def set_categories( ordered: bool = False, rename: bool = False, inplace: bool = False, - ) -> Optional[ParentType]: + ) -> Optional[SeriesOrIndex]: """ Set the categories to the specified new_categories. 
@@ -571,11 +562,11 @@ def set_categories( ), ) elif ( - not self._categories_equal(new_categories, ordered=ordered) + not out_col._categories_equal(new_categories, ordered=ordered) or not self.ordered == ordered ): - out_col = self._set_categories( - self._column.categories, new_categories, ordered=ordered, + out_col = out_col._set_categories( + new_categories, ordered=ordered, ) return self._return_or_inplace(out_col, inplace=inplace) @@ -584,7 +575,7 @@ def reorder_categories( new_categories: Any, ordered: bool = False, inplace: bool = False, - ) -> Optional[ParentType]: + ) -> Optional[SeriesOrIndex]: """ Reorder categories as specified in new_categories. @@ -650,107 +641,10 @@ def reorder_categories( ValueError: items in new_categories are not the same as in old categories """ - new_categories = column.as_column(new_categories) - # Compare new_categories against current categories. - # Ignore order for comparison because we're only interested - # in whether new_categories has all the same values as the - # current set of categories. 
- if not self._categories_equal(new_categories, ordered=False): - raise ValueError( - "items in new_categories are not the same as in " - "old categories" - ) - out_col = self._set_categories( - self._column.categories, new_categories, ordered=ordered - ) - - return self._return_or_inplace(out_col, inplace=inplace) - - def _categories_equal( - self, new_categories: ColumnBase, ordered=False - ) -> bool: - cur_categories = self._column.categories - if len(new_categories) != len(cur_categories): - return False - if new_categories.dtype != cur_categories.dtype: - return False - # if order doesn't matter, sort before the equals call below - if not ordered: - cur_categories = cudf.Series(cur_categories).sort_values( - ignore_index=True - ) - new_categories = cudf.Series(new_categories).sort_values( - ignore_index=True - ) - return cur_categories.equals(new_categories) - - def _set_categories( - self, - current_categories: Any, - new_categories: Any, - is_unique: bool = False, - ordered: bool = False, - ) -> CategoricalColumn: - """Returns a new CategoricalColumn with the categories set to the - specified *new_categories*. 
- - Notes - ----- - Assumes ``new_categories`` is the same dtype as the current categories - """ - - cur_cats = column.as_column(current_categories) - new_cats = column.as_column(new_categories) - - # Join the old and new categories to build a map from - # old to new codes, inserting na_sentinel for any old - # categories that don't exist in the new categories - - # Ensure new_categories is unique first - if not (is_unique or new_cats.is_unique): - # drop_duplicates() instead of unique() to preserve order - new_cats = ( - cudf.Series(new_cats) - .drop_duplicates(ignore_index=True) - ._column - ) - - cur_codes = self.codes - max_cat_size = ( - len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats) + return self._return_or_inplace( + self._column.reorder_categories(new_categories, ordered=ordered), + inplace=inplace, ) - out_code_dtype = min_unsigned_type(max_cat_size) - - cur_order = column.arange(len(cur_codes)) - old_codes = column.arange(len(cur_cats), dtype=out_code_dtype) - new_codes = column.arange(len(new_cats), dtype=out_code_dtype) - - new_df = cudf.DataFrame({"new_codes": new_codes, "cats": new_cats}) - old_df = cudf.DataFrame({"old_codes": old_codes, "cats": cur_cats}) - cur_df = cudf.DataFrame({"old_codes": cur_codes, "order": cur_order}) - - # Join the old and new categories and line up their codes - df = old_df.merge(new_df, on="cats", how="left") - # Join the old and new codes to "recode" the codes data buffer - df = cur_df.merge(df, on="old_codes", how="left") - df = df.sort_values(by="order") - df.reset_index(drop=True, inplace=True) - - ordered = ordered if ordered is not None else self.ordered - new_codes = df["new_codes"]._column - - # codes can't have masks, so take mask out before moving in - return column.build_categorical_column( - categories=new_cats, - codes=column.as_column(new_codes.base_data, dtype=new_codes.dtype), - mask=new_codes.base_mask, - size=new_codes.size, - offset=new_codes.offset, - ordered=ordered, - ) - - def 
_decategorize(self) -> ColumnBase: - return self._column._get_decategorized_column() class CategoricalColumn(column.ColumnBase): @@ -938,9 +832,6 @@ def ordered(self) -> Optional[bool]: def ordered(self, value: bool): self.dtype.ordered = value - def cat(self, parent: ParentType = None): - return CategoricalAccessor(self, parent=parent) - def unary_operator(self, unaryop: str): raise TypeError( f"Series of dtype `category` cannot perform the operation: " @@ -1088,7 +979,7 @@ def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series: col = self signed_dtype = min_signed_type(len(col.categories)) - codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array() + codes = col.codes.astype(signed_dtype).fillna(-1).to_array() if is_interval_dtype(col.categories.dtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. @@ -1225,13 +1116,11 @@ def find_and_replace( # named 'index', which came from the filtered categories, # contains the new ints that we need to map to to_replace_col = column.as_column(catmap.index).astype( - self.cat().codes.dtype - ) - replacement_col = catmap["index"]._column.astype( - self.cat().codes.dtype + self.codes.dtype ) + replacement_col = catmap["index"]._column.astype(self.codes.dtype) - replaced = column.as_column(self.cat().codes) + replaced = column.as_column(self.codes) output = libcudf.replace.replace( replaced, to_replace_col, replacement_col ) @@ -1309,10 +1198,8 @@ def fillna( ) # TODO: only required if fill_value has a subset of the # categories: - fill_value = fill_value.cat()._set_categories( - fill_value.cat().categories, - self.categories, - is_unique=True, + fill_value = fill_value._set_categories( + self.categories, is_unique=True, ) fill_value = column.as_column(fill_value.codes).astype( self.codes.dtype @@ -1380,8 +1267,8 @@ def as_categorical_column( # return a column full of Nulls. 
return _create_empty_categorical_column(self, dtype) - return self.cat().set_categories( - new_categories=dtype.categories, ordered=dtype.ordered + return self.set_categories( + new_categories=dtype.categories, ordered=bool(dtype.ordered) ) def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: @@ -1405,8 +1292,8 @@ def as_timedelta_column(self, dtype, **kwargs) -> TimeDeltaColumn: def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes - return self.cat().codes._column - gather_map = self.cat().codes.astype("int32").fillna(0)._column + return self.codes + gather_map = self.codes.astype("int32").fillna(0) out = self.categories.take(gather_map) out = out.set_mask(self.mask) return out @@ -1439,19 +1326,14 @@ def copy(self, deep: bool = True) -> CategoricalColumn: ) def __sizeof__(self) -> int: - return ( - self.cat().categories.__sizeof__() + self.cat().codes.__sizeof__() - ) + return self.categories.__sizeof__() + self.codes.__sizeof__() def _memory_usage(self, **kwargs) -> int: deep = kwargs.get("deep", False) if deep: return self.__sizeof__() else: - return ( - self.categories._memory_usage() - + self.cat().codes.memory_usage() - ) + return self.categories._memory_usage() + self.codes._memory_usage() def _mimic_inplace( self, other_col: ColumnBase, inplace: bool = False @@ -1469,7 +1351,7 @@ def view(self, dtype: Dtype) -> ColumnBase: @staticmethod def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: # TODO: This function currently assumes it is being called from - # column._concat_columns, at least to the extent that all the + # column.concat_columns, at least to the extent that all the # preprocessing in that function has already been done. That should be # improved as the concatenation API is solidified. 
@@ -1477,15 +1359,8 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: head = next((obj for obj in objs if obj.valid_count), objs[0]) # Combine and de-dupe the categories - cats = ( - cudf.concat([o.cat().categories for o in objs]) - .drop_duplicates() - ._column - ) - objs = [ - o.cat()._set_categories(o.cat().categories, cats, is_unique=True) - for o in objs - ] + cats = column.concat_columns([o.categories for o in objs]).unique() + objs = [o._set_categories(cats, is_unique=True) for o in objs] codes = [o.codes for o in objs] newsize = sum(map(len, codes)) @@ -1524,9 +1399,181 @@ def _with_type_metadata( offset=self.codes.offset, null_count=self.codes.null_count, ) - return self + def set_categories( + self, new_categories: Any, ordered: bool = False, rename: bool = False, + ) -> CategoricalColumn: + # See CategoricalAccessor.set_categories. + + ordered = ordered if ordered is not None else self.ordered + new_categories = column.as_column(new_categories) + + if isinstance(new_categories, CategoricalColumn): + new_categories = new_categories.categories + + # when called with rename=True, the pandas behavior is + # to replace the current category values with the new + # categories. + if rename: + # enforce same length + if len(new_categories) != len(self.categories): + raise ValueError( + "new_categories must have the same " + "number of items as old categories" + ) + + out_col = column.build_categorical_column( + categories=new_categories, + codes=self.base_children[0], + mask=self.base_mask, + size=self.size, + offset=self.offset, + ordered=ordered, + ) + else: + out_col = self + if not (type(out_col.categories) is type(new_categories)): + # If both categories are of different Column types, + # return a column full of Nulls. 
+ out_col = _create_empty_categorical_column( + self, + CategoricalDtype( + categories=new_categories, ordered=ordered + ), + ) + elif ( + not out_col._categories_equal(new_categories, ordered=ordered) + or not self.ordered == ordered + ): + out_col = out_col._set_categories( + new_categories, ordered=ordered, + ) + return out_col + + def _categories_equal( + self, new_categories: ColumnBase, ordered=False + ) -> bool: + cur_categories = self.categories + if len(new_categories) != len(cur_categories): + return False + if new_categories.dtype != cur_categories.dtype: + return False + # if order doesn't matter, sort before the equals call below + if not ordered: + cur_categories = cudf.Series(cur_categories).sort_values( + ignore_index=True + ) + new_categories = cudf.Series(new_categories).sort_values( + ignore_index=True + ) + return cur_categories.equals(new_categories) + + def _set_categories( + self, + new_categories: Any, + is_unique: bool = False, + ordered: bool = False, + ) -> CategoricalColumn: + """Returns a new CategoricalColumn with the categories set to the + specified *new_categories*. 
+ + Notes + ----- + Assumes ``new_categories`` is the same dtype as the current categories + """ + + cur_cats = column.as_column(self.categories) + new_cats = column.as_column(new_categories) + + # Join the old and new categories to build a map from + # old to new codes, inserting na_sentinel for any old + # categories that don't exist in the new categories + + # Ensure new_categories is unique first + if not (is_unique or new_cats.is_unique): + # drop_duplicates() instead of unique() to preserve order + new_cats = ( + cudf.Series(new_cats) + .drop_duplicates(ignore_index=True) + ._column + ) + + cur_codes = self.codes + max_cat_size = ( + len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats) + ) + out_code_dtype = min_unsigned_type(max_cat_size) + + cur_order = column.arange(len(cur_codes)) + old_codes = column.arange(len(cur_cats), dtype=out_code_dtype) + new_codes = column.arange(len(new_cats), dtype=out_code_dtype) + + new_df = cudf.DataFrame({"new_codes": new_codes, "cats": new_cats}) + old_df = cudf.DataFrame({"old_codes": old_codes, "cats": cur_cats}) + cur_df = cudf.DataFrame({"old_codes": cur_codes, "order": cur_order}) + + # Join the old and new categories and line up their codes + df = old_df.merge(new_df, on="cats", how="left") + # Join the old and new codes to "recode" the codes data buffer + df = cur_df.merge(df, on="old_codes", how="left") + df = df.sort_values(by="order") + df.reset_index(drop=True, inplace=True) + + ordered = ordered if ordered is not None else self.ordered + new_codes = df["new_codes"]._column + + # codes can't have masks, so take mask out before moving in + return column.build_categorical_column( + categories=new_cats, + codes=column.as_column(new_codes.base_data, dtype=new_codes.dtype), + mask=new_codes.base_mask, + size=new_codes.size, + offset=new_codes.offset, + ordered=ordered, + ) + + def reorder_categories( + self, new_categories: Any, ordered: bool = False, + ) -> CategoricalColumn: + new_categories = 
column.as_column(new_categories) + # Compare new_categories against current categories. + # Ignore order for comparison because we're only interested + # in whether new_categories has all the same values as the + # current set of categories. + if not self._categories_equal(new_categories, ordered=False): + raise ValueError( + "items in new_categories are not the same as in " + "old categories" + ) + return self._set_categories(new_categories, ordered=ordered) + + def as_ordered(self): + out_col = self + if not out_col.ordered: + out_col = column.build_categorical_column( + categories=self.categories, + codes=self.codes, + mask=self.base_mask, + size=self.base_size, + offset=self.offset, + ordered=True, + ) + return out_col + + def as_unordered(self): + out_col = self + if out_col.ordered: + out_col = column.build_categorical_column( + categories=self.categories, + codes=self.codes, + mask=self.base_mask, + size=self.base_size, + offset=self.offset, + ordered=False, + ) + return out_col + def _create_empty_categorical_column( categorical_column: CategoricalColumn, dtype: "CategoricalDtype" @@ -1537,7 +1584,7 @@ def _create_empty_categorical_column( cudf.utils.utils.scalar_broadcast_to( categorical_column.default_na_value(), categorical_column.size, - np.dtype(categorical_column.cat().codes), + categorical_column.codes.dtype, ) ), offset=categorical_column.offset, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9e0fd9da824..7bc036587af 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -677,7 +677,7 @@ def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: return indices[-1] def append(self, other: ColumnBase) -> ColumnBase: - return _concat_columns([self, as_column(other)]) + return concat_columns([self, as_column(other)]) def quantile( self, @@ -2299,7 +2299,7 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: 
return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) -def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: +def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: dtype = pandas_dtype(None) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 843190f38aa..5a40efa0a93 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -20,7 +20,7 @@ from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column -from cudf.core.column.methods import ColumnMethodsMixin, ParentType +from cudf.core.column.methods import ColumnMethods, ParentType from cudf.core.dtypes import ListDtype from cudf.utils.dtypes import _is_non_decimal_numeric_dtype, is_list_dtype @@ -163,9 +163,6 @@ def offsets(self): """ return self.children[0] - def list(self, parent=None): - return ListMethods(self, parent=parent) - def to_arrow(self): offsets = self.offsets.to_arrow() elements = ( @@ -275,20 +272,26 @@ def _with_type_metadata( return self + def leaves(self): + if isinstance(self.elements, ListColumn): + return self.elements.leaves() + else: + return self.elements + -class ListMethods(ColumnMethodsMixin): +class ListMethods(ColumnMethods): """ List methods for Series """ _column: ListColumn - def __init__(self, column: ListColumn, parent: ParentType = None): - if not is_list_dtype(column.dtype): + def __init__(self, parent: ParentType): + if not is_list_dtype(parent.dtype): raise AttributeError( "Can only use .list accessor with a 'list' dtype" ) - super().__init__(column=column, parent=parent) + super().__init__(parent=parent) def get(self, index: int) -> ParentType: """ @@ -383,12 +386,9 @@ def leaves(self) -> ParentType: 5 6 dtype: int64 """ - if type(self._column.elements) is ListColumn: - return 
self._column.elements.list(parent=self._parent).leaves - else: - return self._return_or_inplace( - self._column.elements, retain_index=False - ) + return self._return_or_inplace( + self._column.leaves(), retain_index=False + ) def len(self) -> ParentType: """ diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 4b448e27a53..27dc4fe0c0d 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -2,46 +2,46 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional, Union, overload +from typing import Optional, Union, overload from typing_extensions import Literal import cudf -if TYPE_CHECKING: - from cudf.core.column import ColumnBase - ParentType = Union["cudf.Series", "cudf.BaseIndex"] -class ColumnMethodsMixin: - _column: ColumnBase - _parent: Optional[Union["cudf.Series", "cudf.BaseIndex"]] +class ColumnMethods: + _parent: ParentType - def __init__( - self, - column: ColumnBase, - parent: Union["cudf.Series", "cudf.BaseIndex"] = None, - ): - self._column = column + def __init__(self, parent: ParentType): self._parent = parent + self._column = self._parent._column @overload def _return_or_inplace( - self, new_col, inplace: Literal[False], expand=False, retain_index=True - ) -> Union["cudf.Series", "cudf.BaseIndex"]: + self, + new_col, + inplace: Literal[True], + expand: bool = False, + retain_index: bool = True, + ) -> None: ... @overload def _return_or_inplace( - self, new_col, expand: bool = False, retain_index: bool = True - ) -> Union["cudf.Series", "cudf.BaseIndex"]: + self, + new_col, + inplace: Literal[False], + expand: bool = False, + retain_index: bool = True, + ) -> ParentType: ... @overload def _return_or_inplace( - self, new_col, inplace: Literal[True], expand=False, retain_index=True - ) -> None: + self, new_col, expand: bool = False, retain_index: bool = True, + ) -> ParentType: ... 
@overload @@ -51,7 +51,7 @@ def _return_or_inplace( inplace: bool = False, expand: bool = False, retain_index: bool = True, - ) -> Optional[Union["cudf.Series", "cudf.BaseIndex"]]: + ) -> Optional[ParentType]: ... def _return_or_inplace( @@ -62,20 +62,14 @@ def _return_or_inplace( of the owner (Series or Index) to mimic an inplace operation """ if inplace: - if self._parent is not None: - self._parent._mimic_inplace( - self._parent.__class__._from_table( - cudf._lib.table.Table({self._parent.name: new_col}) - ), - inplace=True, - ) - return None - else: - self._column._mimic_inplace(new_col, inplace=True) - return None + self._parent._mimic_inplace( + self._parent.__class__._from_table( + cudf._lib.table.Table({self._parent.name: new_col}) + ), + inplace=True, + ) + return None else: - if self._parent is None: - return new_col if expand or isinstance( self._parent, (cudf.DataFrame, cudf.MultiIndex) ): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index e17ecec766a..d00fca8b1c5 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,7 +5,17 @@ import builtins import pickle import warnings -from typing import Any, Dict, Optional, Sequence, Tuple, Union, cast, overload +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Optional, + Sequence, + Tuple, + Union, + cast, + overload, +) import cupy import numpy as np @@ -15,157 +25,16 @@ import cudf from cudf import _lib as libcudf -from cudf._lib import string_casting as str_cast +from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column -from cudf._lib.nvtext.edit_distance import ( - edit_distance as cpp_edit_distance, - edit_distance_matrix as cpp_edit_distance_matrix, -) -from cudf._lib.nvtext.generate_ngrams import ( - generate_character_ngrams as cpp_generate_character_ngrams, - generate_ngrams as cpp_generate_ngrams, -) -from cudf._lib.nvtext.ngrams_tokenize import ( - 
ngrams_tokenize as cpp_ngrams_tokenize, -) -from cudf._lib.nvtext.normalize import ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) -from cudf._lib.nvtext.replace import ( - filter_tokens as cpp_filter_tokens, - replace_tokens as cpp_replace_tokens, -) -from cudf._lib.nvtext.stemmer import ( - LetterType, - is_letter as cpp_is_letter, - is_letter_multi as cpp_is_letter_multi, - porter_stemmer_measure as cpp_porter_stemmer_measure, -) -from cudf._lib.nvtext.subword_tokenize import ( - subword_tokenize_vocab_file as cpp_subword_tokenize_vocab_file, -) -from cudf._lib.nvtext.tokenize import ( - _count_tokens_column as cpp_count_tokens_column, - _count_tokens_scalar as cpp_count_tokens_scalar, - _tokenize_column as cpp_tokenize_column, - _tokenize_scalar as cpp_tokenize_scalar, - character_tokenize as cpp_character_tokenize, - detokenize as cpp_detokenize, -) -from cudf._lib.strings.attributes import ( - code_points as cpp_code_points, - count_bytes as cpp_count_bytes, - count_characters as cpp_count_characters, -) -from cudf._lib.strings.capitalize import ( - capitalize as cpp_capitalize, - title as cpp_title, -) -from cudf._lib.strings.case import ( - swapcase as cpp_swapcase, - to_lower as cpp_to_lower, - to_upper as cpp_to_upper, -) -from cudf._lib.strings.char_types import ( - filter_alphanum as cpp_filter_alphanum, - is_alnum as cpp_is_alnum, - is_alpha as cpp_is_alpha, - is_decimal as cpp_is_decimal, - is_digit as cpp_is_digit, - is_lower as cpp_is_lower, - is_numeric as cpp_is_numeric, - is_space as cpp_isspace, - is_upper as cpp_is_upper, -) -from cudf._lib.strings.combine import ( - concatenate as cpp_concatenate, - join as cpp_join, - join_lists_with_column as cpp_join_lists_with_column, - join_lists_with_scalar as cpp_join_lists_with_scalar, -) -from cudf._lib.strings.contains import ( - contains_re as cpp_contains_re, - count_re as cpp_count_re, - match_re as cpp_match_re, -) -from 
cudf._lib.strings.convert.convert_fixed_point import ( - to_decimal as cpp_to_decimal, -) -from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float -from cudf._lib.strings.convert.convert_integers import ( - is_integer as cpp_is_integer, -) -from cudf._lib.strings.convert.convert_urls import ( - url_decode as cpp_url_decode, - url_encode as cpp_url_encode, -) -from cudf._lib.strings.extract import extract as cpp_extract -from cudf._lib.strings.find import ( - contains as cpp_contains, - contains_multiple as cpp_contains_multiple, - endswith as cpp_endswith, - endswith_multiple as cpp_endswith_multiple, - find as cpp_find, - rfind as cpp_rfind, - startswith as cpp_startswith, - startswith_multiple as cpp_startswith_multiple, -) -from cudf._lib.strings.findall import findall as cpp_findall -from cudf._lib.strings.json import get_json_object as cpp_get_json_object -from cudf._lib.strings.padding import ( - PadSide, - center as cpp_center, - ljust as cpp_ljust, - pad as cpp_pad, - rjust as cpp_rjust, - zfill as cpp_zfill, -) -from cudf._lib.strings.replace import ( - insert as cpp_string_insert, - replace as cpp_replace, - replace_multi as cpp_replace_multi, - slice_replace as cpp_slice_replace, -) -from cudf._lib.strings.replace_re import ( - replace_multi_re as cpp_replace_multi_re, - replace_re as cpp_replace_re, - replace_with_backrefs as cpp_replace_with_backrefs, -) -from cudf._lib.strings.split.partition import ( - partition as cpp_partition, - rpartition as cpp_rpartition, -) -from cudf._lib.strings.split.split import ( - rsplit as cpp_rsplit, - rsplit_record as cpp_rsplit_record, - split as cpp_split, - split_record as cpp_split_record, -) -from cudf._lib.strings.strip import ( - lstrip as cpp_lstrip, - rstrip as cpp_rstrip, - strip as cpp_strip, -) -from cudf._lib.strings.substring import ( - get as cpp_string_get, - slice_from as cpp_slice_from, - slice_strings as cpp_slice_strings, -) -from cudf._lib.strings.translate import ( - 
filter_characters as cpp_filter_characters, - translate as cpp_translate, -) -from cudf._lib.strings.wrap import wrap as cpp_wrap -from cudf._typing import ColumnLike, Dtype, ScalarLike -from cudf.api.types import is_integer from cudf.core.buffer import Buffer from cudf.core.column import column, datetime -from cudf.core.column.methods import ColumnMethodsMixin, ParentType +from cudf.core.column.methods import ColumnMethods, ParentType from cudf.utils import utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( can_convert_to_column, + is_integer, is_list_dtype, is_scalar, is_string_dtype, @@ -174,7 +43,13 @@ def str_to_boolean(column: StringColumn): """Takes in string column and returns boolean column """ - return (column.str().len() > cudf.Scalar(0, dtype="int8")).fillna(False) + return ( + libstrings.count_characters(column) > cudf.Scalar(0, dtype="int8") + ).fillna(False) + + +if TYPE_CHECKING: + from cudf._typing import ColumnLike, Dtype, ScalarLike, SeriesOrIndex _str_to_numeric_typecast_functions = { @@ -222,8 +97,10 @@ def str_to_boolean(column: StringColumn): } -class StringMethods(ColumnMethodsMixin): - def __init__(self, column, parent=None): +class StringMethods(ColumnMethods): + _column: StringColumn + + def __init__(self, parent): """ Vectorized string functions for Series and Index. @@ -233,15 +110,17 @@ def __init__(self, column, parent=None): inspiration from R’s stringr package. """ value_type = ( - column.dtype.leaf_type if is_list_dtype(column) else column.dtype + parent.dtype.leaf_type + if is_list_dtype(parent.dtype) + else parent.dtype ) if not is_string_dtype(value_type): raise AttributeError( "Can only use .str accessor with string values" ) - super().__init__(column=column, parent=parent) + super().__init__(parent=parent) - def htoi(self) -> ParentType: + def htoi(self) -> SeriesOrIndex: """ Returns integer value represented by each hex string. String is interpretted to have hex (base-16) characters. 
@@ -307,7 +186,7 @@ def __getitem__(self, key): else: return self.get(key) - def len(self) -> ParentType: + def len(self) -> SeriesOrIndex: """ Computes the length of each element in the Series/Index. @@ -327,9 +206,11 @@ def len(self) -> ParentType: dtype: int32 """ - return self._return_or_inplace(cpp_count_characters(self._column)) + return self._return_or_inplace( + libstrings.count_characters(self._column) + ) - def byte_count(self) -> ParentType: + def byte_count(self) -> SeriesOrIndex: """ Computes the number of bytes of each string in the Series/Index. @@ -354,7 +235,7 @@ def byte_count(self) -> ParentType: 2 11 dtype: int32 """ - return self._return_or_inplace(cpp_count_bytes(self._column),) + return self._return_or_inplace(libstrings.count_bytes(self._column),) @overload def cat(self, sep: str = None, na_rep: str = None) -> str: @@ -363,7 +244,7 @@ def cat(self, sep: str = None, na_rep: str = None) -> str: @overload def cat( self, others, sep: str = None, na_rep: str = None - ) -> Union[ParentType, "cudf.core.column.string.StringColumn"]: + ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: ... def cat(self, others=None, sep=None, na_rep=None): @@ -454,13 +335,13 @@ def cat(self, others=None, sep=None, na_rep=None): sep = "" if others is None: - data = cpp_join( + data = libstrings.join( self._column, cudf.Scalar(sep), cudf.Scalar(na_rep, "str"), ) else: other_cols = _get_cols_list(self._parent, others) all_cols = [self._column] + other_cols - data = cpp_concatenate( + data = libstrings.concatenate( cudf.DataFrame( {index: value for index, value in enumerate(all_cols)} ), @@ -480,7 +361,7 @@ def cat(self, others=None, sep=None, na_rep=None): def join( self, sep=None, string_na_rep=None, sep_na_rep=None - ) -> ParentType: + ) -> SeriesOrIndex: """ Join lists contained as elements in the Series/Index with passed delimiter. 
@@ -621,7 +502,7 @@ def join( strings_column = self._split_by_character() if is_scalar(sep): - data = cpp_join_lists_with_scalar( + data = libstrings.join_lists_with_scalar( strings_column, cudf.Scalar(sep), cudf.Scalar(string_na_rep) ) elif can_convert_to_column(sep): @@ -637,7 +518,7 @@ def join( f"of type: {type(sep_na_rep)}" ) - data = cpp_join_lists_with_column( + data = libstrings.join_lists_with_column( strings_column, sep_column, cudf.Scalar(string_na_rep), @@ -652,7 +533,7 @@ def join( return self._return_or_inplace(data) def _split_by_character(self): - result_col = cpp_character_tokenize(self._column) + result_col = libstrings.character_tokenize(self._column) offset_col = self._column.children[0] @@ -668,7 +549,7 @@ def _split_by_character(self): def extract( self, pat: str, flags: int = 0, expand: bool = True - ) -> ParentType: + ) -> SeriesOrIndex: """ Extract capture groups in the regex `pat` as columns in a DataFrame. @@ -726,7 +607,7 @@ def extract( if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") - out = cpp_extract(self._column, pat) + out = libstrings.extract(self._column, pat) if out._num_columns == 1 and expand is False: return self._return_or_inplace(out._columns[0], expand=expand) else: @@ -739,7 +620,7 @@ def contains( flags: int = 0, na=np.nan, regex: bool = True, - ) -> ParentType: + ) -> SeriesOrIndex: """ Test if pattern or regex is contained within a string of a Series or Index. 
@@ -857,13 +738,13 @@ def contains( ) elif is_scalar(pat): if regex is True: - result_col = cpp_contains_re(self._column, pat) + result_col = libstrings.contains_re(self._column, pat) else: - result_col = cpp_contains( + result_col = libstrings.contains( self._column, cudf.Scalar(pat, "str") ) else: - result_col = cpp_contains_multiple( + result_col = libstrings.contains_multiple( self._column, column.as_column(pat, dtype="str") ) return self._return_or_inplace(result_col) @@ -876,7 +757,7 @@ def replace( case=None, flags: int = 0, regex: bool = True, - ) -> ParentType: + ) -> SeriesOrIndex: """ Replace occurrences of pattern/regex in the Series/Index with some other string. Equivalent to `str.replace() @@ -952,11 +833,11 @@ def replace( ) return self._return_or_inplace( - cpp_replace_multi_re( - self._column, pat, column.as_column(repl, dtype="str") + libstrings.replace_multi_re( + self._column, pat, column.as_column(repl, dtype="str"), ) if regex - else cpp_replace_multi( + else libstrings.replace_multi( self._column, column.as_column(pat, dtype="str"), column.as_column(repl, dtype="str"), @@ -968,9 +849,11 @@ def replace( # Pandas forces non-regex replace when pat is a single-character return self._return_or_inplace( - cpp_replace_re(self._column, pat, cudf.Scalar(repl, "str"), n) + libstrings.replace_re( + self._column, pat, cudf.Scalar(repl, "str"), n + ) if regex is True and len(pat) > 1 - else cpp_replace( + else libstrings.replace( self._column, cudf.Scalar(pat, "str"), cudf.Scalar(repl, "str"), @@ -978,7 +861,7 @@ def replace( ), ) - def replace_with_backrefs(self, pat: str, repl: str) -> ParentType: + def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: """ Use the ``repl`` back-ref template to create a new string with the extracted elements found using the ``pat`` expression. 
@@ -1005,12 +888,12 @@ def replace_with_backrefs(self, pat: str, repl: str) -> ParentType: dtype: object """ return self._return_or_inplace( - cpp_replace_with_backrefs(self._column, pat, repl) + libstrings.replace_with_backrefs(self._column, pat, repl) ) def slice( self, start: int = None, stop: int = None, step: int = None - ) -> ParentType: + ) -> SeriesOrIndex: """ Slice substrings from each element in the Series or Index. @@ -1076,10 +959,10 @@ def slice( """ return self._return_or_inplace( - cpp_slice_strings(self._column, start, stop, step), + libstrings.slice_strings(self._column, start, stop, step), ) - def isinteger(self) -> ParentType: + def isinteger(self) -> SeriesOrIndex: """ Check whether all characters in each string form integer. @@ -1137,9 +1020,9 @@ def isinteger(self) -> ParentType: 2 False dtype: bool """ - return self._return_or_inplace(cpp_is_integer(self._column)) + return self._return_or_inplace(libstrings.is_integer(self._column)) - def ishex(self) -> ParentType: + def ishex(self) -> SeriesOrIndex: """ Check whether all characters in each string form a hex integer. @@ -1178,7 +1061,7 @@ def ishex(self) -> ParentType: """ return self._return_or_inplace(str_cast.is_hex(self._column)) - def istimestamp(self, format: str) -> ParentType: + def istimestamp(self, format: str) -> SeriesOrIndex: """ Check whether all characters in each string can be converted to a timestamp using the given format. @@ -1202,7 +1085,7 @@ def istimestamp(self, format: str) -> ParentType: str_cast.istimestamp(self._column, format) ) - def isfloat(self) -> ParentType: + def isfloat(self) -> SeriesOrIndex: """ Check whether all characters in each string form floating value. 
@@ -1263,9 +1146,9 @@ def isfloat(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_float(self._column)) + return self._return_or_inplace(libstrings.is_float(self._column)) - def isdecimal(self) -> ParentType: + def isdecimal(self) -> SeriesOrIndex: """ Check whether all characters in each string are decimal. @@ -1324,9 +1207,9 @@ def isdecimal(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_decimal(self._column)) + return self._return_or_inplace(libstrings.is_decimal(self._column)) - def isalnum(self) -> ParentType: + def isalnum(self) -> SeriesOrIndex: """ Check whether all characters in each string are alphanumeric. @@ -1393,9 +1276,9 @@ def isalnum(self) -> ParentType: 2 False dtype: bool """ - return self._return_or_inplace(cpp_is_alnum(self._column)) + return self._return_or_inplace(libstrings.is_alnum(self._column)) - def isalpha(self) -> ParentType: + def isalpha(self) -> SeriesOrIndex: """ Check whether all characters in each string are alphabetic. @@ -1449,9 +1332,9 @@ def isalpha(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_alpha(self._column)) + return self._return_or_inplace(libstrings.is_alpha(self._column)) - def isdigit(self) -> ParentType: + def isdigit(self) -> SeriesOrIndex: """ Check whether all characters in each string are digits. @@ -1511,9 +1394,9 @@ def isdigit(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_digit(self._column)) + return self._return_or_inplace(libstrings.is_digit(self._column)) - def isnumeric(self) -> ParentType: + def isnumeric(self) -> SeriesOrIndex: """ Check whether all characters in each string are numeric. 
@@ -1579,9 +1462,9 @@ def isnumeric(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_numeric(self._column)) + return self._return_or_inplace(libstrings.is_numeric(self._column)) - def isupper(self) -> ParentType: + def isupper(self) -> SeriesOrIndex: """ Check whether all characters in each string are uppercase. @@ -1636,9 +1519,9 @@ def isupper(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_upper(self._column)) + return self._return_or_inplace(libstrings.is_upper(self._column)) - def islower(self) -> ParentType: + def islower(self) -> SeriesOrIndex: """ Check whether all characters in each string are lowercase. @@ -1693,9 +1576,9 @@ def islower(self) -> ParentType: 3 False dtype: bool """ - return self._return_or_inplace(cpp_is_lower(self._column)) + return self._return_or_inplace(libstrings.is_lower(self._column)) - def isipv4(self) -> ParentType: + def isipv4(self) -> SeriesOrIndex: """ Check whether all characters in each string form an IPv4 address. @@ -1719,7 +1602,7 @@ def isipv4(self) -> ParentType: """ return self._return_or_inplace(str_cast.is_ipv4(self._column)) - def lower(self) -> ParentType: + def lower(self) -> SeriesOrIndex: """ Converts all characters to lowercase. @@ -1756,9 +1639,9 @@ def lower(self) -> ParentType: 3 swapcase dtype: object """ - return self._return_or_inplace(cpp_to_lower(self._column)) + return self._return_or_inplace(libstrings.to_lower(self._column)) - def upper(self) -> ParentType: + def upper(self) -> SeriesOrIndex: """ Convert each string to uppercase. This only applies to ASCII characters at this time. @@ -1805,9 +1688,9 @@ def upper(self) -> ParentType: 3 SWAPCASE dtype: object """ - return self._return_or_inplace(cpp_to_upper(self._column)) + return self._return_or_inplace(libstrings.to_upper(self._column)) - def capitalize(self) -> ParentType: + def capitalize(self) -> SeriesOrIndex: """ Convert strings in the Series/Index to be capitalized. 
This only applies to ASCII characters at this time. @@ -1833,9 +1716,9 @@ def capitalize(self) -> ParentType: 1 Goodbye, friend dtype: object """ - return self._return_or_inplace(cpp_capitalize(self._column)) + return self._return_or_inplace(libstrings.capitalize(self._column)) - def swapcase(self) -> ParentType: + def swapcase(self) -> SeriesOrIndex: """ Change each lowercase character to uppercase and vice versa. This only applies to ASCII characters at this time. @@ -1878,9 +1761,9 @@ def swapcase(self) -> ParentType: 3 sWaPcAsE dtype: object """ - return self._return_or_inplace(cpp_swapcase(self._column)) + return self._return_or_inplace(libstrings.swapcase(self._column)) - def title(self) -> ParentType: + def title(self) -> SeriesOrIndex: """ Uppercase the first letter of each letter after a space and lowercase the rest. @@ -1923,11 +1806,11 @@ def title(self) -> ParentType: 3 Swapcase dtype: object """ - return self._return_or_inplace(cpp_title(self._column)) + return self._return_or_inplace(libstrings.title(self._column)) def filter_alphanum( self, repl: str = None, keep: bool = True - ) -> ParentType: + ) -> SeriesOrIndex: """ Remove non-alphanumeric characters from strings in this column. @@ -1959,12 +1842,12 @@ def filter_alphanum( repl = "" return self._return_or_inplace( - cpp_filter_alphanum(self._column, cudf.Scalar(repl), keep), + libstrings.filter_alphanum(self._column, cudf.Scalar(repl), keep), ) def slice_from( self, starts: "cudf.Series", stops: "cudf.Series" - ) -> ParentType: + ) -> SeriesOrIndex: """ Return substring of each string using positions for each string. 
@@ -2002,14 +1885,16 @@ def slice_from( """ return self._return_or_inplace( - cpp_slice_from( - self._column, column.as_column(starts), column.as_column(stops) + libstrings.slice_from( + self._column, + column.as_column(starts), + column.as_column(stops), ), ) def slice_replace( self, start: int = None, stop: int = None, repl: str = None - ) -> ParentType: + ) -> SeriesOrIndex: """ Replace the specified section of each string with a new string. @@ -2091,10 +1976,12 @@ def slice_replace( repl = "" return self._return_or_inplace( - cpp_slice_replace(self._column, start, stop, cudf.Scalar(repl)), + libstrings.slice_replace( + self._column, start, stop, cudf.Scalar(repl) + ), ) - def insert(self, start: int = 0, repl: str = None) -> ParentType: + def insert(self, start: int = 0, repl: str = None) -> SeriesOrIndex: """ Insert the specified string into each string in the specified position. @@ -2141,10 +2028,10 @@ def insert(self, start: int = 0, repl: str = None) -> ParentType: repl = "" return self._return_or_inplace( - cpp_string_insert(self._column, start, cudf.Scalar(repl)), + libstrings.insert(self._column, start, cudf.Scalar(repl)), ) - def get(self, i: int = 0) -> ParentType: + def get(self, i: int = 0) -> SeriesOrIndex: """ Extract element from each component at specified position. @@ -2186,7 +2073,7 @@ def get(self, i: int = 0) -> ParentType: dtype: object """ - return self._return_or_inplace(cpp_string_get(self._column, i)) + return self._return_or_inplace(libstrings.get(self._column, i)) def get_json_object(self, json_path): """ @@ -2239,7 +2126,7 @@ def get_json_object(self, json_path): try: res = self._return_or_inplace( - cpp_get_json_object( + libstrings.get_json_object( self._column, cudf.Scalar(json_path, "str") ) ) @@ -2256,7 +2143,7 @@ def get_json_object(self, json_path): def split( self, pat: str = None, n: int = -1, expand: bool = None - ) -> ParentType: + ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. 
@@ -2387,14 +2274,14 @@ def split( if self._column.null_count == len(self._column): result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: - result_table = cpp_split( + result_table = libstrings.split( self._column, cudf.Scalar(pat, "str"), n ) if len(result_table._data) == 1: if result_table._data[0].null_count == len(self._column): result_table = cudf.core.frame.Frame({}) else: - result_table = cpp_split_record( + result_table = libstrings.split_record( self._column, cudf.Scalar(pat, "str"), n ) @@ -2402,7 +2289,7 @@ def split( def rsplit( self, pat: str = None, n: int = -1, expand: bool = None - ) -> ParentType: + ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. @@ -2542,16 +2429,20 @@ def rsplit( if self._column.null_count == len(self._column): result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: - result_table = cpp_rsplit(self._column, cudf.Scalar(pat), n) + result_table = libstrings.rsplit( + self._column, cudf.Scalar(pat), n + ) if len(result_table._data) == 1: if result_table._data[0].null_count == len(self._column): result_table = cudf.core.frame.Frame({}) else: - result_table = cpp_rsplit_record(self._column, cudf.Scalar(pat), n) + result_table = libstrings.rsplit_record( + self._column, cudf.Scalar(pat), n + ) return self._return_or_inplace(result_table, expand=expand) - def partition(self, sep: str = " ", expand: bool = True) -> ParentType: + def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: """ Split the string at the first occurrence of sep. 
@@ -2628,10 +2519,11 @@ def partition(self, sep: str = " ", expand: bool = True) -> ParentType: sep = " " return self._return_or_inplace( - cpp_partition(self._column, cudf.Scalar(sep)), expand=expand + libstrings.partition(self._column, cudf.Scalar(sep)), + expand=expand, ) - def rpartition(self, sep: str = " ", expand: bool = True) -> ParentType: + def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: """ Split the string at the last occurrence of sep. @@ -2692,12 +2584,13 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> ParentType: sep = " " return self._return_or_inplace( - cpp_rpartition(self._column, cudf.Scalar(sep)), expand=expand + libstrings.rpartition(self._column, cudf.Scalar(sep)), + expand=expand, ) def pad( self, width: int, side: str = "left", fillchar: str = " " - ) -> ParentType: + ) -> SeriesOrIndex: """ Pad strings in the Series/Index up to width. @@ -2772,17 +2665,17 @@ def pad( raise TypeError(msg) try: - side = PadSide[side.upper()] + side = libstrings.PadSide[side.upper()] except KeyError: raise ValueError( "side has to be either one of {‘left’, ‘right’, ‘both’}" ) return self._return_or_inplace( - cpp_pad(self._column, width, fillchar, side) + libstrings.pad(self._column, width, fillchar, side) ) - def zfill(self, width: int) -> ParentType: + def zfill(self, width: int) -> SeriesOrIndex: """ Pad strings in the Series/Index by prepending ‘0’ characters. @@ -2853,9 +2746,9 @@ def zfill(self, width: int) -> ParentType: msg = f"width must be of integer type, not {type(width).__name__}" raise TypeError(msg) - return self._return_or_inplace(cpp_zfill(self._column, width)) + return self._return_or_inplace(libstrings.zfill(self._column, width)) - def center(self, width: int, fillchar: str = " ") -> ParentType: + def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ Filling left and right side of strings in the Series/Index with an additional character. 
@@ -2924,10 +2817,10 @@ def center(self, width: int, fillchar: str = " ") -> ParentType: raise TypeError(msg) return self._return_or_inplace( - cpp_center(self._column, width, fillchar) + libstrings.center(self._column, width, fillchar) ) - def ljust(self, width: int, fillchar: str = " ") -> ParentType: + def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ Filling right side of strings in the Series/Index with an additional character. Equivalent to `str.ljust() @@ -2978,10 +2871,10 @@ def ljust(self, width: int, fillchar: str = " ") -> ParentType: raise TypeError(msg) return self._return_or_inplace( - cpp_ljust(self._column, width, fillchar) + libstrings.ljust(self._column, width, fillchar) ) - def rjust(self, width: int, fillchar: str = " ") -> ParentType: + def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: """ Filling left side of strings in the Series/Index with an additional character. Equivalent to `str.rjust() @@ -3032,10 +2925,10 @@ def rjust(self, width: int, fillchar: str = " ") -> ParentType: raise TypeError(msg) return self._return_or_inplace( - cpp_rjust(self._column, width, fillchar) + libstrings.rjust(self._column, width, fillchar) ) - def strip(self, to_strip: str = None) -> ParentType: + def strip(self, to_strip: str = None) -> SeriesOrIndex: """ Remove leading and trailing characters. @@ -3091,10 +2984,10 @@ def strip(self, to_strip: str = None) -> ParentType: to_strip = "" return self._return_or_inplace( - cpp_strip(self._column, cudf.Scalar(to_strip)) + libstrings.strip(self._column, cudf.Scalar(to_strip)) ) - def lstrip(self, to_strip: str = None) -> ParentType: + def lstrip(self, to_strip: str = None) -> SeriesOrIndex: """ Remove leading and trailing characters. 
@@ -3138,10 +3031,10 @@ def lstrip(self, to_strip: str = None) -> ParentType: to_strip = "" return self._return_or_inplace( - cpp_lstrip(self._column, cudf.Scalar(to_strip)) + libstrings.lstrip(self._column, cudf.Scalar(to_strip)) ) - def rstrip(self, to_strip: str = None) -> ParentType: + def rstrip(self, to_strip: str = None) -> SeriesOrIndex: """ Remove leading and trailing characters. @@ -3193,10 +3086,10 @@ def rstrip(self, to_strip: str = None) -> ParentType: to_strip = "" return self._return_or_inplace( - cpp_rstrip(self._column, cudf.Scalar(to_strip)) + libstrings.rstrip(self._column, cudf.Scalar(to_strip)) ) - def wrap(self, width: int, **kwargs) -> ParentType: + def wrap(self, width: int, **kwargs) -> SeriesOrIndex: """ Wrap long strings in the Series/Index to be formatted in paragraphs with length less than a given width. @@ -3288,9 +3181,9 @@ def wrap(self, width: int, **kwargs) -> ParentType: "`break_on_hyphens`=False" ) - return self._return_or_inplace(cpp_wrap(self._column, width)) + return self._return_or_inplace(libstrings.wrap(self._column, width)) - def count(self, pat: str, flags: int = 0) -> ParentType: + def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: """ Count occurrences of pattern in each string of the Series/Index. @@ -3348,11 +3241,11 @@ def count(self, pat: str, flags: int = 0) -> ParentType: if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") - return self._return_or_inplace(cpp_count_re(self._column, pat)) + return self._return_or_inplace(libstrings.count_re(self._column, pat)) def findall( self, pat: str, flags: int = 0, expand: bool = True - ) -> ParentType: + ) -> SeriesOrIndex: """ Find all occurrences of pattern or regular expression in the Series/Index. 
@@ -3417,10 +3310,10 @@ def findall( raise NotImplementedError("`flags` parameter is not yet supported") return self._return_or_inplace( - cpp_findall(self._column, pat), expand=expand + libstrings.findall(self._column, pat), expand=expand ) - def isempty(self) -> ParentType: + def isempty(self) -> SeriesOrIndex: """ Check whether each string is an empty string. @@ -3442,7 +3335,7 @@ def isempty(self) -> ParentType: """ return self._return_or_inplace((self._column == "").fillna(False)) - def isspace(self) -> ParentType: + def isspace(self) -> SeriesOrIndex: """ Check whether all characters in each string are whitespace. @@ -3496,9 +3389,9 @@ def isspace(self) -> ParentType: 2 False dtype: bool """ - return self._return_or_inplace(cpp_isspace(self._column)) + return self._return_or_inplace(libstrings.is_space(self._column)) - def endswith(self, pat: str) -> ParentType: + def endswith(self, pat: str) -> SeriesOrIndex: """ Test if the end of each string element matches a pattern. @@ -3544,15 +3437,17 @@ def endswith(self, pat: str) -> ParentType: len(self._column), dtype="bool", masked=True ) elif is_scalar(pat): - result_col = cpp_endswith(self._column, cudf.Scalar(pat, "str")) + result_col = libstrings.endswith( + self._column, cudf.Scalar(pat, "str") + ) else: - result_col = cpp_endswith_multiple( + result_col = libstrings.endswith_multiple( self._column, column.as_column(pat, dtype="str") ) return self._return_or_inplace(result_col) - def startswith(self, pat: Union[str, Sequence]) -> ParentType: + def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex: """ Test if the start of each string element matches a pattern. 
@@ -3604,15 +3499,17 @@ def startswith(self, pat: Union[str, Sequence]) -> ParentType: len(self._column), dtype="bool", masked=True ) elif is_scalar(pat): - result_col = cpp_startswith(self._column, cudf.Scalar(pat, "str")) + result_col = libstrings.startswith( + self._column, cudf.Scalar(pat, "str") + ) else: - result_col = cpp_startswith_multiple( + result_col = libstrings.startswith_multiple( self._column, column.as_column(pat, dtype="str") ) return self._return_or_inplace(result_col) - def find(self, sub: str, start: int = 0, end: int = None) -> ParentType: + def find(self, sub: str, start: int = 0, end: int = None) -> SeriesOrIndex: """ Return lowest indexes in each strings in the Series/Index where the substring is fully contained between ``[start:end]``. @@ -3661,13 +3558,15 @@ def find(self, sub: str, start: int = 0, end: int = None) -> ParentType: if end is None: end = -1 - result_col = cpp_find( + result_col = libstrings.find( self._column, cudf.Scalar(sub, "str"), start, end ) return self._return_or_inplace(result_col) - def rfind(self, sub: str, start: int = 0, end: int = None) -> ParentType: + def rfind( + self, sub: str, start: int = 0, end: int = None + ) -> SeriesOrIndex: """ Return highest indexes in each strings in the Series/Index where the substring is fully contained between ``[start:end]``. @@ -3720,13 +3619,15 @@ def rfind(self, sub: str, start: int = 0, end: int = None) -> ParentType: if end is None: end = -1 - result_col = cpp_rfind( + result_col = libstrings.rfind( self._column, cudf.Scalar(sub, "str"), start, end ) return self._return_or_inplace(result_col) - def index(self, sub: str, start: int = 0, end: int = None) -> ParentType: + def index( + self, sub: str, start: int = 0, end: int = None + ) -> SeriesOrIndex: """ Return lowest indexes in each strings where the substring is fully contained between ``[start:end]``. 
This is the same @@ -3775,7 +3676,7 @@ def index(self, sub: str, start: int = 0, end: int = None) -> ParentType: if end is None: end = -1 - result_col = cpp_find( + result_col = libstrings.find( self._column, cudf.Scalar(sub, "str"), start, end ) @@ -3786,7 +3687,9 @@ def index(self, sub: str, start: int = 0, end: int = None) -> ParentType: else: return result - def rindex(self, sub: str, start: int = 0, end: int = None) -> ParentType: + def rindex( + self, sub: str, start: int = 0, end: int = None + ) -> SeriesOrIndex: """ Return highest indexes in each strings where the substring is fully contained between ``[start:end]``. This is the same @@ -3835,7 +3738,7 @@ def rindex(self, sub: str, start: int = 0, end: int = None) -> ParentType: if end is None: end = -1 - result_col = cpp_rfind( + result_col = libstrings.rfind( self._column, cudf.Scalar(sub, "str"), start, end ) @@ -3846,7 +3749,9 @@ def rindex(self, sub: str, start: int = 0, end: int = None) -> ParentType: else: return result - def match(self, pat: str, case: bool = True, flags: int = 0) -> ParentType: + def match( + self, pat: str, case: bool = True, flags: int = 0 + ) -> SeriesOrIndex: """ Determine if each string matches a regular expression. @@ -3889,9 +3794,9 @@ def match(self, pat: str, case: bool = True, flags: int = 0) -> ParentType: if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") - return self._return_or_inplace(cpp_match_re(self._column, pat)) + return self._return_or_inplace(libstrings.match_re(self._column, pat)) - def url_decode(self) -> ParentType: + def url_decode(self) -> SeriesOrIndex: """ Returns a URL-decoded format of each string. No format checking is performed. 
All characters @@ -3919,9 +3824,9 @@ def url_decode(self) -> ParentType: dtype: object """ - return self._return_or_inplace(cpp_url_decode(self._column)) + return self._return_or_inplace(libstrings.url_decode(self._column)) - def url_encode(self) -> ParentType: + def url_encode(self) -> SeriesOrIndex: """ Returns a URL-encoded format of each string. No format checking is performed. @@ -3950,9 +3855,9 @@ def url_encode(self) -> ParentType: 1 https%3A%2F%2Fmedium.com%2Frapids-ai dtype: object """ - return self._return_or_inplace(cpp_url_encode(self._column)) + return self._return_or_inplace(libstrings.url_encode(self._column)) - def code_points(self) -> ParentType: + def code_points(self) -> SeriesOrIndex: """ Returns an array by filling it with the UTF-8 code point values for each character of each string. @@ -3984,7 +3889,7 @@ def code_points(self) -> ParentType: dtype: int32 """ - new_col = cpp_code_points(self._column) + new_col = libstrings.code_points(self._column) if isinstance(self._parent, cudf.Series): return cudf.Series(new_col, name=self._parent.name) elif isinstance(self._parent, cudf.BaseIndex): @@ -3992,7 +3897,7 @@ def code_points(self) -> ParentType: else: return new_col - def translate(self, table: dict) -> ParentType: + def translate(self, table: dict) -> SeriesOrIndex: """ Map all characters in the string through the given mapping table. @@ -4033,11 +3938,13 @@ def translate(self, table: dict) -> ParentType: dtype: object """ table = str.maketrans(table) - return self._return_or_inplace(cpp_translate(self._column, table)) + return self._return_or_inplace( + libstrings.translate(self._column, table) + ) def filter_characters( self, table: dict, keep: bool = True, repl: str = None - ) -> ParentType: + ) -> SeriesOrIndex: """ Remove characters from each string using the character ranges in the given mapping table. 
@@ -4083,12 +3990,12 @@ def filter_characters( repl = "" table = str.maketrans(table) return self._return_or_inplace( - cpp_filter_characters( + libstrings.filter_characters( self._column, table, keep, cudf.Scalar(repl) ), ) - def normalize_spaces(self) -> ParentType: + def normalize_spaces(self) -> SeriesOrIndex: """ Remove extra whitespace between tokens and trim whitespace from the beginning and the end of each string. @@ -4106,9 +4013,11 @@ def normalize_spaces(self) -> ParentType: 1 test string dtype: object """ - return self._return_or_inplace(cpp_normalize_spaces(self._column)) + return self._return_or_inplace( + libstrings.normalize_spaces(self._column) + ) - def normalize_characters(self, do_lower: bool = True) -> ParentType: + def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: """ Normalizes strings characters for tokenizing. @@ -4154,10 +4063,10 @@ def normalize_characters(self, do_lower: bool = True) -> ParentType: dtype: object """ return self._return_or_inplace( - cpp_normalize_characters(self._column, do_lower) + libstrings.normalize_characters(self._column, do_lower) ) - def tokenize(self, delimiter: str = " ") -> ParentType: + def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: """ Each string is split into tokens using the provided delimiter(s). 
The sequence returned contains the tokens in the order @@ -4190,12 +4099,12 @@ def tokenize(self, delimiter: str = " ") -> ParentType: if isinstance(delimiter, Column): return self._return_or_inplace( - cpp_tokenize_column(self._column, delimiter), + libstrings._tokenize_column(self._column, delimiter), retain_index=False, ) elif isinstance(delimiter, cudf.Scalar): return self._return_or_inplace( - cpp_tokenize_scalar(self._column, delimiter), + libstrings._tokenize_scalar(self._column, delimiter), retain_index=False, ) else: @@ -4206,7 +4115,7 @@ def tokenize(self, delimiter: str = " ") -> ParentType: def detokenize( self, indices: "cudf.Series", separator: str = " " - ) -> ParentType: + ) -> SeriesOrIndex: """ Combines tokens into strings by concatenating them in the order in which they appear in the ``indices`` column. The ``separator`` is @@ -4237,11 +4146,11 @@ def detokenize( """ separator = _massage_string_arg(separator, "separator") return self._return_or_inplace( - cpp_detokenize(self._column, indices._column, separator), + libstrings.detokenize(self._column, indices._column, separator), retain_index=False, ) - def character_tokenize(self) -> ParentType: + def character_tokenize(self) -> SeriesOrIndex: """ Each string is split into individual characters. The sequence returned contains each character as an individual string. @@ -4288,7 +4197,7 @@ def character_tokenize(self) -> ParentType: 29 . dtype: object """ - result_col = cpp_character_tokenize(self._column) + result_col = libstrings.character_tokenize(self._column) if isinstance(self._parent, cudf.Series): return cudf.Series(result_col, name=self._parent.name) elif isinstance(self._parent, cudf.BaseIndex): @@ -4296,7 +4205,7 @@ def character_tokenize(self) -> ParentType: else: return result_col - def token_count(self, delimiter: str = " ") -> ParentType: + def token_count(self, delimiter: str = " ") -> SeriesOrIndex: """ Each string is split into tokens using the provided delimiter. 
The returned integer sequence is the number of tokens in each string. @@ -4325,12 +4234,12 @@ def token_count(self, delimiter: str = " ") -> ParentType: delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) if isinstance(delimiter, Column): return self._return_or_inplace( - cpp_count_tokens_column(self._column, delimiter) + libstrings._count_tokens_column(self._column, delimiter) ) elif isinstance(delimiter, cudf.Scalar): return self._return_or_inplace( - cpp_count_tokens_scalar(self._column, delimiter) + libstrings._count_tokens_scalar(self._column, delimiter) ) else: raise TypeError( @@ -4338,7 +4247,7 @@ def token_count(self, delimiter: str = " ") -> ParentType: for delimiters, but got {type(delimiter)}" ) - def ngrams(self, n: int = 2, separator: str = "_") -> ParentType: + def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: """ Generate the n-grams from a set of tokens, each record in series is treated a token. @@ -4372,10 +4281,11 @@ def ngrams(self, n: int = 2, separator: str = "_") -> ParentType: """ separator = _massage_string_arg(separator, "separator") return self._return_or_inplace( - cpp_generate_ngrams(self._column, n, separator), retain_index=False + libstrings.generate_ngrams(self._column, n, separator), + retain_index=False, ) - def character_ngrams(self, n: int = 2) -> ParentType: + def character_ngrams(self, n: int = 2) -> SeriesOrIndex: """ Generate the n-grams from characters in a column of strings. @@ -4408,12 +4318,13 @@ def character_ngrams(self, n: int = 2) -> ParentType: dtype: object """ return self._return_or_inplace( - cpp_generate_character_ngrams(self._column, n), retain_index=False + libstrings.generate_character_ngrams(self._column, n), + retain_index=False, ) def ngrams_tokenize( self, n: int = 2, delimiter: str = " ", separator: str = "_" - ) -> ParentType: + ) -> SeriesOrIndex: """ Generate the n-grams using tokens from each string. 
This will tokenize each string and then generate ngrams for each @@ -4445,13 +4356,13 @@ def ngrams_tokenize( delimiter = _massage_string_arg(delimiter, "delimiter") separator = _massage_string_arg(separator, "separator") return self._return_or_inplace( - cpp_ngrams_tokenize(self._column, n, delimiter, separator), + libstrings.ngrams_tokenize(self._column, n, delimiter, separator), retain_index=False, ) def replace_tokens( self, targets, replacements, delimiter: str = None - ) -> ParentType: + ) -> SeriesOrIndex: """ The targets tokens are searched for within each string in the series and replaced with the corresponding replacements if found. @@ -4524,7 +4435,7 @@ def replace_tokens( ) return self._return_or_inplace( - cpp_replace_tokens( + libstrings.replace_tokens( self._column, targets_column, replacements_column, @@ -4537,7 +4448,7 @@ def filter_tokens( min_token_length: int, replacement: str = None, delimiter: str = None, - ) -> ParentType: + ) -> SeriesOrIndex: """ Remove tokens from within each string in the series that are smaller than min_token_length and optionally replace them @@ -4595,7 +4506,7 @@ def filter_tokens( ) return self._return_or_inplace( - cpp_filter_tokens( + libstrings.filter_tokens( self._column, min_token_length, cudf.Scalar(replacement, dtype="str"), @@ -4702,7 +4613,7 @@ def subword_tokenize( ) warnings.warn(warning_message, FutureWarning) - tokens, masks, metadata = cpp_subword_tokenize_vocab_file( + tokens, masks, metadata = libstrings.subword_tokenize_vocab_file( self._column, hash_file, max_length, @@ -4717,7 +4628,7 @@ def subword_tokenize( cupy.asarray(metadata), ) - def porter_stemmer_measure(self) -> ParentType: + def porter_stemmer_measure(self) -> SeriesOrIndex: """ Compute the Porter Stemmer measure for each string. 
The Porter Stemmer algorithm is described `here @@ -4737,10 +4648,10 @@ def porter_stemmer_measure(self) -> ParentType: dtype: int32 """ return self._return_or_inplace( - cpp_porter_stemmer_measure(self._column) + libstrings.porter_stemmer_measure(self._column) ) - def is_consonant(self, position) -> ParentType: + def is_consonant(self, position) -> SeriesOrIndex: """ Return true for strings where the character at ``position`` is a consonant. The ``position`` parameter may also be a list of integers @@ -4771,20 +4682,20 @@ def is_consonant(self, position) -> ParentType: 1 False dtype: bool """ - ltype = LetterType.CONSONANT + ltype = libstrings.LetterType.CONSONANT if can_convert_to_column(position): return self._return_or_inplace( - cpp_is_letter_multi( + libstrings.is_letter_multi( self._column, ltype, column.as_column(position) ), ) return self._return_or_inplace( - cpp_is_letter(self._column, ltype, position) + libstrings.is_letter(self._column, ltype, position) ) - def is_vowel(self, position) -> ParentType: + def is_vowel(self, position) -> SeriesOrIndex: """ Return true for strings where the character at ``position`` is a vowel -- not a consonant. The ``position`` parameter may also be @@ -4815,20 +4726,20 @@ def is_vowel(self, position) -> ParentType: 1 True dtype: bool """ - ltype = LetterType.VOWEL + ltype = libstrings.LetterType.VOWEL if can_convert_to_column(position): return self._return_or_inplace( - cpp_is_letter_multi( + libstrings.is_letter_multi( self._column, ltype, column.as_column(position) ), ) return self._return_or_inplace( - cpp_is_letter(self._column, ltype, position) + libstrings.is_letter(self._column, ltype, position) ) - def edit_distance(self, targets) -> ParentType: + def edit_distance(self, targets) -> SeriesOrIndex: """ The ``targets`` strings are measured against the strings in this instance using the Levenshtein edit distance algorithm. 
@@ -4874,7 +4785,7 @@ def edit_distance(self, targets) -> ParentType: ) return self._return_or_inplace( - cpp_edit_distance(self._column, targets_column) + libstrings.edit_distance(self._column, targets_column) ) def edit_distance_matrix(self) -> ParentType: @@ -4916,7 +4827,9 @@ def edit_distance_matrix(self) -> ParentType: "Cannot compute edit distance between null strings. " "Consider removing them using `dropna` or fill with `fillna`." ) - return self._return_or_inplace(cpp_edit_distance_matrix(self._column)) + return self._return_or_inplace( + libstrings.edit_distance_matrix(self._column) + ) def _massage_string_arg(value, name, allow_col=False): @@ -5104,7 +5017,11 @@ def sum( skipna=skipna, min_count=min_count ) if isinstance(result_col, type(self)): - return result_col.str().cat() + return libstrings.join( + result_col, + sep=cudf.Scalar(""), + na_rep=cudf.Scalar(None, "str"), + )[0] else: return result_col @@ -5125,10 +5042,7 @@ def set_base_children(self, value: Tuple["column.ColumnBase", ...]): super().set_base_children(value) def __contains__(self, item: ScalarLike) -> bool: - return True in self.str().contains(f"^{item}$") - - def str(self, parent: ParentType = None) -> StringMethods: - return StringMethods(self, parent=parent) + return True in libstrings.contains_re(self, f"^{item}$") def as_numerical_column( self, dtype: Dtype, **kwargs @@ -5136,13 +5050,13 @@ def as_numerical_column( out_dtype = np.dtype(dtype) if out_dtype.kind in {"i", "u"}: - if not cpp_is_integer(self).all(): + if not libstrings.is_integer(self).all(): raise ValueError( "Could not convert strings to integer " "type due to presence of non-integer values." ) elif out_dtype.kind == "f": - if not cpp_is_float(self).all(): + if not libstrings.is_float(self).all(): raise ValueError( "Could not convert strings to float " "type due to presence of non-floating values." 
@@ -5206,7 +5120,7 @@ def as_timedelta_column( def as_decimal_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.Decimal64Column": - return cpp_to_decimal(self, dtype) + return libstrings.to_decimal(self, dtype) def as_string_column( self, dtype: Dtype, format=None, **kwargs @@ -5315,9 +5229,12 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: if self.dtype == to_dtype: return True - elif to_dtype.kind in {"i", "u"} and not cpp_is_integer(self).all(): + elif ( + to_dtype.kind in {"i", "u"} + and not libstrings.is_integer(self).all() + ): return False - elif to_dtype.kind == "f" and not cpp_is_float(self).all(): + elif to_dtype.kind == "f" and not libstrings.is_float(self).all(): return False else: return True @@ -5377,7 +5294,7 @@ def fillna( return super().fillna(method=method) def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: - found_indices = self.str().contains(f"^{value}$") + found_indices = libstrings.contains_re(self, f"^{value}$") found_indices = libcudf.unary.cast(found_indices, dtype=np.int32) first = column.as_column(found_indices).find_first_value(1) last = column.as_column(found_indices).find_last_value(1) @@ -5421,10 +5338,17 @@ def binary_operator( lhs, rhs = rhs, lhs if isinstance(rhs, (StringColumn, str, cudf.Scalar)): if op == "add": - return cast("column.ColumnBase", lhs.str().cat(others=rhs)) + return cast( + "column.ColumnBase", + libstrings.concatenate( + cudf.DataFrame({0: lhs, 1: rhs}), + sep=cudf.Scalar(""), + na_rep=cudf.Scalar(None, "str"), + ), + ) elif op in ("eq", "ne", "gt", "lt", "ge", "le", "NULL_EQUALS"): return libcudf.binaryop.binaryop( - lhs=self, rhs=rhs, op=op, dtype="bool" + lhs=lhs, rhs=rhs, op=op, dtype="bool" ) raise TypeError( diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 85c8293a91e..6bcc594ab22 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -6,7 +6,7 @@ import cudf from cudf._typing 
import Dtype from cudf.core.column import ColumnBase, build_struct_column -from cudf.core.column.methods import ColumnMethodsMixin +from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import StructDtype from cudf.utils.dtypes import is_struct_dtype @@ -20,7 +20,7 @@ class StructColumn(ColumnBase): """ - dtype: cudf.core.dtypes.StructDtype + dtype: StructDtype @property def base_size(self): @@ -95,9 +95,6 @@ def copy(self, deep=True): result = result._rename_fields(self.dtype.fields.keys()) return result - def struct(self, parent=None): - return StructMethods(self, parent=parent) - def _rename_fields(self, names): """ Return a StructColumn with the same field values as this StructColumn, @@ -139,17 +136,17 @@ def _with_type_metadata(self: StructColumn, dtype: Dtype) -> StructColumn: return self -class StructMethods(ColumnMethodsMixin): +class StructMethods(ColumnMethods): """ Struct methods for Series """ - def __init__(self, column, parent=None): - if not is_struct_dtype(column.dtype): + def __init__(self, parent=None): + if not is_struct_dtype(parent.dtype): raise AttributeError( "Can only use .struct accessor with a 'struct' dtype" ) - super().__init__(column=column, parent=parent) + super().__init__(parent=parent) def field(self, key): """ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 3629358ee9f..ec6bf13bd15 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -23,6 +23,7 @@ as_column, build_categorical_column, column_empty, + concat_columns, ) from cudf.core.join import merge from cudf.utils.dtypes import ( @@ -40,7 +41,7 @@ T = TypeVar("T", bound="Frame") if TYPE_CHECKING: - from cudf.core.column_accessor import ColumnAccessor + from cudf.core.column_accessor import ColumnAccessor class Frame(libcudf.table.Table): @@ -4054,8 +4055,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): ): # Combine and de-dupe the categories categories[idx] = ( -
cudf.concat([col.cat().categories for col in cols]) - .to_series() + cudf.Series(concat_columns([col.categories for col in cols])) .drop_duplicates(ignore_index=True) ._column ) @@ -4085,12 +4085,7 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): if idx in categories: cols[idx] = ( cols[idx] - .cat() - ._set_categories( - cols[idx].cat().categories, - categories[idx], - is_unique=True, - ) + ._set_categories(categories[idx], is_unique=True,) .codes ) cols[idx] = cols[idx].astype(dtype) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 13ea1755803..680bff457e2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -35,7 +35,7 @@ arange, column, ) -from cudf.core.column.column import _concat_columns, as_column +from cudf.core.column.column import as_column, concat_columns from cudf.core.column.string import StringMethods as StringMethods from cudf.core.dtypes import IntervalDtype from cudf.core.frame import SingleColumnFrame @@ -587,7 +587,7 @@ def sum(self): @classmethod def _concat(cls, objs): - data = _concat_columns([o._values for o in objs]) + data = concat_columns([o._values for o in objs]) names = {obj.name for obj in objs} if len(names) == 1: [name] = names @@ -2569,17 +2569,13 @@ def __init__( dtype = None if categories is not None: - data.cat().set_categories( - categories, ordered=ordered, inplace=True - ) + data = data.set_categories(categories, ordered=ordered) elif isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): - data.cat().set_categories( - dtype.categories, ordered=ordered, inplace=True - ) + data = data.set_categories(dtype.categories, ordered=ordered) elif ordered is True and data.ordered is False: - data.cat().as_ordered(inplace=True) + data = data.as_ordered() elif ordered is False and data.ordered is True: - data.cat().as_unordered(inplace=True) + data = data.as_unordered() super().__init__(data, **kwargs) @@ -2588,14 +2584,14 @@ def 
codes(self): """ The category codes of this categorical. """ - return self._values.cat().codes + return as_index(self._values.codes) @property def categories(self): """ The categories of this categorical. """ - return self._values.cat().categories + return as_index(self._values.categories) def interval_range( @@ -2871,7 +2867,7 @@ def __repr__(self): @copy_docstring(StringMethods.__init__) # type: ignore @property def str(self): - return StringMethods(column=self._values, parent=self) + return StringMethods(parent=self) def _clean_nulls_from_index(self): """ diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 5e15ddfc359..78fc7a863d6 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -188,7 +188,9 @@ def _match_categorical_dtypes_both( if how == "inner": # cast to category types -- we must cast them back later return _match_join_keys( - lcol.cat()._decategorize(), rcol.cat()._decategorize(), how, + lcol._get_decategorized_column(), + rcol._get_decategorized_column(), + how, ) elif how in {"left", "leftanti", "leftsemi"}: # always cast to left type diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 77640db6a1d..462f101dfbf 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -32,7 +32,7 @@ from cudf.core.column.categorical import ( CategoricalAccessor as CategoricalAccessor, ) -from cudf.core.column.column import _concat_columns +from cudf.core.column.column import concat_columns from cudf.core.column.lists import ListMethods from cudf.core.column.string import StringMethods from cudf.core.column.struct import StructMethods @@ -2336,22 +2336,22 @@ def __invert__(self): @copy_docstring(CategoricalAccessor.__init__) # type: ignore @property def cat(self): - return CategoricalAccessor(column=self._column, parent=self) + return CategoricalAccessor(parent=self) 
@copy_docstring(StringMethods.__init__) # type: ignore @property def str(self): - return StringMethods(column=self._column, parent=self) + return StringMethods(parent=self) @copy_docstring(ListMethods.__init__) # type: ignore @property def list(self): - return ListMethods(column=self._column, parent=self) + return ListMethods(parent=self) @copy_docstring(StructMethods.__init__) # type: ignore @property def struct(self): - return StructMethods(column=self._column, parent=self) + return StructMethods(parent=self) @property def dtype(self): @@ -2412,7 +2412,7 @@ def _concat(cls, objs, axis=0, index=True): common_dtype = find_common_type([obj.dtype for obj in objs]) objs = [obj.astype(common_dtype) for obj in objs] - col = _concat_columns([o._column for o in objs]) + col = concat_columns([o._column for o in objs]) if isinstance(col, cudf.core.column.Decimal64Column): col = col._with_type_metadata(objs[0]._column.dtype) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 83df7985108..6d31c1ba74d 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -7,6 +7,7 @@ import cudf from cudf import _lib as libcudf +from cudf._lib import strings as libstrings from cudf.core.column import as_column from cudf.utils.dtypes import ( _is_non_decimal_numeric_dtype, @@ -194,13 +195,13 @@ def _convert_str_col(col, errors, _downcast=None): if not is_string_dtype(col): raise TypeError("col must be string dtype.") - is_integer = col.str().isinteger() + is_integer = libstrings.is_integer(col) if is_integer.all(): return col.as_numerical_column(dtype=np.dtype("i8")) col = _proc_inf_empty_strings(col) - is_float = col.str().isfloat() + is_float = libstrings.is_float(col) if is_float.all(): if _downcast in {"unsigned", "signed", "integer"}: warnings.warn( @@ -225,7 +226,7 @@ def _convert_str_col(col, errors, _downcast=None): def _proc_inf_empty_strings(col): """Handles empty and infinity strings """ - 
col = col.str().lower() + col = libstrings.to_lower(col) col = _proc_empty_strings(col) col = _proc_inf_strings(col) return col @@ -243,7 +244,7 @@ def _proc_inf_strings(col): """Convert "inf/infinity" strings into "Inf", the native string representing infinity in libcudf """ - col = col.str().replace( - ["+", "inf", "inity"], ["", "Inf", ""], regex=False, + col = libstrings.replace_multi( + col, as_column(["+", "inf", "inity"]), as_column(["", "Inf", ""]), ) return col diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index e2c7ca7dca1..582c5324b8f 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -177,7 +177,7 @@ def to_csv( df = df.copy(deep=False) for col_name, col in df._data.items(): if isinstance(col, cudf.core.column.CategoricalColumn): - df._data[col_name] = col.astype(col.cat().categories.dtype) + df._data[col_name] = col.astype(col.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): df.index = df.index.astype(df.index.categories.dtype) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index cedf2aac7af..e0f9b3a6efa 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -164,8 +164,8 @@ def assert_column_equal( if check_exact and check_categorical: if is_categorical_dtype(left) and is_categorical_dtype(right): - left_cat = left.cat().categories - right_cat = right.cat().categories + left_cat = left.categories + right_cat = right.categories if check_category_order: assert_index_equal( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 951062f2b61..1b59b80bc5a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5217,8 +5217,8 @@ def test_memory_usage_cat(): gdf = cudf.from_pandas(df) expected = ( - gdf.B._column.cat().categories.__sizeof__() - + gdf.B._column.cat().codes.__sizeof__() + 
gdf.B._column.categories.__sizeof__() + + gdf.B._column.codes.__sizeof__() ) # Check cat column