From 366206d4a04e77bc3fbc9b41948ddb816d4f38e3 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 10 May 2022 17:05:18 -0500 Subject: [PATCH] Import `NA` from `missing` rather than using `cudf.NA` everywhere (#10821) This PR changes cuDF so that `NA` is no longer referenced through the top-level `cudf` namespace (as `cudf.NA`) throughout the codebase and is instead imported directly from the `missing` module (`cudf.core.missing`). This is part of https://github.com/rapidsai/cudf/issues/10820 and comes as a follow-up to https://github.com/rapidsai/cudf/pull/10791#discussion_r867206392 Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10821 --- python/cudf/cudf/_lib/scalar.pyx | 23 ++++++++++--------- python/cudf/cudf/core/_internals/where.py | 5 ++-- python/cudf/cudf/core/column/column.py | 3 ++- python/cudf/cudf/core/column/lists.py | 5 ++-- .../cudf/cudf/core/column/numerical_base.py | 3 ++- python/cudf/cudf/core/column/struct.py | 3 ++- python/cudf/cudf/core/dataframe.py | 11 ++++----- python/cudf/cudf/testing/testing.py | 3 ++- python/cudf/cudf/utils/dtypes.py | 3 ++- 9 files changed, 32 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 71ac022ba2d..6309720706b 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -27,6 +27,7 @@ from cudf._lib.types import ( duration_unit_map, ) from cudf.core.dtypes import ListDtype, StructDtype +from cudf.core.missing import NA from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view @@ -170,7 +171,7 @@ cdef class DeviceScalar: return self.get_raw_ptr()[0].is_valid() def __repr__(self): - if self.value is cudf.NA: + if self.value is NA: return ( f"{self.__class__.__name__}" f"({self.value}, {repr(self.dtype)})" @@ -356,7 +357,7 @@ cdef _set_struct_from_pydict(unique_ptr[scalar]& s, else: pyarrow_table 
= pa.Table.from_arrays( [ - pa.array([cudf.NA], from_pandas=True, type=f.type) + pa.array([NA], from_pandas=True, type=f.type) for f in arrow_schema ], names=columns @@ -371,7 +372,7 @@ cdef _set_struct_from_pydict(unique_ptr[scalar]& s, cdef _get_py_dict_from_struct(unique_ptr[scalar]& s): if not s.get()[0].is_valid(): - return cudf.NA + return NA cdef table_view struct_table_view = (s.get()).view() column_names = [str(i) for i in range(struct_table_view.num_columns())] @@ -386,7 +387,7 @@ cdef _set_list_from_pylist(unique_ptr[scalar]& s, object dtype, bool valid=True): - value = value if valid else [cudf.NA] + value = value if valid else [NA] cdef Column col if isinstance(dtype.element_type, ListDtype): pa_type = dtype.element_type.to_arrow() @@ -404,7 +405,7 @@ cdef _set_list_from_pylist(unique_ptr[scalar]& s, cdef _get_py_list_from_list(unique_ptr[scalar]& s): if not s.get()[0].is_valid(): - return cudf.NA + return NA cdef column_view list_col_view = (s.get()).view() cdef Column list_col = Column.from_column_view(list_col_view, None) @@ -416,14 +417,14 @@ cdef _get_py_list_from_list(unique_ptr[scalar]& s): cdef _get_py_string_from_string(unique_ptr[scalar]& s): if not s.get()[0].is_valid(): - return cudf.NA + return NA return (s.get())[0].to_string().decode() cdef _get_np_scalar_from_numeric(unique_ptr[scalar]& s): cdef scalar* s_ptr = s.get() if not s_ptr[0].is_valid(): - return cudf.NA + return NA cdef libcudf_types.data_type cdtype = s_ptr[0].type() @@ -456,7 +457,7 @@ cdef _get_np_scalar_from_numeric(unique_ptr[scalar]& s): cdef _get_py_decimal_from_fixed_point(unique_ptr[scalar]& s): cdef scalar* s_ptr = s.get() if not s_ptr[0].is_valid(): - return cudf.NA + return NA cdef libcudf_types.data_type cdtype = s_ptr[0].type() @@ -480,7 +481,7 @@ cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s): cdef scalar* s_ptr = s.get() if not s_ptr[0].is_valid(): - return cudf.NA + return NA cdef libcudf_types.data_type cdtype = s_ptr[0].type() @@ -571,7 +572,7 
@@ def as_device_scalar(val, dtype=None): def _is_null_host_scalar(slr): - if slr is None or slr is cudf.NA: + if slr is None or slr is NA: return True elif isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr): return True @@ -603,5 +604,5 @@ def _nested_na_replace(input_list): if isinstance(value, list): _nested_na_replace(value) elif value is None: - input_list[idx] = cudf.NA + input_list[idx] = NA return input_list diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 59e7d629092..bc01752a2b4 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -12,6 +12,7 @@ from cudf.core.dataframe import DataFrame from cudf.core.frame import Frame from cudf.core.index import Index +from cudf.core.missing import NA from cudf.core.series import Series from cudf.core.single_column_frame import SingleColumnFrame @@ -28,9 +29,7 @@ def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> ScalarLike: f"{type(other).__name__} to {col.dtype.name}" ) - return cudf.Scalar( - other, dtype=col.dtype if other in {None, cudf.NA} else None - ) + return cudf.Scalar(other, dtype=col.dtype if other in {None, NA} else None) def _check_and_cast_columns_with_other( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e1d91e6d0c0..47a2e3489e8 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -68,6 +68,7 @@ ListDtype, StructDtype, ) +from cudf.core.missing import NA from cudf.core.mixins import BinaryOperand, Reducible from cudf.utils.dtypes import ( cudf_dtype_from_pa_type, @@ -499,7 +500,7 @@ def __setitem__(self, key: Any, value: Any): self._mimic_inplace(out, inplace=True) def _wrap_binop_normalization(self, other): - if other is cudf.NA or other is None: + if other is NA or other is None: return cudf.Scalar(other, dtype=self.dtype) if isinstance(other, np.ndarray) and other.ndim 
== 0: other = other.item() diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 30e418f0825..e8a5638f07a 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -30,6 +30,7 @@ from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethods, ParentType from cudf.core.dtypes import ListDtype +from cudf.core.missing import NA class ListColumn(ColumnBase): @@ -91,7 +92,7 @@ def __setitem__(self, key, value): if isinstance(value, cudf.Scalar): if value.dtype != self.dtype: raise TypeError("list nesting level mismatch") - elif value is cudf.NA: + elif value is NA: value = cudf.Scalar(value, dtype=self.dtype) else: raise ValueError(f"Can not set {value} into ListColumn") @@ -354,7 +355,7 @@ def get( index = as_column(index) out = extract_element_column(self._column, as_column(index)) - if not (default is None or default is cudf.NA): + if not (default is None or default is NA): # determine rows for which `index` is out-of-bounds lengths = count_elements(self._column) out_of_bounds_mask = (np.negative(index) > lengths) | ( diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 659bb58d790..bb7711a3ead 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -11,6 +11,7 @@ from cudf import _lib as libcudf from cudf._typing import ScalarLike from cudf.core.column import ColumnBase +from cudf.core.missing import NA from cudf.core.mixins import Scannable @@ -116,7 +117,7 @@ def quantile( scalar_result = result.element_indexing(0) return ( cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - if scalar_result is cudf.NA + if scalar_result is NA else scalar_result ) return result diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index ed5e1c9450d..fa834ae8a5a 100644 --- 
a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -10,6 +10,7 @@ from cudf.core.column import ColumnBase, build_struct_column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import StructDtype +from cudf.core.missing import NA class StructColumn(ColumnBase): @@ -102,7 +103,7 @@ def __setitem__(self, key, value): if isinstance(value, dict): # filling in fields not in dict for field in self.dtype.fields: - value[field] = value.get(field, cudf.NA) + value[field] = value.get(field, NA) value = cudf.Scalar(value, self.dtype) super().__setitem__(key, value) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a3e2f40b28e..0c3dc82719e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -76,6 +76,7 @@ _indices_from_labels, doc_reset_index_template, ) +from cudf.core.missing import NA from cudf.core.multiindex import MultiIndex from cudf.core.resample import DataFrameResampler from cudf.core.series import Series @@ -364,9 +365,7 @@ def _setitem_tuple_arg(self, key, value): scatter_map = _indices_from_labels(self._frame, key[0]) for col in columns_df._column_names: columns_df[col][scatter_map] = ( - value._data[col] - if col in value_column_names - else cudf.NA + value._data[col] if col in value_column_names else NA ) else: @@ -479,7 +478,7 @@ def _setitem_tuple_arg(self, key, value): value_column_names = set(value._column_names) for col in columns_df._column_names: columns_df[col][key[0]] = ( - value._data[col] if col in value_column_names else cudf.NA + value._data[col] if col in value_column_names else NA ) else: @@ -3867,8 +3866,8 @@ def applymap( # bytecode to generate the equivalent PTX # as a null-ignoring version of the function def _func(x): # pragma: no cover - if x is cudf.NA: - return cudf.NA + if x is NA: + return NA else: return devfunc(x) diff --git a/python/cudf/cudf/testing/testing.py 
b/python/cudf/cudf/testing/testing.py index b134d2b26e9..070e4649c7b 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -20,6 +20,7 @@ is_struct_dtype, ) from cudf.core._compat import PANDAS_GE_110 +from cudf.core.missing import NA def dtype_can_compare_equal_to_other(dtype): @@ -290,7 +291,7 @@ def assert_column_equal( def null_safe_scalar_equals(left, right): - if left in {cudf.NA, np.nan} or right in {cudf.NA, np.nan}: + if left in {NA, np.nan} or right in {NA, np.nan}: return left is right return left == right diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 35c6fdc73f8..c2d9a57b72f 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -12,6 +12,7 @@ import cudf from cudf.core._compat import PANDAS_GE_120 +from cudf.core.missing import NA _NA_REP = "" @@ -591,7 +592,7 @@ def _can_cast(from_dtype, to_dtype): `np.can_cast` but with some special handling around cudf specific dtypes. """ - if from_dtype in {None, cudf.NA}: + if from_dtype in {None, NA}: return True if isinstance(from_dtype, type): from_dtype = cudf.dtype(from_dtype)