Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cudf.dtype function #8949

Merged
merged 27 commits into from
Aug 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
60c7c87
Replace cudf.dtype -> np.dtype
shwina Aug 4, 2021
5e50f52
First stab at cudf.dtype
shwina Aug 4, 2021
367b743
Handle datetimes/timedeltas in cudf.dtype
shwina Aug 4, 2021
d04a5f1
Fix test
shwina Aug 4, 2021
85351e9
Handle disallowed numpy types
shwina Aug 5, 2021
3c9dd97
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
shwina Aug 5, 2021
67cca8a
Update python/cudf/cudf/tests/test_dtypes.py
shwina Aug 5, 2021
a10eae0
Some fixes
shwina Aug 6, 2021
89ac918
Remaining failures
shwina Aug 9, 2021
acda2ee
Merge branch 'cudf-dtype-function' of github.com:shwina/cudf into cud…
shwina Aug 9, 2021
64a3290
Style
shwina Aug 9, 2021
a62ab32
Update python/cudf/cudf/api/types.py
shwina Aug 9, 2021
f79e59f
cudf.dtype -> np.dtype
shwina Aug 10, 2021
9dceb80
Merge branch 'cudf-dtype-function' of github.com:shwina/cudf into cud…
shwina Aug 10, 2021
d0bef49
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
shwina Aug 10, 2021
3eba47c
Progress
shwina Aug 11, 2021
048629c
More fix
shwina Aug 11, 2021
40736c4
Early returns
shwina Aug 11, 2021
550c7ba
More tests
shwina Aug 11, 2021
1cfa67c
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
shwina Aug 11, 2021
72d6304
Resolve circular import issues
shwina Aug 11, 2021
c8925f5
Unused import
shwina Aug 12, 2021
26df99a
Space
shwina Aug 12, 2021
fec34d9
Add interval tests
shwina Aug 12, 2021
5fc19a9
:(
shwina Aug 12, 2021
11156f5
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
shwina Aug 12, 2021
2a684be
Merge branch 'branch-21.10' of https://github.com/rapidsai/cudf into …
shwina Aug 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions python/cudf/cudf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,42 +8,44 @@

import rmm

from cudf.api.types import dtype
from cudf import core, datasets, testing
from cudf._version import get_versions
from cudf.api.extensions import (
register_dataframe_accessor,
register_index_accessor,
register_series_accessor,
)
from cudf.core import (
from cudf.core.scalar import (
NA,
Scalar,
)
from cudf.core.index import (
BaseIndex,
CategoricalIndex,
DataFrame,
DatetimeIndex,
Float32Index,
Float64Index,
Index,
GenericIndex,
Int8Index,
Int16Index,
Int32Index,
Int64Index,
IntervalIndex,
MultiIndex,
RangeIndex,
StringIndex,
Scalar,
Series,
TimedeltaIndex,
UInt8Index,
UInt16Index,
UInt32Index,
UInt64Index,
cut,
from_pandas,
interval_range,
merge,
)
from cudf.core.dataframe import DataFrame, from_pandas, merge
from cudf.core.series import Series
from cudf.core.multiindex import MultiIndex
from cudf.core.cut import cut
from cudf.core.algorithms import factorize
from cudf.core.dtypes import (
CategoricalDtype,
Expand Down
54 changes: 27 additions & 27 deletions python/cudf/cudf/_fuzz_testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,44 +18,44 @@
ALL_POSSIBLE_VALUES = "ALL_POSSIBLE_VALUES"

_PANDAS_TO_AVRO_SCHEMA_MAP = {
np.dtype("int8"): "int",
cudf.dtype("int8"): "int",
pd.Int8Dtype(): ["int", "null"],
pd.Int16Dtype(): ["int", "null"],
pd.Int32Dtype(): ["int", "null"],
pd.Int64Dtype(): ["long", "null"],
pd.BooleanDtype(): ["boolean", "null"],
pd.StringDtype(): ["string", "null"],
np.dtype("bool_"): "boolean",
np.dtype("int16"): "int",
np.dtype("int32"): "int",
np.dtype("int64"): "long",
np.dtype("O"): "string",
np.dtype("str"): "string",
np.dtype("float32"): "float",
np.dtype("float64"): "double",
np.dtype("<M8[ns]"): {"type": "long", "logicalType": "timestamp-millis"},
np.dtype("<M8[ms]"): {"type": "long", "logicalType": "timestamp-millis"},
np.dtype("<M8[us]"): {"type": "long", "logicalType": "timestamp-micros"},
cudf.dtype("bool_"): "boolean",
cudf.dtype("int16"): "int",
cudf.dtype("int32"): "int",
cudf.dtype("int64"): "long",
cudf.dtype("O"): "string",
cudf.dtype("str"): "string",
cudf.dtype("float32"): "float",
cudf.dtype("float64"): "double",
cudf.dtype("<M8[ns]"): {"type": "long", "logicalType": "timestamp-millis"},
cudf.dtype("<M8[ms]"): {"type": "long", "logicalType": "timestamp-millis"},
cudf.dtype("<M8[us]"): {"type": "long", "logicalType": "timestamp-micros"},
}

PANDAS_TO_ORC_TYPES = {
np.dtype("int8"): pyorc.TinyInt(),
cudf.dtype("int8"): pyorc.TinyInt(),
pd.Int8Dtype(): pyorc.TinyInt(),
pd.Int16Dtype(): pyorc.SmallInt(),
pd.Int32Dtype(): pyorc.Int(),
pd.Int64Dtype(): pyorc.BigInt(),
pd.BooleanDtype(): pyorc.Boolean(),
np.dtype("bool_"): pyorc.Boolean(),
np.dtype("int16"): pyorc.SmallInt(),
np.dtype("int32"): pyorc.Int(),
np.dtype("int64"): pyorc.BigInt(),
np.dtype("O"): pyorc.String(),
cudf.dtype("bool_"): pyorc.Boolean(),
cudf.dtype("int16"): pyorc.SmallInt(),
cudf.dtype("int32"): pyorc.Int(),
cudf.dtype("int64"): pyorc.BigInt(),
cudf.dtype("O"): pyorc.String(),
pd.StringDtype(): pyorc.String(),
np.dtype("float32"): pyorc.Float(),
np.dtype("float64"): pyorc.Double(),
np.dtype("<M8[ns]"): pyorc.Timestamp(),
np.dtype("<M8[ms]"): pyorc.Timestamp(),
np.dtype("<M8[us]"): pyorc.Timestamp(),
cudf.dtype("float32"): pyorc.Float(),
cudf.dtype("float64"): pyorc.Double(),
cudf.dtype("<M8[ns]"): pyorc.Timestamp(),
cudf.dtype("<M8[ms]"): pyorc.Timestamp(),
cudf.dtype("<M8[us]"): pyorc.Timestamp(),
}

ORC_TO_PANDAS_TYPES = {
Expand All @@ -64,10 +64,10 @@
pyorc.Boolean().name: pd.BooleanDtype(),
pyorc.SmallInt().name: pd.Int16Dtype(),
pyorc.BigInt().name: pd.Int64Dtype(),
pyorc.String().name: np.dtype("O"),
pyorc.Float().name: np.dtype("float32"),
pyorc.Double().name: np.dtype("float64"),
pyorc.Timestamp().name: np.dtype("<M8[ns]"),
pyorc.String().name: cudf.dtype("O"),
pyorc.Float().name: cudf.dtype("float32"),
pyorc.Double().name: cudf.dtype("float64"),
pyorc.Timestamp().name: cudf.dtype("<M8[ns]"),
}


Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
interop,
join,
json,
labeling,
merge,
null_mask,
nvtext,
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/_lib/aggregation.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ from cudf._lib.types import Interpolation
cimport cudf._lib.cpp.aggregation as libcudf_aggregation
cimport cudf._lib.cpp.types as libcudf_types

import cudf


class AggregationKind(Enum):
SUM = libcudf_aggregation.aggregation.Kind.SUM
Expand Down Expand Up @@ -277,7 +279,7 @@ cdef class Aggregation:
nb_type = numpy_support.from_dtype(kwargs['dtype'])
type_signature = (nb_type[:],)
compiled_op = cudautils.compile_udf(op, type_signature)
output_np_dtype = np.dtype(compiled_op[1])
output_np_dtype = cudf.dtype(compiled_op[1])
cpp_str = compiled_op[0].encode('UTF-8')
if output_np_dtype not in np_to_cudf_types:
raise TypeError(
Expand Down Expand Up @@ -421,7 +423,7 @@ cdef class RollingAggregation:
nb_type = numpy_support.from_dtype(kwargs['dtype'])
type_signature = (nb_type[:],)
compiled_op = cudautils.compile_udf(op, type_signature)
output_np_dtype = np.dtype(compiled_op[1])
output_np_dtype = cudf.dtype(compiled_op[1])
cpp_str = compiled_op[0].encode('UTF-8')
if output_np_dtype not in np_to_cudf_types:
raise TypeError(
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/_lib/binaryop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ from cudf.utils.dtypes import is_scalar, is_string_dtype

cimport cudf._lib.cpp.binaryop as cpp_binaryop
from cudf._lib.cpp.binaryop cimport binary_operator
import cudf


class BinaryOperation(IntEnum):
Expand Down Expand Up @@ -211,7 +212,7 @@ def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype):
cdef type_id tid = (
<type_id> (
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(dtype)]
np_to_cudf_types[cudf.dtype(dtype)]
)
)
)
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/_lib/copying.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -787,12 +787,12 @@ cdef class _CPackedColumns:
"""
Construct a ``PackedColumns`` object from a ``cudf.DataFrame``.
"""
from cudf.core import RangeIndex, dtypes
import cudf.core.dtypes
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just import _BaseDtype? Not a big deal either way, just curious.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In [17]: %%timeit
    ...: import cudf.core.dtypes
    ...: cudf.core.dtypes._BaseDtype
    ...:
    ...:
407 ns ± 3.89 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [19]: %%timeit
    ...: from cudf.core.dtypes import _BaseDtype
    ...: _BaseDtype
    ...:
    ...:
875 ns ± 1.48 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh sure, if it's for performance, that works for me.


cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns)

if keep_index and (
not isinstance(input_table.index, RangeIndex)
not isinstance(input_table.index, cudf.RangeIndex)
or input_table.index.start != 0
or input_table.index.stop != len(input_table)
or input_table.index.step != 1
Expand All @@ -805,7 +805,7 @@ cdef class _CPackedColumns:
p.column_names = input_table._column_names
p.column_dtypes = {}
for name, col in input_table._data.items():
if isinstance(col.dtype, dtypes._BaseDtype):
if isinstance(col.dtype, cudf.core.dtypes._BaseDtype):
p.column_dtypes[name] = col.dtype

p.c_obj = move(cpp_copying.pack(input_table_view))
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/orc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ cpdef read_orc(object filepaths_or_buffers,
if timestamp_type is None else
<type_id>(
<underlying_type_t_type_id> (
np_to_cudf_types[np.dtype(timestamp_type)]
np_to_cudf_types[cudf.dtype(timestamp_type)]
)
)
),
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
meta_dtype = cols_dtype_map.get(col, None)
df._data[col] = cudf.core.column.column_empty(
row_count=0,
dtype=np.dtype(meta_dtype)
dtype=cudf.dtype(meta_dtype)
)

# Set the index column
Expand Down
15 changes: 7 additions & 8 deletions python/cudf/cudf/_lib/scalar.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id

from cudf._lib.interop import from_arrow, to_arrow

cimport cudf._lib.cpp.types as libcudf_types
from cudf._lib.cpp.scalar.scalar cimport (
duration_scalar,
fixed_point_scalar,
Expand All @@ -60,9 +61,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport (
)
from cudf._lib.utils cimport data_from_table_view

from cudf.utils.dtypes import _decimal_to_int64, is_list_dtype, is_struct_dtype

cimport cudf._lib.cpp.types as libcudf_types
import cudf


cdef class DeviceScalar:
Expand All @@ -81,7 +80,7 @@ cdef class DeviceScalar:
dtype : dtype
A NumPy dtype.
"""
self._dtype = dtype if dtype.kind != 'U' else np.dtype('object')
self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object')
self._set_value(value, self._dtype)

def _set_value(self, value, dtype):
Expand Down Expand Up @@ -120,9 +119,9 @@ cdef class DeviceScalar:
def _to_host_scalar(self):
if isinstance(self.dtype, cudf.Decimal64Dtype):
result = _get_py_decimal_from_fixed_point(self.c_value)
elif is_struct_dtype(self.dtype):
elif cudf.api.types.is_struct_dtype(self.dtype):
result = _get_py_dict_from_struct(self.c_value)
elif is_list_dtype(self.dtype):
elif cudf.api.types.is_list_dtype(self.dtype):
result = _get_py_list_from_list(self.c_value)
elif pd.api.types.is_string_dtype(self.dtype):
result = _get_py_string_from_string(self.c_value)
Expand Down Expand Up @@ -309,7 +308,7 @@ cdef _set_decimal64_from_scalar(unique_ptr[scalar]& s,
object value,
object dtype,
bool valid=True):
value = _decimal_to_int64(value) if valid else 0
value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0
s.reset(
new fixed_point_scalar[decimal64](
<int64_t>np.int64(value), scale_type(-dtype.scale), valid
Expand Down Expand Up @@ -560,7 +559,7 @@ def _is_null_host_scalar(slr):
def _create_proxy_nat_scalar(dtype):
cdef DeviceScalar result = DeviceScalar.__new__(DeviceScalar)

dtype = np.dtype(dtype)
dtype = cudf.dtype(dtype)
if dtype.char in 'mM':
nat = dtype.type('NaT').astype(dtype)
if dtype.type == np.datetime64:
Expand Down
Loading