
Expose pack/unpack API to Python #8153

Merged · 44 commits · Jun 30, 2021

Changes from 39 commits

Commits
21e2e53
Expose pack/unpack API
charlesbluca May 4, 2021
49440ed
Package resorting
charlesbluca May 4, 2021
06d82bc
Rename pack/unpack function
charlesbluca May 4, 2021
2d826c4
Add basic Cython function testing
charlesbluca May 5, 2021
f2ed139
Split pack/unpack into individual functions
charlesbluca May 5, 2021
878de88
Remove unnecessary cudf import
charlesbluca May 5, 2021
77dc379
Additional tests for unpacked dataframes
charlesbluca May 6, 2021
bc3228d
Add packed columns Python wrapper class
charlesbluca May 7, 2021
82e0682
Merge remote-tracking branch 'upstream/branch-0.20' into expose-pack-api
charlesbluca May 10, 2021
e5e8e8e
Replace Cython tests with Python
charlesbluca May 10, 2021
c9094a3
Remove unnecessary import
charlesbluca May 10, 2021
55da00b
Run pre-commit hooks
charlesbluca May 10, 2021
da27ce8
First pass at serialization
charlesbluca May 13, 2021
7bfad47
Make unique_ptr of metadata instead
charlesbluca May 14, 2021
d1ae22d
Fix segfaults in pickling/unpickling
charlesbluca May 17, 2021
82b31dd
Merge remote-tracking branch 'upstream/branch-21.06' into expose-pack…
charlesbluca May 19, 2021
6d4d275
Add support for pickle-style serialization
charlesbluca May 20, 2021
6bed65d
Expand tests for pack/unpack
charlesbluca May 20, 2021
d57893b
Remove potential circular import risk
charlesbluca May 20, 2021
674d259
Add logic to pack to decide whether to keep index
charlesbluca May 20, 2021
681a78a
Check for column data before asserting pointer inequality
charlesbluca May 20, 2021
f689d63
Add pack tests for list and struct columns
charlesbluca May 21, 2021
0a64054
Clarify that input table is Python object
charlesbluca May 24, 2021
9b55484
Flesh out docstrings a bit
charlesbluca May 24, 2021
a826093
Add return types
charlesbluca May 24, 2021
8f22d76
Function renames
charlesbluca May 26, 2021
aa54fbf
Return a Table from Python unpack
charlesbluca May 26, 2021
1c5b163
Naive addition of dtypes w.r.t. serialization
charlesbluca May 26, 2021
683ad13
Remove debugging block
charlesbluca May 26, 2021
e7be8f0
Merge remote-tracking branch 'upstream/branch-21.08' into expose-pack…
charlesbluca May 27, 2021
656f5de
Call DataFrame _from_table for tests
charlesbluca May 27, 2021
b413362
Merge remote-tracking branch 'upstream/branch-21.08' into expose-pack…
charlesbluca Jun 14, 2021
a749d19
Merge remote-tracking branch 'upstream/branch-21.08' into expose-pack…
charlesbluca Jun 15, 2021
d33fe8a
Use metadata apply in pack/unpack
charlesbluca Jun 15, 2021
e99b267
Fix categorical test
charlesbluca Jun 15, 2021
fd6ce46
Merge remote-tracking branch 'upstream/branch-21.08' into expose-pack…
charlesbluca Jun 21, 2021
b49a4ab
Serialize dtypes of packed columns
charlesbluca Jun 21, 2021
f279710
Add additional serialization tests
charlesbluca Jun 21, 2021
f2af602
Some docstring alterations
charlesbluca Jun 21, 2021
b78d6f9
Move pickle to top level imports
charlesbluca Jun 22, 2021
4dbd87f
Wrap Cython packed columns in serializable Python class
charlesbluca Jun 22, 2021
4ece337
Add serialization roundtrip tests, centralize in test_pack.py
charlesbluca Jun 22, 2021
af6e8e4
Merge remote-tracking branch 'upstream/branch-21.08' into expose-pack…
charlesbluca Jun 30, 2021
3fa48c1
Fix assert_eq import
charlesbluca Jun 30, 2021
2 changes: 1 addition & 1 deletion cpp/include/cudf/copying.hpp
@@ -629,7 +629,7 @@ packed_columns pack(cudf::table_view const& input,
 * guaranteeing that all of the columns in the table point into `contiguous_buffer`.
 *
 * @param input View of the table to pack
- * @param contgiuous_buffer A contiguous buffer of device memory which contains the data referenced
+ * @param contiguous_buffer A contiguous buffer of device memory which contains the data referenced
 * by the columns in `table`
 * @param buffer_size The size of `contiguous_buffer`.
 * @return Vector of bytes representing the metadata used to `unpack` a packed_columns struct.
24 changes: 24 additions & 0 deletions python/cudf/cudf/_lib/copying.pxd
@@ -0,0 +1,24 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from cudf._lib.table cimport Table

from cudf._lib.cpp.copying cimport packed_columns

cdef class PackedColumns:
    cdef packed_columns c_obj
    cdef object column_names
    cdef object column_dtypes
    cdef object index_names

    @staticmethod
    cdef PackedColumns from_py_table(Table input_table, keep_index=*)

    cdef Table unpack(self)

    cdef const void* c_metadata_ptr(self) except *

    cdef size_t c_metadata_size(self) except *

    cdef void* c_gpu_data_ptr(self) except *

    cdef size_t c_gpu_data_size(self) except *
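
The declarations above are implemented in python/cudf/cudf/_lib/copying.pyx below. For orientation, here is a sketch of the header produced by PackedColumns.serialize(); the values are illustrative only, and the key names follow the implementation in the next file.

# Illustrative shape of the (header, frames) pair returned by
# PackedColumns.serialize(); pointer and size values are made up.
header = {
    "column-names": ("keys", "vals"),
    "index-names": None,             # set only when a non-trivial index is packed
    "gpu-data-ptr": 0x7F0000000000,  # process-local device pointer
    "gpu-data-size": 160,            # bytes in the contiguous GPU buffer
    "metadata": [1, 0, 0, 0],        # libcudf unpack metadata as uint8 values (truncated)
    "column-dtypes": {},             # populated only for cudf extension dtypes
}
frames = []                          # frames from serialized extension dtypes, if any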
167 changes: 166 additions & 1 deletion python/cudf/cudf/_lib/copying.pyx
@@ -6,7 +6,9 @@ from libcpp cimport bool
from libcpp.memory cimport make_unique, unique_ptr, shared_ptr, make_shared
from libcpp.vector cimport vector
from libcpp.utility cimport move
-from libc.stdint cimport int32_t, int64_t
+from libc.stdint cimport int32_t, int64_t, uint8_t, uintptr_t
+
+from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer

from cudf._lib.column cimport Column
from cudf._lib.scalar import as_device_scalar
@@ -776,3 +778,166 @@ def segmented_gather(Column source_column, Column gather_map):

    result = Column.from_unique_ptr(move(c_result))
    return result


cdef class PackedColumns:
    """
    A packed representation of a ``cudf.Table``, with all columns residing
    in a single GPU memory buffer.
    """

    def __reduce__(self):
        return self.deserialize, self.serialize()

    @property
    def __cuda_array_interface__(self):
        cdef dict intf = {
            "data": (self.gpu_data_ptr, False),
            "shape": (self.gpu_data_size,),
            "strides": None,
            "typestr": "|u1",
            "version": 0
        }
        return intf

    @property
    def metadata_ptr(self):
        return int(<uintptr_t>self.c_metadata_ptr())

    @property
    def metadata_size(self):
        return int(self.c_metadata_size())

    @property
    def gpu_data_ptr(self):
        return int(<uintptr_t>self.c_gpu_data_ptr())

    @property
    def gpu_data_size(self):
        return int(self.c_gpu_data_size())

    def serialize(self):
        header = {}
        frames = []

        header["column-names"] = self.column_names
        header["index-names"] = self.index_names
        header["gpu-data-ptr"] = self.gpu_data_ptr
        header["gpu-data-size"] = self.gpu_data_size
        header["metadata"] = list(
            <uint8_t[:self.metadata_size]>self.c_metadata_ptr()
        )

        column_dtypes = {}
        for name, dtype in self.column_dtypes.items():
            dtype_header, dtype_frames = dtype.serialize()
            column_dtypes[name] = (
                dtype_header,
                (len(frames), len(frames) + len(dtype_frames)),
            )
            frames.extend(dtype_frames)
        header["column-dtypes"] = column_dtypes

        return header, frames

    @classmethod
    def deserialize(cls, header, frames):
        import pickle

        cdef PackedColumns p = PackedColumns.__new__(PackedColumns)

        dbuf = DeviceBuffer(
            ptr=header["gpu-data-ptr"],
            size=header["gpu-data-size"]
        )

        cdef cpp_copying.packed_columns data_
        data_.metadata_ = move(
            make_unique[cpp_copying.metadata](
                move(<vector[uint8_t]>header["metadata"])
            )
        )
        data_.gpu_data = move(dbuf.c_obj)

        p.c_obj = move(data_)
        p.column_names = header["column-names"]
        p.index_names = header["index-names"]

        column_dtypes = {}
        for name, dtype in header["column-dtypes"].items():
            dtype_header, (start, stop) = dtype
            column_dtypes[name] = pickle.loads(
                dtype_header["type-serialized"]
            ).deserialize(dtype_header, frames[start:stop])
        p.column_dtypes = column_dtypes

        return p

    @staticmethod
    cdef PackedColumns from_py_table(Table input_table, keep_index=True):
        """
        Construct a ``PackedColumns`` object from a ``cudf.DataFrame``.
        """
        from cudf.core import RangeIndex, dtypes

        cdef PackedColumns p = PackedColumns.__new__(PackedColumns)

        if keep_index and not input_table.index.equals(
            RangeIndex(start=0, stop=len(input_table), step=1)
        ):
            input_table_view = input_table.view()
            p.index_names = input_table._index_names
        else:
            input_table_view = input_table.data_view()

        p.column_names = input_table._column_names
        p.column_dtypes = {}
        for name, col in input_table._data.items():
            if isinstance(col.dtype, dtypes._BaseDtype):
                p.column_dtypes[name] = col.dtype

        p.c_obj = move(cpp_copying.pack(input_table_view))

        return p

    cdef Table unpack(self):
        output_table = Table.from_table_view(
            cpp_copying.unpack(self.c_obj),
            self,
            self.column_names,
            self.index_names
        )

        for name, dtype in self.column_dtypes.items():
            output_table._data[name] = (
                output_table._data[name]._with_type_metadata(dtype)
            )

        return output_table

    cdef const void* c_metadata_ptr(self) except *:
        return self.c_obj.metadata_.get()[0].data()

    cdef size_t c_metadata_size(self) except *:
        return self.c_obj.metadata_.get()[0].size()

    cdef void* c_gpu_data_ptr(self) except *:
        return self.c_obj.gpu_data.get()[0].data()

    cdef size_t c_gpu_data_size(self) except *:
        return self.c_obj.gpu_data.get()[0].size()


def pack(Table input_table, keep_index=True):
    """
    Pack the columns of a ``cudf.Table`` into a single GPU memory buffer.
    """
    return PackedColumns.from_py_table(input_table, keep_index)


def unpack(PackedColumns packed):
    """
    Unpack the results of packing a ``cudf.Table``, returning a new
    ``Table`` in the process.
    """
    return packed.unpack()
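
Taken together, pack/unpack and the serialization hooks above give the following workflow. A minimal usage sketch, assuming a cudf build that includes this PR (DataFrame._from_table is used the same way as in test_pack.py below):

# Minimal sketch of the pack/unpack roundtrip added in this diff.
import pickle

import numpy as np

from cudf._lib.copying import pack, unpack
from cudf.core import DataFrame

df = DataFrame()
df["keys"] = np.arange(10, dtype=np.float64)
df["vals"] = np.random.random(10)

# Pack all columns (plus the index, unless it is a trivial RangeIndex)
# into one contiguous GPU buffer.
packed = pack(df)
print(packed.gpu_data_size)  # size in bytes of the contiguous buffer

# __reduce__ routes pickling through serialize()/deserialize(). The
# gpu-data pointer in the header is process-local, so this roundtrip
# is only meaningful within a single process.
restored = pickle.loads(pickle.dumps(packed))

# Unpack back into a Table and rewrap it as a DataFrame.
unpacked = DataFrame._from_table(unpack(restored))
print(unpacked)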
16 changes: 15 additions & 1 deletion python/cudf/cudf/_lib/cpp/copying.pxd
@@ -3,7 +3,7 @@
from rmm._lib.device_buffer cimport device_buffer

from libcpp cimport bool
-from libc.stdint cimport int32_t, int64_t
+from libc.stdint cimport int32_t, int64_t, uint8_t
from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector

@@ -20,6 +20,12 @@ from cudf._lib.cpp.types cimport size_type

ctypedef const scalar constscalar

+cdef extern from "cudf/copying.hpp" namespace "cudf::packed_columns" nogil:
+    cdef struct metadata:
+        metadata(vector[uint8_t]&& v)
+        const uint8_t* data () except +
+        size_type size () except +
+
cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
    ctypedef enum out_of_bounds_policy:
        NULLIFY 'cudf::out_of_bounds_policy::NULLIFY'
@@ -119,6 +125,10 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
        vector[size_type] splits
    ) except +

+    cdef struct packed_columns:
+        unique_ptr[metadata] metadata_
+        unique_ptr[device_buffer] gpu_data
+
    cdef struct contiguous_split_result:
        table_view table
        vector[device_buffer] all_data
@@ -128,6 +138,10 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
        vector[size_type] splits
    ) except +

+    cdef packed_columns pack (const table_view& input) except +
+
+    cdef table_view unpack (const packed_columns& input) except +
+
    cdef unique_ptr[column] copy_if_else (
        column_view lhs,
        column_view rhs,
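
Because PackedColumns implements __cuda_array_interface__ (see the copying.pyx diff above), other CUDA libraries can take a zero-copy byte view of the packed buffer. A short sketch, assuming CuPy is installed and packed comes from the previous sketch:

# View the packed buffer as a flat uint8 array without copying.
import cupy as cp

view = cp.asarray(packed)  # typestr "|u1" -> one element per byte
assert view.nbytes == packed.gpu_data_size
print(view.dtype, view.shape)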
128 changes: 128 additions & 0 deletions python/cudf/cudf/tests/test_pack.py
@@ -0,0 +1,128 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd

from cudf._lib.copying import pack, unpack
from cudf.core import DataFrame, GenericIndex, Series
from cudf.tests.utils import assert_eq


def check_packed_equality(df):
    # basic
    assert_packed_frame_equality(df)
    # sliced
    assert_packed_frame_equality(df[:-1])
    assert_packed_frame_equality(df[1:])
    assert_packed_frame_equality(df[2:-2])
    # sorted
    sortvaldf = df.sort_values("vals")
    assert isinstance(sortvaldf.index, GenericIndex)
    assert_packed_frame_equality(sortvaldf)


def assert_packed_frame_equality(df):
    pdf = df.to_pandas()

    packed = pack(df)
    del df
    unpacked = DataFrame._from_table(unpack(packed))

    assert_eq(unpacked, pdf)


def test_packed_dataframe_equality_numeric():
    np.random.seed(0)
    df = DataFrame()
    nelem = 10
    df["keys"] = np.arange(nelem, dtype=np.float64)
    df["vals"] = np.random.random(nelem)

    check_packed_equality(df)


def test_packed_dataframe_equality_categorical():
    np.random.seed(0)

    df = DataFrame()
    df["keys"] = pd.Categorical(
        ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]
    )
    df["vals"] = np.random.random(len(df))

    check_packed_equality(df)


def test_packed_dataframe_equality_list():
    np.random.seed(0)

    df = DataFrame()
    df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10)))
    df["vals"] = np.random.random(len(df))

    check_packed_equality(df)


def test_packed_dataframe_equality_struct():
    np.random.seed(0)

    df = DataFrame()
    df["keys"] = Series(
        list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))
    )
    df["vals"] = np.random.random(len(df))

    check_packed_equality(df)


def check_packed_unique_pointers(df):
    # basic
    assert_packed_frame_unique_pointers(df)
    # sliced
    assert_packed_frame_unique_pointers(df[:-1])
    assert_packed_frame_unique_pointers(df[1:])
    assert_packed_frame_unique_pointers(df[2:-2])
    # sorted
    sortvaldf = df.sort_values("vals")
    assert isinstance(sortvaldf.index, GenericIndex)
    assert_packed_frame_unique_pointers(sortvaldf)


def assert_packed_frame_unique_pointers(df):
    unpacked = unpack(pack(df))

    for col in df:
        if df._data[col].data:
            assert df._data[col].data.ptr != unpacked._data[col].data.ptr


def test_packed_dataframe_unique_pointers_numeric():
    np.random.seed(0)
    df = DataFrame()
    nelem = 10
    df["keys"] = np.arange(nelem, dtype=np.float64)
    df["vals"] = np.random.random(nelem)

    check_packed_unique_pointers(df)


def test_packed_dataframe_unique_pointers_categorical():
    np.random.seed(0)

    df = DataFrame()
    df["keys"] = pd.Categorical("aaabababac")
    df["vals"] = np.random.random(len(df))

    check_packed_unique_pointers(df)
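
The commit list mentions serialization roundtrip tests ("Add serialization roundtrip tests, centralize in test_pack.py"), which this excerpt cuts off. A hypothetical sketch of such a test, in the style of this file and using only the API shown above:

def test_packed_dataframe_serialize_roundtrip():
    # Hypothetical sketch; the actual roundtrip tests are not shown
    # in this excerpt.
    df = DataFrame()
    df["keys"] = np.arange(10, dtype=np.float64)
    df["vals"] = np.random.random(10)
    pdf = df.to_pandas()

    packed = pack(df)
    header, frames = packed.serialize()
    restored = type(packed).deserialize(header, frames)

    assert_eq(DataFrame._from_table(unpack(restored)), pdf)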