Dataset inspect #510

Closed · wants to merge 6 commits
6 changes: 6 additions & 0 deletions examples/dataset_inspector/criteo_config.json
@@ -0,0 +1,6 @@
{
"cats": ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C22", "C23", "C24", "C25", "C26", "C27"],
"cats_mh": ["genres"],
"conts": ["I1", "I2", "I3", "I4", "I5", "I6", "I7", "I8", "I9", "I10", "I11", "I12", "I13", "I14"],
"labels": ["label"]
}
73 changes: 73 additions & 0 deletions examples/dataset_inspector/inspector_script.py
@@ -0,0 +1,73 @@
#
# Copyright (c) 2020, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import json

import fsspec

from nvtabular.tools import DatasetInspector


def parse_args():
    parser = argparse.ArgumentParser(description="Dataset Inspect Tool")
    # Config file describing the dataset's column types
    parser.add_argument(
        "-c",
        "--config_file",
        type=str,
        required=True,
        help="JSON config file describing the dataset's column types (required)",
    )
    # Dataset path
    parser.add_argument(
        "-d",
        "--data_path",
        type=str,
        required=True,
        help="Input dataset path (required)",
    )
    # Dataset format
    parser.add_argument(
        "-f",
        "--format",
        choices=["csv", "parquet"],
        default="parquet",
        type=str,
        help="Dataset format (default 'parquet')",
    )
    # Output file name
    parser.add_argument(
        "-o",
        "--output_file",
        default="dataset_info.json",
        type=str,
        help="Output file name (default 'dataset_info.json')",
    )
    return parser.parse_args()


def main(args):
    # Read the dataset column-type config
    with fsspec.open(args.config_file) as f:
        config = json.load(f)

    inspector = DatasetInspector()
    inspector.inspect(args.data_path, args.format, config, args.output_file)


if __name__ == "__main__":
    main(parse_args())
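
For reference, a typical invocation of this script might look like the following (the dataset path here is illustrative):

python inspector_script.py -c criteo_config.json -d /data/criteo/ -f parquet -o dataset_info.json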
6 changes: 6 additions & 0 deletions examples/dataset_inspector/movielens_config.json
@@ -0,0 +1,6 @@
{
"cats": ["movieId", "userId"],
"cats_mh": ["genres"],
"conts": [],
"labels": []
}
6 changes: 6 additions & 0 deletions examples/dataset_inspector/rossmann_config.json
@@ -0,0 +1,6 @@
{
"cats": ["Store", "DayOfWeek", "Year", "Month", "Day", "StateHoliday", "CompetitionMonthsOpen", "Promo2Weeks", "StoreType", "Assortment", "PromoInterval", "CompetitionOpenSinceYear", "Promo2SinceYear", "State", "Week", "Events", "Promo_fw", "Promo_bw", "StateHoliday_fw", "StateHoliday_bw", "SchoolHoliday_fw", "SchoolHoliday_bw"],
"cats_mh": [],
"conts": ["CompetitionDistance", "Max_TemperatureC", "Mean_TemperatureC", "Min_TemperatureC", "Max_Humidity", "Mean_Humidity", "Min_Humidity", "Max_Wind_SpeedKm_h", "Mean_Wind_SpeedKm_h", "CloudCover", "trend", "trend_DE", "AfterStateHoliday", "BeforeStateHoliday", "Promo", "SchoolHoliday"],
"labels": ["Sales"]
}
15 changes: 15 additions & 0 deletions nvtabular/tools/__init__.py
@@ -0,0 +1,15 @@
#
# Copyright (c) 2020, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Expose DatasetInspector at the package level so that
# `from nvtabular.tools import DatasetInspector` works
from nvtabular.tools.dataset_inspector import DatasetInspector
137 changes: 137 additions & 0 deletions nvtabular/tools/dataset_inspector.py
@@ -0,0 +1,137 @@
#
# Copyright (c) 2020, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json

import fsspec
import numpy as np
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

from nvtabular.io import Dataset


# Helper encoder so that json can serialize NumPy scalar and array types
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.bool_):
            return bool(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super().default(obj)
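
# Illustrative usage: json.dumps({"cardinality": np.int64(42)}) raises
# TypeError, while json.dumps({"cardinality": np.int64(42)}, cls=NpEncoder)
# returns '{"cardinality": 42}'.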


class DatasetInspector:
    """
    Analyzes an existing dataset to extract its statistics.
    The parameters below are the arguments of the `inspect` method.

    Parameters
    ----------
    path : str, list of str, or <dask.dataframe|cudf|pd>.DataFrame
        Dataset path (or list of paths), or a DataFrame. If a string,
        it should specify a file or directory path. If a directory
        path, the directory structure must be flat (nested directories
        are not yet supported).
    dataset_format : str
        Dataset format ("csv" or "parquet").
    columns_dict : dict
        Dictionary mapping the column types ("cats", "cats_mh", "conts",
        "labels") to lists of column names.
    output_file : str
        Path of the output JSON file.
    """

    def __get_stats(self, ddf, col, data, col_type):
        data[col] = {}

        # Get dtype; convert cat-strings and cat_mh-lists to their element
        # lengths so that min/max/mean are well defined. For string columns,
        # cardinality is computed on the original values before conversion.
        data[col]["dtype"] = str(ddf[col].dtype)
        if data[col]["dtype"] == "object":
            if col_type == "cat":
                data[col]["dtype"] = "string"
                data[col]["cardinality"] = ddf[col].nunique().compute()
                ddf[col] = ddf[col].map_partitions(lambda x: x.str.len())
            elif col_type == "cat_mh":
                data[col]["dtype"] = "list"
                ddf[col] = ddf[col].map_partitions(lambda x: x.list.len())

        # Get percentage of nans for all columns
        data[col]["nans_%"] = 100 * (1 - ddf[col].count().compute() / len(ddf[col]))

        # Get cardinality (for list columns this counts distinct list lengths)
        if "cardinality" not in data[col]:
            data[col]["cardinality"] = ddf[col].nunique().compute()

        # Get max/min/mean for cat, cat_mh, and cont (for converted columns
        # these are statistics of the string/list lengths)
        if col_type != "label":
            data[col]["min"] = ddf[col].min().compute()
            data[col]["max"] = ddf[col].max().compute()
            if col_type == "cont":
                data[col]["mean"] = ddf[col].mean().compute()
                # For conts also get std
                data[col]["std"] = ddf[col].std().compute()
            else:
                data[col]["avg"] = int(ddf[col].mean().compute())

    def inspect(self, path, dataset_format, columns_dict, output_file):
        # Get dataset columns
        cats = columns_dict["cats"]
        cats_mh = columns_dict["cats_mh"]
        conts = columns_dict["conts"]
        labels = columns_dict["labels"]

        # Create the Dask cluster that will run the computations
        cluster = LocalCUDACluster()
        client = Client(cluster)

        # Load the dataset as a dask dataframe
        dataset = Dataset(path, engine=dataset_format)
        ddf = dataset.to_ddf()

        # Dictionary to store collected information
        data = {}
        # Store general info
        data["num_rows"] = ddf.shape[0].compute()
        data["cats"] = cats
        data["cats_mh"] = cats_mh
        data["conts"] = conts
        data["labels"] = labels

        # Get categorical columns stats
        for col in cats:
            self.__get_stats(ddf, col, data, "cat")

        # Get categorical multihot columns stats
        for col in cats_mh:
            self.__get_stats(ddf, col, data, "cat_mh")

        # Get continuous columns stats
        for col in conts:
            self.__get_stats(ddf, col, data, "cont")

        # Get label columns stats
        for col in labels:
            self.__get_stats(ddf, col, data, "label")

        # Write json file
        with fsspec.open(output_file, "w") as outfile:
            json.dump(data, outfile, cls=NpEncoder)

        # Stop the Dask cluster
        client.shutdown()
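
For illustration, the JSON written by `inspect` has roughly the following shape (a minimal sketch with made-up values, one column per type):

{
  "num_rows": 1000000,
  "cats": ["C1"],
  "cats_mh": [],
  "conts": ["I1"],
  "labels": ["label"],
  "C1": {"dtype": "string", "cardinality": 1297, "nans_%": 0.0, "min": 1, "max": 12, "avg": 5},
  "I1": {"dtype": "int64", "nans_%": 0.5, "cardinality": 211, "min": 0, "max": 9, "mean": 3.2, "std": 1.1},
  "label": {"dtype": "int64", "nans_%": 0.0, "cardinality": 2}
}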
6 changes: 6 additions & 0 deletions tools/criteo_config.json
@@ -0,0 +1,6 @@
{
"cats": ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C22", "C23", "C24", "C25", "C26", "C27"],
"cats_mh": ["genres"],
"conts": ["I1", "I2", "I3", "I4", "I5", "I6", "I7", "I8", "I9", "I10", "I11", "I12", "I13", "I14"],
"labels": ["label"]
}