diff --git a/examples/dataset_inspector/criteo_config.json b/examples/dataset_inspector/criteo_config.json new file mode 100644 index 00000000000..fc076e71ba0 --- /dev/null +++ b/examples/dataset_inspector/criteo_config.json @@ -0,0 +1,6 @@ +{ + "cats": ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C22", "C23", "C24", "C25", "C26"], + "cats_mh": [], + "conts": ["I1", "I2", "I3", "I4", "I5", "I6", "I7", "I8", "I9", "I10", "I11", "I12", "I13"], + "labels": ["label"] +} \ No newline at end of file diff --git a/examples/dataset_inspector/inspector_script.py b/examples/dataset_inspector/inspector_script.py new file mode 100644 index 00000000000..ef2dae90e16 --- /dev/null +++ b/examples/dataset_inspector/inspector_script.py @@ -0,0 +1,73 @@ +# +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse +import json + +import fsspec + +import nvtabular as nvt + + +def parse_args(): + parser = argparse.ArgumentParser(description=("Dataset Inspect Tool")) + # Config file + parser.add_argument( + "-c", + "--config_file", + type=str, + help="Dataset columns type (Required)", + ) + # Dataset path + parser.add_argument( + "-d", + "--data_path", + default="0", + type=str, + help="Input dataset path (Required)", + ) + # Dataset format + parser.add_argument( + "-f", + "--format", + choices=["csv", "parquet"], + default="parquet", + type=str, + help="Dataset format (Default 'parquet')", + ) + # Output file name + parser.add_argument( + "-o", + "--output_file", + default="dataset_info.json", + type=str, + help="Output file name (Default 'dataset_info.json')", + ) + args = parser.parse_args() + return args + + +def main(args): + # Get dataset columns + with fsspec.open(args.config_file) as f: + config = json.load(f) + + a = nvt.tools.DatasetInspector() + a.inspect(args.data_path, args.format, config, args.output_file) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/examples/dataset_inspector/movielens_config.json b/examples/dataset_inspector/movielens_config.json new file mode 100644 index 00000000000..e32be848fc9 --- /dev/null +++ b/examples/dataset_inspector/movielens_config.json @@ -0,0 +1,6 @@ +{ + "cats": ["movieId", "userId"], + "cats_mh": ["genres"], + "conts": [], + "labels": [] +} \ No newline at end of file diff --git a/examples/dataset_inspector/rossmann_config.json b/examples/dataset_inspector/rossmann_config.json new file mode 100644 index 00000000000..cabdbfe995f --- /dev/null +++ b/examples/dataset_inspector/rossmann_config.json @@ -0,0 +1,6 @@ +{ + "cats": ["Store", "DayOfWeek", "Year", "Month", "Day", "StateHoliday", "CompetitionMonthsOpen", "Promo2Weeks", "StoreType", "Assortment", "PromoInterval", "CompetitionOpenSinceYear", "Promo2SinceYear", "State", "Week", "Events", "Promo_fw", "Promo_bw", "StateHoliday_fw", 
"StateHoliday_bw", "SchoolHoliday_fw", "SchoolHoliday_bw"], + "cats_mh": [], + "conts": ["CompetitionDistance", "Max_TemperatureC", "Mean_TemperatureC", "Min_TemperatureC", "Max_Humidity", "Mean_Humidity", "Min_Humidity", "Max_Wind_SpeedKm_h", "Mean_Wind_SpeedKm_h", "CloudCover", "trend", "trend_DE", "AfterStateHoliday", "BeforeStateHoliday", "Promo", "SchoolHoliday"], + "labels": ["Sales"] +} \ No newline at end of file diff --git a/nvtabular/tools/__init__.py b/nvtabular/tools/__init__.py new file mode 100644 index 00000000000..f55aea7739f --- /dev/null +++ b/nvtabular/tools/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/nvtabular/tools/dataset_inspector.py b/nvtabular/tools/dataset_inspector.py new file mode 100644 index 00000000000..2774a4bb2e4 --- /dev/null +++ b/nvtabular/tools/dataset_inspector.py @@ -0,0 +1,137 @@ +# +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import json + +import fsspec +import numpy as np +from dask.distributed import Client +from dask_cuda import LocalCUDACluster + +from nvtabular.io import Dataset + + +# Class to help Json to serialize the data +class NpEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.bool_): + return bool(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return super(NpEncoder, self).default(obj) + + +class DatasetInspector: + """ + Analyzes an existing dataset to extract its statistics. + + Parameters + ----------- + path_or_source : str, list of str, or .DataFrame + Dataset path (or list of paths), or a DataFrame. If string, + should specify a specific file or directory path. If this is a + directory path, the directory structure must be flat (nested + directories are not yet supported). 
+ columns_dict: dictionary + Dictionary indicating the different columns type + """ + + def __get_stats(self, ddf, col, data, col_type): + data[col] = {} + + # Get dtype and convert cat-strings and cat_mh-lists + data[col]["dtype"] = str(ddf[col].dtype) + if data[col]["dtype"] == "object": + if col_type == "cat": + data[col]["dtype"] = "string" + ddf[col] = ddf[col].map_partitions(lambda x: x.str.len()) + elif col_type == "cat_mh": + data[col]["dtype"] = "list" + ddf[col] = ddf[col].map_partitions(lambda x: x.list.len()) + ddf[col].compute() + + # Get percentage of nan for all + data[col]["nans_%"] = 100 * (1 - ddf[col].count().compute() / len(ddf[col])) + + # Get cardinality for cat and label + data[col]["cardinality"] = ddf[col].nunique().compute() + + # Get max/min/mean for cat, cat_mh, and cont + if col_type != "label": + data[col]["min"] = ddf[col].min().compute() + data[col]["max"] = ddf[col].max().compute() + if col_type == "cont": + data[col]["mean"] = ddf[col].mean().compute() + else: + data[col]["avg"] = int(ddf[col].mean().compute()) + + # For conts get also std + if col_type == "cont": + data[col]["std"] = ddf[col].std().compute() + + def inspect(self, path, dataset_format, columns_dict, output_file): + # Get dataset columns + cats = columns_dict["cats"] + cats_mh = columns_dict["cats_mh"] + conts = columns_dict["conts"] + labels = columns_dict["labels"] + + # Get dataset + dataset = Dataset(path, engine=dataset_format) + ddf = dataset.to_ddf() + print(ddf.dtypes) + + # Create Dask Cluster + cluster = LocalCUDACluster() + client = Client(cluster) + print(client) + + # Dictionary to store collected information + data = {} + # Store general info + data["num_rows"] = ddf.shape[0].compute() + data["cats"] = cats + data["cats_mh"] = cats_mh + data["conts"] = conts + data["labels"] = labels + + # Get categoricals columns stats + for col in cats: + self.__get_stats(ddf, col, data, "cat") + + # Get categoricals multihot columns stats + for col in cats_mh: + 
self.__get_stats(ddf, col, data, "cat_mh") + + # Get continuous columns stats + for col in conts: + self.__get_stats(ddf, col, data, "cont") + + # Get labels columns stats + for col in labels: + self.__get_stats(ddf, col, data, "label") + + # Write json file + with fsspec.open(output_file, "w") as outfile: + json.dump(data, outfile, cls=NpEncoder) + + # Stop Dask Cluster + client.shutdown() diff --git a/tools/criteo_config.json b/tools/criteo_config.json new file mode 100644 index 00000000000..fc076e71ba0 --- /dev/null +++ b/tools/criteo_config.json @@ -0,0 +1,6 @@ +{ + "cats": ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C22", "C23", "C24", "C25", "C26"], + "cats_mh": [], + "conts": ["I1", "I2", "I3", "I4", "I5", "I6", "I7", "I8", "I9", "I10", "I11", "I12", "I13"], + "labels": ["label"] +} \ No newline at end of file diff --git a/tools/inspect_dataset.py b/tools/inspect_dataset.py new file mode 100644 index 00000000000..7c1e5c80571 --- /dev/null +++ b/tools/inspect_dataset.py @@ -0,0 +1,168 @@ +# +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import argparse +import json + +import fsspec +import numpy as np +from dask.distributed import Client +from dask_cuda import LocalCUDACluster + +import nvtabular as nvt + + +# Class to help Json to serialize the data +class NpEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.bool_): + return bool(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return super(NpEncoder, self).default(obj) + + +def parse_args(): + parser = argparse.ArgumentParser(description=("Dataset Inspect Tool")) + # Config file + parser.add_argument( + "-c", + "--config_file", + type=str, + help="Dataset columns type (Required)", + ) + # Dataset path + parser.add_argument( + "-d", + "--data_path", + default="0", + type=str, + help="Input dataset path (Required)", + ) + # Dataset format + parser.add_argument( + "-f", + "--format", + choices=["csv", "parquet"], + default="parquet", + type=str, + help="Dataset format (Default 'parquet')", + ) + # Output file name + parser.add_argument( + "-o", + "--output_file", + default="dataset_info.json", + type=str, + help="Output file name (Default 'dataset_info.json')", + ) + args = parser.parse_args() + return args + + +def get_stats(ddf, col, data, col_type): + data[col] = {} + + # Get dtype and convert cat-strings and cat_mh-lists + data[col]["dtype"] = str(ddf[col].dtype) + if data[col]["dtype"] == "object": + if col_type == "cat": + data[col]["dtype"] = "string" + ddf[col] = ddf[col].map_partitions(lambda x: x.str.len()) + elif col_type == "cat_mh": + data[col]["dtype"] = "list" + ddf[col] = ddf[col].map_partitions(lambda x: x.list.len()) + ddf[col].compute() + + # Get percentage of nan for all + data[col]["nans_%"] = 100 * (1 - ddf[col].count().compute() / len(ddf[col])) + + # Get cardinality for cat and label + data[col]["cardinality"] = ddf[col].nunique().compute() + + # Get 
max/min/mean for cat, cat_mh, and cont + if col_type != "label": + data[col]["min"] = ddf[col].min().compute() + data[col]["max"] = ddf[col].max().compute() + if col_type == "cont": + data[col]["mean"] = ddf[col].mean().compute() + else: + data[col]["avg"] = int(ddf[col].mean().compute()) + + # For conts get also std + if col_type == "cont": + data[col]["std"] = ddf[col].std().compute() + + +def main(args): + # Get dataset columns + with fsspec.open(args.config_file) as f: + config = json.load(f) + cats = config["cats"] + cats_mh = config["cats_mh"] + conts = config["conts"] + labels = config["labels"] + + # Get dataset + dataset = nvt.Dataset(args.data_path, engine=args.format) + ddf = dataset.to_ddf() + print(ddf.dtypes) + + # Create Dask Cluster + cluster = LocalCUDACluster() + client = Client(cluster) + print(client) + + # Dictionary to store collected information + data = {} + # Store general info + data["num_rows"] = ddf.shape[0].compute() + data["cats"] = cats + data["cats_mh"] = cats_mh + data["conts"] = conts + data["labels"] = labels + + # Get categoricals columns stats + for col in cats: + get_stats(ddf, col, data, "cat") + + # Get categoricals multihot columns stats + for col in cats_mh: + get_stats(ddf, col, data, "cat_mh") + + # Get continuous columns stats + for col in conts: + get_stats(ddf, col, data, "cont") + + # Get labels columns stats + for col in labels: + get_stats(ddf, col, data, "label") + + # Write json file + with fsspec.open(args.output_file, "w") as outfile: + json.dump(data, outfile, cls=NpEncoder) + + # Stop Dask Cluster + client.shutdown() + + +if __name__ == "__main__": + main(parse_args()) diff --git a/tools/movielens_config.json b/tools/movielens_config.json new file mode 100644 index 00000000000..e32be848fc9 --- /dev/null +++ b/tools/movielens_config.json @@ -0,0 +1,6 @@ +{ + "cats": ["movieId", "userId"], + "cats_mh": ["genres"], + "conts": [], + "labels": [] +} \ No newline at end of file diff --git a/tools/rossmann_config.json 
b/tools/rossmann_config.json new file mode 100644 index 00000000000..693a07c1689 --- /dev/null +++ b/tools/rossmann_config.json @@ -0,0 +1,6 @@ +{ + "cats": ["Store", "DayOfWeek", "Year", "Month", "Day", "StateHoliday", "CompetitionMonthsOpen", "Promo2Weeks", "StoreType", "Assortment", "PromoInterval", "CompetitionOpenSinceYear", "Promo2SinceYear", "State", "Week", "Events", "Promo_fw", "Promo_bw", "StateHoliday_fw", "StateHoliday_bw", "SchoolHoliday_fw", "SchoolHoliday_bw"], + "cats_mh": [], + "conts": ["CompetitionDistance", "Max_TemperatureC", "Mean_TemperatureC", "Min_TemperatureC", "Max_Humidity", "Mean_Humidity", "Min_Humidity", "Max_Wind_SpeedKm_h", "Mean_Wind_SpeedKm_h", "CloudCover", "trend", "trend_DE", "AfterStateHoliday", "BeforeStateHoliday", "Promo", "SchoolHoliday"], + "labels": ["Sales"] +} \ No newline at end of file