Dataset inspect #510

Closed · wants to merge 6 commits
6 changes: 6 additions & 0 deletions examples/dataset_inspector/criteo_config.json
@@ -0,0 +1,6 @@
{
"cats": ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C22", "C23", "C24", "C25", "C26", "C27"],
"cats_mh": ["genres"],
"conts": ["I1", "I2", "I3", "I4", "I5", "I6", "I7", "I8", "I9", "I10", "I11", "I12", "I13", "I14"],
"labels": ["label"]
}
73 changes: 73 additions & 0 deletions examples/dataset_inspector/inspector_script.py
@@ -0,0 +1,73 @@
#
# Copyright (c) 2020, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import json

import fsspec

from nvtabular.tools import DatasetInspector


def parse_args():
    parser = argparse.ArgumentParser(description="Dataset Inspect Tool")
    # Config file describing the dataset's column types
    parser.add_argument(
        "-c",
        "--config_file",
        type=str,
        required=True,
        help="JSON config file describing the dataset's column types (required)",
    )
    # Dataset path
    parser.add_argument(
        "-d",
        "--data_path",
        type=str,
        required=True,
        help="Input dataset path (required)",
    )
    # Dataset format
    parser.add_argument(
        "-f",
        "--format",
        choices=["csv", "parquet"],
        default="parquet",
        type=str,
        help="Dataset format (default 'parquet')",
    )
    # Output file name
    parser.add_argument(
        "-o",
        "--output_file",
        default="dataset_info.json",
        type=str,
        help="Output file name (default 'dataset_info.json')",
    )
    return parser.parse_args()


def main(args):
    # Read the dataset column-type config
    with fsspec.open(args.config_file) as f:
        config = json.load(f)

    inspector = DatasetInspector()
    inspector.inspect(args.data_path, args.format, config, args.output_file)


if __name__ == "__main__":
    main(parse_args())
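
For reference, a typical invocation of this script might look like the following (the dataset path here is illustrative):

python inspector_script.py -c criteo_config.json -d /data/criteo/ -f parquet -o dataset_info.json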
6 changes: 6 additions & 0 deletions examples/dataset_inspector/movielens_config.json
@@ -0,0 +1,6 @@
{
"cats": ["movieId", "userId"],
"cats_mh": ["genres"],
"conts": [],
"labels": []
}
6 changes: 6 additions & 0 deletions examples/dataset_inspector/rossmann_config.json
@@ -0,0 +1,6 @@
{
"cats": ["Store", "DayOfWeek", "Year", "Month", "Day", "StateHoliday", "CompetitionMonthsOpen", "Promo2Weeks", "StoreType", "Assortment", "PromoInterval", "CompetitionOpenSinceYear", "Promo2SinceYear", "State", "Week", "Events", "Promo_fw", "Promo_bw", "StateHoliday_fw", "StateHoliday_bw", "SchoolHoliday_fw", "SchoolHoliday_bw"],
"cats_mh": [],
"conts": ["CompetitionDistance", "Max_TemperatureC", "Mean_TemperatureC", "Min_TemperatureC", "Max_Humidity", "Mean_Humidity", "Min_Humidity", "Max_Wind_SpeedKm_h", "Mean_Wind_SpeedKm_h", "CloudCover", "trend", "trend_DE", "AfterStateHoliday", "BeforeStateHoliday", "Promo", "SchoolHoliday"],
"labels": ["Sales"]
}
15 changes: 15 additions & 0 deletions nvtabular/tools/__init__.py
@@ -0,0 +1,15 @@
#
# Copyright (c) 2020, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Expose DatasetInspector at the package level so that
# `from nvtabular.tools import DatasetInspector` works
from nvtabular.tools.dataset_inspector import DatasetInspector
137 changes: 137 additions & 0 deletions nvtabular/tools/dataset_inspector.py
@@ -0,0 +1,137 @@
#
# Copyright (c) 2020, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import json

import fsspec
import numpy as np
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

from nvtabular.io import Dataset


# Helper encoder so that json can serialize NumPy scalar and array types
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.bool_):
            return bool(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super().default(obj)
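
# Illustrative usage: json.dumps({"cardinality": np.int64(42)}) raises
# TypeError, while json.dumps({"cardinality": np.int64(42)}, cls=NpEncoder)
# returns '{"cardinality": 42}'.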


class DatasetInspector:
    """
    Analyzes an existing dataset to extract its statistics.
    The parameters below are the arguments of the `inspect` method.

    Parameters
    ----------
    path : str, list of str, or <dask.dataframe|cudf|pd>.DataFrame
        Dataset path (or list of paths), or a DataFrame. If a string,
        it should specify a file or directory path. If a directory
        path, the directory structure must be flat (nested directories
        are not yet supported).
    dataset_format : str
        Dataset format ("csv" or "parquet").
    columns_dict : dict
        Dictionary mapping the column types ("cats", "cats_mh", "conts",
        "labels") to lists of column names.
    output_file : str
        Path of the output JSON file.
    """

    def __get_stats(self, ddf, col, data, col_type):
        data[col] = {}

        # Get dtype; convert cat-strings and cat_mh-lists to their element
        # lengths so that min/max/mean are well defined. For string columns,
        # cardinality is computed on the original values before conversion.
        data[col]["dtype"] = str(ddf[col].dtype)
        if data[col]["dtype"] == "object":
            if col_type == "cat":
                data[col]["dtype"] = "string"
                data[col]["cardinality"] = ddf[col].nunique().compute()
                ddf[col] = ddf[col].map_partitions(lambda x: x.str.len())
            elif col_type == "cat_mh":
                data[col]["dtype"] = "list"
                ddf[col] = ddf[col].map_partitions(lambda x: x.list.len())

        # Get percentage of nans for all columns
        data[col]["nans_%"] = 100 * (1 - ddf[col].count().compute() / len(ddf[col]))

        # Get cardinality (for list columns this counts distinct list lengths)
        if "cardinality" not in data[col]:
            data[col]["cardinality"] = ddf[col].nunique().compute()

        # Get max/min/mean for cat, cat_mh, and cont (for converted columns
        # these are statistics of the string/list lengths)
        if col_type != "label":
            data[col]["min"] = ddf[col].min().compute()
            data[col]["max"] = ddf[col].max().compute()
            if col_type == "cont":
                data[col]["mean"] = ddf[col].mean().compute()
                # For conts also get std
                data[col]["std"] = ddf[col].std().compute()
            else:
                data[col]["avg"] = int(ddf[col].mean().compute())

    def inspect(self, path, dataset_format, columns_dict, output_file):
        # Get dataset columns
        cats = columns_dict["cats"]
        cats_mh = columns_dict["cats_mh"]
        conts = columns_dict["conts"]
        labels = columns_dict["labels"]

        # Create the Dask cluster that will run the computations
        cluster = LocalCUDACluster()
        client = Client(cluster)

        # Load the dataset as a dask dataframe
        dataset = Dataset(path, engine=dataset_format)
        ddf = dataset.to_ddf()

        # Dictionary to store collected information
        data = {}
        # Store general info
        data["num_rows"] = ddf.shape[0].compute()
        data["cats"] = cats
        data["cats_mh"] = cats_mh
        data["conts"] = conts
        data["labels"] = labels

        # Get categorical columns stats
        for col in cats:
            self.__get_stats(ddf, col, data, "cat")

        # Get categorical multihot columns stats
        for col in cats_mh:
            self.__get_stats(ddf, col, data, "cat_mh")

        # Get continuous columns stats
        for col in conts:
            self.__get_stats(ddf, col, data, "cont")

        # Get label columns stats
        for col in labels:
            self.__get_stats(ddf, col, data, "label")

        # Write json file
        with fsspec.open(output_file, "w") as outfile:
            json.dump(data, outfile, cls=NpEncoder)

        # Stop the Dask cluster
        client.shutdown()
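
For illustration, the JSON written by `inspect` has roughly the following shape (a minimal sketch with made-up values, one column per type):

{
  "num_rows": 1000000,
  "cats": ["C1"],
  "cats_mh": [],
  "conts": ["I1"],
  "labels": ["label"],
  "C1": {"dtype": "string", "cardinality": 1297, "nans_%": 0.0, "min": 1, "max": 12, "avg": 5},
  "I1": {"dtype": "int64", "nans_%": 0.5, "cardinality": 211, "min": 0, "max": 9, "mean": 3.2, "std": 1.1},
  "label": {"dtype": "int64", "nans_%": 0.0, "cardinality": 2}
}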
6 changes: 6 additions & 0 deletions tools/criteo_config.json
@@ -0,0 +1,6 @@
{
"cats": ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C22", "C23", "C24", "C25", "C26", "C27"],
"cats_mh": ["genres"],
"conts": ["I1", "I2", "I3", "I4", "I5", "I6", "I7", "I8", "I9", "I10", "I11", "I12", "I13", "I14"],
"labels": ["label"]
}