Add JSON fuzz tests with varying function parameters (#6385)

* Add initial set of params for the JSON writer.
* Add multi-parameter support in the JSON writer and reader fuzz workers.
* Add multi-parameter test combinations in the JSON reader and writer tests.
* Rename `_df` to `_current_buffer`.
* Add misc fixes to the parquet fuzz tests.
* Adapt the JSON tests to the new changes and update the random parameter generation logic.
* Clean up code and introduce the `ALL_POSSIBLE_VALUES` constant.
rapidsai · Oct 26, 2020 · 4205104 · 4205104
1 parent 7e8eb45
commit 4205104
Show file tree

Hide file tree

Showing 12 changed files with 345 additions and 132 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@
 - PR #6430 Add struct type support to `to_arrow` and `from_arrow`
 
 - PR #6384 Add CSV fuzz tests with varying function parameters
+- PR #6385 Add JSON fuzz tests with varying function parameters
 - PR #6398 Remove function constructor macros in parquet reader
 - PR #6432 Add dictionary support to `cudf::upper_bound` and `cudf::lower_bound`
 - PR #6461 Replace index type-dispatch call with indexalator in cudf::scatter

diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py
@@ -8,11 +8,12 @@
 import cudf
 from cudf._fuzz_testing.io import IOFuzz
 from cudf._fuzz_testing.utils import (
+    ALL_POSSIBLE_VALUES,
     _generate_rand_meta,
-    pandas_dtypes_to_cudf_dtypes,
     pyarrow_to_pandas,
 )
 from cudf.tests import dataset_generator as dg
+from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes
 
 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",
@@ -70,54 +71,56 @@ def write_data(self, file_name):
         if self._current_buffer is not None:
             self._current_buffer.to_csv(file_name + "_crash.csv")
 
-    def get_rand_params(self, params):
+    def set_rand_params(self, params):
         params_dict = {}
         for param, values in params.items():
-            if param == "usecols" and values is None:
-                col_size = self._rand(len(self._df.columns))
-                col_val = np.random.choice(
-                    [
-                        None,
-                        np.unique(
-                            np.random.choice(self._df.columns, col_size)
-                        ),
-                    ]
-                )
-                params_dict[param] = (
-                    col_val if col_val is None else list(col_val)
-                )
-            elif param == "dtype" and values is None:
-                dtype_val = np.random.choice([None, self._df.dtypes.to_dict()])
-                if dtype_val is not None:
-                    dtype_val = {
-                        col_name: "category"
-                        if cudf.utils.dtypes.is_categorical_dtype(dtype)
-                        else pandas_dtypes_to_cudf_dtypes[dtype]
-                        for col_name, dtype in dtype_val.items()
-                    }
-                params_dict[param] = dtype_val
-            elif param == "header" and values is None:
-                header_val = np.random.choice(
-                    ["infer", np.random.randint(low=0, high=len(self._df))]
-                )
-                params_dict[param] = header_val
-            elif param == "skiprows" and values is None:
-                params_dict[param] = np.random.randint(
-                    low=0, high=len(self._df)
-                )
-            elif param == "skipfooter" and values is None:
-                params_dict[param] = np.random.randint(
-                    low=0, high=len(self._df)
-                )
-            elif param == "nrows" and values is None:
-                nrows_val = np.random.choice(
-                    [None, np.random.randint(low=0, high=len(self._df))]
-                )
-                params_dict[param] = nrows_val
+            if values == ALL_POSSIBLE_VALUES:
+                if param == "usecols":
+                    col_size = self._rand(len(self._df.columns))
+                    col_val = np.random.choice(
+                        [
+                            None,
+                            np.unique(
+                                np.random.choice(self._df.columns, col_size)
+                            ),
+                        ]
+                    )
+                    params_dict[param] = (
+                        col_val if col_val is None else list(col_val)
+                    )
+                elif param == "dtype":
+                    dtype_val = np.random.choice(
+                        [None, self._df.dtypes.to_dict()]
+                    )
+                    if dtype_val is not None:
+                        dtype_val = {
+                            col_name: "category"
+                            if cudf.utils.dtypes.is_categorical_dtype(dtype)
+                            else pandas_dtypes_to_cudf_dtypes[dtype]
+                            for col_name, dtype in dtype_val.items()
+                        }
+                    params_dict[param] = dtype_val
+                elif param == "header":
+                    header_val = np.random.choice(
+                        ["infer", np.random.randint(low=0, high=len(self._df))]
+                    )
+                    params_dict[param] = header_val
+                elif param == "skiprows":
+                    params_dict[param] = np.random.randint(
+                        low=0, high=len(self._df)
+                    )
+                elif param == "skipfooter":
+                    params_dict[param] = np.random.randint(
+                        low=0, high=len(self._df)
+                    )
+                elif param == "nrows":
+                    nrows_val = np.random.choice(
+                        [None, np.random.randint(low=0, high=len(self._df))]
+                    )
+                    params_dict[param] = nrows_val
             else:
                 params_dict[param] = np.random.choice(values)
         self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
-        return params_dict
 
 
 class CSVWriter(IOFuzz):
@@ -169,28 +172,28 @@ def write_data(self, file_name):
         if self._current_buffer is not None:
             self._current_buffer.to_csv(file_name + "_crash.csv")
 
-    def get_rand_params(self, params):
+    def set_rand_params(self, params):
         params_dict = {}
         for param, values in params.items():
-            if param == "columns" and values is None:
-                col_size = self._rand(len(self._current_buffer.columns))
-                params_dict[param] = list(
-                    np.unique(
-                        np.random.choice(
-                            self._current_buffer.columns, col_size
+            if values == ALL_POSSIBLE_VALUES:
+                if param == "columns":
+                    col_size = self._rand(len(self._current_buffer.columns))
+                    params_dict[param] = list(
+                        np.unique(
+                            np.random.choice(
+                                self._current_buffer.columns, col_size
+                            )
                         )
                     )
-                )
-            elif param == "chunksize" and values is None:
-                params_dict[param] = np.random.choice(
-                    [
-                        None,
-                        np.random.randint(
-                            low=1, high=max(1, len(self._current_buffer))
-                        ),
-                    ]
-                )
+                elif param == "chunksize":
+                    params_dict[param] = np.random.choice(
+                        [
+                            None,
+                            np.random.randint(
+                                low=1, high=max(1, len(self._current_buffer))
+                            ),
+                        ]
+                    )
             else:
                 params_dict[param] = np.random.choice(values)
         self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
-        return params_dict
diff --git a/python/cudf/cudf/_fuzz_testing/fuzzer.py b/python/cudf/cudf/_fuzz_testing/fuzzer.py
@@ -86,7 +86,8 @@ def start(self):
                 if self.params is None:
                     self._target(file_name)
                 else:
-                    kwargs = self._data_handler.get_rand_params(self.params)
+                    self._data_handler.set_rand_params(self.params)
+                    kwargs = self._data_handler._current_params["test_kwargs"]
                     logging.info(f"Parameters passed: {str(kwargs)}")
                     self._target(file_name, **kwargs)
             except KeyboardInterrupt:

diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py
@@ -55,7 +55,7 @@ def _load_params(self, path):
 
     @staticmethod
     def _rand(n):
-        return random.randrange(0, n)
+        return random.randrange(0, n + 1)
 
     def generate_input(self):
         raise NotImplementedError("Must be implemented by inherited class")
@@ -80,17 +80,20 @@ def get_next_regression_params(self):
         self._current_params = copy.copy(param)
         return dtypes_meta, num_rows, num_cols, seed
 
-    def get_rand_params(self, params):
+    def set_rand_params(self, params):
         params_dict = {
             param: np.random.choice(values) for param, values in params.items()
         }
         self._current_params["test_kwargs"] = self.process_kwargs(
             params_dict=params_dict
         )
-        return params_dict
 
     def process_kwargs(self, params_dict):
         return {
-            key: bool(value) if isinstance(value, np.bool_) else value
+            key: bool(value)
+            if isinstance(value, np.bool_)
+            else str(value)
+            if isinstance(value, np.dtype)
+            else value
             for key, value in params_dict.items()
         }
diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py
@@ -3,11 +3,19 @@
 
 import logging
 import random
+from collections import abc as abc
+
+import numpy as np
 
 import cudf
 from cudf._fuzz_testing.io import IOFuzz
-from cudf._fuzz_testing.utils import _generate_rand_meta, pyarrow_to_pandas
+from cudf._fuzz_testing.utils import (
+    ALL_POSSIBLE_VALUES,
+    _generate_rand_meta,
+    pyarrow_to_pandas,
+)
 from cudf.tests import dataset_generator as dg
+from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes
 
 logging.basicConfig(
     format="%(asctime)s %(levelname)-8s %(message)s",
@@ -16,6 +24,20 @@
 )
 
 
+def _get_dtype_param_value(dtype_val):
+    if dtype_val is not None and isinstance(dtype_val, abc.Mapping):
+        processed_dtypes = {}
+        for col_name, dtype in dtype_val.items():
+            if cudf.utils.dtypes.is_categorical_dtype(dtype):
+                processed_dtypes[col_name] = "category"
+            else:
+                processed_dtypes[col_name] = str(
+                    pandas_dtypes_to_cudf_dtypes.get(dtype, dtype)
+                )
+        return processed_dtypes
+    return dtype_val
+
+
 class JSONReader(IOFuzz):
     def __init__(
         self,
@@ -42,12 +64,18 @@ def generate_input(self):
         else:
             seed = random.randint(0, 2 ** 32 - 1)
             random.seed(seed)
-            dtypes_list = list(cudf.utils.dtypes.ALL_TYPES)
+            dtypes_list = list(
+                cudf.utils.dtypes.ALL_TYPES
+                # https://github.com/pandas-dev/pandas/issues/20599
+                - {"uint64"}
+                # TODO: Remove DATETIME_TYPES after this is fixed:
+                # https://github.com/rapidsai/cudf/issues/6586
+                - set(cudf.utils.dtypes.DATETIME_TYPES)
+            )
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
                 self, dtypes_list
             )
             self._current_params["dtypes_meta"] = dtypes_meta
-            self._current_params["file_name"] = self._file_name
             self._current_params["seed"] = seed
             self._current_params["num_rows"] = num_rows
             self._current_params["num_columns"] = num_cols
@@ -57,10 +85,28 @@ def generate_input(self):
         )
         table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
         df = pyarrow_to_pandas(table)
-
+        self._current_buffer = df
         logging.info(f"Shape of DataFrame generated: {df.shape}")
 
-        return df.to_json()
+        return df.to_json(orient="records", lines=True)
+
+    def write_data(self, file_name):
+        if self._current_buffer is not None:
+            self._current_buffer.to_json(
+                file_name + "_crash_json.json", orient="records", lines=True
+            )
+
+    def set_rand_params(self, params):
+        params_dict = {}
+        for param, values in params.items():
+            if param == "dtype" and values == ALL_POSSIBLE_VALUES:
+                dtype_val = np.random.choice(
+                    [True, self._current_buffer.dtypes.to_dict()]
+                )
+                params_dict[param] = _get_dtype_param_value(dtype_val)
+            else:
+                params_dict[param] = np.random.choice(values)
+        self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
 
 
 class JSONWriter(IOFuzz):
@@ -89,7 +135,14 @@ def generate_input(self):
         else:
             seed = random.randint(0, 2 ** 32 - 1)
             random.seed(seed)
-            dtypes_list = list(cudf.utils.dtypes.ALL_TYPES)
+            dtypes_list = list(
+                cudf.utils.dtypes.ALL_TYPES
+                # https://github.com/pandas-dev/pandas/issues/20599
+                - {"uint64"}
+                # TODO: Remove DATETIME_TYPES after this is fixed:
+                # https://github.com/rapidsai/cudf/issues/6586
+                - set(cudf.utils.dtypes.DATETIME_TYPES)
+            )
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
                 self, dtypes_list
             )
@@ -101,9 +154,27 @@ def generate_input(self):
             f"Generating DataFrame with rows: {num_rows} "
             f"and columns: {num_cols}"
         )
-        df = cudf.DataFrame.from_arrow(
-            dg.rand_dataframe(dtypes_meta, num_rows, seed)
-        )
-        logging.info(f"Shape of DataFrame generated: {df.shape}")
+        table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
+        df = pyarrow_to_pandas(table)
 
+        logging.info(f"Shape of DataFrame generated: {df.shape}")
+        self._current_buffer = df
         return df
+
+    def write_data(self, file_name):
+        if self._current_buffer is not None:
+            self._current_buffer.to_json(
+                file_name + "_crash_json.json", lines=True, orient="records"
+            )
+
+    def set_rand_params(self, params):
+        params_dict = {}
+        for param, values in params.items():
+            if param == "dtype" and values == ALL_POSSIBLE_VALUES:
+                dtype_val = np.random.choice(
+                    [True, self._current_buffer.dtypes.to_dict()]
+                )
+                params_dict[param] = _get_dtype_param_value(dtype_val)
+            else:
+                params_dict[param] = np.random.choice(values)
+        self._current_params["test_kwargs"] = self.process_kwargs(params_dict)