Skip to content

Commit

Permalink
Add JSON fuzz tests with varying function parameters (#6385)
Browse files Browse the repository at this point in the history
* add initial set of params for json writer

* Add multi-parameter support in JSON writer and reader Fuzz workers.

* Add multi-parameter test combinations in json reader and writer tests.

* rename _df to _current_buffer

* Add misc fixes to parquet fuzz tests.

* Adapt json tests to new changes and update random parameter generation logic.

* Clean up code and introduce the "ALL_POSSIBLE_VALUES" constant.
  • Loading branch information
galipremsagar authored Oct 26, 2020
1 parent 7e8eb45 commit 4205104
Show file tree
Hide file tree
Showing 12 changed files with 345 additions and 132 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- PR #6430 Add struct type support to `to_arrow` and `from_arrow`

- PR #6384 Add CSV fuzz tests with varying function parameters
- PR #6385 Add JSON fuzz tests with varying function parameters
- PR #6398 Remove function constructor macros in parquet reader
- PR #6432 Add dictionary support to `cudf::upper_bound` and `cudf::lower_bound`
- PR #6461 Replace index type-dispatch call with indexalator in cudf::scatter
Expand Down
127 changes: 65 additions & 62 deletions python/cudf/cudf/_fuzz_testing/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
import cudf
from cudf._fuzz_testing.io import IOFuzz
from cudf._fuzz_testing.utils import (
ALL_POSSIBLE_VALUES,
_generate_rand_meta,
pandas_dtypes_to_cudf_dtypes,
pyarrow_to_pandas,
)
from cudf.tests import dataset_generator as dg
from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes

logging.basicConfig(
format="%(asctime)s %(levelname)-8s %(message)s",
Expand Down Expand Up @@ -70,54 +71,56 @@ def write_data(self, file_name):
if self._current_buffer is not None:
self._current_buffer.to_csv(file_name + "_crash.csv")

def get_rand_params(self, params):
def set_rand_params(self, params):
params_dict = {}
for param, values in params.items():
if param == "usecols" and values is None:
col_size = self._rand(len(self._df.columns))
col_val = np.random.choice(
[
None,
np.unique(
np.random.choice(self._df.columns, col_size)
),
]
)
params_dict[param] = (
col_val if col_val is None else list(col_val)
)
elif param == "dtype" and values is None:
dtype_val = np.random.choice([None, self._df.dtypes.to_dict()])
if dtype_val is not None:
dtype_val = {
col_name: "category"
if cudf.utils.dtypes.is_categorical_dtype(dtype)
else pandas_dtypes_to_cudf_dtypes[dtype]
for col_name, dtype in dtype_val.items()
}
params_dict[param] = dtype_val
elif param == "header" and values is None:
header_val = np.random.choice(
["infer", np.random.randint(low=0, high=len(self._df))]
)
params_dict[param] = header_val
elif param == "skiprows" and values is None:
params_dict[param] = np.random.randint(
low=0, high=len(self._df)
)
elif param == "skipfooter" and values is None:
params_dict[param] = np.random.randint(
low=0, high=len(self._df)
)
elif param == "nrows" and values is None:
nrows_val = np.random.choice(
[None, np.random.randint(low=0, high=len(self._df))]
)
params_dict[param] = nrows_val
if values == ALL_POSSIBLE_VALUES:
if param == "usecols":
col_size = self._rand(len(self._df.columns))
col_val = np.random.choice(
[
None,
np.unique(
np.random.choice(self._df.columns, col_size)
),
]
)
params_dict[param] = (
col_val if col_val is None else list(col_val)
)
elif param == "dtype":
dtype_val = np.random.choice(
[None, self._df.dtypes.to_dict()]
)
if dtype_val is not None:
dtype_val = {
col_name: "category"
if cudf.utils.dtypes.is_categorical_dtype(dtype)
else pandas_dtypes_to_cudf_dtypes[dtype]
for col_name, dtype in dtype_val.items()
}
params_dict[param] = dtype_val
elif param == "header":
header_val = np.random.choice(
["infer", np.random.randint(low=0, high=len(self._df))]
)
params_dict[param] = header_val
elif param == "skiprows":
params_dict[param] = np.random.randint(
low=0, high=len(self._df)
)
elif param == "skipfooter":
params_dict[param] = np.random.randint(
low=0, high=len(self._df)
)
elif param == "nrows":
nrows_val = np.random.choice(
[None, np.random.randint(low=0, high=len(self._df))]
)
params_dict[param] = nrows_val
else:
params_dict[param] = np.random.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
return params_dict


class CSVWriter(IOFuzz):
Expand Down Expand Up @@ -169,28 +172,28 @@ def write_data(self, file_name):
if self._current_buffer is not None:
self._current_buffer.to_csv(file_name + "_crash.csv")

def get_rand_params(self, params):
def set_rand_params(self, params):
params_dict = {}
for param, values in params.items():
if param == "columns" and values is None:
col_size = self._rand(len(self._current_buffer.columns))
params_dict[param] = list(
np.unique(
np.random.choice(
self._current_buffer.columns, col_size
if values == ALL_POSSIBLE_VALUES:
if param == "columns":
col_size = self._rand(len(self._current_buffer.columns))
params_dict[param] = list(
np.unique(
np.random.choice(
self._current_buffer.columns, col_size
)
)
)
)
elif param == "chunksize" and values is None:
params_dict[param] = np.random.choice(
[
None,
np.random.randint(
low=1, high=max(1, len(self._current_buffer))
),
]
)
elif param == "chunksize":
params_dict[param] = np.random.choice(
[
None,
np.random.randint(
low=1, high=max(1, len(self._current_buffer))
),
]
)
else:
params_dict[param] = np.random.choice(values)
self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
return params_dict
3 changes: 2 additions & 1 deletion python/cudf/cudf/_fuzz_testing/fuzzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ def start(self):
if self.params is None:
self._target(file_name)
else:
kwargs = self._data_handler.get_rand_params(self.params)
self._data_handler.set_rand_params(self.params)
kwargs = self._data_handler._current_params["test_kwargs"]
logging.info(f"Parameters passed: {str(kwargs)}")
self._target(file_name, **kwargs)
except KeyboardInterrupt:
Expand Down
11 changes: 7 additions & 4 deletions python/cudf/cudf/_fuzz_testing/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def _load_params(self, path):

@staticmethod
def _rand(n):
return random.randrange(0, n)
return random.randrange(0, n + 1)

def generate_input(self):
raise NotImplementedError("Must be implemented by inherited class")
Expand All @@ -80,17 +80,20 @@ def get_next_regression_params(self):
self._current_params = copy.copy(param)
return dtypes_meta, num_rows, num_cols, seed

def get_rand_params(self, params):
def set_rand_params(self, params):
params_dict = {
param: np.random.choice(values) for param, values in params.items()
}
self._current_params["test_kwargs"] = self.process_kwargs(
params_dict=params_dict
)
return params_dict

def process_kwargs(self, params_dict):
return {
key: bool(value) if isinstance(value, np.bool_) else value
key: bool(value)
if isinstance(value, np.bool_)
else str(value)
if isinstance(value, np.dtype)
else value
for key, value in params_dict.items()
}
91 changes: 81 additions & 10 deletions python/cudf/cudf/_fuzz_testing/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,19 @@

import logging
import random
from collections import abc as abc

import numpy as np

import cudf
from cudf._fuzz_testing.io import IOFuzz
from cudf._fuzz_testing.utils import _generate_rand_meta, pyarrow_to_pandas
from cudf._fuzz_testing.utils import (
ALL_POSSIBLE_VALUES,
_generate_rand_meta,
pyarrow_to_pandas,
)
from cudf.tests import dataset_generator as dg
from cudf.utils.dtypes import pandas_dtypes_to_cudf_dtypes

logging.basicConfig(
format="%(asctime)s %(levelname)-8s %(message)s",
Expand All @@ -16,6 +24,20 @@
)


def _get_dtype_param_value(dtype_val):
if dtype_val is not None and isinstance(dtype_val, abc.Mapping):
processed_dtypes = {}
for col_name, dtype in dtype_val.items():
if cudf.utils.dtypes.is_categorical_dtype(dtype):
processed_dtypes[col_name] = "category"
else:
processed_dtypes[col_name] = str(
pandas_dtypes_to_cudf_dtypes.get(dtype, dtype)
)
return processed_dtypes
return dtype_val


class JSONReader(IOFuzz):
def __init__(
self,
Expand All @@ -42,12 +64,18 @@ def generate_input(self):
else:
seed = random.randint(0, 2 ** 32 - 1)
random.seed(seed)
dtypes_list = list(cudf.utils.dtypes.ALL_TYPES)
dtypes_list = list(
cudf.utils.dtypes.ALL_TYPES
# https://github.com/pandas-dev/pandas/issues/20599
- {"uint64"}
# TODO: Remove DATETIME_TYPES after this is fixed:
# https://github.com/rapidsai/cudf/issues/6586
- set(cudf.utils.dtypes.DATETIME_TYPES)
)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
)
self._current_params["dtypes_meta"] = dtypes_meta
self._current_params["file_name"] = self._file_name
self._current_params["seed"] = seed
self._current_params["num_rows"] = num_rows
self._current_params["num_columns"] = num_cols
Expand All @@ -57,10 +85,28 @@ def generate_input(self):
)
table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
df = pyarrow_to_pandas(table)

self._current_buffer = df
logging.info(f"Shape of DataFrame generated: {df.shape}")

return df.to_json()
return df.to_json(orient="records", lines=True)

def write_data(self, file_name):
    """Dump the current buffer to ``<file_name>_crash_json.json``.

    The buffer is written as line-delimited JSON records. Nothing is
    written when there is no current buffer.
    """
    buffer = self._current_buffer
    if buffer is None:
        return
    crash_path = file_name + "_crash_json.json"
    buffer.to_json(crash_path, orient="records", lines=True)

def set_rand_params(self, params):
    """Choose one random value per parameter and record them.

    For ``dtype`` set to ``ALL_POSSIBLE_VALUES`` the value is drawn from
    ``True`` or the current buffer's column -> dtype mapping, then
    normalized by ``_get_dtype_param_value``. Every other parameter gets
    a random pick from its supplied values.

    The processed result is stored in ``self._current_params`` under
    ``"test_kwargs"``; nothing is returned.
    """
    chosen = {}
    for name, candidates in params.items():
        if name == "dtype" and candidates == ALL_POSSIBLE_VALUES:
            dtype_options = [True, self._current_buffer.dtypes.to_dict()]
            picked = np.random.choice(dtype_options)
            chosen[name] = _get_dtype_param_value(picked)
            continue
        chosen[name] = np.random.choice(candidates)
    self._current_params["test_kwargs"] = self.process_kwargs(chosen)


class JSONWriter(IOFuzz):
Expand Down Expand Up @@ -89,7 +135,14 @@ def generate_input(self):
else:
seed = random.randint(0, 2 ** 32 - 1)
random.seed(seed)
dtypes_list = list(cudf.utils.dtypes.ALL_TYPES)
dtypes_list = list(
cudf.utils.dtypes.ALL_TYPES
# https://github.com/pandas-dev/pandas/issues/20599
- {"uint64"}
# TODO: Remove DATETIME_TYPES after this is fixed:
# https://github.com/rapidsai/cudf/issues/6586
- set(cudf.utils.dtypes.DATETIME_TYPES)
)
dtypes_meta, num_rows, num_cols = _generate_rand_meta(
self, dtypes_list
)
Expand All @@ -101,9 +154,27 @@ def generate_input(self):
f"Generating DataFrame with rows: {num_rows} "
f"and columns: {num_cols}"
)
df = cudf.DataFrame.from_arrow(
dg.rand_dataframe(dtypes_meta, num_rows, seed)
)
logging.info(f"Shape of DataFrame generated: {df.shape}")
table = dg.rand_dataframe(dtypes_meta, num_rows, seed)
df = pyarrow_to_pandas(table)

logging.info(f"Shape of DataFrame generated: {df.shape}")
self._current_buffer = df
return df

def write_data(self, file_name):
    """Save the current buffer as ``<file_name>_crash_json.json``.

    Output is line-delimited JSON records. This is a no-op when no
    buffer has been generated yet.
    """
    if self._current_buffer is None:
        return
    target = file_name + "_crash_json.json"
    self._current_buffer.to_json(target, lines=True, orient="records")

def set_rand_params(self, params):
    """Randomly select a value for every parameter and store the result.

    ``dtype`` with ``ALL_POSSIBLE_VALUES`` is special-cased: the value is
    picked from ``True`` or the dtype mapping of the current buffer and
    then run through ``_get_dtype_param_value``. All other parameters are
    sampled from the values given for them.

    The processed kwargs are saved as ``self._current_params["test_kwargs"]``;
    the method returns nothing.
    """
    selection = {}
    for key, options in params.items():
        if key == "dtype" and options == ALL_POSSIBLE_VALUES:
            raw_dtype = np.random.choice(
                [True, self._current_buffer.dtypes.to_dict()]
            )
            value = _get_dtype_param_value(raw_dtype)
        else:
            value = np.random.choice(options)
        selection[key] = value
    self._current_params["test_kwargs"] = self.process_kwargs(selection)
Loading

0 comments on commit 4205104

Please sign in to comment.