Skip to content

Commit

Permalink
Support min_val for categorical features in DataGen (#1369)
Browse files Browse the repository at this point in the history
* API Overhaul

First draft of the API overhaul changes. Adds most core functionality, including
defining workflow graphs with a ColumnGroup class, the workflow and dataset changes,
most operators converted to use the new API, etc.

* remove debug print statement

* Fix test_io unittest

Also partially fix some tests inside test_workflow

* Handle multi-column joint/combo categorify

* Update JoinGroupby

* Fix differencelag

* add dependencies method (#498)

* Convert TargetEncoding op

* Update nvtabular/workflow.py

Co-authored-by: Richard (Rick) Zamora <rzamora217@gmail.com>

* Update nvtabular/workflow.py

Co-authored-by: Richard (Rick) Zamora <rzamora217@gmail.com>

* Remove workflow code from dataloaders

We should be doing online transforms like
```KerasSequenceLoader(workflow.transform(dataset), ...```  instead of
```KerasSequenceLoader(dataset, workflows=[workflow], ...``` now

* Unittest ops + bugfix in Bucketize (#496)

* test_minmix

* updates test

* unittest ops

* First draft get_embedding_sizes support

Re-add get_embedding_sizes . Note that this changes how we support multi-hot columns here
(sizes are returned same as single hot, and we don't use this method to distinguish between
multi and singlehot columns)

* isort

* Remove groupbystatistics

* implement serialization of statistics

add save_stats/load_stats/clear_stats methods to the workflow, with each statoperator getting
called as appropriate

* Fix TF dataloader unittests

* test_torch_dataloader fixes

* doc strings

* support min

* permutate index

Co-authored-by: Ben Frederickson <github@benfrederickson.com>
Co-authored-by: rnyak <ronayak@hotmail.com>
Co-authored-by: Richard (Rick) Zamora <rzamora217@gmail.com>
Co-authored-by: root <root@dgx06.aselab.nvidia.com>
Co-authored-by: Karl Higley <kmhigley@gmail.com>
  • Loading branch information
6 people authored Feb 8, 2022
1 parent b69281d commit b3823ea
Showing 1 changed file with 30 additions and 2 deletions.
32 changes: 30 additions & 2 deletions nvtabular/tools/data_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,19 @@
import string

import numpy as np
import pandas as pd
import psutil

try:
import cupy
except ImportError:
cupy = np

try:
import cudf
except ImportError:
cudf = pd

from scipy import stats
from scipy.stats import powerlaw, uniform

Expand Down Expand Up @@ -131,12 +137,16 @@ def create_cats(self, size, cats_rep, entries=False):
offs = offs.astype("int32")
if HAS_GPU:
ser = dist.create_col(
col_size, dtype=np.long, min_val=0, max_val=col.cardinality
col_size, dtype=np.long, min_val=col.min_val, max_val=col.cardinality
).ceil()
else:
ser = dist.create_col(col_size, dtype=np.long, min_val=0, max_val=col.cardinality)
ser = dist.create_col(
col_size, dtype=np.long, min_val=col.min_val, max_val=col.cardinality
)
ser = _make_df(np.ceil(ser))[0]
ser = ser.astype("int32")
if col.permutate_index:
ser = self.permutate_index(ser)
if entries:
cat_names = self.create_cat_entries(
col.cardinality, min_size=col.min_entry_size, max_size=col.max_entry_size
Expand Down Expand Up @@ -348,6 +358,18 @@ def find_target_rep(self, name, cats_rep):
return rep
return None

def permutate_index(self, ser):
    """Return a copy of ``ser`` with its values randomly remapped.

    Builds a one-to-one mapping from each distinct value in ``ser`` to a
    randomly permuted distinct value, then applies it with a left merge so
    that every occurrence of a given value is remapped consistently.

    Parameters
    ----------
    ser : cudf.Series or pandas.Series
        Integer index column to permute. Not modified in place.

    Returns
    -------
    Series with the same name as the input and the remapped values.
    """
    name = ser.name
    # Work on a renamed copy instead of mutating the caller's object:
    # the original code set ``ser.name = "ind"`` in place, which left the
    # caller's Series permanently renamed because ``ser`` is rebound below.
    ser = ser.rename("ind")
    distinct_vals = ser.drop_duplicates().values
    shuffled_vals = cupy.random.permutation(distinct_vals)
    df_map = cudf.DataFrame({"ind": distinct_vals, "ind_random": shuffled_vals})
    if not HAS_GPU:
        # ``cudf`` is aliased to pandas here (see imports) and pandas Series
        # has no ``merge``; promote to a DataFrame first.
        ser = cudf.DataFrame(ser)
    ser = ser.merge(df_map, how="left", left_on="ind", right_on="ind")["ind_random"]
    ser.name = name
    return ser


DISTRO_TYPES = {"powerlaw": PowerLawDistro, "uniform": UniformDistro}

Expand Down Expand Up @@ -395,6 +417,8 @@ def __init__(
multi_max=None,
multi_avg=None,
distro=None,
min_val=0,
permutate_index=False,
):
super().__init__(name, dtype, distro)
self.cardinality = cardinality
Expand All @@ -405,6 +429,8 @@ def __init__(
self.multi_min = multi_min
self.multi_max = multi_max
self.multi_avg = multi_avg
self.min_val = min_val
self.permutate_index = permutate_index


class LabelCol(Col):
Expand Down Expand Up @@ -442,6 +468,8 @@ def _get_cols_from_schema(schema, distros=None):
multi_min:
multi_max:
multi_avg:
min_val:
permutate_index:
labels:
col_name:
Expand Down

0 comments on commit b3823ea

Please sign in to comment.