Skip to content

Commit

Permalink
Remove carriage return
Browse files Browse the repository at this point in the history
It looks like some files were committed recently (in NVIDIA-Merlin#521) with Windows-style carriage-return/linefeed
line endings instead of the plain linefeed used in the rest of the codebase. Fix this so we don't
generate massive whitespace diffs on every commit.
  • Loading branch information
benfred committed Jan 30, 2021
1 parent 764b896 commit e90ca35
Show file tree
Hide file tree
Showing 3 changed files with 495 additions and 495 deletions.
202 changes: 101 additions & 101 deletions nvtabular/ops/data_stats.py
Original file line number Diff line number Diff line change
@@ -1,101 +1,101 @@
#
# Copyright (c) 2020, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import dask_cudf
import numpy as np
from nvtx import annotate

from .operator import ColumnNames
from .stat_operator import StatOperator


class DataStats(StatOperator):
    """Statistics operator that collects per-column summary statistics.

    ``fit`` records the dtype of every requested column, classifies it as
    continuous (``"conts"``, floating dtypes) or categorical (``"cats"``,
    everything else) and builds the ``min``, ``max``, ``mean`` and
    ``per_nan`` (percentage of missing values) statistics.  Categorical
    columns additionally get a ``cardinality`` and continuous columns a
    ``std``.  String (``object``) columns are summarised by the length of
    their values.  ``fit_finalize`` adds the dtype name, converts numpy
    scalars to builtin Python numbers and stores the result in ``output``.
    """

    def __init__(self):
        super().__init__()
        # Parallel lists describing the columns seen by the most recent fit.
        self.col_names = []
        self.col_types = []
        self.col_dtypes = []
        # Finalized statistics, keyed by column name.
        self.output = {}

    @annotate("DataStats_fit", color="green", domain="nvt_python")
    def fit(self, columns: ColumnNames, ddf: dask_cudf.DataFrame):
        # No docstring on purpose: it is copied from StatOperator.fit at the
        # bottom of the class.
        dask_stats = {}

        # Start from a clean slate so that fitting again does not accumulate
        # stale or duplicated columns from a previous fit.
        self.col_names = []
        self.col_types = []
        self.col_dtypes = []

        # A one-row sample is enough to read the column dtypes.
        ddf_dtypes = ddf.head(1)

        # The row count is the same for every column, so it is evaluated at
        # most once (on first use) instead of once per column.
        num_rows = None

        # For each column, calculate the stats
        for col in columns:
            col_stats = dask_stats[col] = {}
            self.col_names.append(col)

            # Get dtype for all
            dtype = ddf_dtypes[col].dtype
            self.col_dtypes.append(dtype)

            # Identify column type
            col_type = "conts" if np.issubdtype(dtype, np.floating) else "cats"
            self.col_types.append(col_type)

            # Work on a local handle so the caller's dataframe is never modified.
            series = ddf[col]

            # Get cardinality for cats (taken on the original values, before
            # strings are replaced by their lengths)
            if col_type == "cats":
                col_stats["cardinality"] = series.nunique()

            # if string, replace strings by their lengths for the rest of the computations
            if dtype == "object":
                series = series.map_partitions(lambda x: x.str.len(), meta=("x", int))
            # Add list support when cudf supports it:
            # https://github.com/rapidsai/cudf/issues/7157
            # elif col_type == "cat_mh":
            #     series = series.map_partitions(lambda x: x.list.len())

            # Get min, max, and mean
            col_stats["min"] = series.min()
            col_stats["max"] = series.max()
            col_stats["mean"] = series.mean()

            # Get std only for conts
            if col_type == "conts":
                col_stats["std"] = series.std()

            # Get percentage of missing values for all
            if num_rows is None:
                num_rows = len(ddf)
            col_stats["per_nan"] = 100 * (1 - series.count() / num_rows)

        return dask_stats

    def fit_finalize(self, dask_stats):
        # No docstring on purpose: it is copied from StatOperator.fit_finalize
        # at the bottom of the class.
        for name, dtype in zip(self.col_names, self.col_dtypes):
            col_stats = dask_stats[name]
            # Add dtype
            col_stats["dtype"] = str(dtype)
            # Cast types for yaml: every numpy integer/float statistic
            # (including min and max) becomes a builtin Python number.
            for key, value in list(col_stats.items()):
                if isinstance(value, (np.integer, np.floating)):
                    col_stats[key] = value.item()
        self.output = dask_stats

    def clear(self):
        # Drop the per-column bookkeeping together with the results so the
        # operator can be fitted again without leftovers from the last fit.
        self.col_names = []
        self.col_types = []
        self.col_dtypes = []
        self.output = {}

    # Reuse the base-class documentation for the overridden methods.
    # transform.__doc__ = Operator.transform.__doc__
    fit.__doc__ = StatOperator.fit.__doc__
    fit_finalize.__doc__ = StatOperator.fit_finalize.__doc__
#
# Copyright (c) 2020, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import dask_cudf
import numpy as np
from nvtx import annotate

from .operator import ColumnNames
from .stat_operator import StatOperator


class DataStats(StatOperator):
    """Statistics operator that collects per-column summary statistics.

    ``fit`` records the dtype of every requested column, classifies it as
    continuous (``"conts"``, floating dtypes) or categorical (``"cats"``,
    everything else) and builds the ``min``, ``max``, ``mean`` and
    ``per_nan`` (percentage of missing values) statistics.  Categorical
    columns additionally get a ``cardinality`` and continuous columns a
    ``std``.  String (``object``) columns are summarised by the length of
    their values.  ``fit_finalize`` adds the dtype name, converts numpy
    scalars to builtin Python numbers and stores the result in ``output``.
    """

    def __init__(self):
        super().__init__()
        # Parallel lists describing the columns seen by the most recent fit.
        self.col_names = []
        self.col_types = []
        self.col_dtypes = []
        # Finalized statistics, keyed by column name.
        self.output = {}

    @annotate("DataStats_fit", color="green", domain="nvt_python")
    def fit(self, columns: ColumnNames, ddf: dask_cudf.DataFrame):
        # No docstring on purpose: it is copied from StatOperator.fit at the
        # bottom of the class.
        dask_stats = {}

        # Start from a clean slate so that fitting again does not accumulate
        # stale or duplicated columns from a previous fit.
        self.col_names = []
        self.col_types = []
        self.col_dtypes = []

        # A one-row sample is enough to read the column dtypes.
        ddf_dtypes = ddf.head(1)

        # The row count is the same for every column, so it is evaluated at
        # most once (on first use) instead of once per column.
        num_rows = None

        # For each column, calculate the stats
        for col in columns:
            col_stats = dask_stats[col] = {}
            self.col_names.append(col)

            # Get dtype for all
            dtype = ddf_dtypes[col].dtype
            self.col_dtypes.append(dtype)

            # Identify column type
            col_type = "conts" if np.issubdtype(dtype, np.floating) else "cats"
            self.col_types.append(col_type)

            # Work on a local handle so the caller's dataframe is never modified.
            series = ddf[col]

            # Get cardinality for cats (taken on the original values, before
            # strings are replaced by their lengths)
            if col_type == "cats":
                col_stats["cardinality"] = series.nunique()

            # if string, replace strings by their lengths for the rest of the computations
            if dtype == "object":
                series = series.map_partitions(lambda x: x.str.len(), meta=("x", int))
            # Add list support when cudf supports it:
            # https://github.com/rapidsai/cudf/issues/7157
            # elif col_type == "cat_mh":
            #     series = series.map_partitions(lambda x: x.list.len())

            # Get min, max, and mean
            col_stats["min"] = series.min()
            col_stats["max"] = series.max()
            col_stats["mean"] = series.mean()

            # Get std only for conts
            if col_type == "conts":
                col_stats["std"] = series.std()

            # Get percentage of missing values for all
            if num_rows is None:
                num_rows = len(ddf)
            col_stats["per_nan"] = 100 * (1 - series.count() / num_rows)

        return dask_stats

    def fit_finalize(self, dask_stats):
        # No docstring on purpose: it is copied from StatOperator.fit_finalize
        # at the bottom of the class.
        for name, dtype in zip(self.col_names, self.col_dtypes):
            col_stats = dask_stats[name]
            # Add dtype
            col_stats["dtype"] = str(dtype)
            # Cast types for yaml: every numpy integer/float statistic
            # (including min and max) becomes a builtin Python number.
            for key, value in list(col_stats.items()):
                if isinstance(value, (np.integer, np.floating)):
                    col_stats[key] = value.item()
        self.output = dask_stats

    def clear(self):
        # Drop the per-column bookkeeping together with the results so the
        # operator can be fitted again without leftovers from the last fit.
        self.col_names = []
        self.col_types = []
        self.col_dtypes = []
        self.output = {}

    # Reuse the base-class documentation for the overridden methods.
    # transform.__doc__ = Operator.transform.__doc__
    fit.__doc__ = StatOperator.fit.__doc__
    fit_finalize.__doc__ = StatOperator.fit_finalize.__doc__
Loading

0 comments on commit e90ca35

Please sign in to comment.