Skip to content

Commit

Permalink
[ENH] Add jitter function (pyjanitor-devs#624)
Browse files Browse the repository at this point in the history
* Add first attempt at jitter method in functions.py

* Add test_jitter.py module with initial test for input datatypes.

* Add comments to datatypes check test in test_jitter.py

* Add functional tests for the jitter method with and without setting the random seed.

* Add test to ensure clipping works properly to test_jitter.py.

* Apply black formatting to functions.py and test_jitter.py.

* Apply black formatting to other modules (I did not make any changes to these).

* Shortened line length of comment in test_jitter.py.

* Add typing.Optional to functions.py.

* Fixed type hint for `clip` argument in jitter method of functions.py.

* Added extra error catching and associated test for functions.jitter().

* Update CHANGELOG.rst

* Add jitter() method to general_functions.rst

* Fix test_jitter.py to properly check for Type errors.

* Fix typo in doc string for functions.jitter().

* Add semantic line breaks to doc string of functions.jitter().

* Update parameter description of `clip` in functions.jitter().

* Add comments in functions.jitter() about validating the `clip` parameter.

* Black formatting of test_jitter.py.

* Updated functions.jitter() method to align with pycodestyle.

* Fix type hint declaration for `random_state` in functions.jitter().

* Indicate in the docstring for functions.jitter() that NaN values are ignored.

* Add a check to functions.jitter() to ensure the original column data are numeric.

* Update the check for if `random_state` is `None` in functions.jitter().

* Add a test to ensure the column is numeric for test_jitter.py.

* Add a test to ensure the resulting mean and standard deviation of the jittered values are as expected in test_jitter.py.

* Import pandas for test_jitter.py.

* Add test to test_jitter.py to ensure functions.jitter() still runs when NaN values are present (they are ignored).

* Applied Black formatting to test_jitter.py.

* Fix test_jitter.py for pycodestyle.

Co-authored-by: rhosbach <45576320+rhosbach@users.noreply.github.com>
  • Loading branch information
2 people authored and ericmjl committed Jan 16, 2020
1 parent 16f1e04 commit 30a0326
Show file tree
Hide file tree
Showing 9 changed files with 241 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ new version (on deck)
- [DOC] Update API policy for clarity. @ericmjl
- [ENH] Enforce string conversion when cleaning names. @ericmjl
- [ENH] Change ``find_replace`` implementation to use keyword arguments to specify columns to perform find and replace on. @ericmjl
- [ENH] Add ``jitter()`` dataframe function. @rahosbach

v0.19.0
=======
Expand Down
1 change: 1 addition & 0 deletions docs/reference/general_functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ Modify values
round_to_fraction
update_where
to_datetime
jitter

Filtering
~~~~~~~~~
Expand Down
6 changes: 6 additions & 0 deletions docs/reference/janitor.functions/janitor.jitter.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
janitor.jitter
==============

.. currentmodule:: janitor

.. autofunction:: jitter
110 changes: 109 additions & 1 deletion janitor/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
Hashable,
Iterable,
List,
Optional,
Set,
Tuple,
Union,
)


import numpy as np
import pandas as pd
import pandas_flavor as pf
Expand Down Expand Up @@ -3707,3 +3707,111 @@ def toset(series: pd.Series) -> Set:
"""

return set(series.tolist())


@pf.register_dataframe_method
def jitter(
    df: pd.DataFrame,
    column_name: Hashable,
    dest_column_name: str,
    scale: np.number,
    clip: Optional[Iterable[np.number]] = None,
    random_state: Optional[np.number] = None,
) -> pd.DataFrame:
    """
    Adds Gaussian noise (jitter) to the values of a column.

    Functional usage example:

    .. code-block:: python

        import pandas as pd
        import janitor as jn

        df = pd.DataFrame(...)

        df = jn.functions.jitter(
            df=df,
            column_name='values',
            dest_column_name='values_jitter',
            scale=1.0,
            clip=None,
            random_state=None,
        )

    Method chaining usage example:

    .. code-block:: python

        import pandas as pd
        import janitor.functions

        df = pd.DataFrame(...)

        df = df.jitter(
            column_name='values',
            dest_column_name='values_jitter',
            scale=1.0,
            clip=None,
            random_state=None,
        )

    A new column will be created containing the values of the original column
    with Gaussian noise added.
    For each value in the column, a Gaussian distribution is created
    having a location (mean) equal to the value
    and a scale (standard deviation) equal to `scale`.
    A random value is then sampled from this distribution,
    which is the jittered value.
    If an iterable of two values is supplied for `clip`,
    then any values of the new column less than `clip[0]`
    will be set to `clip[0]`,
    and any values greater than `clip[1]` will be set to `clip[1]`.
    Additionally, if a value is supplied for `random_state`,
    it is used to seed the random number generator used for sampling.
    NaN values are ignored in this method.

    This method mutates the original DataFrame.

    :param df: A pandas dataframe.
    :param column_name: Name of the column containing
        values to add Gaussian jitter to.
    :param dest_column_name: The name of the new column containing the
        jittered values that will be created.
    :param scale: A positive value used as the scale (standard deviation)
        of the Gaussian distribution to sample from.
        It is the same for every row and must be greater than zero.
    :param clip: An iterable of two values (minimum and maximum) to clip
        the jittered values to, defaults to None (no clipping).
        An empty iterable is also treated as "no clipping".
    :param random_state: An integer or 1-d array value used to seed the
        random number generator, defaults to None.
    :returns: A pandas DataFrame with a new column containing Gaussian-
        jittered values from another column.
    :raises TypeError: if `scale` is not an int or float,
        or if `column_name` is not a numeric column.
    :raises ValueError: if `scale` is not greater than 0,
        if `clip` does not have exactly two values,
        or if `clip[1]` is less than `clip[0]`.
    """

    # Check types
    check("scale", scale, [int, float])

    # Check that `column_name` is a numeric column
    if not np.issubdtype(df[column_name].dtype, np.number):
        raise TypeError(f"{column_name} must be a numeric column.")

    if scale <= 0:
        raise ValueError("`scale` must be a numeric value greater than 0.")

    # Materialise `clip` once so that any iterable (list, tuple, NumPy array,
    # generator) can be measured and indexed, and validate it *before*
    # sampling so invalid bounds fail fast.
    bounds = () if clip is None else tuple(clip)
    if bounds:
        # Ensure `clip` has length 2
        if len(bounds) != 2:
            raise ValueError("`clip` must be an iterable of length 2.")
        # Ensure the values in `clip` are ordered as min, max
        if bounds[1] < bounds[0]:
            raise ValueError("`clip[0]` must be less than `clip[1]`.")

    # Use a private generator when a seed is given, so that the caller's
    # global NumPy random state is not reseeded as a side effect.
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    result = rng.normal(loc=df[column_name], scale=scale)
    if bounds:
        result = np.clip(result, *bounds)
    df[dest_column_name] = result

    return df
3 changes: 2 additions & 1 deletion janitor/xarray/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@
from typing import Union

import numpy as np
import xarray as xr
from pandas_flavor import (
register_xarray_dataarray_method,
register_xarray_dataset_method,
)

import xarray as xr


@register_xarray_dataarray_method
def clone_using(
Expand Down
119 changes: 119 additions & 0 deletions tests/functions/test_jitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import pandas as pd
import pytest

from janitor.functions import jitter


@pytest.mark.functions
def test_datatypes_check(dataframe):
    """Check that jitter rejects invalid argument types and values.

    The calls are deliberately NOT wrapped in ``assert``: truth-testing a
    DataFrame raises ``ValueError`` on its own, which would let the
    ``pytest.raises(ValueError)`` cases pass even if jitter failed to raise.
    """
    # `scale` should be a numeric value > 0
    with pytest.raises(TypeError):
        dataframe.jitter(
            column_name="a", dest_column_name="a_jitter", scale="x"
        )

    # `random_state` should be an integer or 1-d array
    # (see documentation for np.random.seed)
    with pytest.raises(TypeError):
        dataframe.jitter(
            column_name="a",
            dest_column_name="a_jitter",
            scale=1,
            random_state="x",
        )

    # `clip` should only contain numeric values
    with pytest.raises(TypeError):
        dataframe.jitter(
            column_name="a",
            dest_column_name="a_jitter",
            scale=1,
            clip=["x", 2],
        )

    # The column to jitter should be numeric
    with pytest.raises(TypeError):
        dataframe.jitter(
            column_name="cities", dest_column_name="cities_jitter", scale=1
        )

    # `scale` should be greater than 0
    with pytest.raises(ValueError):
        dataframe.jitter(
            column_name="a", dest_column_name="a_jitter", scale=-5
        )

    # `clip` should be a size-2 tuple of numeric values
    with pytest.raises(ValueError):
        dataframe.jitter(
            column_name="a",
            dest_column_name="a_jitter",
            scale=1,
            clip=[-10, 10, 5],
        )

    # `clip[0]` should be less than `clip[1]`
    with pytest.raises(ValueError):
        dataframe.jitter(
            column_name="a", dest_column_name="a_jitter", scale=1, clip=[10, 5]
        )


@pytest.mark.functions
def test_jitter(dataframe):
    """Smoke test: jittering a numeric column completes without raising."""
    jitter_kwargs = dict(
        column_name="a",
        dest_column_name="a_jitter",
        scale=1.0,
    )
    dataframe.jitter(**jitter_kwargs)


@pytest.mark.functions
def test_jitter_with_nans(missingdata_df):
    """Smoke test: jitter completes without raising when NaNs are present."""
    jitter_kwargs = dict(
        column_name="a",
        dest_column_name="a_jitter",
        scale=1.0,
    )
    missingdata_df.jitter(**jitter_kwargs)


@pytest.mark.functions
def test_jitter_random_state(dataframe):
    """Smoke test: jitter completes without raising when a seed is given."""
    seed = 77
    jitter_kwargs = dict(
        column_name="a",
        dest_column_name="a_jitter",
        scale=1.0,
        random_state=seed,
    )
    dataframe.jitter(**jitter_kwargs)


@pytest.mark.functions
def test_jitter_clip(dataframe):
    """Jittered values must stay inside the requested clip bounds."""
    lower, upper = 1.5, 2.5
    clipped = dataframe.jitter(
        column_name="a",
        dest_column_name="a_jitter",
        scale=1.0,
        clip=[lower, upper],
    )
    jittered = clipped["a_jitter"]
    # Both bounds are checked; either failing fails the test.
    assert min(jittered) >= lower
    assert max(jittered) <= upper


@pytest.mark.functions
def test_jitter_results():
    """Ensure the mean of the jittered values is approximately
    equal to the mean of the original values, and that the
    standard deviation of the jittered value is approximately
    equal to the `scale` parameter."""
    error_tolerance = 0.05  # 5%
    scale = 2.0
    # The standard error of the sample mean is scale / sqrt(n_rows).
    # With 1000 rows that is ~0.063, larger than the 5% tolerance, so an
    # unseeded run fails frequently. 100000 rows brings it to ~0.006, and
    # the fixed seed makes the outcome fully reproducible.
    n_rows = 100000
    seed = 77

    df = pd.DataFrame({"original": [1] * n_rows})
    results = df.jitter(
        column_name="original",
        dest_column_name="jittered",
        scale=scale,
        random_state=seed,
    )

    original_mean = results["original"].mean()
    jittered_mean = results["jittered"].mean()
    relative_mean_error = abs((jittered_mean - original_mean) / original_mean)
    assert relative_mean_error <= error_tolerance

    relative_std_error = abs((results["jittered"].std() - scale) / scale)
    assert relative_std_error <= error_tolerance
1 change: 1 addition & 0 deletions tests/xarray/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pytest

import xarray as xr


Expand Down
2 changes: 1 addition & 1 deletion tests/xarray/test_clone_using.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import numpy as np
import pytest
import xarray as xr

import janitor # noqa: F401
import xarray as xr


@pytest.mark.xarray
Expand Down
1 change: 1 addition & 0 deletions tests/xarray/test_convert_datetime_to_number.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pytest

import xarray as xr


Expand Down

0 comments on commit 30a0326

Please sign in to comment.