[ENH] Add jitter function (pyjanitor-devs#624)

* Add first attempt at jitter method in functions.py * Add test_jitter.py module with initial test for input datatypes. * Add comments to datatypes check test in test_jitter.py * Add functional tests for the jitter method with and without setting the random seed. * Add test to ensure clipping works properly to test_jitter.py. * Apply black formatting to functions.py and test_jitter.py. * Apply black formatting to other modules (I did not make any changes to these). * Shortened line length of comment in test_jitter.py. * Add typing.Optional to functions.py. * Fixed type hint for `clip` argument in jitter method of functions.py. * Added extra error catching and associated test for functions.jitter(). * Update CHANGELOG.rst * Add jitter() method to general_functions.rst * Fix test_jitter.py to properly check for Type errors. * Fix typo in doc string for functions.jitter(). * Add semantic line breaks to doc string of functions.jitter(). * Update parameter description of `clip` in functions.jitter(). * Add comments in functions.jitter() about validating the `clip` parameter. * Black formatting of test_jitter.py. * Updated functions.jitter() method to align with pycodestyle. * Fix type hint declaration for `random_state` in functions.jitter(). * Indicate in the docstring for functions.jitter() that NaN values are ignored. * Add a check to functions.jitter() to ensure the original column data are numeric. * Update the check for if `random_state` is `None` in functions.jitter(). * Add a test to ensure the column is numeric for test_jitter.py. * Add a test to ensure the resulting mean and standard deviation of the jittered values are as expected in test_jitter.py. * Import pandas for test_jitter.py. * Add test to test_jitter.py to ensure functions.jitter() still runs (ignores) NaN values. * Applied Black formatting to test_jitter.py. * Fix test_jitter.py for pycodestyle. Co-authored-by: rhosbach <45576320+rhosbach@users.noreply.github.com>
szuckerman · Jan 16, 2020 · 30a0326 · 30a0326
1 parent 16f1e04
commit 30a0326
Show file tree

Hide file tree

Showing 9 changed files with 241 additions and 3 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,6 +6,7 @@ new version (on deck)
 - [DOC] Update API policy for clarity. @ericmjl
 - [ENH] Enforce string conversion when cleaning names. @ericmjl
 - [ENH] Change ``find_replace`` implementation to use keyword arguments to specify columns to perform find and replace on. @ericmjl
+- [ENH] Add ``jitter()`` dataframe function. @rahosbach
 
 v0.19.0
 =======

diff --git a/docs/reference/general_functions.rst b/docs/reference/general_functions.rst
@@ -44,6 +44,7 @@ Modify values
     round_to_fraction
     update_where
     to_datetime
+    jitter
 
 Filtering
 ~~~~~~~~~

diff --git a/docs/reference/janitor.functions/janitor.jitter.md b/docs/reference/janitor.functions/janitor.jitter.md
@@ -0,0 +1,6 @@
+janitor.jitter
+==============
+
+.. currentmodule:: janitor
+
+.. autofunction:: jitter
diff --git a/janitor/functions.py b/janitor/functions.py
@@ -13,12 +13,12 @@
     Hashable,
     Iterable,
     List,
+    Optional,
     Set,
     Tuple,
     Union,
 )
 
-
 import numpy as np
 import pandas as pd
 import pandas_flavor as pf
@@ -3707,3 +3707,111 @@ def toset(series: pd.Series) -> Set:
     """
 
     return set(series.tolist())
+
+
+@pf.register_dataframe_method
+def jitter(
+    df: pd.DataFrame,
+    column_name: Hashable,
+    dest_column_name: str,
+    scale: np.number,
+    clip: Optional[Iterable[np.number]] = None,
+    random_state: Optional[np.number] = None,
+) -> pd.DataFrame:
+    """
+    Adds Gaussian noise (jitter) to the values of a column.
+
+    Functional usage example:
+
+    .. code-block:: python
+
+        import pandas as pd
+        import janitor as jn
+
+        df = pd.DataFrame(...)
+
+        df = jn.functions.jitter(
+            df=df,
+            column_name='values',
+            dest_column_name='values_jitter',
+            scale=1.0,
+            clip=None,
+            random_state=None,
+        )
+
+    Method chaining usage example:
+
+    .. code-block:: python
+
+        import pandas as pd
+        import janitor.functions
+
+        df = pd.DataFrame(...)
+
+        df = df.jitter(
+            column_name='values',
+            dest_column_name='values_jitter',
+            scale=1.0,
+            clip=None,
+            random_state=None,
+        )
+
+    A new column will be created containing the values of the original column
+    with Gaussian noise added.
+    For each value in the column, a Gaussian distribution is created
+    having a location (mean) equal to the value
+    and a scale (standard deviation) equal to `scale`.
+    A random value is then sampled from this distribution,
+    which is the jittered value.
+    If an iterable of two values is supplied for `clip`,
+    then any values of the new column less than `clip[0]`
+    will be set to `clip[0]`,
+    and any values greater than `clip[1]` will be set to `clip[1]`.
+    Additionally, if a value is supplied for `random_state`,
+    it is passed to `np.random.seed`,
+    which reseeds NumPy's global random number generator.
+    NaN values are ignored in this method.
+
+    This method mutates the original DataFrame.
+
+    :param df: A pandas dataframe.
+    :param column_name: Name of the column containing
+        values to add Gaussian jitter to.
+    :param dest_column_name: The name of the new column containing the
+        jittered values that will be created.
+    :param scale: A value strictly greater than zero, used directly as the
+        scale (standard deviation) of the Gaussian distribution
+        to sample from.
+    :param clip: An iterable of two values (minimum and maximum) to clip
+        the jittered values to, default to None.
+    :param random_state: An integer or 1-d array value used to set the random
+        seed, default to None.
+
+    :returns: A pandas DataFrame with a new column containing Gaussian-
+        jittered values from another column.
+    :raises TypeError: if `scale` is not an int or float,
+        or if `column_name` is not a numeric column.
+    :raises ValueError: if `scale` is not greater than 0,
+        if `clip` does not contain exactly two values,
+        or if `clip[1]` is less than `clip[0]`.
+    """
+
+    # Check types
+    check("scale", scale, [int, float])
+
+    # Check that `column_name` is a numeric column
+    if not np.issubdtype(df[column_name].dtype, np.number):
+        raise TypeError(f"{column_name} must be a numeric column.")
+
+    if scale <= 0:
+        raise ValueError("`scale` must be a numeric value greater than 0.")
+
+    # Validate `clip` up front so that an invalid value fails before the
+    # global random state is reseeded or consumed.
+    # Compare against None (not truthiness): NumPy arrays raise on `bool()`.
+    if clip is not None:
+        # Materialize so that any iterable (array, generator, ...) works.
+        clip = tuple(clip)
+        # Ensure `clip` has length 2
+        if len(clip) != 2:
+            raise ValueError("`clip` must be an iterable of length 2.")
+        # Ensure the values in `clip` are ordered as min, max
+        if clip[1] < clip[0]:
+            raise ValueError("`clip[0]` must be less than `clip[1]`.")
+
+    if random_state is not None:
+        np.random.seed(random_state)
+    result = np.random.normal(loc=df[column_name], scale=scale)
+    if clip is not None:
+        result = np.clip(result, *clip)
+    df[dest_column_name] = result
+
+    return df
diff --git a/janitor/xarray/functions.py b/janitor/xarray/functions.py
@@ -7,12 +7,13 @@
 from typing import Union
 
 import numpy as np
-import xarray as xr
 from pandas_flavor import (
     register_xarray_dataarray_method,
     register_xarray_dataset_method,
 )
 
+import xarray as xr
+
 
 @register_xarray_dataarray_method
 def clone_using(

diff --git a/tests/functions/test_jitter.py b/tests/functions/test_jitter.py
@@ -0,0 +1,119 @@
+import pandas as pd
+import pytest
+
+from janitor.functions import jitter
+
+
+@pytest.mark.functions
+def test_datatypes_check(dataframe):
+    """Invalid arguments to ``jitter`` must raise the documented errors.
+
+    The calls are deliberately NOT wrapped in ``assert``: asserting on a
+    returned DataFrame raises its own ``ValueError`` (ambiguous truth
+    value), which would let the ``ValueError`` cases pass even if
+    ``jitter`` stopped validating its inputs.
+    """
+    # `scale` should be a numeric value > 0
+    with pytest.raises(TypeError):
+        dataframe.jitter(
+            column_name="a", dest_column_name="a_jitter", scale="x"
+        )
+
+    # `random_state` should be an integer or 1-d array
+    # (see documentation for np.random.seed)
+    with pytest.raises(TypeError):
+        dataframe.jitter(
+            column_name="a",
+            dest_column_name="a_jitter",
+            scale=1,
+            random_state="x",
+        )
+
+    # `clip` should only contain numeric values
+    with pytest.raises(TypeError):
+        dataframe.jitter(
+            column_name="a",
+            dest_column_name="a_jitter",
+            scale=1,
+            clip=["x", 2],
+        )
+
+    # The column to jitter should be numeric
+    with pytest.raises(TypeError):
+        dataframe.jitter(
+            column_name="cities", dest_column_name="cities_jitter", scale=1
+        )
+
+    # `scale` should be greater than 0
+    with pytest.raises(ValueError):
+        dataframe.jitter(
+            column_name="a", dest_column_name="a_jitter", scale=-5
+        )
+
+    # `clip` should be a size-2 tuple of numeric values
+    with pytest.raises(ValueError):
+        dataframe.jitter(
+            column_name="a",
+            dest_column_name="a_jitter",
+            scale=1,
+            clip=[-10, 10, 5],
+        )
+
+    # `clip[0]` should be less than `clip[1]`
+    with pytest.raises(ValueError):
+        dataframe.jitter(
+            column_name="a", dest_column_name="a_jitter", scale=1, clip=[10, 5]
+        )
+
+
+@pytest.mark.functions
+def test_jitter(dataframe):
+    """Smoke test: jittering a numeric column completes without raising."""
+    jitter_kwargs = {
+        "column_name": "a",
+        "dest_column_name": "a_jitter",
+        "scale": 1.0,
+    }
+    dataframe.jitter(**jitter_kwargs)
+
+
+@pytest.mark.functions
+def test_jitter_with_nans(missingdata_df):
+    """Smoke test: jitter completes without raising on the missing-data
+    fixture."""
+    jitter_kwargs = {
+        "column_name": "a",
+        "dest_column_name": "a_jitter",
+        "scale": 1.0,
+    }
+    missingdata_df.jitter(**jitter_kwargs)
+
+
+@pytest.mark.functions
+def test_jitter_random_state(dataframe):
+    """Smoke test: jitter completes without raising when a seed is given."""
+    seed = 77
+    jitter_kwargs = {
+        "column_name": "a",
+        "dest_column_name": "a_jitter",
+        "scale": 1.0,
+        "random_state": seed,
+    }
+    dataframe.jitter(**jitter_kwargs)
+
+
+@pytest.mark.functions
+def test_jitter_clip(dataframe):
+    """Jittered values must stay inside the requested clip bounds."""
+    lower, upper = 1.5, 2.5
+    clipped = dataframe.jitter(
+        column_name="a",
+        dest_column_name="a_jitter",
+        scale=1.0,
+        clip=[lower, upper],
+    )
+    jittered = clipped["a_jitter"]
+    assert (min(jittered) >= lower) & (max(jittered) <= upper)
+
+
+@pytest.mark.functions
+def test_jitter_results():
+    """Ensure the mean of the jittered values is approximately
+    equal to the mean of the original values, and that the
+    standard deviation of the jittered value is approximately
+    equal to the `scale` parameter."""
+    error_tolerance = 0.05  # 5%
+    scale = 2.0
+    # The standard error of the sample mean is scale / sqrt(n).
+    # With n=1000 that is ~0.063, larger than the 5% tolerance on a mean
+    # of 1, so the check would fail on a large share of runs.
+    # n=100000 brings it down to ~0.006, far inside the tolerance.
+    n_samples = 100000
+
+    df = pd.DataFrame({"original": [1] * n_samples})
+    results = df.jitter(
+        column_name="original",
+        dest_column_name="jittered",
+        scale=scale,
+        # Fixed seed so the statistical check is reproducible.
+        random_state=77,
+    )
+    original_mean = results["original"].mean()
+    assert (
+        abs((results["jittered"].mean() - original_mean) / original_mean)
+        <= error_tolerance
+    )
+    assert abs((results["jittered"].std() - scale) / scale) <= error_tolerance
diff --git a/tests/xarray/conftest.py b/tests/xarray/conftest.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pytest
+
 import xarray as xr
 
 

diff --git a/tests/xarray/test_clone_using.py b/tests/xarray/test_clone_using.py
@@ -1,8 +1,8 @@
 import numpy as np
 import pytest
-import xarray as xr
 
 import janitor  # noqa: F401
+import xarray as xr
 
 
 @pytest.mark.xarray

diff --git a/tests/xarray/test_convert_datetime_to_number.py b/tests/xarray/test_convert_datetime_to_number.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pytest
+
 import xarray as xr