forked from pyjanitor-devs/pyjanitor
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ENH] Add jitter function (pyjanitor-devs#624)
* Add first attempt at jitter method in functions.py
* Add test_jitter.py module with initial test for input datatypes.
* Add comments to datatypes check test in test_jitter.py
* Add functional tests for the jitter method with and without setting the random seed.
* Add test to ensure clipping works properly to test_jitter.py.
* Apply black formatting to functions.py and test_jitter.py.
* Apply black formatting to other modules (I did not make any changes to these).
* Shortened line length of comment in test_jitter.py.
* Add typing.Optional to functions.py.
* Fixed type hint for `clip` argument in jitter method of functions.py.
* Added extra error catching and associated test for functions.jitter().
* Update CHANGELOG.rst
* Add jitter() method to general_functions.rst
* Fix test_jitter.py to properly check for Type errors.
* Fix typo in doc string for functions.jitter().
* Add semantic line breaks to doc string of functions.jitter().
* Update parameter description of `clip` in functions.jitter().
* Add comments in functions.jitter() about validating the `clip` parameter.
* Black formatting of test_jitter.py.
* Updated functions.jitter() method to align with pycodestyle.
* Fix type hint declaration for `random_state` in functions.jitter().
* Indicate in the docstring for functions.jitter() that NaN values are ignored.
* Add a check to functions.jitter() to ensure the original column data are numeric.
* Update the check for if `random_state` is `None` in functions.jitter().
* Add a test to ensure the column is numeric for test_jitter.py.
* Add a test to ensure the resulting mean and standard deviation of the jittered values are as expected in test_jitter.py.
* Import pandas for test_jitter.py.
* Add test to test_jitter.py to ensure functions.jitter() still runs (ignores) NaN values.
* Applied Black formatting to test_jitter.py.
* Fix test_jitter.py for pycodestyle.

Co-authored-by: rhosbach <45576320+rhosbach@users.noreply.github.com>
- Loading branch information
Showing
9 changed files
with
241 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,6 +44,7 @@ Modify values | |
round_to_fraction | ||
update_where | ||
to_datetime | ||
jitter | ||
|
||
Filtering | ||
~~~~~~~~~ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
janitor.jitter | ||
============== | ||
|
||
.. currentmodule:: janitor | ||
|
||
.. autofunction:: jitter |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import pandas as pd | ||
import pytest | ||
|
||
from janitor.functions import jitter | ||
|
||
|
||
@pytest.mark.functions
def test_datatypes_check(dataframe):
    """Check that `jitter` rejects invalid arguments with the right error.

    The calls are deliberately NOT wrapped in ``assert``. If `jitter`
    wrongly returned a DataFrame instead of raising, ``assert <DataFrame>``
    would itself raise ``ValueError`` (ambiguous truth value), which would
    satisfy ``pytest.raises(ValueError)`` and hide the missing validation.
    """
    # `scale` should be a numeric value > 0
    with pytest.raises(TypeError):
        dataframe.jitter(
            column_name="a", dest_column_name="a_jitter", scale="x"
        )

    # `random_state` should be an integer or 1-d array
    # (see documentation for np.random.seed)
    with pytest.raises(TypeError):
        dataframe.jitter(
            column_name="a",
            dest_column_name="a_jitter",
            scale=1,
            random_state="x",
        )

    # `clip` should only contain numeric values
    with pytest.raises(TypeError):
        dataframe.jitter(
            column_name="a",
            dest_column_name="a_jitter",
            scale=1,
            clip=["x", 2],
        )

    # The column to jitter should be numeric
    with pytest.raises(TypeError):
        dataframe.jitter(
            column_name="cities", dest_column_name="cities_jitter", scale=1
        )

    # `scale` should be greater than 0
    with pytest.raises(ValueError):
        dataframe.jitter(
            column_name="a", dest_column_name="a_jitter", scale=-5
        )

    # `clip` should be a size-2 tuple of numeric values
    with pytest.raises(ValueError):
        dataframe.jitter(
            column_name="a",
            dest_column_name="a_jitter",
            scale=1,
            clip=[-10, 10, 5],
        )

    # `clip[0]` should be less than `clip[1]`
    with pytest.raises(ValueError):
        dataframe.jitter(
            column_name="a", dest_column_name="a_jitter", scale=1, clip=[10, 5]
        )
|
||
|
||
@pytest.mark.functions
def test_jitter(dataframe):
    """Smoke test: a plain `jitter` call completes without raising."""
    jitter_kwargs = {
        "column_name": "a",
        "dest_column_name": "a_jitter",
        "scale": 1.0,
    }
    dataframe.jitter(**jitter_kwargs)
|
||
|
||
@pytest.mark.functions
def test_jitter_with_nans(missingdata_df):
    """Smoke test: `jitter` completes without raising on the missing-data fixture."""
    jitter_kwargs = {
        "column_name": "a",
        "dest_column_name": "a_jitter",
        "scale": 1.0,
    }
    missingdata_df.jitter(**jitter_kwargs)
|
||
|
||
@pytest.mark.functions
def test_jitter_random_state(dataframe):
    """Check that `jitter` runs with a seed and that the seed is honoured.

    Two calls with the same `random_state` on identical inputs must
    produce identical jittered columns; otherwise the seed has no effect.
    """
    jitter_kwargs = {
        "column_name": "a",
        "dest_column_name": "a_jitter",
        "scale": 1.0,
        "random_state": 77,
    }

    # Jitter independent copies so neither call can observe (or overwrite)
    # the other's output column.
    first = dataframe.copy().jitter(**jitter_kwargs)
    second = dataframe.copy().jitter(**jitter_kwargs)

    pd.testing.assert_series_equal(first["a_jitter"], second["a_jitter"])
|
||
|
||
@pytest.mark.functions
def test_jitter_clip(dataframe):
    """Check that jittered values are confined to the `clip` bounds."""
    lower, upper = 1.5, 2.5
    df = dataframe.jitter(
        column_name="a",
        dest_column_name="a_jitter",
        scale=1.0,
        clip=[lower, upper],
    )
    jittered = df["a_jitter"]

    # Series.min()/max() skip NaN, whereas the builtin min()/max() give
    # order-dependent results when NaN is present. Separate asserts also
    # report which bound was violated.
    assert jittered.min() >= lower
    assert jittered.max() <= upper
|
||
|
||
@pytest.mark.functions
def test_jitter_results():
    """Ensure the mean of the jittered values is approximately
    equal to the mean of the original values, and that the
    standard deviation of the jittered value is approximately
    equal to the `scale` parameter."""
    error_tolerance = 0.05  # 5%
    scale = 2.0
    # The standard error of the sample mean is scale / sqrt(n_rows).
    # With only 1000 rows that is ~0.063, which exceeds the 5% tolerance
    # and makes the mean check fail by chance on a large share of runs.
    # With 100000 rows it is ~0.006, so the tolerance sits at roughly
    # eight standard errors.
    n_rows = 100000

    df = pd.DataFrame({"original": [1] * n_rows})
    results = df.jitter(
        column_name="original",
        dest_column_name="jittered",
        scale=scale,
        random_state=77,  # fixed seed keeps the outcome reproducible
    )

    original_mean = results["original"].mean()
    relative_mean_error = abs(
        (results["jittered"].mean() - original_mean) / original_mean
    )
    assert relative_mean_error <= error_tolerance

    relative_std_error = abs((results["jittered"].std() - scale) / scale)
    assert relative_std_error <= error_tolerance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
import xarray as xr | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
import xarray as xr | ||
|
||
|
||
|