Skip to content

Commit

Permalink
add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
alxlyj committed Sep 5, 2024
1 parent d1889b3 commit 1fe934d
Show file tree
Hide file tree
Showing 6 changed files with 309 additions and 58 deletions.
48 changes: 48 additions & 0 deletions .github/workflows/python-app.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
# A continuous integration (CI) workflow to build and test Robyn Python project

name: Robyn Python application

on:
push:
branches: [ "main", "robynpy_release", "python_rewrite_modeling" ]
pull_request:
branches: [ "main", "robynpy_release", "python_rewrite_modeling" ]

permissions:
contents: read

jobs:
build:

runs-on: ubuntu-latest

strategy:
matrix:
python-version: ["pypy3.9", "3.9", "3.10"]

steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
- name: Display Python version
run: python -c "import sys; print(sys.version)"

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest pytest-cov
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude=./robyn_api/*.py
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude=./robyn_api/*.py
- name: Test with pytest
run: |
pytest ./python/tests --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html
Binary file not shown.
75 changes: 75 additions & 0 deletions python/src/robyn/modeling/entities/feature_engineering_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# robyn/modeling/entities/feature_engineering_data.py

import pandas as pd

class FeatureEngineeringInputData:
    """
    Class to encapsulate the input data for the feature engineering process.

    The constructor only stores its arguments on the instance; nothing is
    validated, copied or derived here.

    Attributes:
        dt_input (pd.DataFrame): The raw input data.
        date_var (str): The name of the date variable.
        dep_var (str): The name of the dependent variable.
        dep_var_type (str): The type of the dependent variable.
        paid_media_spends (list): List of paid media spend columns.
        paid_media_vars (list): List of paid media variable columns.
        paid_media_signs (list): List of signs for paid media variables.
        context_vars (list): List of context variable columns.
        context_signs (list): List of signs for context variables.
        organic_vars (list): List of organic variable columns.
        organic_signs (list): List of signs for organic variables.
        factor_vars (list): List of factor variables.
        dt_holidays (pd.DataFrame): The raw input holiday data.
        prophet_vars (list): List of prophet variables.
        prophet_signs (list): List of signs for prophet variables.
        prophet_country (str): The country for prophet holidays.
        adstock (str): The adstock type.
        hyperparameters (dict): The hyperparameters.
        window_start (str): The start date of the modeling period.
        window_end (str): The end date of the modeling period.
        calibration_input (pd.DataFrame): The calibration input data.
        json_file (str): The JSON file for importing previously exported inputs.
        rollingWindowStartWhich (int): Row number of the first row of the
            modeling window. The feature engineering step reads this attribute
            and subtracts 1 before slicing, so it is presumably 1-based --
            TODO confirm.
        rollingWindowEndWhich (int): Row number of the last row of the modeling
            window, read the same way as ``rollingWindowStartWhich``.
    """

    def __init__(self, dt_input=None, date_var=None, dep_var=None, dep_var_type=None,
                 paid_media_spends=None, paid_media_vars=None, paid_media_signs=None,
                 context_vars=None, context_signs=None, organic_vars=None, organic_signs=None,
                 factor_vars=None, dt_holidays=None, prophet_vars=None, prophet_signs=None,
                 prophet_country=None, adstock=None, hyperparameters=None, window_start=None,
                 window_end=None, calibration_input=None, json_file=None,
                 rollingWindowStartWhich=None, rollingWindowEndWhich=None):
        # Raw data and the columns that identify the target.
        self.dt_input = dt_input
        self.date_var = date_var
        self.dep_var = dep_var
        self.dep_var_type = dep_var_type

        # Media, context and organic variable configuration.
        self.paid_media_spends = paid_media_spends
        self.paid_media_vars = paid_media_vars
        self.paid_media_signs = paid_media_signs
        self.context_vars = context_vars
        self.context_signs = context_signs
        self.organic_vars = organic_vars
        self.organic_signs = organic_signs
        self.factor_vars = factor_vars

        # Holiday / prophet configuration.
        self.dt_holidays = dt_holidays
        self.prophet_vars = prophet_vars
        self.prophet_signs = prophet_signs
        self.prophet_country = prophet_country

        # Modeling configuration.
        self.adstock = adstock
        self.hyperparameters = hyperparameters
        self.window_start = window_start
        self.window_end = window_end
        self.calibration_input = calibration_input
        self.json_file = json_file

        # The new parameters are appended after ``json_file`` so existing
        # positional callers keep working. They default to None because this
        # class does not derive them from window_start / window_end.
        self.rollingWindowStartWhich = rollingWindowStartWhich
        self.rollingWindowEndWhich = rollingWindowEndWhich


class FeatureEngineeringOutputData:
    """
    Class to encapsulate the output data for the feature engineering process.

    The constructor only stores its arguments on the instance.

    Attributes:
        dt_mod (pd.DataFrame): The modified data after feature engineering.
        dt_modRollWind (pd.DataFrame): The modified data within the rolling window.
        modNLS (dict): The nonlinear model results.
        dt_inputRollWind (pd.DataFrame): The input data within the rolling window.
    """

    def __init__(self, dt_mod=None, dt_modRollWind=None, modNLS=None, dt_inputRollWind=None):
        self.dt_mod = dt_mod
        self.dt_modRollWind = dt_modRollWind
        self.modNLS = modNLS
        # Appended after modNLS so positional callers of the three-argument
        # form are unaffected; the feature engineering step passes it by keyword.
        self.dt_inputRollWind = dt_inputRollWind
202 changes: 144 additions & 58 deletions python/src/robyn/modeling/feature_engineering.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,39 @@
# feature_engineering.py

import numpy as np
import pandas as pd
from robyn.data.entities.mmmdata_collection import MMMDataCollection

from fbprophet import Prophet
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score
from robyn.modeling.entities.feature_engineering_data import FeatureEngineeringInputData, FeatureEngineeringOutputData
from typing import Tuple, Dict, Any

class FeatureEngineering:
def __init__(self, mmm_data_collection: MMMDataCollection):
self.mmm_data_collection = mmm_data_collection
def __init__(self, input_data: FeatureEngineeringInputData):
"""
Initialize the FeatureEngineering class with input data.
Args:
input_data (FeatureEngineeringInputData): The input data for feature engineering.
"""
self.input_data = input_data

def feature_engineering(self, quiet: bool = False) -> FeatureEngineeringOutputData:
"""
Perform feature engineering on the input data.
Args:
quiet (bool): If True, suppress print statements. Default is False.
def feature_engineering(self, quiet: bool = False) -> MMMDataCollection:
Returns:
FeatureEngineeringOutputData: The output data after feature engineering.
"""
if not quiet:
print(">> Running Robyn feature engineering...")

dt_transform = self.__prepare_data()
dt_transform_roll_wind = self.__create_rolling_window_data(dt_transform)
dt_input_roll_wind = dt_transform.loc[
self.mmm_data_collection.rollingWindowStartWhich
- 1 : self.mmm_data_collection.rollingWindowEndWhich
self.input_data.rollingWindowStartWhich
- 1 : self.input_data.rollingWindowEndWhich
- 1
]
media_cost_factor = self.__calculate_media_cost_factor(dt_input_roll_wind)
Expand All @@ -26,58 +42,128 @@ def feature_engineering(self, quiet: bool = False) -> MMMDataCollection:
dt_input_roll_wind, media_cost_factor
)

self.mmm_data_collection.dt_mod = dt_transform
self.mmm_data_collection.dt_modRollWind = dt_transform_roll_wind
self.mmm_data_collection.dt_inputRollWind = dt_input_roll_wind
self.mmm_data_collection.modNLS = {
"results": mod_nls_collect,
"yhat": yhat_collect,
"plots": plot_nls_collect,
}
output_data = FeatureEngineeringOutputData(
dt_mod=dt_transform,
dt_modRollWind=dt_transform_roll_wind,
dt_inputRollWind=dt_input_roll_wind,
modNLS={
"results": mod_nls_collect,
"yhat": yhat_collect,
"plots": plot_nls_collect,
}
)

return self.mmm_data_collection
return output_data

def __prepare_data(self) -> pd.DataFrame:
used_columns = [
var
for var in self.mmm_data_collection.dt_input.columns
if var not in self.mmm_data_collection.unused_vars
]
dt_input = self.mmm_data_collection.dt_input[used_columns]
dt_transform = dt_input.rename(
columns={
self.mmm_data_collection.date_var: "ds",
self.mmm_data_collection.dep_var: "dep_var",
}
)
dt_transform = dt_transform.sort_values(by=["ds"])
return dt_transform
"""
Prepare the input data by converting date columns and setting dependent variables.
Returns:
pd.DataFrame: The prepared data.
"""
dt_input = self.input_data.dt_input.copy()
dt_input['ds'] = pd.to_datetime(dt_input[self.input_data.date_var])
dt_input['dep_var'] = dt_input[self.input_data.dep_var]
return dt_input

def __create_rolling_window_data(self, dt_transform: pd.DataFrame) -> pd.DataFrame:
rolling_window_start = self.mmm_data_collection.rollingWindowStartWhich - 1
rolling_window_end = self.mmm_data_collection.rollingWindowEndWhich - 1
dt_transform_roll_wind = dt_transform.iloc[
rolling_window_start:rolling_window_end
"""
Create a rolling window of data.
Args:
dt_transform (pd.DataFrame): The transformed data.
Returns:
pd.DataFrame: The rolling window data.
"""
return dt_transform.loc[
self.input_data.rollingWindowStartWhich
- 1 : self.input_data.rollingWindowEndWhich
- 1
]
return dt_transform_roll_wind

def __calculate_media_cost_factor(
self, dt_input_roll_wind: pd.DataFrame
) -> List[float]:
media_cost_factor = []
for i in range(len(self.mmm_data_collection.paid_media_spends)):
spend_sum = np.sum(
dt_input_roll_wind[self.mmm_data_collection.paid_media_spends[i]]
)
exposure_sum = np.sum(
dt_input_roll_wind[self.mmm_data_collection.paid_media_vars[i]]
)
media_cost_factor.append(spend_sum / exposure_sum)
return media_cost_factor

def __run_models(
self, dt_input_roll_wind: pd.DataFrame, media_cost_factor: List[float]
) -> Tuple[List[pd.DataFrame], List[Any], List[pd.DataFrame]]:
# Implement model running logic here
# For simplicity, we'll return empty lists
return [], [], []

def __calculate_media_cost_factor(self, dt_input_roll_wind: pd.DataFrame) -> pd.Series:
"""
Calculate the media cost factor.
Args:
dt_input_roll_wind (pd.DataFrame): The input data within the rolling window.
Returns:
pd.Series: The media cost factor.
"""
return dt_input_roll_wind[self.input_data.paid_media_spends].sum() / dt_input_roll_wind[self.input_data.paid_media_vars].sum()

def __run_models(self, dt_input_roll_wind: pd.DataFrame, media_cost_factor: pd.Series) -> Tuple[list, list, list]:
"""
Run nonlinear and linear models for each media variable.
Args:
dt_input_roll_wind (pd.DataFrame): The input data within the rolling window.
media_cost_factor (pd.Series): The media cost factor.
Returns:
tuple: Collections of model results, plots, and predictions.
"""
mod_nls_collect = []
plot_nls_collect = []
yhat_collect = []

for spend, var in zip(self.input_data.paid_media_spends, self.input_data.paid_media_vars):
if spend != var:
dt_spend_mod_input = dt_input_roll_wind[[spend, var]].dropna()
results = self.__fit_spend_exposure(dt_spend_mod_input, media_cost_factor[var])
mod_nls_collect.append(results['res'])
plot_nls_collect.append(results['plot'])
yhat_collect.append(results['yhat'])

return mod_nls_collect, plot_nls_collect, yhat_collect

def __fit_spend_exposure(self, dt_spend_mod_input: pd.DataFrame, media_cost_factor: float) -> Dict[str, Any]:
    """
    Fit the Michaelis-Menten model and a linear model to the spend and exposure data.

    The first column of ``dt_spend_mod_input`` is read as spend and the second
    as exposure. The linear model has no intercept (exposure = coef * spend).

    Args:
        dt_spend_mod_input (pd.DataFrame): The spend and exposure data.
        media_cost_factor (float): The media cost factor. Not used by the
            current implementation; kept for interface compatibility.

    Returns:
        dict: ``'res'`` (fit summary: channel, Vmax, Km, rsq_nls, rsq_lm,
        coef_lm), ``'plot'`` (DataFrame with spend, exposure and both
        predictions) and ``'yhat'`` (the nonlinear predictions). Values of a
        fit that could not be computed are NaN.
    """
    def michaelis_menten(spend: np.ndarray, Vmax: float, Km: float) -> np.ndarray:
        return Vmax * spend / (Km + spend)

    spend = dt_spend_mod_input.iloc[:, 0]
    exposure = dt_spend_mod_input.iloc[:, 1]

    try:
        popt, _ = curve_fit(michaelis_menten, spend, exposure, maxfev=10000)
        yhat_nls = michaelis_menten(spend, *popt)
        rsq_nls = r2_score(exposure, yhat_nls)
    except (RuntimeError, ValueError, TypeError):
        # RuntimeError: the optimizer did not converge. ValueError: NaN or
        # otherwise invalid data. TypeError: fewer observations than
        # parameters. Anything else (e.g. KeyboardInterrupt) must propagate.
        popt = [np.nan, np.nan]
        # np.full always allocates floats; np.full_like would inherit the
        # dtype of ``spend`` and cannot represent NaN for integer spend.
        yhat_nls = np.full(len(spend), np.nan)
        rsq_nls = np.nan

    # Least-squares slope of a line through the origin. The prediction below
    # has no intercept, so the slope must come from the same no-intercept
    # model; taking the slope of an intercept fit and dropping the intercept
    # would report an R^2 for a line that was never fitted.
    spend_sq_sum = float(np.dot(spend, spend))
    if spend_sq_sum > 0:
        lm_coef = float(np.dot(spend, exposure)) / spend_sq_sum
        yhat_lm = lm_coef * spend
        rsq_lm = r2_score(exposure, yhat_lm)
    else:
        # Empty input or all-zero spend: the slope is undefined.
        lm_coef = np.nan
        yhat_lm = np.full(len(spend), np.nan)
        rsq_lm = np.nan

    res = {
        'channel': dt_spend_mod_input.columns[1],
        'Vmax': popt[0],
        'Km': popt[1],
        'rsq_nls': rsq_nls,
        'rsq_lm': rsq_lm,
        'coef_lm': lm_coef
    }

    plot_data = pd.DataFrame({
        'spend': spend,
        'exposure': exposure,
        'yhat_nls': yhat_nls,
        'yhat_lm': yhat_lm
    })

    return {'res': res, 'plot': plot_data, 'yhat': yhat_nls}
Binary file not shown.
42 changes: 42 additions & 0 deletions tests/test_feature_engineering_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# tests/test_feature_engineering_data.py

import sys
from pathlib import Path

import pytest

# Make the in-repo ``robyn`` package importable without installing it. The
# location is derived from this file rather than a developer-specific absolute
# path, so the test also runs on CI and on other machines.
# NOTE(review): assumes this file lives in <repo>/tests and the package in
# <repo>/python/src -- confirm against the repository layout.
_SRC_DIR = Path(__file__).resolve().parents[1] / "python" / "src"
if str(_SRC_DIR) not in sys.path:
    sys.path.append(str(_SRC_DIR))

from robyn.modeling.entities.feature_engineering_data import FeatureEngineeringInputData

def test_feature_engineering_input_data_initialization():
    """Build the input container positionally and spot-check four attributes."""
    # Values are listed in the constructor's positional order.
    constructor_args = (
        None,                        # dt_input
        "date",                      # date_var
        "revenue",                   # dep_var
        "revenue",                   # dep_var_type
        ["tv_S", "ooh_S"],           # paid_media_spends
        ["tv_S", "ooh_S"],           # paid_media_vars
        ["positive", "positive"],    # paid_media_signs
        ["competitor_sales_B"],      # context_vars
        ["positive"],                # context_signs
        ["newsletter"],              # organic_vars
        ["positive"],                # organic_signs
        ["events"],                  # factor_vars
        None,                        # dt_holidays
        ["trend", "season"],         # prophet_vars
        ["default", "default"],      # prophet_signs
        "DE",                        # prophet_country
        "geometric",                 # adstock
        None,                        # hyperparameters
        "2016-11-23",                # window_start
        "2018-08-22",                # window_end
        None,                        # calibration_input
        None,                        # json_file
    )

    input_data = FeatureEngineeringInputData(*constructor_args)

    expected = {
        "date_var": "date",
        "dep_var": "revenue",
        "paid_media_spends": ["tv_S", "ooh_S"],
        "prophet_country": "DE",
    }
    for attribute, value in expected.items():
        assert getattr(input_data, attribute) == value

0 comments on commit 1fe934d

Please sign in to comment.