#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
# st.title("Product and Category Demand Forecast")
df1 = pd.read_csv('category_tree.csv')
df2 = pd.read_csv('events.csv')
df3=pd.read_csv("item_properties_part1.csv")
df4=pd.read_csv("item_properties_part2.csv")
df = pd.concat([df3,df4],axis='index')
# df = pd.read_csv('/content/gdrive/MyDrive/Kaggle/item_properties.csv')
df.timestamp = pd.to_datetime(df.timestamp,unit='ms')
df2.timestamp = pd.to_datetime(df2.timestamp,unit='ms')
# df.to_hdf("df.hf5",key="df")
df2.to_hdf("df2.hf5",key="df2")
cat = df[df.property == 'categoryid'].drop(columns=['property'])  # avoids SettingWithCopyWarning from inplace drop on a slice
df_cat_merge = pd.merge(cat,df2,on='itemid')
df_cat_merge.drop(columns=['visitorid','transactionid','timestamp_x'],inplace=True)
df_cat_merge.rename(columns={'value':'categoryid','timestamp_y':'timestamp'},inplace=True)
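# Truncate timestamps to calendar dates so activity can be summed per day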
df_cat_merge.timestamp = df_cat_merge.timestamp.dt.date
dummies = pd.get_dummies(df_cat_merge.event)
df_cat_merge = pd.concat([df_cat_merge,dummies],axis='columns')
df_cat_merge.drop(columns=['event'],inplace=True)
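# Weighted activity score: the weights below are this script's heuristic --
# a transaction (0.5) counts more than an add-to-cart (0.3), which counts
# more than a view (0.2)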
df_cat_merge['wt_act'] = 0.3*df_cat_merge['addtocart'] + 0.5*df_cat_merge['transaction'] + 0.2*df_cat_merge['view']
df_cat_merge_group = df_cat_merge.groupby(['categoryid','timestamp']).wt_act.sum().reset_index()
df_cat_merge_group.set_index('timestamp',inplace=True)
df_cat_merge_group.categoryid = df_cat_merge_group.categoryid.astype(int)
prod = df2.drop(columns=['transactionid','visitorid'])
prod.timestamp = prod.timestamp.dt.date
dummies = pd.get_dummies(prod.event)
prod = pd.concat([prod,dummies],axis='columns')
prod.drop(columns=['event'],inplace=True)
prod['wt_act'] = 0.3*prod['addtocart'] + 0.5*prod['transaction'] + 0.2*prod['view']
prod_group = prod.groupby(['itemid','timestamp']).wt_act.sum().reset_index()
prod_group.set_index('timestamp',inplace=True)
# Keep only items with more than 30 daily observations so each series has enough history
item_counts = prod_group.itemid.value_counts()
prod_group = prod_group[prod_group.itemid.isin(item_counts[item_counts > 30].index)]
cat.to_hdf("cat.hf5",key="cat")
df_cat_merge.to_hdf("df_cat_merge.hf5",key="df_cat_merge")
df_cat_merge_group.to_hdf("df_cat_merge_group.hf5",key="df_cat_merge_group")
prod.to_hdf("prod.hf5",key="prod")
prod_group.to_hdf("prod_group.hf5",key="prod_group")
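# A downstream app (e.g. the Streamlit front end hinted at by the commented-out
# st.title above) could reload these artifacts with pd.read_hdf. A minimal
# sketch, assuming the .hf5 files sit in the working directory:
#   prod_group = pd.read_hdf("prod_group.hf5", key="prod_group")
#   df_cat_merge_group = pd.read_hdf("df_cat_merge_group.hf5", key="df_cat_merge_group")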
# cat.to_csv("cat.csv")
# df_cat_merge.to_csv("df_cat_merge.csv")
# df_cat_merge_group.to_csv("df_cat_merge_group.csv")
# prod.to_csv("prod.csv")
# prod_group.to_csv("prod_group.csv")
# -*- coding: utf-8 -*-
"""ProductandCategoryRecommendationPass1.2.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1HMMYeDDZ8xJT7AqzEg-1-_15b9DM8wVN
"""
# Commented out IPython magic to ensure Python compatibility.
# Import all the Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import time
# %matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from mlxtend.frequent_patterns import apriori, association_rules
import warnings
warnings.filterwarnings("ignore")
# Importing the Events File and segregating the events into various columns
events=pd.read_csv("events.csv")
events_copy = events.copy()  # a real copy, so the original frame is not mutated
events_dummies = pd.get_dummies(events_copy.event)  # one indicator column per event type
events_copy_withdummies = pd.concat([events_copy, events_dummies], axis=1)
events_copy_withdummies.drop(["event", "timestamp"], axis=1, inplace=True)
events = events_copy_withdummies
#Relating Item_ID with Category_ID
df_item_prop_part1=pd.read_csv("item_properties_part1.csv")
df_item_prop_part2=pd.read_csv("item_properties_part2.csv")
df_itemsprop = pd.concat([df_item_prop_part1, df_item_prop_part2])  # merge both property files (DataFrame.append was removed in pandas 2.0)
df_itemsprop.reset_index(inplace=True)
it_cat=df_itemsprop[df_itemsprop.property=="categoryid"].drop(["index","timestamp","property"],axis=1)
it_cat_sort=it_cat.sort_values("itemid").reset_index().drop("index",axis=1)
it_and_cat=it_cat_sort.drop_duplicates()
it_and_cat.reset_index(inplace=True)
it_and_cat["value"] = it_and_cat["value"].astype(int)  # category ids as integers
# Merging the Category with events
it_and_cat_copy = it_and_cat.copy()
events_with_cat=events.merge(it_and_cat_copy,how="inner",on="itemid")
events_with_cat=events_with_cat.rename(columns={'value':'categoryid'})
# Perform the Apriori Algorithm with the Events File
events_with_cat_grouping = (events_with_cat[events_with_cat["transactionid"].notna()]
                            .groupby(["visitorid", "categoryid"])["transaction"].sum())
events_with_cat_grouping=events_with_cat_grouping.unstack()
events_with_cat_grouping=events_with_cat_grouping.fillna(0)
def hot_encode(x):
    # Binarise: any positive transaction count becomes 1
    return 1 if x >= 1 else 0

events_with_cat_grouping_encoded = events_with_cat_grouping.applymap(hot_encode)
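# The encoded frame is a visitor x category basket matrix of 0/1 flags, the
# one-hot input format mlxtend's apriori expects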
frq_items = apriori(events_with_cat_grouping_encoded, min_support = 0.0020, use_colnames = True)
# Collecting the inferred rules in a dataframe
rules = association_rules(frq_items, metric ="lift", min_threshold = 1)
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
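# lift(A -> B) = support(A and B) / (support(A) * support(B)); min_threshold=1
# keeps only rules whose items co-occur at least as often as independence
# would predict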
# Build a dataframe out of all the antecedents and consequents
rules_1 = rules.drop(["antecedent support", "consequent support", "support",
                      "confidence", "lift", "leverage", "conviction"], axis=1)
rules_1.sort_index(inplace=True)
df_y = pd.DataFrame(None)  # recommendation table: one column per antecedent category
keys = dict.fromkeys(events_with_cat["categoryid"].unique(), 0)  # next free row index per antecedent column
# For every rule, record each consequent under each antecedent's column
for i in rules_1.index:
    x = rules_1["antecedents"].iloc[i]
    y = rules_1["consequents"].iloc[i]
    for j in x:
        for k in y:
            df_y.at[keys[j], j] = k
            keys[j] = keys[j] + 1
df_y=df_y.fillna(-1).astype(int)
df_z=it_and_cat
df_z.rename(columns={'value':'categoryid'},inplace=True)
# Make sure every known category has a column, padded with -1
for i in df_z.categoryid.unique():
    if i not in df_y.columns:
        df_y[i] = -1
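# df_y now holds, per antecedent category (column), the recommended consequent
# categories, padded with -1. A minimal lookup sketch (cat_id is a hypothetical
# example value, not taken from the data):
#   recs = df_y[cat_id]
#   recs = recs[recs != -1].unique().tolist()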
rules_copy=rules.copy()
rules_copy.reset_index(drop=True,inplace=True)
# Convert frozensets to plain lists so the rules serialise cleanly
for i in rules_copy.index:
    rules_copy.at[i, "antecedents"] = list(rules_copy.iloc[i]["antecedents"])
    rules_copy.at[i, "consequents"] = list(rules_copy.iloc[i]["consequents"])
df_z.to_hdf("df_z.hf5",key="df_z")
df_y.to_hdf("df_y.hf5",key="df_y")
rules_copy.to_hdf("rules.hf5",key="rules")
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
df_3 = pd.read_csv('events.csv')
df_3 = df_3[df_3['event']=='transaction']
import datetime
# Convert epoch milliseconds to datetimes
df_3['timestamp'] = [datetime.datetime.fromtimestamp(ts // 1000.0) for ts in df_3['timestamp']]
tx_user = pd.DataFrame(df_3['visitorid'].unique())
tx_user.columns = ['visitorid']
tx_max_purchase = df_3.groupby('visitorid').timestamp.max().reset_index()
tx_max_purchase.columns = ['visitorid','MaxPurchaseDate']
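# Recency = days since the visitor's last transaction, measured against the
# most recent transaction date in the dataset (newest buyers get Recency 0)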
tx_max_purchase['Recency'] = (tx_max_purchase['MaxPurchaseDate'].max() - tx_max_purchase['MaxPurchaseDate']).dt.days
tx_user = pd.merge(tx_user, tx_max_purchase[['visitorid','Recency']], on='visitorid')
tx_user.head()
from sklearn.cluster import KMeans

# Elbow method: fit k-means for k = 1..9 on Recency only and record inertia (SSE)
sse = {}
tx_recency = tx_user[['Recency']].copy()
for k in range(1, 10):
    # fit on the Recency column only, so the labels added below are not
    # accidentally used as a feature in later iterations
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_recency[['Recency']])
    tx_recency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of clusters")
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['Recency']])
tx_user['RecencyCluster'] = kmeans.predict(tx_user[['Recency']])
# Function for reordering cluster labels so that label order follows the
# cluster means of the target field (higher label = higher/lower mean,
# depending on `ascending`)
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    df_final = pd.merge(df, df_new[[cluster_field_name, 'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis=1)
    df_final = df_final.rename(columns={"index": cluster_field_name})
    return df_final
tx_user = order_cluster('RecencyCluster', 'Recency',tx_user,False)
df_num = tx_user.drop('visitorid',axis=1)
df_num = df_num.drop('RecencyCluster',axis=1)
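# Min-max scale to 0-100 so features sit on a common scale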
df_norm = (df_num - df_num.min()) / (df_num.max() - df_num.min())*100
tx_user[df_norm.columns] = df_norm
tx_frequency = df_3.groupby('visitorid').timestamp.count().reset_index()
tx_frequency.columns = ['visitorid','Frequency']
tx_user = pd.merge(tx_user, tx_frequency, on='visitorid')
#k-means
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = kmeans.predict(tx_user[['Frequency']])
#order the frequency cluster
tx_user = order_cluster('FrequencyCluster', 'Frequency',tx_user,True)
df_num = tx_user.drop(['visitorid','Recency','FrequencyCluster'],axis=1)
df_num = df_num.drop('RecencyCluster',axis=1)
df_norm = (df_num - df_num.min()) / (df_num.max() - df_num.min())*100
tx_user[df_norm.columns] = df_norm
# calculate overall score from the two ordered cluster labels
tx_user['OverallScore'] = tx_user['RecencyCluster'] + tx_user['FrequencyCluster']
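# OverallScore ranges 0-5: RecencyCluster is 0-2 (3 clusters) and
# FrequencyCluster is 0-3 (4 clusters), both ordered so higher = better.
# 0-1 -> Low-Value, 2-3 -> Mid-Value, 4-5 -> High-Value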
tx_user['Segment'] = 'Low-Value'
tx_user.loc[tx_user['OverallScore']>1,'Segment'] = 'Mid-Value'
tx_user.loc[tx_user['OverallScore']>3,'Segment'] = 'High-Value'
tx_user.to_csv('transactions.csv')
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
df_3 = pd.read_csv('events.csv')
df_3 = df_3[df_3['event']=='transaction']
import datetime
# Convert epoch milliseconds to datetimes
df_3['timestamp'] = [datetime.datetime.fromtimestamp(ts // 1000.0) for ts in df_3['timestamp']]
customer = df_3.groupby('visitorid').agg({'timestamp': lambda x: x.min().month,
                                          'transactionid': lambda x: len(x)})
customer.columns = ['Start_Month', 'Frequency']
months = [ 'May','Jun', 'Jul', 'Aug', 'Sep']
Monthly_CLV = []
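# Simplified CLV heuristic: with a constant churn rate, the expected customer
# lifetime is 1/churn periods, so CLV = purchase_frequency / churn. For
# example, frequency 1.5 and churn 0.6 give CLV = 1.5 / 0.6 = 2.5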
for i in range(1, 6):
    customer_m = customer[customer['Start_Month'] == i + 4]  # months 5..9 = May..Sep
    Purchase_freq = round(np.mean(customer_m['Frequency']), 2)
    Retention_rate = customer_m[customer_m['Frequency'] > 1].shape[0] / customer_m.shape[0]
    churn = round(1 - Retention_rate, 2)
    CLV = round(Purchase_freq / churn, 2)
    Monthly_CLV.append(CLV)
monthly_clv = pd.DataFrame(zip(months, Monthly_CLV), columns=['Months', 'CLV'])
monthly_clv.to_csv('hist.csv')