Commit 135f647

changes
lbhesse committed Oct 8, 2018
1 parent ab80732 commit 135f647
Showing 4 changed files with 21 additions and 26 deletions.
21 changes: 9 additions & 12 deletions src/data/dataframe.py
@@ -28,7 +28,7 @@ def read_df(load_from):
     df = pd.read_csv(load_from, sep=';', header=0)
     if('Unnamed: 0' in df.columns):
         df.drop(['Unnamed: 0'], axis=1)
-    for col in ['reduced_title', 'tokenized', 'cat_category', 'cat_product_category', 'cat_product_type', 'cat_product_details']:
+    for col in ['reduced_title', 'tokenized']:#, 'cat_category', 'cat_product_category', 'cat_product_type', 'cat_product_details']:
         if(col in df.columns):
             df.loc[:, col] = df.loc[:, col].apply(lambda x: literal_eval(x))
     return df
@@ -149,7 +149,7 @@ def make_keras_embeddings(self, data):
         vocab_len = data[column].apply(pd.Series).stack().value_counts()
 
         print('************************ max_len:', max_len)
-        print('************************ vocab_len:', vocab_len)
+        print('************************ vocab_len:', vocab_len, ut.params.n_vocab)
 
 
         tokenize = Tokenizer(num_words=ut.params.n_vocab,
@@ -170,20 +170,12 @@ def make_keras_embeddings(self, data):
 
         return data
 
-    def make_categorical(self, data):
-        data['cat_category'] = data['category'].astype('category').cat.codes
-        data['cat_product_category'] = data['product_category'].astype('category').cat.codes
-        data['cat_product_type'] = data['product_type'].astype('category').cat.codes
-        data['cat_product_details'] = data['product_details'].astype('category').cat.codes
-        return data
-
     def make_clean(self, data):
         data = self.make_clean_title(data)
         data = self.make_clean_imagecontent(data)
         data = self.make_expanded_categories(data)
         data = self.make_keras_embeddings(data)
         data = self.make_clean_sku(data)
-        data = self.make_categorical(data)
         data = data.dropna().reset_index(drop=True)
         return data
 
@@ -255,6 +247,12 @@ def make_selection(self, data):
         data = self.select_category_threshold(data)
         return data
 
+def make_categorical(data):
+    data['cat_category'] = data['category'].astype('category').cat.codes
+    data['cat_product_category'] = data['product_category'].astype('category').cat.codes
+    data['cat_product_type'] = data['product_type'].astype('category').cat.codes
+    data['cat_product_details'] = data['product_details'].astype('category').cat.codes
+    return data
 
 def working_df(clean_title=True, column='category', quantile=None, sample_size=None):
     df_clean_dir = os.path.join(ut.dirs.raw_dir, ut.df_names.cleaned_df)
@@ -273,6 +271,5 @@ def working_df(clean_title=True, column='category', quantile=None, sample_size=None):
 
     df_return = stat_selection(column, quantile, sample_size).make_selection(df_cleaned)
 
-
     del df_cleaned
-    return df_return
+    return make_categorical(df_return)
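
Aside: the relocated make_categorical helper is built on pandas category codes. A minimal standalone sketch of the mechanism, with an invented toy column:

import pandas as pd

# Toy frame; 'category' stands in for the label columns used above.
df = pd.DataFrame({'category': ['shoes', 'shirts', 'shoes', 'pants']})

# astype('category').cat.codes maps each distinct value to an integer code
# (0 .. n_categories-1, assigned from the sorted categories).
df['cat_category'] = df['category'].astype('category').cat.codes
print(df[['category', 'cat_category']])
#   category  cat_category
# 0    shoes             2
# 1   shirts             1
# 2    shoes             2
# 3    pants             0
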
9 changes: 5 additions & 4 deletions src/models/data_generator.py
@@ -33,10 +33,11 @@ def __init__(self, df, data_path, batch_size, classmode, modelmode, mode='train'
         self.modelmode = modelmode
 
         # Take labels and a list of image locations in memory
-        self.labels = to_categorical(np.array(self.df['category'].values.tolist()))
-        self.labels_pc = to_categorical(np.array(self.df['product_category'].values.tolist()))
-        self.labels_pt = to_categorical(np.array(self.df['product_type'].values.tolist()))
-        self.labels_pd = to_categorical(np.array(self.df['product_details'].values.tolist()))
+        #self.labels = to_categorical(np.array(self.df['category'].values.tolist()))
+        self.labels = to_categorical(np.array(self.df['cat_category'].values.tolist()))#to_categorical(np.array(self.df['category'].values.tolist()))
+        self.labels_pc = to_categorical(np.array(self.df['cat_product_category'].values.tolist()))
+        self.labels_pt = to_categorical(np.array(self.df['cat_product_type'].values.tolist()))
+        self.labels_pd = to_categorical(np.array(self.df['cat_product_details'].values.tolist()))
         self.im_list = self.df['imagename'].apply(lambda x: os.path.join(data_path, x)).tolist()
         self.text_list = self.df['tokenized_title'].apply(lambda x: literal_eval(x)).values.tolist()
 
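
Aside: keras.utils.to_categorical expects non-negative integer class indices, which is what the cat_* code columns produced by make_categorical provide. A small illustrative sketch with invented values:

import numpy as np
from keras.utils import to_categorical

# Integer category codes, e.g. the output of astype('category').cat.codes
codes = np.array([2, 0, 1, 2])

# One row per sample, one column per class (here 3 classes: 0, 1, 2)
labels = to_categorical(codes)
print(labels)
# [[0. 0. 1.]
#  [1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]]
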
13 changes: 5 additions & 8 deletions src/models/train_model.py
@@ -54,14 +54,11 @@ def on_epoch_end(self, epoch, logs=None):
         super(TrainValTensorBoard, self).on_epoch_end(epoch, logs)
 
 def make_labels(data, classmode):
-    data['category'] = data['category'].astype('category').cat.codes
-    data['product_category'] = data['product_category'].astype('category').cat.codes
-    data['product_type'] = data['product_type'].astype('category').cat.codes
-    data['product_details'] = data['product_details'].astype('category').cat.codes
-    n_classes = np.max(np.unique(data['category'].tolist()))+1
-    n_classes1 = np.max(np.unique(data['product_category'].tolist()))+1
-    n_classes2 = np.max(np.unique(data['product_type'].tolist()))+1
-    n_classes3 = np.max(np.unique(data['product_details'].tolist()))+1
+    #print(len(data['cat_product_category'].value_counts()))
+    n_classes = len(data['cat_category'].value_counts())
+    n_classes1 = len(data['cat_product_category'].value_counts())
+    n_classes2 = len(data['cat_product_type'].value_counts())
+    n_classes3 = len(data['cat_product_details'].value_counts())
 
     if(classmode == 'multiclass'):
         return n_classes
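
Aside: len(value_counts()) and np.max(...)+1 agree only while every category code is still present. If filtering (e.g. the selection or dropna steps) removes all rows of some category, the two diverge, which is presumably what motivates this change. A quick illustration with made-up codes:

import numpy as np
import pandas as pd

# Codes 0..4 were assigned before filtering, but no rows with code 3 survived.
codes = pd.Series([0, 1, 2, 4, 4, 0])

print(np.max(np.unique(codes.tolist())) + 1)  # 5 -> still counts the missing class
print(len(codes.value_counts()))              # 4 -> classes actually present
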
4 changes: 2 additions & 2 deletions src/utils/utils.py
@@ -25,13 +25,13 @@ class df_names:
 
 class params:
     n_words = 9
-    n_vocab = 4107 #8214
+    n_vocab = 5000 #8214
     seed = 42
     quantile = 10
     subsample = .25
     batch_size = 8
     epochs = 12
-    learning_rate = 0.0001
+    learning_rate = 0.00005
     image_width = 64
     image_heigth = 64
     classmode = 'multilabel'
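
Aside: n_vocab feeds the Tokenizer(num_words=ut.params.n_vocab, ...) call in dataframe.py; num_words caps how many of the most frequent words survive when texts are converted to index sequences. A toy sketch with invented texts:

from keras.preprocessing.text import Tokenizer

texts = ['red shirt', 'red shoes', 'blue shirt']

# Only words with an index below num_words are kept by texts_to_sequences.
tokenize = Tokenizer(num_words=3)
tokenize.fit_on_texts(texts)
print(tokenize.texts_to_sequences(texts))
# e.g. [[1, 2], [1], [2]] -- the two most frequent words survive, the rest are dropped
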
