From 135f647aa36a551bfc3090dd9d5fb4c006f425a7 Mon Sep 17 00:00:00 2001 From: L2Data Date: Mon, 8 Oct 2018 21:46:39 +0200 Subject: [PATCH] Move categorical label encoding into working_df (after selection); use cat_* label columns in generator/trainer; raise n_vocab to 5000, lower learning_rate --- src/data/dataframe.py | 21 +++++++++------------ src/models/data_generator.py | 9 +++++---- src/models/train_model.py | 13 +++++-------- src/utils/utils.py | 4 ++-- 4 files changed, 21 insertions(+), 26 deletions(-) diff --git a/src/data/dataframe.py b/src/data/dataframe.py index 5e10cc1..904251d 100644 --- a/src/data/dataframe.py +++ b/src/data/dataframe.py @@ -28,7 +28,7 @@ def read_df(load_from): df = pd.read_csv(load_from, sep=';', header=0) if('Unnamed: 0' in df.columns): df.drop(['Unnamed: 0'], axis=1) - for col in ['reduced_title', 'tokenized', 'cat_category', 'cat_product_category', 'cat_product_type', 'cat_product_details']: + for col in ['reduced_title', 'tokenized']:#, 'cat_category', 'cat_product_category', 'cat_product_type', 'cat_product_details']: if(col in df.columns): df.loc[:, col] = df.loc[:, col].apply(lambda x: literal_eval(x)) return df @@ -149,7 +149,7 @@ def make_keras_embeddings(self, data): vocab_len = data[column].apply(pd.Series).stack().value_counts() print('************************ max_len:', max_len) - print('************************ vocab_len:', vocab_len) + print('************************ vocab_len:', vocab_len, ut.params.n_vocab) tokenize = Tokenizer(num_words=ut.params.n_vocab, @@ -170,20 +170,12 @@ def make_keras_embeddings(self, data): return data - def make_categorical(self, data): - data['cat_category'] = data['category'].astype('category').cat.codes - data['cat_product_category'] = data['product_category'].astype('category').cat.codes - data['cat_product_type'] = data['product_type'].astype('category').cat.codes - data['cat_product_details'] = data['product_details'].astype('category').cat.codes - return data - def make_clean(self, data): data = self.make_clean_title(data) data = self.make_clean_imagecontent(data) data = self.make_expanded_categories(data) data = 
self.make_keras_embeddings(data) data = self.make_clean_sku(data) - data = self.make_categorical(data) data = data.dropna().reset_index(drop=True) return data @@ -255,6 +247,12 @@ def make_selection(self, data): data = self.select_category_threshold(data) return data +def make_categorical(data): + data['cat_category'] = data['category'].astype('category').cat.codes + data['cat_product_category'] = data['product_category'].astype('category').cat.codes + data['cat_product_type'] = data['product_type'].astype('category').cat.codes + data['cat_product_details'] = data['product_details'].astype('category').cat.codes + return data def working_df(clean_title=True, column='category', quantile=None, sample_size=None): df_clean_dir = os.path.join(ut.dirs.raw_dir, ut.df_names.cleaned_df) @@ -273,6 +271,5 @@ def working_df(clean_title=True, column='category', quantile=None, sample_size=N df_return = stat_selection(column, quantile, sample_size).make_selection(df_cleaned) - del df_cleaned - return df_return + return make_categorical(df_return) diff --git a/src/models/data_generator.py b/src/models/data_generator.py index 85e0893..399efec 100644 --- a/src/models/data_generator.py +++ b/src/models/data_generator.py @@ -33,10 +33,11 @@ def __init__(self, df, data_path, batch_size, classmode, modelmode, mode='train' self.modelmode = modelmode # Take labels and a list of image locations in memory - self.labels = to_categorical(np.array(self.df['category'].values.tolist())) - self.labels_pc = to_categorical(np.array(self.df['product_category'].values.tolist())) - self.labels_pt = to_categorical(np.array(self.df['product_type'].values.tolist())) - self.labels_pd = to_categorical(np.array(self.df['product_details'].values.tolist())) + #self.labels = to_categorical(np.array(self.df['category'].values.tolist())) + self.labels = to_categorical(np.array(self.df['cat_category'].values.tolist()))#to_categorical(np.array(self.df['category'].values.tolist())) + self.labels_pc = 
to_categorical(np.array(self.df['cat_product_category'].values.tolist())) + self.labels_pt = to_categorical(np.array(self.df['cat_product_type'].values.tolist())) + self.labels_pd = to_categorical(np.array(self.df['cat_product_details'].values.tolist())) self.im_list = self.df['imagename'].apply(lambda x: os.path.join(data_path, x)).tolist() self.text_list = self.df['tokenized_title'].apply(lambda x: literal_eval(x)).values.tolist() diff --git a/src/models/train_model.py b/src/models/train_model.py index 40f9e36..b94983c 100644 --- a/src/models/train_model.py +++ b/src/models/train_model.py @@ -54,14 +54,11 @@ def on_epoch_end(self, epoch, logs=None): super(TrainValTensorBoard, self).on_epoch_end(epoch, logs) def make_labels(data, classmode): - data['category'] = data['category'].astype('category').cat.codes - data['product_category'] = data['product_category'].astype('category').cat.codes - data['product_type'] = data['product_type'].astype('category').cat.codes - data['product_details'] = data['product_details'].astype('category').cat.codes - n_classes = np.max(np.unique(data['category'].tolist()))+1 - n_classes1 = np.max(np.unique(data['product_category'].tolist()))+1 - n_classes2 = np.max(np.unique(data['product_type'].tolist()))+1 - n_classes3 = np.max(np.unique(data['product_details'].tolist()))+1 + #print(len(data['cat_product_category'].value_counts())) + n_classes = len(data['cat_category'].value_counts()) + n_classes1 = len(data['cat_product_category'].value_counts()) + n_classes2 = len(data['cat_product_type'].value_counts()) + n_classes3 = len(data['cat_product_details'].value_counts()) if(classmode == 'multiclass'): return n_classes diff --git a/src/utils/utils.py b/src/utils/utils.py index 633d574..2269d4b 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -25,13 +25,13 @@ class df_names: class params: n_words = 9 - n_vocab = 4107 #8214 + n_vocab = 5000 #8214 seed = 42 quantile = 10 subsample = .25 batch_size = 8 epochs = 12 - learning_rate = 
0.0001 + learning_rate = 0.00005 image_width = 64 image_heigth = 64 classmode = 'multilabel'