Commit ca1ac40

pipeline updated

sunnysavita10 committed Jan 10, 2024
1 parent c0d4a32 commit ca1ac40
Showing 11 changed files with 387,378 additions and 76 deletions.
Binary file added artifacts/model.pkl (not shown)
Binary file added artifacts/preprocessor.pkl (not shown)
193,574 changes: 193,574 additions & 0 deletions artifacts/raw.csv (large diff not rendered)

48,395 changes: 48,395 additions & 0 deletions artifacts/test.csv (large diff not rendered)

145,180 changes: 145,180 additions & 0 deletions artifacts/train.csv (large diff not rendered)

49 changes: 15 additions & 34 deletions setup.py
@@ -1,43 +1,24 @@
-from setuptools import setup, find_packages
+from setuptools import find_packages, setup
 from typing import List
 
-HYPEN_E_DOT='-e .'
-def get_requirements(file_path:str)->List[str]:
-    requirements=[]
-    with open(file_path) as file_obj:
-        requirements=file_obj.readlines()
-        requirements=[req.replace("\n","") for req in requirements]
-    if HYPEN_E_DOT in requirements:
-        requirements.remove(HYPEN_E_DOT)
-    return requirements
-
-'''def get_requiremet(file_path:str)->List[str]:
-    requirements = []
-    with open(file_path) as f:
-        requirements=f.readlines()
-        requirements=[req.replace("\n","") for req in requirements]
-    if HYPEN_E_DOT in requirements:
-        requirements.remove(HYPEN_E_DOT)
-    return requirements'''
-
-with open('README.md', 'r', encoding='utf-8') as f:
-    long_description = f.read()
-
-__version__ = "0.0.4"
-REPO_NAME = "mongodb_connector"
-PKG_NAME = "MongoDB-Connect"
-AUTHOR_USER_NAME = "sunnysavita10"
-AUTHOR_EMAIL = "sunny.savita@ineuron.ai"
+"""HYPEN_E_DOT='-e .'
+def get_requirements(file_path:str)->List[str]:
+    requirements=[]
+    with open(file_path) as file_obj:
+        requirements=file_obj.readlines()
+        requirements=[req.replace("\n","") for req in requirements]
+    if HYPEN_E_DOT in requirements:
+        requirements.remove(HYPEN_E_DOT)
+    return requirements"""
 
 setup(
-    name=PKG_NAME,
-    version=__version__,
-    author=AUTHOR_USER_NAME,
-    author_email=AUTHOR_EMAIL,
-    description="A python package for connecting with database.",
-    long_description=long_description,
-    long_description_content="text/markdown",
-    url=f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}",
-    project_urls={
-        "Bug Tracker": f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}/issues",
-    },
-    package_dir={"": "src"},
-    packages=find_packages(where="src"),
+    name='DimondPricePrediction',
+    version='0.0.1',
+    author='sunny savita',
+    author_email='sunny.savita@ineuron.ai',
+    install_requires=["scikit-learn","pandas","numpy"],
+    packages=find_packages()
 )
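Note: the commit pins install_requires by hand and leaves the get_requirements helper commented out. If the helper were re-enabled, the conventional wiring reads the dependency list from requirements.txt and strips the editable-install flag; a minimal sketch under that assumption (not part of this commit):

from setuptools import find_packages, setup
from typing import List

HYPEN_E_DOT = '-e .'

def get_requirements(file_path: str) -> List[str]:
    # Read requirements.txt, strip newlines, and drop the '-e .' entry,
    # which pip understands but install_requires does not.
    with open(file_path) as file_obj:
        requirements = [req.replace("\n", "") for req in file_obj.readlines()]
    if HYPEN_E_DOT in requirements:
        requirements.remove(HYPEN_E_DOT)
    return requirements

setup(
    name='DimondPricePrediction',
    version='0.0.1',
    author='sunny savita',
    author_email='sunny.savita@ineuron.ai',
    install_requires=get_requirements('requirements.txt'),
    packages=find_packages()
)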
43 changes: 39 additions & 4 deletions src/components/data_ingestion.py
@@ -6,21 +6,56 @@
 import os
 import sys
-from sklearn.model_selection import train_test_splits
+from sklearn.model_selection import train_test_split
 from dataclasses import dataclass
 from pathlib import Path
 
 @dataclass
 class DataIngestionConfig:
-    pass
+    raw_data_path:str=os.path.join("artifacts","raw.csv")
+    train_data_path:str=os.path.join("artifacts","train.csv")
+    test_data_path:str=os.path.join("artifacts","test.csv")
 
 class DataIngestion:
     def __init__(self):
-        pass
+        self.ingestion_config=DataIngestionConfig()
 
     def initiate_data_ingestion(self):
         logging.info("data ingestion started")
         try:
-            pass
+            # Pull the source dataset and keep a raw copy under artifacts/
+            data=pd.read_csv("https://raw.githubusercontent.com/sunnysavita10/fsdsmendtoend/main/notebooks/data/gemstone.csv")
+            logging.info("read the dataset into a dataframe")
+
+            os.makedirs(os.path.dirname(self.ingestion_config.raw_data_path),exist_ok=True)
+            data.to_csv(self.ingestion_config.raw_data_path,index=False)
+            logging.info("saved the raw dataset in the artifacts folder")
+
+            # Hold out 25% of the rows as the test split
+            train_data,test_data=train_test_split(data,test_size=0.25)
+            logging.info("train test split completed")
+
+            train_data.to_csv(self.ingestion_config.train_data_path,index=False)
+            test_data.to_csv(self.ingestion_config.test_data_path,index=False)
+
+            logging.info("data ingestion part completed")
+
+            return (
+                self.ingestion_config.train_data_path,
+                self.ingestion_config.test_data_path
+            )
 
         except Exception as e:
-            logging.info()
+            logging.info("exception occurred during data ingestion")
             raise customexception(e,sys)
 
 
 if __name__=="__main__":
     obj=DataIngestion()
     obj.initiate_data_ingestion()
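Note: every component here raises customexception from src.exception.exception, a module this commit does not touch. A typical implementation in this project layout wraps the original error with file and line information; a minimal sketch, assuming the usual pattern rather than the repo's actual code:

import sys

class customexception(Exception):
    # error_detail is the sys module itself, matching the call
    # customexception(e, sys) used throughout the components.
    def __init__(self, error_message, error_detail):
        super().__init__(error_message)
        _, _, exc_tb = error_detail.exc_info()
        self.error_message = (
            f"Error in {exc_tb.tb_frame.f_code.co_filename} "
            f"at line {exc_tb.tb_lineno}: {error_message}"
        )

    def __str__(self):
        return self.error_message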
107 changes: 102 additions & 5 deletions src/components/data_transformation.py
@@ -17,15 +17,112 @@
 
 @dataclass
 class DataTransformationConfig:
-    pass
+    preprocessor_obj_file_path=os.path.join('artifacts','preprocessor.pkl')
 
 
 class DataTransformation:
     def __init__(self):
-        pass
+        self.data_transformation_config=DataTransformationConfig()
 
-    def initiate_data_ingestion(self):
+    def get_data_transformation(self):
         try:
-            pass
+            logging.info('Data Transformation initiated')
+
+            # Define which columns should be ordinal-encoded and which should be scaled
+            categorical_cols = ['cut', 'color', 'clarity']
+            numerical_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']
+
+            # Define the custom ranking for each ordinal variable
+            cut_categories = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
+            color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
+            clarity_categories = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
+
+            logging.info('Pipeline Initiated')
+
+            # Numerical pipeline: impute with the median, then standardize
+            num_pipeline=Pipeline(
+                steps=[
+                    ('imputer',SimpleImputer(strategy='median')),
+                    ('scaler',StandardScaler())
+                ]
+            )
+
+            # Categorical pipeline: impute with the mode, ordinal-encode, then standardize
+            cat_pipeline=Pipeline(
+                steps=[
+                    ('imputer',SimpleImputer(strategy='most_frequent')),
+                    ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
+                    ('scaler',StandardScaler())
+                ]
+            )
+
+            preprocessor=ColumnTransformer([
+                ('num_pipeline',num_pipeline,numerical_cols),
+                ('cat_pipeline',cat_pipeline,categorical_cols)
+            ])
+
+            return preprocessor
 
         except Exception as e:
-            logging.info()
+            logging.info("Exception occurred in get_data_transformation")
             raise customexception(e,sys)
 
+    def initialize_data_transformation(self,train_path,test_path):
+        try:
+            train_df=pd.read_csv(train_path)
+            test_df=pd.read_csv(test_path)
+
+            logging.info("read train and test data complete")
+            logging.info(f'Train Dataframe Head:\n{train_df.head().to_string()}')
+            logging.info(f'Test Dataframe Head:\n{test_df.head().to_string()}')
+
+            preprocessing_obj = self.get_data_transformation()
+
+            target_column_name = 'price'
+            drop_columns = [target_column_name,'id']
+
+            input_feature_train_df = train_df.drop(columns=drop_columns,axis=1)
+            target_feature_train_df = train_df[target_column_name]
+
+            input_feature_test_df = test_df.drop(columns=drop_columns,axis=1)
+            target_feature_test_df = test_df[target_column_name]
+
+            logging.info("Applying preprocessing object on training and testing datasets.")
+
+            # Fit the preprocessor on the train split only, then reuse it on test
+            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
+            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)
+
+            # Re-attach the target as the last column of each array
+            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
+            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]
+
+            save_object(
+                file_path=self.data_transformation_config.preprocessor_obj_file_path,
+                obj=preprocessing_obj
+            )
+
+            logging.info("preprocessing pickle file saved")
+
+            return (
+                train_arr,
+                test_arr
+            )
+
+        except Exception as e:
+            logging.info("Exception occurred in initialize_data_transformation")
+            raise customexception(e,sys)

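Note: save_object is imported from the project's utils module and is not shown in this diff; a pickle-based sketch consistent with how it is called here, assuming the common pattern rather than the committed code:

import os
import pickle

def save_object(file_path, obj):
    # Create the artifacts directory if needed, then serialize the object.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "wb") as file_obj:
        pickle.dump(obj, file_obj)

The explicit category lists passed to OrdinalEncoder matter: cut, color, and clarity are ordinal attributes, so encoding them in quality order ('Fair' lowest through 'Ideal' highest, and so on) preserves a ranking that one-hot encoding would discard.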
24 changes: 0 additions & 24 deletions src/components/model_evaluation.py

This file was deleted.

60 changes: 51 additions & 9 deletions src/components/model_trainer.py
@@ -12,17 +12,59 @@
 from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
 
 
-@dataclass
-class ModelTrainerConfig:
-    pass
-
 @dataclass
 class ModelTrainerConfig:
+    trained_model_file_path = os.path.join('artifacts','model.pkl')
 
 
 class ModelTrainer:
     def __init__(self):
-        pass
+        self.model_trainer_config = ModelTrainerConfig()
 
-    def initate_model_training(self):
+    def initate_model_training(self,train_array,test_array):
         try:
-            pass
+            logging.info('Splitting Dependent and Independent variables from train and test data')
+            # The target was appended as the last column by the transformation step
+            X_train, y_train, X_test, y_test = (
+                train_array[:,:-1],
+                train_array[:,-1],
+                test_array[:,:-1],
+                test_array[:,-1]
+            )
+
+            models={
+                'LinearRegression':LinearRegression(),
+                'Lasso':Lasso(),
+                'Ridge':Ridge(),
+                'Elasticnet':ElasticNet()
+            }
+
+            model_report:dict=evaluate_model(X_train,y_train,X_test,y_test,models)
+            print(model_report)
+            print('\n====================================================================================\n')
+            logging.info(f'Model Report : {model_report}')
+
+            # Pick the model with the highest R2 score from the report
+            best_model_score = max(model_report.values())
+            best_model_name = list(model_report.keys())[
+                list(model_report.values()).index(best_model_score)
+            ]
+            best_model = models[best_model_name]
+
+            print(f'Best Model Found, Model Name: {best_model_name}, R2 Score: {best_model_score}')
+            print('\n====================================================================================\n')
+            logging.info(f'Best Model Found, Model Name: {best_model_name}, R2 Score: {best_model_score}')
+
+            save_object(
+                file_path=self.model_trainer_config.trained_model_file_path,
+                obj=best_model
+            )
 
         except Exception as e:
-            logging.info()
+            logging.info('Exception occurred at Model Training')
             raise customexception(e,sys)

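Note: evaluate_model also lives in the utils module and is not part of this diff. Given that the trainer takes the maximum score and logs it as an R2 score, a consistent sketch looks like this (an assumption, not the committed code):

from sklearn.metrics import r2_score

def evaluate_model(X_train, y_train, X_test, y_test, models):
    # Fit each candidate on the train split and report its R2 on the test split.
    report = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        report[name] = r2_score(y_test, y_pred)
    return report

Selecting the winner with max() is only valid for higher-is-better metrics such as R2; an error metric like RMSE would need min() instead.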
22 changes: 22 additions & 0 deletions src/pipeline/training_pipeline.py
@@ -0,0 +1,22 @@
+import os
+import sys
+from src.logger.logging import logging
+from src.exception.exception import customexception
+import pandas as pd
+
+from src.components.data_ingestion import DataIngestion
+from src.components.data_transformation import DataTransformation
+from src.components.model_trainer import ModelTrainer
+
+
+obj=DataIngestion()
+train_data_path,test_data_path=obj.initiate_data_ingestion()
+
+data_transformation=DataTransformation()
+train_arr,test_arr=data_transformation.initialize_data_transformation(train_data_path,test_data_path)
+
+model_trainer_obj=ModelTrainer()
+model_trainer_obj.initate_model_training(train_arr,test_arr)

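Note: the pipeline runs at import time, so a single command executes ingestion, transformation, and training end to end. Assuming the repo root is the working directory (so the src package resolves), something like:

python -m src.pipeline.training_pipeline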