Commit ca1ac40

pipeline updated

sunnysavita10 committed Jan 10, 2024
1 parent c0d4a32 commit ca1ac40
Showing 11 changed files with 387,378 additions and 76 deletions.
Binary file added artifacts/model.pkl (not shown)
Binary file added artifacts/preprocessor.pkl (not shown)
193,574 changes: 193,574 additions & 0 deletions artifacts/raw.csv (large diff not rendered)

48,395 changes: 48,395 additions & 0 deletions artifacts/test.csv (large diff not rendered)

145,180 changes: 145,180 additions & 0 deletions artifacts/train.csv (large diff not rendered)

49 changes: 15 additions & 34 deletions setup.py
@@ -1,43 +1,24 @@
-from setuptools import setup, find_packages
+from setuptools import find_packages, setup
 from typing import List
 
-HYPEN_E_DOT='-e .'
-def get_requirements(file_path:str)->List[str]:
-    requirements=[]
-    with open(file_path) as file_obj:
-        requirements=file_obj.readlines()
-        requirements=[req.replace("\n","") for req in requirements]
-    if HYPEN_E_DOT in requirements:
-        requirements.remove(HYPEN_E_DOT)
-    return requirements
-
-'''def get_requiremet(file_path:str)->List[str]:
-    requirements = []
-    with open(file_path) as f:
-        requirements=f.readlines()
-        requirements=[req.replace("\n","") for req in requirements]
-    if HYPEN_E_DOT in requirements:
-        requirements.remove(HYPEN_E_DOT)
-    return requirements'''
-
-with open('README.md', 'r', encoding='utf-8') as f:
-    long_description = f.read()
-
-__version__ = "0.0.4"
-REPO_NAME = "mongodb_connector"
-PKG_NAME = "MongoDB-Connect"
-AUTHOR_USER_NAME = "sunnysavita10"
-AUTHOR_EMAIL = "sunny.savita@ineuron.ai"
+"""HYPEN_E_DOT='-e .'
+def get_requirements(file_path:str)->List[str]:
+    requirements=[]
+    with open(file_path) as file_obj:
+        requirements=file_obj.readlines()
+        requirements=[req.replace("\n","") for req in requirements]
+    if HYPEN_E_DOT in requirements:
+        requirements.remove(HYPEN_E_DOT)
+    return requirements"""
 
 setup(
-    name=PKG_NAME,
-    version=__version__,
-    author=AUTHOR_USER_NAME,
-    author_email=AUTHOR_EMAIL,
-    description="A python package for connecting with database.",
-    long_description=long_description,
-    long_description_content="text/markdown",
-    url=f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}",
-    project_urls={
-        "Bug Tracker": f"https://github.com/{AUTHOR_USER_NAME}/{REPO_NAME}/issues",
-    },
-    package_dir={"": "src"},
-    packages=find_packages(where="src"),
+    name='DimondPricePrediction',
+    version='0.0.1',
+    author='sunny savita',
+    author_email='sunny.savita@ineuron.ai',
+    install_requires=["scikit-learn","pandas","numpy"],
+    packages=find_packages()
 )
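Note: the commit pins install_requires by hand and leaves the get_requirements helper commented out. If the helper were re-enabled, the conventional wiring reads the dependency list from requirements.txt and strips the editable-install flag; a minimal sketch under that assumption (not part of this commit):

from setuptools import find_packages, setup
from typing import List

HYPEN_E_DOT = '-e .'

def get_requirements(file_path: str) -> List[str]:
    # Read requirements.txt, strip newlines, and drop the '-e .' entry,
    # which pip understands but install_requires does not.
    with open(file_path) as file_obj:
        requirements = [req.replace("\n", "") for req in file_obj.readlines()]
    if HYPEN_E_DOT in requirements:
        requirements.remove(HYPEN_E_DOT)
    return requirements

setup(
    name='DimondPricePrediction',
    version='0.0.1',
    author='sunny savita',
    author_email='sunny.savita@ineuron.ai',
    install_requires=get_requirements('requirements.txt'),
    packages=find_packages()
)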
43 changes: 39 additions & 4 deletions src/components/data_ingestion.py
@@ -6,21 +6,56 @@
 import os
 import sys
-from sklearn.model_selection import train_test_splits
+from sklearn.model_selection import train_test_split
 from dataclasses import dataclass
 from pathlib import Path
 
 @dataclass
 class DataIngestionConfig:
-    pass
+    raw_data_path:str=os.path.join("artifacts","raw.csv")
+    train_data_path:str=os.path.join("artifacts","train.csv")
+    test_data_path:str=os.path.join("artifacts","test.csv")
 
 class DataIngestion:
     def __init__(self):
-        pass
+        self.ingestion_config=DataIngestionConfig()
 
     def initiate_data_ingestion(self):
         logging.info("data ingestion started")
         try:
-            pass
+            # Pull the source dataset and keep a raw copy under artifacts/
+            data=pd.read_csv("https://raw.githubusercontent.com/sunnysavita10/fsdsmendtoend/main/notebooks/data/gemstone.csv")
+            logging.info("read the dataset into a dataframe")
+
+            os.makedirs(os.path.dirname(self.ingestion_config.raw_data_path),exist_ok=True)
+            data.to_csv(self.ingestion_config.raw_data_path,index=False)
+            logging.info("saved the raw dataset in the artifacts folder")
+
+            # Hold out 25% of the rows as the test split
+            train_data,test_data=train_test_split(data,test_size=0.25)
+            logging.info("train test split completed")
+
+            train_data.to_csv(self.ingestion_config.train_data_path,index=False)
+            test_data.to_csv(self.ingestion_config.test_data_path,index=False)
+
+            logging.info("data ingestion part completed")
+
+            return (
+                self.ingestion_config.train_data_path,
+                self.ingestion_config.test_data_path
+            )
 
         except Exception as e:
-            logging.info()
+            logging.info("exception occurred during data ingestion")
             raise customexception(e,sys)
 
 
 if __name__=="__main__":
     obj=DataIngestion()
     obj.initiate_data_ingestion()
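Note: every component here raises customexception from src.exception.exception, a module this commit does not touch. A typical implementation in this project layout wraps the original error with file and line information; a minimal sketch, assuming the usual pattern rather than the repo's actual code:

import sys

class customexception(Exception):
    # error_detail is the sys module itself, matching the call
    # customexception(e, sys) used throughout the components.
    def __init__(self, error_message, error_detail):
        super().__init__(error_message)
        _, _, exc_tb = error_detail.exc_info()
        self.error_message = (
            f"Error in {exc_tb.tb_frame.f_code.co_filename} "
            f"at line {exc_tb.tb_lineno}: {error_message}"
        )

    def __str__(self):
        return self.error_message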
107 changes: 102 additions & 5 deletions src/components/data_transformation.py
@@ -17,15 +17,112 @@
 
 @dataclass
 class DataTransformationConfig:
-    pass
+    preprocessor_obj_file_path=os.path.join('artifacts','preprocessor.pkl')
 
 
 class DataTransformation:
     def __init__(self):
-        pass
+        self.data_transformation_config=DataTransformationConfig()
 
-    def initiate_data_ingestion(self):
+    def get_data_transformation(self):
         try:
-            pass
+            logging.info('Data Transformation initiated')
+
+            # Define which columns should be ordinal-encoded and which should be scaled
+            categorical_cols = ['cut', 'color', 'clarity']
+            numerical_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']
+
+            # Define the custom ranking for each ordinal variable
+            cut_categories = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
+            color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
+            clarity_categories = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
+
+            logging.info('Pipeline Initiated')
+
+            # Numerical pipeline: impute with the median, then standardize
+            num_pipeline=Pipeline(
+                steps=[
+                    ('imputer',SimpleImputer(strategy='median')),
+                    ('scaler',StandardScaler())
+                ]
+            )
+
+            # Categorical pipeline: impute with the mode, ordinal-encode, then standardize
+            cat_pipeline=Pipeline(
+                steps=[
+                    ('imputer',SimpleImputer(strategy='most_frequent')),
+                    ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
+                    ('scaler',StandardScaler())
+                ]
+            )
+
+            preprocessor=ColumnTransformer([
+                ('num_pipeline',num_pipeline,numerical_cols),
+                ('cat_pipeline',cat_pipeline,categorical_cols)
+            ])
+
+            return preprocessor
 
         except Exception as e:
-            logging.info()
+            logging.info("Exception occurred in get_data_transformation")
             raise customexception(e,sys)
 
+    def initialize_data_transformation(self,train_path,test_path):
+        try:
+            train_df=pd.read_csv(train_path)
+            test_df=pd.read_csv(test_path)
+
+            logging.info("read train and test data complete")
+            logging.info(f'Train Dataframe Head:\n{train_df.head().to_string()}')
+            logging.info(f'Test Dataframe Head:\n{test_df.head().to_string()}')
+
+            preprocessing_obj = self.get_data_transformation()
+
+            target_column_name = 'price'
+            drop_columns = [target_column_name,'id']
+
+            input_feature_train_df = train_df.drop(columns=drop_columns,axis=1)
+            target_feature_train_df = train_df[target_column_name]
+
+            input_feature_test_df = test_df.drop(columns=drop_columns,axis=1)
+            target_feature_test_df = test_df[target_column_name]
+
+            logging.info("Applying preprocessing object on training and testing datasets.")
+
+            # Fit the preprocessor on the train split only, then reuse it on test
+            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
+            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)
+
+            # Re-attach the target as the last column of each array
+            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
+            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]
+
+            save_object(
+                file_path=self.data_transformation_config.preprocessor_obj_file_path,
+                obj=preprocessing_obj
+            )
+
+            logging.info("preprocessing pickle file saved")
+
+            return (
+                train_arr,
+                test_arr
+            )
+
+        except Exception as e:
+            logging.info("Exception occurred in initialize_data_transformation")
+            raise customexception(e,sys)

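Note: save_object is imported from the project's utils module and is not shown in this diff; a pickle-based sketch consistent with how it is called here, assuming the common pattern rather than the committed code:

import os
import pickle

def save_object(file_path, obj):
    # Create the artifacts directory if needed, then serialize the object.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "wb") as file_obj:
        pickle.dump(obj, file_obj)

The explicit category lists passed to OrdinalEncoder matter: cut, color, and clarity are ordinal attributes, so encoding them in quality order ('Fair' lowest through 'Ideal' highest, and so on) preserves a ranking that one-hot encoding would discard.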
24 changes: 0 additions & 24 deletions src/components/model_evaluation.py

This file was deleted.

60 changes: 51 additions & 9 deletions src/components/model_trainer.py
@@ -12,17 +12,59 @@
 from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
 
 
-@dataclass
-class ModelTrainerConfig:
-    pass
-
 @dataclass
 class ModelTrainerConfig:
+    trained_model_file_path = os.path.join('artifacts','model.pkl')
 
 
 class ModelTrainer:
     def __init__(self):
-        pass
+        self.model_trainer_config = ModelTrainerConfig()
 
-    def initate_model_training(self):
+    def initate_model_training(self,train_array,test_array):
         try:
-            pass
+            logging.info('Splitting Dependent and Independent variables from train and test data')
+            # The target was appended as the last column by the transformation step
+            X_train, y_train, X_test, y_test = (
+                train_array[:,:-1],
+                train_array[:,-1],
+                test_array[:,:-1],
+                test_array[:,-1]
+            )
+
+            models={
+                'LinearRegression':LinearRegression(),
+                'Lasso':Lasso(),
+                'Ridge':Ridge(),
+                'Elasticnet':ElasticNet()
+            }
+
+            model_report:dict=evaluate_model(X_train,y_train,X_test,y_test,models)
+            print(model_report)
+            print('\n====================================================================================\n')
+            logging.info(f'Model Report : {model_report}')
+
+            # Pick the model with the highest R2 score from the report
+            best_model_score = max(model_report.values())
+            best_model_name = list(model_report.keys())[
+                list(model_report.values()).index(best_model_score)
+            ]
+            best_model = models[best_model_name]
+
+            print(f'Best Model Found, Model Name: {best_model_name}, R2 Score: {best_model_score}')
+            print('\n====================================================================================\n')
+            logging.info(f'Best Model Found, Model Name: {best_model_name}, R2 Score: {best_model_score}')
+
+            save_object(
+                file_path=self.model_trainer_config.trained_model_file_path,
+                obj=best_model
+            )
 
         except Exception as e:
-            logging.info()
+            logging.info('Exception occurred at Model Training')
             raise customexception(e,sys)

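Note: evaluate_model also lives in the utils module and is not part of this diff. Given that the trainer takes the maximum score and logs it as an R2 score, a consistent sketch looks like this (an assumption, not the committed code):

from sklearn.metrics import r2_score

def evaluate_model(X_train, y_train, X_test, y_test, models):
    # Fit each candidate on the train split and report its R2 on the test split.
    report = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        report[name] = r2_score(y_test, y_pred)
    return report

Selecting the winner with max() is only valid for higher-is-better metrics such as R2; an error metric like RMSE would need min() instead.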
22 changes: 22 additions & 0 deletions src/pipeline/training_pipeline.py
@@ -0,0 +1,22 @@
+import os
+import sys
+from src.logger.logging import logging
+from src.exception.exception import customexception
+import pandas as pd
+
+from src.components.data_ingestion import DataIngestion
+from src.components.data_transformation import DataTransformation
+from src.components.model_trainer import ModelTrainer
+
+
+obj=DataIngestion()
+train_data_path,test_data_path=obj.initiate_data_ingestion()
+
+data_transformation=DataTransformation()
+train_arr,test_arr=data_transformation.initialize_data_transformation(train_data_path,test_data_path)
+
+model_trainer_obj=ModelTrainer()
+model_trainer_obj.initate_model_training(train_arr,test_arr)

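Note: the pipeline runs at import time, so a single command executes ingestion, transformation, and training end to end. Assuming the repo root is the working directory (so the src package resolves), something like:

python -m src.pipeline.training_pipeline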