Skip to content

Commit

Permalink
Fixed commit error
Browse files Browse the repository at this point in the history
  • Loading branch information
Tryd3x committed Aug 31, 2024
1 parent 56f0631 commit 75eb3dc
Show file tree
Hide file tree
Showing 7 changed files with 29 additions and 21 deletions.
1 change: 0 additions & 1 deletion .gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
datasets/*.csv filter=lfs diff=lfs merge=lfs -text
artifacts/*.csv filter=lfs diff=lfs merge=lfs -text
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/env
/logs
src/__pycache__
src/__pycache__
artifacts/
3 changes: 0 additions & 3 deletions artifacts/raw.csv

This file was deleted.

3 changes: 0 additions & 3 deletions artifacts/test.csv

This file was deleted.

3 changes: 0 additions & 3 deletions artifacts/train.csv

This file was deleted.

3 changes: 3 additions & 0 deletions datasets/income_cleandata.csv
Git LFS file not shown
34 changes: 24 additions & 10 deletions src/components/data_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,28 +8,39 @@

from sklearn.model_selection import train_test_split

from dataclasses import dataclass

"""The configuration creates a new folder with the data sorted into raw, train and test files"""
@dataclass
class DataIngestionConfig:
    """Locations of the raw/train/test CSV artifacts for one dataset.

    The artifacts of each dataset live in their own sub-folder of
    ``<project_root>/artifacts``, named after the dataset file without its
    extension (``datasets/income_cleandata.csv`` -> ``artifacts/income_cleandata``).

    Attributes set by ``__post_init__``:
        project_root: value returned by ``fetch_project_root`` for this module.
        folder_path: name of the per-dataset sub-folder inside ``artifacts``.
        train_data_file: path of the train split CSV.
        test_data_file: path of the test split CSV.
        raw_data_file: path of the raw copy of the dataset CSV.
    """

    # Path of the source dataset; only its file name is used to name the folder.
    path_to_file: str

    def __post_init__(self):
        """Derive the artifact paths from ``path_to_file``."""
        self.project_root = fetch_project_root(os.path.abspath(__file__))

        # splitext strips only the final extension, so "people.v2.csv" maps to
        # "people.v2" (not "people") and dot-files do not yield an empty name.
        dataset_file = os.path.basename(self.path_to_file)
        self.folder_path = os.path.splitext(dataset_file)[0]
        print(f"folder path: {self.folder_path}")

        # Build the shared "<root>/artifacts/<dataset>" prefix once.
        artifacts_dir = os.path.join(self.project_root, 'artifacts', self.folder_path)
        self.train_data_file = os.path.join(artifacts_dir, "train.csv")
        self.test_data_file = os.path.join(artifacts_dir, "test.csv")
        self.raw_data_file = os.path.join(artifacts_dir, "raw.csv")


class DataIngestion:
def __init__(self,):
self.ingestion_config = DataIngestionConfig()
def __init__(self,path_to_data):
self.path_to_data = path_to_data
self.ingestion_config = DataIngestionConfig(path_to_file=self.path_to_data)

def initiateDataIngestion(self):
print("Initiated Data Ingestion...")

print(self.ingestion_config)
logging.info("Initializing data ingestion")
try:
# Can possibly make this code dynamic to handle multiple sources of data
data = pd.read_csv("../../datasets/people-2000000.csv")
data = pd.read_csv(self.path_to_data)
logging.info("Data loaded successfully")

os.makedirs(os.path.join(self.ingestion_config.project_root, self.ingestion_config.folder_path),exist_ok=True)
os.makedirs(os.path.join(self.ingestion_config.project_root, 'artifacts', self.ingestion_config.folder_path),exist_ok=True)
logging.info("Artifacts folder created")

data.to_csv(self.ingestion_config.raw_data_file, index=False)
Expand All @@ -43,6 +54,8 @@ def initiateDataIngestion(self):
logging.info("Train and test data saved successfully")

logging.info("Data ingestion completed")

print("Data Ingestion completed successfully")
return (
self.ingestion_config.train_data_file,
self.ingestion_config.test_data_file,
Expand All @@ -54,5 +67,6 @@ def initiateDataIngestion(self):


if __name__ == "__main__":
    # Anchor the dataset path to this file rather than the current working
    # directory, so the script runs the same way from any location.
    # This module sits two levels below the project root (src/components).
    module_dir = os.path.dirname(os.path.abspath(__file__))
    dataset_path = os.path.normpath(
        os.path.join(module_dir, "..", "..", "datasets", "income_cleandata.csv")
    )

    obj = DataIngestion(path_to_data=dataset_path)
    obj.initiateDataIngestion()

0 comments on commit 75eb3dc

Please sign in to comment.