# data_process_utils.py
import os
from data_processing import data_process
class DataUtils:
    """Resolve the data/log folders for one ticker and run the data processing.

    Typical usage is to construct the object, call ``generate_data_folders()``
    to resolve the paths and create the output folders, and then call
    ``process_data()``. All paths are ``None`` until ``generate_data_folders()``
    has been called.

    Only the ``"nasdaq"`` dataset has raw/processed path layouts defined here.
    """

    def __init__(self, ticker, dataset, experiment_id, horizons, normalization_window):
        """Store the processing parameters; no path is resolved at this point."""
        self.ticker = ticker  # Ticker of the stock to be processed.
        self.dataset = dataset  # Dataset to be used.
        self.experiment_id = experiment_id  # Experiment ID.
        self.horizons = horizons  # Horizons to be used when computing labels.
        self.normalization_window = normalization_window  # Normalization window to be used when normalizing data.

        self.__raw_data_path = None  # Path containing the raw LOB data.
        self.__processed_data_path_unscaled_data = None  # Path containing the processed but unscaled LOB data.
        self.__processed_data_path_scaled_data = None  # Path containing the processed and scaled LOB data.
        self.__logs_path = None  # Path containing the logs of the data processing.

    def __set_raw_data_path(self) -> None:
        """Set the raw data path according to the dataset (left unchanged if the dataset is unknown)."""
        if self.dataset == "nasdaq":
            self.__raw_data_path = f"./data/{self.dataset}/raw/{self.ticker}"

    def __set_processed_data_path_unscaled_data(self) -> None:
        """Set the path of the processed but unscaled LOB data according to the dataset."""
        if self.dataset == "nasdaq":
            self.__processed_data_path_unscaled_data = (
                f"./data/{self.dataset}/unscaled_data/{self.ticker}"
            )

    def __set_processed_data_path_scaled_data(self) -> None:
        """Set the path of the processed and scaled LOB data according to the dataset."""
        if self.dataset == "nasdaq":
            self.__processed_data_path_scaled_data = (
                f"./data/{self.dataset}/scaled_data/{self.ticker}"
            )

    def __set_logs_path(self) -> None:
        """Set the path of the data processing logs according to the experiment ID."""
        self.__logs_path = (
            f"./loggers/results/{self.experiment_id}/data_processing_logs"
        )

    def generate_data_folders(self) -> None:
        """Resolve every path and create the output folders that do not exist yet.

        The unscaled-data, scaled-data and logs folders are created (including
        missing parents). The raw data folder is only resolved, never created.

        Raises:
            ValueError: If no processed-data path is defined for ``self.dataset``.
        """
        self.__set_raw_data_path()
        self.__set_processed_data_path_unscaled_data()
        self.__set_processed_data_path_scaled_data()
        self.__set_logs_path()

        output_folders = (
            self.__processed_data_path_unscaled_data,
            self.__processed_data_path_scaled_data,
            self.__logs_path,
        )

        # The dataset-specific setters silently skip unknown datasets; fail here
        # with an explicit message (and before touching the filesystem) instead
        # of letting a None path reach the os functions.
        if any(folder is None for folder in output_folders):
            raise ValueError(
                f"Unsupported dataset {self.dataset!r}: no data paths are defined for it."
            )

        for folder in output_folders:
            # exist_ok=True makes the call idempotent and avoids the race between
            # a separate existence check and the creation.
            os.makedirs(folder, exist_ok=True)

    # Process the data to obtain scaled and unscaled data.
    def process_data(self) -> None:
        """Run ``data_process.process_data`` twice: unscaled output first, then scaled.

        Both runs share the same ticker, raw input path, logs path, horizons and
        normalization window; only the output path and the ``scaling`` flag
        differ. ``generate_data_folders()`` must have been called beforehand,
        otherwise the paths passed along are still ``None``.
        """
        runs = (
            (self.__processed_data_path_unscaled_data, False),
            (self.__processed_data_path_scaled_data, True),
        )
        for output_path, scaling in runs:
            data_process.process_data(
                ticker=self.ticker,
                input_path=self.__raw_data_path,
                output_path=output_path,
                logs_path=self.__logs_path,
                horizons=self.horizons,
                normalization_window=self.normalization_window,
                scaling=scaling,
            )