diff --git a/phishGNN/dataprep.py b/phishGNN/dataprep.py
new file mode 100644
index 0000000..6c9bbc7
--- /dev/null
+++ b/phishGNN/dataprep.py
@@ -0,0 +1,137 @@
+
+import json
+from typing import Dict, List, Tuple
+
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+from utils.utils import normalize_www_prefix
+
+NAN_VALUE = -1
+
+def read_csv(path: str) -> pd.DataFrame:
+    """Opens the csv dataset as a DataFrame and casts types.
+    """
+    date_parser = lambda c: pd.to_datetime(c, format='%Y-%m-%dT%H:%M:%SZ', errors='coerce')
+    df = pd.read_csv(
+        path,
+        index_col=False,
+        parse_dates=['domain_creation_date'],
+        date_parser=date_parser,
+    )
+
+    # balance the dataset classes to 50/50 benign/phishing
+    nb_phishing = len(df[df['is_phishing'] == 1])
+    benign = df.index[(df['is_phishing'] == 0)][:nb_phishing]
+    other = df.index[~(df['is_phishing'] == 0)]
+    df = pd.concat([df.iloc[benign], df.iloc[other]])
+
+    # cast object dtypes
+    df['url'] = df['url'].astype('string')
+    df['cert_country'] = df['cert_country'].astype('string')
+
+    # remove useless features
+    del df['status_code']
+    del df['depth']
+    del df['domain_creation_date']
+    del df['cert_country']  # TODO: handle cert_country and sort by "dangerous" country
+    return df
+
+
+def normalize_url(url: str):
+    """Strips the trailing / and normalizes the www prefix.
+    """
+    url = url.rstrip('/')
+    url = normalize_www_prefix(url)
+    return url
+
+
+def normalize_features(df: pd.DataFrame):
+    """Pre-processes every feature so that they are normalized
+    and scaled between 0 and 1.
+
+    Args:
+        df: the dataframe to normalize
+
+    Returns:
+        DataFrame: the normalized dataframe
+    """
+    def bool_to_int(col: pd.Series):
+        def wrapper(x):
+            if np.isnan(x) or x not in [True, False]:
+                return NAN_VALUE
+            return 1 if x == True else 0
+        return col.apply(wrapper)
+
+    def min_max_scaling(col: pd.Series):
+        min = col.min()
+        max = col.max()
+        return col.apply(lambda x: (x - min) / (max - min))
+
+    def normalize_refs(refs: List[Dict]):
+        refs = json.loads(refs)
+        for ref in refs:
+            ref['url'] = normalize_url(ref['url'])
+        return refs
+
+    # normalize int & float columns
+    num_column_idxs = [i for i, dt in enumerate(df.dtypes) if dt in [int, float]]
+    df.iloc[:, num_column_idxs] = \
+        df.iloc[:, num_column_idxs].apply(min_max_scaling, axis=0)
+
+    # replace NaN values for non-string columns
+    df.iloc[:, num_column_idxs] = df.iloc[:, num_column_idxs].fillna(NAN_VALUE)
+
+    # normalize bool columns
+    bool_columns = ["is_phishing", "is_https", "is_ip_address", "is_error_page",
+        "has_sub_domain", "has_at_symbol", "is_valid_html", "has_form_with_url",
+        "has_iframe", "use_mouseover", "is_cert_valid", "has_dns_record",
+        "has_whois", "path_starts_with_url"]
+    df[bool_columns] = df[bool_columns].apply(bool_to_int, axis=0)
+
+    # normalize urls
+    df['refs'] = df['refs'].apply(normalize_refs)
+    df['url'] = df['url'].apply(normalize_url)
+    df = df.drop_duplicates(subset='url', keep='first')
+
+    return df
+
+
+def load_every_urls_with_features(df: pd.DataFrame, path: str) -> Tuple[List, List]:
+    """Returns a list of every URL in the dataset (root urls + all refs)
+    along with their features.
+ """ + every_urls, X = [], [] + df_as_dict = df.to_dict("records") + + for row in tqdm(df_as_dict): + every_urls.append(row['url']) + features = [value for key, value in row.items() if key not in ["refs", "is_phishing", "url"]] + X.append(features) + + return every_urls, X + + +def load_train_set(csv_file: str) -> Tuple[pd.DataFrame, List[List], List[int]]: + """Opens the csv file in `csv_file` and returns every + features and label of each root url in the dataset. + + Returns: + df: the opened and pre-processed dataset + X: the list of features (list) of each root url + y: the list of labels (int) of each root url + """ + df = read_csv(csv_file) + df = normalize_features(df) + + root_urls = df[~df['is_phishing'].isin([NAN_VALUE])]['url'] + df = df.set_index('url') + + X, y = [], [] + for i, (_, url) in enumerate(tqdm(root_urls.items(), total=len(root_urls))): + data = df.loc[url] + y.append(data.is_phishing) + X.append(data.drop("refs").drop("is_phishing")) + + return df.reset_index(), X, y diff --git a/phishGNN/dataset.py b/phishGNN/dataset_v1.py similarity index 65% rename from phishGNN/dataset.py rename to phishGNN/dataset_v1.py index 3263b2d..105b230 100644 --- a/phishGNN/dataset.py +++ b/phishGNN/dataset_v1.py @@ -1,16 +1,14 @@ import glob -import json import os -from typing import Dict, List -import numpy as np import pandas as pd import torch import torch_geometric from torch_geometric.data import Data, Dataset from tqdm import tqdm -from utils.utils import log_fail, log_success, normalize_www_prefix +import dataprep +from utils.utils import normalize_www_prefix print(f"Torch version: {torch.__version__}") print(f"Cuda available: {torch.cuda.is_available()}") @@ -26,8 +24,6 @@ def __init__( use_process: bool=True, visulization_mode: bool=False, nan_value: float=-1.0, - max_depth: int=1, - test=False, transform=None, pre_transform=None, ): @@ -38,8 +34,6 @@ def __init__( self.use_process = use_process self.visulization_mode = visulization_mode self.nan_value = nan_value - self.max_depth = max_depth - self.test = test super(PhishingDataset, self).__init__(root, transform, pre_transform) @property @@ -70,8 +64,8 @@ def process(self): # loop over all files in `raw_file_names` for raw_path in self.raw_paths: - df = self._read_csv(raw_path) - df = self._normalize_features(df) + df = dataprep.read_csv(raw_path) + df = dataprep.normalize_features(df) root_urls = df[~df['is_phishing'].isin([self.nan_value])]['url'] df = df.set_index('url') @@ -92,86 +86,6 @@ def len(self): return (len(os.listdir(self.processed_dir)) - 4) // 2 - def _read_csv(self, path: str) -> pd.DataFrame: - """Opens the csv dataset as DataFrame and cast types. 
- """ - date_parser = lambda c: pd.to_datetime(c, format='%Y-%m-%dT%H:%M:%SZ', errors='coerce') - df = pd.read_csv( - path, - index_col=False, - parse_dates=['domain_creation_date'], - date_parser=date_parser, - ) - - # equilibrate dataset classes as 50/50% benign/phishing - nb_phishing = len(df[df['is_phishing'] == 1]) - benign = df.index[(df['is_phishing'] == 0)][:nb_phishing] - other = df.index[~(df['is_phishing'] == 0)] - df = pd.concat([df.iloc[benign], df.iloc[other]]) - - # cast object dtypes - df['url'] = df['url'].astype('string') - df['cert_country'] = df['cert_country'].astype('string') - - # remove useless features - del df['status_code'] - del df['depth'] - del df['domain_creation_date'] - del df['cert_country'] # TODO: handle cert_country and sort by "dangerous" country - return df - - - def _normalize_features(self, df: pd.DataFrame): - """Pre-processes every feature so that they are normalized - adn scaled between 0 and 1 range. - - Args: - df: the dataframe to normalize - - Returns: - DataFrame: the normalized dataframe - """ - def bool_to_int(col: pd.Series): - def wrapper(x): - if np.isnan(x) or x not in [True, False]: - return self.nan_value - return 1 if x == True else 0 - return col.apply(wrapper) - - def min_max_scaling(col: pd.Series): - min = col.min() - max = col.max() - return col.apply(lambda x: (x - min) / (max - min)) - - def normalize_refs(refs: List[Dict]): - refs = json.loads(refs) - for ref in refs: - ref['url'] = self._normalize_url(ref['url']) - return refs - - # normalize int & float columns - num_column_idxs = [i for i, dt in enumerate(df.dtypes) if dt in [int, float]] - df.iloc[:, num_column_idxs] = \ - df.iloc[:, num_column_idxs].apply(min_max_scaling, axis=0) - - # replace NaN values for non-string columns - df.iloc[:, num_column_idxs] = df.iloc[:, num_column_idxs].fillna(self.nan_value) - - # normalize bool columns - bool_columns = ["is_phishing", "is_https", "is_ip_address", "is_error_page", - "has_sub_domain", "has_at_symbol", "is_valid_html", "has_form_with_url", - "has_iframe", "use_mouseover", "is_cert_valid", "has_dns_record", - "has_whois", "path_starts_with_url"] - df[bool_columns] = df[bool_columns].apply(bool_to_int, axis=0) - - # normalize urls - df['refs'] = df['refs'].apply(normalize_refs) - df['url'] = df['url'].apply(self._normalize_url) - df = df.drop_duplicates(subset='url', keep='first') - - return df - - def _build_tensors(self, root_url: str, df: pd.DataFrame): """Builds the required tensors for one graph. Theses matrices will be then used for training the GNN. diff --git a/phishGNN/dataset_v2.py b/phishGNN/dataset_v2.py new file mode 100644 index 0000000..65b12a5 --- /dev/null +++ b/phishGNN/dataset_v2.py @@ -0,0 +1,194 @@ +import glob +import os + +import pandas as pd +import torch +import torch_geometric +from sklearn.model_selection import train_test_split +from torch_geometric.data import Data, Dataset +from tqdm import tqdm + +import dataprep +from other_models import train_random_forest + +print(f"Torch version: {torch.__version__}") +print(f"Cuda available: {torch.cuda.is_available()}") +print(f"Torch geometric version: {torch_geometric.__version__}") + + +class PhishingDataset2(Dataset): + """Dataset containing both phishing and non-phishing + website urls. + """ + def __init__( + self, + root: str, + use_process: bool=True, + visulization_mode: bool=False, + nan_value: float=-1.0, + transform=None, + pre_transform=None, + ): + """ + root = Where the dataset should be stored. 
+        This folder is split into raw_dir (downloaded dataset) and processed_dir (processed data).
+        """
+        self.use_process = use_process
+        self.visulization_mode = visulization_mode
+        self.nan_value = nan_value
+        super(PhishingDataset2, self).__init__(root, transform, pre_transform)
+
+    @property
+    def raw_file_names(self):
+        """File names of the csv dataset.
+        """
+        return glob.glob(os.path.join(self.raw_dir, "*"))
+
+    @property
+    def processed_file_names(self):
+        return [file + ".pt" for file in self.raw_file_names]
+
+    @property
+    def num_classes(self):
+        return 2
+
+    def file_name(self, idx: int):
+        if self.visulization_mode:
+            return f'data_viz_{idx}.pt'
+        return f'data_{idx}.pt'
+
+    def process(self):
+        """Reads csv files in data/raw and preprocesses them so that
+        the output files are written in the data/processed folder.
+        """
+        if not self.use_process:
+            return
+
+        # loop over all files in `raw_file_names`
+        for raw_path in self.raw_paths:
+            df, X, y = dataprep.load_train_set(raw_path)
+            df_eval, X_eval, y_eval = dataprep.load_train_set("data/test/raw/evaloutput.csv")
+
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+            X_test = [*X_test, *X_eval]
+            y_test = [*y_test, *y_eval]
+
+            forest = train_random_forest(X_train, X_test, y_train, y_test)
+
+            every_urls, every_features = dataprep.load_every_urls_with_features(df, raw_path)
+            every_preds = forest.predict(every_features)
+
+            root_urls = df[~df['is_phishing'].isin([self.nan_value])]['url']
+
+            df.drop(df.iloc[:, 2:-1], inplace=True, axis=1)
+            df["url"] = every_urls
+            df["is_phishing_pred"] = every_preds
+
+            df = df.set_index('url')
+            df_to_dict = df.to_dict("index")
+
+            # loop over each root url in the dataset
+            for i, (_, url) in enumerate(tqdm(root_urls.items(), total=len(root_urls))):
+                edge_index, x, _, y, viz_utils = self._build_tensors(url, df_to_dict, df.index)
+
+                self.data = Data(x=x, edge_index=edge_index, y=y)
+                torch.save(self.data, os.path.join(self.processed_dir, f'data_{i}.pt'))
+
+                # save another file with variables needed for visualization
+                self.data.pos = viz_utils
+                torch.save(self.data, os.path.join(self.processed_dir, f'data_viz_{i}.pt'))
+
+
+    def len(self):
+        return (len(os.listdir(self.processed_dir)) - 4) // 2
+
+
+    def _build_tensors(self, root_url: str, df_to_dict, existing_urls):
+        """Builds the required tensors for one graph.
+        These matrices will then be used for training the GNN.
+
+        Args:
+            df_to_dict: the dataset of one graph, as a dict keyed by url
+
+        Returns:
+            Tuple[edge_index, x, edge_attr, y, viz_utils]
+        """
+        from_, to_, edges_ = [], [], []
+        id_to_feat = {}
+        url_to_id = {}
+        queue = [root_url]
+        visited = set()
+        error_pages = set()
+
+        def map_url_to_id(url: str):
+            url_to_id[url] = len(url_to_id) \
+                if url not in url_to_id else url_to_id[url]
+
+        def bool_to_float(value: bool):
+            return 1. if value else 0.
+
+
+        while True:
+            if len(queue) == 0:
+                break
+            url = queue.pop()
+            try:
+                node = df_to_dict[url]
+            except KeyError:
+                node = self.error_page_node_feature
+
+            refs = node['refs']
+            map_url_to_id(url)
+
+            for i, edge in enumerate(refs):
+                ref = edge['url']
+                is_same_domain = bool_to_float(edge['is_same_domain'])
+                is_form = bool_to_float(edge['is_form'])
+                is_anchor = bool_to_float(edge['is_anchor'])
+
+                if (url, ref, i) in visited:
+                    break
+                if ref not in existing_urls:
+                    error_pages.add(ref)
+                map_url_to_id(ref)
+
+                from_.append(url_to_id[url])
+                to_.append(url_to_id[ref])
+                edges_.append([1])  # should be edge features
+
+                is_anchor = ref == url
+                if not is_anchor:
+                    queue.append(ref)
+                visited.add((url, ref, i))
+
+            # remove url and refs
+            features = [node['is_phishing']]
+            id_to_feat[url_to_id[url]] = features
+
+        x = [id_to_feat[k] for k in sorted(id_to_feat)]
+        visualization = {
+            "url_to_id": url_to_id,
+            "error_pages": error_pages,
+        }
+
+        return (
+            torch.tensor([from_, to_]).type(torch.LongTensor),
+            torch.tensor(x),
+            torch.tensor(edges_),
+            torch.tensor(df_to_dict[root_url]['is_phishing']),
+            visualization,
+        )
+
+
+    def get(self, idx):
+        return torch.load(os.path.join(self.processed_dir, self.file_name(idx)))
+
+
+    @property
+    def error_page_node_feature(self):
+        data = {
+            'is_phishing': self.nan_value,
+            'is_phishing_pred': self.nan_value,
+            'refs': [],
+        }
+        return pd.Series(data=data)
diff --git a/phishGNN/other_models.py b/phishGNN/other_models.py
new file mode 100644
index 0000000..3932a6c
--- /dev/null
+++ b/phishGNN/other_models.py
@@ -0,0 +1,63 @@
+import torch
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.model_selection import train_test_split
+from sklearn import metrics
+
+import dataprep
+from models import FeedforwardNeuralNetModel
+
+
+def train_random_forest(X_train, X_test, y_train, y_test):
+    clf = RandomForestClassifier(n_estimators=100)
+    clf.fit(X_train, y_train)
+
+    y_pred = clf.predict(X_test)
+    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+    return clf
+
+
+def train_logistic_regression(X_train, X_test, y_train, y_test):
+    reg = LogisticRegression()
+    reg.fit(X_train, y_train)
+
+    y_pred = reg.predict(X_test)
+    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+    return reg
+
+
+def train_svm(X_train, X_test, y_train, y_test):
+    svm = SVC(kernel='rbf', random_state=0)
+    svm.fit(X_train, y_train)
+
+    y_pred = svm.predict(X_test)
+    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+    return svm
+
+
+def train_ffn(X_train, X_test, y_train, y_test):
+    model = FeedforwardNeuralNetModel(
+        input_dim=len(X_train[0]),
+        hidden_dim=128,
+        output_dim=2,
+    )
+    lr = 0.01
+    weight_decay = 4e-5
+    epochs = 50
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
+    loss_fn = torch.nn.CrossEntropyLoss()
+
+    train_accs, test_accs = [], []
+    for epoch in range(epochs):
+        loss = model.fit(X_train, y_train, optimizer, loss_fn)
+        train_acc = model.test(X_train, y_train)
+        test_acc = model.test(X_test, y_test)
+        train_accs.append(train_acc)
+        test_accs.append(test_acc)
+        print(f'Epoch: {(epoch+1):03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
+
+
+if __name__ == '__main__':
+    df, X, y = dataprep.load_train_set("data/train/raw/both.csv")
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
diff --git a/phishGNN/training.py b/phishGNN/training.py
index 04205bd..c2923bb 100644
--- a/phishGNN/training.py
+++ b/phishGNN/training.py
@@ -8,11 +8,12 @@
 import torch
 import torch_geometric.nn as nn
 
-from dataset import PhishingDataset
+from dataset_v1 import PhishingDataset
+from dataset_v2 import PhishingDataset2
 from torch_geometric.loader import DataLoader
 from visualization import visualize, plot_embeddings
 
-from models import GCN_2, GCN_3, GIN, GAT, MLP, GraphSAGE, ClusterGCN, MemPool
+from models import GCN_2, GCN_3, GIN, GAT, GraphSAGE, ClusterGCN, MemPool
 from utils.utils import mean_std_error
 
 
@@ -94,7 +95,7 @@ def train(
     use_process: bool=False,
 ):
     path = os.path.join(os.getcwd(), "data", "train")
-    dataset = PhishingDataset(root=path, use_process=use_process)
+    dataset = PhishingDataset2(root=path, use_process=use_process)
     dataset = dataset.shuffle()
 
     train_test = 0.9
@@ -102,7 +103,7 @@ def train(
     test_dataset1 = dataset[int(len(dataset) * train_test):]
 
     test_path = os.path.join(os.getcwd(), "data", "test")
-    test_dataset2 = PhishingDataset(root=test_path, use_process=False)
+    test_dataset2 = PhishingDataset2(root=test_path, use_process=use_process)
     test_dataset = torch.utils.data.ConcatDataset([test_dataset1, test_dataset2])
 
     train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
@@ -201,15 +202,16 @@ def train(
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--training', action="store_true", help='if set, a training will be run from data/train/raw')
-    parser.add_argument('--test', action="store_true", help='if set, a test will be run from data/test/raw')
-    parser.add_argument('--plot-embeddings', action="store_true",
-        help='whether to save the embeddings in a png file during training or not')
-    args, _ = parser.parse_known_args()
-
-    if args.test is not None:
-        accuracy = test_model("20_epochs_default/ClusterGCN_global_max_pool_32.pkl")
-        print(accuracy)
-    else:
-        train(args.plot_embeddings)
+    # parser = argparse.ArgumentParser()
+    # parser.add_argument('--training', action="store_true", help='if set, a training will be run from data/train/raw')
+    # parser.add_argument('--test', action="store_true", help='if set, a test will be run from data/test/raw')
+    # parser.add_argument('--plot-embeddings', action="store_true",
+    #     help='whether to save the embeddings in a png file during training or not')
+    # args, _ = parser.parse_known_args()
+
+    # if args.test is not None:
+    #     accuracy = test_model("20_epochs_default/ClusterGCN_global_max_pool_32.pkl")
+    #     print(accuracy)
+    # else:
+    #     train(args.plot_embeddings)
+    train(use_process=True)
diff --git a/phishGNN/visualization.py b/phishGNN/visualization.py
index 3420758..94a77ee 100644
--- a/phishGNN/visualization.py
+++ b/phishGNN/visualization.py
@@ -11,7 +11,7 @@
 from torch_geometric.data import Data
 from tqdm import tqdm
 
-from dataset import PhishingDataset
+from dataset_v1 import PhishingDataset
 from utils.utils import extract_domain_name, tensor_to_tuple_list
 
 ROOT_COLOR = '#0096FF'