diff --git a/phishGNN/dataprep.py b/phishGNN/dataprep.py
new file mode 100644
index 0000000..6c9bbc7
--- /dev/null
+++ b/phishGNN/dataprep.py
@@ -0,0 +1,137 @@
+
+import json
+from typing import Dict, List, Tuple
+
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+from utils.utils import normalize_www_prefix
+
+NAN_VALUE = -1
+
+def read_csv(path: str) -> pd.DataFrame:
+    """Opens the csv dataset as a DataFrame and casts types.
+    """
+    date_parser = lambda c: pd.to_datetime(c, format='%Y-%m-%dT%H:%M:%SZ', errors='coerce')
+    df = pd.read_csv(
+        path,
+        index_col=False,
+        parse_dates=['domain_creation_date'],
+        date_parser=date_parser,
+    )
+
+    # balance the dataset classes to 50/50 benign/phishing
+    nb_phishing = len(df[df['is_phishing'] == 1])
+    benign = df.index[(df['is_phishing'] == 0)][:nb_phishing]
+    other = df.index[~(df['is_phishing'] == 0)]
+    df = pd.concat([df.iloc[benign], df.iloc[other]])
+
+    # cast object dtypes
+    df['url'] = df['url'].astype('string')
+    df['cert_country'] = df['cert_country'].astype('string')
+
+    # remove useless features
+    del df['status_code']
+    del df['depth']
+    del df['domain_creation_date']
+    del df['cert_country']  # TODO: handle cert_country and sort by "dangerous" country
+    return df
+
+
+def normalize_url(url: str):
+    """Strips the trailing / and normalizes the www prefix.
+    """
+    url = url.rstrip('/')
+    url = normalize_www_prefix(url)
+    return url
+
+
+def normalize_features(df: pd.DataFrame):
+    """Pre-processes every feature so that they are normalized
+    and scaled between 0 and 1.
+
+    Args:
+        df: the dataframe to normalize
+
+    Returns:
+        DataFrame: the normalized dataframe
+    """
+    def bool_to_int(col: pd.Series):
+        def wrapper(x):
+            if np.isnan(x) or x not in [True, False]:
+                return NAN_VALUE
+            return 1 if x == True else 0
+        return col.apply(wrapper)
+
+    def min_max_scaling(col: pd.Series):
+        min = col.min()
+        max = col.max()
+        return col.apply(lambda x: (x - min) / (max - min))
+
+    def normalize_refs(refs: List[Dict]):
+        refs = json.loads(refs)
+        for ref in refs:
+            ref['url'] = normalize_url(ref['url'])
+        return refs
+
+    # normalize int & float columns
+    num_column_idxs = [i for i, dt in enumerate(df.dtypes) if dt in [int, float]]
+    df.iloc[:, num_column_idxs] = \
+        df.iloc[:, num_column_idxs].apply(min_max_scaling, axis=0)
+
+    # replace NaN values for non-string columns
+    df.iloc[:, num_column_idxs] = df.iloc[:, num_column_idxs].fillna(NAN_VALUE)
+
+    # normalize bool columns
+    bool_columns = ["is_phishing", "is_https", "is_ip_address", "is_error_page",
+        "has_sub_domain", "has_at_symbol", "is_valid_html", "has_form_with_url",
+        "has_iframe", "use_mouseover", "is_cert_valid", "has_dns_record",
+        "has_whois", "path_starts_with_url"]
+    df[bool_columns] = df[bool_columns].apply(bool_to_int, axis=0)
+
+    # normalize urls
+    df['refs'] = df['refs'].apply(normalize_refs)
+    df['url'] = df['url'].apply(normalize_url)
+    df = df.drop_duplicates(subset='url', keep='first')
+
+    return df
+
+
+def load_every_urls_with_features(df: pd.DataFrame, path: str) -> Tuple[List, List]:
+    """Returns a list of every URL in the dataset (root urls + all refs)
+    along with their features.
+ """ + every_urls, X = [], [] + df_as_dict = df.to_dict("records") + + for row in tqdm(df_as_dict): + every_urls.append(row['url']) + features = [value for key, value in row.items() if key not in ["refs", "is_phishing", "url"]] + X.append(features) + + return every_urls, X + + +def load_train_set(csv_file: str) -> Tuple[pd.DataFrame, List[List], List[int]]: + """Opens the csv file in `csv_file` and returns every + features and label of each root url in the dataset. + + Returns: + df: the opened and pre-processed dataset + X: the list of features (list) of each root url + y: the list of labels (int) of each root url + """ + df = read_csv(csv_file) + df = normalize_features(df) + + root_urls = df[~df['is_phishing'].isin([NAN_VALUE])]['url'] + df = df.set_index('url') + + X, y = [], [] + for i, (_, url) in enumerate(tqdm(root_urls.items(), total=len(root_urls))): + data = df.loc[url] + y.append(data.is_phishing) + X.append(data.drop("refs").drop("is_phishing")) + + return df.reset_index(), X, y diff --git a/phishGNN/dataset.py b/phishGNN/dataset_v1.py similarity index 65% rename from phishGNN/dataset.py rename to phishGNN/dataset_v1.py index 3263b2d..105b230 100644 --- a/phishGNN/dataset.py +++ b/phishGNN/dataset_v1.py @@ -1,16 +1,14 @@ import glob -import json import os -from typing import Dict, List -import numpy as np import pandas as pd import torch import torch_geometric from torch_geometric.data import Data, Dataset from tqdm import tqdm -from utils.utils import log_fail, log_success, normalize_www_prefix +import dataprep +from utils.utils import normalize_www_prefix print(f"Torch version: {torch.__version__}") print(f"Cuda available: {torch.cuda.is_available()}") @@ -26,8 +24,6 @@ def __init__( use_process: bool=True, visulization_mode: bool=False, nan_value: float=-1.0, - max_depth: int=1, - test=False, transform=None, pre_transform=None, ): @@ -38,8 +34,6 @@ def __init__( self.use_process = use_process self.visulization_mode = visulization_mode self.nan_value = nan_value - self.max_depth = max_depth - self.test = test super(PhishingDataset, self).__init__(root, transform, pre_transform) @property @@ -70,8 +64,8 @@ def process(self): # loop over all files in `raw_file_names` for raw_path in self.raw_paths: - df = self._read_csv(raw_path) - df = self._normalize_features(df) + df = dataprep.read_csv(raw_path) + df = dataprep.normalize_features(df) root_urls = df[~df['is_phishing'].isin([self.nan_value])]['url'] df = df.set_index('url') @@ -92,86 +86,6 @@ def len(self): return (len(os.listdir(self.processed_dir)) - 4) // 2 - def _read_csv(self, path: str) -> pd.DataFrame: - """Opens the csv dataset as DataFrame and cast types. 
- """ - date_parser = lambda c: pd.to_datetime(c, format='%Y-%m-%dT%H:%M:%SZ', errors='coerce') - df = pd.read_csv( - path, - index_col=False, - parse_dates=['domain_creation_date'], - date_parser=date_parser, - ) - - # equilibrate dataset classes as 50/50% benign/phishing - nb_phishing = len(df[df['is_phishing'] == 1]) - benign = df.index[(df['is_phishing'] == 0)][:nb_phishing] - other = df.index[~(df['is_phishing'] == 0)] - df = pd.concat([df.iloc[benign], df.iloc[other]]) - - # cast object dtypes - df['url'] = df['url'].astype('string') - df['cert_country'] = df['cert_country'].astype('string') - - # remove useless features - del df['status_code'] - del df['depth'] - del df['domain_creation_date'] - del df['cert_country'] # TODO: handle cert_country and sort by "dangerous" country - return df - - - def _normalize_features(self, df: pd.DataFrame): - """Pre-processes every feature so that they are normalized - adn scaled between 0 and 1 range. - - Args: - df: the dataframe to normalize - - Returns: - DataFrame: the normalized dataframe - """ - def bool_to_int(col: pd.Series): - def wrapper(x): - if np.isnan(x) or x not in [True, False]: - return self.nan_value - return 1 if x == True else 0 - return col.apply(wrapper) - - def min_max_scaling(col: pd.Series): - min = col.min() - max = col.max() - return col.apply(lambda x: (x - min) / (max - min)) - - def normalize_refs(refs: List[Dict]): - refs = json.loads(refs) - for ref in refs: - ref['url'] = self._normalize_url(ref['url']) - return refs - - # normalize int & float columns - num_column_idxs = [i for i, dt in enumerate(df.dtypes) if dt in [int, float]] - df.iloc[:, num_column_idxs] = \ - df.iloc[:, num_column_idxs].apply(min_max_scaling, axis=0) - - # replace NaN values for non-string columns - df.iloc[:, num_column_idxs] = df.iloc[:, num_column_idxs].fillna(self.nan_value) - - # normalize bool columns - bool_columns = ["is_phishing", "is_https", "is_ip_address", "is_error_page", - "has_sub_domain", "has_at_symbol", "is_valid_html", "has_form_with_url", - "has_iframe", "use_mouseover", "is_cert_valid", "has_dns_record", - "has_whois", "path_starts_with_url"] - df[bool_columns] = df[bool_columns].apply(bool_to_int, axis=0) - - # normalize urls - df['refs'] = df['refs'].apply(normalize_refs) - df['url'] = df['url'].apply(self._normalize_url) - df = df.drop_duplicates(subset='url', keep='first') - - return df - - def _build_tensors(self, root_url: str, df: pd.DataFrame): """Builds the required tensors for one graph. Theses matrices will be then used for training the GNN. diff --git a/phishGNN/dataset_v2.py b/phishGNN/dataset_v2.py new file mode 100644 index 0000000..65b12a5 --- /dev/null +++ b/phishGNN/dataset_v2.py @@ -0,0 +1,194 @@ +import glob +import os + +import pandas as pd +import torch +import torch_geometric +from sklearn.model_selection import train_test_split +from torch_geometric.data import Data, Dataset +from tqdm import tqdm + +import dataprep +from other_models import train_random_forest + +print(f"Torch version: {torch.__version__}") +print(f"Cuda available: {torch.cuda.is_available()}") +print(f"Torch geometric version: {torch_geometric.__version__}") + + +class PhishingDataset2(Dataset): + """Dataset containing both phishing and non-phishing + website urls. + """ + def __init__( + self, + root: str, + use_process: bool=True, + visulization_mode: bool=False, + nan_value: float=-1.0, + transform=None, + pre_transform=None, + ): + """ + root = Where the dataset should be stored. 
+        This folder is split into raw_dir (downloaded dataset) and processed_dir (processed data).
+        """
+        self.use_process = use_process
+        self.visulization_mode = visulization_mode
+        self.nan_value = nan_value
+        super(PhishingDataset2, self).__init__(root, transform, pre_transform)
+
+    @property
+    def raw_file_names(self):
+        """File names of the csv dataset.
+        """
+        return glob.glob(os.path.join(self.raw_dir, "*"))
+
+    @property
+    def processed_file_names(self):
+        return [file + ".pt" for file in self.raw_file_names]
+
+    @property
+    def num_classes(self):
+        return 2
+
+    def file_name(self, idx: int):
+        if self.visulization_mode:
+            return f'data_viz_{idx}.pt'
+        return f'data_{idx}.pt'
+
+    def process(self):
+        """Reads csv files in data/raw and preprocesses them so that
+        the output files are written in the data/processed folder.
+        """
+        if not self.use_process:
+            return
+
+        # loop over all files in `raw_file_names`
+        for raw_path in self.raw_paths:
+            df, X, y = dataprep.load_train_set(raw_path)
+            df_eval, X_eval, y_eval = dataprep.load_train_set("data/test/raw/evaloutput.csv")
+
+            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+            X_test = [*X_test, *X_eval]
+            y_test = [*y_test, *y_eval]
+
+            forest = train_random_forest(X_train, X_test, y_train, y_test)
+
+            every_urls, every_features = dataprep.load_every_urls_with_features(df, raw_path)
+            every_preds = forest.predict(every_features)
+
+            root_urls = df[~df['is_phishing'].isin([self.nan_value])]['url']
+
+            df.drop(df.iloc[:, 2:-1], inplace=True, axis=1)
+            df["url"] = every_urls
+            df["is_phishing_pred"] = every_preds
+
+            df = df.set_index('url')
+            df_to_dict = df.to_dict("index")
+
+            # loop over each root url in the dataset
+            for i, (_, url) in enumerate(tqdm(root_urls.items(), total=len(root_urls))):
+                edge_index, x, _, y, viz_utils = self._build_tensors(url, df_to_dict, df.index)
+
+                self.data = Data(x=x, edge_index=edge_index, y=y)
+                torch.save(self.data, os.path.join(self.processed_dir, f'data_{i}.pt'))
+
+                # save another file with variables needed for visualization
+                self.data.pos = viz_utils
+                torch.save(self.data, os.path.join(self.processed_dir, f'data_viz_{i}.pt'))
+
+
+    def len(self):
+        return (len(os.listdir(self.processed_dir)) - 4) // 2
+
+
+    def _build_tensors(self, root_url: str, df_to_dict, existing_urls):
+        """Builds the required tensors for one graph.
+        These matrices will then be used for training the GNN.
+
+        Args:
+            df_to_dict: the dataset of one graph, as a dict keyed by url
+
+        Returns:
+            Tuple[edge_index, x, edge_attr, y, viz_utils]
+        """
+        from_, to_, edges_ = [], [], []
+        id_to_feat = {}
+        url_to_id = {}
+        queue = [root_url]
+        visited = set()
+        error_pages = set()
+
+        def map_url_to_id(url: str):
+            url_to_id[url] = len(url_to_id) \
+                if url not in url_to_id else url_to_id[url]
+
+        def bool_to_float(value: bool):
+            return 1. if value else 0.
+
+
+        while True:
+            if len(queue) == 0:
+                break
+            url = queue.pop()
+            try:
+                node = df_to_dict[url]
+            except KeyError:
+                node = self.error_page_node_feature
+
+            refs = node['refs']
+            map_url_to_id(url)
+
+            for i, edge in enumerate(refs):
+                ref = edge['url']
+                is_same_domain = bool_to_float(edge['is_same_domain'])
+                is_form = bool_to_float(edge['is_form'])
+                is_anchor = bool_to_float(edge['is_anchor'])
+
+                if (url, ref, i) in visited:
+                    break
+                if ref not in existing_urls:
+                    error_pages.add(ref)
+                map_url_to_id(ref)
+
+                from_.append(url_to_id[url])
+                to_.append(url_to_id[ref])
+                edges_.append([1])  # should be edge features
+
+                is_anchor = ref == url
+                if not is_anchor:
+                    queue.append(ref)
+                visited.add((url, ref, i))
+
+            # remove url and refs
+            features = [node['is_phishing']]
+            id_to_feat[url_to_id[url]] = features
+
+        x = [id_to_feat[k] for k in sorted(id_to_feat)]
+        visualization = {
+            "url_to_id": url_to_id,
+            "error_pages": error_pages,
+        }
+
+        return (
+            torch.tensor([from_, to_]).type(torch.LongTensor),
+            torch.tensor(x),
+            torch.tensor(edges_),
+            torch.tensor(df_to_dict[root_url]['is_phishing']),
+            visualization,
+        )
+
+
+    def get(self, idx):
+        return torch.load(os.path.join(self.processed_dir, self.file_name(idx)))
+
+
+    @property
+    def error_page_node_feature(self):
+        data = {
+            'is_phishing': self.nan_value,
+            'is_phishing_pred': self.nan_value,
+            'refs': [],
+        }
+        return pd.Series(data=data)
diff --git a/phishGNN/other_models.py b/phishGNN/other_models.py
new file mode 100644
index 0000000..3932a6c
--- /dev/null
+++ b/phishGNN/other_models.py
@@ -0,0 +1,63 @@
+import torch
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.svm import SVC
+from sklearn.model_selection import train_test_split
+from sklearn import metrics
+
+import dataprep
+from models import FeedforwardNeuralNetModel
+
+
+def train_random_forest(X_train, X_test, y_train, y_test):
+    clf = RandomForestClassifier(n_estimators=100)
+    clf.fit(X_train, y_train)
+
+    y_pred = clf.predict(X_test)
+    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+    return clf
+
+
+def train_logistic_regression(X_train, X_test, y_train, y_test):
+    reg = LogisticRegression()
+    reg.fit(X_train, y_train)
+
+    y_pred = reg.predict(X_test)
+    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+    return reg
+
+
+def train_svm(X_train, X_test, y_train, y_test):
+    svm = SVC(kernel='rbf', random_state=0)
+    svm.fit(X_train, y_train)
+
+    y_pred = svm.predict(X_test)
+    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+    return svm
+
+
+def train_ffn(X_train, X_test, y_train, y_test):
+    model = FeedforwardNeuralNetModel(
+        input_dim=len(X_train[0]),
+        hidden_dim=128,
+        output_dim=2,
+    )
+    lr = 0.01
+    weight_decay = 4e-5
+    epochs = 50
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
+    loss_fn = torch.nn.CrossEntropyLoss()
+
+    train_accs, test_accs = [], []
+    for epoch in range(epochs):
+        loss = model.fit(X_train, y_train, optimizer, loss_fn)
+        train_acc = model.test(X_train, y_train)
+        test_acc = model.test(X_test, y_test)
+        train_accs.append(train_acc)
+        test_accs.append(test_acc)
+        print(f'Epoch: {(epoch+1):03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')
+
+
+if __name__ == '__main__':
+    df, X, y = dataprep.load_train_set("data/train/raw/both.csv")
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
diff --git a/phishGNN/training.py b/phishGNN/training.py
index 04205bd..c2923bb 100644
--- a/phishGNN/training.py
+++ b/phishGNN/training.py
@@ -8,11 +8,12 @@
 import torch
 import torch_geometric.nn as nn
 
-from dataset import PhishingDataset
+from dataset_v1 import PhishingDataset
+from dataset_v2 import PhishingDataset2
 from torch_geometric.loader import DataLoader
 from visualization import visualize, plot_embeddings
 
-from models import GCN_2, GCN_3, GIN, GAT, MLP, GraphSAGE, ClusterGCN, MemPool
+from models import GCN_2, GCN_3, GIN, GAT, GraphSAGE, ClusterGCN, MemPool
 from utils.utils import mean_std_error
 
 
@@ -94,7 +95,7 @@ def train(
     use_process: bool=False,
 ):
     path = os.path.join(os.getcwd(), "data", "train")
-    dataset = PhishingDataset(root=path, use_process=use_process)
+    dataset = PhishingDataset2(root=path, use_process=use_process)
     dataset = dataset.shuffle()
 
     train_test = 0.9
@@ -102,7 +103,7 @@ def train(
     test_dataset1 = dataset[int(len(dataset) * train_test):]
 
     test_path = os.path.join(os.getcwd(), "data", "test")
-    test_dataset2 = PhishingDataset(root=test_path, use_process=False)
+    test_dataset2 = PhishingDataset2(root=test_path, use_process=use_process)
     test_dataset = torch.utils.data.ConcatDataset([test_dataset1, test_dataset2])
 
     train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
@@ -201,15 +202,16 @@ def train(
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--training', action="store_true", help='if set, a training will be run from data/train/raw')
-    parser.add_argument('--test', action="store_true", help='if set, a test will be run from data/test/raw')
-    parser.add_argument('--plot-embeddings', action="store_true",
-        help='whether to save the embeddings in a png file during training or not')
-    args, _ = parser.parse_known_args()
-
-    if args.test is not None:
-        accuracy = test_model("20_epochs_default/ClusterGCN_global_max_pool_32.pkl")
-        print(accuracy)
-    else:
-        train(args.plot_embeddings)
+    # parser = argparse.ArgumentParser()
+    # parser.add_argument('--training', action="store_true", help='if set, a training will be run from data/train/raw')
+    # parser.add_argument('--test', action="store_true", help='if set, a test will be run from data/test/raw')
+    # parser.add_argument('--plot-embeddings', action="store_true",
+    #     help='whether to save the embeddings in a png file during training or not')
+    # args, _ = parser.parse_known_args()
+
+    # if args.test is not None:
+    #     accuracy = test_model("20_epochs_default/ClusterGCN_global_max_pool_32.pkl")
+    #     print(accuracy)
+    # else:
+    #     train(args.plot_embeddings)
+    train(use_process=True)
diff --git a/phishGNN/visualization.py b/phishGNN/visualization.py
index 3420758..94a77ee 100644
--- a/phishGNN/visualization.py
+++ b/phishGNN/visualization.py
@@ -11,7 +11,7 @@
 from torch_geometric.data import Data
 from tqdm import tqdm
 
-from dataset import PhishingDataset
+from dataset_v1 import PhishingDataset
 from utils.utils import extract_domain_name, tensor_to_tuple_list
 
 ROOT_COLOR = '#0096FF'