Commit b46cfa0

Initial commit with formatting, type hinting, type casting

helpsystems-mushkevych committed Jul 20, 2022
1 parent 6929333 commit b46cfa0

Showing 25 changed files with 388 additions and 339 deletions.
25 changes: 25 additions & 0 deletions INSTALLATION.md
@@ -0,0 +1,25 @@
# create new miniconda environment
```shell
conda create --name torchgraphs python=3.10 --channel conda-forge
conda activate torchgraphs
```

# install libs
```shell
conda install torchsparse pytorch_geometric matplotlib pyvis bs4 --channel conda-forge
pip install torch-scatter torch-cluster torch-spline-conv igraph
```

# unzip training/test data
```shell
./install_dataset.sh
```

# run training
```shell
python phishGNN/training.py
```
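
# verify installation
A quick, optional sanity check that the core packages import and report their versions (assumes the installs above succeeded):
```shell
python -c "import torch, torch_geometric; print(torch.__version__, torch_geometric.__version__)"
```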




### CRAWLER
# setup MongoDB
Empty file added phishGNN/__init__.py
49 changes: 25 additions & 24 deletions phishGNN/cross_validation.py
@@ -1,23 +1,24 @@
 import time
 
 import torch
 import torch.nn.functional as F
 from sklearn.model_selection import StratifiedKFold
-from torch import tensor
-from torch.optim import Adam
+from torch import tensor, nn
+from torch.optim import Adam, Optimizer
+from torch.nn.modules.loss import _Loss
 from torch_geometric.loader import DataLoader
 
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+from utils.compute_device import COMPUTE_DEVICE
+
+# set default dtype, as MPS Pytorch does not support float64
+torch.set_default_dtype(torch.float32)
 
 
 def cross_validation_with_val_set(dataset, model, loss_fn, folds, epochs, batch_size,
                                   lr, lr_decay_factor, lr_decay_step_size,
-                                  weight_decay, logger=None):
+                                  weight_decay, logger=None) -> tuple[float, float, float]:
 
     val_losses, accs, durations = [], [], []
-    for fold, (train_idx, test_idx,
-               val_idx) in enumerate(zip(*k_fold(dataset, folds))):
+    for fold, (train_idx, test_idx, val_idx) in enumerate(zip(*k_fold(dataset, folds))):
 
         train_dataset = dataset[train_idx]
         test_dataset = dataset[test_idx]
@@ -27,7 +28,7 @@ def cross_validation_with_val_set(dataset, model, loss_fn, folds, epochs, batch_
         val_loader = DataLoader(val_dataset, batch_size, shuffle=False)
         test_loader = DataLoader(test_dataset, batch_size, shuffle=False)
 
-        model.to(device).reset_parameters()
+        model.to(COMPUTE_DEVICE).reset_parameters()
         optimizer = Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
 
         if torch.cuda.is_available():
@@ -37,9 +38,9 @@ def cross_validation_with_val_set(dataset, model, loss_fn, folds, epochs, batch_
 
         for epoch in range(1, epochs + 1):
             if hasattr(model, 'fit'):
-                loss = model.fit(train_loader, optimizer, loss_fn, device)
+                loss = model.fit(train_loader, optimizer, loss_fn, COMPUTE_DEVICE)
             else:
-                loss = fit(model, train_loader, optimizer, loss_fn, device)
+                loss = fit(model, train_loader, optimizer, loss_fn, COMPUTE_DEVICE)
 
             val_losses.append(eval_loss(model, val_loader, loss_fn))
             accs.append(eval_acc(model, test_loader))
@@ -106,19 +107,19 @@ def num_graphs(data):
 
 
 def fit(
-    model,
-    train_loader,
-    optimizer,
-    loss_fn,
-    device,
-):
+    model: nn.Module,
+    train_loader: DataLoader,
+    optimizer: Optimizer,
+    loss_fn: _Loss,
+    device: torch.device,
+) -> float:
     model.train()
 
     total_loss = 0
     for data in train_loader:
-        data = data.to(device)
+        data = data.to(COMPUTE_DEVICE)
         out = model(data.x, data.edge_index, data.batch)
-        loss = loss_fn(out, data.y.long())
+        loss = loss_fn(out, data.y)
         loss.backward()
         optimizer.step()
         optimizer.zero_grad()
@@ -142,26 +143,26 @@ def fit(
     # return total_loss / len(loader.dataset)
 
 
-def eval_acc(model, loader):
+def eval_acc(model: nn.Module, loader: DataLoader) -> float:
     model.eval()
 
     correct = 0
     for data in loader:
-        data = data.to(device)
+        data = data.to(COMPUTE_DEVICE)
         with torch.no_grad():
             pred = model(data.x, data.edge_index, data.batch).max(1)[1]
         correct += pred.eq(data.y.view(-1)).sum().item()
     return correct / len(loader.dataset)
 
 
-def eval_loss(model, loader, loss_fn):
+def eval_loss(model: nn.Module, loader: DataLoader, loss_fn: _Loss) -> float:
     model.eval()
 
     loss = 0
     for data in loader:
-        data = data.to(device)
+        data = data.to(COMPUTE_DEVICE)
         with torch.no_grad():
             out = model(data.x, data.edge_index, data.batch)
-        loss += loss_fn(out, data.y.long())
+        loss += loss_fn(out, data.y)
         # loss += F.nll_loss(out, data.y.view(-1), reduction='sum').item()
     return loss / len(loader.dataset)
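
Note on `utils.compute_device`: the module is imported above but is not part of this diff. A minimal sketch of what it presumably provides, inferred from the MPS comment in the commit (an assumption, not the committed code):

```python
# Hypothetical sketch of utils/compute_device.py; the real module is not shown in this commit.
import torch


def _pick_device() -> torch.device:
    """Prefer CUDA, then Apple MPS (assumes torch >= 1.12), then fall back to CPU."""
    if torch.cuda.is_available():
        return torch.device('cuda')
    if torch.backends.mps.is_available():
        return torch.device('mps')
    return torch.device('cpu')


# Module-level constant shared by the training, evaluation and dataset code.
COMPUTE_DEVICE: torch.device = _pick_device()
```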
16 changes: 9 additions & 7 deletions phishGNN/dataprep.py
@@ -1,4 +1,3 @@
-
 import json
 from typing import Dict, List, Tuple
 
@@ -10,6 +9,7 @@
 
 NAN_VALUE = -1
 
+
 def read_csv(path: str) -> pd.DataFrame:
     """Opens the csv dataset as DataFrame and cast types.
     """
@@ -35,7 +35,7 @@ def read_csv(path: str) -> pd.DataFrame:
     del df['status_code']
     del df['depth']
     del df['domain_creation_date']
-    del df['cert_country'] # TODO: handle cert_country and sort by "dangerous" country
+    del df['cert_country']  # TODO: handle cert_country and sort by "dangerous" country
     return df
@@ -57,13 +57,15 @@ def normalize_features(df: pd.DataFrame):
     Returns:
         DataFrame: the normalized dataframe
     """
 
     def bool_to_int(col: pd.Series):
         def wrapper(x):
             if np.isnan(x) or x not in [True, False]:
                 return NAN_VALUE
             return 1 if x == True else 0
+
         return col.apply(wrapper)
+
     def min_max_scaling(col: pd.Series):
         min = col.min()
         max = col.max()
@@ -85,9 +87,9 @@ def normalize_refs(refs: List[Dict]):
 
     # normalize bool columns
     bool_columns = ["is_phishing", "is_https", "is_ip_address", "is_error_page",
-        "has_sub_domain", "has_at_symbol", "is_valid_html", "has_form_with_url",
-        "has_iframe", "use_mouseover", "is_cert_valid", "has_dns_record",
-        "has_whois", "path_starts_with_url"]
+                    "has_sub_domain", "has_at_symbol", "is_valid_html", "has_form_with_url",
+                    "has_iframe", "use_mouseover", "is_cert_valid", "has_dns_record",
+                    "has_whois", "path_starts_with_url"]
     df[bool_columns] = df[bool_columns].apply(bool_to_int, axis=0)
 
     # normalize urls
@@ -104,7 +106,7 @@ def load_every_urls_with_features(df: pd.DataFrame, path: str) -> Tuple[List, List]:
     """
     every_urls, X = [], []
     df_as_dict = df.to_dict("records")
-
+
     for row in tqdm(df_as_dict):
         every_urls.append(row['url'])
         features = [value for key, value in row.items() if key not in ["refs", "is_phishing", "url"]]
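
For context, the three helpers in this file form a small pipeline. A hedged usage sketch (the csv path is a placeholder, not taken from this commit):

```python
# Hypothetical usage of the dataprep helpers; the path below is a placeholder.
import dataprep

df = dataprep.read_csv('data/raw/dataset.csv')    # load the crawled csv and cast column types
df = dataprep.normalize_features(df)              # bools to {0, 1}, numeric columns min-max scaled, NaN -> -1
urls, X = dataprep.load_every_urls_with_features(df, 'data/raw/dataset.csv')
```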
73 changes: 38 additions & 35 deletions phishGNN/dataset_v1.py
@@ -3,63 +3,67 @@
 
 import pandas as pd
 import torch
+from torch import Tensor
 import torch_geometric
 from torch_geometric.data import Data, Dataset
 from tqdm import tqdm
 
 import dataprep
+from utils.compute_device import COMPUTE_DEVICE
 from utils.utils import normalize_www_prefix
 
 print(f"Torch version: {torch.__version__}")
-print(f"Cuda available: {torch.cuda.is_available()}")
+print(f"Compute device: {COMPUTE_DEVICE}")
 print(f"Torch geometric version: {torch_geometric.__version__}")
 
+# set default dtype, as MPS Pytorch does not support float64
+torch.set_default_dtype(torch.float32)
+
 
 class PhishingDataset(Dataset):
-    """Dataset containing both phishing and non-phishing
-    website urls.
-    """
+    """Dataset containing both phishing and non-phishing website urls. """
 
     def __init__(
-        self,
-        root: str,
-        use_process: bool=True,
-        visulization_mode: bool=False,
-        nan_value: float=-1.0,
-        transform=None,
-        pre_transform=None,
+            self,
+            root: str,
+            do_data_preparation: bool = True,
+            visulization_mode: bool = False,
+            nan_value: float = -1.0,
+            transform=None,
+            pre_transform=None,
     ):
         """
         root = Where the dataset should be stored. This folder is split
        into raw_dir (downloaded dataset) and processed_dir (processed data).
        """
-        self.use_process = use_process
+        self.do_data_preparation = do_data_preparation
         self.visulization_mode = visulization_mode
         self.nan_value = nan_value
         super(PhishingDataset, self).__init__(root, transform, pre_transform)
 
     @property
-    def raw_file_names(self):
-        """File name of the csv dataset.
-        """
+    def raw_file_names(self) -> list[str]:
+        """File name of the csv dataset. """
         return glob.glob(os.path.join(self.raw_dir, "*"))
 
     @property
-    def processed_file_names(self):
+    def processed_file_names(self) -> list[str]:
         return [file + ".pt" for file in self.raw_file_names]
 
     @property
     def num_classes(self):
         return 2
 
-    def file_name(self, idx: int):
+    def file_name(self, idx: int) -> str:
         if self.visulization_mode:
             return f'data_viz_{idx}.pt'
         return f'data_{idx}.pt'
 
-    def process(self):
+    def process(self) -> None:
         """Reads csv files in data/raw and preprocess so that output
         preprocessed files are written in data/processed folder.
         """
-        if not self.use_process:
+        if not self.do_data_preparation:
             return
 
         # loop over all files in `raw_file_names`
@@ -82,14 +86,12 @@ def process(self):
             self.data.pos = viz_utils
             torch.save(self.data, os.path.join(self.processed_dir, f'data_viz_{i}.pt'))
 
-
     def len(self):
         return (len(os.listdir(self.processed_dir)) - 4) // 2
 
-
-    def _build_tensors(self, root_url: str, df_to_dict, existing_urls):
+    def _build_tensors(self, root_url: str, df_to_dict, existing_urls) -> tuple[Tensor, Tensor, Tensor, Tensor, dict]:
         """Builds the required tensors for one graph.
-        Theses matrices will be then used for training the GNN.
+        These matrices will be then used for training the GNN.
 
         Args:
             df: the dataset of one graph as form of pandas dataframe
@@ -130,37 +132,39 @@ def map_url_to_id(url: str):
 
                 from_.append(url_to_id[url])
                 to_.append(url_to_id[ref])
-                edges_.append([1]) # should be edge features
+                edges_.append([1])  # should be edge features
 
                 is_anchor = ref == url
                 if not is_anchor:
                     queue.append(ref)
                 visited.add((url, ref, i))
 
             # remove url and refs
             features = [v for k, v in sorted(node.items()) \
-                if k not in ['refs', 'is_phishing']]
+                        if k not in ['refs', 'is_phishing']]
             id_to_feat[url_to_id[url]] = features
 
         x = [id_to_feat[k] for k in sorted(id_to_feat)]
         visualization = {
             "url_to_id": url_to_id,
             "error_pages": error_pages,
         }
 
         return (
-            torch.tensor([from_, to_]).type(torch.LongTensor),
-            torch.tensor(x),
-            torch.tensor(edges_),
-            torch.tensor(df_to_dict[root_url]['is_phishing']),
+            torch.tensor([from_, to_], dtype=torch.int64),
+            torch.tensor(x, dtype=torch.float32),
+            torch.tensor(edges_, dtype=torch.int64),
+            torch.tensor(df_to_dict[root_url]['is_phishing'], dtype=torch.int64),
             visualization,
         )
 
-
     def get(self, idx):
-        return torch.load(os.path.join(self.processed_dir, self.file_name(idx)))
+        t = torch.load(os.path.join(self.processed_dir, self.file_name(idx)))
+        t.x = t.x.to(dtype=torch.float32)
+        t.y = t.y.to(dtype=torch.int64)
+        t.edge_index = t.edge_index.to(dtype=torch.int64)
+        return t
 
     @property
     def error_page_node_feature(self):
         data = {
@@ -195,7 +199,6 @@ def error_page_node_feature(self):
         }
         return pd.Series(data=data)
 
-
     def _normalize_url(self, url: str):
         url = url.rstrip('/')
         url = normalize_www_prefix(url)
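
A hedged sketch of how the reworked dataset is consumed downstream (the root path and batch size are illustrative, not from this commit):

```python
# Hypothetical usage of PhishingDataset; root='data' and batch_size=32 are illustrative.
from torch_geometric.loader import DataLoader

from dataset_v1 import PhishingDataset

dataset = PhishingDataset(root='data', do_data_preparation=True)  # runs process() if needed
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for batch in loader:
    # After the get() casts above: x is float32, edge_index and y are int64.
    print(batch.x.shape, batch.edge_index.shape, batch.y.shape)
    break
```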