Skip to content

Commit

Permalink
Unifying quotes; Updating pytorch version to 1.12
Browse files Browse the repository at this point in the history
  • Loading branch information
helpsystems-mushkevych committed Jul 21, 2022
1 parent b46cfa0 commit dccfd9d
Show file tree
Hide file tree
Showing 15 changed files with 175 additions and 161 deletions.
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,16 @@ cd phishGNN
### Install dependencies

```shell
./install_dataset.sh
python3 -m venv venv
. venv/bin/activate
pip install wheel
pip install -r requirements.txt
pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.11.0+cpu.html # for cpu
pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.12.0+cpu.html # for cpu
```

### Unzip the dataset
```shell
./install_dataset.sh
```

## Training
Expand Down
14 changes: 7 additions & 7 deletions phishGNN/dataprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,10 @@ def normalize_refs(refs: List[Dict]):
df.iloc[:, num_column_idxs] = df.iloc[:, num_column_idxs].fillna(NAN_VALUE)

# normalize bool columns
bool_columns = ["is_phishing", "is_https", "is_ip_address", "is_error_page",
"has_sub_domain", "has_at_symbol", "is_valid_html", "has_form_with_url",
"has_iframe", "use_mouseover", "is_cert_valid", "has_dns_record",
"has_whois", "path_starts_with_url"]
bool_columns = ['is_phishing', 'is_https', 'is_ip_address', 'is_error_page',
'has_sub_domain', 'has_at_symbol', 'is_valid_html', 'has_form_with_url',
'has_iframe', 'use_mouseover', 'is_cert_valid', 'has_dns_record',
'has_whois', 'path_starts_with_url']
df[bool_columns] = df[bool_columns].apply(bool_to_int, axis=0)

# normalize urls
Expand All @@ -105,11 +105,11 @@ def load_every_urls_with_features(df: pd.DataFrame, path: str) -> Tuple[List, Li
along with their features.
"""
every_urls, X = [], []
df_as_dict = df.to_dict("records")
df_as_dict = df.to_dict('records')

for row in tqdm(df_as_dict):
every_urls.append(row['url'])
features = [value for key, value in row.items() if key not in ["refs", "is_phishing", "url"]]
features = [value for key, value in row.items() if key not in ['refs', 'is_phishing', 'url']]
X.append(features)

return every_urls, X
Expand All @@ -134,6 +134,6 @@ def load_train_set(csv_file: str) -> Tuple[pd.DataFrame, List[List], List[int]]:
for i, (_, url) in enumerate(tqdm(root_urls.items(), total=len(root_urls))):
data = df.loc[url]
y.append(data.is_phishing)
X.append(data.drop("refs").drop("is_phishing"))
X.append(data.drop('refs').drop('is_phishing'))

return df.reset_index(), X, y
26 changes: 13 additions & 13 deletions phishGNN/dataset_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
from utils.compute_device import COMPUTE_DEVICE
from utils.utils import normalize_www_prefix

print(f"Torch version: {torch.__version__}")
print(f"Compute device: {COMPUTE_DEVICE}")
print(f"Torch geometric version: {torch_geometric.__version__}")
print(f'Torch version: {torch.__version__}')
print(f'Compute device: {COMPUTE_DEVICE}')
print(f'Torch geometric version: {torch_geometric.__version__}')

# set default dtype, as MPS Pytorch does not support float64
torch.set_default_dtype(torch.float32)
Expand All @@ -27,7 +27,7 @@ def __init__(
self,
root: str,
do_data_preparation: bool = True,
visulization_mode: bool = False,
visualization_mode: bool = False,
nan_value: float = -1.0,
transform=None,
pre_transform=None,
Expand All @@ -37,25 +37,25 @@ def __init__(
into raw_dir (downloaded dataset) and processed_dir (processed data).
"""
self.do_data_preparation = do_data_preparation
self.visulization_mode = visulization_mode
self.visualization_mode = visualization_mode
self.nan_value = nan_value
super(PhishingDataset, self).__init__(root, transform, pre_transform)

@property
def raw_file_names(self) -> list[str]:
"""File name of the csv dataset. """
return glob.glob(os.path.join(self.raw_dir, "*"))
return glob.glob(os.path.join(self.raw_dir, '*'))

@property
def processed_file_names(self) -> list[str]:
return [file + ".pt" for file in self.raw_file_names]
return [file + '.pt' for file in self.raw_file_names]

@property
def num_classes(self):
return 2

def file_name(self, idx: int) -> str:
if self.visulization_mode:
if self.visualization_mode:
return f'data_viz_{idx}.pt'
return f'data_{idx}.pt'

Expand All @@ -73,7 +73,7 @@ def process(self) -> None:

root_urls = df[~df['is_phishing'].isin([self.nan_value])]['url']
df = df.set_index('url')
df_to_dict = df.to_dict("index")
df_to_dict = df.to_dict('index')

# loop over each root urls in the dataset
for i, (_, url) in enumerate(tqdm(root_urls.items(), total=len(root_urls))):
Expand Down Expand Up @@ -140,14 +140,14 @@ def map_url_to_id(url: str):
visited.add((url, ref, i))

# remove url and refs
features = [v for k, v in sorted(node.items()) \
features = [v for k, v in sorted(node.items())
if k not in ['refs', 'is_phishing']]
id_to_feat[url_to_id[url]] = features

x = [id_to_feat[k] for k in sorted(id_to_feat)]
visualization = {
"url_to_id": url_to_id,
"error_pages": error_pages,
'url_to_id': url_to_id,
'error_pages': error_pages,
}

return (
Expand All @@ -162,7 +162,7 @@ def get(self, idx):
t = torch.load(os.path.join(self.processed_dir, self.file_name(idx)))
t.x = t.x.to(dtype=torch.float32)
t.y = t.y.to(dtype=torch.int64)
t.edge = t.to(dtype=torch.int64)
t.edge_index = t.edge_index.to(dtype=torch.int64)
return t

@property
Expand Down
30 changes: 15 additions & 15 deletions phishGNN/dataset_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
from other_models import train_random_forest
from utils.compute_device import COMPUTE_DEVICE

print(f"Torch version: {torch.__version__}")
print(f"Compute device: {COMPUTE_DEVICE}")
print(f"Torch geometric version: {torch_geometric.__version__}")
print(f'Torch version: {torch.__version__}')
print(f'Compute device: {COMPUTE_DEVICE}')
print(f'Torch geometric version: {torch_geometric.__version__}')

# set default dtype, as MPS Pytorch does not support float64
torch.set_default_dtype(torch.float32)
Expand All @@ -28,7 +28,7 @@ def __init__(
self,
root: str,
do_data_preparation: bool = True,
visulization_mode: bool = False,
visualization_mode: bool = False,
nan_value: float = -1.0,
transform=None,
pre_transform=None,
Expand All @@ -38,25 +38,25 @@ def __init__(
into raw_dir (downloaded dataset) and processed_dir (processed data).
"""
self.do_data_preparation = do_data_preparation
self.visulization_mode = visulization_mode
self.visualization_mode = visualization_mode
self.nan_value = nan_value
super(PhishingDataset2, self).__init__(root, transform, pre_transform)

@property
def raw_file_names(self) -> list[str]:
"""File name of the csv dataset. """
return glob.glob(os.path.join(self.raw_dir, "*"))
return glob.glob(os.path.join(self.raw_dir, '*'))

@property
def processed_file_names(self) -> list[str]:
return [file + ".pt" for file in self.raw_file_names]
return [file + '.pt' for file in self.raw_file_names]

@property
def num_classes(self):
return 2

def file_name(self, idx: int) -> str:
if self.visulization_mode:
if self.visualization_mode:
return f'data_viz_{idx}.pt'
return f'data_{idx}.pt'

Expand All @@ -70,7 +70,7 @@ def process(self) -> None:
# loop over all files in `raw_file_names`
for raw_path in self.raw_paths:
df, X, y = dataprep.load_train_set(raw_path)
df_eval, X_eval, y_eval = dataprep.load_train_set("data/test/raw/evaloutput.csv")
df_eval, X_eval, y_eval = dataprep.load_train_set('data/test/raw/evaloutput.csv')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_test = [*X_test, *X_eval]
Expand All @@ -84,11 +84,11 @@ def process(self) -> None:
root_urls = df[~df['is_phishing'].isin([self.nan_value])]['url']

df.drop(df.iloc[:, 2:-1], inplace=True, axis=1)
df["url"]: every_urls
df["is_phishing_pred"] = every_preds
df['url']: every_urls
df['is_phishing_pred'] = every_preds

df = df.set_index('url')
df_to_dict = df.to_dict("index")
df_to_dict = df.to_dict('index')

# loop over each root urls in the dataset
for i, (_, url) in enumerate(tqdm(root_urls.items(), total=len(root_urls))):
Expand Down Expand Up @@ -167,8 +167,8 @@ def bool_to_float(value: bool):

x = [id_to_feat[k] for k in sorted(id_to_feat)]
visualization = {
"url_to_id": url_to_id,
"error_pages": error_pages,
'url_to_id': url_to_id,
'error_pages': error_pages,
}

return (
Expand All @@ -183,7 +183,7 @@ def get(self, idx):
t = torch.load(os.path.join(self.processed_dir, self.file_name(idx)))
t.x = t.x.to(dtype=torch.float32)
t.y = t.y.to(dtype=torch.int64)
t.edge = t.to(dtype=torch.int64)
t.edge_index = t.edge_index.to(dtype=torch.int64)
return t

@property
Expand Down
6 changes: 3 additions & 3 deletions phishGNN/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@


def train_test_loader(do_data_preparation: bool):
path = os.path.join(os.getcwd(), "data", "train")
path = os.path.join(os.getcwd(), 'data', 'train')
dataset = PhishingDataset2(root=path, do_data_preparation=do_data_preparation)
dataset = dataset.shuffle()

train_test = 0.7
train_dataset = dataset[:int(len(dataset) * train_test)]
test_dataset1 = dataset[int(len(dataset) * train_test):]

test_path = os.path.join(os.getcwd(), "data", "test")
test_path = os.path.join(os.getcwd(), 'data', 'test')
test_dataset2 = PhishingDataset2(root=test_path, do_data_preparation=do_data_preparation)
test_dataset = torch.utils.data.ConcatDataset([test_dataset1, test_dataset2])

Expand All @@ -27,7 +27,7 @@ def train_test_loader(do_data_preparation: bool):


def get_full_dataset(do_data_preparation: bool) -> Dataset:
path = os.path.join(os.getcwd(), "data", "train")
path = os.path.join(os.getcwd(), 'data', 'train')
dataset = PhishingDataset2(root=path, do_data_preparation=do_data_preparation)
dataset = dataset.shuffle()

Expand Down
2 changes: 0 additions & 2 deletions phishGNN/models/mem_pool.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from typing import Tuple, Any

import torch
import torch.nn.functional as F
from torch import Tensor, device
Expand Down
41 changes: 22 additions & 19 deletions phishGNN/other_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,15 @@


# from models.ffn import FeedforwardNeuralNetModel
from .models import FeedforwardNeuralNetModel


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``; swallows every call unconditionally.

    Assigned over ``warnings.warn`` below to silence noisy library warnings.
    Accepts and ignores any positional or keyword arguments.
    """
    return None


import warnings

warnings.warn = warn


Expand All @@ -29,7 +33,7 @@ def train_random_forest(X_train, X_test, y_train, y_test):

y_pred = clf.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
print('Accuracy:', acc)
return clf, acc


Expand All @@ -39,17 +43,17 @@ def train_logistic_regression(X_train, X_test, y_train, y_test):

y_pred = reg.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
print('Accuracy:', acc)
return reg, acc


def train_svm(X_train, X_test, y_train, y_test):
svm = SVC(kernel = 'rbf', random_state = 0)
svm = SVC(kernel='rbf', random_state=0)
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
print('Accuracy:', acc)
return svm, acc


Expand All @@ -75,26 +79,26 @@ def train_ffn(X_train, X_test, y_train, y_test, epochs=50):
return model, test_accs


def do_experiments(n: int=10):
df, X, y = dataprep.load_train_set("data/train/raw/both.csv")
def do_experiments(n: int = 10):
df, X, y = dataprep.load_train_set('data/train/raw/both.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

names = [
"Nearest Neighbors",
"Linear SVM",
"RBF SVM",
"Decision Tree",
"Random Forest",
"Neural Net",
"AdaBoost",
"Naive Bayes",
"QDA",
"LogisticRegression",
'Nearest Neighbors',
'Linear SVM',
'RBF SVM',
'Decision Tree',
'Random Forest',
'Neural Net',
'AdaBoost',
'Naive Bayes',
'QDA',
'LogisticRegression',
]

classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
SVC(kernel='linear', C=0.025),
SVC(gamma=2, C=1),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(n_estimators=100),
Expand All @@ -113,11 +117,10 @@ def do_experiments(n: int=10):
acc = metrics.accuracy_score(y_test, y_pred)
samples.append(acc)
print(f'{names[i]:20} \t{np.mean(samples)} +- {np.std(samples)}')

_, ffns = train_ffn(X_train, X_test, y_train, y_test, epochs=n)
print(f'Feed Forward: \t{np.mean(ffns)} +- {np.std(ffns)}')



if __name__ == '__main__':
do_experiments()
Loading

0 comments on commit dccfd9d

Please sign in to comment.