Revert "dataset v3 try to do inference"
This reverts commit 14ef5f8.
TristanBilot committed Jun 7, 2023
1 parent 14ef5f8 commit 18f35f0
Showing 12 changed files with 25 additions and 270 deletions.
1 change: 0 additions & 1 deletion crawler/input.txt

This file was deleted.

47 changes: 0 additions & 47 deletions data/predict/raw/dataset.csv

This file was deleted.

Binary file removed manifest.xpi.zip
Binary file not shown.
1 change: 0 additions & 1 deletion mqtt
Submodule mqtt deleted from 713663
3 changes: 1 addition & 2 deletions phishGNN/cross_validation.py
@@ -1,5 +1,4 @@
 import time
-from typing import Tuple

 import torch
 from sklearn.model_selection import StratifiedKFold
@@ -16,7 +15,7 @@

 def cross_validation_with_val_set(dataset, model, loss_fn, folds, epochs, batch_size,
                                   lr, lr_decay_factor, lr_decay_step_size,
-                                  weight_decay, logger=None) -> Tuple[float, float, float]:
+                                  weight_decay, logger=None) -> tuple[float, float, float]:

     val_losses, accs, durations = [], [], []
     for fold, (train_idx, test_idx, val_idx) in enumerate(zip(*k_fold(dataset, folds))):
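
The k_fold helper consumed by zip(*k_fold(dataset, folds)) is not part of this diff. As a hedged sketch only (the seed and the fold-assignment strategy are assumptions, not code from this commit), a stratified splitter along these lines would produce the (train_idx, test_idx, val_idx) triples the loop unpacks:

import torch
from sklearn.model_selection import StratifiedKFold

def k_fold(dataset, folds: int):
    # stratify on the graph labels so every fold keeps the class balance
    skf = StratifiedKFold(folds, shuffle=True, random_state=12345)
    labels = torch.tensor([int(data.y) for data in dataset])

    test_indices = [
        torch.from_numpy(idx).long()
        for _, idx in skf.split(torch.zeros(len(dataset)), labels)
    ]
    # reuse the previous test fold as the validation fold
    val_indices = [test_indices[i - 1] for i in range(folds)]

    train_indices = []
    for i in range(folds):
        mask = torch.ones(len(dataset), dtype=torch.bool)
        mask[test_indices[i]] = False
        mask[val_indices[i]] = False
        train_indices.append(mask.nonzero(as_tuple=False).view(-1))

    # zip(*k_fold(dataset, folds)) then yields one
    # (train_idx, test_idx, val_idx) triple per fold
    return train_indices, test_indices, val_indices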
15 changes: 7 additions & 8 deletions phishGNN/dataprep.py
@@ -10,7 +10,7 @@
 NAN_VALUE = -1


-def read_csv(path: str, train_test_equilibrum: bool=True) -> pd.DataFrame:
+def read_csv(path: str) -> pd.DataFrame:
     """Opens the csv dataset as DataFrame and cast types.
     """
     date_parser = lambda c: pd.to_datetime(c, format='%Y-%m-%dT%H:%M:%SZ', errors='coerce')
@@ -22,11 +22,10 @@ def read_csv(path: str, train_test_equilibrum: bool=True) -> pd.DataFrame:
     )

     # equilibrate dataset classes as 50/50% benign/phishing
-    if train_test_equilibrum:
-        nb_phishing = len(df[df['is_phishing'] == 1])
-        benign = df.index[(df['is_phishing'] == 0)][:nb_phishing]
-        other = df.index[~(df['is_phishing'] == 0)]
-        df = pd.concat([df.iloc[benign], df.iloc[other]])
+    nb_phishing = len(df[df['is_phishing'] == 1])
+    benign = df.index[(df['is_phishing'] == 0)][:nb_phishing]
+    other = df.index[~(df['is_phishing'] == 0)]
+    df = pd.concat([df.iloc[benign], df.iloc[other]])

     # cast object dtypes
     df['url'] = df['url'].astype('string')
@@ -116,7 +115,7 @@ def load_every_urls_with_features(df: pd.DataFrame, path: str) -> Tuple[List, List]:
     return every_urls, X


-def load_train_set(csv_file: str, train_test_equilibrum: bool=True) -> Tuple[pd.DataFrame, List[List], List[int]]:
+def load_train_set(csv_file: str) -> Tuple[pd.DataFrame, List[List], List[int]]:
     """Opens the csv file in `csv_file` and returns every
     features and label of each root url in the dataset.
@@ -125,7 +124,7 @@ def load_train_set(csv_file: str, train_test_equilibrum: bool=True) -> Tuple[pd.DataFrame, List[List], List[int]]:
         X: the list of features (list) of each root url
         y: the list of labels (int) of each root url
     """
-    df = read_csv(csv_file, train_test_equilibrum=train_test_equilibrum)
+    df = read_csv(csv_file)
     df = normalize_features(df)

     root_urls = df[~df['is_phishing'].isin([NAN_VALUE])]['url']
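
The class-balancing block that this revert makes unconditional simply truncates the benign rows to the phishing count. A toy illustration on an invented DataFrame (the sample data is hypothetical, not from the repository's dataset):

import pandas as pd

df = pd.DataFrame({'is_phishing': [0, 0, 0, 0, 1, 1]})

nb_phishing = len(df[df['is_phishing'] == 1])              # 2 phishing rows
benign = df.index[(df['is_phishing'] == 0)][:nb_phishing]  # first 2 benign rows
other = df.index[~(df['is_phishing'] == 0)]                # every non-benign row
df = pd.concat([df.iloc[benign], df.iloc[other]])

print(df['is_phishing'].value_counts())                    # 0: 2, 1: 2 -> 50/50

Note that other keeps every row whose is_phishing is not 0, so rows carrying the NAN_VALUE sentinel survive the truncation alongside the phishing rows.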
7 changes: 3 additions & 4 deletions phishGNN/dataset_v1.py
@@ -11,7 +11,6 @@
 import dataprep
 from utils.compute_device import COMPUTE_DEVICE
 from utils.utils import normalize_www_prefix
-from typing import Tuple, List

 print(f'Torch version: {torch.__version__}')
 print(f'Compute device: {COMPUTE_DEVICE}')
@@ -43,12 +42,12 @@ def __init__(
         super(PhishingDataset, self).__init__(root, transform, pre_transform)

     @property
-    def raw_file_names(self) -> List[str]:
+    def raw_file_names(self) -> list[str]:
         """File name of the csv dataset. """
         return glob.glob(os.path.join(self.raw_dir, '*'))

     @property
-    def processed_file_names(self) -> List[str]:
+    def processed_file_names(self) -> list[str]:
         return [file + '.pt' for file in self.raw_file_names]

     @property
@@ -90,7 +89,7 @@ def process(self) -> None:
     def len(self):
         return (len(os.listdir(self.processed_dir)) - 4) // 2

-    def _build_tensors(self, root_url: str, df_to_dict, existing_urls) -> Tuple[Tensor, Tensor, Tensor, Tensor, dict]:
+    def _build_tensors(self, root_url: str, df_to_dict, existing_urls) -> tuple[Tensor, Tensor, Tensor, Tensor, dict]:
         """Builds the required tensors for one graph.
         These matrices will be then used for training the GNN.
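
The annotation changes here (and in dataset_v2.py below) swap typing.List and typing.Tuple for the built-in generics standardized by PEP 585, which are only subscriptable on Python 3.9 and newer. A minimal equivalence sketch:

from typing import List

def old_style() -> List[str]:  # typing.List: works on Python 3.5+
    return ['dataset.csv']

def new_style() -> list[str]:  # built-in generic: requires Python 3.9+
    return ['dataset.csv']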
7 changes: 3 additions & 4 deletions phishGNN/dataset_v2.py
@@ -1,6 +1,5 @@
 import glob
 import os
-from typing import Tuple, List

 import pandas as pd
 import torch
@@ -44,12 +43,12 @@ def __init__(
         super(PhishingDataset2, self).__init__(root, transform, pre_transform)

     @property
-    def raw_file_names(self) -> List[str]:
+    def raw_file_names(self) -> list[str]:
         """File name of the csv dataset. """
         return glob.glob(os.path.join(self.raw_dir, '*'))

     @property
-    def processed_file_names(self) -> List[str]:
+    def processed_file_names(self) -> list[str]:
         return [file + '.pt' for file in self.raw_file_names]

     @property
@@ -105,7 +104,7 @@ def process(self) -> None:
     def len(self):
         return (len(os.listdir(self.processed_dir)) - 4) // 2

-    def _build_tensors(self, root_url: str, df_to_dict, existing_urls) -> Tuple[Tensor, Tensor, Tensor, Tensor, dict]:
+    def _build_tensors(self, root_url: str, df_to_dict, existing_urls) -> tuple[Tensor, Tensor, Tensor, Tensor, dict]:
         """Builds the required tensors for one graph.
         These matrices will be then used for training the GNN.
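
Both dataset versions follow the same torch_geometric Dataset contract: raw files are globbed from raw_dir and each maps to one processed .pt file. A minimal, self-contained sketch of that pattern (the class name and the placeholder process body are assumptions; the repository's real process builds graph tensors via _build_tensors):

import glob
import os

import torch
from torch_geometric.data import Data, Dataset


class MinimalPhishingDataset(Dataset):
    @property
    def raw_file_names(self) -> list[str]:
        # every file dropped into raw_dir counts as one raw dataset
        return [os.path.basename(p)
                for p in glob.glob(os.path.join(self.raw_dir, '*'))]

    @property
    def processed_file_names(self) -> list[str]:
        return [name + '.pt' for name in self.raw_file_names]

    def process(self) -> None:
        # placeholder: persist one (empty) graph per raw file
        for out in self.processed_paths:
            torch.save(Data(), out)

    def len(self) -> int:
        return len(self.processed_file_names)

    def get(self, idx: int) -> Data:
        return torch.load(self.processed_paths[idx])

Instantiating it as MinimalPhishingDataset(root='data/predict') (a hypothetical root) would run process() once and then serve graphs through get().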