Skip to content

Commit

Permalink
✨ divide dataset in 50/50% phishing/benign
Browse files Browse the repository at this point in the history
  • Loading branch information
TristanBilot committed Mar 28, 2022
1 parent fe473a4 commit 189a8f1
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion phishGNN/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,18 @@ def _read_csv(self, path: str) -> pd.DataFrame:
parse_dates=['domain_creation_date'],
date_parser=date_parser,
)

# balance dataset classes towards 50/50 benign/phishing: keep at most as many benign rows (the first ones) as there are phishing rows
nb_phishing = len(df[df['is_phishing'] == 1])
benign = df.index[(df['is_phishing'] == 0)][:nb_phishing]
other = df.index[~(df['is_phishing'] == 0)]
df = pd.concat([df.iloc[benign], df.iloc[other]])

# cast object dtypes
df['url'] = df['url'].astype('string')
df['cert_country'] = df['cert_country'].astype('string')

# remove useless features
del df['status_code']
del df['depth']
del df['domain_creation_date']
Expand Down Expand Up @@ -247,7 +256,7 @@ def map_url_to_id(url: str):
visited.add((url, ref, i))

# drop the refs and is_phishing entries so they are not part of the features
features = node[:-1]
features = node.drop("refs").drop("is_phishing")
id_to_feat[url_to_id[url]] = features

x = [id_to_feat[k] for k in sorted(id_to_feat)] # (n, d)
Expand Down

0 comments on commit 189a8f1

Please sign in to comment.