
Commit

🗑️ clean code
TristanBilot committed Mar 28, 2022
1 parent 7dde2f9 commit 105f704
Showing 1 changed file with 2 additions and 118 deletions.
120 changes: 2 additions & 118 deletions phishGNN/dataset.py
@@ -61,18 +61,9 @@ def file_name(self, idx: int):
return f'data_viz_{idx}.pt'
return f'data_{idx}.pt'

def download(self):
pass

def process(self):
"""Reads csv files in data/raw and preprocess so that output
preprocessed files are written in data/processed folder.
Side effects:
self.data.pos = {
"url_to_id": url_to_id,
"error_pages": error_pages,
} contains data needed for the visualization of the graphs.
"""
if not self.use_process:
return
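The docstring above describes the intended flow: raw CSV files in data/raw are preprocessed and written to data/processed, and self.data.pos keeps the url_to_id and error_pages mappings needed to visualize the graphs. As a rough usage sketch only — the class name and constructor arguments below are assumptions, not taken from this diff:

    # Hypothetical usage; the class name and constructor arguments are assumed.
    from phishGNN.dataset import PhishingDataset  # assumed class name

    # process() typically runs during construction when the processed
    # files are missing, writing data/processed/data_*.pt.
    dataset = PhishingDataset(root="data")

    # Side effect described in the docstring: mappings used for visualization.
    url_to_id = dataset.data.pos["url_to_id"]
    error_pages = dataset.data.pos["error_pages"]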
@@ -158,14 +149,6 @@ def normalize_refs(refs: List[Dict]):
ref['url'] = self._normalize_url(ref['url'])
return refs

# def str_to_float(col: pd.Series):
# col = col.fillna(self.nan_value)
# s = set(col.unique())
# s = {e: i for i, e in enumerate(s)}

# col = col.apply(lambda x: s[x])
# return min_max_scaling(col)

# normalize int & float columns
num_column_idxs = [i for i, dt in enumerate(df.dtypes) if dt in [int, float]]
df.iloc[:, num_column_idxs] = \
@@ -179,20 +162,13 @@ def normalize_refs(refs: List[Dict]):
"has_sub_domain", "has_at_symbol", "is_valid_html", "has_form_with_url",
"has_iframe", "use_mouseover", "is_cert_valid", "has_dns_record",
"has_whois", "path_starts_with_url"]
# bool_column_idxs = [i for i, dt in enumerate(df.dtypes) if dt == bool]
df[bool_columns] = df[bool_columns].apply(bool_to_int, axis=0)

# normalize urls
df['refs'] = df['refs'].apply(normalize_refs)
df['url'] = df['url'].apply(self._normalize_url)
df = df.drop_duplicates(subset='url', keep='first')

# ignore url
# no_url = df.iloc[: , 1:]
# str_column_idxs = [i + 1 for i, dt in enumerate(no_url.iloc[: , 1:].dtypes) if dt == "string"]
# no_url.iloc[:, str_column_idxs] = no_url.iloc[:, str_column_idxs].apply(str_to_float, axis=0)
# df.iloc[: , 1:] = no_url

return df
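The two hunks above normalize the feature DataFrame: numeric columns are min-max scaled (the scaling call itself sits in the collapsed part of the diff), the listed boolean columns are converted to integers with bool_to_int, and URLs are normalized and de-duplicated. A self-contained sketch of the same idea, with placeholder column names and a placeholder min_max_scaling helper rather than the repository's code:

    import pandas as pd

    def min_max_scaling(col: pd.Series) -> pd.Series:
        # Placeholder for the repo's helper: rescale a column to [0, 1].
        span = col.max() - col.min()
        return (col - col.min()) / span if span else col * 0

    df = pd.DataFrame({
        "url": ["http://a.com/", "http://a.com", "http://b.com"],
        "depth": [1, 3, 2],               # example numeric feature
        "is_https": [True, False, True],  # example boolean feature
    })

    # min-max scale numeric columns
    num_cols = [c for c, dt in df.dtypes.items() if dt in (int, float)]
    df[num_cols] = df[num_cols].apply(min_max_scaling, axis=0)

    # booleans -> {0, 1}
    df["is_https"] = df["is_https"].astype(int)

    # normalize URLs (here just the trailing slash) and drop duplicates
    df["url"] = df["url"].str.rstrip("/")
    df = df.drop_duplicates(subset="url", keep="first")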


@@ -223,29 +199,20 @@ def map_url_to_id(url: str):
url = queue.pop()
try:
node = df.loc[url]
# log_success(f'{url} found in features.')
except KeyError:
# log_fail(f'{url} not found in features.')
node = self.error_page_node_feature

refs = node.refs
map_url_to_id(url)

for i, edge in enumerate(refs):
ref = edge['url']
# nb_hrefs = edge['nb_edges']
if (url, ref, i) in visited:
break
if ref not in df.index:
error_pages.add(ref)
map_url_to_id(ref)
# try:
# node = df.loc[ref]
# log_success(f'{ref} found in features.')
# except KeyError:
# log_fail(f'{ref} not found in features.')
# continue
# node = self.error_page_node_feature

from_.append(url_to_id[url])
to_.append(url_to_id[ref])
edges_.append([1]) # should be edge features
Expand All @@ -259,12 +226,11 @@ def map_url_to_id(url: str):
features = node.drop("refs").drop("is_phishing")
id_to_feat[url_to_id[url]] = features

x = [id_to_feat[k] for k in sorted(id_to_feat)] # (n, d)
x = [id_to_feat[k] for k in sorted(id_to_feat)]
visualization = {
"url_to_id": url_to_id,
"error_pages": error_pages,
}
# log_success(f'{root_url} processed.')

return (
torch.tensor([from_, to_]).type(torch.LongTensor),
@@ -318,85 +284,3 @@ def _normalize_url(self, url: str):
url = url.rstrip('/')
url = normalize_www_prefix(url)
return url
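In the map_url_to_id hunk above, the loop walks the reference graph, gives each URL an integer id, and accumulates source/target id lists (from_, to_) plus placeholder edge features before stacking them into a LongTensor edge index. A minimal sketch of that edge-index construction on made-up data — not the repository's exact code:

    import torch

    # Toy reference graph: each URL lists the URLs it links to.
    refs = {
        "http://a.com": ["http://b.com", "http://c.com"],
        "http://b.com": ["http://c.com"],
    }

    url_to_id = {}

    def map_url_to_id(url: str) -> int:
        # Assign the next free integer id the first time a URL is seen.
        if url not in url_to_id:
            url_to_id[url] = len(url_to_id)
        return url_to_id[url]

    from_, to_ = [], []
    for url, targets in refs.items():
        for ref in targets:
            from_.append(map_url_to_id(url))
            to_.append(map_url_to_id(ref))

    # 2 x num_edges edge index, the layout PyTorch Geometric expects.
    edge_index = torch.tensor([from_, to_], dtype=torch.long)
    # edge_index == tensor([[0, 0, 1], [1, 2, 2]])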

# while len(queue) != 0:
# ref = queue.pop()
# node = df[ref]

# for ref, edge_feats in refs.items():
# if ref not in idxs:
# idxs[ref] = idx
# idx += 1
# from_.append(idxs[url])
# to_.append(idxs[ref])
# edges_.append([1]) # should be edge features

# # remove url and refs
# features = row[2:-1]
# id_to_feat[idxs[url]] = features






# @property
# def raw_file_names(self):
# """ If this file exists in raw_dir, the download is not triggered.
# (The download func. is not implemented here)
# """
# return self.filename

# @property
# def processed_file_names(self):
# """ If these files are found in raw_dir, processing is skipped"""
# self.data = pd.read_csv(self.raw_paths[0]).reset_index()

# if self.test:
# return [f'data_test_{i}.pt' for i in list(self.data.index)]
# else:
# return [f'data_{i}.pt' for i in list(self.data.index)]


# def download(self):
# pass

# def process(self):
# self.data = pd.read_csv(self.raw_paths[0]).reset_index()
# featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
# for index, mol in tqdm(self.data.iterrows(), total=self.data.shape[0]):
# # Featurize molecule
# f = featurizer.featurize(mol["smiles"])
# data = f[0].to_pyg_graph()
# data.y = self._get_label(mol["HIV_active"])
# data.smiles = mol["smiles"]
# if self.test:
# torch.save(data,
# os.path.join(self.processed_dir,
# f'data_test_{index}.pt'))
# else:
# torch.save(data,
# os.path.join(self.processed_dir,
# f'data_{index}.pt'))


# def _get_label(self, label):
# label = np.asarray([label])
# return torch.tensor(label, dtype=torch.int64)

# def len(self):
# return self.data.shape[0]

# def get(self, idx):
# """ - Equivalent to __getitem__ in pytorch
# - Is not needed for PyG's InMemoryDataset
# """
# if self.test:
# data = torch.load(os.path.join(self.processed_dir,
# f'data_test_{idx}.pt'))
# else:
# data = torch.load(os.path.join(self.processed_dir,
# f'data_{idx}.pt'))
# return data

