From 18f35f00217d67b48b67a23bff049171f73318fe Mon Sep 17 00:00:00 2001
From: Tristan
Date: Wed, 7 Jun 2023 03:19:50 -0400
Subject: [PATCH] Revert "dataset v3 try to do inference"

This reverts commit 14ef5f8d1b52e2dc6126e6035b7befe5b682656f.
---
 crawler/input.txt | 1 -
 data/predict/raw/dataset.csv | 47 ---------
 manifest.xpi.zip | Bin 899 -> 0 bytes
 mqtt | 1 -
 phishGNN/cross_validation.py | 3 +-
 phishGNN/dataprep.py | 15 ++-
 phishGNN/dataset_v1.py | 7 +-
 phishGNN/dataset_v2.py | 7 +-
 phishGNN/dataset_v3.py | 192 -----------------------------------
 phishGNN/other_models.py | 2 +-
 phishGNN/predict.py | 16 +--
 phishGNN/utils/utils.py | 4 +-
 12 files changed, 25 insertions(+), 270 deletions(-)
 delete mode 100644 crawler/input.txt
 delete mode 100644 data/predict/raw/dataset.csv
 delete mode 100644 manifest.xpi.zip
 delete mode 160000 mqtt
 delete mode 100644 phishGNN/dataset_v3.py

diff --git a/crawler/input.txt b/crawler/input.txt
deleted file mode 100644
index 62ef4dc..0000000
--- a/crawler/input.txt
+++ /dev/null
@@ -1 +0,0 @@
-https://stackoverflow.com https://youtube.com http://br-icloud.com.br http://mp3raid.com/music/krizz_kaliko.html http://www.garage-pirenne.be/index.php?option=com_content&view=article&id=70&vsig70_0=15 http://www.pashminaonline.com/pure-pashminas http://google.com http://facebook.com http://twitter.com
diff --git a/data/predict/raw/dataset.csv b/data/predict/raw/dataset.csv
deleted file mode 100644
index 09f129e..0000000
--- a/data/predict/raw/dataset.csv
+++ /dev/null
@@ -1,47 +0,0 @@
-url,depth,is_phishing,status_code,redirects,is_https,is_ip_address,is_error_page,url_length,domain_url_depth,domain_url_length,has_sub_domain,has_at_symbol,dashes_count,path_starts_with_url,is_valid_html,anchors_count,forms_count,javascript_count,self_anchors_count,has_form_with_url,has_iframe,use_mouseover,is_cert_valid,has_dns_record,has_whois,cert_country,cert_reliability,domain_age,domain_end_period,domain_creation_date,refs
-http://google.com/,0,false,200,1,false,false,false,18,1,10,false,false,0,false,true,29,1,0,0,true,false,false,,,,,,,,,"[{""url"":""https://www.google.fr/webhp?tab=ww"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://www.google.fr/imghp?hl=fr&tab=wi"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://maps.google.fr/maps?hl=fr&tab=wl"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://play.google.com/?hl=fr&tab=w8"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://www.youtube.com/?tab=w1"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://news.google.com/?tab=wn"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://mail.google.com/mail/?tab=wm"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://drive.google.com/?tab=wo"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://www.google.fr/intl/fr/about/products?tab=wh"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://calendar.google.com/calendar?tab=wc"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://translate.google.fr/?hl=fr&tab=wT"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://books.google.fr/?hl=fr&tab=wp"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://www.google.fr/shopping?hl=fr&source=og&tab=wf"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://www.blogger.com/?tab=wj"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://www.google.com/finance?tab=we"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://photos.google.com/?tab=wq&pageId=none"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://docs.google.com/document/?usp=docs_alc"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://www.google.fr/intl/fr/about/products?tab=wh"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://accounts.google.com/ServiceLogin?hl=fr&passive=true&continue=http://www.google.com/&ec=GAZAAQ"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://www.google.fr/preferences?hl=fr"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://google.com/preferences?hl=fr"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://www.google.fr/history/optout?hl=fr"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://google.com/search"",""is_same_domain"":true,""is_anchor"":false,""is_form"":true,""is_iframe"":false},{""url"":""http://google.com/advanced_search?hl=fr&authuser=0"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://google.com/intl/fr/ads/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe
"":false},{""url"":""http://google.com/services/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://google.com/intl/fr/about.html"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://www.google.com/setprefdomain?prefdom=FR&prev=http://www.google.fr/&sig=6480af66K_LdnPi_AYMkuiEk7XuA_P7kSeYx4%3D"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://google.com/intl/fr/policies/privacy/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://google.com/intl/fr/policies/terms/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false}]" -http://facebook.com/,0,false,200,3,false,false,false,20,1,12,false,false,0,false,true,3,0,0,0,false,false,true,,,,,,,,,"[{""url"":""https://l.facebook.com/l.php?u=https%3A%2F%2Fwww.google.com%2Fchrome%2Fbrowser%2F&h=AT3T6nJvXHDrlACT1f0OrzwgAULBGZthdbArV4nNofsvVd-hBebct8zU9_qlXv1qDuQtsqekQTCmu0Prc3y_4ILgwKoQ2m7s8rhr5HgGZc0ymgMwfSCS6h6KgKf61p1oTSMUKwWSdW_MGWIC"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://l.facebook.com/l.php?u=https%3A%2F%2Fwww.mozilla.org%2Ffirefox%2Fnew%2F%3Futm_source%3Dfacebook%26utm_medium%3Dreferral%26utm_campaign%3Dunsupported-browser-notification&h=AT0tVHDvvfnBmci8zDFRsQhyrcA6C7QPlpYNZwFyoH5SKC-86VQ1SoqIzMkXo_l2c-WXFMxqR9b9uVeW07IUgLlSOuVQbV6b769OQK-15UjYWYral3uussh7AKRZXVm8-uQqt2wPkLMYC2HD"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""http://facebook.com/mobile"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false}]" -https://youtube.com/,0,false,200,1,true,false,false,20,1,11,false,false,0,false,true,15,0,0,0,false,true,true,,,,,,,,,"[{""url"":""https://accounts.google.com/ServiceLogin?service=youtube&uilel=3&passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Faction_handle_signin%3Dtrue%26app%3Ddesktop%26hl%3Dfr%26next%3D%252Fsignin_passive%26feature%3Dpassive&hl=fr"",""is_same_domain"":false,""is_anchor"":false,""is_form"":false,""is_iframe"":true},{""url"":""https://youtube.com/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://youtube.com/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://www.youtube.com/about/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://www.youtube.com/about/press/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://www.youtube.com/about/copyright/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://youtube.com/t/contact_us/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://www.youtube.com/creators/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://www.youtube.com/ads/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://developers.google.com/youtube"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://support.google.com/youtube/contact/FR_Complaints"",""is_same_domain"":false,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://youtube.com/t/terms"",""is_same_domain""
:true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://youtube.com/t/privacy"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://www.youtube.com/about/policies/"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://www.youtube.com/howyoutubeworks?utm_campaign=ytgen&utm_source=ythp&utm_medium=LeftNav&utm_content=txt&u=https%3A%2F%2Fwww.youtube.com%2Fhowyoutubeworks%3Futm_source%3Dythp%26utm_medium%3DLeftNav%26utm_campaign%3Dytgen"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false},{""url"":""https://youtube.com/new"",""is_same_domain"":true,""is_anchor"":true,""is_form"":false,""is_iframe"":false}]" -https://stackoverflow.com/,0,false,403,0,true,false,true,26,1,17,false,false,0,false,true,1,0,0,0,false,false,false,,,,,,,,,[] -http://mp3raid.com/music/krizz_kaliko.html,0,false,403,0,false,false,true,42,1,11,false,false,0,true,false,0,0,0,0,false,false,false,,,,,,,,,[] -http://google.com/search,1,,200,2,false,false,false,24,1,10,false,false,0,true,true,29,1,0,0,true,false,false,,,,,,,,,[] -http://google.com/intl/fr/policies/privacy/,1,,200,1,false,false,false,43,1,10,false,false,0,false,true,1,0,0,0,false,false,false,,,,,,,,,[] -https://books.google.fr/?hl=fr&tab=wp,1,,200,0,true,false,false,37,2,15,true,false,0,false,false,26,1,0,0,true,false,true,,,,,,,,,[] -https://photos.google.com/?tab=wq&pageId=none,1,,200,1,true,false,false,45,2,17,true,false,0,false,false,26,0,0,1,false,false,false,,,,,,,,,[] -http://www.blogger.com/?tab=wj,1,,200,5,false,false,false,30,2,15,true,false,0,false,true,16,0,0,2,false,false,false,,,,,,,,,[] -https://docs.google.com/document/?usp=docs_alc,1,,200,2,true,false,false,46,2,15,true,false,0,false,true,69,0,1,7,false,false,false,,,,,,,,,[] -http://google.com/intl/fr/ads/,1,,200,2,false,false,false,30,1,10,false,false,0,false,false,56,0,0,1,false,false,false,,,,,,,,,[] -http://google.com/intl/fr/policies/terms/,1,,200,1,false,false,false,41,1,10,false,false,0,false,true,1,0,0,0,false,false,false,,,,,,,,,[] -https://www.google.fr/webhp?tab=ww,1,,200,0,true,false,false,34,2,13,true,false,0,false,true,29,1,0,0,true,false,false,,,,,,,,,[] -http://www.google.fr/imghp?hl=fr&tab=wi,1,,200,0,false,false,false,39,2,13,true,false,0,false,true,28,1,0,0,true,false,false,,,,,,,,,[] -https://accounts.google.com/ServiceLogin?hl=fr&passive=true&continue=http://www.google.com/&ec=GAZAAQ,1,,200,2,true,false,false,101,2,19,true,false,0,false,false,6,2,0,0,true,false,true,,,,,,,,,[] -http://www.google.fr/preferences?hl=fr,1,,200,1,false,false,false,38,2,13,true,false,0,false,true,24,1,0,3,true,false,true,,,,,,,,,[] -http://www.google.com/setprefdomain?prefdom=FR&prev=http://www.google.fr/&sig=6480af66K_LdnPi_AYMkuiEk7XuA_P7kSeYx4%3D,1,,200,1,false,false,false,118,2,14,true,false,0,false,true,29,1,0,0,true,false,false,,,,,,,,,[] -https://mail.google.com/mail/?tab=wm,1,,200,4,true,false,false,36,2,15,true,false,0,false,false,6,2,0,0,true,false,true,,,,,,,,,[] -http://www.google.fr/history/optout?hl=fr,1,,200,4,false,false,false,41,2,13,true,false,0,false,false,8,0,0,0,false,false,true,,,,,,,,,[] -http://google.com/services/,1,,200,1,false,false,false,27,1,10,false,false,0,true,true,1,0,0,0,false,false,false,,,,,,,,,[] -https://drive.google.com/?tab=wo,1,,200,3,true,false,false,32,2,16,true,false,0,false,false,6,2,0,0,true,false,true,,,,,,,,,[] 
-https://calendar.google.com/calendar?tab=wc,1,,200,4,true,false,false,43,2,19,true,false,0,false,false,6,2,0,0,true,false,true,,,,,,,,,[] -http://google.com/preferences?hl=fr,1,,200,2,false,false,false,35,1,10,false,false,0,false,true,24,1,0,3,true,false,true,,,,,,,,,[] -https://www.google.fr/intl/fr/about/products?tab=wh,1,,200,3,true,false,false,51,2,13,true,false,0,false,true,158,0,0,3,false,false,false,,,,,,,,,[] -http://google.com/advanced_search?hl=fr&authuser=0,1,,200,2,false,false,false,50,1,10,false,false,0,false,true,16,1,0,0,true,false,true,,,,,,,,,[] -http://maps.google.fr/maps?hl=fr&tab=wl,1,,200,3,false,false,false,39,2,14,true,false,0,false,true,0,0,0,0,false,false,false,,,,,,,,,[] -http://google.com/intl/fr/about.html,1,,200,4,false,false,false,36,1,10,false,false,0,false,true,49,0,0,1,false,false,false,,,,,,,,,[] -https://www.youtube.com/?tab=w1,1,,200,0,true,false,false,31,2,15,true,false,0,false,true,15,0,0,0,false,true,true,,,,,,,,,[] -https://www.google.fr/shopping?hl=fr&source=og&tab=wf,1,,200,1,true,false,false,53,2,13,true,false,0,true,false,32,1,0,0,true,false,true,,,,,,,,,[] -https://www.youtube.com/creators/,1,,200,0,true,false,false,33,2,15,true,false,0,false,true,43,1,0,1,false,false,false,,,,,,,,,[] -https://youtube.com/t/privacy,1,,200,2,true,false,false,29,1,11,false,false,0,false,false,252,0,0,0,false,true,true,,,,,,,,,[] -https://developers.google.com/youtube,1,,200,0,true,false,false,37,2,21,true,false,0,true,false,102,1,0,0,true,false,false,,,,,,,,,[] -https://www.youtube.com/about/policies/,1,,200,1,true,false,false,39,2,15,true,false,0,false,false,138,1,0,7,false,false,false,,,,,,,,,[] -https://www.youtube.com/about/press/,1,,200,1,true,false,false,36,2,15,true,false,0,false,true,89,1,0,6,false,false,false,,,,,,,,,[] -https://www.youtube.com/ads/,1,,200,0,true,false,false,28,2,15,true,false,0,true,false,85,0,0,1,false,false,false,,,,,,,,,[] -https://youtube.com/new,1,,200,1,true,false,false,23,1,11,false,false,0,true,true,15,0,0,0,false,true,true,,,,,,,,,[] -https://accounts.google.com/ServiceLogin?service=youtube&uilel=3&passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Faction_handle_signin%3Dtrue%26app%3Ddesktop%26hl%3Dfr%26next%3D%252Fsignin_passive%26feature%3Dpassive&hl=fr,1,,200,2,true,false,false,236,2,19,true,false,0,false,false,6,2,0,0,true,false,true,,,,,,,,,[] -https://www.youtube.com/about/copyright/,1,,200,1,true,false,false,40,2,15,true,false,0,false,false,118,1,0,5,false,false,false,,,,,,,,,[] -https://www.youtube.com/howyoutubeworks?utm_campaign=ytgen&utm_source=ythp&utm_medium=LeftNav&utm_content=txt&u=https%3A%2F%2Fwww.youtube.com%2Fhowyoutubeworks%3Futm_source%3Dythp%26utm_medium%3DLeftNav%26utm_campaign%3Dytgen,1,,200,1,true,false,false,225,2,15,true,false,0,false,false,141,1,0,1,false,false,false,,,,,,,,,[] -https://www.youtube.com/about/,1,,200,1,true,false,false,30,2,15,true,false,0,false,false,68,0,0,1,false,false,false,,,,,,,,,[] -https://youtube.com/t/contact_us/,1,,200,1,true,false,false,33,1,11,false,false,0,false,true,17,0,0,0,false,false,false,,,,,,,,,[] -https://youtube.com/t/terms,1,,200,1,true,false,false,27,1,11,false,false,0,false,true,54,0,0,0,false,false,false,,,,,,,,,[] 
-https://l.facebook.com/l.php?u=https%3A%2F%2Fwww.mozilla.org%2Ffirefox%2Fnew%2F%3Futm_source%3Dfacebook%26utm_medium%3Dreferral%26utm_campaign%3Dunsupported-browser-notification&h=AT0tVHDvvfnBmci8zDFRsQhyrcA6C7QPlpYNZwFyoH5SKC-86VQ1SoqIzMkXo_l2c-WXFMxqR9b9uVeW07IUgLlSOuVQbV6b769OQK-15UjYWYral3uussh7AKRZXVm8-uQqt2wPkLMYC2HD,1,,200,1,true,false,false,324,2,14,true,false,0,false,true,3,0,0,0,false,false,true,,,,,,,,,[] -http://facebook.com/mobile,1,,200,3,false,false,false,26,1,12,false,false,0,true,true,3,0,0,0,false,false,true,,,,,,,,,[] -https://l.facebook.com/l.php?u=https%3A%2F%2Fwww.google.com%2Fchrome%2Fbrowser%2F&h=AT3T6nJvXHDrlACT1f0OrzwgAULBGZthdbArV4nNofsvVd-hBebct8zU9_qlXv1qDuQtsqekQTCmu0Prc3y_4ILgwKoQ2m7s8rhr5HgGZc0ymgMwfSCS6h6KgKf61p1oTSMUKwWSdW_MGWIC,1,,200,1,true,false,false,228,2,14,true,false,0,false,true,3,0,0,0,false,false,true,,,,,,,,,[] diff --git a/manifest.xpi.zip b/manifest.xpi.zip deleted file mode 100644 index 8f329c0304a78e5733b38ca7b5f8965d4d510828..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 899 zcmWIWW@Zs#-~htAc~M~uP|(7}z#zn+z>u4mmzkDYT%wSiUr?!+Rh*v}8p6xKzBuf6 za{1@F6c8@0;AUWC`O3(^05pezAvN%H-faVcJ>oCybxI5cOo}cqj{P!0pW~MUf1_f+ zms@eOl_K9=auj&deCPg${#NZF3jvrxVRFiy~`#JNAWY6-@ zmzOmc&Z=E0n$H&Untl59Cl8N*Oej{elzU+QP5tZc6+5%-=Q~KB5{R=p7Wm=L(FZ5& zv^Bp(Zz(!`@=(jgwQ9vPQqHHfZhs;>wb5Lu&}{C#9`S3pQ{pD?2s*guM#$!lOoeNT zJ0xth!Y+KT(RwTYV$O}FU#?nQfBNLdZC0bOjDCCH`x+ZGHrOq_AiX`1*u>;1BBvXkeNk!K^MI z%~+@|(EMY!WCv59qM~Dfn#XY-g)q0^LmdL*zUR(g_14ii;dxbC$5+SejP}Vh-l-Z_ z%nddeY%()8H1;&{FxkX+(xZ2!ff2`9Zym35Cv?1ZH>H6*!p_lc@pnre(CHxO2Y53w zi7+G5D6*$PX%q&wG=f+Je1|2)qMMIwHz?>}U`yjkMh40PGQgXa4P+b>5S|9oi-7UQ GzyJUg%S<}} diff --git a/mqtt b/mqtt deleted file mode 160000 index 7136637..0000000 --- a/mqtt +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7136637bd047ecefd4aa97839ad2cea42008e3f7 diff --git a/phishGNN/cross_validation.py b/phishGNN/cross_validation.py index 24a9117..2d01498 100644 --- a/phishGNN/cross_validation.py +++ b/phishGNN/cross_validation.py @@ -1,5 +1,4 @@ import time -from typing import Tuple import torch from sklearn.model_selection import StratifiedKFold @@ -16,7 +15,7 @@ def cross_validation_with_val_set(dataset, model, loss_fn, folds, epochs, batch_size, lr, lr_decay_factor, lr_decay_step_size, - weight_decay, logger=None) -> Tuple[float, float, float]: + weight_decay, logger=None) -> tuple[float, float, float]: val_losses, accs, durations = [], [], [] for fold, (train_idx, test_idx, val_idx) in enumerate(zip(*k_fold(dataset, folds))): diff --git a/phishGNN/dataprep.py b/phishGNN/dataprep.py index 9c6e55c..f3224e8 100644 --- a/phishGNN/dataprep.py +++ b/phishGNN/dataprep.py @@ -10,7 +10,7 @@ NAN_VALUE = -1 -def read_csv(path: str, train_test_equilibrum: bool=True) -> pd.DataFrame: +def read_csv(path: str) -> pd.DataFrame: """Opens the csv dataset as DataFrame and cast types. 
""" date_parser = lambda c: pd.to_datetime(c, format='%Y-%m-%dT%H:%M:%SZ', errors='coerce') @@ -22,11 +22,10 @@ def read_csv(path: str, train_test_equilibrum: bool=True) -> pd.DataFrame: ) # equilibrate dataset classes as 50/50% benign/phishing - if train_test_equilibrum: - nb_phishing = len(df[df['is_phishing'] == 1]) - benign = df.index[(df['is_phishing'] == 0)][:nb_phishing] - other = df.index[~(df['is_phishing'] == 0)] - df = pd.concat([df.iloc[benign], df.iloc[other]]) + nb_phishing = len(df[df['is_phishing'] == 1]) + benign = df.index[(df['is_phishing'] == 0)][:nb_phishing] + other = df.index[~(df['is_phishing'] == 0)] + df = pd.concat([df.iloc[benign], df.iloc[other]]) # cast object dtypes df['url'] = df['url'].astype('string') @@ -116,7 +115,7 @@ def load_every_urls_with_features(df: pd.DataFrame, path: str) -> Tuple[List, Li return every_urls, X -def load_train_set(csv_file: str, train_test_equilibrum: bool=True) -> Tuple[pd.DataFrame, List[List], List[int]]: +def load_train_set(csv_file: str) -> Tuple[pd.DataFrame, List[List], List[int]]: """Opens the csv file in `csv_file` and returns every features and label of each root url in the dataset. @@ -125,7 +124,7 @@ def load_train_set(csv_file: str, train_test_equilibrum: bool=True) -> Tuple[pd. X: the list of features (list) of each root url y: the list of labels (int) of each root url """ - df = read_csv(csv_file, train_test_equilibrum=train_test_equilibrum) + df = read_csv(csv_file) df = normalize_features(df) root_urls = df[~df['is_phishing'].isin([NAN_VALUE])]['url'] diff --git a/phishGNN/dataset_v1.py b/phishGNN/dataset_v1.py index 70c8cec..c91f942 100644 --- a/phishGNN/dataset_v1.py +++ b/phishGNN/dataset_v1.py @@ -11,7 +11,6 @@ import dataprep from utils.compute_device import COMPUTE_DEVICE from utils.utils import normalize_www_prefix -from typing import Tuple, List print(f'Torch version: {torch.__version__}') print(f'Compute device: {COMPUTE_DEVICE}') @@ -43,12 +42,12 @@ def __init__( super(PhishingDataset, self).__init__(root, transform, pre_transform) @property - def raw_file_names(self) -> List[str]: + def raw_file_names(self) -> list[str]: """File name of the csv dataset. """ return glob.glob(os.path.join(self.raw_dir, '*')) @property - def processed_file_names(self) -> List[str]: + def processed_file_names(self) -> list[str]: return [file + '.pt' for file in self.raw_file_names] @property @@ -90,7 +89,7 @@ def process(self) -> None: def len(self): return (len(os.listdir(self.processed_dir)) - 4) // 2 - def _build_tensors(self, root_url: str, df_to_dict, existing_urls) -> Tuple[Tensor, Tensor, Tensor, Tensor, dict]: + def _build_tensors(self, root_url: str, df_to_dict, existing_urls) -> tuple[Tensor, Tensor, Tensor, Tensor, dict]: """Builds the required tensors for one graph. These matrices will be then used for training the GNN. diff --git a/phishGNN/dataset_v2.py b/phishGNN/dataset_v2.py index 6976a9c..7c501a8 100644 --- a/phishGNN/dataset_v2.py +++ b/phishGNN/dataset_v2.py @@ -1,6 +1,5 @@ import glob import os -from typing import Tuple, List import pandas as pd import torch @@ -44,12 +43,12 @@ def __init__( super(PhishingDataset2, self).__init__(root, transform, pre_transform) @property - def raw_file_names(self) -> List[str]: + def raw_file_names(self) -> list[str]: """File name of the csv dataset. 
""" return glob.glob(os.path.join(self.raw_dir, '*')) @property - def processed_file_names(self) -> List[str]: + def processed_file_names(self) -> list[str]: return [file + '.pt' for file in self.raw_file_names] @property @@ -105,7 +104,7 @@ def process(self) -> None: def len(self): return (len(os.listdir(self.processed_dir)) - 4) // 2 - def _build_tensors(self, root_url: str, df_to_dict, existing_urls) -> Tuple[Tensor, Tensor, Tensor, Tensor, dict]: + def _build_tensors(self, root_url: str, df_to_dict, existing_urls) -> tuple[Tensor, Tensor, Tensor, Tensor, dict]: """Builds the required tensors for one graph. These matrices will be then used for training the GNN. diff --git a/phishGNN/dataset_v3.py b/phishGNN/dataset_v3.py deleted file mode 100644 index a989886..0000000 --- a/phishGNN/dataset_v3.py +++ /dev/null @@ -1,192 +0,0 @@ -import glob -import os -from typing import Tuple, List - -import pandas as pd -import torch -import torch_geometric -from torch import Tensor -from sklearn.model_selection import train_test_split -from torch_geometric.data import Data, Dataset -from tqdm import tqdm - -import dataprep -from other_models import train_random_forest -from utils.compute_device import COMPUTE_DEVICE - -print(f'Torch version: {torch.__version__}') -print(f'Compute device: {COMPUTE_DEVICE}') -print(f'Torch geometric version: {torch_geometric.__version__}') - -# set default dtype, as MPS Pytorch does not support float64 -torch.set_default_dtype(torch.float32) - - -class PhishingDataset3(Dataset): - """Dataset containing both phishing and non-phishing website urls. """ - - def __init__( - self, - root: str, - do_data_preparation: bool = True, - visualization_mode: bool = False, - nan_value: float = -1.0, - transform=None, - pre_transform=None, - ): - """ - root = Where the dataset should be stored. This folder is split - into raw_dir (downloaded dataset) and processed_dir (processed data). - """ - self.do_data_preparation = do_data_preparation - self.visualization_mode = visualization_mode - self.nan_value = nan_value - super(PhishingDataset3, self).__init__(root, transform, pre_transform) - - @property - def raw_file_names(self) -> List[str]: - """File name of the csv dataset. """ - return glob.glob(os.path.join(self.raw_dir, '*')) - - @property - def processed_file_names(self) -> List[str]: - return [file + '.pt' for file in self.raw_file_names] - - @property - def num_classes(self): - return 2 - - def file_name(self, idx: int) -> str: - if self.visualization_mode: - return f'data_viz_{idx}.pt' - return f'data_{idx}.pt' - - def process(self) -> None: - """Reads csv files in data/raw and preprocess so that output - preprocessed files are written in data/processed folder. 
- """ - if not self.do_data_preparation: - return - - # loop over all files in `raw_file_names` - for raw_path in self.raw_paths: - df, X, y = dataprep.load_train_set(raw_path, train_test_equilibrum=False) - - forest, _ = train_random_forest(X, X, y, y) - - every_urls, every_features = dataprep.load_every_urls_with_features(df, raw_path) - every_preds = forest.predict(every_features) - - root_urls = df[~df['is_phishing'].isin([self.nan_value])]['url'] - - df.drop(df.iloc[:, 2:-1], inplace=True, axis=1) - df['url']: every_urls - df['is_phishing_pred'] = every_preds - - df = df.set_index('url') - df_to_dict = df.to_dict('index') - - # loop over each root urls in the dataset - for i, (_, url) in enumerate(tqdm(root_urls.items(), total=len(root_urls))): - edge_index, x, _, y, viz_utils = self._build_tensors(url, df_to_dict, df.index) - - self.data = Data(x=x, edge_index=edge_index, y=y) - torch.save(self.data, os.path.join(self.processed_dir, f'data_{i}.pt')) - - # save another file with variables needed for visualization - self.data.pos = viz_utils - torch.save(self.data, os.path.join(self.processed_dir, f'data_viz_{i}.pt')) - - def len(self): - return (len(os.listdir(self.processed_dir)) - 2) - - def _build_tensors(self, root_url: str, df_to_dict, existing_urls) -> Tuple[Tensor, Tensor, Tensor, Tensor, dict]: - """Builds the required tensors for one graph. - These matrices will be then used for training the GNN. - - Args: - df: the dataset of one graph as form of pandas daframe - - Returns: - Tuple[edge_index, x, edge_attr, y, viz_utils] - """ - from_, to_, edges_ = [], [], [] - id_to_feat = {} - url_to_id = {} - queue = [root_url] - visited = set() - error_pages = set() - - def map_url_to_id(url: str): - url_to_id[url] = len(url_to_id) \ - if url not in url_to_id else url_to_id[url] - - def bool_to_float(value: bool): - return 1. if value else 0. 
- - while True: - if len(queue) == 0: - break - url = queue.pop() - try: - node = df_to_dict[url] - except KeyError: - node = self.error_page_node_feature - - refs = node['refs'] - map_url_to_id(url) - - for i, edge in enumerate(refs): - ref = edge['url'] - is_same_domain = bool_to_float(edge['is_same_domain']) - is_form = bool_to_float(edge['is_form']) - is_anchor = bool_to_float(edge['is_anchor']) - - if (url, ref, i) in visited: - break - if ref not in existing_urls: - error_pages.add(ref) - map_url_to_id(ref) - - from_.append(url_to_id[url]) - to_.append(url_to_id[ref]) - edges_.append([1]) # should be edge features - - is_anchor = ref == url - if not is_anchor: - queue.append(ref) - visited.add((url, ref, i)) - - # remove url and refs - features = [node['is_phishing_pred']] - id_to_feat[url_to_id[url]] = features - - x = [id_to_feat[k] for k in sorted(id_to_feat)] - visualization = { - 'url_to_id': url_to_id, - 'error_pages': error_pages, - } - - return ( - torch.tensor([from_, to_], dtype=torch.int64), - torch.tensor(x, dtype=torch.float32), - torch.tensor(edges_, dtype=torch.int64), - torch.tensor(df_to_dict[root_url]['is_phishing'], dtype=torch.int64), - visualization, - ) - - def get(self, idx): - t = torch.load(os.path.join(self.processed_dir, self.file_name(idx))) - t.x = t.x.to(dtype=torch.float32) - t.y = t.y.to(dtype=torch.int64) - t.edge_index = t.edge_index.to(dtype=torch.int64) - return t - - @property - def error_page_node_feature(self): - data = { - 'is_phishing': self.nan_value, - 'is_phishing_pred': self.nan_value, - 'refs': [], - } - return pd.Series(data=data) diff --git a/phishGNN/other_models.py b/phishGNN/other_models.py index 2be5344..3397f97 100644 --- a/phishGNN/other_models.py +++ b/phishGNN/other_models.py @@ -15,7 +15,7 @@ # from models.ffn import FeedforwardNeuralNetModel -from models import FeedforwardNeuralNetModel +from .models import FeedforwardNeuralNetModel def warn(*args, **kwargs): diff --git a/phishGNN/predict.py b/phishGNN/predict.py index 7cfb9e7..a9372d0 100644 --- a/phishGNN/predict.py +++ b/phishGNN/predict.py @@ -4,22 +4,21 @@ import torch -# from dataset_v1 import PhishingDataset -from dataset_v3 import PhishingDataset3 +from dataset_v1 import PhishingDataset from utils.compute_device import COMPUTE_DEVICE def predict(url: str, weights_file: str) -> int: path = os.path.join(os.getcwd(), 'data', 'predict') - data_files = sorted(glob.glob(os.path.join(path, 'raw', '*.csv'))) + data_files = sorted(glob.glob(os.path.join(path, 'processed', '*'))) if not os.path.exists(path) or len(data_files) == 0: raise FileNotFoundError(f'No files found in path {path}, please the crawler before.') - dataset = PhishingDataset3(root=path, do_data_preparation=True) + dataset = PhishingDataset(root=path, do_data_preparation=True) data = dataset[0] data = data.to(COMPUTE_DEVICE) - model = torch.load(os.path.join(os.getcwd(), 'weights/', weights_file), map_location=COMPUTE_DEVICE).to(COMPUTE_DEVICE) + model = torch.load(os.path.join(os.getcwd(), 'weights/', weights_file)).to(COMPUTE_DEVICE) model.eval() out = model(data.x, data.edge_index, data.batch) pred = out.argmax(dim=1) @@ -28,11 +27,12 @@ def predict(url: str, weights_file: str) -> int: if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--url', type=str, help='the url to predict (phishing/benign)', default="http://www.amazon.fr") - parser.add_argument('--pkl_file', type=str, default='10_epochs_default/GCN_3_global_mean_pool_32.pkl', help='the path to the model weights 
(.pkl)') + parser.add_argument('url', type=str, help='the url to predict (phishing/benign)') + parser.add_argument('pkl_file', type=str, default='GCN_3_global_mean_pool_32.pkl', + help='the path to the model weights (.pkl)') args, _ = parser.parse_known_args() - pred = predict(args.url, args.pkl_file) + pred = predict(args.url, args.weights_file) if pred == 1: print('Phishing') diff --git a/phishGNN/utils/utils.py b/phishGNN/utils/utils.py index 2c08ecb..83943d9 100644 --- a/phishGNN/utils/utils.py +++ b/phishGNN/utils/utils.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing import List from urllib.parse import urlparse import numpy as np @@ -25,7 +25,7 @@ def log_fail(msg: str): print(f'{bcolors.FAIL}FAILURE:\t{bcolors.ENDC}{msg}') -def tensor_to_tuple_list(tensor: torch.Tensor) -> List[Tuple[int, int]]: +def tensor_to_tuple_list(tensor: torch.Tensor) -> list[tuple[int, int]]: """Converts a tensor of shape [[x], [y]] in an array of tuples of shape [(x, y)]. """