Add IP proxy crawlers (Python3WebSpider#106)
* Create ip89.py

www.89ip.cn free proxies

* Update ip89.py

Update class name

* Create fatezero_proxylist.py

Add http://proxylist.fatezero.org/ proxies

* Create ihuan.py

i幻 (ihuan) proxies
everhopingandwaiting committed Feb 6, 2021
1 parent 9912b98 commit e3bbd55
Showing 3 changed files with 99 additions and 0 deletions.
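
All three new crawlers follow the same pattern: subclass BaseCrawler, list the target pages in urls, and implement parse() as a generator of Proxy objects. The base class itself is not part of this diff; the sketch below only illustrates the contract these files rely on, and the fetch() helper and its requests-based body are assumptions, not the repository's actual implementation.

import requests


class BaseCrawler:
    """illustrative sketch only -- the real proxypool/crawlers/base.py is not in this diff"""
    urls = []

    def fetch(self, url):
        # hypothetical helper: download one page and return its text
        response = requests.get(url, timeout=10)
        return response.text

    def crawl(self):
        # fetch every configured URL and delegate extraction to the subclass's parse()
        for url in self.urls:
            html = self.fetch(url)
            yield from self.parse(html)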
32 changes: 32 additions & 0 deletions proxypool/crawlers/public/fatezero_proxylist.py
@@ -0,0 +1,32 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import json

BASE_URL = 'http://proxylist.fatezero.org/proxy.list'


class FatezeroCrawler(BaseCrawler):
    """
    Fatezero crawler, http://proxylist.fatezero.org
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        parse the response body (one JSON object per line) to get proxies
        :return:
        """
        hosts_ports = html.split('\n')
        for addr in hosts_ports:
            if not addr.strip():
                continue  # skip blank lines, which would raise json.JSONDecodeError
            ip_address = json.loads(addr)
            host = ip_address['host']
            port = ip_address['port']
            yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = FatezeroCrawler()
    for proxy in crawler.crawl():
        print(proxy)
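
The fatezero endpoint serves one JSON object per line rather than HTML, which is why parse() calls json.loads on each line. A self-contained check of that logic against a hand-written response (the sample data is made up):

import json

sample = '{"host": "1.2.3.4", "port": 8080}\n{"host": "5.6.7.8", "port": 3128}\n'
for line in sample.split('\n'):
    if not line.strip():
        continue  # the trailing newline produces an empty line
    record = json.loads(line)
    print(record['host'], record['port'])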
34 changes: 34 additions & 0 deletions proxypool/crawlers/public/ihuan.py
@@ -0,0 +1,34 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re
import time

BASE_URL = 'https://ip.ihuan.me/today/{path}.html'


class IhuanCrawler(BaseCrawler):
    """
    ip ihuan crawler, https://ip.ihuan.me
    """
    # the page path is derived from the current local hour, e.g. today/2021/02/06/15.html
    urls = [BASE_URL.format(path=time.strftime('%Y/%m/%d/%H', time.localtime()))]

    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """
        ip_address = re.compile(r'([\d:\.]*).*?<br>')  # raw string avoids invalid-escape warnings
        hosts_ports = ip_address.findall(html)
        for addr in hosts_ports:
            addr_split = addr.split(':')
            if len(addr_split) == 2:
                host = addr_split[0]
                port = addr_split[1]
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = IhuanCrawler()
    for proxy in crawler.crawl():
        print(proxy)
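
Unlike the other two crawlers, IhuanCrawler builds its URL from the local clock, because the site publishes a separate page per hour. A quick illustration of the path it generates (the timestamp is machine-dependent):

import time

# at 15:00 local time on 2021-02-06 this prints:
# https://ip.ihuan.me/today/2021/02/06/15.html
path = time.strftime('%Y/%m/%d/%H', time.localtime())
print('https://ip.ihuan.me/today/{path}.html'.format(path=path))

Because the path comes from the local clock, a machine whose timezone differs from the site's publishing schedule may request a page that does not exist yet.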
33 changes: 33 additions & 0 deletions proxypool/crawlers/public/ip89.py
@@ -0,0 +1,33 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re

MAX_NUM = 9999
BASE_URL = 'http://api.89ip.cn/tqdl.html?api=1&num={MAX_NUM}&port=&address=&isp='.format(MAX_NUM=MAX_NUM)


class Ip89Crawler(BaseCrawler):
    """
    89ip crawler, http://api.89ip.cn
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """
        ip_address = re.compile(r'([\d:\.]*)<br>')  # raw string avoids invalid-escape warnings
        hosts_ports = ip_address.findall(html)
        for addr in hosts_ports:
            addr_split = addr.split(':')
            if len(addr_split) == 2:
                host = addr_split[0]
                port = addr_split[1]
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = Ip89Crawler()
    for proxy in crawler.crawl():
        print(proxy)
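
The parse() above assumes the 89ip API returns a plain list of host:port entries separated by <br> tags, and entries that do not split into exactly a host and a port are discarded. A quick sanity check against a hand-written response (the sample addresses are made up):

import re

sample = '1.2.3.4:8080<br>\n5.6.7.8:3128<br>\nnot-a-proxy<br>'
pattern = re.compile(r'([\d:\.]*)<br>')
for addr in pattern.findall(sample):
    parts = addr.split(':')
    if len(parts) == 2:
        print(parts[0], parts[1])  # prints only the two well-formed pairs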
