Add IP proxy crawlers (Python3WebSpider#106)
* Create ip89.py

www.89ip.cn free proxies

* Update ip89.py

Update class name

* Create fatezero_proxylist.py

Add http://proxylist.fatezero.org/ proxies

* Create ihuan.py

i幻 (ihuan) proxies
everhopingandwaiting committed Feb 6, 2021
1 parent 9912b98 commit e3bbd55
Showing 3 changed files with 99 additions and 0 deletions.
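
All three new crawlers follow the same pattern: subclass BaseCrawler, list the target pages in urls, and implement parse() as a generator of Proxy objects. The base class itself is not part of this diff; the sketch below only illustrates the contract these files rely on, and the fetch() helper and its requests-based body are assumptions, not the repository's actual implementation.

import requests


class BaseCrawler:
    """illustrative sketch only -- the real proxypool/crawlers/base.py is not in this diff"""
    urls = []

    def fetch(self, url):
        # hypothetical helper: download one page and return its text
        response = requests.get(url, timeout=10)
        return response.text

    def crawl(self):
        # fetch every configured URL and delegate extraction to the subclass's parse()
        for url in self.urls:
            html = self.fetch(url)
            yield from self.parse(html)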
32 changes: 32 additions & 0 deletions proxypool/crawlers/public/fatezero_proxylist.py
@@ -0,0 +1,32 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import json

BASE_URL = 'http://proxylist.fatezero.org/proxy.list'


class FatezeroCrawler(BaseCrawler):
    """
    Fatezero crawler, http://proxylist.fatezero.org
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        parse the response body (one JSON object per line) to get proxies
        :return:
        """
        hosts_ports = html.split('\n')
        for addr in hosts_ports:
            if not addr.strip():
                continue  # skip blank lines, which would raise json.JSONDecodeError
            ip_address = json.loads(addr)
            host = ip_address['host']
            port = ip_address['port']
            yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = FatezeroCrawler()
    for proxy in crawler.crawl():
        print(proxy)
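
The fatezero endpoint serves one JSON object per line rather than HTML, which is why parse() calls json.loads on each line. A self-contained check of that logic against a hand-written response (the sample data is made up):

import json

sample = '{"host": "1.2.3.4", "port": 8080}\n{"host": "5.6.7.8", "port": 3128}\n'
for line in sample.split('\n'):
    if not line.strip():
        continue  # the trailing newline produces an empty line
    record = json.loads(line)
    print(record['host'], record['port'])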
34 changes: 34 additions & 0 deletions proxypool/crawlers/public/ihuan.py
@@ -0,0 +1,34 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re
import time

BASE_URL = 'https://ip.ihuan.me/today/{path}.html'


class IhuanCrawler(BaseCrawler):
    """
    ip ihuan crawler, https://ip.ihuan.me
    """
    # the page path is derived from the current local hour, e.g. today/2021/02/06/15.html
    urls = [BASE_URL.format(path=time.strftime('%Y/%m/%d/%H', time.localtime()))]

    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """
        ip_address = re.compile(r'([\d:\.]*).*?<br>')  # raw string avoids invalid-escape warnings
        hosts_ports = ip_address.findall(html)
        for addr in hosts_ports:
            addr_split = addr.split(':')
            if len(addr_split) == 2:
                host = addr_split[0]
                port = addr_split[1]
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = IhuanCrawler()
    for proxy in crawler.crawl():
        print(proxy)
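
Unlike the other two crawlers, IhuanCrawler builds its URL from the local clock, because the site publishes a separate page per hour. A quick illustration of the path it generates (the timestamp is machine-dependent):

import time

# at 15:00 local time on 2021-02-06 this prints:
# https://ip.ihuan.me/today/2021/02/06/15.html
path = time.strftime('%Y/%m/%d/%H', time.localtime())
print('https://ip.ihuan.me/today/{path}.html'.format(path=path))

Because the path comes from the local clock, a machine whose timezone differs from the site's publishing schedule may request a page that does not exist yet.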
33 changes: 33 additions & 0 deletions proxypool/crawlers/public/ip89.py
@@ -0,0 +1,33 @@
from proxypool.schemas.proxy import Proxy
from proxypool.crawlers.base import BaseCrawler
import re

MAX_NUM = 9999
BASE_URL = 'http://api.89ip.cn/tqdl.html?api=1&num={MAX_NUM}&port=&address=&isp='.format(MAX_NUM=MAX_NUM)


class Ip89Crawler(BaseCrawler):
    """
    89ip crawler, http://api.89ip.cn
    """
    urls = [BASE_URL]

    def parse(self, html):
        """
        parse html file to get proxies
        :return:
        """
        ip_address = re.compile(r'([\d:\.]*)<br>')  # raw string avoids invalid-escape warnings
        hosts_ports = ip_address.findall(html)
        for addr in hosts_ports:
            addr_split = addr.split(':')
            if len(addr_split) == 2:
                host = addr_split[0]
                port = addr_split[1]
                yield Proxy(host=host, port=port)


if __name__ == '__main__':
    crawler = Ip89Crawler()
    for proxy in crawler.crawl():
        print(proxy)
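
The parse() above assumes the 89ip API returns a plain list of host:port entries separated by <br> tags, and entries that do not split into exactly a host and a port are discarded. A quick sanity check against a hand-written response (the sample addresses are made up):

import re

sample = '1.2.3.4:8080<br>\n5.6.7.8:3128<br>\nnot-a-proxy<br>'
pattern = re.compile(r'([\d:\.]*)<br>')
for addr in pattern.findall(sample):
    parts = addr.split(':')
    if len(parts) == 2:
        print(parts[0], parts[1])  # prints only the two well-formed pairs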
