forked from lewis-007/MediaCrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a8a4d34
commit 986179b
Showing
16 changed files
with
562 additions
and
267 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# -*- coding: utf-8 -*- | ||
# @Author : relakkes@gmail.com | ||
# @Time : 2023/12/2 14:37 | ||
# @Desc : |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
# -*- coding: utf-8 -*- | ||
# @Author : relakkes@gmail.com | ||
# @Time : 2023/12/2 13:45 | ||
# @Desc : ip代理池实现 | ||
import random | ||
from typing import List | ||
|
||
import httpx | ||
from tenacity import retry, stop_after_attempt, wait_fixed | ||
|
||
from tools import utils | ||
|
||
from .proxy_ip_provider import IpInfoModel, IpProxy | ||
|
||
|
||
class ProxyIpPool:
    """Pool of proxy IPs fetched from an HTTP proxy provider.

    Each proxy handed out by :meth:`get_proxy` is removed from the pool so
    it is never reused; the pool transparently reloads itself when empty.
    """

    def __init__(self, ip_pool_count: int, enable_validate_ip: bool) -> None:
        # Endpoint used to probe that a proxy can actually reach the internet.
        self.valid_ip_url = "https://httpbin.org/ip"
        self.ip_pool_count = ip_pool_count
        self.enable_validate_ip = enable_validate_ip
        self.proxy_list: List[IpInfoModel] = []

    async def load_proxies(self) -> None:
        """
        Fetch a batch of proxy IPs from the HTTP proxy provider.
        :return:
        """
        self.proxy_list = await IpProxy.get_proxies(self.ip_pool_count)

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    async def is_valid_proxy(self, proxy: IpInfoModel) -> bool:
        """
        Probe whether *proxy* can successfully reach ``self.valid_ip_url``.
        :param proxy: candidate proxy to test
        :return: True when the probe request returns HTTP 200
        :raises Exception: network errors are re-raised so tenacity retries
        """
        utils.logger.info(f"[ProxyIpPool.is_valid_proxy] testing {proxy.ip} is it valid ")
        try:
            httpx_proxy = f"{proxy.protocol}{proxy.ip}:{proxy.port}"
            proxy_auth = httpx.BasicAuth(proxy.user, proxy.password)
            async with httpx.AsyncClient(proxies={proxy.protocol: httpx_proxy}, auth=proxy_auth) as client:
                response = await client.get(self.valid_ip_url)
                return response.status_code == 200
        except Exception as e:
            utils.logger.info(f"[ProxyIpPool.is_valid_proxy] testing {proxy.ip} err: {e}")
            raise e

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    async def get_proxy(self) -> IpInfoModel:
        """
        Pop one random proxy IP from the pool, reloading the pool when empty.

        Bug fix: the chosen proxy is now removed from the pool *before*
        validation, so a proxy that fails validation is discarded instead of
        remaining in the pool where the tenacity retry could pick it again.

        :return: a proxy that passed validation (when validation is enabled)
        :raises Exception: when the chosen proxy fails validation
        """
        if len(self.proxy_list) == 0:
            await self.reload_proxies()

        proxy = random.choice(self.proxy_list)
        # Remove immediately: whether validation succeeds or raises, this
        # proxy must never be handed out (or re-tested) again.
        self.proxy_list.remove(proxy)
        if self.enable_validate_ip:
            if not await self.is_valid_proxy(proxy):
                raise Exception("[ProxyIpPool.get_proxy] current ip invalid and again get it")
        return proxy

    async def reload_proxies(self) -> None:
        """
        Discard the current pool contents and fetch a fresh batch.
        :return:
        """
        self.proxy_list = []
        await self.load_proxies()
|
||
|
||
async def create_ip_pool(ip_pool_count: int, enable_validate_ip: bool) -> ProxyIpPool:
    """
    Factory: build a ProxyIpPool and eagerly load its first batch of proxies.
    :param ip_pool_count: number of proxy IPs to fetch from the provider
    :param enable_validate_ip: when True, each proxy is probed before being handed out
    :return: a pool whose proxy list is already populated
    """
    pool = ProxyIpPool(ip_pool_count, enable_validate_ip)
    await pool.load_proxies()
    return pool
|
||
|
||
if __name__ == '__main__':
    # Library-only module: no standalone entry point.
    pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
# -*- coding: utf-8 -*- | ||
# @Author : relakkes@gmail.com | ||
# @Time : 2023/12/2 11:18 | ||
# @Desc : 爬虫 IP 获取实现 | ||
# @Url : 现在实现了极速HTTP的接口,官网地址:https://www.jisuhttp.com/?pl=mAKphQ&plan=ZY&kd=Yang | ||
|
||
import asyncio | ||
import os | ||
from abc import ABC, abstractmethod | ||
from typing import Dict, List, Optional | ||
from urllib.parse import urlencode | ||
|
||
import httpx | ||
from pydantic import BaseModel, Field | ||
|
||
from tools import utils | ||
|
||
|
||
class IpGetError(Exception):
    """Raised when fetching proxy IPs from the provider fails."""
|
||
|
||
class IpInfoModel(BaseModel):
    """Unified IP model"""
    # Field titles are provider-facing metadata and are kept verbatim.
    ip: str = Field(title="ip")  # proxy host address
    port: int = Field(title="端口")  # proxy port
    user: str = Field(title="IP代理认证的用户名")  # username for proxy auth
    protocol: str = Field(default="https://", title="代理IP的协议")  # URL scheme prefix, e.g. "https://"
    password: str = Field(title="IP代理认证用户的密码")  # password for proxy auth
    expired_time_ts: Optional[int] = Field(title="IP 过期时间")  # unix timestamp when the proxy expires
|
||
|
||
class ProxyProvider(ABC):
    @abstractmethod
    async def get_proxies(self, num: int) -> List[IpInfoModel]:
        """
        Abstract method each HTTP proxy vendor implementation must provide.
        (Annotation fixed: implementations return IpInfoModel objects, not raw dicts.)
        :param num: number of IPs to extract
        :return: unified IpInfoModel entries
        """
        pass
|
||
|
||
class JiSuHttpProxy(ProxyProvider):
    def __init__(self, exract_type: str, key: str, crypto: str, res_type: str, protocol: int, time: int):
        """
        JiSu HTTP proxy-IP provider implementation.
        Vendor site: https://www.jisuhttp.com/?pl=mAKphQ&plan=ZY&kd=Yang
        :param exract_type: extraction mode (only "API" is implemented)
        :param key: extraction key (obtained after registering on the vendor site)
        :param crypto: crypto signature (obtained after registering on the vendor site)
        :param res_type: response format: TXT or JSON
        :param protocol: IP protocol: 1 HTTP, 2 HTTPS, 3 SOCKS5
        :param time: proxy lifetime; 3/5/10/15/30 minutes supported
        """
        self.exract_type = exract_type
        self.api_path = "https://api.jisuhttp.com"
        self.params = {
            "key": key,
            "crypto": crypto,
            "type": res_type,
            "port": protocol,
            "time": time,
            "pw": "1",  # use user/password auth: 1 yes, 0 no (whitelist auth); default 0
            "se": "1",  # include IP expiry time in JSON response: 1 yes, 0 no; default 0
        }

    async def get_proxies(self, num: int) -> List[IpInfoModel]:
        """
        Fetch *num* proxy IPs from the JiSu HTTP API.
        :param num: number of IPs to extract
        :return: parsed IpInfoModel list; empty list for unsupported extract types
        :raises IpGetError: when the API responds with a non-zero code
        """
        if self.exract_type != "API":
            # Only the "API" extraction mode is implemented.
            return []
        uri = "/fetchips"
        # Build the query locally instead of mutating self.params, so repeated
        # calls with different *num* values cannot interfere with each other.
        params = dict(self.params, num=num)
        ip_infos: List[IpInfoModel] = []
        async with httpx.AsyncClient() as client:
            url = self.api_path + uri + '?' + urlencode(params)
            utils.logger.info(f"[JiSuHttpProxy] get ip proxy url:{url}")
            response = await client.get(url, headers={"User-Agent": "MediaCrawler"})
            res_dict: Dict = response.json()
            if res_dict.get("code") != 0:
                raise IpGetError(res_dict.get("msg", "unknown err"))
            # "data" may be absent/None on malformed responses; guard with or [].
            for ip_item in res_dict.get("data") or []:
                ip_infos.append(IpInfoModel(
                    ip=ip_item.get("ip"),
                    port=ip_item.get("port"),
                    user=ip_item.get("user"),
                    password=ip_item.get("pass"),
                    expired_time_ts=utils.get_unix_time_from_time_str(ip_item.get("expire")),
                ))
        return ip_infos
|
||
|
||
|
||
# NOTE(review): the original instantiation omitted the required
# ``exract_type`` positional argument, raising TypeError at import time.
# "API" is the only extraction mode get_proxies() implements.
IpProxy = JiSuHttpProxy(
    exract_type="API",
    key=os.getenv("jisu_key", ""),  # JiSu HTTP extraction key, via environment variable
    crypto=os.getenv("jisu_crypto", ""),  # JiSu HTTP crypto signature, via environment variable
    res_type="json",
    protocol=2,  # 2 = HTTPS
    time=30  # proxy lifetime in minutes
)

if __name__ == '__main__':
    # Manual smoke test: fetch one proxy and print it (requires valid env keys).
    _ip_infos = asyncio.run(IpProxy.get_proxies(1))
    print(_ip_infos)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# -*- coding: utf-8 -*- | ||
# @Author : relakkes@gmail.com | ||
# @Time : 2023/12/2 14:42 | ||
# @Desc : | ||
from unittest import IsolatedAsyncioTestCase | ||
|
||
from proxy.proxy_ip_pool import create_ip_pool | ||
from proxy.proxy_ip_provider import IpInfoModel | ||
|
||
|
||
class TestIpPool(IsolatedAsyncioTestCase):
    async def test_ip_pool(self):
        # Integration test: requires network access and valid JiSu HTTP
        # credentials in the environment (jisu_key / jisu_crypto).
        pool = await create_ip_pool(ip_pool_count=30, enable_validate_ip=False)
        # Drain exactly as many proxies as were loaded; each get_proxy()
        # call removes the returned entry from the pool.
        for i in range(30):
            ip_proxy_info: IpInfoModel = await pool.get_proxy()
            self.assertIsNotNone(ip_proxy_info.ip, msg="验证 ip 是否获取成功")
            print(ip_proxy_info)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
# -*- coding: utf-8 -*- | ||
# @Author : relakkes@gmail.com | ||
# @Time : 2023/12/2 12:53 | ||
# @Desc : 爬虫相关的工具函数 | ||
|
||
import base64 | ||
import random | ||
import re | ||
from io import BytesIO | ||
from typing import Dict, List, Optional, Tuple | ||
|
||
from PIL import Image, ImageDraw | ||
from playwright.async_api import Cookie, Page | ||
|
||
|
||
async def find_login_qrcode(page: Page, selector: str) -> str:
    """Find the login QR-code image under *selector* and return its ``src``.

    Returns an empty string when the selector never appears or any other
    Playwright error occurs (errors are printed, not raised).
    """
    try:
        elements = await page.wait_for_selector(
            selector=selector,
        )
        # NOTE(review): get_property returns a JSHandle; str() yields its
        # string representation — presumably the (base64 data-URI) "src"
        # value here. Confirm against Playwright's JSHandle semantics.
        login_qrcode_img = await elements.get_property("src")  # type: ignore

        return str(login_qrcode_img)

    except Exception as e:
        print(e)
        return ""
|
||
|
||
def show_qrcode(qr_code) -> None:  # type: ignore
    """Decode a base64 data-URI QR code and display it with a white border."""
    encoded = qr_code.split(",")[1]
    raw_bytes = base64.b64decode(encoded)
    qr_image = Image.open(BytesIO(raw_bytes))

    # Pad the QR code with a 10px white margin and draw a thin black outline
    # around it so phone cameras can lock onto the code more reliably.
    width, height = qr_image.size
    framed = Image.new('RGB', (width + 20, height + 20), color=(255, 255, 255))
    framed.paste(qr_image, (10, 10))
    ImageDraw.Draw(framed).rectangle((0, 0, width + 19, height + 19), outline=(0, 0, 0), width=1)
    framed.show()
|
||
|
||
def get_user_agent() -> str:
    """Return one desktop Chrome User-Agent string chosen at random."""
    ua_pool = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
    )
    return random.choice(ua_pool)
|
||
|
||
def convert_cookies(cookies: Optional[List[Cookie]]) -> Tuple[str, Dict]:
    """Flatten Playwright cookies into a header-style string and a name→value dict.

    Returns ("", {}) when *cookies* is None or empty.
    """
    if not cookies:
        return "", {}
    cookies_str = ";".join(f"{c.get('name')}={c.get('value')}" for c in cookies)
    # Later duplicates overwrite earlier ones, matching the original loop.
    cookie_dict = {c.get('name'): c.get('value') for c in cookies}
    return cookies_str, cookie_dict
|
||
|
||
def convert_str_cookie_to_dict(cookie_str: str) -> Dict:
    """
    Parse a "k1=v1; k2=v2" cookie header string into a dict.

    Bug fix: the original ``split("=")`` dropped any cookie whose value
    itself contains '=' (common with base64-encoded tokens); splitting on
    the first '=' only preserves such values. Entries without '=' are
    skipped, and the dead ``isinstance(..., list)`` branch was removed
    (``partition`` always yields strings).

    :param cookie_str: raw Cookie header value (may be empty)
    :return: mapping of cookie name to value
    """
    cookie_dict: Dict[str, str] = dict()
    if not cookie_str:
        return cookie_dict
    for cookie in cookie_str.split(";"):
        cookie = cookie.strip()
        if not cookie:
            continue
        name, sep, value = cookie.partition("=")
        if not sep:
            # No '=' at all: not a valid name/value pair.
            continue
        cookie_dict[name] = value
    return cookie_dict
|
||
|
||
def match_interact_info_count(count_str: str) -> int:
    """Extract the first run of digits from an interaction-count string.

    Returns 0 for empty/None input or when the string contains no digits.
    """
    if not count_str:
        return 0

    digits = re.search(r'\d+', count_str)
    return int(digits.group()) if digits else 0
Oops, something went wrong.