feat: add the new IP proxy implementation
NanmiCoder committed Dec 2, 2023
1 parent a8a4d34 commit 986179b
Showing 16 changed files with 562 additions and 267 deletions.
2 changes: 1 addition & 1 deletion base/base_crawler.py
@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod

-from base.proxy_account_pool import AccountPool
+from proxy.proxy_account_pool import AccountPool


class AbstractCrawler(ABC):
2 changes: 1 addition & 1 deletion main.py
@@ -4,10 +4,10 @@

import config
import db
-from base import proxy_account_pool
from media_platform.douyin import DouYinCrawler
from media_platform.kuaishou import KuaishouCrawler
from media_platform.xhs import XiaoHongShuCrawler
+from proxy import proxy_account_pool


class CrawlerFactory:
2 changes: 1 addition & 1 deletion media_platform/douyin/core.py
@@ -8,8 +8,8 @@

import config
from base.base_crawler import AbstractCrawler
-from base.proxy_account_pool import AccountPool
from models import douyin
+from proxy.proxy_account_pool import AccountPool
from tools import utils
from var import crawler_type_var

2 changes: 1 addition & 1 deletion media_platform/kuaishou/core.py
@@ -10,8 +10,8 @@

import config
from base.base_crawler import AbstractCrawler
-from base.proxy_account_pool import AccountPool
from models import kuaishou
+from proxy.proxy_account_pool import AccountPool
from tools import utils
from var import comment_tasks_var, crawler_type_var

2 changes: 1 addition & 1 deletion media_platform/kuaishou/graphql.py
@@ -1,6 +1,6 @@
# Kuaishou's data transfer is implemented on top of GraphQL
# This class is responsible for loading some of the GraphQL schemas
from typing import Dict
from typing import Dict


class KuaiShouGraphQL:
2 changes: 1 addition & 1 deletion media_platform/xhs/core.py
@@ -9,8 +9,8 @@

import config
from base.base_crawler import AbstractCrawler
-from base.proxy_account_pool import AccountPool
from models import xiaohongshu as xhs_model
+from proxy.proxy_account_pool import AccountPool
from tools import utils
from var import crawler_type_var

2 changes: 2 additions & 0 deletions models/kuaishou.py
@@ -61,6 +61,8 @@ def __str__(self):
async def update_kuaishou_video(video_item: Dict):
photo_info: Dict = video_item.get("photo", {})
video_id = photo_info.get("id")
if not video_id:
return
user_info = video_item.get("author", {})
local_db_item = {
"video_id": video_id,
4 changes: 4 additions & 0 deletions proxy/__init__.py
@@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/2 14:37
# @Desc :
5 changes: 5 additions & 0 deletions base/proxy_account_pool.py → proxy/proxy_account_pool.py
@@ -1,3 +1,8 @@
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/2 11:18
# @Desc : Account proxy pool that pairs each proxy IP with a phone number one-to-one

from typing import List, Optional, Set, Tuple

import config
89 changes: 89 additions & 0 deletions proxy/proxy_ip_pool.py
@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/2 13:45
# @Desc : IP proxy pool implementation
import random
from typing import List

import httpx
from tenacity import retry, stop_after_attempt, wait_fixed

from tools import utils

from .proxy_ip_provider import IpInfoModel, IpProxy


class ProxyIpPool:
def __init__(self, ip_pool_count: int, enable_validate_ip: bool) -> None:
        self.valid_ip_url = "https://httpbin.org/ip"  # URL used to check whether a proxy IP works
self.ip_pool_count = ip_pool_count
self.enable_validate_ip = enable_validate_ip
self.proxy_list: List[IpInfoModel] = []

async def load_proxies(self) -> None:
"""
        Fetch the list of proxy IPs from the HTTP proxy provider
:return:
"""
self.proxy_list = await IpProxy.get_proxies(self.ip_pool_count)

@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def is_valid_proxy(self, proxy: IpInfoModel) -> bool:
"""
        Check whether a proxy IP is valid
:param proxy:
:return:
"""
utils.logger.info(f"[ProxyIpPool.is_valid_proxy] testing {proxy.ip} is it valid ")
try:
httpx_proxy = f"{proxy.protocol}{proxy.ip}:{proxy.port}"
proxy_auth = httpx.BasicAuth(proxy.user, proxy.password)
async with httpx.AsyncClient(proxies={proxy.protocol: httpx_proxy}, auth=proxy_auth) as client:
response = await client.get(self.valid_ip_url)
if response.status_code == 200:
return True
else:
return False
except Exception as e:
utils.logger.info(f"[ProxyIpPool.is_valid_proxy] testing {proxy.ip} err: {e}")
raise e

@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def get_proxy(self) -> IpInfoModel:
"""
        Randomly pick one proxy IP from the pool
:return:
"""
if len(self.proxy_list) == 0:
await self.reload_proxies()

proxy = random.choice(self.proxy_list)
if self.enable_validate_ip:
if not await self.is_valid_proxy(proxy):
raise Exception("[ProxyIpPool.get_proxy] current ip invalid and again get it")
self.proxy_list.remove(proxy)
return proxy

async def reload_proxies(self):
"""
        Reload the proxy pool
:return:
"""
self.proxy_list = []
await self.load_proxies()


async def create_ip_pool(ip_pool_count: int, enable_validate_ip: bool) -> ProxyIpPool:
"""
    Create an IP proxy pool
    :param ip_pool_count: number of proxy IPs to load into the pool
    :param enable_validate_ip: whether to validate a proxy IP before handing it out
:return:
"""
pool = ProxyIpPool(ip_pool_count, enable_validate_ip)
await pool.load_proxies()
return pool


if __name__ == '__main__':
pass
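
As a quick orientation, here is a minimal usage sketch of the new pool (not part of this commit); it assumes the jisu_key and jisu_crypto environment variables are set so that IpProxy can actually fetch proxies:

import asyncio

from proxy.proxy_ip_pool import create_ip_pool


async def demo() -> None:
    # Build a small pool and pull one proxy from it; validation is skipped to keep the example short.
    pool = await create_ip_pool(ip_pool_count=5, enable_validate_ip=False)
    proxy = await pool.get_proxy()
    print(proxy.ip, proxy.port)


if __name__ == '__main__':
    asyncio.run(demo())
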
111 changes: 111 additions & 0 deletions proxy/proxy_ip_provider.py
@@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/2 11:18
# @Desc : Crawler proxy IP provider implementation
# @Url  : Currently implements the JiSu HTTP API; official site: https://www.jisuhttp.com/?pl=mAKphQ&plan=ZY&kd=Yang

import asyncio
import os
from abc import ABC, abstractmethod
from typing import Dict, List, Optional
from urllib.parse import urlencode

import httpx
from pydantic import BaseModel, Field

from tools import utils


class IpGetError(Exception):
""" ip get error"""


class IpInfoModel(BaseModel):
"""Unified IP model"""
ip: str = Field(title="ip")
    port: int = Field(title="port")
    user: str = Field(title="username for proxy IP authentication")
    protocol: str = Field(default="https://", title="protocol of the proxy IP")
    password: str = Field(title="password for proxy IP authentication")
    expired_time_ts: Optional[int] = Field(title="IP expiry timestamp")


class ProxyProvider(ABC):
@abstractmethod
async def get_proxies(self, num: int) -> List[Dict]:
"""
        Abstract method for fetching proxy IPs; each HTTP proxy provider must implement it
        :param num: number of IPs to extract
:return:
"""
pass


class JiSuHttpProxy(ProxyProvider):
def __init__(self, exract_type: str, key: str, crypto: str, res_type: str, protocol: int, time: int):
"""
        JiSu HTTP proxy IP provider implementation
        Official site: https://www.jisuhttp.com/?pl=mAKphQ&plan=ZY&kd=Yang
        :param exract_type: extraction mode
        :param key: extraction key (register on the site above to obtain it)
        :param crypto: crypto signature (register on the site above to obtain it)
        :param res_type: response data format: TXT or JSON
        :param protocol: IP protocol: 1: HTTP, 2: HTTPS, 3: SOCKS5
        :param time: IP validity period; 3, 5, 10, 15 or 30 minutes are supported
"""
self.exract_type = exract_type
self.api_path = "https://api.jisuhttp.com"
self.params = {
"key": key,
"crypto": crypto,
"type": res_type,
"port": protocol,
"time": time,
"pw": "1", # 是否使用账密验证, 1:是,0:否,否表示白名单验证;默认为0
"se": "1", # 返回JSON格式时是否显示IP过期时间, 1:显示,0:不显示;默认为0
}

async def get_proxies(self, num: int) -> List[IpInfoModel]:
"""
        :param num: number of proxy IPs to fetch
:return:
"""
if self.exract_type == "API":
uri = "/fetchips"
self.params.update({"num": num})
ip_infos = []
async with httpx.AsyncClient() as client:
url = self.api_path + uri + '?' + urlencode(self.params)
utils.logger.info(f"[JiSuHttpProxy] get ip proxy url:{url}")
response = await client.get(url, headers={"User-Agent": "MediaCrawler"})
res_dict: Dict = response.json()
if res_dict.get("code") == 0:
data: List[Dict] = res_dict.get("data")
for ip_item in data:
ip_info_model = IpInfoModel(
ip=ip_item.get("ip"),
port=ip_item.get("port"),
user=ip_item.get("user"),
password=ip_item.get("pass"),
expired_time_ts=utils.get_unix_time_from_time_str(ip_item.get("expire"))
)
ip_infos.append(ip_info_model)
else:
                    raise IpGetError(res_dict.get("msg", "unknown err"))
return ip_infos
        else:
            raise IpGetError(f"unsupported extract type: {self.exract_type}")



IpProxy = JiSuHttpProxy(
    exract_type="API",
    key=os.getenv("jisu_key", ""),  # JiSu HTTP extraction key, read from an environment variable
    crypto=os.getenv("jisu_crypto", ""),  # JiSu HTTP crypto signature, read from an environment variable
res_type="json",
protocol=2,
time=30
)

if __name__ == '__main__':
_ip_infos = asyncio.run(IpProxy.get_proxies(1))
print(_ip_infos)
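
Because ProxyProvider is an abstract base class, other vendors can be plugged in by implementing get_proxies. A hypothetical sketch (the StaticProxyProvider name and its fixed list are invented for illustration, not part of this commit):

from typing import List

from proxy.proxy_ip_provider import IpInfoModel, ProxyProvider


class StaticProxyProvider(ProxyProvider):
    """Hypothetical provider that serves proxies from a locally configured list."""

    def __init__(self, proxies: List[IpInfoModel]):
        self._proxies = proxies

    async def get_proxies(self, num: int) -> List[IpInfoModel]:
        # Hand back at most `num` proxies from the static list.
        return self._proxies[:num]
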
17 changes: 17 additions & 0 deletions test/test_proxy_ip_pool.py
@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/2 14:42
# @Desc :
from unittest import IsolatedAsyncioTestCase

from proxy.proxy_ip_pool import create_ip_pool
from proxy.proxy_ip_provider import IpInfoModel


class TestIpPool(IsolatedAsyncioTestCase):
async def test_ip_pool(self):
pool = await create_ip_pool(ip_pool_count=30, enable_validate_ip=False)
for i in range(30):
ip_proxy_info: IpInfoModel = await pool.get_proxy()
            self.assertIsNotNone(ip_proxy_info.ip, msg="check that a proxy IP was fetched successfully")
print(ip_proxy_info)
94 changes: 94 additions & 0 deletions tools/crawler_util.py
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# @Author : relakkes@gmail.com
# @Time : 2023/12/2 12:53
# @Desc : Crawler-related utility functions

import base64
import random
import re
from io import BytesIO
from typing import Dict, List, Optional, Tuple

from PIL import Image, ImageDraw
from playwright.async_api import Cookie, Page


async def find_login_qrcode(page: Page, selector: str) -> str:
"""find login qrcode image from target selector"""
try:
elements = await page.wait_for_selector(
selector=selector,
)
login_qrcode_img = await elements.get_property("src") # type: ignore
return str(login_qrcode_img)

except Exception as e:
print(e)
return ""


def show_qrcode(qr_code) -> None: # type: ignore
"""parse base64 encode qrcode image and show it"""
qr_code = qr_code.split(",")[1]
qr_code = base64.b64decode(qr_code)
image = Image.open(BytesIO(qr_code))

# Add a square border around the QR code and display it within the border to improve scanning accuracy.
width, height = image.size
new_image = Image.new('RGB', (width + 20, height + 20), color=(255, 255, 255))
new_image.paste(image, (10, 10))
draw = ImageDraw.Draw(new_image)
draw.rectangle((0, 0, width + 19, height + 19), outline=(0, 0, 0), width=1)
new_image.show()


def get_user_agent() -> str:
ua_list = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"
]
return random.choice(ua_list)


def convert_cookies(cookies: Optional[List[Cookie]]) -> Tuple[str, Dict]:
if not cookies:
return "", {}
cookies_str = ";".join([f"{cookie.get('name')}={cookie.get('value')}" for cookie in cookies])
cookie_dict = dict()
for cookie in cookies:
cookie_dict[cookie.get('name')] = cookie.get('value')
return cookies_str, cookie_dict


def convert_str_cookie_to_dict(cookie_str: str) -> Dict:
cookie_dict: Dict[str, str] = dict()
if not cookie_str:
return cookie_dict
for cookie in cookie_str.split(";"):
cookie = cookie.strip()
if not cookie:
continue
cookie_list = cookie.split("=")
if len(cookie_list) != 2:
continue
cookie_value = cookie_list[1]
if isinstance(cookie_value, list):
cookie_value = "".join(cookie_value)
cookie_dict[cookie_list[0]] = cookie_value
return cookie_dict


def match_interact_info_count(count_str: str) -> int:
if not count_str:
return 0

match = re.search(r'\d+', count_str)
if match:
number = match.group()
return int(number)
else:
return 0
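
A short sketch of how these helpers might be combined inside a crawler (not part of this commit; browser_context stands for an existing Playwright BrowserContext):

from playwright.async_api import BrowserContext

from tools.crawler_util import convert_cookies, get_user_agent, match_interact_info_count


async def build_headers(browser_context: BrowserContext) -> dict:
    # Turn the Playwright cookies into a header-ready string.
    cookie_str, _ = convert_cookies(await browser_context.cookies())
    return {"User-Agent": get_user_agent(), "Cookie": cookie_str}


# match_interact_info_count extracts the first run of digits from a count string.
assert match_interact_info_count("3456 likes") == 3456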