-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
729 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,238 @@ | ||
#!/usr/bin/env python | ||
''' | ||
-*- coding: utf-8 -*- | ||
@Time : 2024/6/4 16:51 | ||
@Author : Harvey | ||
@File : run.py | ||
''' | ||
''' | ||
目前仅能采集 产品详情 + 评论区的首页 | ||
评论区还要其他地方有检测的待处理 | ||
''' | ||
|
||
import random | ||
import re | ||
|
||
import ddddocr | ||
import requests | ||
|
||
from wbh_word.spider.Get_TJ_ip import ip_proxies | ||
|
||
# Domain -> [site label, cookie template] for every supported Amazon marketplace.
# Each cookie template has six "{}" placeholders (three for session-id, three for
# the ubid-* cookie); the cookies only stay valid for a limited time.
# NOTE(review): the bare value commented above each entry looks like the postal
# code / city used for that marketplace - TODO confirm.
amazon_site_info = {
    # 20500
    "https://www.amazon.com": ['美国站', 'i18n-prefs=USD; lc-main=en_US; session-id={}-{}-{}; ubid-main={}-{}-{}'],
    # NW1 6XE
    "https://www.amazon.co.uk": ['英国站', 'i18n-prefs=CNY; lc-acbuk=en_GB; session-id={}-{}-{}; ubid-acbuk={}-{}-{}'],
    # K1V 7P8
    "https://www.amazon.ca": ['加拿大站', 'i18n-prefs=CAD; lc-acbca=en_CA; session-id={}-{}-{}; ubid-acbca={}-{}-{}'],
    # 10115
    "https://www.amazon.de": ['德国站', 'lc-acbde=en_GB; i18n-prefs=CNY; session-id={}-{}-{}; ubid-acbde={}-{}-{}'],
    # 1011-1109
    "https://www.amazon.nl": ['荷兰站', 'i18n-prefs=EUR; lc-acbnl=en_GB; session-id={}-{}-{}; ubid-acbnl={}-{}-{}'],
    # 11455
    "https://www.amazon.se": ['瑞典站', 'i18n-prefs=SEK; lc-acbse=en_GB; session-id={}-{}-{}; ubid-acbse={}-{}-{}'],
    # 1930
    "https://www.amazon.com.be": ['比利时站',
                                  'i18n-prefs=EUR; lc-acbbe=en_GB; session-id={}-{}-{}; ubid-acbbe={}-{}-{}'],
    # 789680
    "https://www.amazon.sg": ['新加坡站', 'i18n-prefs=SGD; session-id={}-{}-{}; ubid-acbsg={}-{}-{}'],
    # 11433
    "https://www.amazon.sa": ['阿拉伯站', 'i18n-prefs=SAR; lc-acbsa=en_AE; session-id={}-{}-{}; ubid-acbsa={}-{}-{}'],
    # Dubai
    "https://www.amazon.ae": ['阿联酋站', 'i18n-prefs=USD; lc-acbae=en_AE; session-id={}-{}-{}; ubid-acbae={}-{}-{}'],
    # 999008
    "https://www.amazon.in": ['印度站', 'i18n-prefs=INR; lc-acbin=en_IN; session-id={}-{}-{}; ubid-acbin={}-{}-{}'],
    "https://www.amazon.eg": ['埃及站', 'i18n-prefs=EGP; lc-acbeg=en_AE; session-id={}-{}-{}; ubid-acbeg={}-{}-{}'],
    # 00144
    "https://www.amazon.it": ['意大利站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbit={}-{}-{}'],
    # 08358
    "https://www.amazon.es": ['西班牙站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbes={}-{}-{}'],
    # 10115
    "https://www.amazon.pl": ["波兰站", 'i18n-prefs=PLN; session-id={}-{}-{}; ubid-acbpl={}-{}-{}'],
    # 34000
    "https://www.amazon.com.tr": ["土耳其站", 'i18n-prefs=TRY; session-id={}-{}-{}; ubid-acbtr={}-{}-{}'],
    # 83331-000
    "https://www.amazon.com.br": ["巴西站", 'i18n-prefs=BRL; session-id={}-{}-{}; ubid-acbbr={}-{}-{}'],
    # 75020
    "https://www.amazon.fr": ["法国站", 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbfr={}-{}-{}'],
    # 01830
    "https://www.amazon.com.mx": ["墨西哥站", 'i18n-prefs=MXN; session-id={}-{}-{}; ubid-acbmx={}-{}-{}'],
    # 2600
    # Fixed: the key used to be the truncated domain "https://www.amazon.com.a".
    "https://www.amazon.com.au": ["澳大利亚站", 'i18n-prefs=AUD; session-id={}-{}-{}; ubid-acbau={}-{}-{}'],
}
|
||
|
||
def get_response_type(response):
    """Classify a product-page response into a numeric page type.

    :param response: object exposing ``status_code`` and ``text`` (e.g. a
        ``requests.Response``); ``None`` is accepted and treated as abnormal.
    :return: one of
        ``2``    - HTTP 404, the product is gone/expired;
        ``0``    - abnormal response (missing, status < 200 or >= 400);
        ``-1``   - request error (HTTP 302 or an Amazon error page);
        ``-2``   - captcha page;
        ``1``    - normal page (body longer than 150000 characters);
        ``9999`` - none of the above matched.
    """
    # A missing response must be handled before touching ``status_code``;
    # previously this case raised AttributeError instead of returning 0.
    if response is None:
        print("[Type]当前页面为-异常响应")
        return 0

    page_type = 9999  # default when no rule matches
    status = response.status_code
    if status == 404:
        # The product no longer exists.
        print("[Type]当前页面为-商品过期")
        page_type = 2
    elif status < 200 or status >= 400:
        # Abnormal response.
        print("[Type]当前页面为-异常响应")
        page_type = 0
    elif status == 302 or 'Sorry! Something went wrong' in response.text or (
            '请刷新页面并重试' in response.text):
        # Request error.
        print("[Type]当前页面为-请求错误")
        page_type = -1
    elif 'Enter the characters you see below' in response.text or (
            '/errors/validateCaptcha' in response.text):
        # Captcha challenge.
        print("[Type]当前页面为-验证码")
        page_type = -2
    elif len(response.text) > 150000:
        # Only a sufficiently large body is accepted as a real page.
        print("[Type]当前页面为-正常响应")
        page_type = 1
    return page_type
|
||
|
||
def random_amazon_headers():
    """Return request headers carrying a randomized Edge user-agent.

    The last two components of the ``Edg/111.0.x.y`` version are drawn at
    random on every call; every other header is fixed.

    :return: dict of header name -> value, with ``user-agent`` as the last key.
    """
    # Draw the two random version components in this order (build, then patch).
    build = random.randint(1000, 9999)
    patch = random.randint(10, 1000)
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.{}.{}"
    ).format(build, patch)

    # NOTE: no "sec-ch-ua" header is sent.
    return {
        "dpr": "1",
        "referer": "https://www.amazon.com",
        "sec-ch-viewport-width": "1912",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "viewport-width": "1912",
        "user-agent": user_agent,
    }
|
||
|
||
def updata_cookie(cookie_dict, meta):
    """Copy the captcha cookies from ``cookie_dict`` into ``meta['cookies']``.

    Only ``x-amz-captcha-1`` and ``x-amz-captcha-2`` are copied, and only when
    their value is non-empty.

    :param cookie_dict: cookies returned by the captcha verification request.
    :param meta: task state dict; its ``cookies`` entry is created if missing
        and updated in place.
    :return: ``(cookies, meta)`` - the updated cookie dict and the same meta.
    """
    jar = meta.get('cookies', {})
    for name in ('x-amz-captcha-1', 'x-amz-captcha-2'):
        value = cookie_dict.get(name, '')
        if value:
            jar[name] = value
    meta['cookies'] = jar
    return jar, meta
|
||
|
||
def get_img(response):
    """Download the captcha image found in a captcha page and OCR it.

    :param response: response whose ``text`` is the captcha page HTML.
    :return: ``(img_id, img_data)`` where ``img_id`` is the value of the hidden
        ``amzn`` form field and ``img_data`` is the lower-cased OCR result.
        ``(None, None)`` is returned when the page contains no captcha id or
        no image (previously this path raised UnboundLocalError).
    """
    import os  # local import: only needed to make sure the output dir exists

    id_matches = re.findall(r'name="amzn" value="(.*?)"', response.text)
    img_matches = re.findall(r'<img src="(.*?)">', response.text)
    if not (id_matches and img_matches):
        return None, None

    img_id = id_matches[0]
    img_url = img_matches[0]
    r = requests.get(img_url)

    # Keep a copy of the last captcha image on disk; create the folder first
    # so a fresh checkout does not fail with FileNotFoundError.
    img_path = './image/img.png'
    os.makedirs(os.path.dirname(img_path), exist_ok=True)
    with open(img_path, 'wb') as f:
        f.write(r.content)

    ocr = ddddocr.DdddOcr()
    img_data = ocr.classification(r.content).lower()
    return img_id, img_data
|
||
|
||
def run_verify(response, meta):
    """Solve the captcha shown in ``response`` and store the resulting cookies.

    The captcha answer is submitted to ``/errors/validateCaptcha`` on the same
    host as ``meta['msg_url']`` (previously always ``www.amazon.com``), so the
    other marketplaces work as well.

    :param response: the captcha page response.
    :param meta: task state dict; reads ``msg_url``, ``proxies``, ``headers``
        and ``cookies``, and gets its ``cookies`` updated on submission.
    :return: ``meta`` (updated in place when the captcha was submitted).
    """
    from urllib.parse import urlsplit  # local import keeps the block self-contained

    img_id, img_data = get_img(response)
    if not (img_id and img_data):
        return meta

    msg_url = meta.get('msg_url', '')
    proxies = meta.get('proxies', '')
    # As before, verification is only attempted when both a target URL and
    # proxies are configured.
    if not (msg_url and proxies):
        return meta

    parts = urlsplit(msg_url)
    if parts.scheme and parts.netloc:
        verify_url = '{}://{}/errors/validateCaptcha'.format(parts.scheme, parts.netloc)
        # "amzn-r" is the path (plus query) to return to after verification.
        url_href = parts.path
        if parts.query:
            url_href = url_href + '?' + parts.query
    else:
        # No scheme/host in the URL: fall back to the historical behaviour.
        verify_url = "https://www.amazon.com/errors/validateCaptcha"
        url_href = msg_url.split('amazon.com')[-1]

    params = {
        "amzn": img_id,
        "amzn-r": url_href,
        "field-keywords": img_data
    }
    print('[GET]正在请求验证码页, 验证码识别结果为:',img_data)
    # Redirects are not followed: only the cookies set by this response matter.
    verify_response = requests.get(verify_url, headers=meta.get('headers'), params=params,
                                   cookies=meta.get('cookies'), proxies=proxies,
                                   allow_redirects=False)
    _, meta = updata_cookie(dict(verify_response.cookies), meta)
    return meta
|
||
|
||
def get_product_detail(meta):
    """Fetch ``meta['msg_url']``, solving captchas until a normal page arrives.

    :param meta: task state dict; reads ``headers``, ``msg_url``, ``proxies``,
        ``cookies``, ``retry_count`` and ``max_retry``. ``retry_count`` is
        incremented on every captcha retry.
    :return: ``(response, meta)`` for a normal page, or ``(None, meta)`` once
        the captcha retry budget is used up. (The bare ``None`` returned
        before could not be unpacked by any caller.)
    :raises RuntimeError: when the response is neither a normal page nor a
        captcha page. (The original raised a plain string, which Python 3
        turns into a TypeError.)
    """
    # A loop replaces the former recursion; the request/verify sequence is the same.
    while True:
        response = requests.get(
            meta.get('msg_url', ''),
            headers=meta['headers'],
            cookies=meta.get('cookies', {}),
            proxies=meta.get('proxies', ''),
        )
        print('[GET]正在第 {} 次请求, 响应长度为:'.format(meta['retry_count'] + 1), len(response.text))

        response_type = get_response_type(response)
        if response_type == 1:
            # Normal page.
            return response, meta
        if response_type != -2:
            print(response.text)
            raise RuntimeError('超出预期的响应')

        # Captcha page: stop once the retry budget is exhausted.
        if meta['retry_count'] >= meta['max_retry']:
            print('[MAX_RETRY]超过最大重试次数')
            return None, meta

        meta = run_verify(response, meta)
        meta['retry_count'] = meta['retry_count'] + 1
        print('[RETRY]重试当前任务:', )
|
||
|
||
def run():
    """Crawl the sample product-detail and review URLs one after another."""
    url_list = [
        # Product detail pages
        "https://www.amazon.com/dp/B08DFLR38F",
        "https://www.amazon.com/TAISCAI-USB-Mount%EF%BC%8C18W-Dual-Waterproof/dp/B0CY99S3KN",
        "https://www.amazon.com/Wireless-Charging-Mag-Safe-Foldable-Magnetic/dp/B0CSP7KHD1",
        # Fixed: a missing trailing comma here used to glue this URL and the
        # review URL below into one invalid URL.
        "https://www.amazon.com/Charger-Hohosb-Adapter-Charging-More-White/dp/B0CZ3WXFX3",
        # Review page
        "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F",
    ]
    proxies = ip_proxies()
    # proxies = None
    meta = {
        'headers': random_amazon_headers(),
        'proxies': proxies,
        'max_retry': 3  # maximum number of captcha retries per URL
    }
    for url in url_list:
        meta['msg_url'] = url
        meta['retry_count'] = 0  # retries used so far for this URL
        print('[START]',url)
        result = get_product_detail(meta)
        if result is None:
            # Nothing usable came back for this URL; move on to the next one.
            continue
        response, meta = result


if __name__ == '__main__':
    run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# -*- coding: UTF-8 -*- | ||
''' | ||
@Project :wbh_pj | ||
@File :123.py | ||
@Author :hao | ||
@Date :2023/10/24 14:56 | ||
''' | ||
''' | ||
# 亚马逊所有页面都可以采集,最强的方案 | ||
# 目前只有 safari15_5 / safari15_3 指纹可以通过 | ||
''' | ||
# import requests | ||
from curl_cffi import requests | ||
from wbh_word.spider.Get_TJ_ip import ip_proxies | ||
|
||
# Browser-like request headers sent together with the impersonated TLS fingerprint.
headers = {
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,'
        'image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
    ),
    'accept-language': 'zh-CN,zh;q=0.9',
    'priority': 'u=0, i',
    'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
    ),
}

# Other targets that can be swapped in:
#   product detail:
#     url = "https://www.amazon.com/dp/B0CS28ZLWS"
#   reviews, page 2:
#     url = "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F/ref=cm_cr_arp_d_paging_btm_next_2?pageNumber=2"
# Current target: reviews, page 3.
url = (
    'https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly'
    '/product-reviews/B08DFLR38F/ref=cm_cr_getr_d_paging_btm_next_3?pageNumber=3'
)
proxies = ip_proxies()

# curl_cffi request that impersonates the Safari 15.3 fingerprint.
response = requests.get(
    url,
    headers=headers,
    proxies=proxies,
    impersonate='safari15_3',
)

print(response.text)
print(response)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# -*- coding: UTF-8 -*- | ||
''' | ||
@Project :wbh_pj | ||
@File :selenium_demo.py | ||
@Author :hao | ||
@Date :2023/10/23 16:49 | ||
''' | ||
''' | ||
需要过验证码,暂时不写 | ||
''' | ||
import time | ||
|
||
from selenium import webdriver | ||
|
||
|
||
def demo_run():
    """Open an Amazon product page in Chrome with Selenium's automation traits hidden.

    The browser is kept open for 123 seconds and is then always shut down,
    even when an earlier step fails (e.g. ``JS_2.js`` is missing); the driver
    was never released before.
    """
    url1 = 'https://www.amazon.com/dp/B0CS28ZLWS'

    # --- Everything below, up to driver.get(), hides Selenium's fingerprints ---
    chrome_options = webdriver.ChromeOptions()
    # Optional proxy:
    # chrome_options.add_argument("--proxy-server=http://114.230.23.140:3658")
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Core of the hiding: run the script from JS_2.js in every new
        # document before the page's own scripts execute.
        with open('JS_2.js') as f:
            js = f.read()
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument',
                               {'source': js})

        driver.get(url1)
        time.sleep(123)
    finally:
        # Release the browser and the driver process.
        driver.quit()


if __name__ == '__main__':
    demo_run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
# @Time : 2024/5/22 18:31 | ||
# @Author : Harvey | ||
# @File : 方案4_http2.py | ||
import httpx | ||
from urllib.parse import urlparse | ||
|
||
# Optional proxy setup (disabled):
# proxies = {
#     'http://': 'http://172.23.64.1:8888',
#     'https://': 'http://172.23.64.1:8888',
# }
# # Normalise the proxy keys into the proper URL form
# proxies = {urlparse(k).scheme + '://' + urlparse(k).netloc: v for k, v in proxies.items()}
# client = httpx.Client(http2=True, proxies=proxies, verify=False)

# HTTP/2-enabled client; from here on it is used the same way as requests.
client = httpx.Client(http2=True)

headers = {
    'dpr': '1',
    'referer': 'https://www.amazon.com',
    'sec-ch-ua': '"Chromium";v="124", "Microsoft Edge";v="124", "Not-A.Brand";v="99"',
    'sec-ch-viewport-width': '1912',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'
    ),
    'viewport-width': '1912',
}

# AJAX endpoint requested for ASIN B08DFLR38F.
url = (
    'https://www.amazon.com/gp/product/ajax/ref=dp_aod_NEW_mbc'
    '?asin=B08DFLR38F&m=&qid=&smid=&sourcecustomerorglistid='
    '&sourcecustomerorglistitemid=&sr=&pc=dp&experienceId=aodAjaxMain'
)
# url2 = '/gp/aag/main?ie=UTF8&seller=A3VQLMMKUUX89G&isAmazonFulfilled=1&asin=B08DFLR38F&ref_=olp_merch_name_2'

result = client.get(url, headers=headers)

print(result.text)
print(result)
Empty file.
Oops, something went wrong.