diff --git "a/\344\272\232\351\251\254\351\200\212/README.md" "b/\344\272\232\351\251\254\351\200\212/README.md"
new file mode 100644
index 0000000..e69de29
diff --git "a/\344\272\232\351\251\254\351\200\212/image/img.png" "b/\344\272\232\351\251\254\351\200\212/image/img.png"
new file mode 100644
index 0000000..87eb769
Binary files /dev/null and "b/\344\272\232\351\251\254\351\200\212/image/img.png" differ
diff --git "a/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2101_\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201.py" "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2101_\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201.py"
new file mode 100644
index 0000000..8cf63ff
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2101_\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201.py"
@@ -0,0 +1,238 @@
+#!/usr/bin/env python
+'''
+-*- coding: utf-8 -*-
+@Time : 2024/6/4 16:51
+@Author : Harvey
+@File : run.py
+'''
+'''
+目前仅能采集 产品详情 + 评论区的首页
+评论区还要其他地方有检测的待处理
+'''
+
+import random
+import re
+
+import ddddocr
+import requests
+
+from wbh_word.spider.Get_TJ_ip import ip_proxies
+
+# Domain -> [site name, cookie template] for every supported site (cookies expire); the short per-entry comments look like postal codes - TODO confirm
+amazon_site_info = {
+    # 20500
+    "https://www.amazon.com": ['美国站', 'i18n-prefs=USD; lc-main=en_US; session-id={}-{}-{}; ubid-main={}-{}-{}'],
+    # NW1 6XE
+    "https://www.amazon.co.uk": ['英国站', 'i18n-prefs=CNY; lc-acbuk=en_GB; session-id={}-{}-{}; ubid-acbuk={}-{}-{}'],
+    # K1V 7P8
+    "https://www.amazon.ca": ['加拿大站', 'i18n-prefs=CAD; lc-acbca=en_CA; session-id={}-{}-{}; ubid-acbca={}-{}-{}'],
+    # 10115
+    "https://www.amazon.de": ['德国站', 'lc-acbde=en_GB; i18n-prefs=CNY; session-id={}-{}-{}; ubid-acbde={}-{}-{}'],
+    # 1011-1109
+    "https://www.amazon.nl": ['荷兰站', 'i18n-prefs=EUR; lc-acbnl=en_GB; session-id={}-{}-{}; ubid-acbnl={}-{}-{}'],
+    # 11455
+    "https://www.amazon.se": ['瑞典站', 'i18n-prefs=SEK; lc-acbse=en_GB; session-id={}-{}-{}; ubid-acbse={}-{}-{}'],
+    # 1930
+    "https://www.amazon.com.be": ['比利时站',
+                                  'i18n-prefs=EUR; lc-acbbe=en_GB; session-id={}-{}-{}; ubid-acbbe={}-{}-{}'],
+    # 789680
+    "https://www.amazon.sg": ['新加坡站', 'i18n-prefs=SGD; session-id={}-{}-{}; ubid-acbsg={}-{}-{}'],
+    # 11433
+    "https://www.amazon.sa": ['阿拉伯站', 'i18n-prefs=SAR; lc-acbsa=en_AE; session-id={}-{}-{}; ubid-acbsa={}-{}-{}'],
+    # Dubai
+    "https://www.amazon.ae": ['阿联酋站', 'i18n-prefs=USD; lc-acbae=en_AE; session-id={}-{}-{}; ubid-acbae={}-{}-{}'],
+    # 999008
+    "https://www.amazon.in": ['印度站', 'i18n-prefs=INR; lc-acbin=en_IN; session-id={}-{}-{}; ubid-acbin={}-{}-{}'],
+    "https://www.amazon.eg": ['埃及站', 'i18n-prefs=EGP; lc-acbeg=en_AE; session-id={}-{}-{}; ubid-acbeg={}-{}-{}'],
+    # 00144
+    "https://www.amazon.it": ['意大利站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbit={}-{}-{}'],
+    # 08358
+    "https://www.amazon.es": ['西班牙站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbes={}-{}-{}'],
+    # 10115
+    "https://www.amazon.pl": ["波兰站", 'i18n-prefs=PLN; session-id={}-{}-{}; ubid-acbpl={}-{}-{}'],
+    # 34000
+    "https://www.amazon.com.tr": ["土耳其站", 'i18n-prefs=TRY; session-id={}-{}-{}; ubid-acbtr={}-{}-{}'],
+    # 83331-000
+    "https://www.amazon.com.br": ["巴西站", 'i18n-prefs=BRL; session-id={}-{}-{}; ubid-acbbr={}-{}-{}'],
+    # 75020
+    "https://www.amazon.fr": ["法国站", 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbfr={}-{}-{}'],
+    # 01830
+    "https://www.amazon.com.mx": ["墨西哥站", 'i18n-prefs=MXN; session-id={}-{}-{}; ubid-acbmx={}-{}-{}'],
+    # 2600 (the key previously ended in ".com.a", which is not the Australian domain)
+    "https://www.amazon.com.au": ["澳大利亚站", 'i18n-prefs=AUD; session-id={}-{}-{}; ubid-acbau={}-{}-{}'],
+}
+
+
+def get_response_type(response):
+    """Classify a response: 2 expired, 1 normal, 0 abnormal, -1 request error, -2 captcha, 9999 unrecognised."""
+    response_type = 9999
+    # Guard against a missing response before touching status_code.
+    if response is not None and response.status_code == 404:
+        # 404 is treated as an expired product.
+        print("[Type]当前页面为-商品过期")
+        response_type = 2
+    elif not response or response.status_code < 200 or response.status_code >= 400:
+        # Missing/falsy response, or a status outside 200-399.
+        print("[Type]当前页面为-异常响应")
+        response_type = 0
+    elif response.status_code == 302 or 'Sorry! Something went wrong' in response.text or '请刷新页面并重试' in response.text:
+        # Redirect or an error page.
+        print("[Type]当前页面为-请求错误")
+        response_type = -1
+    elif re.search(r'Enter the characters you see below', response.text) or '/errors/validateCaptcha' in response.text:
+        # Captcha page.
+        print("[Type]当前页面为-验证码")
+        response_type = -2
+    elif len(response.text) > 150000:
+        print("[Type]当前页面为-正常响应")
+        response_type = 1
+    return response_type
+
+
+def random_amazon_headers():
+    """Return the fixed request headers plus a user-agent whose last two Edge version parts are random."""
+    edge_build = random.randint(1000, 9999)
+    edge_patch = random.randint(10, 1000)
+    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.{}.{}".format(edge_build, edge_patch)
+    return {
+        "dpr": "1",
+        "referer": "https://www.amazon.com",
+        "sec-ch-viewport-width": "1912",
+        "sec-fetch-mode": "navigate",
+        "sec-fetch-site": "same-origin",
+        "sec-fetch-user": "?1",
+        "upgrade-insecure-requests": "1",
+        "viewport-width": "1912",
+        "user-agent": user_agent,
+    }
+
+
+def updata_cookie(cookie_dict, meta):
+    '''
+    Copy the non-empty captcha cookies from ``cookie_dict`` into ``meta['cookies']``.
+    :param cookie_dict: cookie name -> value mapping to read from.
+    :param meta: task dict; its 'cookies' entry is created when absent and updated in place.
+    :return: tuple ``(cookies, meta)``.
+    '''
+    cookies = meta.get('cookies', {})
+    for name in ('x-amz-captcha-1', 'x-amz-captcha-2'):
+        value = cookie_dict.get(name, '')
+        if value:
+            cookies[name] = value
+    meta['cookies'] = cookies
+    return cookies, meta
+
+
+def get_img(response):
+    '''
+    Download the captcha image referenced by the page and run OCR on it.
+    :param response: response whose ``text`` is the captcha page HTML.
+    :return: ``(img_id, img_data)`` - the hidden "amzn" form value and the lower-cased OCR text,
+        or ``(None, None)`` when either the form value or the image URL is not found.
+    '''
+    img_id = re.findall(r'name="amzn" value="(.*?)"', response.text)
+    img = re.findall(r'', response.text)  # NOTE(review): this pattern is empty - confirm the intended image-URL regex
+    if not (img_id and img):
+        return None, None
+    img_url = img[0]
+    img_id = img_id[0]
+    r = requests.get(img_url)
+    img_path = './image/img.png'
+    ocr = ddddocr.DdddOcr()
+    with open(img_path, 'wb') as f:
+        f.write(r.content)
+    img_data = ocr.classification(r.content)
+    return img_id, img_data.lower()
+
+
+def run_verify(response, meta):
+    '''
+    Submit the OCR result for the captcha page and store the returned captcha cookies in ``meta``.
+    :param response: the captcha page response.
+    :param meta: task dict (reads 'msg_url', 'proxies', 'headers', 'cookies').
+    :return: ``meta``; unchanged when captcha data, target URL or proxies are missing.
+    '''
+    verify_url = "https://www.amazon.com/errors/validateCaptcha"
+    captcha = get_img(response) or (None, None)  # tolerate a bare None result from get_img
+    img_id, img_data = captcha
+    msg_url = meta.get('msg_url', '')
+    proxies = meta.get('proxies', '')
+    # Nothing to submit without captcha data, a target URL and proxies.
+    if not (img_id and img_data and msg_url and proxies):
+        return meta
+    params = {
+        "amzn": img_id,
+        "amzn-r": msg_url.split('amazon.com')[-1],  # everything after 'amazon.com' in the target URL
+        "field-keywords": img_data
+    }
+    print('[GET]正在请求验证码页, 验证码识别结果为:',img_data)
+    # Redirects stay disabled so the cookies on the verification response itself can be read.
+    response = requests.get(verify_url, headers=meta.get('headers'), params=params, cookies=meta.get('cookies'),
+                            proxies=proxies, allow_redirects=False)
+    _, meta = updata_cookie(dict(response.cookies), meta)
+    return meta
+
+
+def get_product_detail(meta):
+    '''
+    Fetch ``meta['msg_url']``, going through the captcha flow and retrying when a captcha page comes back.
+    :param meta: task dict (reads 'headers', 'msg_url', 'proxies', 'cookies', 'retry_count', 'max_retry').
+    :return: ``(response, meta)``; ``response`` is ``None`` once ``max_retry`` is exceeded.
+    :raises RuntimeError: when the response is neither a normal page nor a captcha page.
+    '''
+    headers = meta['headers']
+    response = requests.get(meta.get('msg_url', ''), headers=headers,
+                            cookies=meta.get('cookies', {}), proxies=meta.get('proxies', ''))
+
+    print('[GET]正在第 {} 次请求, 响应长度为:'.format(meta['retry_count'] + 1), len(response.text))
+    response_type = get_response_type(response)
+    if response_type == 1:
+        # Normal page: hand it back as-is.
+        return response, meta
+
+    if response_type != -2:
+        print(response.text)
+        # Raising a plain string is itself a TypeError; use a real exception type.
+        raise RuntimeError('超出预期的响应')
+
+    # Captcha page: give up once the retry budget is spent.
+    if meta['retry_count'] >= meta['max_retry']:
+        print('[MAX_RETRY]超过最大重试次数')
+        # Keep the two-value shape so callers can still unpack the result.
+        return None, meta
+
+    # Solve the captcha, count the attempt, then retry the same URL.
+    meta = run_verify(response, meta)
+    meta['retry_count'] += 1
+    print('[RETRY]重试当前任务:', )
+    return get_product_detail(meta)
+
+
+def run():
+    """Fetch every sample URL once, sharing one set of headers, proxies and captcha cookies."""
+    url_list = [
+        # Product detail pages
+        "https://www.amazon.com/dp/B08DFLR38F",
+        "https://www.amazon.com/TAISCAI-USB-Mount%EF%BC%8C18W-Dual-Waterproof/dp/B0CY99S3KN",
+        "https://www.amazon.com/Wireless-Charging-Mag-Safe-Foldable-Magnetic/dp/B0CSP7KHD1",
+        "https://www.amazon.com/Charger-Hohosb-Adapter-Charging-More-White/dp/B0CZ3WXFX3",  # comma was missing, gluing this URL to the next
+        # Review pages
+        "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F",
+    ]
+    proxies = ip_proxies()
+    meta = {
+        'headers': random_amazon_headers(),
+        'proxies': proxies,
+        'max_retry': 3  # maximum number of retries per URL
+    }
+    for url in url_list:
+        meta['msg_url'] = url
+        meta['retry_count'] = 0  # retries used so far for this URL
+        print('[START]',url)
+        result = get_product_detail(meta)
+        if result is not None:  # keep the returned meta; tolerate a bare None result
+            meta = result[1]
+
+
+if __name__ == '__main__':
+    run()
diff --git "a/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2102_\347\252\201\347\240\264\346\214\207\347\272\271\346\243\200\346\265\213.py" "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2102_\347\252\201\347\240\264\346\214\207\347\272\271\346\243\200\346\265\213.py"
new file mode 100644
index 0000000..510d122
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2102_\347\252\201\347\240\264\346\214\207\347\272\271\346\243\200\346\265\213.py"
@@ -0,0 +1,42 @@
+# -*- coding: UTF-8 -*-
+'''
+@Project :wbh_pj
+@File :123.py
+@Author :hao
+@Date :2023/10/24 14:56
+'''
+'''
+# 亚马逊所有页面都可以采集,最强的方案
+# 目前只有 safari15_5 / safari15_3 指纹可以通过
+'''
+# import requests
+from curl_cffi import requests
+from wbh_word.spider.Get_TJ_ip import ip_proxies
+
+headers = {
+ "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+ "accept-language": "zh-CN,zh;q=0.9",
+ "priority": "u=0, i",
+ "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+ "sec-ch-ua-mobile": "?0",
+ "sec-ch-ua-platform": "\"Windows\"",
+ "sec-fetch-dest": "document",
+ "sec-fetch-mode": "navigate",
+ "sec-fetch-site": "none",
+ "sec-fetch-user": "?1",
+ "upgrade-insecure-requests": "1",
+ "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+}
+# Product detail page
+# url = "https://www.amazon.com/dp/B0CS28ZLWS"
+# Review pages
+# url = "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F/ref=cm_cr_arp_d_paging_btm_next_2?pageNumber=2"
+url = "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F/ref=cm_cr_getr_d_paging_btm_next_3?pageNumber=3"
+proxies = ip_proxies()
+
+response = requests.get(url, headers=headers, proxies=proxies, impersonate="safari15_3")  # `requests` here is curl_cffi's; impersonate names the browser fingerprint to present
+
+print(response.text)
+print(response)
+
+
diff --git "a/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2103_\350\207\252\345\212\250\345\214\226\347\210\254\345\217\226.py" "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2103_\350\207\252\345\212\250\345\214\226\347\210\254\345\217\226.py"
new file mode 100644
index 0000000..ecf49c9
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2103_\350\207\252\345\212\250\345\214\226\347\210\254\345\217\226.py"
@@ -0,0 +1,37 @@
+# -*- coding: UTF-8 -*-
+'''
+@Project :wbh_pj
+@File :selenium_demo.py
+@Author :hao
+@Date :2023/10/23 16:49
+'''
+'''
+需要过验证码,暂时不写
+'''
+import time
+
+from selenium import webdriver
+
+
+def demo_run():
+    """Open one product page in Chrome with the automation switches disabled and the JS_2.js script injected."""
+    url1 = 'https://www.amazon.com/dp/B0CS28ZLWS'
+    # Options that remove the usual Selenium automation markers.
+    chrome_options = webdriver.ChromeOptions()
+    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
+    chrome_options.add_experimental_option('useAutomationExtension', False)
+    # Read the script first so a missing file cannot leave a browser running.
+    with open('JS_2.js') as f:
+        js = f.read()
+    driver = webdriver.Chrome(options=chrome_options)
+    try:
+        # Register the script for evaluation on every new document.
+        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': js})
+        driver.get(url1)
+        time.sleep(123)
+    finally:
+        driver.quit()  # always release the browser/driver process, even if loading fails
+
+
+if __name__ == '__main__':
+    demo_run()
diff --git "a/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2104_http2.py" "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2104_http2.py"
new file mode 100644
index 0000000..00dd05a
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2104_http2.py"
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/5/22 18:31
+# @Author : Harvey
+# @File : 方案4_http2.py
+import httpx
+from urllib.parse import urlparse
+
+# proxies = {
+# 'http://': 'http://172.23.64.1:8888',
+# 'https://': 'http://172.23.64.1:8888',
+# }
+# # Normalise the proxy keys into proper URL form
+# proxies = {urlparse(k).scheme + '://' + urlparse(k).netloc: v for k, v in proxies.items()}
+
+# client = httpx.Client(http2=True, proxies=proxies, verify=False)
+client = httpx.Client(http2=True)
+
+# From here on the client is used the same way as requests
+
+headers = {
+ "dpr": "1",
+ "referer": "https://www.amazon.com",
+ "sec-ch-ua": "\"Chromium\";v=\"124\", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+ "sec-ch-viewport-width": "1912",
+ "sec-fetch-mode": "navigate",
+ "sec-fetch-site": "same-origin",
+ "sec-fetch-user": "?1",
+ "upgrade-insecure-requests": "1",
+ "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
+ "viewport-width": "1912"
+}
+
+
+url = 'https://www.amazon.com/gp/product/ajax/ref=dp_aod_NEW_mbc?asin=B08DFLR38F&m=&qid=&smid=&sourcecustomerorglistid=&sourcecustomerorglistitemid=&sr=&pc=dp&experienceId=aodAjaxMain'
+# url2 = '/gp/aag/main?ie=UTF8&seller=A3VQLMMKUUX89G&isAmazonFulfilled=1&asin=B08DFLR38F&ref_=olp_merch_name_2'
+
+result = client.get(url, headers=headers)  # NOTE(review): the client is never closed in this script
+
+
+print(result.text)
+print(result)
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/README.md" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/README.md"
new file mode 100644
index 0000000..e69de29
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_detail.py" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_detail.py"
new file mode 100644
index 0000000..3967544
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_detail.py"
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/5/18 11:51
+# @Author : Harvey
+# @File : get_detail.py
+import requests
+from wbh_word.spider.Get_TJ_ip import ip_proxies
+
+headers = {
+ "dpr": "1",
+ "referer": "https://www.amazon.com",
+ "sec-ch-ua": "\"Chromium\";v=\"124\", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+ "sec-ch-viewport-width": "1912",
+ "sec-fetch-mode": "navigate",
+ "sec-fetch-site": "same-origin",
+ "sec-fetch-user": "?1",
+ "upgrade-insecure-requests": "1",
+ "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
+ "viewport-width": "1912"
+}
+cookies = {
+ # "csm-sid": "713-2299262-7567932",
+ "x-amz-captcha-1": "1717480655829841",
+ "x-amz-captcha-2": "THz/dORI7f9MPEYpk6zAOQ==",
+ "session-id": "145-8954814-3186315",
+ "session-id-time": "2082787201l",
+ # The fields above are the core ones; they become available once the captcha has been passed
+
+ "i18n-prefs": "USD",
+ "lc-main": "zh_CN",
+ "sp-cdn": "\"L5Z9:CN\"",
+ # "ubid-main": "132-1464863-8061124",
+ # "session-token": "<redacted>",
+ # "csm-hit": "tb:7NNEK8EZX7MDKY0R3SZH+s-7NNEK8EZX7MDKY0R3SZH|1715936997814&t:1715936997814&adb:adblk_no"
+}
+# url = "https://www.amazon.com/Munchkin%C2%AE-Brica%C2%AE-Stroller-Organizer-Bag/dp/B0BPMQQN6M"
+url = "https://www.amazon.com/dp/B08DFLR38F"
+
+proxies = ip_proxies()
+# proxies = {'http': 'http://USER:PASSWORD@HOST:PORT', 'https': 'http://USER:PASSWORD@HOST:PORT'}  # credentials redacted; keep real ones out of version control
+response = requests.get(url, headers=headers, cookies=cookies,proxies=proxies)
+print(response.text)
+print(len(response.text))
+print(response.cookies)
+print(response.headers)
\ No newline at end of file
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_img.py" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_img.py"
new file mode 100644
index 0000000..a5b6be8
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_img.py"
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/5/18 11:46
+# @Author : Harvey
+# @File : get_img.py
+import requests
+
+
+headers = {
+ "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+ "accept-language": "zh-CN,zh;q=0.9",
+ "priority": "u=0, i",
+ "referer": "https://www.amazon.com/dp/B0CS28ZLWS",
+ "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+ "sec-ch-ua-mobile": "?0",
+ "sec-ch-ua-platform": "\"Windows\"",
+ "sec-fetch-dest": "document",
+ "sec-fetch-mode": "navigate",
+ "sec-fetch-site": "same-origin",
+ "sec-fetch-user": "?1",
+ "upgrade-insecure-requests": "1",
+ "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+}
+cookies = {
+ # "csm-sid": "602-5089573-6274975"
+}
+url = "https://www.amazon.com/errors/validateCaptcha"
+params = {
+ "amzn": "b+TKPZCS+956d3A6Vjh14g==",
+ "amzn-r": "/dp/B08DFLR38F",
+ "field-keywords": "jtymlx"
+}
+proxies = {'http': 'http://613706c5ede9d:kF0C7UslCBzxXdt@121.206.142.66:3328', 'https': 'http://613706c5ede9d:kF0C7UslCBzxXdt@121.206.142.66:3328'}  # NOTE(review): hard-coded proxy credentials - move to configuration and rotate them
+# A successful request returns 302; automatic redirects must be disabled so the cookies can be read first, otherwise the request is redirected on to the detail page
+response = requests.get(url, headers=headers, cookies=cookies, params=params,proxies=proxies, allow_redirects=False)
+
+# print(response.text)
+print(len(response.text))
+print(response.cookies)
+print(response.headers)
+print(response.status_code)
+# A 302 status here means the request succeeded
+
+print("======= 重定向 =======")
+response = requests.get(url, headers=headers, cookies=cookies, params=params,proxies=proxies)
+print(len(response.text))
+print(response.cookies)
+print(response.headers)
+print(response.status_code)
\ No newline at end of file
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img.png" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img.png"
new file mode 100644
index 0000000..7d0dace
Binary files /dev/null and "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img.png" differ
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img2222.png" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img2222.png"
new file mode 100644
index 0000000..91af59d
Binary files /dev/null and "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img2222.png" differ
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/run.py" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/run.py"
new file mode 100644
index 0000000..c67f903
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/run.py"
@@ -0,0 +1,231 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/6/4 16:51
+# @Author : Harvey
+# @File : run.py
+import random
+import re
+
+import ddddocr
+import requests
+
+from wbh_word.spider.Get_TJ_ip import ip_proxies
+
+# Domain -> [site name, cookie template] for every supported site (cookies expire); the short per-entry comments look like postal codes - TODO confirm
+amazon_site_info = {
+    # 20500
+    "https://www.amazon.com": ['美国站', 'i18n-prefs=USD; lc-main=en_US; session-id={}-{}-{}; ubid-main={}-{}-{}'],
+    # NW1 6XE
+    "https://www.amazon.co.uk": ['英国站', 'i18n-prefs=CNY; lc-acbuk=en_GB; session-id={}-{}-{}; ubid-acbuk={}-{}-{}'],
+    # K1V 7P8
+    "https://www.amazon.ca": ['加拿大站', 'i18n-prefs=CAD; lc-acbca=en_CA; session-id={}-{}-{}; ubid-acbca={}-{}-{}'],
+    # 10115
+    "https://www.amazon.de": ['德国站', 'lc-acbde=en_GB; i18n-prefs=CNY; session-id={}-{}-{}; ubid-acbde={}-{}-{}'],
+    # 1011-1109
+    "https://www.amazon.nl": ['荷兰站', 'i18n-prefs=EUR; lc-acbnl=en_GB; session-id={}-{}-{}; ubid-acbnl={}-{}-{}'],
+    # 11455
+    "https://www.amazon.se": ['瑞典站', 'i18n-prefs=SEK; lc-acbse=en_GB; session-id={}-{}-{}; ubid-acbse={}-{}-{}'],
+    # 1930
+    "https://www.amazon.com.be": ['比利时站',
+                                  'i18n-prefs=EUR; lc-acbbe=en_GB; session-id={}-{}-{}; ubid-acbbe={}-{}-{}'],
+    # 789680
+    "https://www.amazon.sg": ['新加坡站', 'i18n-prefs=SGD; session-id={}-{}-{}; ubid-acbsg={}-{}-{}'],
+    # 11433
+    "https://www.amazon.sa": ['阿拉伯站', 'i18n-prefs=SAR; lc-acbsa=en_AE; session-id={}-{}-{}; ubid-acbsa={}-{}-{}'],
+    # Dubai
+    "https://www.amazon.ae": ['阿联酋站', 'i18n-prefs=USD; lc-acbae=en_AE; session-id={}-{}-{}; ubid-acbae={}-{}-{}'],
+    # 999008
+    "https://www.amazon.in": ['印度站', 'i18n-prefs=INR; lc-acbin=en_IN; session-id={}-{}-{}; ubid-acbin={}-{}-{}'],
+    "https://www.amazon.eg": ['埃及站', 'i18n-prefs=EGP; lc-acbeg=en_AE; session-id={}-{}-{}; ubid-acbeg={}-{}-{}'],
+    # 00144
+    "https://www.amazon.it": ['意大利站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbit={}-{}-{}'],
+    # 08358
+    "https://www.amazon.es": ['西班牙站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbes={}-{}-{}'],
+    # 10115
+    "https://www.amazon.pl": ["波兰站", 'i18n-prefs=PLN; session-id={}-{}-{}; ubid-acbpl={}-{}-{}'],
+    # 34000
+    "https://www.amazon.com.tr": ["土耳其站", 'i18n-prefs=TRY; session-id={}-{}-{}; ubid-acbtr={}-{}-{}'],
+    # 83331-000
+    "https://www.amazon.com.br": ["巴西站", 'i18n-prefs=BRL; session-id={}-{}-{}; ubid-acbbr={}-{}-{}'],
+    # 75020
+    "https://www.amazon.fr": ["法国站", 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbfr={}-{}-{}'],
+    # 01830
+    "https://www.amazon.com.mx": ["墨西哥站", 'i18n-prefs=MXN; session-id={}-{}-{}; ubid-acbmx={}-{}-{}'],
+    # 2600 (the key previously ended in ".com.a", which is not the Australian domain)
+    "https://www.amazon.com.au": ["澳大利亚站", 'i18n-prefs=AUD; session-id={}-{}-{}; ubid-acbau={}-{}-{}'],
+}
+
+
+def get_response_type(response):
+    """Classify a response: 2 expired, 1 normal, 0 abnormal, -1 request error, -2 captcha, 9999 unrecognised."""
+    response_type = 9999
+    # Guard against a missing response before touching status_code.
+    if response is not None and response.status_code == 404:
+        # 404 is treated as an expired product.
+        print("[Type]当前页面为-商品过期")
+        response_type = 2
+    elif not response or response.status_code < 200 or response.status_code >= 400:
+        # Missing/falsy response, or a status outside 200-399.
+        print("[Type]当前页面为-异常响应")
+        response_type = 0
+    elif response.status_code == 302 or 'Sorry! Something went wrong' in response.text or '请刷新页面并重试' in response.text:
+        # Redirect or an error page.
+        print("[Type]当前页面为-请求错误")
+        response_type = -1
+    elif re.search(r'Enter the characters you see below', response.text) or '/errors/validateCaptcha' in response.text:
+        # Captcha page.
+        print("[Type]当前页面为-验证码")
+        response_type = -2
+    elif len(response.text) > 150000:
+        print("[Type]当前页面为-正常响应")
+        response_type = 1
+    return response_type
+
+
+def random_amazon_headers():
+    """Return the fixed request headers plus a user-agent whose last two Edge version parts are random."""
+    edge_build = random.randint(1000, 9999)
+    edge_patch = random.randint(10, 1000)
+    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.{}.{}".format(edge_build, edge_patch)
+    return {
+        "dpr": "1",
+        "referer": "https://www.amazon.com",
+        "sec-ch-viewport-width": "1912",
+        "sec-fetch-mode": "navigate",
+        "sec-fetch-site": "same-origin",
+        "sec-fetch-user": "?1",
+        "upgrade-insecure-requests": "1",
+        "viewport-width": "1912",
+        "user-agent": user_agent,
+    }
+
+
+def updata_cookie(cookie_dict, meta):
+    '''
+    Copy the non-empty captcha cookies from ``cookie_dict`` into ``meta['cookies']``.
+    :param cookie_dict: cookie name -> value mapping to read from.
+    :param meta: task dict; its 'cookies' entry is created when absent and updated in place.
+    :return: tuple ``(cookies, meta)``.
+    '''
+    cookies = meta.get('cookies', {})
+    for name in ('x-amz-captcha-1', 'x-amz-captcha-2'):
+        value = cookie_dict.get(name, '')
+        if value:
+            cookies[name] = value
+    meta['cookies'] = cookies
+    return cookies, meta
+
+
+def get_img(response):
+    '''
+    Download the captcha image referenced by the page and run OCR on it.
+    :param response: response whose ``text`` is the captcha page HTML.
+    :return: ``(img_id, img_data)`` - the hidden "amzn" form value and the lower-cased OCR text,
+        or ``(None, None)`` when either the form value or the image URL is not found.
+    '''
+    img_id = re.findall(r'name="amzn" value="(.*?)"', response.text)
+    img = re.findall(r'', response.text)  # NOTE(review): this pattern is empty - confirm the intended image-URL regex
+    if not (img_id and img):
+        return None, None
+    img_url = img[0]
+    img_id = img_id[0]
+    r = requests.get(img_url)
+    img_path = './image/img.png'
+    ocr = ddddocr.DdddOcr()
+    with open(img_path, 'wb') as f:
+        f.write(r.content)
+    img_data = ocr.classification(r.content)
+    return img_id, img_data.lower()
+
+
+def run_verify(response, meta):
+    '''
+    Submit the OCR result for the captcha page and store the returned captcha cookies in ``meta``.
+    :param response: the captcha page response.
+    :param meta: task dict (reads 'msg_url', 'proxies', 'headers', 'cookies').
+    :return: ``meta``; unchanged when captcha data, target URL or proxies are missing.
+    '''
+    verify_url = "https://www.amazon.com/errors/validateCaptcha"
+    captcha = get_img(response) or (None, None)  # tolerate a bare None result from get_img
+    img_id, img_data = captcha
+    msg_url = meta.get('msg_url', '')
+    proxies = meta.get('proxies', '')
+    # Nothing to submit without captcha data, a target URL and proxies.
+    if not (img_id and img_data and msg_url and proxies):
+        return meta
+    params = {
+        "amzn": img_id,
+        "amzn-r": msg_url.split('amazon.com')[-1],  # everything after 'amazon.com' in the target URL
+        "field-keywords": img_data
+    }
+    print('[GET]正在请求验证码页, 验证码识别结果为:',img_data)
+    # Redirects stay disabled so the cookies on the verification response itself can be read.
+    response = requests.get(verify_url, headers=meta.get('headers'), params=params, cookies=meta.get('cookies'),
+                            proxies=proxies, allow_redirects=False)
+    _, meta = updata_cookie(dict(response.cookies), meta)
+    return meta
+
+
+def get_product_detail(meta):
+    '''
+    Fetch ``meta['msg_url']``, going through the captcha flow and retrying when a captcha page comes back.
+    :param meta: task dict (reads 'headers', 'msg_url', 'proxies', 'cookies', 'retry_count', 'max_retry').
+    :return: ``(response, meta)``; ``response`` is ``None`` once ``max_retry`` is exceeded.
+    :raises RuntimeError: when the response is neither a normal page nor a captcha page.
+    '''
+    headers = meta['headers']
+    response = requests.get(meta.get('msg_url', ''), headers=headers,
+                            cookies=meta.get('cookies', {}), proxies=meta.get('proxies', ''))
+
+    print('[GET]正在第 {} 次请求, 响应长度为:'.format(meta['retry_count'] + 1), len(response.text))
+    response_type = get_response_type(response)
+    if response_type == 1:
+        # Normal page: hand it back as-is.
+        return response, meta
+
+    if response_type != -2:
+        print(response.text)
+        # Raising a plain string is itself a TypeError; use a real exception type.
+        raise RuntimeError('超出预期的响应')
+
+    # Captcha page: give up once the retry budget is spent.
+    if meta['retry_count'] >= meta['max_retry']:
+        print('[MAX_RETRY]超过最大重试次数')
+        # Keep the two-value shape so callers can still unpack the result.
+        return None, meta
+
+    # Solve the captcha, count the attempt, then retry the same URL.
+    meta = run_verify(response, meta)
+    meta['retry_count'] += 1
+    print('[RETRY]重试当前任务:', )
+    return get_product_detail(meta)
+
+
+def run():
+    """Fetch every sample URL once, sharing one set of headers, proxies and captcha cookies."""
+    url_list = [
+        # Product detail pages
+        "https://www.amazon.com/dp/B08DFLR38F",
+        "https://www.amazon.com/TAISCAI-USB-Mount%EF%BC%8C18W-Dual-Waterproof/dp/B0CY99S3KN",
+        "https://www.amazon.com/Wireless-Charging-Mag-Safe-Foldable-Magnetic/dp/B0CSP7KHD1",
+        "https://www.amazon.com/Charger-Hohosb-Adapter-Charging-More-White/dp/B0CZ3WXFX3",  # comma was missing, gluing this URL to the next
+        # Review pages
+        "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F",
+    ]
+    proxies = ip_proxies()
+    meta = {
+        'headers': random_amazon_headers(),
+        'proxies': proxies,
+        'max_retry': 3  # maximum number of retries per URL
+    }
+    for url in url_list:
+        meta['msg_url'] = url
+        meta['retry_count'] = 0  # retries used so far for this URL
+        print('[START]',url)
+        result = get_product_detail(meta)
+        if result is not None:  # keep the returned meta; tolerate a bare None result
+            meta = result[1]
+
+
+if __name__ == '__main__':
+    run()
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/test.py" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/test.py"
new file mode 100644
index 0000000..80dc9d8
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/test.py"
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/5/18 11:44
+# @Author : Harvey
+# @File : yanzhengma.py
+import requests
+from wbh_word.spider.Get_TJ_ip import ip_proxies
+
+headers = {
+ "dpr": "1",
+ "referer": "https://www.amazon.com",
+ "sec-ch-ua": "\"Chromium\";v=\"124\", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+ "sec-ch-viewport-width": "1912",
+ "sec-fetch-mode": "navigate",
+ "sec-fetch-site": "same-origin",
+ "sec-fetch-user": "?1",
+ "upgrade-insecure-requests": "1",
+ "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
+ "viewport-width": "1912"
+}
+cookies = {
+ "csm-sid": "713-2299262-7567932",
+ "x-amz-captcha-1": "1715945070398999",
+ "x-amz-captcha-2": "/fvr6wiJciyvvZR8JOfw+Q==",
+ "session-id": "138-0202196-4407222",
+ "session-id-time": "2082787201l",
+ # The fields above are the core ones; they become available once the captcha has been passed
+
+ "i18n-prefs": "USD",
+ "lc-main": "zh_CN",
+ "sp-cdn": "\"L5Z9:CN\"",
+ # "ubid-main": "132-5607086-4560162",
+ # "session-token": "<redacted>",
+ # "csm-hit": "tb:7NNEK8EZX7MDKY0R3SZH+s-7NNEK8EZX7MDKY0R3SZH|1715936997814&t:1715936997814&adb:adblk_no"
+}
+# url = "https://www.amazon.com/Munchkin%C2%AE-Brica%C2%AE-Stroller-Organizer-Bag/dp/B0BPMQQN6M"
+url = "https://www.amazon.com/dp/B08DFLR38F"
+
+proxies = ip_proxies()
+response = requests.get(url, headers=headers, cookies=cookies,proxies=proxies)
+print(response.text)
+print(len(response.text))
+print(response.cookies)
+print(response.headers)
+print(proxies)
\ No newline at end of file