新增亚马逊爬虫

wu50416 · Jun 5, 2024 · 52359ac · 52359ac
1 parent 728e067
commit 52359ac
Show file tree

Hide file tree

Showing 13 changed files with 729 additions and 0 deletions.
diff --git a/亚马逊/README.md b/亚马逊/README.md
diff --git a/亚马逊/image/img.png b/亚马逊/image/img.png
diff --git a/亚马逊/方案1_突破验证码.py b/亚马逊/方案1_突破验证码.py
@@ -0,0 +1,238 @@
+#!/usr/bin/env python
+'''
+-*- coding: utf-8 -*-
+@Time    : 2024/6/4 16:51
+@Author  : Harvey
+@File    : run.py
+'''
+'''
+目前仅能采集 产品详情 + 评论区的首页
+评论区还要其他地方有检测的待处理
+'''
+
+import random
+import re
+
+import ddddocr
+import requests
+
+from wbh_word.spider.Get_TJ_ip import ip_proxies
+
+# # 目前所有站点的域名和cookie(cookie有存活期)
+amazon_site_info = {
+    # 20500
+    "https://www.amazon.com": ['美国站', 'i18n-prefs=USD; lc-main=en_US; session-id={}-{}-{}; ubid-main={}-{}-{}'],
+    # NW1 6XE
+    "https://www.amazon.co.uk": ['英国站', 'i18n-prefs=CNY; lc-acbuk=en_GB; session-id={}-{}-{}; ubid-acbuk={}-{}-{}'],
+    # K1V 7P8
+    "https://www.amazon.ca": ['加拿大站', 'i18n-prefs=CAD; lc-acbca=en_CA; session-id={}-{}-{}; ubid-acbca={}-{}-{}'],
+    # 10115
+    "https://www.amazon.de": ['德国站', 'lc-acbde=en_GB; i18n-prefs=CNY; session-id={}-{}-{}; ubid-acbde={}-{}-{}'],
+    # 1011-1109
+    "https://www.amazon.nl": ['荷兰站', 'i18n-prefs=EUR; lc-acbnl=en_GB; session-id={}-{}-{}; ubid-acbnl={}-{}-{}'],
+    # 11455
+    "https://www.amazon.se": ['瑞典站', 'i18n-prefs=SEK; lc-acbse=en_GB; session-id={}-{}-{}; ubid-acbse={}-{}-{}'],
+    # 1930
+    "https://www.amazon.com.be": ['比利时站',
+                                  'i18n-prefs=EUR; lc-acbbe=en_GB; session-id={}-{}-{}; ubid-acbbe={}-{}-{}'],
+    # 789680
+    "https://www.amazon.sg": ['新加坡站', 'i18n-prefs=SGD; session-id={}-{}-{}; ubid-acbsg={}-{}-{}'],
+    # 11433
+    "https://www.amazon.sa": ['阿拉伯站', 'i18n-prefs=SAR; lc-acbsa=en_AE; session-id={}-{}-{}; ubid-acbsa={}-{}-{}'],
+    # Dubai
+    "https://www.amazon.ae": ['阿联酋站', 'i18n-prefs=USD; lc-acbae=en_AE; session-id={}-{}-{}; ubid-acbae={}-{}-{}'],
+    # 999008
+    "https://www.amazon.in": ['印度站', 'i18n-prefs=INR; lc-acbin=en_IN; session-id={}-{}-{}; ubid-acbin={}-{}-{}'],
+    "https://www.amazon.eg": ['埃及站', 'i18n-prefs=EGP; lc-acbeg=en_AE; session-id={}-{}-{}; ubid-acbeg={}-{}-{}'],
+    # 00144
+    "https://www.amazon.it": ['意大利站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbit={}-{}-{}'],
+    # 08358
+    "https://www.amazon.es": ['西班牙站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbes={}-{}-{}'],
+    # 10115
+    "https://www.amazon.pl": ["波兰站", 'i18n-prefs=PLN; session-id={}-{}-{}; ubid-acbpl={}-{}-{}'],
+    # 34000
+    "https://www.amazon.com.tr": ["土耳其站", 'i18n-prefs=TRY; session-id={}-{}-{}; ubid-acbtr={}-{}-{}'],
+    # 83331-000
+    "https://www.amazon.com.br": ["巴西站", 'i18n-prefs=BRL; session-id={}-{}-{}; ubid-acbbr={}-{}-{}'],
+    # 75020
+    "https://www.amazon.fr": ["法国站", 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbfr={}-{}-{}'],
+    # 01830
+    "https://www.amazon.com.mx": ["墨西哥站", 'i18n-prefs=MXN; session-id={}-{}-{}; ubid-acbmx={}-{}-{}'],
+    # 2600
+    "https://www.amazon.com.a": ["澳大利亚站", 'i18n-prefs=AUD; session-id={}-{}-{}; ubid-acbau={}-{}-{}'],
+}
+
+
+def get_response_type(response):
+    type = 9999
+    if response.status_code == 404:
+        # 判断商品是否过期
+        print("[Type]当前页面为-商品过期")
+        type = 2
+    elif not response or response.status_code < 200 or response.status_code >= 400:
+        # 异常响应
+        print("[Type]当前页面为-异常响应")
+        type = 0
+    elif (response.status_code == 302) or ('Sorry! Something went wrong' in response.text) or (
+            '请刷新页面并重试' in response.text):
+        # 请求错误
+        print("[Type]当前页面为-请求错误")
+        type = -1
+    elif re.search(r'Enter the characters you see below', response.text) or (
+            '/errors/validateCaptcha' in response.text):
+        # 验证码
+        print("[Type]当前页面为-验证码")
+        type = -2
+    elif len(response.text) > 150000:
+        print("[Type]当前页面为-正常响应")
+        type = 1
+    return type
+
+
+def random_amazon_headers():
+    headers = {
+        "dpr": "1",
+        "referer": "https://www.amazon.com",
+        # "sec-ch-ua": "\"Chromium\";v=\"124 \", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+        "sec-ch-viewport-width": "1912",
+        "sec-fetch-mode": "navigate",
+        "sec-fetch-site": "same-origin",
+        "sec-fetch-user": "?1",
+        "upgrade-insecure-requests": "1",
+        # "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
+        "viewport-width": "1912"
+    }
+    ua_temple = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.{}.{}"
+    headers['user-agent'] = ua_temple.format(random.randint(1000, 9999), random.randint(10, 1000))
+    return headers
+
+
+def updata_cookie(cookie_dict, meta):
+    '''
+    根据cookie字典来更新cookie
+    :param cookie_dict:
+    :return:
+    '''
+    # print('[UPDATA_COOKIE]', cookie_dict)
+    cookies = meta.get('cookies', {})
+    if cookie_dict.get('x-amz-captcha-1', ''):
+        cookies['x-amz-captcha-1'] = cookie_dict['x-amz-captcha-1']
+    if cookie_dict.get('x-amz-captcha-2', ''):
+        cookies['x-amz-captcha-2'] = cookie_dict['x-amz-captcha-2']
+    meta['cookies'] = cookies
+    return cookies, meta
+
+
+def get_img(response):
+    '''
+    下载并识别图片
+    :param response:
+    :return:图片ID
+    :return:图片识别结果
+    '''
+    img_id = re.findall(r'name="amzn" value="(.*?)"', response.text)
+    img = re.findall(r'<img src="(.*?)">', response.text)
+    if img_id and img:
+        img_url = img[0]
+        img_id = img_id[0]
+        r = requests.get(img_url)
+        img_path = './image/img.png'
+        ocr = ddddocr.DdddOcr()
+        with open(img_path, 'wb') as f:
+            f.write(r.content)
+        img_data = ocr.classification(r.content)
+        img_data = img_data.lower()
+        return img_id, img_data
+
+
+def run_verify(response, meta):
+    '''
+    处理验证码
+    :param response:
+    :return:
+    '''
+    verify_url = "https://www.amazon.com/errors/validateCaptcha"
+    img_id, img_data = get_img(response)
+    if img_id and img_data:
+        msg_url = meta.get('msg_url', '')
+        url_href = msg_url.split('amazon.com')[-1]
+        proxies = meta.get('proxies', '')
+        headers = meta.get('headers')
+        cookies = meta.get('cookies')
+        params = {
+            "amzn": img_id,
+            "amzn-r": url_href,
+            "field-keywords": img_data
+        }
+        if msg_url and proxies:
+            print('[GET]正在请求验证码页, 验证码识别结果为：',img_data)
+            response = requests.get(verify_url, headers=headers, params=params, cookies=cookies, proxies=proxies,
+                                    allow_redirects=False)
+            response_cookie = dict(response.cookies)
+            cookies, meta = updata_cookie(response_cookie, meta)
+    return meta
+
+
+def get_product_detail(meta):
+    '''
+    采集产品详情
+    :param meta:
+    :return:
+    '''
+    headers = meta['headers']
+    msg_url = meta.get('msg_url', '')
+    proxies = meta.get('proxies', '')
+    cookies = meta.get('cookies', {})
+    response = requests.get(msg_url, headers=headers, cookies=cookies, proxies=proxies)
+
+    print('[GET]正在第 {} 次请求, 响应长度为：'.format(meta['retry_count'] + 1), len(response.text))
+    response_type = get_response_type(response)
+    if response_type == -2:
+        # 出现验证码，判断是否超过最大重试次数
+        if meta['retry_count'] < meta['max_retry']:
+            meta = run_verify(response, meta)
+            retry_count = meta['retry_count']
+            meta['retry_count'] = retry_count + 1
+
+            print('[RETRY]重试当前任务：', )
+            response, meta = get_product_detail(meta)
+        else:
+            print('[MAX_RETRY]超过最大重试次数')
+            return None
+    elif response_type == 1:
+        # 正常的响应
+        pass
+    else:
+        print(response.text)
+        raise '超出预期的响应'
+    return response, meta
+
+
+def run():
+    url_list = [
+        # 产品详情
+        "https://www.amazon.com/dp/B08DFLR38F",
+        "https://www.amazon.com/TAISCAI-USB-Mount%EF%BC%8C18W-Dual-Waterproof/dp/B0CY99S3KN",
+        "https://www.amazon.com/Wireless-Charging-Mag-Safe-Foldable-Magnetic/dp/B0CSP7KHD1",
+        "https://www.amazon.com/Charger-Hohosb-Adapter-Charging-More-White/dp/B0CZ3WXFX3"
+        # 评论区
+        "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F",
+    ]
+    proxies = ip_proxies()
+    # proxies = None
+    meta = {
+        'headers': random_amazon_headers(),
+        'proxies': proxies,
+        'max_retry': 3  # 最大重试次数
+    }
+    for url in url_list:
+        meta['msg_url'] = url
+        meta['retry_count'] = 0  # 重试次数
+        print('[START]',url)
+        response, meta = get_product_detail(meta)
+
+        # print(response.text)
+
+
+if __name__ == '__main__':
+    run()
diff --git a/亚马逊/方案2_突破指纹检测.py b/亚马逊/方案2_突破指纹检测.py
@@ -0,0 +1,42 @@
+# -*- coding: UTF-8 -*-
+'''
+@Project ：wbh_pj 
+@File ：123.py
+@Author ：hao
+@Date ：2023/10/24 14:56 
+'''
+'''
+# 亚马逊所有页面都可以采集，最强的方案
+# 目前只有 safari15_5 / safari15_3 指纹可以通过
+'''
+# import requests
+from curl_cffi import requests
+from wbh_word.spider.Get_TJ_ip import ip_proxies
+
+headers = {
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "accept-language": "zh-CN,zh;q=0.9",
+    "priority": "u=0, i",
+    "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": "\"Windows\"",
+    "sec-fetch-dest": "document",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-site": "none",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+}
+# 产品详情
+# url = "https://www.amazon.com/dp/B0CS28ZLWS"
+# 评论区
+# url = "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F/ref=cm_cr_arp_d_paging_btm_next_2?pageNumber=2"
+url = "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F/ref=cm_cr_getr_d_paging_btm_next_3?pageNumber=3"
+proxies = ip_proxies()
+
+response = requests.get(url, headers=headers, proxies=proxies, impersonate="safari15_3")
+
+print(response.text)
+print(response)
+
+
diff --git a/亚马逊/方案3_自动化爬取.py b/亚马逊/方案3_自动化爬取.py
@@ -0,0 +1,37 @@
+# -*- coding: UTF-8 -*-
+'''
+@Project ：wbh_pj 
+@File ：selenium_demo.py
+@Author ：hao
+@Date ：2023/10/23 16:49 
+'''
+'''
+需要过验证码，暂时不写
+'''
+import time
+
+from selenium import webdriver
+
+
+def demo_run():
+    url1 = 'https://www.amazon.com/dp/B0CS28ZLWS'
+    # ---------# 下面这一大块东西都是用来隐藏selenium的特征值---------------------
+
+    # chrome_options.add_argument("--proxy-server=http://114.230.23.140:3658")       # 新增ip代理
+    chrome_options = webdriver.ChromeOptions()
+    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
+    chrome_options.add_experimental_option('useAutomationExtension', False)
+
+    driver = webdriver.Chrome(options=chrome_options)  # 核心为下面这几行
+    with open('JS_2.js') as f:
+        js = f.read()
+    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument',
+                           {'source': js})
+
+    # ---------# 隐藏特征值----------------------------
+    driver.get(url1)
+    time.sleep(123)
+
+
+if __name__ == '__main__':
+    demo_run()
diff --git a/亚马逊/方案4_http2.py b/亚马逊/方案4_http2.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time    : 2024/5/22 18:31
+# @Author  : Harvey
+# @File    : 方案4_http2.py
+import httpx
+from urllib.parse import urlparse
+
+# proxies = {
+#     'http://': 'http://172.23.64.1:8888',
+#     'https://': 'http://172.23.64.1:8888',
+# }
+# # 为代理键添加正确的URL格式
+# proxies = {urlparse(k).scheme + '://' + urlparse(k).netloc: v for k, v in proxies.items()}
+
+# client = httpx.Client(http2=True, proxies=proxies, verify=False)
+client = httpx.Client(http2=True)
+
+# 之后的使用方式和requests一样
+
+headers = {
+    "dpr": "1",
+    "referer": "https://www.amazon.com",
+    "sec-ch-ua": "\"Chromium\";v=\"124\", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+    "sec-ch-viewport-width": "1912",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-site": "same-origin",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
+    "viewport-width": "1912"
+}
+
+
+url = 'https://www.amazon.com/gp/product/ajax/ref=dp_aod_NEW_mbc?asin=B08DFLR38F&m=&qid=&smid=&sourcecustomerorglistid=&sourcecustomerorglistitemid=&sr=&pc=dp&experienceId=aodAjaxMain'
+# url2 = '/gp/aag/main?ie=UTF8&amp;seller=A3VQLMMKUUX89G&amp;isAmazonFulfilled=1&amp;asin=B08DFLR38F&amp;ref_=olp_merch_name_2'
+
+result = client.get(url, headers=headers)
+
+
+print(result.text)
+print(result)
diff --git a/亚马逊/突破验证码步骤拆解/README.md b/亚马逊/突破验证码步骤拆解/README.md