diff --git "a/\344\272\232\351\251\254\351\200\212/README.md" "b/\344\272\232\351\251\254\351\200\212/README.md"
new file mode 100644
index 0000000..e69de29
diff --git "a/\344\272\232\351\251\254\351\200\212/image/img.png" "b/\344\272\232\351\251\254\351\200\212/image/img.png"
new file mode 100644
index 0000000..87eb769
Binary files /dev/null and "b/\344\272\232\351\251\254\351\200\212/image/img.png" differ
diff --git "a/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2101_\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201.py" "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2101_\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201.py"
new file mode 100644
index 0000000..8cf63ff
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2101_\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201.py"
@@ -0,0 +1,276 @@
+#!/usr/bin/env python
+'''
+-*- coding: utf-8 -*-
+@Time : 2024/6/4 16:51
+@Author : Harvey
+@File : run.py
+'''
+'''
+Currently only the product detail page and the first page of the review
+section can be scraped.
+Other parts of the review section have extra detection that is not handled yet.
+'''
+
+import random
+import re
+
+import ddddocr
+import requests
+
+from wbh_word.spider.Get_TJ_ip import ip_proxies
+
+# Seconds to wait for a single HTTP request before giving up.
+REQUEST_TIMEOUT = 30
+
+# Domain -> [site name, cookie template] for every known site (cookies expire).
+amazon_site_info = {
+    # 20500
+    "https://www.amazon.com": ['美国站', 'i18n-prefs=USD; lc-main=en_US; session-id={}-{}-{}; ubid-main={}-{}-{}'],
+    # NW1 6XE
+    "https://www.amazon.co.uk": ['英国站', 'i18n-prefs=CNY; lc-acbuk=en_GB; session-id={}-{}-{}; ubid-acbuk={}-{}-{}'],
+    # K1V 7P8
+    "https://www.amazon.ca": ['加拿大站', 'i18n-prefs=CAD; lc-acbca=en_CA; session-id={}-{}-{}; ubid-acbca={}-{}-{}'],
+    # 10115
+    "https://www.amazon.de": ['德国站', 'lc-acbde=en_GB; i18n-prefs=CNY; session-id={}-{}-{}; ubid-acbde={}-{}-{}'],
+    # 1011-1109
+    "https://www.amazon.nl": ['荷兰站', 'i18n-prefs=EUR; lc-acbnl=en_GB; session-id={}-{}-{}; ubid-acbnl={}-{}-{}'],
+    # 11455
+    "https://www.amazon.se": ['瑞典站', 'i18n-prefs=SEK; lc-acbse=en_GB; session-id={}-{}-{}; ubid-acbse={}-{}-{}'],
+    # 1930
+    "https://www.amazon.com.be": ['比利时站',
+                                  'i18n-prefs=EUR; lc-acbbe=en_GB; session-id={}-{}-{}; ubid-acbbe={}-{}-{}'],
+    # 789680
+    "https://www.amazon.sg": ['新加坡站', 'i18n-prefs=SGD; session-id={}-{}-{}; ubid-acbsg={}-{}-{}'],
+    # 11433
+    "https://www.amazon.sa": ['阿拉伯站', 'i18n-prefs=SAR; lc-acbsa=en_AE; session-id={}-{}-{}; ubid-acbsa={}-{}-{}'],
+    # Dubai
+    "https://www.amazon.ae": ['阿联酋站', 'i18n-prefs=USD; lc-acbae=en_AE; session-id={}-{}-{}; ubid-acbae={}-{}-{}'],
+    # 999008
+    "https://www.amazon.in": ['印度站', 'i18n-prefs=INR; lc-acbin=en_IN; session-id={}-{}-{}; ubid-acbin={}-{}-{}'],
+    "https://www.amazon.eg": ['埃及站', 'i18n-prefs=EGP; lc-acbeg=en_AE; session-id={}-{}-{}; ubid-acbeg={}-{}-{}'],
+    # 00144
+    "https://www.amazon.it": ['意大利站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbit={}-{}-{}'],
+    # 08358
+    "https://www.amazon.es": ['西班牙站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbes={}-{}-{}'],
+    # 10115
+    "https://www.amazon.pl": ["波兰站", 'i18n-prefs=PLN; session-id={}-{}-{}; ubid-acbpl={}-{}-{}'],
+    # 34000
+    "https://www.amazon.com.tr": ["土耳其站", 'i18n-prefs=TRY; session-id={}-{}-{}; ubid-acbtr={}-{}-{}'],
+    # 83331-000
+    "https://www.amazon.com.br": ["巴西站", 'i18n-prefs=BRL; session-id={}-{}-{}; ubid-acbbr={}-{}-{}'],
+    # 75020
+    "https://www.amazon.fr": ["法国站", 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbfr={}-{}-{}'],
+    # 01830
+    "https://www.amazon.com.mx": ["墨西哥站", 'i18n-prefs=MXN; session-id={}-{}-{}; ubid-acbmx={}-{}-{}'],
+    # 2600
+    "https://www.amazon.com.au": ["澳大利亚站", 'i18n-prefs=AUD; session-id={}-{}-{}; ubid-acbau={}-{}-{}'],
+}
+
+
+def get_response_type(response):
+    '''
+    Classify an Amazon response.
+
+    :param response: ``requests.Response``, or None when no response exists
+    :return: 2 = product gone (404), 0 = abnormal response, -1 = request
+             error page, -2 = captcha page, 1 = normal page,
+             9999 = none of the above
+    '''
+    response_type = 9999
+    if response is None:
+        # No response object at all: report it as an abnormal response.
+        print("[Type]当前页面为-异常响应")
+        response_type = 0
+    elif response.status_code == 404:
+        # The product no longer exists.
+        print("[Type]当前页面为-商品过期")
+        response_type = 2
+    elif response.status_code < 200 or response.status_code >= 400:
+        # Abnormal response.
+        print("[Type]当前页面为-异常响应")
+        response_type = 0
+    elif (response.status_code == 302) or ('Sorry! Something went wrong' in response.text) or (
+            '请刷新页面并重试' in response.text):
+        # Request error.
+        print("[Type]当前页面为-请求错误")
+        response_type = -1
+    elif re.search(r'Enter the characters you see below', response.text) or (
+            '/errors/validateCaptcha' in response.text):
+        # Captcha page.
+        print("[Type]当前页面为-验证码")
+        response_type = -2
+    elif len(response.text) > 150000:
+        # Only bodies above this size are counted as a normal page.
+        print("[Type]当前页面为-正常响应")
+        response_type = 1
+    return response_type
+
+
+def random_amazon_headers():
+    '''Return the request headers with a randomised Edge build in the user-agent.'''
+    headers = {
+        "dpr": "1",
+        "referer": "https://www.amazon.com",
+        # "sec-ch-ua": "\"Chromium\";v=\"124 \", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+        "sec-ch-viewport-width": "1912",
+        "sec-fetch-mode": "navigate",
+        "sec-fetch-site": "same-origin",
+        "sec-fetch-user": "?1",
+        "upgrade-insecure-requests": "1",
+        # "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
+        "viewport-width": "1912"
+    }
+    ua_temple = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.{}.{}"
+    headers['user-agent'] = ua_temple.format(random.randint(1000, 9999), random.randint(10, 1000))
+    return headers
+
+
+def updata_cookie(cookie_dict, meta):
+    '''
+    Copy the captcha cookies from ``cookie_dict`` into ``meta['cookies']``.
+
+    :param cookie_dict: cookies returned by the captcha check
+    :param meta: task state; its ``cookies`` entry is created when missing
+    :return: ``(cookies, meta)``
+    '''
+    # print('[UPDATA_COOKIE]', cookie_dict)
+    cookies = meta.get('cookies', {})
+    if cookie_dict.get('x-amz-captcha-1', ''):
+        cookies['x-amz-captcha-1'] = cookie_dict['x-amz-captcha-1']
+    if cookie_dict.get('x-amz-captcha-2', ''):
+        cookies['x-amz-captcha-2'] = cookie_dict['x-amz-captcha-2']
+    meta['cookies'] = cookies
+    return cookies, meta
+
+
+def get_img(response):
+    '''
+    Download the captcha image referenced by a captcha page and OCR it.
+
+    :param response: response whose body is the captcha page
+    :return: ``(img_id, img_data)`` - the hidden ``amzn`` form value and the
+             lower-cased OCR text; ``(None, None)`` when either is missing
+    '''
+    img_id = re.findall(r'name="amzn" value="(.*?)"', response.text)
+    # NOTE(review): the image pattern in this file was empty (r''), which
+    # produced an empty URL; the pattern below is a reconstruction and
+    # must be confirmed against a real captcha page.
+    img = re.findall(r'src="(https?://[^"]*[Cc]aptcha[^"]*)"', response.text)
+    if not (img_id and img):
+        return None, None
+    img_url = img[0]
+    img_id = img_id[0]
+    r = requests.get(img_url, timeout=REQUEST_TIMEOUT)
+    img_path = './image/img.png'
+    ocr = ddddocr.DdddOcr()
+    # Keep a copy of the last captcha image on disk.
+    with open(img_path, 'wb') as f:
+        f.write(r.content)
+    img_data = ocr.classification(r.content)
+    img_data = img_data.lower()
+    return img_id, img_data
+
+
+def run_verify(response, meta):
+    '''
+    Answer the captcha shown in ``response`` and store the returned cookies.
+
+    :param response: response whose body is the captcha page
+    :param meta: task state (msg_url, headers, proxies, cookies, ...)
+    :return: ``meta``, with the captcha cookies added when the check ran
+    '''
+    verify_url = "https://www.amazon.com/errors/validateCaptcha"
+    img_id, img_data = get_img(response)
+    if img_id and img_data:
+        msg_url = meta.get('msg_url', '')
+        url_href = msg_url.split('amazon.com')[-1]
+        proxies = meta.get('proxies', '')
+        headers = meta.get('headers')
+        cookies = meta.get('cookies')
+        params = {
+            "amzn": img_id,
+            "amzn-r": url_href,
+            "field-keywords": img_data
+        }
+        if msg_url and proxies:
+            print('[GET]正在请求验证码页, 验证码识别结果为:',img_data)
+            # Redirects are disabled so the cookies set by the check
+            # response itself can be read before anything is followed.
+            response = requests.get(verify_url, headers=headers, params=params, cookies=cookies, proxies=proxies,
+                                    allow_redirects=False, timeout=REQUEST_TIMEOUT)
+            response_cookie = dict(response.cookies)
+            cookies, meta = updata_cookie(response_cookie, meta)
+    return meta
+
+
+def get_product_detail(meta):
+    '''
+    Fetch ``meta['msg_url']``, answering captchas and retrying when needed.
+
+    :param meta: task state; must hold headers, retry_count and max_retry
+    :return: ``(response, meta)``; ``response`` is None when the captcha is
+             still shown after ``max_retry`` retries
+    :raises RuntimeError: when the response is neither a normal page nor a
+                          captcha page
+    '''
+    headers = meta['headers']
+    msg_url = meta.get('msg_url', '')
+    proxies = meta.get('proxies', '')
+    cookies = meta.get('cookies', {})
+    response = requests.get(msg_url, headers=headers, cookies=cookies, proxies=proxies,
+                            timeout=REQUEST_TIMEOUT)
+
+    print('[GET]正在第 {} 次请求, 响应长度为:'.format(meta['retry_count'] + 1), len(response.text))
+    response_type = get_response_type(response)
+    if response_type == -2:
+        # Captcha page: retry unless the retry budget is used up.
+        if meta['retry_count'] < meta['max_retry']:
+            meta = run_verify(response, meta)
+            retry_count = meta['retry_count']
+            meta['retry_count'] = retry_count + 1
+
+            print('[RETRY]重试当前任务:', )
+            response, meta = get_product_detail(meta)
+        else:
+            print('[MAX_RETRY]超过最大重试次数')
+            # Keep the (response, meta) shape so callers can always unpack.
+            return None, meta
+    elif response_type == 1:
+        # Normal response.
+        pass
+    else:
+        print(response.text)
+        # Raising a plain string is itself a TypeError in Python 3.
+        raise RuntimeError('超出预期的响应')
+    return response, meta
+
+
+def run():
+    '''Fetch every URL of the built-in list with one proxy and one header set.'''
+    url_list = [
+        # Product detail pages
+        "https://www.amazon.com/dp/B08DFLR38F",
+        "https://www.amazon.com/TAISCAI-USB-Mount%EF%BC%8C18W-Dual-Waterproof/dp/B0CY99S3KN",
+        "https://www.amazon.com/Wireless-Charging-Mag-Safe-Foldable-Magnetic/dp/B0CSP7KHD1",
+        "https://www.amazon.com/Charger-Hohosb-Adapter-Charging-More-White/dp/B0CZ3WXFX3",
+        # Review section
+        "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F",
+    ]
+    proxies = ip_proxies()
+    # proxies = None
+    meta = {
+        'headers': random_amazon_headers(),
+        'proxies': proxies,
+        'max_retry': 3  # maximum number of retries
+    }
+    for url in url_list:
+        meta['msg_url'] = url
+        meta['retry_count'] = 0  # retries used so far
+        print('[START]',url)
+        response, meta = get_product_detail(meta)
+
+        # print(response.text)
+
+
+if __name__ == '__main__':
+    run()
diff --git "a/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2102_\347\252\201\347\240\264\346\214\207\347\272\271\346\243\200\346\265\213.py" "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2102_\347\252\201\347\240\264\346\214\207\347\272\271\346\243\200\346\265\213.py"
new file mode 100644
index 0000000..510d122
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2102_\347\252\201\347\240\264\346\214\207\347\272\271\346\243\200\346\265\213.py"
@@ -0,0 +1,42 @@
+# -*- coding: UTF-8 -*-
+'''
+@Project :wbh_pj
+@File :123.py
+@Author :hao
+@Date :2023/10/24 14:56
+'''
+'''
+# Every Amazon page can be scraped this way - the strongest approach.
+# Currently only the safari15_5 / safari15_3 fingerprints get through.
+'''
+# import requests
+from curl_cffi import requests
+from wbh_word.spider.Get_TJ_ip import ip_proxies
+
+headers = {
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "accept-language": "zh-CN,zh;q=0.9",
+    "priority": "u=0, i",
+    "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": "\"Windows\"",
+    "sec-fetch-dest": "document",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-site": "none",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+}
+# Product detail
+# url = "https://www.amazon.com/dp/B0CS28ZLWS"
+# Review section
+# url = "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F/ref=cm_cr_arp_d_paging_btm_next_2?pageNumber=2"
+url = "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F/ref=cm_cr_getr_d_paging_btm_next_3?pageNumber=3"
+proxies = ip_proxies()
+
+response = requests.get(url, headers=headers, proxies=proxies, impersonate="safari15_3")
+
+print(response.text)
+print(response)
+
+
diff --git "a/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2103_\350\207\252\345\212\250\345\214\226\347\210\254\345\217\226.py" "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2103_\350\207\252\345\212\250\345\214\226\347\210\254\345\217\226.py"
new file mode 100644
index 0000000..ecf49c9
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2103_\350\207\252\345\212\250\345\214\226\347\210\254\345\217\226.py"
@@ -0,0 +1,44 @@
+# -*- coding: UTF-8 -*-
+'''
+@Project :wbh_pj
+@File :selenium_demo.py
+@Author :hao
+@Date :2023/10/23 16:49
+'''
+'''
+The captcha still has to be solved; not implemented for now.
+'''
+import time
+
+from selenium import webdriver
+
+
+def demo_run():
+    '''Open a product page in Chrome with the Selenium traits hidden.'''
+    url1 = 'https://www.amazon.com/dp/B0CS28ZLWS'
+    # --------- everything below hides the traits that give Selenium away ---------
+
+    # chrome_options.add_argument("--proxy-server=http://114.230.23.140:3658")  # add an IP proxy
+    chrome_options = webdriver.ChromeOptions()
+    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
+    chrome_options.add_experimental_option('useAutomationExtension', False)
+
+    # Read the script first so a missing file cannot leave a browser running.
+    with open('JS_2.js', encoding='utf-8') as f:
+        js = f.read()
+
+    driver = webdriver.Chrome(options=chrome_options)
+    try:
+        # The core step: register the script for every new document.
+        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument',
+                               {'source': js})
+        # --------- end of trait hiding ---------
+        driver.get(url1)
+        time.sleep(123)
+    finally:
+        # Always close the browser, even when navigation fails.
+        driver.quit()
+
+
+if __name__ == '__main__':
+    demo_run()
diff --git "a/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2104_http2.py" "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2104_http2.py"
new file mode 100644
index 0000000..00dd05a
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2104_http2.py"
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/5/22 18:31
+# @Author : Harvey
+# @File : 方案4_http2.py
+import httpx
+from urllib.parse import urlparse
+
+# proxies = {
+#     'http://': 'http://172.23.64.1:8888',
+#     'https://': 'http://172.23.64.1:8888',
+# }
+# # Give the proxy keys the correct URL format
+# proxies = {urlparse(k).scheme + '://' + urlparse(k).netloc: v for k, v in proxies.items()}
+
+# client = httpx.Client(http2=True, proxies=proxies, verify=False)
+
+headers = {
+    "dpr": "1",
+    "referer": "https://www.amazon.com",
+    "sec-ch-ua": "\"Chromium\";v=\"124\", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+    "sec-ch-viewport-width": "1912",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-site": "same-origin",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
+    "viewport-width": "1912"
+}
+
+
+url = 'https://www.amazon.com/gp/product/ajax/ref=dp_aod_NEW_mbc?asin=B08DFLR38F&m=&qid=&smid=&sourcecustomerorglistid=&sourcecustomerorglistitemid=&sr=&pc=dp&experienceId=aodAjaxMain'
+# url2 = '/gp/aag/main?ie=UTF8&seller=A3VQLMMKUUX89G&isAmazonFulfilled=1&asin=B08DFLR38F&ref_=olp_merch_name_2'
+
+# From here on the client is used just like requests; the context manager
+# closes the client once the request is done.
+with httpx.Client(http2=True) as client:
+    result = client.get(url, headers=headers)
+
+
+print(result.text)
+print(result)
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/README.md" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/README.md"
new file mode 100644
index 0000000..e69de29
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_detail.py" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_detail.py"
new file mode 100644
index 0000000..3967544
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_detail.py"
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/5/18 11:51
+# @Author : Harvey
+# @File : get_detail.py
+import requests
+from wbh_word.spider.Get_TJ_ip import ip_proxies
+
+headers = {
+    "dpr": "1",
+    "referer": "https://www.amazon.com",
+    "sec-ch-ua": "\"Chromium\";v=\"124\", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+    "sec-ch-viewport-width": "1912",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-site": "same-origin",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
+    "viewport-width": "1912"
+}
+cookies = {
+    # "csm-sid": "713-2299262-7567932",
+    "x-amz-captcha-1": "1717480655829841",
+    "x-amz-captcha-2": "THz/dORI7f9MPEYpk6zAOQ==",
+    "session-id": "145-8954814-3186315",
+    "session-id-time": "2082787201l",
+    # The fields above are the essential ones; they are obtained once the captcha is passed.
+
+    "i18n-prefs": "USD",
+    "lc-main": "zh_CN",
+    "sp-cdn": "\"L5Z9:CN\"",
+    # "ubid-main": "132-1464863-8061124",
+    # "session-token": "m4jtyQF+jZJqVW/adslOeUE7aWcay+oPVttMzoTlqWO9R9VCk6M0xNooY5RmGRW9eOBxpsP949PLbSn9eXz1ECwAwFVxwxRSWZtYLjcpY/70/WSGpis0IqQpRZSPI5RmUQgi/1lHq4qB+zIqJoudzKwXxCt7ihAa4fhbjcAOJjVsAO3pxMHfOH7aDjRw3wHt4xDaW53dyRENzIaYNvwh+KCkzK0w5SOxz6fxuY6v9zUsuWLt8pZmtQ75YoU1C3+Okt2scs+5b+jt+1dl/OTQ6oHj7QyAqK5h0MFeVM9jEkXgoubepR1OgB0YWNmMD3wCrb3sB0NtbZThvFJmWxOV3Bri1TQREibq",
+    # "csm-hit": "tb:7NNEK8EZX7MDKY0R3SZH+s-7NNEK8EZX7MDKY0R3SZH|1715936997814&t:1715936997814&adb:adblk_no"
+}
+# url = "https://www.amazon.com/Munchkin%C2%AE-Brica%C2%AE-Stroller-Organizer-Bag/dp/B0BPMQQN6M"
+url = "https://www.amazon.com/dp/B08DFLR38F"
+
+proxies = ip_proxies()
+response = requests.get(url, headers=headers, cookies=cookies,proxies=proxies)
+print(response.text)
+print(len(response.text))
+print(response.cookies)
+print(response.headers)
\ No newline at end of file
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_img.py" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_img.py"
new file mode 100644
index 0000000..a5b6be8
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_img.py"
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/5/18 11:46
+# @Author : Harvey
+# @File : get_img.py
+import os
+
+import requests
+
+
+headers = {
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "accept-language": "zh-CN,zh;q=0.9",
+    "priority": "u=0, i",
+    "referer": "https://www.amazon.com/dp/B0CS28ZLWS",
+    "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": "\"Windows\"",
+    "sec-fetch-dest": "document",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-site": "same-origin",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+}
+cookies = {
+    # "csm-sid": "602-5089573-6274975"
+}
+url = "https://www.amazon.com/errors/validateCaptcha"
+params = {
+    "amzn": "b+TKPZCS+956d3A6Vjh14g==",
+    "amzn-r": "/dp/B08DFLR38F",
+    "field-keywords": "jtymlx"
+}
+# SECURITY: proxy credentials do not belong in source control. Supply the
+# whole proxy URL (http://user:password@host:port) through AMAZON_PROXY_URL;
+# without it the requests go out directly.
+proxy_url = os.environ.get('AMAZON_PROXY_URL')
+proxies = {'http': proxy_url, 'https': proxy_url} if proxy_url else None
+# A successful check answers with a 302, so automatic redirects are disabled to
+# read the cookies first; otherwise the request would follow on to the detail page.
+response = requests.get(url, headers=headers, cookies=cookies, params=params,proxies=proxies, allow_redirects=False)
+
+# print(response.text)
+print(len(response.text))
+print(response.cookies)
+print(response.headers)
+print(response.status_code)
+# A 302 status means the check succeeded.
+
+print("======= 重定向 =======")
+response = requests.get(url, headers=headers, cookies=cookies, params=params,proxies=proxies)
+print(len(response.text))
+print(response.cookies)
+print(response.headers)
+print(response.status_code)
\ No newline at end of file
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img.png" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img.png"
new file mode 100644
index 0000000..7d0dace
Binary files /dev/null and "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img.png" differ
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img2222.png" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img2222.png"
new file mode 100644
index 0000000..91af59d
Binary files /dev/null and "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img2222.png" differ
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/run.py" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/run.py"
new file mode 100644
index 0000000..c67f903
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/run.py"
@@ -0,0 +1,268 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/6/4 16:51
+# @Author : Harvey
+# @File : run.py
+import random
+import re
+
+import ddddocr
+import requests
+
+from wbh_word.spider.Get_TJ_ip import ip_proxies
+
+# Seconds to wait for a single HTTP request before giving up.
+REQUEST_TIMEOUT = 30
+
+# Domain -> [site name, cookie template] for every known site (cookies expire).
+amazon_site_info = {
+    # 20500
+    "https://www.amazon.com": ['美国站', 'i18n-prefs=USD; lc-main=en_US; session-id={}-{}-{}; ubid-main={}-{}-{}'],
+    # NW1 6XE
+    "https://www.amazon.co.uk": ['英国站', 'i18n-prefs=CNY; lc-acbuk=en_GB; session-id={}-{}-{}; ubid-acbuk={}-{}-{}'],
+    # K1V 7P8
+    "https://www.amazon.ca": ['加拿大站', 'i18n-prefs=CAD; lc-acbca=en_CA; session-id={}-{}-{}; ubid-acbca={}-{}-{}'],
+    # 10115
+    "https://www.amazon.de": ['德国站', 'lc-acbde=en_GB; i18n-prefs=CNY; session-id={}-{}-{}; ubid-acbde={}-{}-{}'],
+    # 1011-1109
+    "https://www.amazon.nl": ['荷兰站', 'i18n-prefs=EUR; lc-acbnl=en_GB; session-id={}-{}-{}; ubid-acbnl={}-{}-{}'],
+    # 11455
+    "https://www.amazon.se": ['瑞典站', 'i18n-prefs=SEK; lc-acbse=en_GB; session-id={}-{}-{}; ubid-acbse={}-{}-{}'],
+    # 1930
+    "https://www.amazon.com.be": ['比利时站',
+                                  'i18n-prefs=EUR; lc-acbbe=en_GB; session-id={}-{}-{}; ubid-acbbe={}-{}-{}'],
+    # 789680
+    "https://www.amazon.sg": ['新加坡站', 'i18n-prefs=SGD; session-id={}-{}-{}; ubid-acbsg={}-{}-{}'],
+    # 11433
+    "https://www.amazon.sa": ['阿拉伯站', 'i18n-prefs=SAR; lc-acbsa=en_AE; session-id={}-{}-{}; ubid-acbsa={}-{}-{}'],
+    # Dubai
+    "https://www.amazon.ae": ['阿联酋站', 'i18n-prefs=USD; lc-acbae=en_AE; session-id={}-{}-{}; ubid-acbae={}-{}-{}'],
+    # 999008
+    "https://www.amazon.in": ['印度站', 'i18n-prefs=INR; lc-acbin=en_IN; session-id={}-{}-{}; ubid-acbin={}-{}-{}'],
+    "https://www.amazon.eg": ['埃及站', 'i18n-prefs=EGP; lc-acbeg=en_AE; session-id={}-{}-{}; ubid-acbeg={}-{}-{}'],
+    # 00144
+    "https://www.amazon.it": ['意大利站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbit={}-{}-{}'],
+    # 08358
+    "https://www.amazon.es": ['西班牙站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbes={}-{}-{}'],
+    # 10115
+    "https://www.amazon.pl": ["波兰站", 'i18n-prefs=PLN; session-id={}-{}-{}; ubid-acbpl={}-{}-{}'],
+    # 34000
+    "https://www.amazon.com.tr": ["土耳其站", 'i18n-prefs=TRY; session-id={}-{}-{}; ubid-acbtr={}-{}-{}'],
+    # 83331-000
+    "https://www.amazon.com.br": ["巴西站", 'i18n-prefs=BRL; session-id={}-{}-{}; ubid-acbbr={}-{}-{}'],
+    # 75020
+    "https://www.amazon.fr": ["法国站", 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbfr={}-{}-{}'],
+    # 01830
+    "https://www.amazon.com.mx": ["墨西哥站", 'i18n-prefs=MXN; session-id={}-{}-{}; ubid-acbmx={}-{}-{}'],
+    # 2600
+    "https://www.amazon.com.au": ["澳大利亚站", 'i18n-prefs=AUD; session-id={}-{}-{}; ubid-acbau={}-{}-{}'],
+}
+
+
+def get_response_type(response):
+    '''
+    Classify an Amazon response.
+
+    :param response: ``requests.Response``, or None when no response exists
+    :return: 2 = product gone (404), 0 = abnormal response, -1 = request
+             error page, -2 = captcha page, 1 = normal page,
+             9999 = none of the above
+    '''
+    response_type = 9999
+    if response is None:
+        # No response object at all: report it as an abnormal response.
+        print("[Type]当前页面为-异常响应")
+        response_type = 0
+    elif response.status_code == 404:
+        # The product no longer exists.
+        print("[Type]当前页面为-商品过期")
+        response_type = 2
+    elif response.status_code < 200 or response.status_code >= 400:
+        # Abnormal response.
+        print("[Type]当前页面为-异常响应")
+        response_type = 0
+    elif (response.status_code == 302) or ('Sorry! Something went wrong' in response.text) or (
+            '请刷新页面并重试' in response.text):
+        # Request error.
+        print("[Type]当前页面为-请求错误")
+        response_type = -1
+    elif re.search(r'Enter the characters you see below', response.text) or (
+            '/errors/validateCaptcha' in response.text):
+        # Captcha page.
+        print("[Type]当前页面为-验证码")
+        response_type = -2
+    elif len(response.text) > 150000:
+        # Only bodies above this size are counted as a normal page.
+        print("[Type]当前页面为-正常响应")
+        response_type = 1
+    return response_type
+
+
+def random_amazon_headers():
+    '''Return the request headers with a randomised Edge build in the user-agent.'''
+    headers = {
+        "dpr": "1",
+        "referer": "https://www.amazon.com",
+        # "sec-ch-ua": "\"Chromium\";v=\"124 \", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+        "sec-ch-viewport-width": "1912",
+        "sec-fetch-mode": "navigate",
+        "sec-fetch-site": "same-origin",
+        "sec-fetch-user": "?1",
+        "upgrade-insecure-requests": "1",
+        # "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
+        "viewport-width": "1912"
+    }
+    ua_temple = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.{}.{}"
+    headers['user-agent'] = ua_temple.format(random.randint(1000, 9999), random.randint(10, 1000))
+    return headers
+
+
+def updata_cookie(cookie_dict, meta):
+    '''
+    Copy the captcha cookies from ``cookie_dict`` into ``meta['cookies']``.
+
+    :param cookie_dict: cookies returned by the captcha check
+    :param meta: task state; its ``cookies`` entry is created when missing
+    :return: ``(cookies, meta)``
+    '''
+    # print('[UPDATA_COOKIE]', cookie_dict)
+    cookies = meta.get('cookies', {})
+    if cookie_dict.get('x-amz-captcha-1', ''):
+        cookies['x-amz-captcha-1'] = cookie_dict['x-amz-captcha-1']
+    if cookie_dict.get('x-amz-captcha-2', ''):
+        cookies['x-amz-captcha-2'] = cookie_dict['x-amz-captcha-2']
+    meta['cookies'] = cookies
+    return cookies, meta
+
+
+def get_img(response):
+    '''
+    Download the captcha image referenced by a captcha page and OCR it.
+
+    :param response: response whose body is the captcha page
+    :return: ``(img_id, img_data)`` - the hidden ``amzn`` form value and the
+             lower-cased OCR text; ``(None, None)`` when either is missing
+    '''
+    img_id = re.findall(r'name="amzn" value="(.*?)"', response.text)
+    # NOTE(review): the image pattern in this file was empty (r''), which
+    # produced an empty URL; the pattern below is a reconstruction and
+    # must be confirmed against a real captcha page.
+    img = re.findall(r'src="(https?://[^"]*[Cc]aptcha[^"]*)"', response.text)
+    if not (img_id and img):
+        return None, None
+    img_url = img[0]
+    img_id = img_id[0]
+    r = requests.get(img_url, timeout=REQUEST_TIMEOUT)
+    img_path = './image/img.png'
+    ocr = ddddocr.DdddOcr()
+    # Keep a copy of the last captcha image on disk.
+    with open(img_path, 'wb') as f:
+        f.write(r.content)
+    img_data = ocr.classification(r.content)
+    img_data = img_data.lower()
+    return img_id, img_data
+
+
+def run_verify(response, meta):
+    '''
+    Answer the captcha shown in ``response`` and store the returned cookies.
+
+    :param response: response whose body is the captcha page
+    :param meta: task state (msg_url, headers, proxies, cookies, ...)
+    :return: ``meta``, with the captcha cookies added when the check ran
+    '''
+    verify_url = "https://www.amazon.com/errors/validateCaptcha"
+    img_id, img_data = get_img(response)
+    if img_id and img_data:
+        msg_url = meta.get('msg_url', '')
+        url_href = msg_url.split('amazon.com')[-1]
+        proxies = meta.get('proxies', '')
+        headers = meta.get('headers')
+        cookies = meta.get('cookies')
+        params = {
+            "amzn": img_id,
+            "amzn-r": url_href,
+            "field-keywords": img_data
+        }
+        if msg_url and proxies:
+            print('[GET]正在请求验证码页, 验证码识别结果为:',img_data)
+            # Redirects are disabled so the cookies set by the check
+            # response itself can be read before anything is followed.
+            response = requests.get(verify_url, headers=headers, params=params, cookies=cookies, proxies=proxies,
+                                    allow_redirects=False, timeout=REQUEST_TIMEOUT)
+            response_cookie = dict(response.cookies)
+            cookies, meta = updata_cookie(response_cookie, meta)
+    return meta
+
+
+def get_product_detail(meta):
+    '''
+    Fetch ``meta['msg_url']``, answering captchas and retrying when needed.
+
+    :param meta: task state; must hold headers, retry_count and max_retry
+    :return: ``(response, meta)``; ``response`` is None when the captcha is
+             still shown after ``max_retry`` retries
+    :raises RuntimeError: when the response is neither a normal page nor a
+                          captcha page
+    '''
+    headers = meta['headers']
+    msg_url = meta.get('msg_url', '')
+    proxies = meta.get('proxies', '')
+    cookies = meta.get('cookies', {})
+    response = requests.get(msg_url, headers=headers, cookies=cookies, proxies=proxies,
+                            timeout=REQUEST_TIMEOUT)
+
+    print('[GET]正在第 {} 次请求, 响应长度为:'.format(meta['retry_count'] + 1), len(response.text))
+    response_type = get_response_type(response)
+    if response_type == -2:
+        # Captcha page: retry unless the retry budget is used up.
+        if meta['retry_count'] < meta['max_retry']:
+            meta = run_verify(response, meta)
+            retry_count = meta['retry_count']
+            meta['retry_count'] = retry_count + 1
+
+            print('[RETRY]重试当前任务:', )
+            response, meta = get_product_detail(meta)
+        else:
+            print('[MAX_RETRY]超过最大重试次数')
+            # Keep the (response, meta) shape so callers can always unpack.
+            return None, meta
+    elif response_type == 1:
+        # Normal response.
+        pass
+    else:
+        print(response.text)
+        # Raising a plain string is itself a TypeError in Python 3.
+        raise RuntimeError('超出预期的响应')
+    return response, meta
+
+
+def run():
+    '''Fetch every URL of the built-in list with one proxy and one header set.'''
+    url_list = [
+        # Product detail pages
+        "https://www.amazon.com/dp/B08DFLR38F",
+        "https://www.amazon.com/TAISCAI-USB-Mount%EF%BC%8C18W-Dual-Waterproof/dp/B0CY99S3KN",
+        "https://www.amazon.com/Wireless-Charging-Mag-Safe-Foldable-Magnetic/dp/B0CSP7KHD1",
+        "https://www.amazon.com/Charger-Hohosb-Adapter-Charging-More-White/dp/B0CZ3WXFX3",
+        # Review section
+        "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F",
+    ]
+    proxies = ip_proxies()
+    # proxies = None
+    meta = {
+        'headers': random_amazon_headers(),
+        'proxies': proxies,
+        'max_retry': 3  # maximum number of retries
+    }
+    for url in url_list:
+        meta['msg_url'] = url
+        meta['retry_count'] = 0  # retries used so far
+        print('[START]',url)
+        response, meta = get_product_detail(meta)
+
+        # print(response.text)
+
+
+if __name__ == '__main__':
+    run()
diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/test.py" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/test.py"
new file mode 100644
index 0000000..80dc9d8
--- /dev/null
+++ "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/test.py"
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time : 2024/5/18 11:44
+# @Author : Harvey
+# @File : yanzhengma.py
+import requests
+from wbh_word.spider.Get_TJ_ip import ip_proxies
+
+headers = {
+    "dpr": "1",
+    "referer": "https://www.amazon.com",
+    "sec-ch-ua": "\"Chromium\";v=\"124\", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
+    "sec-ch-viewport-width": "1912",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-site": "same-origin",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
+    "viewport-width": "1912"
+}
+cookies = {
+    "csm-sid": "713-2299262-7567932",
+    "x-amz-captcha-1": "1715945070398999",
+    "x-amz-captcha-2": "/fvr6wiJciyvvZR8JOfw+Q==",
+    "session-id": "138-0202196-4407222",
+    "session-id-time": "2082787201l",
+    # The fields above are the essential ones; they are obtained once the captcha is passed.
+
+    "i18n-prefs": "USD",
+    "lc-main": "zh_CN",
+    "sp-cdn": "\"L5Z9:CN\"",
+    # "ubid-main": "132-5607086-4560162",
+    # "session-token": "m4jtyQF+jZJqVW/adslOeUE7aWcay+oPVttMzoTlqWO9R9VCk6M0xNooY5RmGRW9eOBxpsP949PLbSn9eXz1ECwAwFVxwxRSWZtYLjcpY/70/WSGpis0IqQpRZSPI5RmUQgi/1lHq4qB+zIqJoudzKwXxCt7ihAa4fhbjcAOJjVsAO3pxMHfOH7aDjRw3wHt4xDaW53dyRENzIaYNvwh+KCkzK0w5SOxz6fxuY6v9zUsuWLt8pZmtQ75YoU1C3+Okt2scs+5b+jt+1dl/OTQ6oHj7QyAqK5h0MFeVM9jEkXgoubepR1OgB0YWNmMD3wCrb3sB0NtbZThvFJmWxOV3Bri1TQREibq",
+    # "csm-hit": "tb:7NNEK8EZX7MDKY0R3SZH+s-7NNEK8EZX7MDKY0R3SZH|1715936997814&t:1715936997814&adb:adblk_no"
+}
+# url = "https://www.amazon.com/Munchkin%C2%AE-Brica%C2%AE-Stroller-Organizer-Bag/dp/B0BPMQQN6M"
+url = "https://www.amazon.com/dp/B08DFLR38F"
+
+proxies = ip_proxies()
+response = requests.get(url, headers=headers, cookies=cookies,proxies=proxies)
+print(response.text)
+print(len(response.text))
+print(response.cookies)
+print(response.headers)
+print(proxies)
\ No newline at end of file