From 52359ac63ca7a252f86f08c7b1a3f84ae1b858ae Mon Sep 17 00:00:00 2001 From: wu50416 <504168539@qq.com> Date: Wed, 5 Jun 2024 17:06:22 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E4=BA=9A=E9=A9=AC=E9=80=8A?= =?UTF-8?q?=E7=88=AC=E8=99=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../README.md" | 0 .../image/img.png" | Bin 0 -> 5058 bytes ...64\351\252\214\350\257\201\347\240\201.py" | 238 ++++++++++++++++++ ...07\347\272\271\346\243\200\346\265\213.py" | 42 ++++ ...50\345\214\226\347\210\254\345\217\226.py" | 37 +++ .../\346\226\271\346\241\2104_http2.py" | 42 ++++ .../README.md" | 0 .../get_detail.py" | 45 ++++ .../get_img.py" | 49 ++++ .../image/img.png" | Bin 0 -> 6064 bytes .../image/img2222.png" | Bin 0 -> 6121 bytes .../run.py" | 231 +++++++++++++++++ .../test.py" | 45 ++++ 13 files changed, 729 insertions(+) create mode 100644 "\344\272\232\351\251\254\351\200\212/README.md" create mode 100644 "\344\272\232\351\251\254\351\200\212/image/img.png" create mode 100644 "\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2101_\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201.py" create mode 100644 "\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2102_\347\252\201\347\240\264\346\214\207\347\272\271\346\243\200\346\265\213.py" create mode 100644 "\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2103_\350\207\252\345\212\250\345\214\226\347\210\254\345\217\226.py" create mode 100644 "\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2104_http2.py" create mode 100644 "\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/README.md" create mode 100644 "\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_detail.py" create mode 100644 "\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_img.py" create mode 100644 "\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img.png" create mode 100644 "\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img2222.png" create mode 100644 "\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/run.py" create mode 100644 "\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/test.py" diff --git "a/\344\272\232\351\251\254\351\200\212/README.md" "b/\344\272\232\351\251\254\351\200\212/README.md" new file mode 100644 index 0000000..e69de29 diff --git "a/\344\272\232\351\251\254\351\200\212/image/img.png" "b/\344\272\232\351\251\254\351\200\212/image/img.png" new file mode 100644 index 0000000000000000000000000000000000000000..87eb76951ac9f52588c1133fc623b2764665b19e GIT binary patch literal 5058 zcmbtX_cI)Tvp*%GCPAVTM}#AycOgjh-fMKC*V7U$xix%YWlsl)3AWrYO!)f7& zas*Kl{pI`K%=-i0+x^VWd}eocf7#h`Gk5a?a8FZJLlrBg5_%a}e?xi;>SCOjF!1y>(ru;X`GzbQUo!q#zW zz@;7v;}E^1lAA=2tgB$1**AbkY6j&SR*{G8*;>i<{byXSy0E30^~fOdXGvYnO#>z8 zKziC7*%YvdntLB-mIzFvZ{(YY`aJnkt>|l+`(kA2ZF64@SDX{_26w;~0}nshj2&0^ zD_TjoDqkt?-a0~rSs}shf~;nm4;${b8bQ@Oulm_veEgMn_CAWLQ3Xi@v}spff>MZN z%L+z|8E)}G9MW8VMx?zsT??+XiBH%{^0Ny^2!|Y?v>y->lgE1N<2fB;ct>Kg`aAhW6K=kmrf`(4h9rh zy(0=dgLd4e_4;w;m{VZyo-9u3=kmA>u$3_fR>8w0Rpz9{Pp&ana;f$mX0 zwU_$)?DcgwBnC+;jUC}@oah0Zw6?~B+WcRhDOIG4(eJBMM?^AzOztaBfAKMZ?v2Fq z^~d9BqgN_H?;{Cgyxgd6#HKAxNWQj5^IK!ZnEoUKI8RGSsK|SSQqy_E%0j{;bF^5I z{b+fvqM-MzH|zDz(Ez+lV)?>mH%Q7@(I4G0V-@6o2i`%n7%Y3`p~T>jHmb=Rn z=SJ7aBVVCZ;$lJ^HR=5#11VOxmb8sY^K|#%%_~k}u|&?5RK3v7tY7s+ZM$D(!3A!K z*~Vr^wYEs&)x-inF7Cm4*TicMQm@<91!X7J<9R$9+}Nn-cA#*{gt(kI9phZ8olza zUZr@f)7N!@ScitwJ}ziEoF*J3pC!Xj8Su_!q97KCRq63ABTd3X21m$2!3H;LB9zVF zkxR>$X%Nya>WHlk$6VXIg;h#E?PPQhuEVgxhT2V;=G6v*-24%4Y?;w$ViFoJ}P$f6pH$7HxN!t-}eXl#r1W}VUFU|U-`RseUg64*0eAd&F zdCjyHOW7eS1#M#M`Y{WzSBPbwcYi0>DA(Mg!z-|sJ!;vbg9CajHxX4b@jCd#GS~IW zbeP0O&6&V}&Q-x*PY#jQ-p~EuajH80(EdzffDEsxd;{>H@MAW4%-f~Dv)QP~xg5I( zq6&|!?{eN*uOw&5XPxFyZG5wodpuXt&`9VVFA~^VD2s>$M)!L<({UI808l$PZNhh58nQ~ z{>Qn`?jwnSSxvih-3_@Jm52>bS=zPgx^ulM8?WQpu{-GDZ(hYHos2o`b(0AHm7ZAk zZCJpQ>28}O3%03K^V0%pVdSr`xaX-nxz2?yOtXtBPaP%eBl-38&ktvDCo$n?_!qF< zf@VzPYj~kom&*-6wZ60g&;MW{P=Lf^|2PAHw5Kl0g4k)99mWBp*y2UXGK@6;&XqbS z7=~ltrv%qt366h>MGhqJQiayNj?L%zdgRz-tPx`^B_D&^V)`BHN2V6(nJRMk4lEt9 z;z%_i7p(bLY(Lm1Xt{k^A?ut`s{U5lGCwOZM-+B07niAzJdrur{i?_a-%3g$9FxST zNAWjYvduq)pbItgjYtr6wkicj3YOto1v~ZrvQUCTC4@!zYy?hrXM}>d$3!M^sj;%s zAf$T5&lT43P%zWe?;Xt@<0I8Ly9CL$d+VV)ep8E-Gwji)-Oys6BRTVPR-Z;Q-kCO( zwM4Fgd#ivjAqU4QT?5H|1{W}lN)qnn_-r1Tm53bc6oX8LCV?KeX5kn_E5~nfc7D2vFb}#&GjdZz}I=*Oehac`*zB*!i z(l-Z?jSA)S%{E1;W<@2lJissr;V!e=Jy4hHd)SK8z6t^VXl}9aI+b5o;BuoT3fTTC zV#DX<@(OjPC?#qMi!w`wF03N99{jGtI~E59YK*rUwTK*Dv=>ZF)>Uke%3zOE_nz>ts!KJ;+9 zu3w(3T+O#Snq9M>i8TX-C;ighKn9zxAc1Ag$*Q#U_jLk&{&3!Ij8)o4x)n4oFFdkf z(BM)v-fHGy2kX}Hd_u+wh=juqFD4T8go1y5o$2zQV)w%>X<}yNN|n8I-aMeKe$0>E zAyT%SWz*jCEtm`%VvqXnXJP>8_#8=R+BRpw$=u(>N5NT1-+R4kE;qJOYAn_N^P5h! zYh59Con+JE`=JOIBRluP`zSI?fAFHKOwcPV+kLNZEsGNy0vD82!AyQ9j@gQ5{<}~q zk1ML9{n0Kbj|6Se@6<_$JqQHd^2F}gshI6|bh>kQMH)Sc8I(dC7kN35(ki`Xt+yGu z+wEPA#S}|G!Odk=@1ttZ5pnvnCiZl(Qqj<+&zIre%td9LJWaK&TT|08ZYI5@51VXJ zdf8IuxZQ_oa>GN6^fB-*zc!Clbng$BLZzovCRt(cLmnj@6bU+f+u4f&7lfkfL52>@ zXp!vZ*%_3A0{hq}?+KQ$w4iQV_X9(mqiu1SPu1M!+Undp-4FRsqs=1B=SAb)vWn{* zYi9JAlqXk~gg(grwCO!6o!Z+RJQ_-^px~n|{#flJH%a+6h_5(Q=muaFenxcADNoeA z6UI~1;3K?qzXz1l_~BsPURyA2Jgzw(Ojst=oNB3T*Y!*mS!H!CK(`hQ8xyR(rkUcp zuz1g?cuO3b_3DQOL|qLit%ny$SQs|Sa+XdI%D~C4@~jKFLYERd5_En40$s&xx|`_5 zn#Z=acqGHY=FxW1+B}z|T7bOmStT!2f{m!bbmDW1@Z9aYhv*&cyD;{K^6k21uR@!a zmuE*%=Voj^VV-X5n(fkDvWdcIGJ2qMB#AhFZ`wt&zdOL<-pJ3Ui^jIJW-B=pn z^{g7F_!#(6mfi+J)e?qaRL2JXa?FSI>*y>)!}7;U_M$y-2+i4n)%LNdPpZDhan_K! zZ?Pbe%Hf(ndP5hs@i&0SPmh%bzXja@S`BVZy# zsFysN=bg{=&Kg$6ZM-<)OlbBCA(~Sx_Fx9X-h!X?7t~KBbp4L#(nZa;MJ$d{vA#Zs z1~df{tTkfix_iA6d=jfB?mv%OfAfu$Goh84B3(J?-r)e7-{6%guB*&xD6QN(^dTl3-x?)jD{-$=RAiHn_NDDQ* z-hia(A6?ZfTann^aU9*qcvUgU&T4=qF}C`4(>z=~GC__u3Pk|n&Gy(eG zn4O6m!1FT^pLsunJjn$Nd$z`sojg`7)5t)+8r6SkdX!ebNH6?H1Ww0y9vQ!rqIfd! zV_Xcf!t!cb24~B0-Wdo>t`x*QRq%mJt_+%Lg*fk!7(GyPn`OKKScAs&y}$7Cfu%>> z!*zhV;y8Be@n1HIb#HFfo<2=mK$UsR_Ib=~)&vi?rR50FgWo@SbC~hX8 zw9JX@)=ca{lYI7(CQW^@&VRKgN$$q3_Lk`hDW3HG5^^+?t$6FsziB5PfuCX|||g-&i2_GAh0@_%}E)vVFXk&B3U;g8C~7Ya%rD)d;vtneZbuCZ_68=8Wl1#bj)3!c>Harw?j8%7lwcw^H(4{My9EEP7IVn=<{fQfuHovoEFKXPLQKj z_vWN`6%I%5Cu5Offg{w~a8IVheltAJk`QNYNdNp|_1FE3Vn>ImLRZrhpCI$?+%wB? zP};7JQ2m=Q`{Cu0*y~be{vMbE*9nPt`e!xgEIE!*C#}XQq#~@$Nzz%Ipx3mpE4?t< ziDP&EnUl)`uBMM`#(Q(iC@)xw#R1*M4J$V(bpt-&mMCH2NR<_l(QW0{4{LlOmitqu2UeiHMJvs5WySA%Uf?2IY&0pf}V zgHsGWc@j+aYr;H;)B^YYgZUbi78y7+lvqpAoF=I%bNsW%pz%4@>jC`|gjD@!UgHO= zmuuSKIKgEgm3^!io+mze!XDC=MjNjxebbr8P z27`Jeo$u67X-ew;3f~|gYUcz4KB6^!b8t1!K3YCne1}l8q_*v2Dm&gsjnK`SVi}K- zZ5CbD2e&goSuj|8P1kGwF#pBcwhV@e%N(U&wJ>{TU6~x~ou$=kzOMKgD>RcJ^NSWz!b2g?z=quP{U zEMctrY1Vc`&K{DSBI{zkeIY_SHKjTaGYX<$wn(bQMEcrV$yt4MLH$A!QRt~uad)+4~hS+f5qKF5rSYs_W zXDgym;~Rjzy!rkO0RC)xR%F(D#^_`vKWkb*S8b;~fBI~03bAG~sq^bl$ow%v!ZZSJ zU&?u2#pP`{^xD0Or06~-Oc1e2YuC7G*^aN3&8-Bl*fzx(N|x23;NGGg)BvJ*5FHgj zRqbhn*W>^*ohfMA%3s&p0DG`KO{(y_A=a6uoGRkwvdHd=g@kUBVm*Bz3;NI09|}ye z&B3%C*AsE8*N}hLx7q{8_c28|Y=au)272}6knWc=PC827u`?s-bXJ{P<);Fsf%Wy4 z)hL6LRjDB8tgkF?7HHWb7f3mcc_axvWTbat2e5tepI{(I%#^icS~wWL%bChonY(YD zSh>wE1NY3e%IU%sTxa^s)B8*6#JEAf+m$^rb360D0YvD&!bowD|2C>F@tH&= 400: + # 异常响应 + print("[Type]当前页面为-异常响应") + type = 0 + elif (response.status_code == 302) or ('Sorry! Something went wrong' in response.text) or ( + '请刷新页面并重试' in response.text): + # 请求错误 + print("[Type]当前页面为-请求错误") + type = -1 + elif re.search(r'Enter the characters you see below', response.text) or ( + '/errors/validateCaptcha' in response.text): + # 验证码 + print("[Type]当前页面为-验证码") + type = -2 + elif len(response.text) > 150000: + print("[Type]当前页面为-正常响应") + type = 1 + return type + + +def random_amazon_headers(): + headers = { + "dpr": "1", + "referer": "https://www.amazon.com", + # "sec-ch-ua": "\"Chromium\";v=\"124 \", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"", + "sec-ch-viewport-width": "1912", + "sec-fetch-mode": "navigate", + "sec-fetch-site": "same-origin", + "sec-fetch-user": "?1", + "upgrade-insecure-requests": "1", + # "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0", + "viewport-width": "1912" + } + ua_temple = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.{}.{}" + headers['user-agent'] = ua_temple.format(random.randint(1000, 9999), random.randint(10, 1000)) + return headers + + +def updata_cookie(cookie_dict, meta): + ''' + 根据cookie字典来更新cookie + :param cookie_dict: + :return: + ''' + # print('[UPDATA_COOKIE]', cookie_dict) + cookies = meta.get('cookies', {}) + if cookie_dict.get('x-amz-captcha-1', ''): + cookies['x-amz-captcha-1'] = cookie_dict['x-amz-captcha-1'] + if cookie_dict.get('x-amz-captcha-2', ''): + cookies['x-amz-captcha-2'] = cookie_dict['x-amz-captcha-2'] + meta['cookies'] = cookies + return cookies, meta + + +def get_img(response): + ''' + 下载并识别图片 + :param response: + :return:图片ID + :return:图片识别结果 + ''' + img_id = re.findall(r'name="amzn" value="(.*?)"', response.text) + img = re.findall(r'', response.text) + if img_id and img: + img_url = img[0] + img_id = img_id[0] + r = requests.get(img_url) + img_path = './image/img.png' + ocr = ddddocr.DdddOcr() + with open(img_path, 'wb') as f: + f.write(r.content) + img_data = ocr.classification(r.content) + img_data = img_data.lower() + return img_id, img_data + + +def run_verify(response, meta): + ''' + 处理验证码 + :param response: + :return: + ''' + verify_url = "https://www.amazon.com/errors/validateCaptcha" + img_id, img_data = get_img(response) + if img_id and img_data: + msg_url = meta.get('msg_url', '') + url_href = msg_url.split('amazon.com')[-1] + proxies = meta.get('proxies', '') + headers = meta.get('headers') + cookies = meta.get('cookies') + params = { + "amzn": img_id, + "amzn-r": url_href, + "field-keywords": img_data + } + if msg_url and proxies: + print('[GET]正在请求验证码页, 验证码识别结果为:',img_data) + response = requests.get(verify_url, headers=headers, params=params, cookies=cookies, proxies=proxies, + allow_redirects=False) + response_cookie = dict(response.cookies) + cookies, meta = updata_cookie(response_cookie, meta) + return meta + + +def get_product_detail(meta): + ''' + 采集产品详情 + :param meta: + :return: + ''' + headers = meta['headers'] + msg_url = meta.get('msg_url', '') + proxies = meta.get('proxies', '') + cookies = meta.get('cookies', {}) + response = requests.get(msg_url, headers=headers, cookies=cookies, proxies=proxies) + + print('[GET]正在第 {} 次请求, 响应长度为:'.format(meta['retry_count'] + 1), len(response.text)) + response_type = get_response_type(response) + if response_type == -2: + # 出现验证码,判断是否超过最大重试次数 + if meta['retry_count'] < meta['max_retry']: + meta = run_verify(response, meta) + retry_count = meta['retry_count'] + meta['retry_count'] = retry_count + 1 + + print('[RETRY]重试当前任务:', ) + response, meta = get_product_detail(meta) + else: + print('[MAX_RETRY]超过最大重试次数') + return None + elif response_type == 1: + # 正常的响应 + pass + else: + print(response.text) + raise '超出预期的响应' + return response, meta + + +def run(): + url_list = [ + # 产品详情 + "https://www.amazon.com/dp/B08DFLR38F", + "https://www.amazon.com/TAISCAI-USB-Mount%EF%BC%8C18W-Dual-Waterproof/dp/B0CY99S3KN", + "https://www.amazon.com/Wireless-Charging-Mag-Safe-Foldable-Magnetic/dp/B0CSP7KHD1", + "https://www.amazon.com/Charger-Hohosb-Adapter-Charging-More-White/dp/B0CZ3WXFX3" + # 评论区 + "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F", + ] + proxies = ip_proxies() + # proxies = None + meta = { + 'headers': random_amazon_headers(), + 'proxies': proxies, + 'max_retry': 3 # 最大重试次数 + } + for url in url_list: + meta['msg_url'] = url + meta['retry_count'] = 0 # 重试次数 + print('[START]',url) + response, meta = get_product_detail(meta) + + # print(response.text) + + +if __name__ == '__main__': + run() diff --git "a/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2102_\347\252\201\347\240\264\346\214\207\347\272\271\346\243\200\346\265\213.py" "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2102_\347\252\201\347\240\264\346\214\207\347\272\271\346\243\200\346\265\213.py" new file mode 100644 index 0000000..510d122 --- /dev/null +++ "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2102_\347\252\201\347\240\264\346\214\207\347\272\271\346\243\200\346\265\213.py" @@ -0,0 +1,42 @@ +# -*- coding: UTF-8 -*- +''' +@Project :wbh_pj +@File :123.py +@Author :hao +@Date :2023/10/24 14:56 +''' +''' +# 亚马逊所有页面都可以采集,最强的方案 +# 目前只有 safari15_5 / safari15_3 指纹可以通过 +''' +# import requests +from curl_cffi import requests +from wbh_word.spider.Get_TJ_ip import ip_proxies + +headers = { + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "accept-language": "zh-CN,zh;q=0.9", + "priority": "u=0, i", + "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"", + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": "\"Windows\"", + "sec-fetch-dest": "document", + "sec-fetch-mode": "navigate", + "sec-fetch-site": "none", + "sec-fetch-user": "?1", + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" +} +# 产品详情 +# url = "https://www.amazon.com/dp/B0CS28ZLWS" +# 评论区 +# url = "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F/ref=cm_cr_arp_d_paging_btm_next_2?pageNumber=2" +url = "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F/ref=cm_cr_getr_d_paging_btm_next_3?pageNumber=3" +proxies = ip_proxies() + +response = requests.get(url, headers=headers, proxies=proxies, impersonate="safari15_3") + +print(response.text) +print(response) + + diff --git "a/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2103_\350\207\252\345\212\250\345\214\226\347\210\254\345\217\226.py" "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2103_\350\207\252\345\212\250\345\214\226\347\210\254\345\217\226.py" new file mode 100644 index 0000000..ecf49c9 --- /dev/null +++ "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2103_\350\207\252\345\212\250\345\214\226\347\210\254\345\217\226.py" @@ -0,0 +1,37 @@ +# -*- coding: UTF-8 -*- +''' +@Project :wbh_pj +@File :selenium_demo.py +@Author :hao +@Date :2023/10/23 16:49 +''' +''' +需要过验证码,暂时不写 +''' +import time + +from selenium import webdriver + + +def demo_run(): + url1 = 'https://www.amazon.com/dp/B0CS28ZLWS' + # ---------# 下面这一大块东西都是用来隐藏selenium的特征值--------------------- + + # chrome_options.add_argument("--proxy-server=http://114.230.23.140:3658") # 新增ip代理 + chrome_options = webdriver.ChromeOptions() + chrome_options.add_experimental_option('excludeSwitches', ['enable-automation']) + chrome_options.add_experimental_option('useAutomationExtension', False) + + driver = webdriver.Chrome(options=chrome_options) # 核心为下面这几行 + with open('JS_2.js') as f: + js = f.read() + driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', + {'source': js}) + + # ---------# 隐藏特征值---------------------------- + driver.get(url1) + time.sleep(123) + + +if __name__ == '__main__': + demo_run() diff --git "a/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2104_http2.py" "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2104_http2.py" new file mode 100644 index 0000000..00dd05a --- /dev/null +++ "b/\344\272\232\351\251\254\351\200\212/\346\226\271\346\241\2104_http2.py" @@ -0,0 +1,42 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# @Time : 2024/5/22 18:31 +# @Author : Harvey +# @File : 方案4_http2.py +import httpx +from urllib.parse import urlparse + +# proxies = { +# 'http://': 'http://172.23.64.1:8888', +# 'https://': 'http://172.23.64.1:8888', +# } +# # 为代理键添加正确的URL格式 +# proxies = {urlparse(k).scheme + '://' + urlparse(k).netloc: v for k, v in proxies.items()} + +# client = httpx.Client(http2=True, proxies=proxies, verify=False) +client = httpx.Client(http2=True) + +# 之后的使用方式和requests一样 + +headers = { + "dpr": "1", + "referer": "https://www.amazon.com", + "sec-ch-ua": "\"Chromium\";v=\"124\", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"", + "sec-ch-viewport-width": "1912", + "sec-fetch-mode": "navigate", + "sec-fetch-site": "same-origin", + "sec-fetch-user": "?1", + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0", + "viewport-width": "1912" +} + + +url = 'https://www.amazon.com/gp/product/ajax/ref=dp_aod_NEW_mbc?asin=B08DFLR38F&m=&qid=&smid=&sourcecustomerorglistid=&sourcecustomerorglistitemid=&sr=&pc=dp&experienceId=aodAjaxMain' +# url2 = '/gp/aag/main?ie=UTF8&seller=A3VQLMMKUUX89G&isAmazonFulfilled=1&asin=B08DFLR38F&ref_=olp_merch_name_2' + +result = client.get(url, headers=headers) + + +print(result.text) +print(result) diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/README.md" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/README.md" new file mode 100644 index 0000000..e69de29 diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_detail.py" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_detail.py" new file mode 100644 index 0000000..3967544 --- /dev/null +++ "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_detail.py" @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# @Time : 2024/5/18 11:51 +# @Author : Harvey +# @File : get_detail.py +import requests +from wbh_word.spider.Get_TJ_ip import ip_proxies + +headers = { + "dpr": "1", + "referer": "https://www.amazon.com", + "sec-ch-ua": "\"Chromium\";v=\"124\", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"", + "sec-ch-viewport-width": "1912", + "sec-fetch-mode": "navigate", + "sec-fetch-site": "same-origin", + "sec-fetch-user": "?1", + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0", + "viewport-width": "1912" +} +cookies = { + # "csm-sid": "713-2299262-7567932", + "x-amz-captcha-1": "1717480655829841", + "x-amz-captcha-2": "THz/dORI7f9MPEYpk6zAOQ==", + "session-id": "145-8954814-3186315", + "session-id-time": "2082787201l", + # 上面这部分为核心字段,过了验证码之后就可以得到 + + "i18n-prefs": "USD", + "lc-main": "zh_CN", + "sp-cdn": "\"L5Z9:CN\"", + # "ubid-main": "132-1464863-8061124", + # "session-token": "m4jtyQF+jZJqVW/adslOeUE7aWcay+oPVttMzoTlqWO9R9VCk6M0xNooY5RmGRW9eOBxpsP949PLbSn9eXz1ECwAwFVxwxRSWZtYLjcpY/70/WSGpis0IqQpRZSPI5RmUQgi/1lHq4qB+zIqJoudzKwXxCt7ihAa4fhbjcAOJjVsAO3pxMHfOH7aDjRw3wHt4xDaW53dyRENzIaYNvwh+KCkzK0w5SOxz6fxuY6v9zUsuWLt8pZmtQ75YoU1C3+Okt2scs+5b+jt+1dl/OTQ6oHj7QyAqK5h0MFeVM9jEkXgoubepR1OgB0YWNmMD3wCrb3sB0NtbZThvFJmWxOV3Bri1TQREibq", + # "csm-hit": "tb:7NNEK8EZX7MDKY0R3SZH+s-7NNEK8EZX7MDKY0R3SZH|1715936997814&t:1715936997814&adb:adblk_no" +} +# url = "https://www.amazon.com/Munchkin%C2%AE-Brica%C2%AE-Stroller-Organizer-Bag/dp/B0BPMQQN6M" +url = "https://www.amazon.com/dp/B08DFLR38F" + +proxies = ip_proxies() +# proxies = {'http': 'http://613706c5ede9d:kF0C7UslCBzxXdt@114.239.95.42:3328', 'https': 'http://613706c5ede9d:kF0C7UslCBzxXdt@114.239.95.42:3328'} +response = requests.get(url, headers=headers, cookies=cookies,proxies=proxies) +print(response.text) +print(len(response.text)) +print(response.cookies) +print(response.headers) \ No newline at end of file diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_img.py" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_img.py" new file mode 100644 index 0000000..a5b6be8 --- /dev/null +++ "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/get_img.py" @@ -0,0 +1,49 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# @Time : 2024/5/18 11:46 +# @Author : Harvey +# @File : get_img.py +import requests + + +headers = { + "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", + "accept-language": "zh-CN,zh;q=0.9", + "priority": "u=0, i", + "referer": "https://www.amazon.com/dp/B0CS28ZLWS", + "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"", + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": "\"Windows\"", + "sec-fetch-dest": "document", + "sec-fetch-mode": "navigate", + "sec-fetch-site": "same-origin", + "sec-fetch-user": "?1", + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" +} +cookies = { + # "csm-sid": "602-5089573-6274975" +} +url = "https://www.amazon.com/errors/validateCaptcha" +params = { + "amzn": "b+TKPZCS+956d3A6Vjh14g==", + "amzn-r": "/dp/B08DFLR38F", + "field-keywords": "jtymlx" +} +proxies = {'http': 'http://613706c5ede9d:kF0C7UslCBzxXdt@121.206.142.66:3328', 'https': 'http://613706c5ede9d:kF0C7UslCBzxXdt@121.206.142.66:3328'} +# 请求成功后会返回302,需要禁用自动跳转,先获取cookie,否则会自动跳转到详情页 +response = requests.get(url, headers=headers, cookies=cookies, params=params,proxies=proxies, allow_redirects=False) + +# print(response.text) +print(len(response.text)) +print(response.cookies) +print(response.headers) +print(response.status_code) +# 当出现302状态说明请求成功 + +print("======= 重定向 =======") +response = requests.get(url, headers=headers, cookies=cookies, params=params,proxies=proxies) +print(len(response.text)) +print(response.cookies) +print(response.headers) +print(response.status_code) \ No newline at end of file diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img.png" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img.png" new file mode 100644 index 0000000000000000000000000000000000000000..7d0dacec8dbd7c643de55684e44c3b277fd726e5 GIT binary patch literal 6064 zcmbtY_d6R7&=0k>b`@1>3!-XowMS5^M(n+*y{TQq-dl|lJ0Y>DniZvjSVe1()*eOi z_WR5G2fXjyJ@?#t?zemH?sGqX{{ujwp`xw=z`?-*sQqKW{SrX&KkxrV_n!cyL;w}Q zC=L!K;C>Lm2*3g0KKR!X@V`V%LP$hPfQSDt%SQpg!NYw3z{UNiaPa??{PXY$a0mfJ z#3WRd)HIK2pU|0F|E*fh`Fh)v z>+y?@<<36H)zI_w z-2;TtsFG+u>I!NG2s1Jyk@|Qa5H)e@{>X)lq z^kEpAT`e)}RT38J^;P>N1FKYJ38~^1WNT-wE5Jl34tJOuM`*dG11!+HX3B%;?BnT z1VTl&5_X?R8RJg>8Hx#i*9$y%n;e9uo8+Xz7UWa!5I%2qk~SIyRy$46Aq)`U*hH&4 zuhE@Axj){rlfj)F&sfPYCA0}R>?A*3YH_El1EGR1LO3hiE+gs?4kkFMxvfrdVkBx` zCC&eY1jUzrgf<<%jY*JZQ)2ZVV(pyICz@Lrn}7JdJiL4S%Fu~_;WzvFeh0&ZP~}sv zW^=ocXPDC*k%qXLDfp-I>X=1qtB*i`V*F8iFS!2JXeE?LL5MXMF%52gwB+l2>k@bz z^2EipmN7p{IMC~ox5^*k6`6L$n(``Yd^F}tD??3%Ow?F0pXDcsa}mh4J1a<33XHY! ziAkMze}0{X6wDGra!<4)o`REn4VRSGa(+Uqs$f69Wz$T&gm#X}?%V|Lg>TMCk?GEB zkWJ4S1;qha-^_O=W83V~l&tKDh%+RpnAAq@0a5e_0D(!MiZM+~6l46;K=`*U_4O19 zh0^@jVHb_X3o^ice&~`|vaL;Otz{P>{IN<48jH@v{lY)oHJg|2&HlmLDvDZX@qk^( z`_&c46d24MYWQp@AzU`>F#n8nT#K#c(7)0wfV0ipQuUytb+{IUITYr2xS{L)uzvSp zwg{+^lRI-7Q}foJ)1m5hHiIlE`YDv0H3YWgypnmPzw~=k6E0)4KwL)LfD9eQuLQQE zx0d0G`%VdC`yGGlZpPkYO-P$v6bwI7LIS(1x#>f*o>WXBo-Hr`Zfcfv zP;K8-*HBN<`gx_;Mc;FltfmgmkCL;>sl5lp?=VyilTlLk(hKt;d@7pBSA4+m7rcyk`DAce+W* z+aMAADt$#^>@2iba=*G^gd&y$h*2-Jzz*4#Sj~_S!Fz+h|Su=R92iMkNlat$d#!%zNz!h5F{Hl^^GxT zgB*ZEOE;;e3PN+rU5r^~f!eS&zsM}1@8~OZt%-|XR=p$1xL+vxhNs7L>!4--yIGt{ z;cuqDQ4D5`QaLB#ZPU63tMw9N)TJ%+Ppo8MS(Uu>u5`A#y^S8FJaM!aY zc`JX_jH(?HAa*$Kvz)f)T;4t4f^5*MyMhOt+!Ks6Hg}&yAl6+qX6iRhCRs{1-KVu- z7SgK=%cx$ZWUbYn@ZgHOqTV?=8(D|&p^*7A2J|d3c1Z1D9#2*bPn>PA8^Nn)wOiH) zw2QMy-6K!jzal!jndJX_D7&UTjk0^K!E^hk8*qLo#7v8(ZZPUOBR$A@+o3=2ke^VUZ_*1ZfhF^C4%!ph-g zku7ytw(z_LT{UiLRR^Q=sr%M6MWjl)p&U8?X0O3t=bzGTzt70sc%BXSxnt^Ggb7qbeO>`t=smH>~Jp1*1(WgcK9^2%ln;dk71S`l9_+O$XF2P{Qo z&AD;=j~u@!@4Rfd)b0QVx;cONQ7+urdJic6l=ogexsO(3{YI?$CA>U7EVp8m?PMuA z=W~hQq0zDI_reF&7Rdp;=tqsqkx^WwEyWL>*uA_yalLsUuQn0u$Rk^v$${hj+Il(_ z*D5O|D3OV?T*7KBhm6Gc_S;pJCvjA3s;VRytcXEm(lFHa9~P-<)?I++&sZjZU$`~^&H?98@Z z^3vF?=ci_+GBLi%+%Nb2BY*SFS$GV7zWA8A1C?PPv2n%gG&Y@dp`p)n6^eRFHX??n z=l1}V}) z&qWu4_eODAyej~mQ+-4A+B>^=?oB`E=44F_XHp$P z@8&Jgw`B1qcv%mLdMoanFV2&tbUk~eo9DmA#Cd_@YA%uWT?&U(54noS0EXD?wHaS8 zze4-%3wCy{N{H9nj~EAT=vcI7=IX~YQfXCCKoN>2Mxy-iYs_8E1zX5mcp{m6;{YrldCdHfrMC>M_DAydO|I?di!5 zOx&UB+n3sw@^@TeS{Pa#2>*7nZ{79Q?M4cShIc zdLP(rLQj_I}^LwbV!V;MG%Wm+Q`drz~QwJqT3AkD9?D zdqL_+kG2rnur)*kCm*VUyL-w^(kZ^OnSa9m((9}8PN+0N4d@jnIn0+N+pFI=h^ZTj zeBUB~6q<-rcDJ`SuS<-Sv8^{8j;JyRPj6pVO0*|1?5BK8f7<5HU!2j*MXLV`q0BJ9 zS@C)Sa)T<8Oo~k-Xue)FRRHn?@P&y^^53b4cO@&%tYo^o929)ZLX}n4r#p_irZXXp z0Y#aqO8Ms?VM-Ag8fvUtMU#&z8P|#WGSrK!H9qA*8W>*Xmt^WY&`{u6CmAiI?V8L6 zPOY3pBQkJMCi042CrO3y!pHQaboNpHnk^2TV=3{8cnOM$fyO`bAOs${rNCFoy8$pJ z?dt>pF%e0h6rov<924d!T7Rw>M%>);gd_bVw}f?9aRF7krvu%@tXUC1=Fuip5elUs zCouq2v1wTC?6MF@KZx}sPqb1IjKb%3EF4Qa#S>Wf6x+#877M(szT7I@NvN)WgR|M8${(%IVQcM8{G!zfg1&kjm6N&HjvuD||C0~VM zj|;p)L1zXV=Zd{DC%x9Yu7+!iX$;{^qds)%RhbryJ#{E@w`^MymO@apjcyL4;Im2p zc^hR>`@Rd9A^DK+ancvh;Opw$9Jq2KLu!w`OIuImGn1B#U)fljBwk9LRhnsPAekv; z_RU34T78UG#Kjkuv(d3^%k=079CG6?w;!8Tf`d*LcJJWt_*@;z_ue>aFRG^{0bv@S z2WQWmWmiVkx#L>m9??S1-D~#;)SXG4KGWgZSF^U|zH}@N3x?kVEP_Da!`$#Z>>7|D zkGq2pqS%+2)UFtgmk0`Beac2d9&8 zUA6A%)~%Vm5A!_sYU%OIvt6vlpxLIB{zoAEs4|psz~JCdGtY0`EcEsTLjh0h zFFi-J)K|zK1?c;mN6$6%C*qf?{t9^GiJdyOCiID05NzwwSffyq*|z+ZdlW|QF+(2i z!qH_P2#?H;ERHFpyZL$bb#^ED>KOg4$2jAz_(#CVD=z2XS?p)YS%6K^TSGM4W03@) ziirSFJgAG?hg?x~0{`RJxJ`d9e%1asq<-#%oEpb<*YVEzdcBXg9z}>nMN65;M0Wyc z_2>Ao(08Xcq}B@O3z9L~&#(cPo>C`XK1YrRyxi*vUi`AMls2XB=M;A&*C&X=F{{WS z7gOhpC(dql?)=ksjmpecjV^yeiECe_L<;|866Jo17pt;4|L=1+n zFB9kd4Y<qsuuq}fU%}&ytF=OiOFsMZeSR8$2%#DT0(B$9zE_wHT^ z@7H-X8+JEuKwYu=Fw5m!It_FpyYsO2C~pXB@3K?KgU!Q6OQz&5-bIhNm4>ePIyfy1 zXj1h_RC@LN`ZLeyc$I&3H}If!U;{48@+GM_t40aVs1=z@JGQ8su4d*Kl)Ce`l6jQW zAV6d4P0Y+^e^rFuENDclu@Y6MlYF7RU6YH89gow|DN|kHi(Z|0Hn{$Ba|HCDB?~$& z(VQFOhb9c{Rq8Fcrgx9Y)v`4di*Pl~7o&O0k;p%X%7nF=Bn(DtNyfe3zi8?jMxdsy zM>Uw#(6^2X~C zK%jY17hj=<9^LJmxFnyG=KQ~EVzK6F_kh~aoSlPd9z{VzfN-m5OfQuaj~OuGNhMezmi zwYfGKa_m$39A7u`V>l+(xfq5DJgBJ?MHX9?HEXyZs~bPUoiHs|?xq{b;8ZyBtj^(2 z2{M|M;Rvl)GZ4+sR(G#PJ?B*UWJr{>HQzjD0gi+do-2GVs`-rW$86w)0lA;><~{6= z0LHfz%U-FSMacA?9Nc;}C_iCoxo(l7@8NOlpDTB86Ka0>`>|Qin>5{qH6|s@;wAbtG;v#?9MBd3-+Si zOv~QBI-2UNZKt#oK45m^m5dqdlTZC%PQ`B9pPMSlqp3okocXZk3ZUW654GZKh9^

HbW=RZKg(o|MGX^}K^`39VHIoYvcS2Dmt z&riFpDfUg-e&--ave+y|BaKGHzxM$p-0J0}UXcYIg+Hdv9I_HN*)<9BXh+srh`uOY zW?MWfj&3|GHuyIE22^vQJx@80eN1-=q85K1HIEo$en!J`xERfcP$!rukLY={y)wFu z$~DtCl}*4%t~PXfLY%bZ0g1ivnXDa%2q!VQq{h3bC`w$C43ZFuzyY;uzKZbxx#kwj z=h=5O2VRNIp##Ata$Uh94I%c8JbjB6osn=kRL7*Mqy`@)gT@i3Egu0?{3|aKELpcRa8C@)u8}P}Y||6WSjyo=H6GxRI@X_pTRr5VM`j+`I$b%Jp^1 zC;T4xtJ8ObQeW#O=jjO)pPS8_oyy`F$M=>UIa8$rLe(xKthb7~vIppCv&)YM z>OuX`LASiQn#-LJ1eSM4QCJYe9$;X^kOISg$S=9*eT83_8~@yE%vqCWfW5i9n%6Ea z?G!vln@gE3x@Hh-PsYNl?E}3v%&KFK<)R<7g|Bnv{qmQ;o2#o{jQKjD`F*G+AwlDm zYWC8DTJKLW&*t4zuBg4Qki2a47LM?szH-ztsqwzTBBi3bof)6>Q?f^-)K4jdn)q^6 zlK{pS&sZjh8N-HSoO7@zM_E5TU=ol%x*J|OsLxsZdcomR13CSoY~6)WBQ>$@A5*lt z+p@{cmtnzq0(02O5oHE!!mAYfS1G+*gXH36wt$*jcj>LxdN$@yzDvsLc~i}^g#8z8 zzMPYoy>d_Z1zF1r>sOCyY0mh^VX^cMk@chYBNgoW1}&}#`;W@_mqf|@2Ltiebm`Re z$fkb?tgNj9nVR5wr4rqr)8SCINRhb2hu_AW-Dp^BdgJNXZH&F#dIv^LUkf8qYc#Qr z^d~dReog1|Bk`{)W}I=yvKBDWGj(a_Si7%w4$*dB2-Y4@a%PRX39zwK$D2DDn$RPw zatal~a{mK+hYI_^t{%fKpoH?aM*+0U=E&NNfAi(6CzuW_X!x4)q4L#u77%_C`ST+C z&uu@0RQum>*(();Z0Cr=5_{*v7qCKCgY)ZcID6~C(0uYQ$Z&i2cq+5qMY{8XwTGuan z^Mm5-3=i4Z{3o|+ATqeymk9H^k6!Ev(~=OiA-%L9ksBcco=*_SLG$1G{134HKg;C( G!v6plmz|yf literal 0 HcmV?d00001 diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img2222.png" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/image/img2222.png" new file mode 100644 index 0000000000000000000000000000000000000000..91af59d8b94bc9e65880177ae16baef7f81987ad GIT binary patch literal 6121 zcmbuC$lGNw%6Lv$mcx(Aw*eC8Gwd{22lNX0M96Z!hhUgF+FE&pMVl{)zmzr+DEB|vqG|Mup=zqR+Ay%}t-QZXF;c2yJu4k;to1987)lj0{a80{NU7!{jI^mP+U*3UKv zMpVz!zE9Lb;;!>R*>&PFuY#JD&}4WghHsCUIUkhG+ZBMNQ_9E=&;xml%A{FB-E!gHa1 zu(i_Y%89#M?aBvzrBja;;eE<#sRMOI-VV6`I`V@Tkv~oQ3wxvdF?NP`$$v;2niX+; zS|aZ+8Z|1iwH*o-0?S^1cd%+Wp|zY+w)te>6s{t~RVbKmLX+ZIAM)uqbLo=CF)+`( z1qwS|YfhJ8-HU7NsIjZ6;te7nZ5M{NL@+@!4OH1}_bbP;l43n$x+Kt)bscB$C8`Pw z-pBPwe!ps&D%ao|?h9rI3YYfRh~xn#TcORC;L|P-7^d$Xnl3EJkbG>I2njH0$qZ&u zA!SH>$@YV&cAX%nQ&SW#dNLG#b>VTf1Ug^@)NC-vByNhV_U+HVv@Zbmwo;K&^Q<$c*_m)TsXgmPAZ@sIo;m z?n9JRlkQ6S;gB$*+46~%y<^B?{t>z}isPdB z8nUyss^44GlG4JJ`I>!NKDHWpL||%4vM?#HU7|&7!ZJKUK2{g_IfsD!tu{c~Q${It z1A5=YVD5hzCrU#RcI&86Ay505j$aFgqeY}91mU#Aq z98d4(62y1bU+uwN^u~{Yz1qU4h8^?HFFul_M^9SCs0JsUYW~B78-R)X)r__&-VOu%EqFvKs`?Af2(x{^w?Rc}ImS56dawdfZOJ0ILz&vK61o1j5B38|f z*0lR)fP-C=K%c!eA=lYZB;8+KB906~y~3?su#D!zjP13%wvWm{0o6v(LQ|fKwb7N7 zQcVm=HbPKZ6ee#!xVJ7~C>P2HouZ zdR7823LvgRy6(JEt;Q7?VoSr5 zS!&1lD7)F+Irj)!3|{N>4C1_M=4m3@?q3-#scx^~pB-tn9*lcoQBC~oXGBfRJ#Ipc z@wZdOktb}sOD4a5$;>I3+B9VoJD1r7-D#nSY*u>#0S!Z>gQUSgwLuQY*3t2@$WGPh zgY}5Zhm&&-^c%FA319L12aS@@i`xTE{k=fWwYn>uIjxMVf3go1we@`H*yPV?>q0m* zGv4FDC>(?u`&dmLMe$l@!Vir{265T3Cxnw4-06q<*4h*B&srKBD!9HhI*Y2$HRw7$ zrqDpEM`>iiL21`trWnWJ>GElJHs^RpuLGU|JHAKnE$Grw+e24b$g^?!mWzh=Cq%cD zn*6k?Xl=tkEzldgGmok8*sLsuo;h_hS>P?gqeUn>t{0odR4tKmQXQ!syDr~0=o5Fk zTWwYyAGb>r(~ z*x8)Duu)G3bInx4f|hx<2%AS}*e?51Q(0?@7#0s?bomyFd2rRRtTI+%Q%Nb2wLfHI zV%FDcZk_><(}7e}CY@552rio0W0OW zVWINe1FyN?tAC8%>DGzyoBVyLvtGFGHxCHT00s+hGzMKGCrI*b zl%ARDyHrWIVlR<>&zdW%>KIt!!;0cGy52gu(;2_hlcaRVtBxJW6`$qQqHPCe@p_n} z+vnG*N?ztt2{P|9d#fgA1g8Hyk=d86?p16`m9cKH?@GXF13KaiMdP|dHGG&6(qHQQ z&foc1<^}Cmxp<2}jIKp{^J%$tF7)Cxq@+&itb@p9mXHo0^~Dn7jkH8OJv}}aR;hbh z=h|KFo#eZl6l26WhtWasHQM@H5@l&C2 z;mk5V8q8L6vEyezdEaWZHDYsl`N%Wd|$wG7lNnhkxSVe~X)pbg>aYi;!?`0sV zRP>3uF-srmCk)T6p2Xu<4`$aIm431T;nu%7MoWxKqq2Vy^>T2b;58FJSQ?L(mzaG*N>Od3>(OaEyuC`V z(9MVZpB;QLug9!@m-PPdl~S#U?b|8Hz;c9jZ<8>U*amewlOF3W1hQKu%}R!?-VXED z&Q3X0kl5(j*Y`iy5o|tvWnTXTyl*!@-Bl9xW$J(Bnbv#;!DFKy))P51k#NlKe`&ew zeP3R^?E3O#^UUd%m`S^BxZPZW+knb8-r;X3PaYDdV0bSs_ z5L>urrwLLaJ1fB+A2~!oEHI%KK^imp*d+HNjnZSj?cvMK?MmJ^!X4=scW8m$-;u~? z07I$EDS}Njp8Z>Sk+Nv{(KDdR!N*2CH+K+>2 z+71+J+O)m$r=CC27Y8vLu2P$CQ&L*2MK8lf2S44hKW(c~hjQD7+LWFX6l1%c7yr5T ztlFj1il%t6N4m;<2VXqjzN_=YzqhO5Xeo6#i9h}MEwX4$cyX!p>1E^|>2$mOA>m&4 zh$5HSSCf(Lz{B1uS)7n<0k-&G=V}e3{dk^;hb0`Rq!*k?>s*v5@IVh;2PUu)g_fvnYjK9_BIcrF0s&NTWkt-XLUK_-Qom#ld=v-p+!mYM@^w;`}5w%`^ zu!x-xwb#Z-Ah0XtHTZo_Od!=-DM}H52li~XofLW6^U6u-tVqT zCu~m0<@8XtorZXJk(nmYQ&aUu$RoW|)UjDytE*NfU5p zaA~hCq7*Ejn^%tHQGug$yqZPIV+zi?_{5#9kvd?ik(ECM^EB!KhDQ19R14tM0pY)| z5N5|$BwZmxO)I;tKltihyJtGGDmP8Ag9asCw)}ZB3(U`EorZA7W2QAliRIU30-l)b z%#3R_lKYxu-f*{cSLE4sY}mDdW>gh4K}*CaH_=UxbkR+ccvhgiXLvEz1abNS<%D3o z4ej2U2z2giKlj`2Hwv3BBqdc;xsQ(?EsMm~oT}cJH1saoxbnm&ZhbqnZYECOQGtEs zYlt?4zeC{0PYZZjvn1@pov@x*OM&u+M_Vo44Qx)1$iG=A*W~)LU8!ZwavmRL>C9kK z&H2XF07FGRnYC4L(O9ESxQv_0wK3hl&LkSMpyAPCVCA|kA@lgHwc(NGsmKl`>ZQG| z^%tW?M3E!tSVEFcQHYtyVaDk5pB$moN=q9I55MTCZKvPs12VnW_@&$Y-t)>J*AT0i zdZT4zFj;QNig(BTo39`V51j@jEtifxHniv#D#@hH0Jz#9TIH1`IQ-- zo$350hK2~p$`ba&+Vo+xp*&*coUP|W*-;U?eb5Iztkw=!t5Q)dyv|>pvB-8uy#pH3L7D&u&{@*&Rrb$rAoq^RG zgaTFf9!3rlMAZrHMKJ|dl)r5al>XsZVyo&Q7>zB)%8hF^?`s~ISN&K0SC(sVg+pd0 z1*i)Jo)I#C%>LLsmr>R{CA^>R z_yBogln$v?m2B@P+FlRccH5Z1;(~M$d67+H_!J|(xtR>1^778R0pg|u-}U5D1%A(P z0-5jEiIUc{`4QloC5uG=`g-37$j`HGRDwkp^rOB^UxkYLGXREkbG)205S2WoQwC0T zP45iVSgMPN;s?#f=6? z&r8M(To&Ti?Ywk>w^vMyA_U!07&of5e5D09($k73bkRETHN4+{dRr(sH$IgQx8cz} zQW?+UnI-*R+pjP=jVq8ux-57|M)sG@Rug7S=xALYnX9JNmTHsjX5vjYcS+Ng#h+ej ze?PZq9`CH#Yw)w8VB`1A+%fX_zBX;vK)F{m_hT^Ts#YQh2;TF_U3*t=z{tuUojIs6 ztr3Qk!YH2^T=0>aUDq(^8Bo^t6jR<7=}c{7taD{%)h6)iZ8Y^ubG};Ay2Q=QNW(o{ zjjpf6?Fab07T0}$YeiVEW|V&suu5|lKXk86s;@pkPGNu*`Wwfej6%h+uAaF?HHy*` z*2K83;{{q;+@E2#tfzNFDKsCUWsyc{JU|u~-b*;5u-yMhd+1kUhuid3M?WVH3AY%! z_ib2@E?x$gJio}pxb<7+$>EtO2_iw9j6;HP2nqFZ#+^ifm(92Ml@FehjYUDFNBpi) z8%80awG9Xa%H8NB!9t@zCihIW1Xo~yUet3(6S~omV%pTcsX82csU2egaRL54T{RB;b@0E_A4p!1#?&F_;+Jg|&s`yaTm>jheDrhq=KC z%RAx^4m!+BSMv%npbb0gpoC&0e1n+blVJ2lJom;DQ_F=!YdM|e;V;z3yl9*()TRLr zGEw3cp)FBa@nhe_;92lVw+N9|&!v0P^4PJ}_`oLnLpLvdpHt!#;;y+yhH2)QZ*(kjyLss{-cFCJr^OMl z2$ysC4i=IU$<;n%9Ui4B?;HcNnj`8rnQ0C z7TE#>J0p4YsG)hjS$Rt~6*f)RTD~qW0 zFVD#2O{NR)bM^!s=8Febd@R;lRYD)6iI_2@!BXW@KwohVw9Sm6xFWK6c~$;+!1*oW zH(Xi*_~XusJ}DhTf&n-p@J8>2fMwEtC*eszfHmO(-#&f^s%iw$I6lVcbg$=`AE->K zlkKWAVJVk0p@$e1Lg9k)Yq_Rjc4ftFHe7lc`K;m-2Kh&_=MoE2kDhQ^Pq=t@mnsaz z)BkRYh{Cb~$+Qq&_$d#dW^B@BqDzOa$7IfP9gPX$Qzd>4{0Gl!AD8Gzg|U*#)6wXa z(s~=784NqG?#Ja{U!8=9r=D7+x{(zZ%TP6WvPTZ^t?uuzzr_`cUl80MoZPS$?&Up` Onhp}h|C>E}-uNGYtFE2^ literal 0 HcmV?d00001 diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/run.py" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/run.py" new file mode 100644 index 0000000..c67f903 --- /dev/null +++ "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/run.py" @@ -0,0 +1,231 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# @Time : 2024/6/4 16:51 +# @Author : Harvey +# @File : run.py +import random +import re + +import ddddocr +import requests + +from wbh_word.spider.Get_TJ_ip import ip_proxies + +# # 目前所有站点的域名和cookie(cookie有存活期) +amazon_site_info = { + # 20500 + "https://www.amazon.com": ['美国站', 'i18n-prefs=USD; lc-main=en_US; session-id={}-{}-{}; ubid-main={}-{}-{}'], + # NW1 6XE + "https://www.amazon.co.uk": ['英国站', 'i18n-prefs=CNY; lc-acbuk=en_GB; session-id={}-{}-{}; ubid-acbuk={}-{}-{}'], + # K1V 7P8 + "https://www.amazon.ca": ['加拿大站', 'i18n-prefs=CAD; lc-acbca=en_CA; session-id={}-{}-{}; ubid-acbca={}-{}-{}'], + # 10115 + "https://www.amazon.de": ['德国站', 'lc-acbde=en_GB; i18n-prefs=CNY; session-id={}-{}-{}; ubid-acbde={}-{}-{}'], + # 1011-1109 + "https://www.amazon.nl": ['荷兰站', 'i18n-prefs=EUR; lc-acbnl=en_GB; session-id={}-{}-{}; ubid-acbnl={}-{}-{}'], + # 11455 + "https://www.amazon.se": ['瑞典站', 'i18n-prefs=SEK; lc-acbse=en_GB; session-id={}-{}-{}; ubid-acbse={}-{}-{}'], + # 1930 + "https://www.amazon.com.be": ['比利时站', + 'i18n-prefs=EUR; lc-acbbe=en_GB; session-id={}-{}-{}; ubid-acbbe={}-{}-{}'], + # 789680 + "https://www.amazon.sg": ['新加坡站', 'i18n-prefs=SGD; session-id={}-{}-{}; ubid-acbsg={}-{}-{}'], + # 11433 + "https://www.amazon.sa": ['阿拉伯站', 'i18n-prefs=SAR; lc-acbsa=en_AE; session-id={}-{}-{}; ubid-acbsa={}-{}-{}'], + # Dubai + "https://www.amazon.ae": ['阿联酋站', 'i18n-prefs=USD; lc-acbae=en_AE; session-id={}-{}-{}; ubid-acbae={}-{}-{}'], + # 999008 + "https://www.amazon.in": ['印度站', 'i18n-prefs=INR; lc-acbin=en_IN; session-id={}-{}-{}; ubid-acbin={}-{}-{}'], + "https://www.amazon.eg": ['埃及站', 'i18n-prefs=EGP; lc-acbeg=en_AE; session-id={}-{}-{}; ubid-acbeg={}-{}-{}'], + # 00144 + "https://www.amazon.it": ['意大利站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbit={}-{}-{}'], + # 08358 + "https://www.amazon.es": ['西班牙站', 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbes={}-{}-{}'], + # 10115 + "https://www.amazon.pl": ["波兰站", 'i18n-prefs=PLN; session-id={}-{}-{}; ubid-acbpl={}-{}-{}'], + # 34000 + "https://www.amazon.com.tr": ["土耳其站", 'i18n-prefs=TRY; session-id={}-{}-{}; ubid-acbtr={}-{}-{}'], + # 83331-000 + "https://www.amazon.com.br": ["巴西站", 'i18n-prefs=BRL; session-id={}-{}-{}; ubid-acbbr={}-{}-{}'], + # 75020 + "https://www.amazon.fr": ["法国站", 'i18n-prefs=EUR; session-id={}-{}-{}; ubid-acbfr={}-{}-{}'], + # 01830 + "https://www.amazon.com.mx": ["墨西哥站", 'i18n-prefs=MXN; session-id={}-{}-{}; ubid-acbmx={}-{}-{}'], + # 2600 + "https://www.amazon.com.a": ["澳大利亚站", 'i18n-prefs=AUD; session-id={}-{}-{}; ubid-acbau={}-{}-{}'], +} + + +def get_response_type(response): + type = 9999 + if response.status_code == 404: + # 判断商品是否过期 + print("[Type]当前页面为-商品过期") + type = 2 + elif not response or response.status_code < 200 or response.status_code >= 400: + # 异常响应 + print("[Type]当前页面为-异常响应") + type = 0 + elif (response.status_code == 302) or ('Sorry! Something went wrong' in response.text) or ( + '请刷新页面并重试' in response.text): + # 请求错误 + print("[Type]当前页面为-请求错误") + type = -1 + elif re.search(r'Enter the characters you see below', response.text) or ( + '/errors/validateCaptcha' in response.text): + # 验证码 + print("[Type]当前页面为-验证码") + type = -2 + elif len(response.text) > 150000: + print("[Type]当前页面为-正常响应") + type = 1 + return type + + +def random_amazon_headers(): + headers = { + "dpr": "1", + "referer": "https://www.amazon.com", + # "sec-ch-ua": "\"Chromium\";v=\"124 \", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"", + "sec-ch-viewport-width": "1912", + "sec-fetch-mode": "navigate", + "sec-fetch-site": "same-origin", + "sec-fetch-user": "?1", + "upgrade-insecure-requests": "1", + # "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0", + "viewport-width": "1912" + } + ua_temple = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.{}.{}" + headers['user-agent'] = ua_temple.format(random.randint(1000, 9999), random.randint(10, 1000)) + return headers + + +def updata_cookie(cookie_dict, meta): + ''' + 根据cookie字典来更新cookie + :param cookie_dict: + :return: + ''' + # print('[UPDATA_COOKIE]', cookie_dict) + cookies = meta.get('cookies', {}) + if cookie_dict.get('x-amz-captcha-1', ''): + cookies['x-amz-captcha-1'] = cookie_dict['x-amz-captcha-1'] + if cookie_dict.get('x-amz-captcha-2', ''): + cookies['x-amz-captcha-2'] = cookie_dict['x-amz-captcha-2'] + meta['cookies'] = cookies + return cookies, meta + + +def get_img(response): + ''' + 下载并识别图片 + :param response: + :return:图片ID + :return:图片识别结果 + ''' + img_id = re.findall(r'name="amzn" value="(.*?)"', response.text) + img = re.findall(r'', response.text) + if img_id and img: + img_url = img[0] + img_id = img_id[0] + r = requests.get(img_url) + img_path = './image/img.png' + ocr = ddddocr.DdddOcr() + with open(img_path, 'wb') as f: + f.write(r.content) + img_data = ocr.classification(r.content) + img_data = img_data.lower() + return img_id, img_data + + +def run_verify(response, meta): + ''' + 处理验证码 + :param response: + :return: + ''' + verify_url = "https://www.amazon.com/errors/validateCaptcha" + img_id, img_data = get_img(response) + if img_id and img_data: + msg_url = meta.get('msg_url', '') + url_href = msg_url.split('amazon.com')[-1] + proxies = meta.get('proxies', '') + headers = meta.get('headers') + cookies = meta.get('cookies') + params = { + "amzn": img_id, + "amzn-r": url_href, + "field-keywords": img_data + } + if msg_url and proxies: + print('[GET]正在请求验证码页, 验证码识别结果为:',img_data) + response = requests.get(verify_url, headers=headers, params=params, cookies=cookies, proxies=proxies, + allow_redirects=False) + response_cookie = dict(response.cookies) + cookies, meta = updata_cookie(response_cookie, meta) + return meta + + +def get_product_detail(meta): + ''' + 采集产品详情 + :param meta: + :return: + ''' + headers = meta['headers'] + msg_url = meta.get('msg_url', '') + proxies = meta.get('proxies', '') + cookies = meta.get('cookies', {}) + response = requests.get(msg_url, headers=headers, cookies=cookies, proxies=proxies) + + print('[GET]正在第 {} 次请求, 响应长度为:'.format(meta['retry_count'] + 1), len(response.text)) + response_type = get_response_type(response) + if response_type == -2: + # 出现验证码,判断是否超过最大重试次数 + if meta['retry_count'] < meta['max_retry']: + meta = run_verify(response, meta) + retry_count = meta['retry_count'] + meta['retry_count'] = retry_count + 1 + + print('[RETRY]重试当前任务:', ) + response, meta = get_product_detail(meta) + else: + print('[MAX_RETRY]超过最大重试次数') + return None + elif response_type == 1: + # 正常的响应 + pass + else: + print(response.text) + raise '超出预期的响应' + return response, meta + + +def run(): + url_list = [ + # 产品详情 + "https://www.amazon.com/dp/B08DFLR38F", + "https://www.amazon.com/TAISCAI-USB-Mount%EF%BC%8C18W-Dual-Waterproof/dp/B0CY99S3KN", + "https://www.amazon.com/Wireless-Charging-Mag-Safe-Foldable-Magnetic/dp/B0CSP7KHD1", + "https://www.amazon.com/Charger-Hohosb-Adapter-Charging-More-White/dp/B0CZ3WXFX3" + # 评论区 + "https://www.amazon.com/Spatula-Tableware-Serving-Scratch-Eco-friendly/product-reviews/B08DFLR38F", + ] + proxies = ip_proxies() + # proxies = None + meta = { + 'headers': random_amazon_headers(), + 'proxies': proxies, + 'max_retry': 3 # 最大重试次数 + } + for url in url_list: + meta['msg_url'] = url + meta['retry_count'] = 0 # 重试次数 + print('[START]',url) + response, meta = get_product_detail(meta) + + # print(response.text) + + +if __name__ == '__main__': + run() diff --git "a/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/test.py" "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/test.py" new file mode 100644 index 0000000..80dc9d8 --- /dev/null +++ "b/\344\272\232\351\251\254\351\200\212/\347\252\201\347\240\264\351\252\214\350\257\201\347\240\201\346\255\245\351\252\244\346\213\206\350\247\243/test.py" @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# @Time : 2024/5/18 11:44 +# @Author : Harvey +# @File : yanzhengma.py +import requests +from wbh_word.spider.Get_TJ_ip import ip_proxies + +headers = { + "dpr": "1", + "referer": "https://www.amazon.com", + "sec-ch-ua": "\"Chromium\";v=\"124\", \"Microsoft Edge\";v=\"124\", \"Not-A.Brand\";v=\"99\"", + "sec-ch-viewport-width": "1912", + "sec-fetch-mode": "navigate", + "sec-fetch-site": "same-origin", + "sec-fetch-user": "?1", + "upgrade-insecure-requests": "1", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0", + "viewport-width": "1912" +} +cookies = { + "csm-sid": "713-2299262-7567932", + "x-amz-captcha-1": "1715945070398999", + "x-amz-captcha-2": "/fvr6wiJciyvvZR8JOfw+Q==", + "session-id": "138-0202196-4407222", + "session-id-time": "2082787201l", + # 上面这部分为核心字段,过了验证码之后就可以得到 + + "i18n-prefs": "USD", + "lc-main": "zh_CN", + "sp-cdn": "\"L5Z9:CN\"", + # "ubid-main": "132-5607086-4560162", + # "session-token": "m4jtyQF+jZJqVW/adslOeUE7aWcay+oPVttMzoTlqWO9R9VCk6M0xNooY5RmGRW9eOBxpsP949PLbSn9eXz1ECwAwFVxwxRSWZtYLjcpY/70/WSGpis0IqQpRZSPI5RmUQgi/1lHq4qB+zIqJoudzKwXxCt7ihAa4fhbjcAOJjVsAO3pxMHfOH7aDjRw3wHt4xDaW53dyRENzIaYNvwh+KCkzK0w5SOxz6fxuY6v9zUsuWLt8pZmtQ75YoU1C3+Okt2scs+5b+jt+1dl/OTQ6oHj7QyAqK5h0MFeVM9jEkXgoubepR1OgB0YWNmMD3wCrb3sB0NtbZThvFJmWxOV3Bri1TQREibq", + # "csm-hit": "tb:7NNEK8EZX7MDKY0R3SZH+s-7NNEK8EZX7MDKY0R3SZH|1715936997814&t:1715936997814&adb:adblk_no" +} +# url = "https://www.amazon.com/Munchkin%C2%AE-Brica%C2%AE-Stroller-Organizer-Bag/dp/B0BPMQQN6M" +url = "https://www.amazon.com/dp/B08DFLR38F" + +proxies = ip_proxies() +response = requests.get(url, headers=headers, cookies=cookies,proxies=proxies) +print(response.text) +print(len(response.text)) +print(response.cookies) +print(response.headers) +print(proxies) \ No newline at end of file