forked from raindrop-hb/douyin_spider
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
484 lines (461 loc) · 21.1 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
#!/usr/bin/python3.10
# -*- coding: utf-8 -*-
# Copyright (C) 2023 , Inc. All Rights Reserved
# @Time : 2023/5/17 22:45
# @Author : raindrop
# @Email : 1580925557@qq.com
# @File : main.py
# @ps : 我的狗屎代码我也看不懂
import json
import os
from requests import get, head
from json import loads, dump
from re import findall, sub
from os import mkdir, path
from time import sleep, localtime, strftime, time
import csv
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import unquote, quote
from webbrowser import open as dss
import lxml.html
import uuid
etree=lxml.html.etree
class Task(object):
    """Scrape posts (videos and image galleries) from a single Douyin
    user's profile and save media plus per-post metadata to disk.

    NOTE(review): nearly every method performs network I/O.  Two
    third-party helper services are contacted: xb.tom14.top (computes
    the X-Bogus anti-crawl signature) and dy.hanbao16.top (usage
    logging / telemetry).
    """

    def __init__(self, sec_user_id, count, tc, cookie):
        # sec_uid taken from the user's profile URL.
        self.sec_user_id = sec_user_id
        # Pagination cursor in epoch milliseconds; the post API pages
        # backwards in time starting from "now".
        self.max_cursor = int(round(time() * 1000))
        # Number of posts requested; may be the literal string '∞' for "all".
        self.count = count
        self.picture = 0  # image-gallery posts downloaded so far
        self.video = 0  # video posts downloaded so far
        self.numb = 0  # total posts processed so far
        self.nickname = "Null"  # author display name, resolved in run()/task()
        self.tc = tc  # worker-thread count for the download pool
        self.time_start = float(round(time()))  # start time for the final report
        self.config = configs()  # (re)loads config.json from the working dir
        self.cookie = cookie

    def run(self):
        """Resolve the author's nickname, then loop task() until it
        signals completion.  Returns True when finished."""
        # Mobile share endpoint returns public user info; the Baiduspider
        # UA string is used to bypass the anti-bot interstitial.
        nickname_url = "https://m.douyin.com/web/api/v2/user/info/?reflow_source=reflow_page&sec_uid=" + self.sec_user_id
        headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 likeMac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)"
        }
        resp = get(nickname_url, headers=headers)
        try:
            nickname = resp.json()["user_info"]["nickname"]
            # Strip characters that would be awkward in a directory name.
            self.nickname=nickname.replace("|","").replace(r"\\","").replace("\/","").replace(" ","")
        except:
            # Best-effort: task() falls back to the nickname found in the
            # first post of the feed.
            pass
        while True:
            # task() returns True on completion (or fatal cookie error).
            if self.task():
                return True

    def task(self):
        """Fetch one page of the user's post feed and download every entry.

        Returns True when scraping is finished (requested count reached, or
        the API reports no more pages); returns None otherwise so run()
        keeps looping with the advanced cursor.
        """
        # Query string for the web post-list API; most parameters are fixed
        # browser-fingerprint values the endpoint expects.
        form='device_platform=webapp&aid=6383&channel=channel_pc_web&sec_user_id=' + self.sec_user_id + '&max_cursor=' + str(self.max_cursor) + '&locate_query=false&show_live_replay_strategy=1&count=50&publish_video_strategy_type=2&pc_client_type=1&version_code=170400&version_name=17.4.0&cookie_enabled=true&screen_width=1536&screen_height=864&browser_language=zh-CN&browser_platform=Win32&browser_name=Chrome&browser_version=108.0.5359.95&browser_online=true&engine_name=Blink&engine_version=108.0.5359.95&os_name=Windows&os_version=10&cpu_core_num=8&device_memory=8&platform=PC&downlink=10&effective_type=4g&round_trip_time=250'
        # Third-party service computes the X-Bogus anti-crawl signature.
        XB=get("http://xb.tom14.top/?form="+quote(form)).json()
        XBogus=XB['data']["X_Bogus"]
        url = 'https://www.douyin.com/aweme/v1/web/aweme/post/?'+form+"&X-Bogus="+XBogus
        headers = {
            'referer': 'https://www.douyin.com/user/' + self.sec_user_id,
            'cookie': self.cookie,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
        }
        resp = get(url, headers=headers)
        resps = resp.content
        try:
            resp = loads(resps)
        except:
            # A non-JSON body means the cookie was rejected: show the user
            # how to refresh it, then abort the whole program.
            print(resps)
            printt('触发')
            printt(
                'cookies失效,请自行获取cookies填入脚本目录下config.json中\n获取cookies方法:\n1.电脑浏览器打开抖音并登录,随便找一个人的主页打开\n2.按f12键进入开发者模式,点击网络\n3.刷新页面,网络的名称里选择第一个\n4.标头,下滑找到cookie,右键复制值,粘贴到config.json的双引号里里')
            input('回车退出')
            if path.exists("cookie"):
                os.remove("cookie")
            exit(0)
            return True  # NOTE(review): unreachable — exit(0) raises SystemExit first
        if self.numb == 0:
            # First page only: resolve the nickname fallback, create the
            # output directory tree, write the CSV header, log usage once.
            if self.nickname=="Null":
                self.nickname = resp["aweme_list"][0]["author"]["nickname"]
            printt('即将 {} 线程采集 {} 个 {} 的作品'.format(str(self.tc), str(self.count), self.nickname))
            try:
                mkdir(self.nickname + "/")
            except:
                pass
            if self.config["spider_setting"]["储存格式"]:
                # "储存格式" truthy -> media sorted into per-type subfolders.
                iiis=["/video/","/picture/","/cover/","/big_thumbs/","/bgmusic/"]
                for iiii in iiis:
                    try:
                        mkdir(self.nickname + iiii)
                    except:
                        pass
            if self.config["csv_setting"]["开关"]:
                # Write the CSV header row, keeping only columns enabled in
                # config.  gbk + errors='ignore' so Excel on zh-CN Windows
                # opens it; emoji etc. are silently dropped.
                with open(self.nickname + "/" + self.nickname + "_采集数据.csv", 'w', newline='', encoding='gbk',
                          errors='ignore') as csvfile:
                    writer = csv.writer(csvfile)
                    list_csv = []
                    list_title = ["作品id", "时间", "标题", "格式", "收藏", "评论", "点赞", "分享", "分享链接",
                                  "无水印链接"]
                    # NOTE(review): column order follows config-key order,
                    # while download() appends values in list_titles order —
                    # a reordered config.json would misalign header and rows.
                    for i in self.config["csv_setting"]:
                        if i in list_title and self.config["csv_setting"][i]:
                            list_csv.append(i)
                    writer.writerow(list_csv)
            try:
                # Best-effort usage telemetry to the author's server.
                get("http://dy.hanbao16.top/log.php?log=采集" + quote(self.nickname)+ "的作品&machine=" + get_machine_code(), timeout=5, verify=False)
            except:
                pass
        printt('共{}个作品,已保存{}个,当前解析到{}'.format(str(self.count), str(self.numb), len(resp["aweme_list"])))
        if self.count == '∞':
            aweme_list = resp["aweme_list"]
        elif len(resp["aweme_list"]) > (int(self.count) - int(self.numb)):
            # Trim the page so we stop exactly at the requested count.
            aweme_list = resp["aweme_list"][:(int(self.count) - int(self.numb))]
        else:
            aweme_list = resp["aweme_list"]
        pool = ThreadPoolExecutor(self.tc)
        for aweme in aweme_list:
            aaa = pool.submit(self.download, aweme)
            # NOTE(review): calling .result() immediately after submit()
            # blocks on each download, so the pool runs effectively
            # single-threaded regardless of self.tc.
            printt(aaa.result())
            self.numb = self.numb + 1
        pool.shutdown()
        if str(self.numb) == str(self.count):
            printt("已采集指定数目作品,共{}个作品,{}个视频,{}个图片,请在脚本目录下查看".format(self.numb, self.video,
                                                                                              self.picture))
            self.time_cha()
            return True
        if resp["has_more"] == 0:
            printt("数据采集结束,共{}个作品,{}个视频,{}个图片,请在脚本目录下查看".format(self.numb, self.video,
                                                                                        self.picture))
            self.time_cha()
            return True
        # Advance the cursor to the next (older) page.
        self.max_cursor = resp["max_cursor"]

    def time_cha(self):
        """Print the total elapsed wall-clock time as h/m/s."""
        printt('运行结束')
        time_end = float(round(time()))
        time_diff = int(time_end - self.time_start)
        if time_diff >= 3600:
            hh = time_diff // 3600
            time_diff = time_diff % 3600
        else:
            hh = 0
        if time_diff >= 60:
            mm = time_diff // 60
            time_diff = time_diff % 60
        else:
            mm = 0
        # NOTE(review): if the remaining seconds are exactly 0, `ss` is
        # never assigned and the format call below raises NameError.
        if time_diff > 0:
            ss = time_diff
        printt('本次执行共耗时{}时{}分{}秒'.format(str(hh), str(mm), str(ss)))

    def download(self, aweme):
        """Download one post's media and collect its metadata.

        Returns a human-readable multi-line summary string (printed by
        task()) and appends a row to the CSV file when enabled.
        NOTE(review): mutates aweme["statistics"] in place (pop/del/rename).
        """
        desc = aweme["statistics"]
        # Rename stat counters to the Chinese column names used in the CSV.
        desc['收藏'] = desc.pop('collect_count')
        desc['评论'] = desc.pop('comment_count')
        desc['点赞'] = desc.pop('digg_count')
        desc['分享'] = desc.pop('share_count')
        desc['分享链接'] = aweme["share_info"]['share_url']
        # Trailing tab keeps spreadsheet apps from mangling the long numeric id.
        desc['作品id'] = str(aweme["aweme_id"]) + "\t"
        if aweme['images'] == None:
            desc['格式'] = "video"
        else:
            desc['格式'] = "picture"
        del desc['play_count']
        del desc['admire_count']
        time_1 = int(aweme["create_time"])
        # Convert to localtime.
        time_2 = localtime(time_1)
        # Reformat as a timestamp string.
        desc['时间'] = strftime("%Y-%m-%d %H:%M:%S", time_2)
        desc['标题'] = aweme['desc']
        # Build the printable summary (iiia) and the CSV row (dess).
        iiia = ""
        dess = list()
        list_titles = ["作品id", "时间", "标题", "格式", "收藏", "评论", "点赞", "分享", "分享链接"]
        for iii in list_titles:
            if iii != "分享链接":
                iiia = iiia + iii + ":" + str(desc[iii]) + "\n"
            if self.config["csv_setting"][iii]:
                dess.append(str(desc[iii]))
        if aweme['images'] == None:
            # ---- video post ----
            if self.config["spider_setting"]["储存格式"]:
                # Sorted layout: <nickname>/video/, /bgmusic/, /big_thumbs/, /cover/.
                url = aweme["video"]["play_addr"]["url_list"][0]
                if self.config["spider_setting"]["下载视频"]:
                    video = get(url)
                    with open(self.nickname + "/video/" + aweme["aweme_id"] + '.mp4', 'wb') as f:
                        f.write(video.content)
                try:
                    url = aweme["music"]["play_url"]["url_list"][0]
                    video = get(url)
                    if self.config["spider_setting"]["视频背景音乐"]:
                        with open(self.nickname + "/bgmusic/" + aweme["aweme_id"] + '.mp3', 'wb') as f:
                            f.write(video.content)
                except:
                    pass
                try:
                    url = aweme["video"]["big_thumbs"][0]["img_url"]
                    video = get(url)
                    if self.config["spider_setting"]["视频缩略图"]:
                        with open(self.nickname + "/big_thumbs/" + aweme["aweme_id"] + '.jpeg',
                                  'wb') as f:
                            f.write(video.content)
                except:
                    pass
                try:
                    url = aweme["video"]["cover"]["url_list"][1]
                    video = get(url)
                    if self.config["spider_setting"]["视频封面"]:
                        with open(self.nickname + "/cover/" + aweme["aweme_id"] + '.jpeg', 'wb') as f:
                            f.write(video.content)
                except:
                    pass
            else:
                # Per-post layout: one folder per aweme id.
                try:
                    mkdir(self.nickname + "/" + aweme["aweme_id"] + "/")
                except:
                    pass
                url = aweme["video"]["play_addr"]["url_list"][0]
                if self.config["spider_setting"]["下载视频"]:
                    video = get(url)
                    with open(self.nickname + "/" + aweme["aweme_id"] + '/video.mp4', 'wb') as f:
                        f.write(video.content)
                try:
                    url = aweme["music"]["play_url"]["url_list"][0]
                    video = get(url)
                    if self.config["spider_setting"]["视频背景音乐"]:
                        # NOTE(review): '/bgmusic/.mp3' targets a file named
                        # '.mp3' inside a 'bgmusic' dir that is never created
                        # in this layout — open() fails and the bare except
                        # silently skips the music.
                        with open(self.nickname + "/" + aweme["aweme_id"] + '/bgmusic/.mp3', 'wb') as f:
                            f.write(video.content)
                except:
                    pass
                try:
                    url = aweme["video"]["big_thumbs"][0]["img_url"]
                    video = get(url)
                    if self.config["spider_setting"]["视频缩略图"]:
                        with open(self.nickname + "/" + aweme["aweme_id"] + '/big_thumbs.jpeg',
                                  'wb') as f:
                            f.write(video.content)
                except:
                    pass
                try:
                    url = aweme["video"]["cover"]["url_list"][1]
                    video = get(url)
                    if self.config["spider_setting"]["视频封面"]:
                        with open(self.nickname + "/" + aweme["aweme_id"] + '/cover.jpeg', 'wb') as f:
                            f.write(video.content)
                except:
                    pass
            if self.config["csv_setting"]["无水印链接"]:
                dess.append(aweme["video"]["play_addr"]["url_list"][-1])
            self.video += 1
        else:
            # ---- image-gallery post ----
            # NOTE(review): always writes into <nickname>/picture/, which is
            # only created when "储存格式" is enabled; with it disabled the
            # open() below raises and the failure surfaces via task().
            s = 0
            for i in aweme["images"]:
                s += 1
                url = i["url_list"][-1]
                if self.config["spider_setting"]["下载图片"]:
                    video = get(url)
                    with open(self.nickname + "/picture/" + aweme["aweme_id"] + '_' + str(s) + '.jpeg',
                              'wb') as f:
                        f.write(video.content)
                if self.config["csv_setting"]["无水印链接"]:
                    dess.append(url)
            self.picture += 1
        if ((self.config["csv_setting"]["图文数据"] and desc['格式'] == "picture") or (
                self.config["csv_setting"]["视频数据"] and desc['格式'] == "video")) and self.config["csv_setting"][
                "开关"]:
            # Append the metadata row for this post.
            with open(self.nickname + "/" + self.nickname + "_采集数据.csv", 'a', newline='', encoding='gbk',
                      errors='ignore') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(dess)
        return iiia
def huoqu(url, type,cookie):
    """Download a single shared post (video or image gallery) into a
    folder named after today's date.

    url    -- resolved www.douyin.com share URL of the post
    type   -- "video" or "note" (anything other than "video" is treated
              as an image gallery); NOTE(review): shadows the builtin
    cookie -- raw Cookie header value from config.json
    Returns True after saving the media.
    """
    time_1 = int(time())
    # Convert to localtime.
    time_2 = localtime(time_1)
    # Today's date doubles as the output folder name.
    file = strftime("%Y-%m-%d", time_2)
    try:
        mkdir(file + "/")
    except:
        pass
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.183',
        'accept': 'application/json, text/plain, */*',
        'referer': "https://www.iesdouyin.com/share/video/",
        'cookie': cookie
    }
    response = get(url, headers=headers)
    # gbk round-trip drops characters gbk cannot represent (emoji etc.),
    # presumably to avoid console/CSV encoding errors on zh-CN Windows —
    # TODO confirm.
    response = response.text.encode('gbk', errors='ignore').decode("gbk", errors="ignore")
    html = etree.HTML(response)
    # The page embeds its state as URL-encoded JSON in <script id="RENDER_DATA">.
    text = html.xpath('//script[@id="RENDER_DATA"]/text()')[0]
    json_1 = unquote(text, encoding='utf-8', errors='replace').encode('gbk', errors='ignore').decode("gbk", errors="ignore")
    json_1=json.loads(json_1)
    # The last top-level key holds the post detail payload.
    json_2=json_1.get(list(json_1.keys())[-1])
    try:
        # Best-effort usage telemetry to the author's server.
        get("http://dy.hanbao16.top/log.php?log=采集作品"+quote(json_2["aweme"]["detail"]["desc"])+"&machine="+str(get_machine_code()), timeout=5, verify=False)
    except:
        pass
    if type=="video":
        # playApi is protocol-relative; prepend the scheme.
        aweme_url="https:"+json_2["aweme"]["detail"]["video"]["playApi"]
        video_content=get(aweme_url).content
        with open(file+"/"+json_2["awemeId"]+".mp4","wb")as f:
            f.write(video_content)
        printt("作者:{}\n文案:{}\n作品无水印链接如下:".format(json_2["aweme"]["detail"]["authorInfo"]["nickname"],json_2["aweme"]["detail"]["desc"]))
        printt(aweme_url)
    else:
        # Image gallery: save every image, numbered in order.
        printt("作者:{}\n文案:{}\n作品无水印链接如下:".format(json_2["aweme"]["detail"]["authorInfo"]["nickname"], json_2["aweme"]["detail"]["desc"]))
        s=0
        for i in json_2["aweme"]["detail"]["images"]:
            s+=1
            aweme_url=i["urlList"][-1]
            printt(aweme_url)
            picture_content = get(aweme_url).content
            with open(file + "/" + json_2["awemeId"]+"_"+str(s)+".jpeg", "wb") as f:
                f.write(picture_content)
    printt("作品已保存在{}文件夹".format(file))
    return True
def printt(msg):
    """Print *msg* line by line, prefixing each line with the current
    local time as ``[HH:MM:SS]``.

    msg -- a string; embedded newlines produce one stamped line each.
    Returns None.
    """
    # The original nested a throwaway now() helper that shadowed the
    # module-level now() and computed an unused date string; the stamp is
    # produced inline instead.  It is evaluated per line, matching the
    # original behavior exactly.
    for line in msg.split("\n"):
        stamp = strftime("%H:%M:%S", localtime())
        print("[" + stamp + "] " + str(line))
def now():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    current_epoch = int(time())
    local_struct = localtime(current_epoch)
    return strftime("%Y-%m-%d %H:%M:%S", local_struct)
def configs():
    """Load (creating or resetting if necessary) config.json from the
    working directory and return it as a dict.

    The cookie value is extracted by raw string slicing rather than JSON
    parsing, presumably because a pasted cookie may contain quotes or
    backslashes that would break json.loads — TODO confirm.  The slicing
    assumes "cookie" is the last key and the file keeps the indent=4
    layout written below.
    """
    while True:
        if not path.exists("config.json"):
            # First run: write a default config with everything enabled.
            configg = {
                "spider_setting": {
                    "线程数": 1,
                    "下载图片": 1,
                    "下载视频": 1,
                    "视频背景音乐": 1,
                    "视频缩略图": 1,
                    "视频封面": 1,
                    "储存格式": 1,
                },
                "csv_setting": {
                    "视频数据": 1,
                    "图文数据": 1,
                    "作品id": 1,
                    "时间": 1,
                    "标题": 1,
                    "格式": 1,
                    "收藏": 1,
                    "评论": 1,
                    "点赞": 1,
                    "分享": 1,
                    "分享链接": 1,
                    "无水印链接": 0,
                    "开关": 1
                },
                "cookie": r""""""
            }
            # NOTE(review): no explicit encoding — relies on the platform
            # default; ensure_ascii=False keeps the Chinese keys readable.
            with open('config.json', 'w+') as f:
                dump(configg, f, indent=4, ensure_ascii=False)
        with open("config.json", "r") as f:
            configg = f.read()
        # Slice out the raw cookie text between '"cookie": "' (11 chars)
        # and the file's last 3 characters (closing quote, newline, brace).
        a = configg.find("\"cookie\": \"")
        b = len(configg)  # NOTE(review): computed but unused
        cookie = configg[a + 11:int(len(configg)) - 3]
        # Parse the file with the cookie text removed, then re-insert it.
        configg = loads(configg[:a + 11] + configg[int(len(configg)) - 3:])
        configg["cookie"] = cookie
        if len(configg["csv_setting"]) < 13:
            # Old/partial config (fewer than the expected 13 csv keys):
            # delete it and regenerate on the next loop iteration.
            os.remove("config.json")
            printt("重置config.json文件中")
        else:
            break
    return configg
def get_machine_code():
    """Return this machine's MAC address (uuid.getnode) as 12 lowercase,
    zero-padded hexadecimal digits."""
    node = uuid.getnode()
    return format(node, "012x")
def main():
    """Interactive entry point: read profile/post links, resolve short
    links, then either download a single post (huoqu) or scrape a whole
    profile (Task), finishing with a donation prompt."""
    printt(now())
    ex = 1  # NOTE(review): always truthy, so the else branch below is dead
    config = configs()
    printt("当前版本号2.01")
    cookie = config["cookie"]
    ds_url = "http://dy.hanbao16.top/ds.html"
    if ex:
        a = input('输入主页链接或作品链接(多个链接用|隔开,回车直接读取url.json文件):')
        b = input('请输入要采集的作品数,为1即解析最近更新的,其他数即从现在往上爬取,直接回车即爬取全部作品\n请输入:')
        if b == '':
            b = '∞'  # sentinel meaning "all posts"
        if a != '':
            # Persist the input so a bare Enter next time reuses it.
            with open('url.json', 'w') as f:
                f.write(a)
        with open('url.json', 'r') as f:
            url = f.read()
        url = url.split('|')
        d = config["spider_setting"]["线程数"]
        for aa in url:
            # Strip the boilerplate share-text around copied Douyin links.
            aa = aa.replace('复制此链接,打开Dou音搜索,直接观看视频!', '')
            if "https://www.douyin.com/user/" in aa or "https://www.douyin.com/video/" in aa or "https://www.douyin.com/note" in aa:
                a = aa
            else:
                # Short link (v.douyin.com): extract the https URL from the
                # pasted text and follow two redirects to the real page.
                # NOTE(review): findall(...)[0] raises IndexError if the
                # text contains no https URL.
                a = 'https' + findall('https(.*)', aa)[0]
                a = head(a)
                headers = {
                    "cookie": cookie,
                    "user-agent": "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)"
                }
                a = str(a.headers.get('location'))
                a = head(a, headers=headers).headers['Location']
            if "https://www.douyin.com/video/" in a or "https://www.douyin.com/note" in a:
                printt("检测到{}为作品链接,即将下载作品".format(aa))
                a = a.replace('?previous_page=web_code_link', '').replace('?previous_page=app_code_link', '')
                if 'note' in a:
                    type_1 = 'note'
                elif 'video' in a:
                    type_1 = 'video'
                #item_ids=a.replace("https://www.douyin.com/video/","").replace("https://www.douyin.com/note/","")
                huoqu(a, type_1,cookie)
            elif "https://www.douyin.com/user/" in a:
                # Profile link: strip query noise to get the bare sec_uid,
                # then scrape b posts with d threads.
                a = a.replace('https://www.douyin.com/user/', '').replace('?previous_page=web_code_link', '').replace(
                    '?previous_page=app_code_link', '')
                printt("检测到{}为主页链接,即将爬取指定数目的作品".format(a))
                c = Task(a, b, int(d), cookie)
                c.run()
        printt("\n")
        aaaa = input('输入 1 回车捐赠作者(老板们给点钱吧),回车直接退出\n\n请输入:')
        # Clean up the legacy "cookie" cache file if present.
        if path.exists("cookie"):
            os.remove("cookie")
        if aaaa != "":
            printt("感谢哥哥的捐赠")
            # Try the Android intent first (Termux), then a desktop browser.
            try:
                os.system('am start -a android.intent.action.VIEW -d '+ds_url)
            except:
                pass
            try:
                dss(ds_url)
            except:
                pass
            sleep(4)
        exit(0)
    else:
        # NOTE(review): dead code — ex is hard-coded to 1 above.
        if path.exists("cookie"):
            os.remove("cookie")
        printt("\n\n")
        aaaa = input('输入 1 回车捐赠作者(老板们给点钱吧),回车直接退出\n\n请输入:')
        if aaaa != "":
            printt("感谢哥哥的捐赠")
            try:
                os.system('am start -a android.intent.action.VIEW -d '+ds_url)
            except:
                pass
            try:
                dss(ds_url)
            except:
                pass
            sleep(4)
        else:
            printt("哥哥你真残忍")
        exit(0)
# Script entry point: run interactively only when executed directly.
if __name__ == '__main__':
    main()