# basic_crawler.py (forked from leVirve/CrawlerTutorial)
import re
import time
import urllib.parse  # plain `import urllib` does not reliably expose urllib.parse
from multiprocessing import Pool

import requests
from requests_html import HTML

from utils import pretty_print  # noqa


def fetch(url):
    '''Step-1: send a request and fetch the web page.'''
    response = requests.get(url)
    return response
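

# A more defensive variant of Step-1 (a sketch, not part of the original
# tutorial): raise on HTTP errors and send an explicit User-Agent header.
# The helper name and header value are illustrative assumptions.
def fetch_checked(url):
    response = requests.get(url, headers={'User-Agent': 'basic-crawler-demo'})
    response.raise_for_status()  # surface 4xx/5xx instead of parsing an error page
    return response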


def parse_article_entries(doc):
    '''Step-2: parse the post entries on the source string.'''
    html = HTML(html=doc)
    post_entries = html.find('div.r-ent')
    return post_entries
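

# Illustrative note: HTML.find takes a CSS selector, so every index row that
# PTT renders as <div class="r-ent">...</div> comes back as one Element, e.g.
# parse_article_entries('<div class="r-ent"></div>')  # -> [<Element 'div'>]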


def parse_article_meta(ent):
    '''Step-3: parse the metadata in an article entry.'''
    meta = {
        'title': ent.find('div.title', first=True).text,
        'push': ent.find('div.nrec', first=True).text,
        'date': ent.find('div.date', first=True).text,
    }
    try:
        meta['author'] = ent.find('div.author', first=True).text
        meta['link'] = ent.find('div.title > a', first=True).attrs['href']
    except AttributeError:
        # Deleted posts have no <a> inside div.title, so `find` returns None
        # and the lookups above raise AttributeError. Their placeholder titles
        # still carry the author name, which we recover below.
        if '(本文已被刪除)' in meta['title']:  # "(This post has been deleted)"
            match_author = re.search(r'\[(\w*)\]', meta['title'])
            if match_author:
                meta['author'] = match_author.group(1)
        elif re.search(r'已被\w*刪除', meta['title']):  # "deleted by <someone>"
            match_author = re.search(r'<(\w*)>', meta['title'])
            if match_author:
                meta['author'] = match_author.group(1)
    return meta
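

# For a live post the returned dict looks like the (made-up) example below;
# deleted posts lack the 'link' key and may lack 'author' too:
# {'title': '[好雷] some movie', 'push': '12', 'date': '5/01',
#  'author': 'someuser', 'link': '/bbs/movie/M.1234567890.A.ABC.html'}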


def get_metadata_from(url):
    '''Step-4: fetch one index page; parse its entries and the link to the
    previous (older) index page.
    '''
    def parse_next_link(doc):
        '''Step-4a: parse the link to the previous index page.'''
        html = HTML(html=doc)
        controls = html.find('.action-bar a.btn.wide')
        link = controls[1].attrs.get('href')  # the '‹ 上頁' (previous page) button
        return urllib.parse.urljoin(domain, link)

    resp = fetch(url)
    post_entries = parse_article_entries(resp.text)
    next_link = parse_next_link(resp.text)
    metadata = [parse_article_meta(entry) for entry in post_entries]
    return metadata, next_link
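

# Usage sketch: one call yields this page's metadata plus the absolute URL of
# the previous index page, ready to feed straight back into get_metadata_from:
# metadata, next_link = get_metadata_from(start_url)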


def get_paged_meta(url, num_pages):
    '''Step-4-ext: collect num_pages pages of metadata starting from url.'''
    collected_meta = []
    for _ in range(num_pages):
        posts, link = get_metadata_from(url)
        collected_meta += posts
        # `link` is already absolute, so this join is a no-op kept for safety.
        url = urllib.parse.urljoin(domain, link)
    return collected_meta
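

# A polite variant (a sketch, not in the original tutorial): pause between
# page fetches so the crawl does not hammer ptt.cc. The function name and the
# 0.5-second default delay are arbitrary assumptions.
def get_paged_meta_politely(url, num_pages, delay=0.5):
    collected_meta = []
    for _ in range(num_pages):
        posts, link = get_metadata_from(url)
        collected_meta += posts
        url = link  # already an absolute URL
        time.sleep(delay)  # be gentle to the server between requests
    return collected_meta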


def partA():
    resp = fetch(start_url)
    post_entries = parse_article_entries(resp.text)
    for entry in post_entries:
        meta = parse_article_meta(entry)
        # meta.get guards against deleted posts whose author was unrecoverable.
        pretty_print(meta['push'], meta['title'], meta['date'], meta.get('author', ''))


def partB():
    metadata = get_paged_meta(start_url, num_pages=5)
    for meta in metadata:
        pretty_print(meta['push'], meta['title'], meta['date'], meta.get('author', ''))


def partC():
    def get_posts(metadata):
        post_links = [
            urllib.parse.urljoin(domain, meta['link'])
            for meta in metadata if 'link' in meta]
        with Pool(processes=8) as pool:
            contents = pool.map(fetch, post_links)
        return contents

    start = time.time()
    metadata = get_paged_meta(start_url, num_pages=2)
    resps = get_posts(metadata)
    print('Elapsed: %f seconds' % (time.time() - start))
    print('%d results in total:' % len(resps))
    # get_posts skips entries without a 'link', so filter metadata the same
    # way before zipping; otherwise posts and responses fall out of step.
    fetched_meta = [meta for meta in metadata if 'link' in meta]
    for post, resp in zip(fetched_meta, resps):
        print('{0} {1: <15} {2}, page content: {3} chars'.format(
            post['date'], post.get('author', ''), post['title'], len(resp.text)))
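

# Fetching posts is I/O-bound, so a thread-backed pool would also work and
# avoids process-spawn overhead; a sketch using the stdlib's thread clone of
# Pool (multiprocessing.dummy), as a drop-in for the block above:
# from multiprocessing.dummy import Pool as ThreadPool
# with ThreadPool(processes=8) as pool:
#     contents = pool.map(fetch, post_links)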


domain = 'https://www.ptt.cc/'
start_url = 'https://www.ptt.cc/bbs/movie/index.html'

if __name__ == '__main__':
    partA()
    partB()
    partC()