-
Notifications
You must be signed in to change notification settings - Fork 0
/
get-titles.ts
124 lines (98 loc) · 3.67 KB
/
get-titles.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import puppeteer from 'puppeteer';
// One scraped item: the element's visible text and the href it links to.
interface ScrapedData {
text: string
link: string
}
/** Resolve after `ms` milliseconds — a promisified setTimeout for pacing page loads. */
const sleep = (ms: number): Promise<void> => {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
};
/**
 * Collect a text/link pair for every element on the page matching `selector`.
 *
 * Fixes:
 * - The return type was `Promise<[ScrapedData]>` — a one-element tuple. A page
 *   can match any number of elements, so the correct type is `Promise<ScrapedData[]>`.
 * - `innerText`/`href` do not exist on `Element`; the matched nodes are treated
 *   as anchors, which the cast now makes explicit.
 */
const extractTitles = async (page: puppeteer.Page, selector: string): Promise<ScrapedData[]> => {
  return await page.evaluate((selector) => {
    // Runs inside the browser context; assumes the selector matches anchor-like elements.
    return [...document.querySelectorAll(selector)].map(node => ({
      text: (node as HTMLAnchorElement).innerText,
      link: (node as HTMLAnchorElement).href
    }));
  }, selector);
}
/**
 * De-duplicate scraped titles by their text, keeping the first occurrence of
 * each and preserving input order.
 *
 * Fix: the original used a plain object with the `in` operator as a seen-set.
 * `in` walks the prototype chain, so titles whose text collides with inherited
 * keys ("toString", "constructor", "hasOwnProperty", …) were reported as
 * already seen and silently dropped. A Set has no prototype collisions.
 */
const generateUniqueTitles = (titles: Array<ScrapedData>): Array<ScrapedData> => {
  const seen = new Set<string>();
  const result: Array<ScrapedData> = [];
  for (const title of titles) {
    if (!seen.has(title.text)) {
      result.push(title);
      seen.add(title.text);
    }
  }
  return result;
}
// Minimal stand-in for puppeteer.Request used as an initial sentinel value:
// it only needs url(), and deliberately lacks postData() so the type guard
// can tell it apart from a real intercepted request.
interface FakeRequest {
url(): string
}
// Type guard: a real puppeteer.Request exposes postData(); the FakeRequest
// sentinel does not, so the presence of that member distinguishes them.
const determineRequest = (candidate: puppeteer.Request | FakeRequest): candidate is puppeteer.Request => {
  return (candidate as puppeteer.Request).postData !== undefined;
};
// Two pagination requests are "equal" when their URLs match — and, when both
// are real intercepted requests, their POST bodies match as well. Used to
// detect that clicking "next" produced no new request (end of pagination).
const isEqualRequest = (first: puppeteer.Request | FakeRequest, second: puppeteer.Request | FakeRequest) => {
  if (!determineRequest(first) || !determineRequest(second)) {
    return first.url() === second.url();
  }
  if (first.postData() !== second.postData()) {
    return false;
  }
  return first.url() === second.url();
};
/**
 * Scrape text/link pairs from `url`, paginating by clicking
 * `nextButtonSelector` until the page URL (or, when `nextPageRequest` is
 * given, the matching intercepted AJAX request) stops changing, then print
 * the de-duplicated, sorted titles to stdout as "text<TAB>link" lines.
 *
 * @param url                Starting page to scrape.
 * @param titleSelector      CSS selector for the anchor elements to collect.
 * @param nextButtonSelector CSS selector for the pagination button.
 * @param nextPageRequest    Substring identifying the pagination AJAX request;
 *                           empty string switches to URL-based termination.
 * @param sleepTime          Optional delay between page turns, in seconds.
 *
 * Fix: `browser.close()` is now awaited and runs in a `finally` block, so a
 * rejection before or during the scrape loop (e.g. `goto`/`waitForSelector`
 * failing) no longer leaks the headful Chromium process.
 */
const scrapeTitles = async (url: string, titleSelector: string, nextButtonSelector: string, nextPageRequest: string, sleepTime?: number): Promise<void> => {
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox']
  });
  try {
    const page = await browser.newPage();
    // page.on('console', consoleObj => console.log(consoleObj.text()));
    await page.goto(url);
    await page.waitForSelector(nextButtonSelector);
    await page.setRequestInterception(true);
    let titles: Array<ScrapedData> = await extractTitles(page, titleSelector);
    let previousUrl: string = '';
    let previousAjaxRequest: puppeteer.Request | FakeRequest = { url: () => '' };
    let ajaxRequest: puppeteer.Request | FakeRequest = { url: () => '' };
    page.on('request', request => {
      // Remember the most recent pagination request so the loop can compare
      // it with the previous one and detect the end of AJAX-driven paging.
      if (nextPageRequest !== '' && request.url().includes(nextPageRequest)) {
        ajaxRequest = request;
      }
      // Interception is enabled, so every request must be continued explicitly.
      request.continue();
    });
    while (true) {
      try {
        const pageUrl = page.url();
        // URL-based termination: classic pagination lands on the same URL
        // once the last page's "next" click goes nowhere.
        if (!nextPageRequest && previousUrl === pageUrl) {
          console.log('Exiting due to the same page again', previousUrl);
          break;
        }
        previousUrl = pageUrl;
        if (nextPageRequest) {
          console.log(`Scraping ${ajaxRequest.url()}`);
        } else {
          console.log(`Scraping ${pageUrl}`);
        }
        await page.waitForSelector(nextButtonSelector, { timeout: 15000 });
        await page.waitForSelector(titleSelector, { timeout: 15000 });
        titles = titles.concat(await extractTitles(page, titleSelector));
        await page.click(nextButtonSelector);
        // AJAX-based termination: clicking "next" issued no new request.
        if (nextPageRequest && isEqualRequest(previousAjaxRequest, ajaxRequest)) {
          console.log('End of page', ajaxRequest.url());
          break;
        }
        previousAjaxRequest = ajaxRequest;
        if (sleepTime) {
          await sleep(sleepTime * 1000);
        }
      } catch (err) {
        // Selector timeouts (missing button / last page) end the scrape gracefully.
        console.log('Error happened. Exiting gracefully: ', err);
        break;
      }
    }
    const uniqueTitles = generateUniqueTitles(titles);
    uniqueTitles
      .sort((a, b) => a.text <= b.text ? -1 : 1)
      .forEach(title => {
        console.log(`${title.text}\t${title.link}`);
      });
  } finally {
    await browser.close();
  }
}
// CLI entry point:
//   node get-titles.js <url> <titleSelector> <nextButtonSelector> [nextPageRequest] [sleepSeconds]
const url = process.argv[2];
const titleSelector = process.argv[3];
const nextButtonSelector = process.argv[4];
const nextPageRequest = process.argv[5];
const sleepTime: string = process.argv[6];
console.log(url, titleSelector, nextButtonSelector, nextPageRequest, sleepTime)
// Fix: pass an explicit radix so inputs like "08" always parse as decimal;
// a missing/non-numeric argument yields NaN, which `||` maps to undefined.
scrapeTitles(url, titleSelector, nextButtonSelector, nextPageRequest || '', parseInt(sleepTime, 10) || undefined);