feature(crawler): Allow connecting to browser's websocket address and launching the browser on demand. This enables support for browserless.
MohamedBassem committed May 15, 2024
1 parent f64a5f3 commit 39025a8
Showing 3 changed files with 70 additions and 36 deletions.
apps/workers/crawlerWorker.ts (55 additions, 28 deletions)
@@ -48,39 +48,53 @@ const metascraperParser = metascraper([
metascraperUrl(),
]);

-let browser: Browser | undefined;
+let globalBrowser: Browser | undefined;
// Guards the interactions with the browser instance.
// This is needed given that most of the browser APIs are async.
const browserMutex = new Mutex();

+async function startBrowserInstance() {
+  const defaultViewport = {
+    width: 1440,
+    height: 900,
+  };
+  if (serverConfig.crawler.browserWebSocketUrl) {
+    logger.info(
+      `[Crawler] Connecting to existing browser websocket address: ${serverConfig.crawler.browserWebSocketUrl}`,
+    );
+    return await puppeteer.connect({
+      browserWSEndpoint: serverConfig.crawler.browserWebSocketUrl,
+      defaultViewport,
+    });
+  } else if (serverConfig.crawler.browserWebUrl) {
+    logger.info(
+      `[Crawler] Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
+    );
+    const webUrl = new URL(serverConfig.crawler.browserWebUrl);
+    // We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
+    const { address: address } = await dns.promises.lookup(webUrl.hostname);
+    webUrl.hostname = address;
+    logger.info(
+      `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
+    );
+    return await puppeteer.connect({
+      browserURL: webUrl.toString(),
+      defaultViewport,
+    });
+  } else {
+    logger.info(`Launching a new browser instance`);
+    return await puppeteer.launch({
+      headless: serverConfig.crawler.headlessBrowser,
+      defaultViewport,
+    });
+  }
+}
+
async function launchBrowser() {
-  browser = undefined;
+  globalBrowser = undefined;
await browserMutex.runExclusive(async () => {
try {
-      if (serverConfig.crawler.browserWebUrl) {
-        logger.info(
-          `[Crawler] Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
-        );
-        const webUrl = new URL(serverConfig.crawler.browserWebUrl);
-        // We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
-        const { address: address } = await dns.promises.lookup(webUrl.hostname);
-        webUrl.hostname = address;
-        logger.info(
-          `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
-        );
-        browser = await puppeteer.connect({
-          browserURL: webUrl.toString(),
-          defaultViewport: {
-            width: 1440,
-            height: 900,
-          },
-        });
-      } else {
-        logger.info(`Launching a new browser instance`);
-        browser = await puppeteer.launch({
-          headless: serverConfig.crawler.headlessBrowser,
-        });
-      }
+      globalBrowser = await startBrowserInstance();
} catch (e) {
logger.error(
"[Crawler] Failed to connect to the browser instance, will retry in 5 secs",
@@ -90,7 +104,7 @@ async function launchBrowser() {
}, 5000);
return;
}
-    browser.on("disconnected", () => {
+    globalBrowser.on("disconnected", () => {
if (isShuttingDown) {
logger.info(
"[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
@@ -113,7 +127,13 @@ export class CrawlerWorker {
blockTrackersAndAnnoyances: true,
}),
);
-    await launchBrowser();
+    if (!serverConfig.crawler.browserConnectOnDemand) {
+      await launchBrowser();
+    } else {
+      logger.info(
+        "[Crawler] Browser connect on demand is enabled, won't proactively start the browser instance",
+      );
+    }

logger.info("Starting crawler worker ...");
const worker = new Worker<ZCrawlLinkRequest, void>(
@@ -197,6 +217,13 @@ function validateUrl(url: string) {
}

async function crawlPage(jobId: string, url: string) {
+  let browser: Browser;
+  if (serverConfig.crawler.browserConnectOnDemand) {
+    browser = await startBrowserInstance();
+  } else {
+    assert(globalBrowser);
+    browser = globalBrowser;
+  }
assert(browser);
const context = await browser.createBrowserContext();

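
For readers wiring this up against a remote browser (for example a browserless container), the sketch below illustrates the on-demand path the worker now takes when `BROWSER_CONNECT_ONDEMAND` is enabled: connect over the websocket endpoint, crawl inside a fresh browser context, then disconnect so the remote browser stays alive for other jobs. This is a simplified illustration rather than code from this commit; the helper name `crawlOnDemand`, the `waitUntil` setting, and the example endpoint are assumptions.

```ts
import puppeteer, { type Browser } from "puppeteer";

// Hypothetical helper (not part of the worker) sketching the on-demand flow:
// connect to a remote browser, crawl in an isolated context, then disconnect.
async function crawlOnDemand(url: string, browserWSEndpoint: string) {
  const browser: Browser = await puppeteer.connect({
    browserWSEndpoint, // e.g. ws://browserless:3000 (example address)
    defaultViewport: { width: 1440, height: 900 },
  });
  const context = await browser.createBrowserContext();
  try {
    const page = await context.newPage();
    await page.goto(url, { waitUntil: "domcontentloaded" });
    return await page.content();
  } finally {
    await context.close();
    // Disconnect rather than close() so the remote browser keeps serving other jobs.
    browser.disconnect();
  }
}
```

Under the default proactive mode the worker instead keeps `globalBrowser` connected and only creates a new context per job.
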
docs/docs/03-configuration.md (11 additions, 8 deletions)
@@ -38,11 +38,14 @@ Either `OPENAI_API_KEY` or `OLLAMA_BASE_URL` need to be set for automatic tagging

## Crawler Configs

-| Name | Required | Default | Description |
-| --- | --- | --- | --- |
-| CRAWLER_NUM_WORKERS | No | 1 | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. |
-| CRAWLER_DOWNLOAD_BANNER_IMAGE | No | true | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
-| CRAWLER_STORE_SCREENSHOT | No | true | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
-| CRAWLER_FULL_PAGE_SCREENSHOT | No | false | Whether to store a screenshot of the full page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, the screenshot will only include the visible part of the page |
-| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low powered device, you might want to bump this up a bit |
-| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection |
+| Name | Required | Default | Description |
+| --- | --- | --- | --- |
+| CRAWLER_NUM_WORKERS | No | 1 | Number of allowed concurrent crawling jobs. By default, we're only doing one crawling request at a time to avoid consuming a lot of resources. |
+| BROWSER_WEB_URL | No | Not set | The browser's HTTP debugging address. The worker will talk to this endpoint to resolve the debugging console's websocket address. If you already have the websocket address, use `BROWSER_WEBSOCKET_URL` instead. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` is set, the worker will launch its own browser instance (assuming it has access to the chrome binary). |
+| BROWSER_WEBSOCKET_URL | No | Not set | The websocket address of the browser's debugging console. If you want to use [browserless](https://browserless.io), use their websocket address here. If neither `BROWSER_WEB_URL` nor `BROWSER_WEBSOCKET_URL` is set, the worker will launch its own browser instance (assuming it has access to the chrome binary). |
+| BROWSER_CONNECT_ONDEMAND | No | false | If set to false, the crawler will proactively connect to the browser instance and always maintain an active connection. If set to true, the browser will be launched on demand only when a crawl is requested. Set to true if you're using a service that provides you with browser instances on demand. |
+| CRAWLER_DOWNLOAD_BANNER_IMAGE | No | true | Whether to cache the banner image used in the cards locally or fetch it each time directly from the website. Caching it consumes more storage space, but is more resilient against link rot and rate limits from websites. |
+| CRAWLER_STORE_SCREENSHOT | No | true | Whether to store a screenshot from the crawled website or not. Screenshots act as a fallback for when we fail to extract an image from a website. You can also view the stored screenshots for any link. |
+| CRAWLER_FULL_PAGE_SCREENSHOT | No | false | Whether to store a screenshot of the full page or not. Disabled by default, as it can lead to much higher disk usage. If disabled, the screenshot will only include the visible part of the page. |
+| CRAWLER_JOB_TIMEOUT_SEC | No | 60 | How long to wait for the crawler job to finish before timing out. If you have a slow internet connection or a low-powered device, you might want to bump this up a bit. |
+| CRAWLER_NAVIGATE_TIMEOUT_SEC | No | 30 | How long to spend navigating to the page (along with its redirects). Increase this if you have a slow internet connection. |
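
For illustration only, here is a small sketch of how the three browser settings appear to interact, mirroring the order used in `startBrowserInstance` above: the websocket address wins, then the HTTP debugging URL, otherwise a local browser is launched. The `pickBrowserSource` helper and `BrowserSource` type are invented for this example and are not part of the codebase.

```ts
// Hypothetical helper showing the apparent precedence of the browser settings.
type BrowserSource =
  | { kind: "websocket"; wsEndpoint: string }
  | { kind: "http-debug-url"; webUrl: string }
  | { kind: "local-launch" };

function pickBrowserSource(env: NodeJS.ProcessEnv): BrowserSource {
  if (env.BROWSER_WEBSOCKET_URL) {
    return { kind: "websocket", wsEndpoint: env.BROWSER_WEBSOCKET_URL };
  }
  if (env.BROWSER_WEB_URL) {
    return { kind: "http-debug-url", webUrl: env.BROWSER_WEB_URL };
  }
  return { kind: "local-launch" };
}
```

In other words, if `BROWSER_WEBSOCKET_URL` is set, `BROWSER_WEB_URL` is ignored.
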
packages/shared/config.ts (4 additions, 0 deletions)
@@ -21,6 +21,8 @@ const allEnv = z.object({
REDIS_PASSWORD: z.string().optional(),
CRAWLER_HEADLESS_BROWSER: stringBool("true"),
BROWSER_WEB_URL: z.string().url().optional(),
+  BROWSER_WEBSOCKET_URL: z.string().url().optional(),
+  BROWSER_CONNECT_ONDEMAND: stringBool("false"),
CRAWLER_JOB_TIMEOUT_SEC: z.coerce.number().default(60),
CRAWLER_NAVIGATE_TIMEOUT_SEC: z.coerce.number().default(30),
CRAWLER_NUM_WORKERS: z.coerce.number().default(1),
@@ -65,6 +67,8 @@ const serverConfigSchema = allEnv.transform((val) => {
numWorkers: val.CRAWLER_NUM_WORKERS,
headlessBrowser: val.CRAWLER_HEADLESS_BROWSER,
browserWebUrl: val.BROWSER_WEB_URL,
+      browserWebSocketUrl: val.BROWSER_WEBSOCKET_URL,
+      browserConnectOnDemand: val.BROWSER_CONNECT_ONDEMAND,
jobTimeoutSec: val.CRAWLER_JOB_TIMEOUT_SEC,
navigateTimeoutSec: val.CRAWLER_NAVIGATE_TIMEOUT_SEC,
downloadBannerImage: val.CRAWLER_DOWNLOAD_BANNER_IMAGE,
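
As a rough sketch of how the two new schema fields behave when parsed: `BROWSER_WEBSOCKET_URL` must be a well-formed URL when provided, and `BROWSER_CONNECT_ONDEMAND` falls back to `false` when unset. The `stringBool` helper below is a stand-in written for this example (the project's real helper is defined elsewhere and is not shown in this diff), so treat its behavior as an assumption.

```ts
import { z } from "zod";

// Stand-in for the project's stringBool helper: accepts "true"/"false" strings
// from the environment and yields a boolean, using the supplied default.
const stringBool = (defaultValue: "true" | "false") =>
  z
    .enum(["true", "false"])
    .default(defaultValue)
    .transform((v) => v === "true");

const browserEnv = z.object({
  BROWSER_WEB_URL: z.string().url().optional(),
  BROWSER_WEBSOCKET_URL: z.string().url().optional(),
  BROWSER_CONNECT_ONDEMAND: stringBool("false"),
});

// Only the websocket address is provided; the on-demand flag keeps its default.
const parsed = browserEnv.parse({
  BROWSER_WEBSOCKET_URL: "ws://browserless:3000", // example address
});
console.log(parsed.BROWSER_CONNECT_ONDEMAND); // false (default applied)
```

This matches the documented defaults in the configuration table above.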
