From 48c158049b8490cc574eaf5819bef705df747127 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 2 Oct 2024 18:06:26 +0300 Subject: [PATCH] Allow passing a browserContext --- src/scrapers/base-scraper-with-browser.ts | 104 ++++++++++++---------- src/scrapers/interface.ts | 89 +++++++++--------- 2 files changed, 104 insertions(+), 89 deletions(-) diff --git a/src/scrapers/base-scraper-with-browser.ts b/src/scrapers/base-scraper-with-browser.ts index 52ef6aa7..eb345b12 100644 --- a/src/scrapers/base-scraper-with-browser.ts +++ b/src/scrapers/base-scraper-with-browser.ts @@ -1,7 +1,4 @@ -import puppeteer, { - type Browser, type Frame, type GoToOptions, type Page, type PuppeteerLifeCycleEvent, -} from 'puppeteer'; - +import puppeteer, { type Frame, type GoToOptions, type Page, type PuppeteerLifeCycleEvent } from 'puppeteer'; import { ScraperProgressTypes } from '../definitions'; import { getDebug } from '../helpers/debug'; import { clickButton, fillInput, waitUntilElementFound } from '../helpers/elements-interactions'; @@ -85,9 +82,7 @@ function createGeneralError(): ScraperScrapingResult { } class BaseScraperWithBrowser extends BaseScraper { - // NOTICE - it is discouraged to use bang (!) in general. It is used here because - // all the classes that inherit from this base assume is it mandatory. - protected browser!: Browser; + private cleanups: Array<() => Promise> = []; // NOTICE - it is discouraged to use bang (!) in general. It is used here because // all the classes that inherit from this base assume is it mandatory. @@ -100,53 +95,69 @@ class BaseScraperWithBrowser extends Ba }; } - async initialize() { - await super.initialize(); - debug('initialize scraper'); - this.emitProgress(ScraperProgressTypes.Initializing); - - let env: Record | undefined; - if (this.options.verbose) { - env = { DEBUG: '*', ...process.env }; + private async initializePage() { + debug('initialize browser page'); + if ('browserContext' in this.options) { + debug('Using the browser context provided in options'); + return this.options.browserContext.newPage(); } - if (typeof this.options.browser !== 'undefined' && this.options.browser !== null) { - debug('use custom browser instance provided in options'); - this.browser = this.options.browser; - } else { - const executablePath = this.options.executablePath || undefined; - const args = this.options.args || []; - const { timeout } = this.options; - - const headless = !this.options.showBrowser; - debug(`launch a browser with headless mode = ${headless}`); - this.browser = await puppeteer.launch({ - env, - headless, - executablePath, - args, - timeout, + if ('browser' in this.options) { + debug('Using the browser instance provided in options'); + const { browser } = this.options; + + /** + * For backward compatibility, we will close the browser even if we didn't create it + */ + this.cleanups.push(async () => { + debug('closing the browser'); + await browser.close(); }); - } + + return browser.newPage(); + } + + const { timeout, args, executablePath, showBrowser } = this.options; + + const headless = !showBrowser; + debug(`launch a browser with headless mode = ${headless}`); + + const browser = await puppeteer.launch({ + env: this.options.verbose ? { DEBUG: '*', ...process.env } : undefined, + headless, + executablePath, + args, + timeout, + }); + + this.cleanups.push(async () => { + debug('closing the browser'); + await browser.close(); + }); if (this.options.prepareBrowser) { debug("execute 'prepareBrowser' interceptor provided in options"); - await this.options.prepareBrowser(this.browser); + await this.options.prepareBrowser(browser); } - if (!this.browser) { - debug('failed to initiate a browser, exit'); + debug('create a new browser page'); + return browser.newPage(); + } + + async initialize() { + await super.initialize(); + debug('initialize scraper'); + this.emitProgress(ScraperProgressTypes.Initializing); + + const page = await this.initializePage(); + if (!page) { + debug('failed to initiate a browser page, exit'); return; } - const pages = await this.browser.pages(); - if (pages.length) { - debug('browser has already pages open, use the first one'); - [this.page] = pages; - } else { - debug('create a new browser page'); - this.page = await this.browser.newPage(); - } + this.page = page; + + this.cleanups.push( () => page.close()); if (this.options.defaultTimeout) { this.page.setDefaultTimeout(this.options.defaultTimeout); @@ -274,11 +285,8 @@ class BaseScraperWithBrowser extends Ba }); } - if (!this.browser) { - return; - } - - await this.browser.close(); + await Promise.all(this.cleanups.reverse().map((cleanup) => cleanup())); + this.cleanups = []; } private handleLoginResult(loginResult: LoginResults) { diff --git a/src/scrapers/interface.ts b/src/scrapers/interface.ts index f10e51a4..d78001af 100644 --- a/src/scrapers/interface.ts +++ b/src/scrapers/interface.ts @@ -1,4 +1,4 @@ -import { type Browser, type Page } from 'puppeteer'; +import { type BrowserContext, type Browser, type Page } from 'puppeteer'; import { type CompanyTypes, type ScraperProgressTypes } from '../definitions'; import { type TransactionsAccount } from '../transactions'; import { type ErrorResult, type ScraperErrorTypes } from './errors'; @@ -26,7 +26,7 @@ export interface FutureDebit { bankAccountNumber?: string; } -export interface ScraperOptions { +export type ScraperOptions = ScraperBrowserOptions & { /** * The company you want to scrape */ @@ -42,82 +42,89 @@ export interface ScraperOptions { */ startDate: Date; - /** - * shows the browser while scraping, good for debugging (default false) - */ - showBrowser?: boolean; - /** * scrape transactions to be processed X months in the future */ futureMonthsToScrape?: number; /** - * option from init puppeteer browser instance outside the libary scope. you can get - * browser diretly from puppeteer via `puppeteer.launch()` - */ - browser?: any; - - /** - * provide a patch to local chromium to be used by puppeteer. Relevant when using - * `israeli-bank-scrapers-core` library + * if set to true, all installment transactions will be combine into the first one */ - executablePath?: string; + combineInstallments?: boolean; /** - * if set to true, all installment transactions will be combine into the first one + * if set, store a screenshot if failed to scrape. Used for debug purposes */ - combineInstallments?: boolean; + storeFailureScreenShotPath?: string; /** - * additional arguments to pass to the browser instance. The list of flags can be found in - * - * https://developer.mozilla.org/en-US/docs/Mozilla/Command_Line_Options - * https://peter.sh/experiments/chromium-command-line-switches/ + * if set, will set the timeout in milliseconds of puppeteer's `page.setDefaultTimeout`. */ - args?: string[]; + defaultTimeout?: number; /** - * Maximum navigation time in milliseconds, pass 0 to disable timeout. - * @default 30000 + * Options for manipulation of output data */ - timeout?: number | undefined; + outputData?: OutputDataOptions; /** - * adjust the browser instance before it is being used - * - * @param browser + * Perform additional operation for each transaction to get more information (Like category) about it. + * Please note: It will take more time to finish the process. */ - prepareBrowser?: (browser: Browser) => Promise; + additionalTransactionInformation?: boolean; +}; +export type ScraperBrowserOptions = { /** * adjust the page instance before it is being used. * * @param page */ - preparePage?: (page: Page) => Promise; + preparePage?: (page: Page) => Promise; } & ({ + /** + * option from init puppeteer browser instance outside the library scope. you can get + * browser directly from puppeteer via `puppeteer.launch()` + */ + browser: Browser; +} | { + /** + * + */ + browserContext: BrowserContext; +} | { + /** + * shows the browser while scraping, good for debugging (default false) + */ + showBrowser?: boolean; + /** - * if set, store a screenshot if failed to scrape. Used for debug purposes + * provide a patch to local chromium to be used by puppeteer. Relevant when using + * `israeli-bank-scrapers-core` library */ - storeFailureScreenShotPath?: string; + executablePath?: string; /** - * if set, will set the timeout in milliseconds of puppeteer's `page.setDefaultTimeout`. + * additional arguments to pass to the browser instance. The list of flags can be found in + * + * https://developer.mozilla.org/en-US/docs/Mozilla/Command_Line_Options + * https://peter.sh/experiments/chromium-command-line-switches/ */ - defaultTimeout?: number; + args?: string[]; /** - * Options for manipulation of output data + * Maximum navigation time in milliseconds, pass 0 to disable timeout. + * @default 30000 */ - outputData?: OutputDataOptions; + timeout?: number | undefined; /** - * Perform additional operation for each transaction to get more information (Like category) about it. - * Please note: It will take more time to finish the process. + * adjust the browser instance before it is being used + * + * @param browser */ - additionalTransactionInformation?: boolean; -} + prepareBrowser?: (browser: Browser) => Promise; +}); export interface OutputDataOptions { /**