Skip to content

Commit

Permalink
Allow passing a browserContext
Browse files Browse the repository at this point in the history
  • Loading branch information
daniel-hauser committed Oct 2, 2024
1 parent 80c6757 commit 48c1580
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 89 deletions.
104 changes: 56 additions & 48 deletions src/scrapers/base-scraper-with-browser.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
import puppeteer, {
type Browser, type Frame, type GoToOptions, type Page, type PuppeteerLifeCycleEvent,
} from 'puppeteer';

import puppeteer, { type Frame, type GoToOptions, type Page, type PuppeteerLifeCycleEvent } from 'puppeteer';
import { ScraperProgressTypes } from '../definitions';
import { getDebug } from '../helpers/debug';
import { clickButton, fillInput, waitUntilElementFound } from '../helpers/elements-interactions';
Expand Down Expand Up @@ -85,9 +82,7 @@ function createGeneralError(): ScraperScrapingResult {
}

class BaseScraperWithBrowser<TCredentials extends ScraperCredentials> extends BaseScraper<TCredentials> {
// NOTICE - it is discouraged to use bang (!) in general. It is used here because
// all the classes that inherit from this base assume is it mandatory.
protected browser!: Browser;
private cleanups: Array<() => Promise<void>> = [];

// NOTICE - it is discouraged to use bang (!) in general. It is used here because
// all the classes that inherit from this base assume it is mandatory.
Expand All @@ -100,53 +95,69 @@ class BaseScraperWithBrowser<TCredentials extends ScraperCredentials> extends Ba
};
}

async initialize() {
await super.initialize();
debug('initialize scraper');
this.emitProgress(ScraperProgressTypes.Initializing);

let env: Record<string, any> | undefined;
if (this.options.verbose) {
env = { DEBUG: '*', ...process.env };
private async initializePage() {
debug('initialize browser page');
if ('browserContext' in this.options) {
debug('Using the browser context provided in options');
return this.options.browserContext.newPage();
}

if (typeof this.options.browser !== 'undefined' && this.options.browser !== null) {
debug('use custom browser instance provided in options');
this.browser = this.options.browser;
} else {
const executablePath = this.options.executablePath || undefined;
const args = this.options.args || [];
const { timeout } = this.options;

const headless = !this.options.showBrowser;
debug(`launch a browser with headless mode = ${headless}`);
this.browser = await puppeteer.launch({
env,
headless,
executablePath,
args,
timeout,
if ('browser' in this.options) {
debug('Using the browser instance provided in options');
const { browser } = this.options;

/**
* For backward compatibility, we will close the browser even if we didn't create it
*/
this.cleanups.push(async () => {
debug('closing the browser');
await browser.close();
});
}

return browser.newPage();
}

const { timeout, args, executablePath, showBrowser } = this.options;

const headless = !showBrowser;
debug(`launch a browser with headless mode = ${headless}`);

const browser = await puppeteer.launch({
env: this.options.verbose ? { DEBUG: '*', ...process.env } : undefined,
headless,
executablePath,
args,
timeout,
});

this.cleanups.push(async () => {
debug('closing the browser');
await browser.close();
});

if (this.options.prepareBrowser) {
debug("execute 'prepareBrowser' interceptor provided in options");
await this.options.prepareBrowser(this.browser);
await this.options.prepareBrowser(browser);
}

if (!this.browser) {
debug('failed to initiate a browser, exit');
debug('create a new browser page');
return browser.newPage();
}

async initialize() {
await super.initialize();
debug('initialize scraper');
this.emitProgress(ScraperProgressTypes.Initializing);

const page = await this.initializePage();
if (!page) {
debug('failed to initiate a browser page, exit');
return;
}

const pages = await this.browser.pages();
if (pages.length) {
debug('browser has already pages open, use the first one');
[this.page] = pages;
} else {
debug('create a new browser page');
this.page = await this.browser.newPage();
}
this.page = page;

this.cleanups.push( () => page.close());

if (this.options.defaultTimeout) {
this.page.setDefaultTimeout(this.options.defaultTimeout);
Expand Down Expand Up @@ -274,11 +285,8 @@ class BaseScraperWithBrowser<TCredentials extends ScraperCredentials> extends Ba
});
}

if (!this.browser) {
return;
}

await this.browser.close();
await Promise.all(this.cleanups.reverse().map((cleanup) => cleanup()));
this.cleanups = [];
}

private handleLoginResult(loginResult: LoginResults) {
Expand Down
89 changes: 48 additions & 41 deletions src/scrapers/interface.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { type Browser, type Page } from 'puppeteer';
import { type BrowserContext, type Browser, type Page } from 'puppeteer';
import { type CompanyTypes, type ScraperProgressTypes } from '../definitions';
import { type TransactionsAccount } from '../transactions';
import { type ErrorResult, type ScraperErrorTypes } from './errors';
Expand Down Expand Up @@ -26,7 +26,7 @@ export interface FutureDebit {
bankAccountNumber?: string;
}

export interface ScraperOptions {
export type ScraperOptions = ScraperBrowserOptions & {
/**
* The company you want to scrape
*/
Expand All @@ -42,82 +42,89 @@ export interface ScraperOptions {
*/
startDate: Date;

/**
* shows the browser while scraping, good for debugging (default false)
*/
showBrowser?: boolean;

/**
* scrape transactions to be processed X months in the future
*/
futureMonthsToScrape?: number;

/**
* option from init puppeteer browser instance outside the libary scope. you can get
* browser diretly from puppeteer via `puppeteer.launch()`
*/
browser?: any;

/**
* provide a patch to local chromium to be used by puppeteer. Relevant when using
* `israeli-bank-scrapers-core` library
* if set to true, all installment transactions will be combined into the first one
*/
executablePath?: string;
combineInstallments?: boolean;

/**
* if set to true, all installment transactions will be combine into the first one
* if set, store a screenshot if failed to scrape. Used for debug purposes
*/
combineInstallments?: boolean;
storeFailureScreenShotPath?: string;

/**
* additional arguments to pass to the browser instance. The list of flags can be found in
*
* https://developer.mozilla.org/en-US/docs/Mozilla/Command_Line_Options
* https://peter.sh/experiments/chromium-command-line-switches/
* if set, will set the timeout in milliseconds of puppeteer's `page.setDefaultTimeout`.
*/
args?: string[];
defaultTimeout?: number;

/**
* Maximum navigation time in milliseconds, pass 0 to disable timeout.
* @default 30000
* Options for manipulation of output data
*/
timeout?: number | undefined;
outputData?: OutputDataOptions;

/**
* adjust the browser instance before it is being used
*
* @param browser
* Perform additional operation for each transaction to get more information (Like category) about it.
* Please note: It will take more time to finish the process.
*/
prepareBrowser?: (browser: Browser) => Promise<void>;
additionalTransactionInformation?: boolean;
};

export type ScraperBrowserOptions = {
/**
* adjust the page instance before it is being used.
*
* @param page
*/
preparePage?: (page: Page) => Promise<void>;
preparePage?: (page: Page) => Promise<void>; } & ({
/**
* option from init puppeteer browser instance outside the library scope. you can get
* browser directly from puppeteer via `puppeteer.launch()`
*/
browser: Browser;
} | {
/**
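* a puppeteer browser context supplied by the caller; the scraper opens its page in it via `browserContext.newPage()`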
*/
browserContext: BrowserContext;
} | {
/**
* shows the browser while scraping, good for debugging (default false)
*/
showBrowser?: boolean;


/**
* if set, store a screenshot if failed to scrape. Used for debug purposes
* provide a path to local chromium to be used by puppeteer. Relevant when using
* `israeli-bank-scrapers-core` library
*/
storeFailureScreenShotPath?: string;
executablePath?: string;

/**
* if set, will set the timeout in milliseconds of puppeteer's `page.setDefaultTimeout`.
* additional arguments to pass to the browser instance. The list of flags can be found in
*
* https://developer.mozilla.org/en-US/docs/Mozilla/Command_Line_Options
* https://peter.sh/experiments/chromium-command-line-switches/
*/
defaultTimeout?: number;
args?: string[];

/**
* Options for manipulation of output data
* Maximum navigation time in milliseconds, pass 0 to disable timeout.
* @default 30000
*/
outputData?: OutputDataOptions;
timeout?: number | undefined;

/**
* Perform additional operation for each transaction to get more information (Like category) about it.
* Please note: It will take more time to finish the process.
* adjust the browser instance before it is being used
*
* @param browser
*/
additionalTransactionInformation?: boolean;
}
prepareBrowser?: (browser: Browser) => Promise<void>;
});

export interface OutputDataOptions {
/**
Expand Down

0 comments on commit 48c1580

Please sign in to comment.