From c37f8e657a4ef8129fa8c86136f302bb45b3cac6 Mon Sep 17 00:00:00 2001
From: Dale Ryan Aldover
Date: Sat, 26 Aug 2023 16:17:10 +0800
Subject: [PATCH 1/2] [Feature] Add types

---
Review notes (this section is ignored by `git am`):
- types/index.d.ts was corrected so that it type-checks: no initializers in
  type literals (defaults are kept as trailing comments), no `declare`/`async`
  modifiers or constructor return types inside the ambient module, explicit
  `Promise<unknown>` type arguments, and no duplicate `log` member.
- The blob ids on the `index` lines predate these corrections.

 package.json     |   1 +
 types/index.d.ts | 175 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 176 insertions(+)
 create mode 100644 types/index.d.ts

diff --git a/package.json b/package.json
index 2db8495..edc761e 100644
--- a/package.json
+++ b/package.json
@@ -3,6 +3,7 @@
   "version": "6.1.3",
   "description": "A web scraper for NodeJs",
   "main": "index.js",
+  "types": "types/index.d.ts",
   "scripts": {
     "test": "echo \"Error: no test specified\" && exit 1"
   },
diff --git a/types/index.d.ts b/types/index.d.ts
new file mode 100644
index 0000000..4d36407
--- /dev/null
+++ b/types/index.d.ts
@@ -0,0 +1,175 @@
+declare module 'nodejs-web-scraper' {
+  export type GlobalConfig = {
+    baseSiteUrl: string; // Default: ''
+    startUrl: string; // Default: ''
+    showConsoleLogs?: boolean; // Default: true
+    // If an image with the same name exists, a new file with a number appended to it is created. Otherwise, it's overwritten.
+    cloneFiles?: boolean; // Default: true
+    removeStyleAndScriptTags?: boolean; // Default: true
+    // Maximum concurrent requests.
+    concurrency?: number; // Default: 3
+    // Maximum number of retries of a failed request.
+    maxRetries?: number; // Default: 5
+    delay?: number; // Default: 200
+    timeout?: number; // Default: 6000
+    // Needs to be provided only if a DownloadContent operation is created.
+    filePath?: string | null;
+    auth?: any;
+    headers?: Headers;
+    proxy?: string;
+    agent?: any;
+    logPath?: string;
+    // callback runs whenever any error occurs during scraping
+    onError?: () => unknown;
+  }
+
+  export type ScrapingAction = any;
+
+  export class Scraper {
+    config: GlobalConfig & {
+      errorCodesToSkip: [404, 403, 400];
+      // usePuppeteer is deprecated since version 5. If you need it, downgrade to version 4.2.2
+      usePuppeteer: boolean;
+    };
+    qyu: Qyu;
+    state: {
+      failedScrapingIterations: any[];
+      downloadedFiles: number; // Starts at 0
+      currentlyRunning: number; // Starts at 0
+      registeredOperations: Operation[];
+      numRequests: number; // Starts at 0
+      repetitionCycles: number; // Starts at 0
+    };
+    requestSpacer: Promise<unknown>;
+    referenceToRoot?: Root;
+    constructor(globalConfig: GlobalConfig);
+
+    registerOperation(Operation: Operation): void;
+    // Scraper.destroy() is deprecated. You can now have multiple instances, without calling this method.
+    destroy(): void;
+    awaitBrowserReady(): Promise<unknown>;
+    validateGlobalConfig(conf: GlobalConfig): void;
+    // This function will begin the entire scraping process. Expects a reference to the root operation.
+    scrape(rootObject: Root): Promise<unknown>;
+    areThereRepeatableErrors(): boolean;
+    reportFailedScrapingAction(errorString: string): void;
+    saveFile(data: any, fileName: string): Promise<unknown>;
+    createLogs(): Promise<unknown>;
+    createLog(obj: { fileName: string; data: ScrapingAction | ScrapingAction[] }): Promise<unknown>;
+    // Declared once, as a method: a second `log: any` member would be a duplicate identifier.
+    log(message: string): void;
+  }
+
+  export class PageHelper {
+    constructor(Operation: Operation);
+    // Will process one scraping object, including a pagination object. Used by Root and OpenLinks.
+    processOneIteration(href: string, shouldPaginate: boolean): Promise<{
+      data: any[];
+      address: string;
+    }>;
+    // Divides a given page to multiple pages.
+    paginate(address: string): Promise<{
+      data: any[];
+      address: string;
+    }>;
+    getPage(href: string): Promise<unknown>;
+    runGetPageObjectHook(address: string, dataFromChildren: any[]): Promise<unknown>;
+    runAfterResponseHooks(response: any): Promise<unknown>;
+  }
+
+  export type RootConfig = {
+    // Look at the pagination API for more details.
+    pagination?: any;
+    getPageData?: (...args: any[]) => unknown;
+    // Receives a dictionary of children, and an address argument
+    getPageObject?: (...args: any[]) => unknown;
+    // Receives an axiosResponse object
+    getPageResponse?: (...args: any[]) => unknown;
+    // Receives htmlString and pageAddress
+    getPageHtml?: (...args: any[]) => unknown;
+    // Listens to every exception. Receives the Error object.
+    getException?: (error: Error) => unknown;
+  }
+
+  // Fetches the initial page, and starts the scraping process.
+  export class Root extends HttpOperation {
+    operations: Operation[];
+    pageHelper?: PageHelper;
+    constructor(config: RootConfig);
+    addOperation(Operation: Operation): void;
+    initPageHelper(): void;
+    scrape(): Promise<unknown>;
+    // Will get the errors from all registered operations.
+    getErrors(): string[];
+    validateOperationArguments(): void;
+    injectScraper(ScraperInstance: Scraper): void;
+    // Scrapes the child operations of this operation.
+    scrapeChildren: (childOperations: any, params: { url: any; html: any }) => Promise<unknown>;
+  }
+
+  /**
+   * @see {@link https://github.com/shtaif/Qyu | Qyu on github}
+   */
+  export class Qyu {
+    constructor(opts?: any, jobFn?: any, jobOpts?: any);
+    set(newOpts: any): void;
+    runJobChannel(): Promise<unknown>;
+    runJobChannels(): Promise<unknown>;
+    enqueue(fn: any, opts?: any): Promise<unknown>;
+    dequeue(promise: Promise<unknown>): any;
+    add(): Promise<unknown>;
+    map(iterator: any, fn: any, opts: any): Promise<unknown>;
+    pause(): Promise<unknown> | undefined;
+    resume(): void;
+    empty(): Promise<unknown>;
+    whenEmpty(): Promise<unknown>;
+    whenFree(): Promise<unknown>;
+    writeStream(chunkObjTransformer?: (chunk: any) => any): import('node:stream').Writable;
+    transformStream(chunkObjTransformer?: (chunk: any) => any): import('node:stream').Transform;
+  }
+
+  export type PromiseFactory = () => Promise<unknown>;
+
+  export type HttpOperationConfig = {
+    condition?: () => unknown;
+    getException?: (error: Error) => unknown;
+  } & OperationConfig;
+
+  export class HttpOperation extends Operation {
+    condition?: () => unknown;
+    counter?: number;
+    constructor(config: HttpOperationConfig);
+    emitError(error: Error): Promise<unknown>;
+    repeatPromiseUntilResolved(promiseFactory: PromiseFactory, href: string): Promise<unknown>;
+    // This function pushes promise-returning functions into the qyu.
+    qyuFactory(promiseFunction: (...args: any[]) => unknown): Qyu;
+    // Runs at the beginning of the promise-returning function, that is sent to repeatPromiseUntilResolved().
+    beforePromiseFactory(message: string): Promise<unknown>;
+    // Runs at the end of the promise-returning function, that is sent to repeatPromiseUntilResolved().
+    afterPromiseFactory(): void;
+    createDelay(): Promise<unknown>;
+  }
+
+  export type OperationConfig = {
+    name?: string;
+  }
+
+  export class Operation {
+    config: OperationConfig;
+    // Scraper instance is passed later on.
+    scraper?: Scraper;
+    // Holds all data collected by this operation, in the form of possibly multiple "ScrapingWrappers".
+    data: any[];
+    // Holds the overall communication errors, encountered by the operation.
+    errors: any[];
+    constructor(objectConfig: OperationConfig);
+    injectScraper(ScraperInstance: Scraper): void;
+    handleNewOperationCreation(Operation: Operation): void;
+    handleFailedScrapingIteration(errorString: string): void;
+    referenceToOperationObject(): this;
+    getData(): any[];
+    getErrors(): any[];
+    // Implemented by all Operation objects
+    validateOperationArguments(): unknown;
+  }
+}
\ No newline at end of file
From ec0f878b7ee18ffd07c8611f377ed21247f31502 Mon Sep 17 00:00:00 2001
From: Dale Ryan Aldover
Date: Sat, 26 Aug 2023 18:40:28 +0800
Subject: [PATCH 2/2] [Feature] Add type for OpenLinks

---
Review notes (this section is ignored by `git am`):
- `scrapeChildren` is already typed as `params: { url: any; html: any }` by
  patch 1, so it is plain context here. Writing `{ url: any, html: any }` in
  parameter position is a destructuring rename, not a type annotation.
- `querySelector` is a selector string, so it is typed `string` rather than
  `keyof HTMLElementTagNameMap`.
- The blob ids on the `index` line predate these corrections.

 types/index.d.ts | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/types/index.d.ts b/types/index.d.ts
index 4d36407..c0d62d8 100644
--- a/types/index.d.ts
+++ b/types/index.d.ts
@@ -89,7 +89,7 @@ declare module 'nodejs-web-scraper' {
     getPageHtml?: (...args: any[]) => unknown;
     // Listens to every exception. Receives the Error object.
     getException?: (error: Error) => unknown;
-  }
+  } & HttpOperationConfig;
 
   // Fetches the initial page, and starts the scraping process.
   export class Root extends HttpOperation {
@@ -102,9 +102,60 @@ declare module 'nodejs-web-scraper' {
     // Will get the errors from all registered operations.
     getErrors(): string[];
     validateOperationArguments(): void;
+
+    // Mixins
+    injectScraper(ScraperInstance: Scraper): void;
+    // Scrapes the child operations of this operation.
+    scrapeChildren: (childOperations: any, params: { url: any; html: any }) => Promise<unknown>;
+  }
+
+  export type ElementList = any;
+
+  export type OpenLinksConfig = {
+    name?: string; // Default: 'Default OpenLinks name'
+    // Look at the pagination API for more details.
+    pagination?: any;
+    slice?: number[];
+    // Receives a Cheerio node. Use this hook to decide if this node should be included in the scraping. Return true or false
+    condition?: (nodeFromCheerio: any) => boolean;
+    // Receives an elementList array
+    getElementList?: (elementList: ElementList[]) => unknown;
+    getPageData?: () => unknown;
+    // Receives a dictionary of children, and an address argument
+    getPageObject?: (children: any, address: any) => unknown;
+    // Receives an axiosResponse object
+    getPageResponse?: (axiosResponse: any) => unknown;
+    // Receives htmlString and pageAddress
+    getPageHtml?: (html: string, pageAddress: any) => unknown;
+    getException?: (error: Error) => unknown;
+    // Callback that receives the href before it is opened.
+    transformHref?: (href: string) => string;
+  }
+
+  export class OpenLinks extends HttpOperation {
+    pageHelper?: PageHelper;
+    operations: Operation[];
+    querySelector: string;
+    transformHref?: (href: string) => string;
+    /**
+     * @param querySelector - cheerio-advanced-selectors selector
+     * @param config - OpenLinksConfig
+     */
+    constructor(querySelector: string, config: OpenLinksConfig);
+    addOperation(Operation: Operation): void;
+    initPageHelper(): void;
+    validateOperationArguments(): void;
+    scrape(scrapeParams: { url: string; html: string }): Promise<{
+      type: string;
+      name: string;
+      data: any[];
+    }>;
+    createLinkList(html: string, url: string): Promise<unknown>;
+
+    // Mixins
     injectScraper(ScraperInstance: Scraper): void;
     // Scrapes the child operations of this operation.
     scrapeChildren: (childOperations: any, params: { url: any; html: any }) => Promise<unknown>;
   }
 
   /**