Skip to content

Commit e20b4bf

Browse files
authored
chore(CI): typecheck TS examples in docs automatically (apify#3278)
The examples in the documentation are no longer failing with TS errors on build. This PR improves the DX (users copying examples from the docs will get valid scripts) and acts as an additional guard rail when making larger changes in Crawlee.
1 parent df822c2 commit e20b4bf

24 files changed

+558
-99
lines changed

.github/workflows/test-ci.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,14 @@ jobs:
7676
env:
7777
YARN_IGNORE_NODE: 1
7878

79+
- name: Typecheck documentation examples
80+
working-directory: ./docs
81+
run: |
82+
yarn
83+
yarn typecheck
84+
env:
85+
YARN_IGNORE_NODE: 1
86+
7987
- name: Tests
8088
run: yarn test
8189
env:

docs/examples/.eslintrc.json

Lines changed: 0 additions & 13 deletions
This file was deleted.

docs/examples/cheerio_crawler.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ const crawler = new CheerioCrawler({
3535
// Extract data from the page using cheerio.
3636
const title = $('title').text();
3737
const h1texts: { text: string }[] = [];
38-
$('h1').each((index, el) => {
38+
$('h1').each((_, el) => {
3939
h1texts.push({
4040
text: $(el).text(),
4141
});

docs/examples/crawler-plugins/playwright-extra.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,14 @@ const crawler = new PlaywrightCrawler({
3535

3636
// A function to be evaluated by Puppeteer within the browser context.
3737
const data = await page.$$eval('.athing', ($posts) => {
38-
const scrapedData: { title: string; rank: string; href: string }[] = [];
38+
const scrapedData: { title?: string; rank?: string; href?: string }[] = [];
3939

4040
// We're getting the title, rank and URL of each post on Hacker News.
4141
$posts.forEach(($post) => {
4242
scrapedData.push({
43-
title: $post.querySelector('.title a').innerText,
44-
rank: $post.querySelector('.rank').innerText,
45-
href: $post.querySelector('.title a').href,
43+
title: $post.querySelector<HTMLElement>('.title a')?.innerText,
44+
rank: $post.querySelector<HTMLElement>('.rank')?.innerText,
45+
href: $post.querySelector<HTMLAnchorElement>('.title a')?.href,
4646
});
4747
});
4848

docs/examples/crawler-plugins/puppeteer-extra.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import stealthPlugin from 'puppeteer-extra-plugin-stealth';
44

55
// First, we tell puppeteer-extra to use the plugin (or plugins) we want.
66
// Certain plugins might have options you can pass in - read up on their documentation!
7+
// @ts-expect-error - The default export types for puppeteer-extra don't properly expose the 'use' method in ESM contexts
78
puppeteerExtra.use(stealthPlugin());
89

910
// Create an instance of the PuppeteerCrawler class - a crawler
@@ -32,14 +33,14 @@ const crawler = new PuppeteerCrawler({
3233

3334
// A function to be evaluated by Puppeteer within the browser context.
3435
const data = await page.$$eval('.athing', ($posts) => {
35-
const scrapedData: { title: string; rank: string; href: string }[] = [];
36+
const scrapedData: { title?: string; rank?: string; href?: string }[] = [];
3637

3738
// We're getting the title, rank and URL of each post on Hacker News.
3839
$posts.forEach(($post) => {
3940
scrapedData.push({
40-
title: $post.querySelector('.title a').innerText,
41-
rank: $post.querySelector('.rank').innerText,
42-
href: $post.querySelector('.title a').href,
41+
title: $post.querySelector<HTMLElement>('.title a')?.innerText,
42+
rank: $post.querySelector<HTMLElement>('.rank')?.innerText,
43+
href: $post.querySelector<HTMLAnchorElement>('.title a')?.href,
4344
});
4445
});
4546

docs/examples/playwright_crawler.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,14 @@ const crawler = new PlaywrightCrawler({
2525

2626
// A function to be evaluated by Playwright within the browser context.
2727
const data = await page.$$eval('.athing', ($posts) => {
28-
const scrapedData: { title: string; rank: string; href: string }[] = [];
28+
const scrapedData: { title?: string; rank?: string; href?: string }[] = [];
2929

3030
// We're getting the title, rank and URL of each post on Hacker News.
3131
$posts.forEach(($post) => {
3232
scrapedData.push({
33-
title: $post.querySelector('.title a').innerText,
34-
rank: $post.querySelector('.rank').innerText,
35-
href: $post.querySelector('.title a').href,
33+
title: $post.querySelector<HTMLElement>('.title a')?.innerText,
34+
rank: $post.querySelector<HTMLElement>('.rank')?.innerText,
35+
href: $post.querySelector<HTMLAnchorElement>('.title a')?.href,
3636
});
3737
});
3838

docs/examples/puppeteer_crawler.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,14 @@ const crawler = new PuppeteerCrawler({
2525

2626
// A function to be evaluated by Puppeteer within the browser context.
2727
const data = await page.$$eval('.athing', ($posts) => {
28-
const scrapedData: { title: string; rank: string; href: string }[] = [];
28+
const scrapedData: { title?: string; rank?: string; href?: string }[] = [];
2929

3030
// We're getting the title, rank and URL of each post on Hacker News.
3131
$posts.forEach(($post) => {
3232
scrapedData.push({
33-
title: $post.querySelector('.title a').innerText,
34-
rank: $post.querySelector('.rank').innerText,
35-
href: $post.querySelector('.title a').href,
33+
title: $post.querySelector<HTMLElement>('.title a')?.innerText,
34+
rank: $post.querySelector<HTMLElement>('.rank')?.innerText,
35+
href: $post.querySelector<HTMLAnchorElement>('.title a')?.href,
3636
});
3737
});
3838

docs/examples/tsconfig.json

Lines changed: 0 additions & 21 deletions
This file was deleted.

docs/guides/custom-http-client/implementation.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import type {
88
} from '@crawlee/core';
99
import { Readable } from 'node:stream';
1010

11-
class CustomHttpClient implements BaseHttpClient {
11+
export class CustomHttpClient implements BaseHttpClient {
1212
async sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
1313
request: HttpRequest<TResponseType>,
1414
): Promise<HttpResponse<TResponseType>> {
@@ -59,7 +59,7 @@ class CustomHttpClient implements BaseHttpClient {
5959
};
6060
}
6161

62-
async stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<StreamingHttpResponse> {
62+
async stream(request: HttpRequest, _onRedirect?: RedirectHandler): Promise<StreamingHttpResponse> {
6363
const fetchResponse = await fetch(request.url, {
6464
method: request.method,
6565
headers: new Headers(),
@@ -79,7 +79,7 @@ class CustomHttpClient implements BaseHttpClient {
7979
return null;
8080
}
8181
return pump();
82-
function pump() {
82+
function pump(): Promise<void> {
8383
return reader!.read().then(({ done, value }) => {
8484
// When no more data needs to be consumed, close the stream
8585
if (done) {

docs/guides/custom-http-client/usage.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
import { HttpCrawler } from 'crawlee';
2+
import { CustomHttpClient } from './implementation.js';
3+
14
const crawler = new HttpCrawler({
25
httpClient: new CustomHttpClient(),
36
async requestHandler() {

0 commit comments

Comments (0)