From 206638c059a3366c67d356318a2cea0f9eb2430a Mon Sep 17 00:00:00 2001 From: Nandini Muralidharan Date: Thu, 30 Apr 2026 12:21:19 +0530 Subject: [PATCH 1/4] feat: add browser-harness parallel web scraping sample Demonstrates using browser-harness with Playwright Workspaces to run 10+ parallel remote browser sessions for web scraping with LiveView debuggability. Includes: - Jupyter notebook with 6-section walkthrough - LiveViewWatcher helper for real-time session monitoring - Explicit coding agent prompt for PWW CDP connection - Product scraping example targeting books.toscrape.com Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../browser-harness-webscraping/.env.template | 8 + samples/browser-harness-webscraping/README.md | 110 +++++ .../helpers/__init__.py | 0 .../helpers/live_view_watcher.py | 107 ++++ .../parallel_webscraping.ipynb | 455 ++++++++++++++++++ .../requirements.txt | 6 + 6 files changed, 686 insertions(+) create mode 100644 samples/browser-harness-webscraping/.env.template create mode 100644 samples/browser-harness-webscraping/README.md create mode 100644 samples/browser-harness-webscraping/helpers/__init__.py create mode 100644 samples/browser-harness-webscraping/helpers/live_view_watcher.py create mode 100644 samples/browser-harness-webscraping/parallel_webscraping.ipynb create mode 100644 samples/browser-harness-webscraping/requirements.txt diff --git a/samples/browser-harness-webscraping/.env.template b/samples/browser-harness-webscraping/.env.template new file mode 100644 index 0000000..3474eb0 --- /dev/null +++ b/samples/browser-harness-webscraping/.env.template @@ -0,0 +1,8 @@ +# Azure Playwright Workspaces +SUBSCRIPTION_ID= +RESOURCE_GROUP= +LOCATION=eastus +PLAYWRIGHT_WORKSPACE_NAME= + +# This gets set automatically after PWW workspace creation (Step 2 in notebook) +# BU_CDP_WS=wss://browser.playwright.microsoft.com/ws?... 
diff --git a/samples/browser-harness-webscraping/README.md b/samples/browser-harness-webscraping/README.md new file mode 100644 index 0000000..9a2d5b9 --- /dev/null +++ b/samples/browser-harness-webscraping/README.md @@ -0,0 +1,110 @@ +# Parallel Web Scraping with Browser-Harness + Playwright Workspaces + +This sample demonstrates how to use [browser-harness](https://github.com/browser-use/browser-harness) with [Playwright Workspaces (PWW)](https://aka.ms/pww/docs) to run 10+ parallel remote browser sessions for web scraping, with LiveView for real-time debuggability. + +## Overview + +When you need to scrape data from many pages simultaneously — product prices, inventory levels, competitor catalogs — you need parallel browser sessions. This sample shows how to: + +1. **Create a Playwright Workspace** — managed cloud browsers on Azure +2. **Connect browser-harness** to PWW's remote CDP endpoint +3. **Spawn 10+ parallel browser sessions** — each with its own isolated browser +4. **Scrape product data** from multiple pages concurrently +5. **Debug in real-time** using PWW's LiveView + +## Architecture + +``` +┌─────────────────┐ ┌───────────────────────────┐ +│ Coding Agent │ │ Playwright Workspaces │ +│ (Claude Code / │────▶│ (Azure-managed browsers) │ +│ Codex) │ CDP │ │ +│ │ WSS │ ┌───────┐ ┌───────┐ │ +│ browser-harness│────▶│ │ Tab 1 │ │ Tab 2 │ ... │ +└─────────────────┘ │ └───────┘ └───────┘ │ + │ └───────────────────────────┘ + │ │ + ▼ ▼ +┌─────────────────┐ ┌───────────────────────────┐ +│ Aggregated │ │ LiveView (real-time) │ +│ Scraped Data │ │ Watch any session live │ +└─────────────────┘ └───────────────────────────┘ +``` + +## Prerequisites + +- **Azure subscription** with permissions to create Playwright Workspaces +- **Python 3.10+** +- **Git** installed +- **Azure CLI** authenticated (`az login`) +- Familiarity with Jupyter notebooks + +## Quick Start + +### 1. Install Dependencies + +```bash +pip install -r requirements.txt +``` + +### 2. 
Install Browser-Harness + +```bash +git clone https://github.com/browser-use/browser-harness +cd browser-harness +uv tool install -e . +``` + +### 3. Set Up Environment Variables + +Copy `.env.template` to `.env` and fill in your values: + +```bash +cp .env.template .env +``` + +Required variables: +``` +SUBSCRIPTION_ID= +RESOURCE_GROUP= +LOCATION=eastus +PLAYWRIGHT_WORKSPACE_NAME= +``` + +### 4. Run the Notebook + +Open `parallel_webscraping.ipynb` and follow the step-by-step instructions. + +## What You'll Learn + +- How to create and manage Playwright Workspaces programmatically +- How to connect browser-harness to remote CDP endpoints (PWW) +- The two-step connection flow (HTTP GET → resolve `sessionUrl` → set `BU_CDP_WS`) +- How to run 10+ parallel browser sessions for scraping +- How to use LiveView for real-time debugging of remote browser sessions + +## Files in This Sample + +| File | Description | +|------|-------------| +| `README.md` | This file | +| `requirements.txt` | Python dependencies | +| `.env.template` | Environment variable template | +| `parallel_webscraping.ipynb` | Step-by-step notebook | +| `helpers/live_view_watcher.py` | LiveView session watcher utility | + +## Important Notes + +- **Do NOT restart the daemon** after connecting to PWW — the remote browser is destroyed when the WebSocket closes +- **Cold start latency**: The initial browser provisioning takes 30-90 seconds +- **Session lifetime**: The browser stays alive as long as the daemon holds the WebSocket connection +- **Connect immediately**: After resolving the `sessionUrl`, connect the daemon right away — the session URL is ephemeral and expires quickly +- **Token limits**: PWW workspaces have a maximum number of access tokens. 
Delete unused tokens before creating new ones
+- **CLI usage**: On Windows, browser-harness requires the `-c` flag: `browser-harness -c "print(page_info())"`
+- The scraping target (`books.toscrape.com`) is a public demo site designed for scraping practice
+
+## More Resources
+
+- [Playwright Workspaces Documentation](https://aka.ms/pww/docs)
+- [Browser-Harness GitHub](https://github.com/browser-use/browser-harness)
+- [PWW Pricing](https://aka.ms/pww/pricing)
diff --git a/samples/browser-harness-webscraping/helpers/__init__.py b/samples/browser-harness-webscraping/helpers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/samples/browser-harness-webscraping/helpers/live_view_watcher.py b/samples/browser-harness-webscraping/helpers/live_view_watcher.py
new file mode 100644
index 0000000..f4aec7f
--- /dev/null
+++ b/samples/browser-harness-webscraping/helpers/live_view_watcher.py
@@ -0,0 +1,123 @@
+"""
+LiveViewWatcher — polls Playwright Workspaces for new browser sessions
+and auto-opens the LiveView URL for real-time debugging.
+
+Usage:
+    from helpers.live_view_watcher import LiveViewWatcher
+
+    watcher = LiveViewWatcher(pw_client, workspace_id, credential, auth_token)
+    watcher.start()
+    # ... run your browser automation ...
+    watcher.stop()
+"""
+
+import logging
+import threading
+import webbrowser
+from urllib.parse import quote
+
+logger = logging.getLogger(__name__)
+
+
+class LiveViewWatcher:
+    """Poll Playwright Workspaces for a new browser session and open LiveView.
+
+    start() snapshots the session ids that already exist; a background
+    thread then opens the live viewer for one session id that is not in that
+    snapshot and exits.
+    """
+
+    # NOTE(review): this host name looks like a test storage account, and
+    # _build_live_url() puts the access token in its query string. Confirm
+    # this is the intended viewer origin before publishing the sample.
+    LIVE_VIEW_BASE_URL = "https://stcnttestdataknarayasea.z23.web.core.windows.net/live_viewer_pww.html"
+
+    def __init__(self, pw_client, workspace_id, credential, auth_token,
+                 auth_service_base=None, poll_interval=2):
+        """
+        Args:
+            pw_client: PlaywrightClient instance
+            workspace_id: PWW workspace ID
+            credential: Azure credential (for future token refresh)
+            auth_token: JWT access token for the live viewer
+            auth_service_base: Base URL of the auth service (derived from dataplane_uri)
+            poll_interval: Seconds between polling attempts
+        """
+        self.pw_client = pw_client
+        self.workspace_id = workspace_id
+        self.credential = credential
+        self.auth_token = auth_token
+        self.auth_service_base = auth_service_base or ""
+        self.poll_interval = poll_interval
+        self.stop_event = threading.Event()
+        self.session_id = None
+        self.thread = None
+        self.existing_sessions = set()
+
+    def _build_live_url(self, session_id):
+        """Construct the PWW live viewer URL with all required params."""
+        return (
+            f"{self.LIVE_VIEW_BASE_URL}"
+            f"?session={quote(session_id)}"
+            f"&workspace={quote(self.workspace_id)}"
+            f"&authBase={quote(self.auth_service_base)}"
+            f"&token={quote(self.auth_token)}"
+        )
+
+    def _list_session_ids(self):
+        """Return the ids of the browser sessions currently listed for the workspace."""
+        return {s.id for s in self.pw_client.browser_sessions.list(self.workspace_id)}
+
+    def _open_new_session(self):
+        """Open LiveView for a session that is not in the start() snapshot.
+
+        Returns:
+            True if a new session was found and the viewer was opened,
+            False otherwise (including when the attempt raised).
+        """
+        try:
+            new_sessions = self._list_session_ids() - self.existing_sessions
+            if not new_sessions:
+                return False
+            # min() instead of set.pop(): pick the same session on every run
+            # when several sessions appear in the same polling interval.
+            self.session_id = min(new_sessions)
+            live_url = self._build_live_url(self.session_id)
+            print(f"\n [LiveView] Session detected: {self.session_id}")
+            print(" [LiveView] Opening browser...")
+            webbrowser.open(live_url)
+            return True
+        except Exception:
+            # Best effort: a transient failure must not kill the watcher
+            # thread, but record it so it can be diagnosed.
+            logger.debug("[LiveView] Polling for new sessions failed", exc_info=True)
+            return False
+
+    def start(self):
+        """Snapshot existing sessions and start polling in background."""
+        try:
+            self.existing_sessions = self._list_session_ids()
+        except Exception as exc:
+            # Without a snapshot every listed session counts as new, so an
+            # already-running session may be opened; make that visible.
+            logger.warning("[LiveView] Could not snapshot existing sessions: %s", exc)
+            self.existing_sessions = set()
+        self.stop_event.clear()
+        self.session_id = None
+        self.thread = threading.Thread(target=self._poll, daemon=True)
+        self.thread.start()
+
+    def stop(self):
+        """Signal the poller to stop and wait up to 10 seconds for its final check."""
+        self.stop_event.set()
+        if self.thread:
+            self.thread.join(timeout=10)
+
+    def _poll(self):
+        """Thread body: poll until a session is opened or stop() is called."""
+        while not self._open_new_session():
+            if self.stop_event.wait(self.poll_interval):
+                # stop() was requested: check once more so a session created
+                # just before stop() is still picked up, then exit.
+                self._open_new_session()
+                return
diff --git a/samples/browser-harness-webscraping/parallel_webscraping.ipynb b/samples/browser-harness-webscraping/parallel_webscraping.ipynb
new file mode 100644
index 0000000..a13a63a
--- /dev/null
+++ b/samples/browser-harness-webscraping/parallel_webscraping.ipynb
@@ -0,0 +1,455 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Parallel Web Scraping with Browser-Harness + Playwright Workspaces\n",
+    "\n",
+    "This notebook demonstrates how to:\n",
+    "1. Create a Playwright Workspace (PWW) on Azure\n",
+    "2. Connect browser-harness to the PWW remote CDP endpoint\n",
+    "3. Spawn 10+ parallel browser sessions for web scraping\n",
+    "4. Use LiveView for real-time debuggability\n",
+    "\n",
+    "**Target**: Scrape product data from [books.toscrape.com](http://books.toscrape.com) across multiple category pages simultaneously."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Section 1: Prerequisites & Setup\n", + "\n", + "Ensure you have:\n", + "- Azure CLI authenticated (`az login`)\n", + "- browser-harness installed (`git clone https://github.com/browser-use/browser-harness && cd browser-harness && uv tool install -e .`)\n", + "- Dependencies installed (`pip install -r requirements.txt`)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import uuid\n", + "import subprocess\n", + "from datetime import datetime, timedelta, timezone\n", + "from urllib.parse import urlparse\n", + "from concurrent.futures import ThreadPoolExecutor, as_completed\n", + "\n", + "import pandas as pd\n", + "from dotenv import load_dotenv\n", + "from azure.identity import DefaultAzureCredential\n", + "from azure.mgmt.playwright import PlaywrightMgmtClient\n", + "from azure.mgmt.playwright.models import PlaywrightWorkspace, PlaywrightWorkspaceProperties\n", + "from azure.developer.playwright import PlaywrightClient\n", + "\n", + "load_dotenv()\n", + "\n", + "# Configuration\n", + "SUBSCRIPTION_ID = os.environ[\"SUBSCRIPTION_ID\"]\n", + "RESOURCE_GROUP = os.environ[\"RESOURCE_GROUP\"]\n", + "LOCATION = os.environ.get(\"LOCATION\", \"eastus\")\n", + "PLAYWRIGHT_WORKSPACE_NAME = os.environ[\"PLAYWRIGHT_WORKSPACE_NAME\"]\n", + "\n", + "credential = DefaultAzureCredential()\n", + "print(\"✅ Configuration loaded\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Section 2: Create Playwright Workspace (PWW)\n", + "\n", + "This creates a managed Playwright Workspace on Azure that provides cloud-hosted browsers.\n", + "Skip this cell if your workspace already exists." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create or get the Playwright Workspace\n", + "pw_mgmt = PlaywrightMgmtClient(credential, SUBSCRIPTION_ID)\n", + "\n", + "print(f\"Creating Playwright Workspace: {PLAYWRIGHT_WORKSPACE_NAME}...\")\n", + "workspace = pw_mgmt.playwright_workspaces.begin_create_or_update(\n", + " resource_group_name=RESOURCE_GROUP,\n", + " playwright_workspace_name=PLAYWRIGHT_WORKSPACE_NAME,\n", + " resource=PlaywrightWorkspace(\n", + " location=LOCATION,\n", + " properties=PlaywrightWorkspaceProperties(local_auth=\"Enabled\"),\n", + " ),\n", + ").result()\n", + "\n", + "workspace_id = workspace.properties.workspace_id\n", + "dataplane_uri = workspace.properties.dataplane_uri\n", + "base_url = f\"{urlparse(dataplane_uri).scheme}://{urlparse(dataplane_uri).netloc}\"\n", + "\n", + "print(f\"✅ Workspace ready: {workspace_id}\")\n", + "print(f\" Dataplane: {base_url}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create an access token for the workspace\n", + "pw_client = PlaywrightClient(endpoint=base_url, credential=credential)\n", + "\n", + "access_token_id = str(uuid.uuid4())\n", + "token = pw_client.access_tokens.create_or_replace(\n", + " workspace_id=workspace_id,\n", + " access_token_id=access_token_id,\n", + " resource={\n", + " \"name\": f\"scraping-demo-{access_token_id[:8]}\",\n", + " \"expiryAt\": (datetime.now(timezone.utc) + timedelta(days=30)).isoformat()\n", + " }\n", + ")\n", + "\n", + "playwright_api_key = token.jwt_token\n", + "print(\"✅ Access token created (valid 30 days)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Section 3: Connect Browser-Harness to PWW Remote Endpoint\n", + "\n", + "### The Connection Prompt\n", + "\n", + "Paste the following prompt into your coding agent (Claude Code / Codex) to connect browser-harness to the PWW remote 
browser:\n", + "\n", + "---\n", + "\n", + "```text\n", + "Set up browser-harness to connect to my Playwright Workspaces remote browser.\n", + "\n", + "Read install.md and SKILL.md first. Then connect to this Azure Playwright Service endpoint:\n", + "\n", + " SERVICE_URL=\n", + "\n", + "Follow the two-step connection flow:\n", + "1. HTTP GET the SERVICE_URL (allow 60-90s for the browser to spin up). Parse the JSON response to extract the `sessionUrl` (a wss:// WebSocket URL).\n", + "2. Set BU_CDP_WS to the resolved sessionUrl in .env, then restart the daemon ONCE.\n", + "\n", + "IMPORTANT:\n", + "- Do NOT kill or restart the daemon after the session is connected — the remote browser is destroyed when the WebSocket closes.\n", + "- Do NOT set shouldRedirect=true; use shouldRedirect=false and manually resolve the sessionUrl.\n", + "- The cold start takes 30-90s. Use a generous timeout on the initial HTTP GET.\n", + "- After connecting, verify with: browser-harness <<'PY'\\nprint(page_info())\\nPY\n", + "\n", + "Once connected, confirm with a screenshot that the remote browser is alive.\n", + "```\n", + "\n", + "---\n", + "\n", + "### Programmatic Connection (alternative)\n", + "\n", + "If you prefer to connect programmatically instead of via the coding agent prompt:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import urllib.request\n", + "\n", + "# Build the PWW service URL\n", + "service_url = (\n", + " f\"https://{urlparse(dataplane_uri).netloc}\"\n", + " f\"/playwrightworkspaces/{workspace_id}/browsers\"\n", + " f\"?playwrightVersion=cdp&shouldRedirect=false\"\n", + " f\"&accessKey={playwright_api_key}\"\n", + ")\n", + "\n", + "print(\"Resolving remote browser session (30-90s cold start)...\")\n", + "\n", + "# Step 1: HTTP GET to provision the browser and get the CDP WebSocket URL\n", + "resp = urllib.request.urlopen(service_url, timeout=120)\n", + "data = json.loads(resp.read())\n", + 
"cdp_ws_url = data[\"sessionUrl\"]\n", + "\n", + "print(f\"✅ Remote browser provisioned\")\n", + "print(f\" CDP WebSocket: {cdp_ws_url[:80]}...\")\n", + "\n", + "# Step 2: Write to .env so browser-harness picks it up\n", + "env_path = os.path.join(os.path.dirname(os.path.abspath('.')), 'browser-harness', '.env')\n", + "# Or set it directly for this session:\n", + "os.environ[\"BU_CDP_WS\"] = cdp_ws_url\n", + "\n", + "print(\"\\n⚠️ IMPORTANT: Do NOT restart the daemon after this point.\")\n", + "print(\" The remote browser is destroyed when the WebSocket closes.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Verify connection\n", + "# NOTE: browser-harness uses -c flag for script execution\n", + "result = subprocess.run(\n", + " [\"browser-harness\", \"-c\", \"print(page_info())\"],\n", + " capture_output=True, text=True, timeout=30,\n", + " env={**os.environ, \"BU_CDP_WS\": cdp_ws_url}\n", + ")\n", + "print(result.stdout)\n", + "if result.returncode == 0:\n", + " print(\"✅ Browser-harness connected to PWW remote browser\")\n", + "else:\n", + " print(f\"❌ Connection failed: {result.stderr}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Section 4: Parallel Web Scraping (10+ Sessions)\n", + "\n", + "We'll scrape product data from [books.toscrape.com](http://books.toscrape.com) — a public demo site designed for scraping practice.\n", + "\n", + "Each parallel session scrapes a different category page." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the pages to scrape (one per parallel browser session)\n", + "CATEGORY_URLS = [\n", + " \"http://books.toscrape.com/catalogue/category/books/travel_2/index.html\",\n", + " \"http://books.toscrape.com/catalogue/category/books/mystery_3/index.html\",\n", + " \"http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html\",\n", + " \"http://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html\",\n", + " \"http://books.toscrape.com/catalogue/category/books/classics_6/index.html\",\n", + " \"http://books.toscrape.com/catalogue/category/books/philosophy_7/index.html\",\n", + " \"http://books.toscrape.com/catalogue/category/books/romance_8/index.html\",\n", + " \"http://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html\",\n", + " \"http://books.toscrape.com/catalogue/category/books/fiction_10/index.html\",\n", + " \"http://books.toscrape.com/catalogue/category/books/childrens_11/index.html\",\n", + " \"http://books.toscrape.com/catalogue/category/books/religion_12/index.html\",\n", + " \"http://books.toscrape.com/catalogue/category/books/nonfiction_13/index.html\",\n", + "]\n", + "\n", + "print(f\"Will scrape {len(CATEGORY_URLS)} category pages in parallel\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Scraping script template for each parallel browser session\n", + "# browser-harness uses -c flag: browser-harness -c \"