From 0a725b01c1690f7ceeae90d327ba05b8fd41a9ab Mon Sep 17 00:00:00 2001 From: ThatXliner Date: Fri, 3 Apr 2026 22:08:38 -0700 Subject: [PATCH 01/11] feat(db): add Supabase Storage abstraction and imageUrl column Adds a storage-agnostic uploadImage/deleteImage API in packages/db/storage backed by Supabase Storage, so consumers never import Supabase directly. Adds imageUrl text column to Video schema (imageData kept temporarily for migration). Co-Authored-By: Claude Opus 4.6 --- packages/db/package.json | 5 +++ packages/db/src/schema.ts | 11 +++-- packages/db/src/storage.ts | 56 +++++++++++++++++++++++++ pnpm-lock.yaml | 83 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 151 insertions(+), 4 deletions(-) create mode 100644 packages/db/src/storage.ts diff --git a/packages/db/package.json b/packages/db/package.json index cef5270..9305219 100644 --- a/packages/db/package.json +++ b/packages/db/package.json @@ -14,6 +14,10 @@ "./schema": { "types": "./dist/schema.d.ts", "default": "./src/schema.ts" + }, + "./storage": { + "types": "./dist/storage.d.ts", + "default": "./src/storage.ts" } }, "license": "MIT", @@ -29,6 +33,7 @@ "with-env": "dotenv -e ../../.env --" }, "dependencies": { + "@supabase/supabase-js": "^2.101.1", "@vercel/postgres": "^0.10.0", "drizzle-orm": "^0.45.2", "drizzle-zod": "^0.8.3", diff --git a/packages/db/src/schema.ts b/packages/db/src/schema.ts index 88bf389..5524342 100644 --- a/packages/db/src/schema.ts +++ b/packages/db/src/schema.ts @@ -176,12 +176,15 @@ export const Video = pgTable( title: t.varchar({ length: 25 }).notNull(), // Max 25 chars description: t.text().notNull(), // 50-word catchy headline - // Hybrid image storage: Binary AI-generated images OR URL-based scraped thumbnails - imageData: bytea("image_data"), // Raw JPEG bytes (AI-generated) - imageMimeType: t.varchar("image_mime_type", { length: 50 }), // "image/jpeg" + // Image storage: URL to object storage (Supabase Storage / S3) + imageUrl: t.text(), // Public URL of uploaded image + thumbnailUrl: t.text(), // URL from source content (scraped) + + // Deprecated: binary image storage (pending migration removal) + imageData: bytea("image_data"), + imageMimeType: t.varchar("image_mime_type", { length: 50 }), imageWidth: t.integer("image_width"), imageHeight: t.integer("image_height"), - thumbnailUrl: t.text(), // URL from source content (scraped) // Metadata author: t.varchar({ length: 100 }), // "govtrack.com", "whitehouse.gov", etc. diff --git a/packages/db/src/storage.ts b/packages/db/src/storage.ts new file mode 100644 index 0000000..fe70ed7 --- /dev/null +++ b/packages/db/src/storage.ts @@ -0,0 +1,56 @@ +import { createClient } from "@supabase/supabase-js"; + +const BUCKET = process.env.SUPABASE_STORAGE_BUCKET ?? "images"; + +function getClient() { + const url = process.env.SUPABASE_URL; + const key = process.env.SUPABASE_SERVICE_ROLE_KEY; + if (!url || !key) { + throw new Error( + "Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY for storage", + ); + } + return createClient(url, key); +} + +/** + * Upload an image buffer to object storage. + * @param path - Storage path (e.g. 
"videos/abc-123.jpg") + * @param data - Raw image bytes + * @param mimeType - MIME type (default "image/jpeg") + * @returns Public URL of the uploaded image + */ +export async function uploadImage( + path: string, + data: Buffer, + mimeType = "image/jpeg", +): Promise { + const supabase = getClient(); + + const { error } = await supabase.storage.from(BUCKET).upload(path, data, { + contentType: mimeType, + upsert: true, + }); + + if (error) { + throw new Error(`Storage upload failed for ${path}: ${error.message}`); + } + + const { + data: { publicUrl }, + } = supabase.storage.from(BUCKET).getPublicUrl(path); + + return publicUrl; +} + +/** + * Delete an image from object storage. + * @param path - Storage path to delete + */ +export async function deleteImage(path: string): Promise { + const supabase = getClient(); + const { error } = await supabase.storage.from(BUCKET).remove([path]); + if (error) { + throw new Error(`Storage delete failed for ${path}: ${error.message}`); + } +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a8ded77..10c029f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -509,6 +509,9 @@ importers: packages/db: dependencies: + '@supabase/supabase-js': + specifier: ^2.101.1 + version: 2.101.1(bufferutil@4.1.0)(utf-8-validate@6.0.4) '@vercel/postgres': specifier: ^0.10.0 version: 0.10.0(utf-8-validate@6.0.4) @@ -3549,6 +3552,33 @@ packages: '@standard-schema/spec@1.1.0': resolution: {integrity: sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==} + '@supabase/auth-js@2.101.1': + resolution: {integrity: sha512-Kd0Wey+RkFHgyVep7adS6UOE2pN6MJ3mZ32PAXSvfw6IjUkFRC7IQpdZZjUOcUe5pXr1ejufCRgF6lsGINe4Tw==} + engines: {node: '>=20.0.0'} + + '@supabase/functions-js@2.101.1': + resolution: {integrity: sha512-OZWU7YtaG+NNNFZK8p/FuJ6gpq7pFyrG2fLOopP73HAIDHDGpOttPJapvO8ADu3RkqfQfkwrB354vPkSBbZ20A==} + engines: {node: '>=20.0.0'} + + '@supabase/phoenix@0.4.0': + resolution: {integrity: sha512-RHSx8bHS02xwfHdAbX5Lpbo6PXbgyf7lTaXTlwtFDPwOIw64NnVRwFAXGojHhjtVYI+PEPNSWwkL90f4agN3bw==} + + '@supabase/postgrest-js@2.101.1': + resolution: {integrity: sha512-UW1RajH5jbZoK+ldAJ1I6VZ+HWwZ2oaKjEQ6Gn+AQ67CHQVxGl8wNQoLYyumbyaExm41I+wn7arulcY1eHeZJw==} + engines: {node: '>=20.0.0'} + + '@supabase/realtime-js@2.101.1': + resolution: {integrity: sha512-Oa6dno0OB9I+hv5do5zsZHbFu41ViZnE9IWjmkeeF/8fPmB5fWoHGqeTYEC3/0DAgtpUoFJa4FpvzFH0SBHo1Q==} + engines: {node: '>=20.0.0'} + + '@supabase/storage-js@2.101.1': + resolution: {integrity: sha512-WhTaUOBgeEvnKLy95Cdlp6+D5igSF/65yC727w1olxbet5nzUvMlajKUWyzNtQu2efrz2cQ7FcdVBdQqgT9YKQ==} + engines: {node: '>=20.0.0'} + + '@supabase/supabase-js@2.101.1': + resolution: {integrity: sha512-Jnhm3LfuACwjIzvk2pfUbGQn7pa7hi6MFzfSyPrRYWVCCu69RPLCFyHSBl7HSBwadbQ3UZOznnD3gPca3ePrRA==} + engines: {node: '>=20.0.0'} + '@swc/helpers@0.5.15': resolution: {integrity: sha512-JQ5TuMi45Owi4/BIMAJBoSQoOJu12oOk/gADqlcUL9JEdHB8vyjUSsxqeNXnmXHjYKMi2WcYtezGEEhqUI/E2g==} @@ -3855,6 +3885,9 @@ packages: '@types/turndown@5.0.6': resolution: {integrity: sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg==} + '@types/ws@8.18.1': + resolution: {integrity: sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==} + '@types/yargs-parser@21.0.3': resolution: {integrity: sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==} @@ -5758,6 +5791,10 @@ packages: hyphenate-style-name@1.1.0: resolution: {integrity: 
sha512-WDC/ui2VVRrz3jOVi+XtjqkDjiVjTtFaAGiW37k6b+ohyQ5wYDOGkvCZa8+H0nx3gyvv0+BST9xuOgIyGQ00gw==} + iceberg-js@0.8.1: + resolution: {integrity: sha512-1dhVQZXhcHje7798IVM+xoo/1ZdVfzOMIc8/rgVSijRK38EDqOJoGula9N/8ZI5RD8QTxNQtK/Gozpr+qUqRRA==} + engines: {node: '>=20.0.0'} + iconv-lite@0.6.3: resolution: {integrity: sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==} engines: {node: '>=0.10.0'} @@ -11404,6 +11441,46 @@ snapshots: '@standard-schema/spec@1.1.0': {} + '@supabase/auth-js@2.101.1': + dependencies: + tslib: 2.8.1 + + '@supabase/functions-js@2.101.1': + dependencies: + tslib: 2.8.1 + + '@supabase/phoenix@0.4.0': {} + + '@supabase/postgrest-js@2.101.1': + dependencies: + tslib: 2.8.1 + + '@supabase/realtime-js@2.101.1(bufferutil@4.1.0)(utf-8-validate@6.0.4)': + dependencies: + '@supabase/phoenix': 0.4.0 + '@types/ws': 8.18.1 + tslib: 2.8.1 + ws: 8.20.0(bufferutil@4.1.0)(utf-8-validate@6.0.4) + transitivePeerDependencies: + - bufferutil + - utf-8-validate + + '@supabase/storage-js@2.101.1': + dependencies: + iceberg-js: 0.8.1 + tslib: 2.8.1 + + '@supabase/supabase-js@2.101.1(bufferutil@4.1.0)(utf-8-validate@6.0.4)': + dependencies: + '@supabase/auth-js': 2.101.1 + '@supabase/functions-js': 2.101.1 + '@supabase/postgrest-js': 2.101.1 + '@supabase/realtime-js': 2.101.1(bufferutil@4.1.0)(utf-8-validate@6.0.4) + '@supabase/storage-js': 2.101.1 + transitivePeerDependencies: + - bufferutil + - utf-8-validate + '@swc/helpers@0.5.15': dependencies: tslib: 2.8.1 @@ -11666,6 +11743,10 @@ snapshots: '@types/turndown@5.0.6': {} + '@types/ws@8.18.1': + dependencies: + '@types/node': 25.5.0 + '@types/yargs-parser@21.0.3': {} '@types/yargs@17.0.35': @@ -14013,6 +14094,8 @@ snapshots: hyphenate-style-name@1.1.0: {} + iceberg-js@0.8.1: {} + iconv-lite@0.6.3: dependencies: safer-buffer: 2.1.2 From 136d4798918bfb91e70dd9769bec1c7f304adb60 Mon Sep 17 00:00:00 2001 From: ThatXliner Date: Fri, 3 Apr 2026 22:08:49 -0700 Subject: [PATCH 02/11] feat(scraper,api): upload images to object storage instead of bytea MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scraper now uploads DALL-E images via @acme/db/storage and stores the public URL in Video.imageUrl. API serves the URL directly instead of base64-encoding blobs — eliminates the data URI overhead. 
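As a rough sketch of the new write path (the helper below is illustrative, not code added in this patch — the real call site is in apps/scraper/src/utils/db/video-operations.ts):

```ts
// Illustrative only: mirrors what the scraper now does after converting the generated image to JPEG.
import { uploadImage } from "@acme/db/storage";

export async function storeVideoImage(
  contentType: string,
  contentId: string,
  jpegData: Buffer,
): Promise<string> {
  // Path scheme used by the scraper: videos/<contentType>/<contentId>.jpg
  const storagePath = `videos/${contentType}/${contentId}.jpg`;
  // uploadImage upserts the object in the bucket and returns its public URL,
  // which is what gets persisted to Video.imageUrl.
  return uploadImage(storagePath, jpegData, "image/jpeg");
}
```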
Co-Authored-By: Claude Opus 4.6 --- apps/scraper/src/utils/db/video-operations.ts | 27 +++++++++---------- packages/api/src/router/content.ts | 2 +- packages/api/src/router/video.ts | 25 ++++++----------- 3 files changed, 21 insertions(+), 33 deletions(-) diff --git a/apps/scraper/src/utils/db/video-operations.ts b/apps/scraper/src/utils/db/video-operations.ts index d4ccca2..b908675 100644 --- a/apps/scraper/src/utils/db/video-operations.ts +++ b/apps/scraper/src/utils/db/video-operations.ts @@ -5,6 +5,7 @@ import { db } from '@acme/db/client'; import { Video } from '@acme/db/schema'; +import { uploadImage } from '@acme/db/storage'; import { and, eq } from '@acme/db'; import { generateMarketingCopy } from '../ai/marketing-generation.js'; import { generateImage, convertToJpeg } from '../ai/image-generation.js'; @@ -68,12 +69,14 @@ export async function generateVideoForContent( // Generate marketing copy const marketingCopy = await generateMarketingCopy(title, fullText, contentType); - // Generate and convert image - let imageData: Buffer | null = null; - let imageMimeType = 'image/jpeg'; + // Generate, convert, and upload image + let imageUrl: string | null = null; const generatedImage = await generateImage(marketingCopy.imagePrompt); if (generatedImage) { - imageData = await convertToJpeg(generatedImage.data); + const jpegData = await convertToJpeg(generatedImage.data); + const storagePath = `videos/${contentType}/${contentId}.jpg`; + imageUrl = await uploadImage(storagePath, jpegData); + logger.debug(`Uploaded image to ${storagePath}`); } // Random engagement metrics (same as current video.ts) @@ -83,7 +86,7 @@ export async function generateVideoForContent( shares: Math.floor(Math.random() * 1000) + 10, }; - // Upsert video with hybrid image support + // Upsert video try { await db .insert(Video) @@ -92,11 +95,8 @@ export async function generateVideoForContent( contentId, title: marketingCopy.title, description: marketingCopy.description, - imageData, - imageMimeType, - imageWidth: imageData ? 1024 : null, - imageHeight: imageData ? 1024 : null, - thumbnailUrl: thumbnailUrl ?? undefined, // Add URL-based thumbnail support + imageUrl, + thumbnailUrl: thumbnailUrl ?? undefined, author, engagementMetrics, sourceContentHash: contentHash, @@ -106,11 +106,8 @@ export async function generateVideoForContent( set: { title: marketingCopy.title, description: marketingCopy.description, - imageData, - imageMimeType, - imageWidth: imageData ? 1024 : null, - imageHeight: imageData ? 1024 : null, - thumbnailUrl: thumbnailUrl ?? undefined, // Update thumbnail URL on conflict + imageUrl, + thumbnailUrl: thumbnailUrl ?? 
undefined, sourceContentHash: contentHash, updatedAt: new Date(), }, diff --git a/packages/api/src/router/content.ts b/packages/api/src/router/content.ts index 41e68d1..ccdd1e8 100644 --- a/packages/api/src/router/content.ts +++ b/packages/api/src/router/content.ts @@ -49,7 +49,7 @@ const ContentCardSchema = z.object({ type: z.enum(["bill", "government_content", "court_case", "general"]), isAIGenerated: z.boolean(), thumbnailUrl: z.string().optional(), - imageUri: z.string().optional(), // Add support for AI-generated data URIs + imageUrl: z.string().optional(), }); export type ContentCard = z.infer; diff --git a/packages/api/src/router/video.ts b/packages/api/src/router/video.ts index 02919e0..46f5e75 100644 --- a/packages/api/src/router/video.ts +++ b/packages/api/src/router/video.ts @@ -7,7 +7,6 @@ import { Video } from "@acme/db/schema"; import { publicProcedure } from "../trpc"; -// Schema for video/feed post (from Video table) - Hybrid image support export const VideoPostSchema = z.object({ id: z.string(), title: z.string().max(100), @@ -18,10 +17,9 @@ export const VideoPostSchema = z.object({ shares: z.number(), type: z.enum(["bill", "government_content", "court_case", "general"]), articlePreview: z.string(), - // Hybrid image support - use whichever is available - imageUri: z.string().optional(), // Data URI from Video.imageData (AI-generated) - thumbnailUrl: z.string().optional(), // URL from source content (scraped) - originalContentId: z.string(), // Reference to source content + imageUrl: z.string().optional(), + thumbnailUrl: z.string().optional(), + originalContentId: z.string(), }); export type VideoPost = z.infer; @@ -45,15 +43,8 @@ export const videoRouter = { .limit(limit) .offset(cursor); - // Transform to feed format with hybrid image support + // Transform to feed format const feedPosts = videos.map((video) => { - // Handle AI-generated binary images (convert to data URI) - let imageUri: string | undefined; - if (video.imageData && video.imageMimeType) { - const base64 = video.imageData.toString("base64"); - imageUri = `data:${video.imageMimeType};base64,${base64}`; - } - const metrics = video.engagementMetrics as { likes: number; comments: number; @@ -80,10 +71,10 @@ export const videoRouter = { comments: metrics.comments, shares: metrics.shares, type, - articlePreview: video.description, // Marketing description as preview - imageUri, // AI-generated data URI (if exists) - thumbnailUrl: video.thumbnailUrl ?? undefined, // URL-based thumbnail (if exists) - originalContentId: video.contentId, // For "Read Full Article" navigation + articlePreview: video.description, + imageUrl: video.imageUrl ?? undefined, + thumbnailUrl: video.thumbnailUrl ?? undefined, + originalContentId: video.contentId, }; }); From 025e1fbeb18240d20066e88d46829aa80802b7ac Mon Sep 17 00:00:00 2001 From: ThatXliner Date: Fri, 3 Apr 2026 22:09:34 -0700 Subject: [PATCH 03/11] chore: rename imageUri to imageUrl, add migration script and env vars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renames imageUri → imageUrl across frontend and social-media-agent. Adds Supabase Storage env vars to .env.example. Adds migration script to move existing bytea blobs to Supabase Storage. 
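For consumers, the change is just the field name and what it holds — a base64 data URI becomes a plain storage URL (sketch only; the helper is hypothetical, the fallback expression comes from the updated Expo screens):

```ts
// Before this commit, feed items carried `imageUri` with a base64 data URI
// ("data:image/jpeg;base64,..."); they now carry `imageUrl` with a storage URL.
interface FeedImageFields {
  imageUrl?: string; // AI-generated image uploaded to Supabase Storage
  thumbnailUrl?: string; // scraped thumbnail fallback
}

// Hypothetical helper showing the fallback order the screens use.
export function pickCardImage(item: FeedImageFields): string | undefined {
  return item.imageUrl ?? item.thumbnailUrl;
}
```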
Co-Authored-By: Claude Opus 4.6 --- .env.example | 6 + apps/expo/src/app/(tabs)/feed.tsx | 6 +- apps/expo/src/app/(tabs)/index.tsx | 6 +- packages/db/migrate-images-to-storage.ts | 133 +++++++++++++++++++++++ social-media-agent/src/agent.ts | 2 +- 5 files changed, 146 insertions(+), 7 deletions(-) create mode 100644 packages/db/migrate-images-to-storage.ts diff --git a/.env.example b/.env.example index e4256d7..b525e27 100644 --- a/.env.example +++ b/.env.example @@ -29,6 +29,12 @@ OPENAI_API_KEY='your-openai-api-key-here' # Google Gemini API key for AI text generation GOOGLE_GENERATIVE_AI_API_KEY='your-gemini-api-key-here' +# Supabase Storage (for image uploads) +# Project URL: https://supabase.com/dashboard/project/_/settings/api +SUPABASE_URL=https://your-project-ref.supabase.co +SUPABASE_SERVICE_ROLE_KEY=your_service_role_key_here +SUPABASE_STORAGE_BUCKET=images + # Expo app API URL (for local development, set to localhost:3000) EXPO_PUBLIC_API_URL=http://localhost:3000 diff --git a/apps/expo/src/app/(tabs)/feed.tsx b/apps/expo/src/app/(tabs)/feed.tsx index eca6593..24a3f7a 100644 --- a/apps/expo/src/app/(tabs)/feed.tsx +++ b/apps/expo/src/app/(tabs)/feed.tsx @@ -120,11 +120,11 @@ export default function FeedScreen() { {item.title} - {/* Hybrid Image Display - prioritize AI-generated imageUri */} - {item.imageUri ? ( + {/* Image display - prioritize AI-generated imageUrl */} + {item.imageUrl ? ( diff --git a/apps/expo/src/app/(tabs)/index.tsx b/apps/expo/src/app/(tabs)/index.tsx index bd9ea89..ef1ba53 100644 --- a/apps/expo/src/app/(tabs)/index.tsx +++ b/apps/expo/src/app/(tabs)/index.tsx @@ -39,7 +39,7 @@ interface ContentCard { type: "bill" | "government_content" | "court_case" | "general"; isAIGenerated: boolean; thumbnailUrl?: string; - imageUri?: string; + imageUrl?: string; } const _TYPE_LABELS: Record = { @@ -137,10 +137,10 @@ const ContentCardComponent = ({ {/* Thumbnail */} - {(item.imageUri ?? item.thumbnailUrl) ? ( + {(item.imageUrl ?? item.thumbnailUrl) ? ( diff --git a/packages/db/migrate-images-to-storage.ts b/packages/db/migrate-images-to-storage.ts new file mode 100644 index 0000000..10c3152 --- /dev/null +++ b/packages/db/migrate-images-to-storage.ts @@ -0,0 +1,133 @@ +/** + * Migration: move Video.imageData (bytea) blobs to Supabase Storage. + * + * For each Video row that has imageData but no imageUrl: + * 1. Upload the buffer to Supabase Storage + * 2. Write the public URL back to imageUrl + * 3. Null out imageData to free space + * + * After running this and verifying, drop the imageData column: + * ALTER TABLE video DROP COLUMN image_data; + * ALTER TABLE video DROP COLUMN image_mime_type; + * ALTER TABLE video DROP COLUMN image_width; + * ALTER TABLE video DROP COLUMN image_height; + * + * Usage: pnpm with-env tsx migrate-images-to-storage.ts [--dry-run] + */ + +import { dirname, join } from "path"; +import { fileURLToPath } from "url"; +import { config } from "dotenv"; +import pg from "pg"; +import { createClient } from "@supabase/supabase-js"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +config({ path: join(__dirname, "../../.env") }); + +const BUCKET = process.env.SUPABASE_STORAGE_BUCKET ?? 
"images"; +const dryRun = process.argv.includes("--dry-run"); + +function getSupabase() { + const url = process.env.SUPABASE_URL; + const key = process.env.SUPABASE_SERVICE_ROLE_KEY; + if (!url || !key) { + throw new Error("Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY"); + } + return createClient(url, key); +} + +async function migrate() { + const pool = new pg.Pool({ connectionString: process.env.POSTGRES_URL }); + const supabase = getSupabase(); + + console.log(dryRun ? "[DRY RUN] " : "", "Starting image migration...\n"); + + const client = await pool.connect(); + + try { + // Find all videos with binary data but no storage URL + const { rows } = await client.query<{ + id: string; + content_type: string; + content_id: string; + image_data: Buffer; + image_mime_type: string | null; + }>( + `SELECT id, content_type, content_id, image_data, image_mime_type + FROM video + WHERE image_data IS NOT NULL AND image_url IS NULL`, + ); + + console.log(`Found ${rows.length} videos to migrate\n`); + + let migrated = 0; + let errors = 0; + + for (const row of rows) { + const storagePath = `videos/${row.content_type}/${row.content_id}.jpg`; + const mimeType = row.image_mime_type ?? "image/jpeg"; + + try { + if (dryRun) { + console.log(` [DRY RUN] Would upload ${storagePath} (${row.image_data.length} bytes)`); + migrated++; + continue; + } + + // Upload to storage + const { error: uploadError } = await supabase.storage + .from(BUCKET) + .upload(storagePath, row.image_data, { + contentType: mimeType, + upsert: true, + }); + + if (uploadError) { + throw new Error(uploadError.message); + } + + // Get public URL + const { + data: { publicUrl }, + } = supabase.storage.from(BUCKET).getPublicUrl(storagePath); + + // Write URL back and clear blob + await client.query( + `UPDATE video + SET image_url = $1, image_data = NULL, image_mime_type = NULL, + image_width = NULL, image_height = NULL + WHERE id = $2`, + [publicUrl, row.id], + ); + + migrated++; + console.log(` Migrated: ${storagePath}`); + } catch (err) { + errors++; + console.error(` Failed: ${storagePath} — ${err instanceof Error ? err.message : err}`); + } + } + + console.log(`\nDone: ${migrated} migrated, ${errors} errors`); + + if (!dryRun && migrated > 0 && errors === 0) { + console.log( + "\nAll blobs migrated. 
You can now drop the old columns:\n" + + " ALTER TABLE video DROP COLUMN image_data;\n" + + " ALTER TABLE video DROP COLUMN image_mime_type;\n" + + " ALTER TABLE video DROP COLUMN image_width;\n" + + " ALTER TABLE video DROP COLUMN image_height;", + ); + } + } finally { + client.release(); + await pool.end(); + } +} + +migrate().catch((err) => { + console.error("Migration failed:", err); + process.exit(1); +}); diff --git a/social-media-agent/src/agent.ts b/social-media-agent/src/agent.ts index 61a2cbe..4f23c2f 100644 --- a/social-media-agent/src/agent.ts +++ b/social-media-agent/src/agent.ts @@ -36,7 +36,7 @@ export interface ContentItem { type: string; isAIGenerated?: boolean; thumbnailUrl?: string; - imageUri?: string; + imageUrl?: string; } export class SocialMediaAgent { From 1c982f67f40662094dd263665b94e990bb309b12 Mon Sep 17 00:00:00 2001 From: ThatXliner Date: Fri, 3 Apr 2026 22:26:44 -0700 Subject: [PATCH 04/11] address review comments --- .env.example | 2 + apps/scraper/src/utils/db/video-operations.ts | 38 +- .../plans/2026-03-30-scraper-refactor.md | 1692 +++++++++++++++++ .../2026-03-30-scraper-refactor-design.md | 118 ++ packages/api/src/router/video.ts | 6 +- packages/db/migrate-images-to-storage.ts | 115 +- .../db/migrations/add_imageurl_to_video.sql | 4 + packages/db/src/schema.ts | 2 +- 8 files changed, 1897 insertions(+), 80 deletions(-) create mode 100644 docs/superpowers/plans/2026-03-30-scraper-refactor.md create mode 100644 docs/superpowers/specs/2026-03-30-scraper-refactor-design.md create mode 100644 packages/db/migrations/add_imageurl_to_video.sql diff --git a/.env.example b/.env.example index b525e27..476ffb6 100644 --- a/.env.example +++ b/.env.example @@ -32,6 +32,8 @@ GOOGLE_GENERATIVE_AI_API_KEY='your-gemini-api-key-here' # Supabase Storage (for image uploads) # Project URL: https://supabase.com/dashboard/project/_/settings/api SUPABASE_URL=https://your-project-ref.supabase.co +# WARNING: `SUPABASE_SERVICE_ROLE_KEY` is highly privileged and must only be used server-side. +# Never use it in Expo/browser code and never expose it via `EXPO_PUBLIC_*` or any other client-exposed env var mechanism. 
SUPABASE_SERVICE_ROLE_KEY=your_service_role_key_here SUPABASE_STORAGE_BUCKET=images diff --git a/apps/scraper/src/utils/db/video-operations.ts b/apps/scraper/src/utils/db/video-operations.ts index b908675..8506a9d 100644 --- a/apps/scraper/src/utils/db/video-operations.ts +++ b/apps/scraper/src/utils/db/video-operations.ts @@ -5,7 +5,7 @@ import { db } from '@acme/db/client'; import { Video } from '@acme/db/schema'; -import { uploadImage } from '@acme/db/storage'; +import { uploadImage, deleteImage } from '@acme/db/storage'; import { and, eq } from '@acme/db'; import { generateMarketingCopy } from '../ai/marketing-generation.js'; import { generateImage, convertToJpeg } from '../ai/image-generation.js'; @@ -69,14 +69,11 @@ export async function generateVideoForContent( // Generate marketing copy const marketingCopy = await generateMarketingCopy(title, fullText, contentType); - // Generate, convert, and upload image - let imageUrl: string | null = null; + // Generate and convert image (upload happens after DB write to avoid orphans) + let jpegData: Buffer | null = null; const generatedImage = await generateImage(marketingCopy.imagePrompt); if (generatedImage) { - const jpegData = await convertToJpeg(generatedImage.data); - const storagePath = `videos/${contentType}/${contentId}.jpg`; - imageUrl = await uploadImage(storagePath, jpegData); - logger.debug(`Uploaded image to ${storagePath}`); + jpegData = await convertToJpeg(generatedImage.data); } // Random engagement metrics (same as current video.ts) @@ -86,7 +83,7 @@ export async function generateVideoForContent( shares: Math.floor(Math.random() * 1000) + 10, }; - // Upsert video + // Upsert video first (without image URL) try { await db .insert(Video) @@ -95,7 +92,6 @@ export async function generateVideoForContent( contentId, title: marketingCopy.title, description: marketingCopy.description, - imageUrl, thumbnailUrl: thumbnailUrl ?? undefined, author, engagementMetrics, @@ -106,15 +102,11 @@ export async function generateVideoForContent( set: { title: marketingCopy.title, description: marketingCopy.description, - imageUrl, thumbnailUrl: thumbnailUrl ?? 
undefined, sourceContentHash: contentHash, updatedAt: new Date(), }, }); - - incrementVideosGenerated(); - logger.success(`Video generated for ${contentType}:${contentId}`); } catch (error) { // Sanitize error to avoid logging raw image data const sanitizedError = error instanceof Error @@ -123,4 +115,24 @@ export async function generateVideoForContent( logger.error(`Failed to insert video for ${contentType}:${contentId}: ${sanitizedError}`); throw error; } + + // Upload image after successful DB write, then update the row + if (jpegData) { + const storagePath = `videos/${contentType}/${contentId}.jpg`; + try { + const imageUrl = await uploadImage(storagePath, jpegData); + await db + .update(Video) + .set({ imageUrl }) + .where(and(eq(Video.contentType, contentType), eq(Video.contentId, contentId))); + logger.debug(`Uploaded image to ${storagePath}`); + } catch (error) { + // Best-effort cleanup of orphaned upload + try { await deleteImage(storagePath); } catch { /* ignore */ } + logger.warn(`Image upload/update failed for ${contentType}:${contentId}, video saved without image`); + } + } + + incrementVideosGenerated(); + logger.success(`Video generated for ${contentType}:${contentId}`); } diff --git a/docs/superpowers/plans/2026-03-30-scraper-refactor.md b/docs/superpowers/plans/2026-03-30-scraper-refactor.md new file mode 100644 index 0000000..da8c942 --- /dev/null +++ b/docs/superpowers/plans/2026-03-30-scraper-refactor.md @@ -0,0 +1,1692 @@ +# Scraper Architecture Refactor — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace Crawlee with hand-rolled fetch+cheerio, unify the three upsert functions into one, add a shared `fetchWithRetry` utility and `log` helper, and simplify the runner in `main.ts`. + +**Architecture:** Each scraper becomes a plain `{ name, scrape }` object using `fetchWithRetry()` + cheerio/turndown directly. A unified `upsertContent(type, data)` replaces the three per-table upsert functions. `main.ts` becomes a loop over selected scrapers. 
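The runner shape this implies, as a rough sketch (scraper selection and CLI handling are left to the `main.ts` task and are assumptions here; `scotus` is omitted only for brevity):

```ts
// Sketch of src/main.ts after the refactor — run each scraper in sequence,
// isolating failures so one broken source does not abort the whole run.
import type { Scraper } from "./utils/types.js";
import { congress } from "./scrapers/congress.js";
import { govtrack } from "./scrapers/govtrack.js";
import { whitehouse } from "./scrapers/whitehouse.js";
import { log, logError } from "./utils/log.js";

const scrapers: Scraper[] = [congress, govtrack, whitehouse];

for (const scraper of scrapers) {
  log("runner", `Starting ${scraper.name}`);
  try {
    await scraper.scrape();
  } catch (err) {
    logError(scraper.name, "Scraper run failed", err);
  }
}
```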
+ +**Tech Stack:** Node.js, TypeScript, cheerio, turndown, Drizzle ORM, Vercel AI SDK, OpenAI SDK + +--- + +## File Map + +| Action | File | Responsibility | +|--------|------|---------------| +| Create | `src/utils/fetch.ts` | `fetchWithRetry()` — shared retry + timeout wrapper | +| Create | `src/utils/log.ts` | `log(scraper, msg)` — prefixed logging | +| Modify | `src/utils/db/operations.ts` | Merge 3 upsert fns → `upsertContent()` | +| Modify | `src/scrapers/govtrack.ts` | Replace CheerioCrawler with fetch+cheerio | +| Modify | `src/scrapers/whitehouse.ts` | Replace CheerioCrawler with fetch+cheerio+turndown | +| Modify | `src/scrapers/congress.ts` | Use shared `fetchWithRetry`, use `upsertContent`, use `log` | +| Modify | `src/scrapers/scotus.ts` | Use shared `fetchWithRetry`, use `upsertContent`, use `log` | +| Modify | `src/main.ts` | Scraper runner loop | +| Modify | `src/utils/types.ts` | Add `Scraper` type, add `ContentType` union | +| Modify | `package.json` | Remove crawlee, playwright, @apify/tsconfig | +| Modify | `tsconfig.json` | Extend monorepo base only (remove apify dep) | +| Modify | `Dockerfile.scraper` (repo root) | Remove playwright install, simplify | + +--- + +### Task 1: Create `fetchWithRetry` utility + +**Files:** +- Create: `apps/scraper/src/utils/fetch.ts` + +- [ ] **Step 1: Create `fetchWithRetry`** + +```ts +// apps/scraper/src/utils/fetch.ts + +export interface FetchWithRetryOptions extends RequestInit { + maxRetries?: number; + timeoutMs?: number; +} + +export async function fetchWithRetry( + url: string, + options: FetchWithRetryOptions = {}, +): Promise { + const { maxRetries = 3, timeoutMs = 30_000, ...fetchOptions } = options; + + for (let attempt = 0; attempt <= maxRetries; attempt++) { + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), timeoutMs); + + try { + const res = await fetch(url, { + ...fetchOptions, + signal: controller.signal, + }); + + if (res.ok) return res; + + const isRetriable = res.status === 429 || res.status >= 500; + if (isRetriable && attempt < maxRetries) { + let delayMs = 1000 * Math.pow(2, attempt); + + // Honor Retry-After header + const retryAfter = res.headers.get("Retry-After"); + if (retryAfter) { + const seconds = Number(retryAfter); + if (!Number.isNaN(seconds)) { + delayMs = Math.max(delayMs, seconds * 1000); + } else { + const retryDate = Date.parse(retryAfter); + if (!Number.isNaN(retryDate)) { + const diff = retryDate - Date.now(); + if (diff > 0) delayMs = Math.max(delayMs, diff); + } + } + } + + await new Promise((r) => setTimeout(r, delayMs)); + continue; + } + + throw new Error(`HTTP ${res.status}: ${url}`); + } catch (err: any) { + if (err?.name === "AbortError") { + if (attempt < maxRetries) { + await new Promise((r) => setTimeout(r, 1000 * Math.pow(2, attempt))); + continue; + } + throw new Error(`Request timed out after ${timeoutMs}ms: ${url}`); + } + // Retry network errors + if (attempt < maxRetries && (err?.code === "ECONNRESET" || err?.code === "ECONNREFUSED")) { + await new Promise((r) => setTimeout(r, 1000 * Math.pow(2, attempt))); + continue; + } + throw err; + } finally { + clearTimeout(timeoutId); + } + } + + throw new Error(`Failed after ${maxRetries + 1} attempts: ${url}`); +} +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cd apps/scraper && npx tsc --noEmit` +Expected: No errors from `fetch.ts` + +- [ ] **Step 3: Commit** + +```bash +git add apps/scraper/src/utils/fetch.ts +git commit -m "feat(scraper): add fetchWithRetry utility" +``` + +--- 
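A quick usage sketch for the helper above — this is the calling pattern the rewritten scrapers use in later tasks (the URL and selector here are just for illustration):

```ts
// Fetch a listing page with a longer per-attempt timeout, then hand the HTML to cheerio.
import * as cheerio from "cheerio";
import { fetchWithRetry } from "./utils/fetch.js";

const res = await fetchWithRetry("https://www.govtrack.us/congress/bills/", {
  timeoutMs: 60_000, // per-attempt timeout; failed attempts back off exponentially
  maxRetries: 3,
});
const $ = cheerio.load(await res.text());
console.log($("title").text().trim());
```

Non-OK responses that are not retriable, or that exhaust the retries, surface as thrown errors, so callers only ever see a successful `Response`.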
+ +### Task 2: Create `log` utility + +**Files:** +- Create: `apps/scraper/src/utils/log.ts` + +- [ ] **Step 1: Create `log.ts`** + +```ts +// apps/scraper/src/utils/log.ts + +function timestamp(): string { + return new Date().toISOString().slice(11, 19); // HH:MM:SS +} + +export function log(scraper: string, message: string): void { + console.log(`[${timestamp()}] [${scraper}] ${message}`); +} + +export function logError(scraper: string, message: string, error?: unknown): void { + console.error(`[${timestamp()}] [${scraper}] ERROR: ${message}`, error ?? ""); +} + +export function logWarn(scraper: string, message: string): void { + console.warn(`[${timestamp()}] [${scraper}] WARN: ${message}`); +} +``` + +- [ ] **Step 2: Commit** + +```bash +git add apps/scraper/src/utils/log.ts +git commit -m "feat(scraper): add log utility with scraper prefix" +``` + +--- + +### Task 3: Add `Scraper` type and `ContentType` union + +**Files:** +- Modify: `apps/scraper/src/utils/types.ts` + +- [ ] **Step 1: Add types to `types.ts`** + +Add to the end of the file: + +```ts +// Content type union for unified upsert +export type ContentType = "bill" | "government_content" | "court_case"; + +// Scraper interface for the runner +export interface Scraper { + name: string; + scrape: () => Promise; +} +``` + +- [ ] **Step 2: Commit** + +```bash +git add apps/scraper/src/utils/types.ts +git commit -m "feat(scraper): add Scraper and ContentType types" +``` + +--- + +### Task 4: Unify upsert functions into `upsertContent` + +**Files:** +- Modify: `apps/scraper/src/utils/db/operations.ts` + +This is the biggest single change. The three functions (`upsertBill`, `upsertGovernmentContent`, `upsertCourtCase`) share ~90% of their logic. We merge them into one `upsertContent(type, data)` that switches on type for the DB-specific parts (which table, which conflict target, which fields to hash, which check function). 
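Call sites change shape like this (a sketch — `saveBill` is a hypothetical wrapper; the payload types come from `types.ts` and are unchanged):

```ts
import { upsertContent } from "../utils/db/operations.js";
import type { BillData } from "../utils/types.js";

// Before: await upsertBill(bill); — one exported function per table.
// After: a single entry point, discriminated by the `type` field.
async function saveBill(bill: BillData) {
  await upsertContent({ type: "bill", data: bill });
}
```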
+ +- [ ] **Step 1: Refactor `operations.ts`** + +Replace the entire file with: + +```ts +import { db } from "@acme/db/client"; +import { Bill, GovernmentContent, CourtCase } from "@acme/db/schema"; +import type { + BillData, + GovernmentContentData, + CourtCaseData, + ContentType, +} from "../types.js"; +import { createContentHash } from "../hash.js"; +import { generateAISummary, generateAIArticle } from "../ai/text-generation.js"; +import { generateImageSearchKeywords } from "../ai/image-keywords.js"; +import { getThumbnailImage } from "../api/google-images.js"; +import { + checkExistingBill, + checkExistingGovernmentContent, + checkExistingCourtCase, +} from "./helpers.js"; +import { + incrementTotalProcessed, + incrementNewEntries, + incrementExistingUnchanged, + incrementExistingChanged, + incrementAIArticlesGenerated, + incrementImagesSearched, +} from "./metrics.js"; +import { generateVideoForContent } from "./video-operations.js"; + +function isUsableText(text: string | undefined | null): text is string { + if (!text || text.length < 200) return false; + if (/[A-Z]:\\/.test(text)) return false; + + const lines = text.split("\n"); + const boilerplateLines = lines.filter((line) => { + const trimmed = line.trim(); + return ( + trimmed === "" || + trimmed.split(/\s+/).length === 1 || + (/[a-zA-Z]/.test(trimmed) && + trimmed === trimmed.toUpperCase() && + trimmed.length > 2) + ); + }); + if (boilerplateLines.length / lines.length >= 0.3) return false; + + return true; +} + +type ContentData = + | { type: "bill"; data: BillData } + | { type: "government_content"; data: GovernmentContentData } + | { type: "court_case"; data: CourtCaseData }; + +// Identify a content item for logging +function contentLabel(input: ContentData): string { + switch (input.type) { + case "bill": + return `bill ${input.data.billNumber}`; + case "government_content": + return `${input.data.type} "${input.data.title}"`; + case "court_case": + return `court case ${input.data.caseNumber}`; + } +} + +// Build hash input — only fields that matter for change detection +function hashFields(input: ContentData): string { + switch (input.type) { + case "bill": + return JSON.stringify({ + title: input.data.title, + description: input.data.description, + status: input.data.status, + summary: input.data.summary, + fullText: input.data.fullText, + }); + case "government_content": + return JSON.stringify({ + title: input.data.title, + description: input.data.description, + fullText: input.data.fullText, + }); + case "court_case": + return JSON.stringify({ + title: input.data.title, + description: input.data.description, + status: input.data.status, + fullText: input.data.fullText, + }); + } +} + +// Check existing record per type +async function checkExisting(input: ContentData) { + switch (input.type) { + case "bill": + return checkExistingBill( + input.data.billNumber, + input.data.sourceWebsite, + ); + case "government_content": + return checkExistingGovernmentContent(input.data.url); + case "court_case": + return checkExistingCourtCase(input.data.caseNumber); + } +} + +export async function upsertContent(input: ContentData) { + const newContentHash = createContentHash(hashFields(input)); + const existing = await checkExisting(input); + const label = contentLabel(input); + + incrementTotalProcessed(); + + // All content types have these fields + const fullText = input.data.fullText; + const title = input.data.title; + const url = input.data.url; + + // Determine what to generate + const hasUsableText = isUsableText(fullText); 
+ let shouldGenerateArticle = false; + let shouldGenerateImage = false; + + if (!existing) { + shouldGenerateArticle = hasUsableText; + shouldGenerateImage = hasUsableText; + incrementNewEntries(); + console.log(`New ${label} detected`); + } else if (existing.contentHash !== newContentHash) { + shouldGenerateArticle = hasUsableText; + shouldGenerateImage = !existing.hasThumbnail && hasUsableText; + incrementExistingChanged(); + console.log(`Content changed for ${label}`); + } else { + shouldGenerateArticle = false; + shouldGenerateImage = !existing.hasThumbnail && hasUsableText; + incrementExistingUnchanged(); + console.log(`No changes for ${label}, skipping AI generation`); + } + + // Generate AI summary if needed + let description: string | undefined; + const existingDescription = input.data.description; + + if (existingDescription) { + description = existingDescription; + } else if (shouldGenerateArticle && fullText) { + const summarySource = + input.type === "bill" + ? input.data.summary || input.data.fullText || "" + : fullText; + console.log(`Generating AI summary for ${label}`); + description = await generateAISummary(title, summarySource); + } + + // Generate AI article + let aiGeneratedArticle: string | undefined; + const articleType = + input.type === "bill" + ? "bill" + : input.type === "government_content" + ? input.data.type + : "court case"; + + if (shouldGenerateArticle && hasUsableText) { + console.log(`Generating AI article for ${label}`); + aiGeneratedArticle = await generateAIArticle(title, fullText!, articleType, url); + incrementAIArticlesGenerated(); + } else if (existing?.hasArticle) { + console.log(`Using existing AI article for ${label}`); + } + + // Search for thumbnail + let thumbnailUrl: string | null | undefined; + if (shouldGenerateImage) { + try { + console.log(`Searching for thumbnail for ${label}`); + const searchQuery = await generateImageSearchKeywords( + title, + fullText || "", + articleType, + ); + console.log(`Image search query: ${searchQuery}`); + thumbnailUrl = await getThumbnailImage(searchQuery); + incrementImagesSearched(); + } catch (error) { + console.warn(`Failed to fetch thumbnail for ${label}:`, error); + thumbnailUrl = null; + } + } else if (existing?.hasThumbnail) { + console.log(`Using existing thumbnail for ${label}`); + } + + // Type-specific DB upsert + let result: any; + + if (input.type === "bill") { + const d = input.data; + const [row] = await db + .insert(Bill) + .values({ + ...d, + description: description ?? d.description, + aiGeneratedArticle: aiGeneratedArticle || undefined, + thumbnailUrl: + thumbnailUrl === undefined + ? undefined + : thumbnailUrl || undefined, + contentHash: newContentHash, + versions: [], + }) + .onConflictDoUpdate({ + target: [Bill.billNumber, Bill.sourceWebsite], + set: { + title: d.title, + description: description ?? 
d.description, + sponsor: d.sponsor, + status: d.status, + introducedDate: d.introducedDate, + congress: d.congress, + chamber: d.chamber, + summary: d.summary, + fullText: d.fullText, + ...(aiGeneratedArticle !== undefined && { aiGeneratedArticle }), + ...(thumbnailUrl !== undefined && { + thumbnailUrl: thumbnailUrl || undefined, + }), + url: d.url, + contentHash: newContentHash, + updatedAt: new Date(), + }, + }) + .returning(); + result = row; + } else if (input.type === "government_content") { + const d = input.data; + const [row] = await db + .insert(GovernmentContent) + .values({ + ...d, + aiGeneratedArticle: aiGeneratedArticle || undefined, + thumbnailUrl: + thumbnailUrl === undefined + ? undefined + : thumbnailUrl || undefined, + contentHash: newContentHash, + versions: [], + }) + .onConflictDoUpdate({ + target: GovernmentContent.url, + set: { + title: d.title, + type: d.type, + publishedDate: d.publishedDate, + description: d.description, + fullText: d.fullText, + ...(aiGeneratedArticle !== undefined && { aiGeneratedArticle }), + ...(thumbnailUrl !== undefined && { + thumbnailUrl: thumbnailUrl || undefined, + }), + source: d.source, + contentHash: newContentHash, + updatedAt: new Date(), + }, + }) + .returning(); + result = row; + } else { + const d = input.data; + const [row] = await db + .insert(CourtCase) + .values({ + ...d, + description: description ?? d.description, + aiGeneratedArticle: aiGeneratedArticle || undefined, + thumbnailUrl: + thumbnailUrl === undefined + ? undefined + : thumbnailUrl || undefined, + contentHash: newContentHash, + versions: [], + }) + .onConflictDoUpdate({ + target: CourtCase.caseNumber, + set: { + title: d.title, + court: d.court, + filedDate: d.filedDate, + description: description ?? d.description, + status: d.status, + fullText: d.fullText, + ...(aiGeneratedArticle !== undefined && { aiGeneratedArticle }), + ...(thumbnailUrl !== undefined && { + thumbnailUrl: thumbnailUrl || undefined, + }), + url: d.url, + contentHash: newContentHash, + updatedAt: new Date(), + }, + }) + .returning(); + result = row; + } + + console.log(`${label} upserted`); + + // Generate video + if (result && fullText) { + const videoSource = + input.type === "bill" + ? input.data.sourceWebsite + : input.type === "government_content" + ? (input.data.source ?? "whitehouse.gov") + : input.type === "court_case" + ? 
input.data.court + : ""; + await generateVideoForContent( + input.type, + result.id, + title, + fullText, + newContentHash, + videoSource, + result.thumbnailUrl, + ); + } + + return result; +} + +// Legacy wrapper for whitehouse scraper's upsertPresidentialAction calls +export async function upsertPresidentialAction(actionData: { + title: string; + type: string; + issuedDate?: Date; + publishedDate?: Date; + description?: string; + fullText?: string; + url: string; + source?: string; +}) { + return upsertContent({ + type: "government_content", + data: { + ...actionData, + publishedDate: + actionData.publishedDate || actionData.issuedDate || new Date(), + source: actionData.source || "whitehouse.gov", + }, + }); +} +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cd apps/scraper && npx tsc --noEmit` +Expected: Errors only from scrapers still importing old function names (fixed in later tasks) + +- [ ] **Step 3: Commit** + +```bash +git add apps/scraper/src/utils/db/operations.ts +git commit -m "refactor(scraper): unify upsertBill/GovernmentContent/CourtCase into upsertContent" +``` + +--- + +### Task 5: Rewrite `govtrack.ts` — drop Crawlee + +**Files:** +- Modify: `apps/scraper/src/scrapers/govtrack.ts` + +- [ ] **Step 1: Rewrite `govtrack.ts`** + +Replace the entire file: + +```ts +import * as cheerio from "cheerio"; + +import { fetchWithRetry } from "../utils/fetch.js"; +import { log, logError } from "../utils/log.js"; +import { upsertContent } from "../utils/db/operations.js"; +import { printMetricsSummary, resetMetrics } from "../utils/db/metrics.js"; +import type { Scraper } from "../utils/types.js"; + +const NAME = "GovTrack"; + +interface GovTrackConfig { + maxBills?: number; + congress?: number; +} + +async function scrape(config: GovTrackConfig = {}) { + const { maxBills = 100, congress = 119 } = config; + log(NAME, "Starting..."); + resetMetrics(); + + // Step 1: Fetch listing page and collect bill links + const listingUrl = "https://www.govtrack.us/congress/bills/#docket"; + const listingRes = await fetchWithRetry(listingUrl); + const listingHtml = await listingRes.text(); + const $listing = cheerio.load(listingHtml); + + const collectedLinks: string[] = []; + $listing('.card > .card-body .card-title > a[href*="/congress/bills/"]').each( + (_, element) => { + const href = $listing(element).attr("href"); + if (href && /\/congress\/bills\/\d+\/[a-z]+\d+/.test(href)) { + const fullUrl = href.startsWith("http") + ? 
href + : `https://www.govtrack.us${href}`; + if (collectedLinks.length < maxBills) { + collectedLinks.push(fullUrl); + } + } + }, + ); + + log(NAME, `Found ${collectedLinks.length} bill links`); + + // Step 2: Scrape each bill's /text page + const textUrls = collectedLinks.slice(0, maxBills).map((url) => `${url}/text`); + log(NAME, `Scraping ${textUrls.length} text pages...`); + + for (const textUrl of textUrls) { + try { + const res = await fetchWithRetry(textUrl, { timeoutMs: 60_000 }); + const html = await res.text(); + const $ = cheerio.load(html); + + // Remove noise + $("#main_text_content script, #main_text_content style, #main_text_content nav").remove(); + let fullText = $("#main_text_content").text().trim(); + + // Reject garbage text + if ( + /[A-Z]:\\/.test(fullText) || + fullText.startsWith("Examples:") || + fullText.startsWith("IB ") + ) { + log(NAME, `Rejecting garbage text for ${textUrl}`); + fullText = ""; + } + + // Truncate to 1,000 words + if (fullText) { + const words = fullText.split(/\s+/); + if (words.length > 1000) { + fullText = words.slice(0, 1000).join(" "); + } + } + + // Extract bill info + const h1Text = $("#maincontent h1").first().text().trim(); + const h1Parts = h1Text.split(":"); + const billNumber = h1Parts[0]?.trim() || ""; + const title = + h1Parts.length > 1 ? h1Parts.slice(1).join(":").trim() : h1Text; + + const status = $(".bill-status").first().text().trim() || "Unknown"; + + let introducedDate: Date | undefined; + $("p, div").each((_, element) => { + const text = $(element).text(); + if (text.includes("Introduced:")) { + const dateStr = text.replace("Introduced:", "").trim(); + introducedDate = new Date(dateStr); + return false; + } + }); + + const congressMatch = textUrl.match(/\/congress\/bills\/(\d+)\//); + const congressNum = congressMatch + ? parseInt(congressMatch[1]!) + : undefined; + + const chamber = billNumber.toLowerCase().startsWith("h.") + ? 
"House" + : "Senate"; + + const summary = $(".summary").first().text().trim() || undefined; + const billUrl = textUrl.replace(/\/text$/, ""); + + if (fullText !== "") { + await upsertContent({ + type: "bill", + data: { + billNumber, + title, + description: summary, + sponsor: undefined, + status, + introducedDate, + congress: congressNum, + chamber, + summary, + fullText, + url: billUrl, + sourceWebsite: "govtrack" as const, + }, + }); + } + + log(NAME, `Scraped: ${billNumber} — ${title}`); + } catch (error) { + logError(NAME, `Error scraping ${textUrl}`, error); + } + } + + log(NAME, "Completed"); + printMetricsSummary(NAME); +} + +export const govtrack: Scraper = { + name: NAME, + scrape: () => scrape(), +}; +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cd apps/scraper && npx tsc --noEmit` + +- [ ] **Step 3: Commit** + +```bash +git add apps/scraper/src/scrapers/govtrack.ts +git commit -m "refactor(scraper): rewrite govtrack to use fetch+cheerio, drop Crawlee" +``` + +--- + +### Task 6: Rewrite `whitehouse.ts` — drop Crawlee + +**Files:** +- Modify: `apps/scraper/src/scrapers/whitehouse.ts` + +- [ ] **Step 1: Rewrite `whitehouse.ts`** + +Replace the entire file: + +```ts +import * as cheerio from "cheerio"; +import TurndownService from "turndown"; + +import { fetchWithRetry } from "../utils/fetch.js"; +import { log, logError } from "../utils/log.js"; +import { upsertContent } from "../utils/db/operations.js"; +import { generateAISummary } from "../utils/ai/text-generation.js"; +import { resetMetrics, printMetricsSummary } from "../utils/db/metrics.js"; +import type { Scraper } from "../utils/types.js"; + +const NAME = "White House"; + +function toTitleCase(text: string): string { + const uppercaseCount = (text.match(/[A-Z]/g) || []).length; + const letterCount = (text.match(/[a-zA-Z]/g) || []).length; + + if (letterCount === 0 || uppercaseCount / letterCount < 0.5) { + return text; + } + + return text + .toLowerCase() + .split(" ") + .map((word) => { + if (word.length === 0) return word; + return word.charAt(0).toUpperCase() + word.slice(1); + }) + .join(" ") + .replace(/^./, (char) => char.toUpperCase()); +} + +async function scrape() { + log(NAME, "Starting..."); + resetMetrics(); + + const maxArticles = 20; + const turndownService = new TurndownService({ + headingStyle: "atx", + codeBlockStyle: "fenced", + }); + + // Step 1: Collect article links from listing pages (with pagination) + const collectedLinks: string[] = []; + let nextPageUrl: string | null = "https://www.whitehouse.gov/news/"; + + while (nextPageUrl && collectedLinks.length < maxArticles) { + const res = await fetchWithRetry(nextPageUrl, { timeoutMs: 60_000 }); + const html = await res.text(); + const $ = cheerio.load(html); + + $(".wp-block-post-title > a").each((_, element) => { + const href = $(element).attr("href"); + if (href && collectedLinks.length < maxArticles) { + collectedLinks.push(href); + } + }); + + log(NAME, `Found ${collectedLinks.length} article links so far`); + + if (collectedLinks.length < maxArticles) { + nextPageUrl = $(".wp-block-query-pagination-next").attr("href") || null; + } else { + nextPageUrl = null; + } + } + + log(NAME, `Collected ${collectedLinks.length} articles, now scraping...`); + + // Step 2: Scrape each article + for (const articleUrl of collectedLinks.slice(0, maxArticles)) { + try { + const res = await fetchWithRetry(articleUrl, { timeoutMs: 60_000 }); + const html = await res.text(); + const $ = cheerio.load(html); + + let headline = 
$(".wp-block-whitehouse-topper__headline") + .first() + .text() + .trim(); + if (!headline) { + headline = $("h1").first().text().trim() || "Untitled Article"; + } + headline = toTitleCase(headline); + + const dateStr = + $(".wp-block-post-date > time").first().attr("datetime") || + $(".wp-block-post-date > time").first().text().trim(); + const issuedDate = dateStr ? new Date(dateStr) : new Date(); + + // Extract content after the first div in .entry-content + const entryContent = $(".entry-content").first(); + let fullTextMarkdown = ""; + + if (entryContent.length > 0) { + const children = entryContent.children(); + let firstDivIndex = -1; + + children.each((index, element) => { + if ( + element.tagName.toLowerCase() === "div" && + firstDivIndex === -1 + ) { + firstDivIndex = index; + } + }); + + let contentHtml = ""; + if (firstDivIndex === -1) { + contentHtml = entryContent.html() || ""; + } else { + children.each((index, element) => { + if (index > firstDivIndex) { + contentHtml += $.html(element); + } + }); + } + + fullTextMarkdown = turndownService.turndown(contentHtml).trim(); + } + + // Determine content type from URL + let contentType = "News Article"; + if (articleUrl.includes("/fact-sheets/")) { + contentType = "Fact Sheet"; + } else if (articleUrl.includes("/briefings-statements/")) { + contentType = "Briefing Statement"; + } else if (articleUrl.includes("/presidential-actions/")) { + contentType = "Presidential Action"; + } + + log(NAME, `Generating AI summary for: ${headline}`); + const aiSummary = await generateAISummary(headline, fullTextMarkdown); + + await upsertContent({ + type: "government_content", + data: { + title: headline, + type: contentType, + publishedDate: issuedDate, + description: aiSummary, + fullText: fullTextMarkdown, + url: articleUrl, + source: "whitehouse.gov", + }, + }); + + log(NAME, `Scraped ${contentType}: ${headline}`); + } catch (error) { + logError(NAME, `Error scraping ${articleUrl}`, error); + } + } + + log(NAME, "Completed"); + printMetricsSummary(NAME); +} + +export const whitehouse: Scraper = { + name: NAME, + scrape, +}; +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cd apps/scraper && npx tsc --noEmit` + +- [ ] **Step 3: Commit** + +```bash +git add apps/scraper/src/scrapers/whitehouse.ts +git commit -m "refactor(scraper): rewrite whitehouse to use fetch+cheerio+turndown, drop Crawlee" +``` + +--- + +### Task 7: Update `congress.ts` — use shared utilities + +**Files:** +- Modify: `apps/scraper/src/scrapers/congress.ts` + +Replace the local `congressFetch` with `fetchWithRetry`, switch to `upsertContent`, use `log`/`logError`, and export as `Scraper` object. + +- [ ] **Step 1: Rewrite `congress.ts`** + +Key changes from current code: +1. Replace `congressFetch()` with a wrapper around `fetchWithRetry()` that adds the API key and JSON parsing +2. Replace `upsertBill(...)` calls with `upsertContent({ type: "bill", data: ... })` +3. Replace `console.log`/`console.error` with `log(NAME, ...)` / `logError(NAME, ...)` +4. 
Export as `Scraper` object instead of bare function + +```ts +import { fetchWithRetry } from "../utils/fetch.js"; +import { log, logError } from "../utils/log.js"; +import { printMetricsSummary, resetMetrics } from "../utils/db/metrics.js"; +import { upsertContent } from "../utils/db/operations.js"; +import type { Scraper } from "../utils/types.js"; + +const BASE_URL = "https://api.congress.gov/v3"; +const NAME = "Congress.gov"; + +// ─── Config ────────────────────────────────────────────────────────────────── + +interface CongressScraperConfig { + maxBills?: number; + congress?: number; + chamber?: "House" | "Senate"; +} + +// ─── API response shapes (partial — only what we use) ──────────────────────── + +interface ApiBillListItem { + number: string; + type: string; + title: string; + congress: number; + url: string; + latestAction?: { text: string; actionDate: string }; +} + +interface ApiBillDetail { + bill: { + number: string; + type: string; + title: string; + congress: number; + originChamber: string; + introducedDate?: string; + sponsors?: Array<{ + firstName: string; + lastName: string; + party: string; + state: string; + }>; + latestAction?: { text: string; actionDate: string }; + }; +} + +interface ApiSummary { + actionDate: string; + actionDesc: string; + text: string; + updateDate: string; +} + +interface ApiTextVersion { + type: string; + date: string | null; + formats: Array<{ type: string; url: string }>; +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +function getApiKey(): string { + const key = process.env.CONGRESS_API_KEY; + if (!key) { + throw new Error( + "CONGRESS_API_KEY is not set. Sign up at https://api.congress.gov/sign-up/", + ); + } + return key; +} + +async function congressFetch( + path: string, + params: Record = {}, +): Promise { + const apiKey = getApiKey(); + const url = new URL(`${BASE_URL}${path}`); + url.searchParams.set("api_key", apiKey); + url.searchParams.set("format", "json"); + for (const [k, v] of Object.entries(params)) { + url.searchParams.set(k, String(v)); + } + + const res = await fetchWithRetry(url.toString()); + return res.json() as Promise; +} + +function ordinalSuffix(n: number): string { + const mod100 = Math.abs(n) % 100; + const mod10 = Math.abs(n) % 10; + if (mod100 >= 11 && mod100 <= 13) return "th"; + if (mod10 === 1) return "st"; + if (mod10 === 2) return "nd"; + if (mod10 === 3) return "rd"; + return "th"; +} + +function billTypeToUrlSlug(type: string): string { + const slugMap: Record = { + HR: "house-bill", + S: "senate-bill", + HJRES: "house-joint-resolution", + SJRES: "senate-joint-resolution", + HCONRES: "house-concurrent-resolution", + SCONRES: "senate-concurrent-resolution", + HRES: "house-simple-resolution", + SRES: "senate-simple-resolution", + }; + return slugMap[type.toUpperCase()] ?? `${type.toLowerCase()}-bill`; +} + +function formatBillNumber(type: string, number: string): string { + const prefixMap: Record = { + HR: "H.R.", + S: "S.", + HJRES: "H.J.Res.", + SJRES: "S.J.Res.", + HCONRES: "H.Con.Res.", + SCONRES: "S.Con.Res.", + HRES: "H.Res.", + SRES: "S.Res.", + }; + const prefix = prefixMap[type.toUpperCase()] ?? 
type; + return `${prefix} ${number}`; +} + +function stripHtml(html: string): string { + return html + .replace(/<[^>]+>/g, " ") + .replace(/&/g, "&") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/ /g, " ") + .replace(/\s{2,}/g, " ") + .trim(); +} + +async function fetchSummary( + congress: number, + billType: string, + billNumber: string, +): Promise { + try { + const data = await congressFetch<{ summaries: ApiSummary[] }>( + `/bill/${congress}/${billType.toLowerCase()}/${billNumber}/summaries`, + ); + if (!data.summaries?.length) return undefined; + const latest = data.summaries[data.summaries.length - 1]!; + return stripHtml(latest.text).slice(0, 5000); + } catch { + return undefined; + } +} + +async function fetchFullText( + congress: number, + billType: string, + billNumber: string, +): Promise { + try { + const data = await congressFetch<{ textVersions: ApiTextVersion[] }>( + `/bill/${congress}/${billType.toLowerCase()}/${billNumber}/text`, + ); + if (!data.textVersions?.length) return undefined; + + for (const version of [...data.textVersions].reverse()) { + const txtFormat = version.formats.find( + (f) => f.type === "Formatted Text", + ); + if (!txtFormat) continue; + + const res = await fetchWithRetry(txtFormat.url); + const rawText = await res.text(); + if (!rawText) continue; + + let text = stripHtml(rawText); + const words = text.split(/\s+/); + if (words.length > 1000) { + text = words.slice(0, 1000).join(" "); + } + return text.trim() || undefined; + } + } catch { + // Full text is optional + } + return undefined; +} + +// ─── Main ──────────────────────────────────────────────────────────────────── + +async function scrape(config: CongressScraperConfig = {}) { + const { maxBills = 100, congress = 119, chamber = "House" } = config; + + log(NAME, `Starting (congress=${congress}, chamber=${chamber})...`); + resetMetrics(); + + const chamberParam = chamber === "House" ? "house" : "senate"; + + // Step 1: fetch bill listing + const allBills: ApiBillListItem[] = []; + let offset = 0; + const pageSize = 250; + + while (allBills.length < maxBills) { + const remaining = maxBills - allBills.length; + const limit = Math.min(remaining, pageSize); + + const pageData = await congressFetch<{ bills: ApiBillListItem[] }>( + `/bill/${congress}`, + { chamber: chamberParam, limit, offset, sort: "updateDate+desc" }, + ); + + const page = pageData.bills ?? []; + allBills.push(...page); + if (page.length < limit) break; + offset += page.length; + } + + const bills = allBills.slice(0, maxBills); + log(NAME, `Fetched ${bills.length} bills`); + + // Step 2: enrich each bill + for (const item of bills) { + try { + const billType = item.type.toLowerCase(); + const billNumber = item.number; + + const detailData = await congressFetch( + `/bill/${congress}/${billType}/${billNumber}`, + ); + const detail = detailData.bill; + + const formattedBillNumber = formatBillNumber(detail.type, detail.number); + const title = (detail.title ?? "Unknown").slice(0, 250); + + const primarySponsor = detail.sponsors?.[0]; + const sponsor = primarySponsor + ? `${primarySponsor.firstName} ${primarySponsor.lastName} (${primarySponsor.party}-${primarySponsor.state})`.slice( + 0, + 250, + ) + : undefined; + + const status = (detail.latestAction?.text ?? "Unknown").slice(0, 250); + const introducedDate = detail.introducedDate + ? new Date(detail.introducedDate) + : undefined; + const chamberValue = (detail.originChamber ?? 
chamber) as + | "House" + | "Senate"; + const billUrl = `https://www.congress.gov/bill/${congress}${ordinalSuffix(congress)}-congress/${billTypeToUrlSlug(detail.type)}/${billNumber}`; + + const summary = await fetchSummary(congress, billType, billNumber); + const fullText = await fetchFullText(congress, billType, billNumber); + + await upsertContent({ + type: "bill", + data: { + billNumber: formattedBillNumber, + title, + description: summary, + sponsor, + status, + introducedDate, + congress, + chamber: chamberValue, + summary, + fullText, + url: billUrl, + sourceWebsite: "congress.gov", + }, + }); + + log(NAME, `Processed: ${formattedBillNumber} — ${title}`); + } catch (error) { + logError( + NAME, + `Error processing bill ${item.type}${item.number}`, + error, + ); + } + } + + log(NAME, "Completed"); + printMetricsSummary(NAME); +} + +export const congress: Scraper = { + name: NAME, + scrape: () => scrape(), +}; +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cd apps/scraper && npx tsc --noEmit` + +- [ ] **Step 3: Commit** + +```bash +git add apps/scraper/src/scrapers/congress.ts +git commit -m "refactor(scraper): congress uses shared fetchWithRetry + upsertContent + log" +``` + +--- + +### Task 8: Update `scotus.ts` — use shared utilities + +**Files:** +- Modify: `apps/scraper/src/scrapers/scotus.ts` + +Same pattern as congress: replace local `clFetch` with wrapper around `fetchWithRetry`, switch to `upsertContent`, use `log`/`logError`, export as `Scraper`. + +- [ ] **Step 1: Rewrite `scotus.ts`** + +```ts +import { fetchWithRetry } from "../utils/fetch.js"; +import { log, logError } from "../utils/log.js"; +import { printMetricsSummary, resetMetrics } from "../utils/db/metrics.js"; +import { upsertContent } from "../utils/db/operations.js"; +import type { Scraper } from "../utils/types.js"; + +const CL_BASE = "https://www.courtlistener.com/api/rest/v4"; +const NAME = "SCOTUS"; + +// ─── Config ────────────────────────────────────────────────────────────────── + +interface ScotusScraperConfig { + maxCases?: number; + court?: string; +} + +// ─── API response shapes ───────────────────────────────────────────────────── + +interface ClCluster { + id: number; + absolute_url: string; + case_name: string; + docket_id: number; + date_filed: string | null; + precedential_status: string; + syllabus: string; + sub_opinions: string[]; +} + +interface ClOpinion { + id: number; + plain_text: string; + html: string; + type: string; +} + +interface ClDocket { + id: number; + docket_number: string; + court: string; + date_filed: string | null; + case_name: string; +} + +// ─── Constants ─────────────────────────────────────────────────────────────── + +const COURT_NAMES: Record = { + scotus: "Supreme Court of the United States", + ca1: "1st Circuit Court of Appeals", + ca2: "2nd Circuit Court of Appeals", + ca3: "3rd Circuit Court of Appeals", + ca4: "4th Circuit Court of Appeals", + ca5: "5th Circuit Court of Appeals", + ca6: "6th Circuit Court of Appeals", + ca7: "7th Circuit Court of Appeals", + ca8: "8th Circuit Court of Appeals", + ca9: "9th Circuit Court of Appeals", + ca10: "10th Circuit Court of Appeals", + ca11: "11th Circuit Court of Appeals", + cadc: "D.C. 
Circuit Court of Appeals", +}; + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +function clHeaders(): Record { + const headers: Record = { + Accept: "application/json", + "User-Agent": "billion-scraper/1.0 (contact via github)", + }; + if (process.env.COURTLISTENER_API_KEY) { + headers["Authorization"] = `Token ${process.env.COURTLISTENER_API_KEY}`; + } + return headers; +} + +async function clFetch( + path: string, + params: Record = {}, +): Promise { + const url = new URL(`${CL_BASE}${path}`); + for (const [k, v] of Object.entries(params)) { + url.searchParams.set(k, String(v)); + } + + const res = await fetchWithRetry(url.toString(), { + headers: clHeaders(), + }); + return res.json() as Promise; +} + +function stripHtml(html: string): string { + return html + .replace(/<[^>]+>/g, " ") + .replace(/&/g, "&") + .replace(/</g, "<") + .replace(/>/g, ">") + .replace(/ /g, " ") + .replace(/\s{2,}/g, " ") + .trim(); +} + +function truncateWords(text: string, maxWords: number): string { + const words = text.split(/\s+/); + return words.length > maxWords ? words.slice(0, maxWords).join(" ") : text; +} + +async function fetchOpinionText( + subOpinionUrls: string[], +): Promise { + const fetched: { opinion: ClOpinion; text: string }[] = []; + + for (const url of subOpinionUrls) { + try { + const res = await fetchWithRetry(url, { headers: clHeaders() }); + const opinion = (await res.json()) as ClOpinion; + const text = ( + opinion.plain_text?.trim() || stripHtml(opinion.html ?? "") + ).trim(); + if (text.length > 0) { + fetched.push({ opinion, text }); + } + } catch { + // Skip failed sub-opinions + } + } + + if (fetched.length === 0) return undefined; + + const preferredTypes = new Set(["010combined", "020lead"]); + fetched.sort((a, b) => { + const aPref = preferredTypes.has(a.opinion.type) ? 0 : 1; + const bPref = preferredTypes.has(b.opinion.type) ? 0 : 1; + return aPref - bPref; + }); + + for (const { text } of fetched) { + if (text.length > 200) { + return truncateWords(text, 1000); + } + } + return undefined; +} + +// ─── Main ──────────────────────────────────────────────────────────────────── + +async function scrape(config: ScotusScraperConfig = {}) { + const { maxCases = 50, court = "scotus" } = config; + + const displayName = court === "scotus" ? "SCOTUS" : court.toUpperCase(); + log(displayName, `Starting (court=${court}, maxCases=${maxCases})...`); + resetMetrics(); + + // Step 1: fetch opinion clusters + const allClusters: ClCluster[] = []; + let page = 1; + const pageSize = 100; + + while (allClusters.length < maxCases) { + const pageData = await clFetch<{ + results: ClCluster[]; + next: string | null; + }>("/clusters/", { + court, + order_by: "-date_filed", + page_size: pageSize, + page, + }); + + const results = pageData.results ?? []; + allClusters.push(...results); + if (!pageData.next || results.length < pageSize) break; + page++; + } + + const clusters = allClusters.slice(0, maxCases); + log(displayName, `Fetched ${clusters.length} opinion clusters`); + + // Step 2: process each cluster + for (const cluster of clusters) { + try { + const docket = await clFetch( + `/dockets/${cluster.docket_id}/`, + ); + const docketNumber = docket.docket_number || `CL-${cluster.id}`; + const filedDate = docket.date_filed + ? new Date(docket.date_filed) + : undefined; + const courtCode = docket.court ?? court; + const courtName = COURT_NAMES[courtCode] ?? 
courtCode.toUpperCase(); + + const title = cluster.case_name?.slice(0, 250) || "Unknown Case"; + const status = cluster.precedential_status || "Unknown"; + const caseUrl = `https://www.courtlistener.com${cluster.absolute_url}`; + + const fullText = await fetchOpinionText(cluster.sub_opinions ?? []); + + const description = cluster.syllabus + ? stripHtml(cluster.syllabus).slice(0, 1000) || undefined + : undefined; + + await upsertContent({ + type: "court_case", + data: { + caseNumber: docketNumber, + title, + court: courtName, + filedDate, + description, + status, + fullText, + url: caseUrl, + }, + }); + + log(displayName, `Processed: ${docketNumber} — ${title}`); + } catch (error) { + logError(displayName, `Error processing cluster ${cluster.id}`, error); + } + } + + log(displayName, "Completed"); + printMetricsSummary(displayName); +} + +export const scotus: Scraper = { + name: NAME, + scrape: () => scrape(), +}; +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cd apps/scraper && npx tsc --noEmit` + +- [ ] **Step 3: Commit** + +```bash +git add apps/scraper/src/scrapers/scotus.ts +git commit -m "refactor(scraper): scotus uses shared fetchWithRetry + upsertContent + log" +``` + +--- + +### Task 9: Rewrite `main.ts` — runner loop + +**Files:** +- Modify: `apps/scraper/src/main.ts` + +- [ ] **Step 1: Rewrite `main.ts`** + +```ts +import { dirname, join } from "path"; +import { fileURLToPath } from "url"; +import dotenv from "dotenv"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +dotenv.config({ path: join(__dirname, "../../../.env") }); +dotenv.config({ path: join(__dirname, "../.env") }); + +import { congress } from "./scrapers/congress.js"; +import { govtrack } from "./scrapers/govtrack.js"; +import { scotus } from "./scrapers/scotus.js"; +import { whitehouse } from "./scrapers/whitehouse.js"; +import type { Scraper } from "./utils/types.js"; + +const scrapers: Scraper[] = [govtrack, whitehouse, congress, scotus]; + +async function main() { + const arg = process.argv[2]?.toLowerCase(); + + if (arg && arg !== "all") { + const scraper = scrapers.find((s) => s.name.toLowerCase().replace(/[.\s]/g, "") === arg.replace(/[.\s]/g, "")); + if (!scraper) { + console.error( + `Unknown scraper: "${arg}". Available: ${scrapers.map((s) => s.name).join(", ")}, all`, + ); + process.exit(1); + } + await scraper.scrape(); + } else { + console.log("Running all scrapers...\n"); + for (const scraper of scrapers) { + await scraper.scrape(); + console.log("\n---\n"); + } + console.log("All scrapers completed."); + } +} + +main().catch((error) => { + console.error("Error running scrapers:", error); + process.exit(1); +}); +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cd apps/scraper && npx tsc --noEmit` + +- [ ] **Step 3: Commit** + +```bash +git add apps/scraper/src/main.ts +git commit -m "refactor(scraper): simplify main.ts to runner loop over Scraper objects" +``` + +--- + +### Task 10: Remove Crawlee + Playwright dependencies + +**Files:** +- Modify: `apps/scraper/package.json` +- Modify: `apps/scraper/tsconfig.json` + +- [ ] **Step 1: Add cheerio dependency, remove crawlee/playwright/@apify/tsconfig** + +Run: +```bash +cd apps/scraper && pnpm remove crawlee playwright @apify/tsconfig && pnpm add cheerio +``` + +- [ ] **Step 2: Update `package.json` description** + +In `apps/scraper/package.json`, change the `description` field from `"This is an example of a Crawlee project."` to `"Government data scraper for Billion app"`. 
+ +- [ ] **Step 3: Verify tsconfig.json** + +The tsconfig extends `../../tooling/typescript/base.json` which is fine — `@apify/tsconfig` was a devDependency, not extended in tsconfig. No tsconfig changes needed. + +- [ ] **Step 4: Verify it compiles and all imports resolve** + +Run: `cd apps/scraper && npx tsc --noEmit` +Expected: Clean compile, no errors + +- [ ] **Step 5: Commit** + +```bash +git add apps/scraper/package.json apps/scraper/tsconfig.json pnpm-lock.yaml +git commit -m "chore(scraper): remove crawlee, playwright, @apify/tsconfig; add cheerio" +``` + +--- + +### Task 11: Update Dockerfile + +**Files:** +- Modify: `Dockerfile.scraper` (repo root) + +The Dockerfile no longer needs Playwright. It also gets simpler since we don't need the Crawlee storage directory. + +- [ ] **Step 1: Update Dockerfile.scraper** + +Replace the entire file: + +```dockerfile +# Build context: repo root +FROM node:20-slim AS builder + +ENV PNPM_HOME="/root/.local/share/pnpm" +ENV PATH="$PNPM_HOME:$PATH" +RUN corepack enable && corepack prepare pnpm@latest --activate + +WORKDIR /app +COPY pnpm-lock.yaml pnpm-workspace.yaml package.json ./ +COPY apps/scraper/package.json ./apps/scraper/package.json +COPY packages/db/package.json ./packages/db/package.json +COPY tooling/typescript/package.json ./tooling/typescript/package.json +RUN pnpm install --frozen-lockfile + +COPY tooling/typescript ./tooling/typescript +COPY packages/db/src ./packages/db/src +COPY packages/db/tsconfig.json ./packages/db/tsconfig.json +WORKDIR /app/packages/db +RUN pnpm exec tsc --emitDeclarationOnly false --skipLibCheck true && \ + find dist -name "*.js" -exec sed -i "s|from '\./\([^']*\)'|from './\1.js'|g" {} + && \ + find dist -name "*.js" -exec sed -i "s|from \"\./\([^\"]*\)\"|from \"./\1.js\"|g" {} + + +COPY apps/scraper/src /app/apps/scraper/src +COPY apps/scraper/tsconfig.json /app/apps/scraper/tsconfig.json +WORKDIR /app/apps/scraper +RUN pnpm run build + +# Final image +FROM node:20-slim + +ENV PNPM_HOME="/root/.local/share/pnpm" +ENV PATH="$PNPM_HOME:$PATH" +RUN apt-get update && apt-get install -y --no-install-recommends procps && rm -rf /var/lib/apt/lists/* +RUN corepack enable && corepack prepare pnpm@latest --activate + +WORKDIR /app +COPY pnpm-lock.yaml pnpm-workspace.yaml package.json ./ +COPY apps/scraper/package.json ./apps/scraper/package.json +COPY packages/db/package.json ./packages/db/package.json +RUN echo "enable-pre-post-scripts=true" >> .npmrc && pnpm install --frozen-lockfile --prod + +COPY --from=builder /app/apps/scraper/dist ./apps/scraper/dist +COPY --from=builder /app/packages/db/dist ./packages/db/dist + +# Rewrite db exports to use compiled dist/ instead of src/ +RUN node -e " \ + const p = require('./packages/db/package.json'); \ + Object.values(p.exports).forEach(e => { e.default = e.default.replace('./src/', './dist/').replace('.ts', '.js'); }); \ + require('fs').writeFileSync('./packages/db/package.json', JSON.stringify(p, null, 2)); \ +" + +WORKDIR /app/apps/scraper +CMD ["pnpm", "run", "start:prod"] +``` + +Note: This is essentially the same Dockerfile — the only real change is that `crawlee` and `playwright` are no longer in `package.json` so they won't be installed. The `.dockerignore` `storage` entry for Crawlee storage is now irrelevant but harmless. 
+ +- [ ] **Step 2: Commit** + +```bash +git add Dockerfile.scraper +git commit -m "chore(scraper): update Dockerfile after removing Crawlee/Playwright" +``` + +--- + +### Task 12: Smoke test + +- [ ] **Step 1: Full compile check** + +Run: `cd apps/scraper && npx tsc --noEmit` +Expected: Clean compile, zero errors + +- [ ] **Step 2: Dry run with a single scraper** + +Run: `cd apps/scraper && pnpm run start:dev govtrack` +Expected: Scraper runs, fetches listing page, scrapes bill text pages, logs with `[HH:MM:SS] [GovTrack]` prefix, prints metrics summary. Verify no Crawlee references in output. + +- [ ] **Step 3: Verify no Crawlee imports remain** + +Run: `grep -r "crawlee" apps/scraper/src/` +Expected: No matches + +- [ ] **Step 4: Commit any final fixes if needed** diff --git a/docs/superpowers/specs/2026-03-30-scraper-refactor-design.md b/docs/superpowers/specs/2026-03-30-scraper-refactor-design.md new file mode 100644 index 0000000..6345e5a --- /dev/null +++ b/docs/superpowers/specs/2026-03-30-scraper-refactor-design.md @@ -0,0 +1,118 @@ +# Scraper Architecture Refactor + +## Goal + +Replace Crawlee with a hand-rolled approach to reduce complexity, dependencies, and learning surface while keeping reliability. The result is a simpler, more unified codebase where all scrapers follow the same patterns. + +## What Changes + +### Drop Crawlee + Playwright + +Crawlee is only used by 2 of 4 scrapers (govtrack, whitehouse) for a pattern that amounts to: fetch HTML, parse with Cheerio, follow links. Replace with `fetch` + `cheerio` directly. + +**Removed dependencies:** `crawlee`, `playwright`, `@apify/tsconfig` + +### New: `src/utils/fetch.ts` — `fetchWithRetry()` + +Single shared fetch utility (~30 lines). All four scrapers use this. + +- Configurable max retries (default 3) +- Exponential backoff +- Honors `Retry-After` header +- Retries on 429 and 5xx +- Configurable timeout via `AbortSignal.timeout` (default 30s) +- Returns standard `Response` + +### New: `src/utils/log.ts` — `log(scraperName, message)` + +Thin wrapper over `console.log` that prefixes scraper name + timestamp. Replace all scattered `console.log`/`console.error` calls with this. + +### Changed: `src/utils/db/operations.ts` — Unified `upsertContent()` + +Merge `upsertBill()`, `upsertGovernmentContent()`, `upsertCourtCase()` into a single `upsertContent(type, data)` that switches on content type internally. DB schema stays the same (three separate tables). The shared logic: + +1. Hash content +2. Check if exists + compare hash +3. Conditionally generate AI summary/article/thumbnail +4. Upsert to correct table +5. Generate video + +### Changed: `src/scrapers/govtrack.ts` and `src/scrapers/whitehouse.ts` + +Replace `CheerioCrawler` with direct `fetchWithRetry()` + `cheerio.load()`. Each scraper implements its own fetching pattern (listing page, pagination, detail pages) — no shared crawl abstraction, since the two are different enough that abstracting adds more complexity than it removes. + +### Changed: `src/main.ts` — Runner loop + +```ts +const scrapers: Scraper[] = [congress, govtrack, whitehouse, scotus] + +const selected = parseArgs(process.argv) +for (const scraper of selected) { + resetMetrics() + await scraper.scrape() + printMetricsSummary(scraper.name) +} +``` + +Each scraper conforms to: + +```ts +type Scraper = { + name: string + scrape: (config?) => Promise +} +``` + +Scrapers return `void` because they call `upsertContent()` as they go — no need to buffer all results in memory. 
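+
+For illustration, a scraper module ends up exporting an object of this shape (sketch only; it mirrors the congress scraper from the implementation plan, including its `../utils/types.js` import path):
+
+```ts
+import type { Scraper } from "../utils/types.js";
+
+// scrape() is the module-local entry point; it performs its own upserts and resolves to void.
+async function scrape(): Promise<void> {
+  // ... fetch, parse, upsertContent(...) per item ...
+}
+
+export const congress: Scraper = {
+  name: "Congress.gov",
+  scrape: () => scrape(),
+};
+```
+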
+ +## What Stays the Same + +- All AI generation (`src/utils/ai/`) — unchanged +- Google Images API (`src/utils/api/`) — unchanged +- Video operations (`src/utils/db/video-operations.ts`) — unchanged +- DB helpers (`src/utils/db/helpers.ts`) — unchanged +- Metrics (`src/utils/db/metrics.ts`) — unchanged +- Types and hash utilities — unchanged +- `retroactive-videos.ts` — unchanged +- DB schema (three separate tables) — unchanged + +## File Structure + +``` +src/ +├── main.ts # Runner: parse args, loop scrapers, print metrics +├── scrapers/ +│ ├── congress.ts # Congress.gov API +│ ├── govtrack.ts # GovTrack HTML (fetch + cheerio) +│ ├── whitehouse.ts # Whitehouse HTML (fetch + cheerio + turndown) +│ └── scotus.ts # CourtListener API +├── utils/ +│ ├── types.ts +│ ├── hash.ts +│ ├── fetch.ts # NEW +│ ├── log.ts # NEW +│ ├── db/ +│ │ ├── operations.ts # CHANGED: unified upsertContent() +│ │ ├── video-operations.ts +│ │ ├── helpers.ts +│ │ └── metrics.ts +│ ├── api/ +│ │ └── google-images.ts +│ └── ai/ +│ ├── text-generation.ts +│ ├── image-generation.ts +│ ├── image-keywords.ts +│ └── marketing-generation.ts +├── retroactive-videos.ts +``` + +## Resumability + +AI generation is already guarded by content hashing at the DB layer — unchanged content skips all AI calls. This means a crashed scraper can restart from scratch without re-running expensive AI generation. Fetch-level resumability (tracking visited URLs) is out of scope for now but could be added later by persisting a URL set to disk. + +## Out of Scope + +- DB schema changes (merging tables) +- Fetch-level resumability / URL persistence +- Structured/JSON logging +- New scraper sources diff --git a/packages/api/src/router/video.ts b/packages/api/src/router/video.ts index 46f5e75..62549af 100644 --- a/packages/api/src/router/video.ts +++ b/packages/api/src/router/video.ts @@ -72,7 +72,11 @@ export const videoRouter = { shares: metrics.shares, type, articlePreview: video.description, - imageUrl: video.imageUrl ?? undefined, + imageUrl: video.imageUrl + // Fallback: serve legacy imageData as data-URI until migration completes + ?? (video.imageData + ? `data:${video.imageMimeType ?? "image/jpeg"};base64,${Buffer.from(video.imageData).toString("base64")}` + : undefined), thumbnailUrl: video.thumbnailUrl ?? undefined, originalContentId: video.contentId, }; diff --git a/packages/db/migrate-images-to-storage.ts b/packages/db/migrate-images-to-storage.ts index 10c3152..3be6cf2 100644 --- a/packages/db/migrate-images-to-storage.ts +++ b/packages/db/migrate-images-to-storage.ts @@ -19,95 +19,80 @@ import { dirname, join } from "path"; import { fileURLToPath } from "url"; import { config } from "dotenv"; import pg from "pg"; -import { createClient } from "@supabase/supabase-js"; +import { uploadImage } from "./src/storage.js"; const __filename = fileURLToPath(import.meta.url); const __dirname = dirname(__filename); config({ path: join(__dirname, "../../.env") }); -const BUCKET = process.env.SUPABASE_STORAGE_BUCKET ?? "images"; +const BATCH_SIZE = 50; const dryRun = process.argv.includes("--dry-run"); -function getSupabase() { - const url = process.env.SUPABASE_URL; - const key = process.env.SUPABASE_SERVICE_ROLE_KEY; - if (!url || !key) { - throw new Error("Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY"); - } - return createClient(url, key); -} - async function migrate() { const pool = new pg.Pool({ connectionString: process.env.POSTGRES_URL }); - const supabase = getSupabase(); console.log(dryRun ? 
"[DRY RUN] " : "", "Starting image migration...\n"); const client = await pool.connect(); try { - // Find all videos with binary data but no storage URL - const { rows } = await client.query<{ - id: string; - content_type: string; - content_id: string; - image_data: Buffer; - image_mime_type: string | null; - }>( - `SELECT id, content_type, content_id, image_data, image_mime_type - FROM video - WHERE image_data IS NOT NULL AND image_url IS NULL`, - ); - - console.log(`Found ${rows.length} videos to migrate\n`); - let migrated = 0; let errors = 0; + let lastId: string | null = null; + + // Migrate in batches using an ID cursor to keep memory bounded + while (true) { + const { rows } = await client.query<{ + id: string; + content_type: string; + content_id: string; + image_data: Buffer; + image_mime_type: string | null; + }>( + `SELECT id, content_type, content_id, image_data, image_mime_type + FROM video + WHERE image_data IS NOT NULL AND image_url IS NULL + ${lastId ? "AND id > $2" : ""} + ORDER BY id + LIMIT $1`, + lastId ? [BATCH_SIZE, lastId] : [BATCH_SIZE], + ); - for (const row of rows) { - const storagePath = `videos/${row.content_type}/${row.content_id}.jpg`; - const mimeType = row.image_mime_type ?? "image/jpeg"; + if (rows.length === 0) break; - try { - if (dryRun) { - console.log(` [DRY RUN] Would upload ${storagePath} (${row.image_data.length} bytes)`); - migrated++; - continue; - } + for (const row of rows) { + const storagePath = `videos/${row.content_type}/${row.content_id}.jpg`; + const mimeType = row.image_mime_type ?? "image/jpeg"; - // Upload to storage - const { error: uploadError } = await supabase.storage - .from(BUCKET) - .upload(storagePath, row.image_data, { - contentType: mimeType, - upsert: true, - }); + try { + if (dryRun) { + console.log(` [DRY RUN] Would upload ${storagePath} (${row.image_data.length} bytes)`); + migrated++; + continue; + } - if (uploadError) { - throw new Error(uploadError.message); - } + // Upload via shared storage abstraction + const publicUrl = await uploadImage(storagePath, row.image_data, mimeType); + + // Write URL back and clear blob + await client.query( + `UPDATE video + SET image_url = $1, image_data = NULL, image_mime_type = NULL, + image_width = NULL, image_height = NULL + WHERE id = $2`, + [publicUrl, row.id], + ); - // Get public URL - const { - data: { publicUrl }, - } = supabase.storage.from(BUCKET).getPublicUrl(storagePath); - - // Write URL back and clear blob - await client.query( - `UPDATE video - SET image_url = $1, image_data = NULL, image_mime_type = NULL, - image_width = NULL, image_height = NULL - WHERE id = $2`, - [publicUrl, row.id], - ); - - migrated++; - console.log(` Migrated: ${storagePath}`); - } catch (err) { - errors++; - console.error(` Failed: ${storagePath} — ${err instanceof Error ? err.message : err}`); + migrated++; + console.log(` Migrated: ${storagePath}`); + } catch (err) { + errors++; + console.error(` Failed: ${storagePath} — ${err instanceof Error ? 
err.message : err}`); + } } + + lastId = rows[rows.length - 1]!.id; } console.log(`\nDone: ${migrated} migrated, ${errors} errors`); diff --git a/packages/db/migrations/add_imageurl_to_video.sql b/packages/db/migrations/add_imageurl_to_video.sql new file mode 100644 index 0000000..f275ae1 --- /dev/null +++ b/packages/db/migrations/add_imageurl_to_video.sql @@ -0,0 +1,4 @@ +-- Add image_url column to video table for object storage URLs +-- This stores the public URL of AI-generated images uploaded to Supabase Storage / S3 + +ALTER TABLE video ADD COLUMN IF NOT EXISTS image_url TEXT; diff --git a/packages/db/src/schema.ts b/packages/db/src/schema.ts index 5524342..aca8228 100644 --- a/packages/db/src/schema.ts +++ b/packages/db/src/schema.ts @@ -176,7 +176,7 @@ export const Video = pgTable( title: t.varchar({ length: 25 }).notNull(), // Max 25 chars description: t.text().notNull(), // 50-word catchy headline - // Image storage: URL to object storage (Supabase Storage / S3) + // Image storage: source thumbnail URL (scraped) imageUrl: t.text(), // Public URL of uploaded image thumbnailUrl: t.text(), // URL from source content (scraped) From 4994cccefcdd23678874bee6700fd453e72e8c15 Mon Sep 17 00:00:00 2001 From: ThatXliner Date: Fri, 3 Apr 2026 22:34:49 -0700 Subject: [PATCH 05/11] fix(scraper): don't delete uploaded image on DB update failure Separate upload and DB update into distinct try/catch blocks so that a transient DB failure doesn't delete the image at the deterministic storage path, which may already be referenced by an existing imageUrl. Co-Authored-By: Claude Opus 4.6 --- apps/scraper/src/utils/db/video-operations.ts | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/apps/scraper/src/utils/db/video-operations.ts b/apps/scraper/src/utils/db/video-operations.ts index 8506a9d..5af9792 100644 --- a/apps/scraper/src/utils/db/video-operations.ts +++ b/apps/scraper/src/utils/db/video-operations.ts @@ -119,17 +119,25 @@ export async function generateVideoForContent( // Upload image after successful DB write, then update the row if (jpegData) { const storagePath = `videos/${contentType}/${contentId}.jpg`; + let imageUrl: string | undefined; try { - const imageUrl = await uploadImage(storagePath, jpegData); - await db - .update(Video) - .set({ imageUrl }) - .where(and(eq(Video.contentType, contentType), eq(Video.contentId, contentId))); - logger.debug(`Uploaded image to ${storagePath}`); + imageUrl = await uploadImage(storagePath, jpegData); } catch (error) { - // Best-effort cleanup of orphaned upload - try { await deleteImage(storagePath); } catch { /* ignore */ } - logger.warn(`Image upload/update failed for ${contentType}:${contentId}, video saved without image`); + logger.warn(`Image upload failed for ${contentType}:${contentId}, video saved without image`); + } + if (imageUrl) { + try { + await db + .update(Video) + .set({ imageUrl }) + .where(and(eq(Video.contentType, contentType), eq(Video.contentId, contentId))); + logger.debug(`Uploaded image to ${storagePath}`); + } catch (error) { + // Don't delete the uploaded file — it lives at a deterministic path that + // may already be referenced by a previous imageUrl, and will be + // overwritten on the next successful run. 
+ logger.warn(`DB update for imageUrl failed for ${contentType}:${contentId}, image uploaded but URL not saved`); + } } } From 6a49c052f9a9316e7646bb91d11a100c22705791 Mon Sep 17 00:00:00 2001 From: ThatXliner Date: Fri, 3 Apr 2026 22:39:28 -0700 Subject: [PATCH 06/11] Format --- packages/api/src/router/video.ts | 5 +++-- packages/db/migrate-images-to-storage.ts | 15 ++++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/packages/api/src/router/video.ts b/packages/api/src/router/video.ts index 62549af..48e12dc 100644 --- a/packages/api/src/router/video.ts +++ b/packages/api/src/router/video.ts @@ -72,9 +72,10 @@ export const videoRouter = { shares: metrics.shares, type, articlePreview: video.description, - imageUrl: video.imageUrl + imageUrl: + video.imageUrl ?? // Fallback: serve legacy imageData as data-URI until migration completes - ?? (video.imageData + (video.imageData ? `data:${video.imageMimeType ?? "image/jpeg"};base64,${Buffer.from(video.imageData).toString("base64")}` : undefined), thumbnailUrl: video.thumbnailUrl ?? undefined, diff --git a/packages/db/migrate-images-to-storage.ts b/packages/db/migrate-images-to-storage.ts index 3be6cf2..b39cf8a 100644 --- a/packages/db/migrate-images-to-storage.ts +++ b/packages/db/migrate-images-to-storage.ts @@ -19,6 +19,7 @@ import { dirname, join } from "path"; import { fileURLToPath } from "url"; import { config } from "dotenv"; import pg from "pg"; + import { uploadImage } from "./src/storage.js"; const __filename = fileURLToPath(import.meta.url); @@ -67,13 +68,19 @@ async function migrate() { try { if (dryRun) { - console.log(` [DRY RUN] Would upload ${storagePath} (${row.image_data.length} bytes)`); + console.log( + ` [DRY RUN] Would upload ${storagePath} (${row.image_data.length} bytes)`, + ); migrated++; continue; } // Upload via shared storage abstraction - const publicUrl = await uploadImage(storagePath, row.image_data, mimeType); + const publicUrl = await uploadImage( + storagePath, + row.image_data, + mimeType, + ); // Write URL back and clear blob await client.query( @@ -88,7 +95,9 @@ async function migrate() { console.log(` Migrated: ${storagePath}`); } catch (err) { errors++; - console.error(` Failed: ${storagePath} — ${err instanceof Error ? err.message : err}`); + console.error( + ` Failed: ${storagePath} — ${err instanceof Error ? 
err.message : err}`, + ); } } From db49802fff34c5c20fa32fa2e89f7cad0c78d9fa Mon Sep 17 00:00:00 2001 From: ThatXliner Date: Fri, 3 Apr 2026 22:43:55 -0700 Subject: [PATCH 07/11] Fix code review issues --- packages/db/eslint.config.ts | 2 +- turbo.json | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/db/eslint.config.ts b/packages/db/eslint.config.ts index f54f34c..93660c4 100644 --- a/packages/db/eslint.config.ts +++ b/packages/db/eslint.config.ts @@ -4,7 +4,7 @@ import { baseConfig } from "@acme/eslint-config/base"; export default defineConfig( { - ignores: ["dist/**", "migrate-images.ts"], + ignores: ["dist/**", "migrate-images.ts", "migrate-images-to-storage.ts"], }, baseConfig, ); diff --git a/turbo.json b/turbo.json index dd1677a..a045ef2 100644 --- a/turbo.json +++ b/turbo.json @@ -51,7 +51,10 @@ "AUTH_DISCORD_SECRET", "AUTH_REDIRECT_PROXY_URL", "AUTH_SECRET", - "PORT" + "PORT", + "SUPABASE_URL", + "SUPABASE_SERVICE_ROLE_KEY", + "SUPABASE_STORAGE_BUCKET" ], "globalPassThroughEnv": [ "NODE_ENV", From 17d76c7c7049dbd98ded0d2cf01d45aac4331105 Mon Sep 17 00:00:00 2001 From: ThatXliner Date: Fri, 3 Apr 2026 23:02:03 -0700 Subject: [PATCH 08/11] NO DOCS --- docs/IMAGE_INTEGRATION.md | 218 --- .../plans/2026-03-30-scraper-refactor.md | 1692 ----------------- .../2026-03-30-scraper-refactor-design.md | 118 -- 3 files changed, 2028 deletions(-) delete mode 100644 docs/IMAGE_INTEGRATION.md delete mode 100644 docs/superpowers/plans/2026-03-30-scraper-refactor.md delete mode 100644 docs/superpowers/specs/2026-03-30-scraper-refactor-design.md diff --git a/docs/IMAGE_INTEGRATION.md b/docs/IMAGE_INTEGRATION.md deleted file mode 100644 index 0503198..0000000 --- a/docs/IMAGE_INTEGRATION.md +++ /dev/null @@ -1,218 +0,0 @@ -# Image Integration for Article System - -## Overview - -This implementation adds relevant photo search and integration to the article generation system. Instead of AI-generated images, it uses the Pexels API to find high-quality, relevant stock photos that fit each article. Pexels provides instant API access (no approval wait) with generous rate limits. - -## Features - -- **Automatic Image Search**: When articles are generated, the system automatically searches for relevant photos -- **AI-Powered Keywords**: Uses GPT-4o-mini to generate optimal search keywords from article content -- **Thumbnail Support**: Each article gets a primary thumbnail image -- **Multiple Images**: Articles can have up to 3 relevant images with proper attribution -- **Source Attribution**: All images include photographer credit and source links - -## Database Schema Changes - -Added to `Bill`, `GovernmentContent`, and `CourtCase` tables: -- `thumbnailUrl`: Text field for the primary thumbnail image URL -- `images`: JSONB array containing image objects with: - - `url`: Direct URL to the image - - `alt`: Alt text description - - `source`: Attribution text (e.g., "Photo by John Doe on Unsplash") - - `sourceUrl`: Link to the original source page - -## Setup - -### 1. Get Pexels API Key (INSTANT - No Approval Wait!) - -1. Sign up at [Pexels API](https://www.pexels.com/api/) -2. Your API key is displayed immediately after signup -3. Copy your API Key - -### 2. Set Environment Variable - -Add to your `.env` file: - -```bash -PEXELS_API_KEY=your_api_key_here -``` - -### 3. Run Database Migration - -```bash -cd packages/db -# If using a migration tool, run the migration -# Or apply manually: -psql -d your_database < migrations/add_image_fields.sql -``` - -### 4. 
Install Dependencies - -The scraper already has the necessary dependencies. Just ensure you have: -- `ai` package (already installed) -- `@ai-sdk/openai` (already installed) - -## How It Works - -### 1. Image Search Process - -When an article is generated in `apps/scraper/src/utils/db.ts`: - -1. **Keyword Generation**: AI analyzes the title and content to extract visual concepts - - Example: "Infrastructure Bill" → "highway construction bridge" - -2. **Image Search**: Queries Unsplash API with generated keywords - - Filters for landscape orientation - - Ensures high content quality filter - -3. **Storage**: Saves thumbnail URL and image array to database - -### 2. Image Search Utility - -Located at `apps/scraper/src/utils/image-search.ts`: - -```typescript -// Search for images -const images = await searchImages('renewable energy solar panels', 3); - -// Get just a thumbnail -const thumbnail = await getThumbnailImage('healthcare hospital'); - -// Generate search keywords from content -const keywords = await generateImageSearchKeywords(title, content, type); -``` - -### 3. API Integration - -The tRPC API endpoints in `packages/api/src/router/content.ts` now include: - -- `thumbnailUrl` in content card responses (for list views) -- `images` array in detailed content responses (for article pages) - -## Usage in Frontend - -### Content Cards (List View) - -```typescript -// Thumbnails are available in list responses -const { data } = trpc.content.getAll.useQuery(); - -data.forEach(item => { - if (item.thumbnailUrl) { - // Display thumbnail - {item.title} - } -}); -``` - -### Article Detail View - -```typescript -// Full image array available in detail view -const { data } = trpc.content.getById.useQuery({ id }); - -if (data.images && data.images.length > 0) { - data.images.forEach(image => { -
-      <img src={image.url} alt={image.alt} />
-      <a href={image.sourceUrl}>{image.source}</a>
-
- }); -} -``` - -## Fallback Behavior - -The system gracefully handles cases where images aren't available: - -- **No API Key**: Logs warning and continues without images -- **No Results**: Articles work fine without images -- **API Errors**: Logs error and continues processing -- **Rate Limits**: Respects Unsplash's free tier limits (50 requests/hour) - -## Customization - -### Change Number of Images - -In `apps/scraper/src/utils/db.ts`: - -```typescript -// Get more or fewer images -images = await searchImages(searchQuery, 5); // Get 5 instead of 3 -``` - -### Different Image Source - -Replace `apps/scraper/src/utils/image-search.ts` with a different API: - -- **Pexels**: Free, no attribution required -- **Pixabay**: Free, no attribution required -- **Getty Images**: Premium, requires license - -### Customize Search Keywords - -Modify the AI prompt in `generateImageSearchKeywords()` to adjust keyword generation: - -```typescript -prompt: `Generate keywords focusing on [your specific requirements]...` -``` - -## Rate Limits - -**Unsplash Free Tier**: -- 50 requests per hour -- 5,000 total requests per month - -For higher volume, consider: -1. Upgrading to Unsplash paid tier -2. Caching image search results -3. Using multiple image APIs with fallback - -## Testing - -To test image search without running the full scraper: - -```bash -cd apps/scraper - -# Test image search -node -e " -import('./src/utils/image-search.ts').then(async ({ searchImages }) => { - const images = await searchImages('congress capitol building', 3); - console.log(images); -}); -" -``` - -## Troubleshooting - -### No images appearing - -1. Check `UNSPLASH_ACCESS_KEY` is set correctly -2. Verify you haven't hit rate limits -3. Check console logs for errors -4. Test API key manually: `curl -H "Authorization: Client-ID YOUR_KEY" "https://api.unsplash.com/photos/random"` - -### Images not relevant - -1. Review generated keywords in logs -2. Adjust keyword generation prompt -3. Consider using different search terms or manual keywords - -### Database errors - -1. Ensure migration was applied -2. Check that columns exist: `\d bill` in psql -3. Verify JSONB type is supported in your PostgreSQL version - -## Future Enhancements - -- [ ] Image caching to reduce API calls -- [ ] Multiple image source fallbacks -- [ ] Image optimization and CDN integration -- [ ] User-selectable images from search results -- [ ] Image relevance scoring -- [ ] Automatic image cropping for thumbnails diff --git a/docs/superpowers/plans/2026-03-30-scraper-refactor.md b/docs/superpowers/plans/2026-03-30-scraper-refactor.md deleted file mode 100644 index da8c942..0000000 --- a/docs/superpowers/plans/2026-03-30-scraper-refactor.md +++ /dev/null @@ -1,1692 +0,0 @@ -# Scraper Architecture Refactor — Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Replace Crawlee with hand-rolled fetch+cheerio, unify the three upsert functions into one, add a shared `fetchWithRetry` utility and `log` helper, and simplify the runner in `main.ts`. - -**Architecture:** Each scraper becomes a plain `{ name, scrape }` object using `fetchWithRetry()` + cheerio/turndown directly. A unified `upsertContent(type, data)` replaces the three per-table upsert functions. `main.ts` becomes a loop over selected scrapers. 
- -**Tech Stack:** Node.js, TypeScript, cheerio, turndown, Drizzle ORM, Vercel AI SDK, OpenAI SDK - ---- - -## File Map - -| Action | File | Responsibility | -|--------|------|---------------| -| Create | `src/utils/fetch.ts` | `fetchWithRetry()` — shared retry + timeout wrapper | -| Create | `src/utils/log.ts` | `log(scraper, msg)` — prefixed logging | -| Modify | `src/utils/db/operations.ts` | Merge 3 upsert fns → `upsertContent()` | -| Modify | `src/scrapers/govtrack.ts` | Replace CheerioCrawler with fetch+cheerio | -| Modify | `src/scrapers/whitehouse.ts` | Replace CheerioCrawler with fetch+cheerio+turndown | -| Modify | `src/scrapers/congress.ts` | Use shared `fetchWithRetry`, use `upsertContent`, use `log` | -| Modify | `src/scrapers/scotus.ts` | Use shared `fetchWithRetry`, use `upsertContent`, use `log` | -| Modify | `src/main.ts` | Scraper runner loop | -| Modify | `src/utils/types.ts` | Add `Scraper` type, add `ContentType` union | -| Modify | `package.json` | Remove crawlee, playwright, @apify/tsconfig | -| Modify | `tsconfig.json` | Extend monorepo base only (remove apify dep) | -| Modify | `Dockerfile.scraper` (repo root) | Remove playwright install, simplify | - ---- - -### Task 1: Create `fetchWithRetry` utility - -**Files:** -- Create: `apps/scraper/src/utils/fetch.ts` - -- [ ] **Step 1: Create `fetchWithRetry`** - -```ts -// apps/scraper/src/utils/fetch.ts - -export interface FetchWithRetryOptions extends RequestInit { - maxRetries?: number; - timeoutMs?: number; -} - -export async function fetchWithRetry( - url: string, - options: FetchWithRetryOptions = {}, -): Promise { - const { maxRetries = 3, timeoutMs = 30_000, ...fetchOptions } = options; - - for (let attempt = 0; attempt <= maxRetries; attempt++) { - const controller = new AbortController(); - const timeoutId = setTimeout(() => controller.abort(), timeoutMs); - - try { - const res = await fetch(url, { - ...fetchOptions, - signal: controller.signal, - }); - - if (res.ok) return res; - - const isRetriable = res.status === 429 || res.status >= 500; - if (isRetriable && attempt < maxRetries) { - let delayMs = 1000 * Math.pow(2, attempt); - - // Honor Retry-After header - const retryAfter = res.headers.get("Retry-After"); - if (retryAfter) { - const seconds = Number(retryAfter); - if (!Number.isNaN(seconds)) { - delayMs = Math.max(delayMs, seconds * 1000); - } else { - const retryDate = Date.parse(retryAfter); - if (!Number.isNaN(retryDate)) { - const diff = retryDate - Date.now(); - if (diff > 0) delayMs = Math.max(delayMs, diff); - } - } - } - - await new Promise((r) => setTimeout(r, delayMs)); - continue; - } - - throw new Error(`HTTP ${res.status}: ${url}`); - } catch (err: any) { - if (err?.name === "AbortError") { - if (attempt < maxRetries) { - await new Promise((r) => setTimeout(r, 1000 * Math.pow(2, attempt))); - continue; - } - throw new Error(`Request timed out after ${timeoutMs}ms: ${url}`); - } - // Retry network errors - if (attempt < maxRetries && (err?.code === "ECONNRESET" || err?.code === "ECONNREFUSED")) { - await new Promise((r) => setTimeout(r, 1000 * Math.pow(2, attempt))); - continue; - } - throw err; - } finally { - clearTimeout(timeoutId); - } - } - - throw new Error(`Failed after ${maxRetries + 1} attempts: ${url}`); -} -``` - -- [ ] **Step 2: Verify it compiles** - -Run: `cd apps/scraper && npx tsc --noEmit` -Expected: No errors from `fetch.ts` - -- [ ] **Step 3: Commit** - -```bash -git add apps/scraper/src/utils/fetch.ts -git commit -m "feat(scraper): add fetchWithRetry utility" -``` - ---- 
- -### Task 2: Create `log` utility - -**Files:** -- Create: `apps/scraper/src/utils/log.ts` - -- [ ] **Step 1: Create `log.ts`** - -```ts -// apps/scraper/src/utils/log.ts - -function timestamp(): string { - return new Date().toISOString().slice(11, 19); // HH:MM:SS -} - -export function log(scraper: string, message: string): void { - console.log(`[${timestamp()}] [${scraper}] ${message}`); -} - -export function logError(scraper: string, message: string, error?: unknown): void { - console.error(`[${timestamp()}] [${scraper}] ERROR: ${message}`, error ?? ""); -} - -export function logWarn(scraper: string, message: string): void { - console.warn(`[${timestamp()}] [${scraper}] WARN: ${message}`); -} -``` - -- [ ] **Step 2: Commit** - -```bash -git add apps/scraper/src/utils/log.ts -git commit -m "feat(scraper): add log utility with scraper prefix" -``` - ---- - -### Task 3: Add `Scraper` type and `ContentType` union - -**Files:** -- Modify: `apps/scraper/src/utils/types.ts` - -- [ ] **Step 1: Add types to `types.ts`** - -Add to the end of the file: - -```ts -// Content type union for unified upsert -export type ContentType = "bill" | "government_content" | "court_case"; - -// Scraper interface for the runner -export interface Scraper { - name: string; - scrape: () => Promise; -} -``` - -- [ ] **Step 2: Commit** - -```bash -git add apps/scraper/src/utils/types.ts -git commit -m "feat(scraper): add Scraper and ContentType types" -``` - ---- - -### Task 4: Unify upsert functions into `upsertContent` - -**Files:** -- Modify: `apps/scraper/src/utils/db/operations.ts` - -This is the biggest single change. The three functions (`upsertBill`, `upsertGovernmentContent`, `upsertCourtCase`) share ~90% of their logic. We merge them into one `upsertContent(type, data)` that switches on type for the DB-specific parts (which table, which conflict target, which fields to hash, which check function). 
- -- [ ] **Step 1: Refactor `operations.ts`** - -Replace the entire file with: - -```ts -import { db } from "@acme/db/client"; -import { Bill, GovernmentContent, CourtCase } from "@acme/db/schema"; -import type { - BillData, - GovernmentContentData, - CourtCaseData, - ContentType, -} from "../types.js"; -import { createContentHash } from "../hash.js"; -import { generateAISummary, generateAIArticle } from "../ai/text-generation.js"; -import { generateImageSearchKeywords } from "../ai/image-keywords.js"; -import { getThumbnailImage } from "../api/google-images.js"; -import { - checkExistingBill, - checkExistingGovernmentContent, - checkExistingCourtCase, -} from "./helpers.js"; -import { - incrementTotalProcessed, - incrementNewEntries, - incrementExistingUnchanged, - incrementExistingChanged, - incrementAIArticlesGenerated, - incrementImagesSearched, -} from "./metrics.js"; -import { generateVideoForContent } from "./video-operations.js"; - -function isUsableText(text: string | undefined | null): text is string { - if (!text || text.length < 200) return false; - if (/[A-Z]:\\/.test(text)) return false; - - const lines = text.split("\n"); - const boilerplateLines = lines.filter((line) => { - const trimmed = line.trim(); - return ( - trimmed === "" || - trimmed.split(/\s+/).length === 1 || - (/[a-zA-Z]/.test(trimmed) && - trimmed === trimmed.toUpperCase() && - trimmed.length > 2) - ); - }); - if (boilerplateLines.length / lines.length >= 0.3) return false; - - return true; -} - -type ContentData = - | { type: "bill"; data: BillData } - | { type: "government_content"; data: GovernmentContentData } - | { type: "court_case"; data: CourtCaseData }; - -// Identify a content item for logging -function contentLabel(input: ContentData): string { - switch (input.type) { - case "bill": - return `bill ${input.data.billNumber}`; - case "government_content": - return `${input.data.type} "${input.data.title}"`; - case "court_case": - return `court case ${input.data.caseNumber}`; - } -} - -// Build hash input — only fields that matter for change detection -function hashFields(input: ContentData): string { - switch (input.type) { - case "bill": - return JSON.stringify({ - title: input.data.title, - description: input.data.description, - status: input.data.status, - summary: input.data.summary, - fullText: input.data.fullText, - }); - case "government_content": - return JSON.stringify({ - title: input.data.title, - description: input.data.description, - fullText: input.data.fullText, - }); - case "court_case": - return JSON.stringify({ - title: input.data.title, - description: input.data.description, - status: input.data.status, - fullText: input.data.fullText, - }); - } -} - -// Check existing record per type -async function checkExisting(input: ContentData) { - switch (input.type) { - case "bill": - return checkExistingBill( - input.data.billNumber, - input.data.sourceWebsite, - ); - case "government_content": - return checkExistingGovernmentContent(input.data.url); - case "court_case": - return checkExistingCourtCase(input.data.caseNumber); - } -} - -export async function upsertContent(input: ContentData) { - const newContentHash = createContentHash(hashFields(input)); - const existing = await checkExisting(input); - const label = contentLabel(input); - - incrementTotalProcessed(); - - // All content types have these fields - const fullText = input.data.fullText; - const title = input.data.title; - const url = input.data.url; - - // Determine what to generate - const hasUsableText = isUsableText(fullText); 
- let shouldGenerateArticle = false; - let shouldGenerateImage = false; - - if (!existing) { - shouldGenerateArticle = hasUsableText; - shouldGenerateImage = hasUsableText; - incrementNewEntries(); - console.log(`New ${label} detected`); - } else if (existing.contentHash !== newContentHash) { - shouldGenerateArticle = hasUsableText; - shouldGenerateImage = !existing.hasThumbnail && hasUsableText; - incrementExistingChanged(); - console.log(`Content changed for ${label}`); - } else { - shouldGenerateArticle = false; - shouldGenerateImage = !existing.hasThumbnail && hasUsableText; - incrementExistingUnchanged(); - console.log(`No changes for ${label}, skipping AI generation`); - } - - // Generate AI summary if needed - let description: string | undefined; - const existingDescription = input.data.description; - - if (existingDescription) { - description = existingDescription; - } else if (shouldGenerateArticle && fullText) { - const summarySource = - input.type === "bill" - ? input.data.summary || input.data.fullText || "" - : fullText; - console.log(`Generating AI summary for ${label}`); - description = await generateAISummary(title, summarySource); - } - - // Generate AI article - let aiGeneratedArticle: string | undefined; - const articleType = - input.type === "bill" - ? "bill" - : input.type === "government_content" - ? input.data.type - : "court case"; - - if (shouldGenerateArticle && hasUsableText) { - console.log(`Generating AI article for ${label}`); - aiGeneratedArticle = await generateAIArticle(title, fullText!, articleType, url); - incrementAIArticlesGenerated(); - } else if (existing?.hasArticle) { - console.log(`Using existing AI article for ${label}`); - } - - // Search for thumbnail - let thumbnailUrl: string | null | undefined; - if (shouldGenerateImage) { - try { - console.log(`Searching for thumbnail for ${label}`); - const searchQuery = await generateImageSearchKeywords( - title, - fullText || "", - articleType, - ); - console.log(`Image search query: ${searchQuery}`); - thumbnailUrl = await getThumbnailImage(searchQuery); - incrementImagesSearched(); - } catch (error) { - console.warn(`Failed to fetch thumbnail for ${label}:`, error); - thumbnailUrl = null; - } - } else if (existing?.hasThumbnail) { - console.log(`Using existing thumbnail for ${label}`); - } - - // Type-specific DB upsert - let result: any; - - if (input.type === "bill") { - const d = input.data; - const [row] = await db - .insert(Bill) - .values({ - ...d, - description: description ?? d.description, - aiGeneratedArticle: aiGeneratedArticle || undefined, - thumbnailUrl: - thumbnailUrl === undefined - ? undefined - : thumbnailUrl || undefined, - contentHash: newContentHash, - versions: [], - }) - .onConflictDoUpdate({ - target: [Bill.billNumber, Bill.sourceWebsite], - set: { - title: d.title, - description: description ?? 
d.description, - sponsor: d.sponsor, - status: d.status, - introducedDate: d.introducedDate, - congress: d.congress, - chamber: d.chamber, - summary: d.summary, - fullText: d.fullText, - ...(aiGeneratedArticle !== undefined && { aiGeneratedArticle }), - ...(thumbnailUrl !== undefined && { - thumbnailUrl: thumbnailUrl || undefined, - }), - url: d.url, - contentHash: newContentHash, - updatedAt: new Date(), - }, - }) - .returning(); - result = row; - } else if (input.type === "government_content") { - const d = input.data; - const [row] = await db - .insert(GovernmentContent) - .values({ - ...d, - aiGeneratedArticle: aiGeneratedArticle || undefined, - thumbnailUrl: - thumbnailUrl === undefined - ? undefined - : thumbnailUrl || undefined, - contentHash: newContentHash, - versions: [], - }) - .onConflictDoUpdate({ - target: GovernmentContent.url, - set: { - title: d.title, - type: d.type, - publishedDate: d.publishedDate, - description: d.description, - fullText: d.fullText, - ...(aiGeneratedArticle !== undefined && { aiGeneratedArticle }), - ...(thumbnailUrl !== undefined && { - thumbnailUrl: thumbnailUrl || undefined, - }), - source: d.source, - contentHash: newContentHash, - updatedAt: new Date(), - }, - }) - .returning(); - result = row; - } else { - const d = input.data; - const [row] = await db - .insert(CourtCase) - .values({ - ...d, - description: description ?? d.description, - aiGeneratedArticle: aiGeneratedArticle || undefined, - thumbnailUrl: - thumbnailUrl === undefined - ? undefined - : thumbnailUrl || undefined, - contentHash: newContentHash, - versions: [], - }) - .onConflictDoUpdate({ - target: CourtCase.caseNumber, - set: { - title: d.title, - court: d.court, - filedDate: d.filedDate, - description: description ?? d.description, - status: d.status, - fullText: d.fullText, - ...(aiGeneratedArticle !== undefined && { aiGeneratedArticle }), - ...(thumbnailUrl !== undefined && { - thumbnailUrl: thumbnailUrl || undefined, - }), - url: d.url, - contentHash: newContentHash, - updatedAt: new Date(), - }, - }) - .returning(); - result = row; - } - - console.log(`${label} upserted`); - - // Generate video - if (result && fullText) { - const videoSource = - input.type === "bill" - ? input.data.sourceWebsite - : input.type === "government_content" - ? (input.data.source ?? "whitehouse.gov") - : input.type === "court_case" - ? 
input.data.court - : ""; - await generateVideoForContent( - input.type, - result.id, - title, - fullText, - newContentHash, - videoSource, - result.thumbnailUrl, - ); - } - - return result; -} - -// Legacy wrapper for whitehouse scraper's upsertPresidentialAction calls -export async function upsertPresidentialAction(actionData: { - title: string; - type: string; - issuedDate?: Date; - publishedDate?: Date; - description?: string; - fullText?: string; - url: string; - source?: string; -}) { - return upsertContent({ - type: "government_content", - data: { - ...actionData, - publishedDate: - actionData.publishedDate || actionData.issuedDate || new Date(), - source: actionData.source || "whitehouse.gov", - }, - }); -} -``` - -- [ ] **Step 2: Verify it compiles** - -Run: `cd apps/scraper && npx tsc --noEmit` -Expected: Errors only from scrapers still importing old function names (fixed in later tasks) - -- [ ] **Step 3: Commit** - -```bash -git add apps/scraper/src/utils/db/operations.ts -git commit -m "refactor(scraper): unify upsertBill/GovernmentContent/CourtCase into upsertContent" -``` - ---- - -### Task 5: Rewrite `govtrack.ts` — drop Crawlee - -**Files:** -- Modify: `apps/scraper/src/scrapers/govtrack.ts` - -- [ ] **Step 1: Rewrite `govtrack.ts`** - -Replace the entire file: - -```ts -import * as cheerio from "cheerio"; - -import { fetchWithRetry } from "../utils/fetch.js"; -import { log, logError } from "../utils/log.js"; -import { upsertContent } from "../utils/db/operations.js"; -import { printMetricsSummary, resetMetrics } from "../utils/db/metrics.js"; -import type { Scraper } from "../utils/types.js"; - -const NAME = "GovTrack"; - -interface GovTrackConfig { - maxBills?: number; - congress?: number; -} - -async function scrape(config: GovTrackConfig = {}) { - const { maxBills = 100, congress = 119 } = config; - log(NAME, "Starting..."); - resetMetrics(); - - // Step 1: Fetch listing page and collect bill links - const listingUrl = "https://www.govtrack.us/congress/bills/#docket"; - const listingRes = await fetchWithRetry(listingUrl); - const listingHtml = await listingRes.text(); - const $listing = cheerio.load(listingHtml); - - const collectedLinks: string[] = []; - $listing('.card > .card-body .card-title > a[href*="/congress/bills/"]').each( - (_, element) => { - const href = $listing(element).attr("href"); - if (href && /\/congress\/bills\/\d+\/[a-z]+\d+/.test(href)) { - const fullUrl = href.startsWith("http") - ? 
href - : `https://www.govtrack.us${href}`; - if (collectedLinks.length < maxBills) { - collectedLinks.push(fullUrl); - } - } - }, - ); - - log(NAME, `Found ${collectedLinks.length} bill links`); - - // Step 2: Scrape each bill's /text page - const textUrls = collectedLinks.slice(0, maxBills).map((url) => `${url}/text`); - log(NAME, `Scraping ${textUrls.length} text pages...`); - - for (const textUrl of textUrls) { - try { - const res = await fetchWithRetry(textUrl, { timeoutMs: 60_000 }); - const html = await res.text(); - const $ = cheerio.load(html); - - // Remove noise - $("#main_text_content script, #main_text_content style, #main_text_content nav").remove(); - let fullText = $("#main_text_content").text().trim(); - - // Reject garbage text - if ( - /[A-Z]:\\/.test(fullText) || - fullText.startsWith("Examples:") || - fullText.startsWith("IB ") - ) { - log(NAME, `Rejecting garbage text for ${textUrl}`); - fullText = ""; - } - - // Truncate to 1,000 words - if (fullText) { - const words = fullText.split(/\s+/); - if (words.length > 1000) { - fullText = words.slice(0, 1000).join(" "); - } - } - - // Extract bill info - const h1Text = $("#maincontent h1").first().text().trim(); - const h1Parts = h1Text.split(":"); - const billNumber = h1Parts[0]?.trim() || ""; - const title = - h1Parts.length > 1 ? h1Parts.slice(1).join(":").trim() : h1Text; - - const status = $(".bill-status").first().text().trim() || "Unknown"; - - let introducedDate: Date | undefined; - $("p, div").each((_, element) => { - const text = $(element).text(); - if (text.includes("Introduced:")) { - const dateStr = text.replace("Introduced:", "").trim(); - introducedDate = new Date(dateStr); - return false; - } - }); - - const congressMatch = textUrl.match(/\/congress\/bills\/(\d+)\//); - const congressNum = congressMatch - ? parseInt(congressMatch[1]!) - : undefined; - - const chamber = billNumber.toLowerCase().startsWith("h.") - ? 
"House" - : "Senate"; - - const summary = $(".summary").first().text().trim() || undefined; - const billUrl = textUrl.replace(/\/text$/, ""); - - if (fullText !== "") { - await upsertContent({ - type: "bill", - data: { - billNumber, - title, - description: summary, - sponsor: undefined, - status, - introducedDate, - congress: congressNum, - chamber, - summary, - fullText, - url: billUrl, - sourceWebsite: "govtrack" as const, - }, - }); - } - - log(NAME, `Scraped: ${billNumber} — ${title}`); - } catch (error) { - logError(NAME, `Error scraping ${textUrl}`, error); - } - } - - log(NAME, "Completed"); - printMetricsSummary(NAME); -} - -export const govtrack: Scraper = { - name: NAME, - scrape: () => scrape(), -}; -``` - -- [ ] **Step 2: Verify it compiles** - -Run: `cd apps/scraper && npx tsc --noEmit` - -- [ ] **Step 3: Commit** - -```bash -git add apps/scraper/src/scrapers/govtrack.ts -git commit -m "refactor(scraper): rewrite govtrack to use fetch+cheerio, drop Crawlee" -``` - ---- - -### Task 6: Rewrite `whitehouse.ts` — drop Crawlee - -**Files:** -- Modify: `apps/scraper/src/scrapers/whitehouse.ts` - -- [ ] **Step 1: Rewrite `whitehouse.ts`** - -Replace the entire file: - -```ts -import * as cheerio from "cheerio"; -import TurndownService from "turndown"; - -import { fetchWithRetry } from "../utils/fetch.js"; -import { log, logError } from "../utils/log.js"; -import { upsertContent } from "../utils/db/operations.js"; -import { generateAISummary } from "../utils/ai/text-generation.js"; -import { resetMetrics, printMetricsSummary } from "../utils/db/metrics.js"; -import type { Scraper } from "../utils/types.js"; - -const NAME = "White House"; - -function toTitleCase(text: string): string { - const uppercaseCount = (text.match(/[A-Z]/g) || []).length; - const letterCount = (text.match(/[a-zA-Z]/g) || []).length; - - if (letterCount === 0 || uppercaseCount / letterCount < 0.5) { - return text; - } - - return text - .toLowerCase() - .split(" ") - .map((word) => { - if (word.length === 0) return word; - return word.charAt(0).toUpperCase() + word.slice(1); - }) - .join(" ") - .replace(/^./, (char) => char.toUpperCase()); -} - -async function scrape() { - log(NAME, "Starting..."); - resetMetrics(); - - const maxArticles = 20; - const turndownService = new TurndownService({ - headingStyle: "atx", - codeBlockStyle: "fenced", - }); - - // Step 1: Collect article links from listing pages (with pagination) - const collectedLinks: string[] = []; - let nextPageUrl: string | null = "https://www.whitehouse.gov/news/"; - - while (nextPageUrl && collectedLinks.length < maxArticles) { - const res = await fetchWithRetry(nextPageUrl, { timeoutMs: 60_000 }); - const html = await res.text(); - const $ = cheerio.load(html); - - $(".wp-block-post-title > a").each((_, element) => { - const href = $(element).attr("href"); - if (href && collectedLinks.length < maxArticles) { - collectedLinks.push(href); - } - }); - - log(NAME, `Found ${collectedLinks.length} article links so far`); - - if (collectedLinks.length < maxArticles) { - nextPageUrl = $(".wp-block-query-pagination-next").attr("href") || null; - } else { - nextPageUrl = null; - } - } - - log(NAME, `Collected ${collectedLinks.length} articles, now scraping...`); - - // Step 2: Scrape each article - for (const articleUrl of collectedLinks.slice(0, maxArticles)) { - try { - const res = await fetchWithRetry(articleUrl, { timeoutMs: 60_000 }); - const html = await res.text(); - const $ = cheerio.load(html); - - let headline = 
$(".wp-block-whitehouse-topper__headline") - .first() - .text() - .trim(); - if (!headline) { - headline = $("h1").first().text().trim() || "Untitled Article"; - } - headline = toTitleCase(headline); - - const dateStr = - $(".wp-block-post-date > time").first().attr("datetime") || - $(".wp-block-post-date > time").first().text().trim(); - const issuedDate = dateStr ? new Date(dateStr) : new Date(); - - // Extract content after the first div in .entry-content - const entryContent = $(".entry-content").first(); - let fullTextMarkdown = ""; - - if (entryContent.length > 0) { - const children = entryContent.children(); - let firstDivIndex = -1; - - children.each((index, element) => { - if ( - element.tagName.toLowerCase() === "div" && - firstDivIndex === -1 - ) { - firstDivIndex = index; - } - }); - - let contentHtml = ""; - if (firstDivIndex === -1) { - contentHtml = entryContent.html() || ""; - } else { - children.each((index, element) => { - if (index > firstDivIndex) { - contentHtml += $.html(element); - } - }); - } - - fullTextMarkdown = turndownService.turndown(contentHtml).trim(); - } - - // Determine content type from URL - let contentType = "News Article"; - if (articleUrl.includes("/fact-sheets/")) { - contentType = "Fact Sheet"; - } else if (articleUrl.includes("/briefings-statements/")) { - contentType = "Briefing Statement"; - } else if (articleUrl.includes("/presidential-actions/")) { - contentType = "Presidential Action"; - } - - log(NAME, `Generating AI summary for: ${headline}`); - const aiSummary = await generateAISummary(headline, fullTextMarkdown); - - await upsertContent({ - type: "government_content", - data: { - title: headline, - type: contentType, - publishedDate: issuedDate, - description: aiSummary, - fullText: fullTextMarkdown, - url: articleUrl, - source: "whitehouse.gov", - }, - }); - - log(NAME, `Scraped ${contentType}: ${headline}`); - } catch (error) { - logError(NAME, `Error scraping ${articleUrl}`, error); - } - } - - log(NAME, "Completed"); - printMetricsSummary(NAME); -} - -export const whitehouse: Scraper = { - name: NAME, - scrape, -}; -``` - -- [ ] **Step 2: Verify it compiles** - -Run: `cd apps/scraper && npx tsc --noEmit` - -- [ ] **Step 3: Commit** - -```bash -git add apps/scraper/src/scrapers/whitehouse.ts -git commit -m "refactor(scraper): rewrite whitehouse to use fetch+cheerio+turndown, drop Crawlee" -``` - ---- - -### Task 7: Update `congress.ts` — use shared utilities - -**Files:** -- Modify: `apps/scraper/src/scrapers/congress.ts` - -Replace the local `congressFetch` with `fetchWithRetry`, switch to `upsertContent`, use `log`/`logError`, and export as `Scraper` object. - -- [ ] **Step 1: Rewrite `congress.ts`** - -Key changes from current code: -1. Replace `congressFetch()` with a wrapper around `fetchWithRetry()` that adds the API key and JSON parsing -2. Replace `upsertBill(...)` calls with `upsertContent({ type: "bill", data: ... })` -3. Replace `console.log`/`console.error` with `log(NAME, ...)` / `logError(NAME, ...)` -4. 
Export as `Scraper` object instead of bare function - -```ts -import { fetchWithRetry } from "../utils/fetch.js"; -import { log, logError } from "../utils/log.js"; -import { printMetricsSummary, resetMetrics } from "../utils/db/metrics.js"; -import { upsertContent } from "../utils/db/operations.js"; -import type { Scraper } from "../utils/types.js"; - -const BASE_URL = "https://api.congress.gov/v3"; -const NAME = "Congress.gov"; - -// ─── Config ────────────────────────────────────────────────────────────────── - -interface CongressScraperConfig { - maxBills?: number; - congress?: number; - chamber?: "House" | "Senate"; -} - -// ─── API response shapes (partial — only what we use) ──────────────────────── - -interface ApiBillListItem { - number: string; - type: string; - title: string; - congress: number; - url: string; - latestAction?: { text: string; actionDate: string }; -} - -interface ApiBillDetail { - bill: { - number: string; - type: string; - title: string; - congress: number; - originChamber: string; - introducedDate?: string; - sponsors?: Array<{ - firstName: string; - lastName: string; - party: string; - state: string; - }>; - latestAction?: { text: string; actionDate: string }; - }; -} - -interface ApiSummary { - actionDate: string; - actionDesc: string; - text: string; - updateDate: string; -} - -interface ApiTextVersion { - type: string; - date: string | null; - formats: Array<{ type: string; url: string }>; -} - -// ─── Helpers ───────────────────────────────────────────────────────────────── - -function getApiKey(): string { - const key = process.env.CONGRESS_API_KEY; - if (!key) { - throw new Error( - "CONGRESS_API_KEY is not set. Sign up at https://api.congress.gov/sign-up/", - ); - } - return key; -} - -async function congressFetch( - path: string, - params: Record = {}, -): Promise { - const apiKey = getApiKey(); - const url = new URL(`${BASE_URL}${path}`); - url.searchParams.set("api_key", apiKey); - url.searchParams.set("format", "json"); - for (const [k, v] of Object.entries(params)) { - url.searchParams.set(k, String(v)); - } - - const res = await fetchWithRetry(url.toString()); - return res.json() as Promise; -} - -function ordinalSuffix(n: number): string { - const mod100 = Math.abs(n) % 100; - const mod10 = Math.abs(n) % 10; - if (mod100 >= 11 && mod100 <= 13) return "th"; - if (mod10 === 1) return "st"; - if (mod10 === 2) return "nd"; - if (mod10 === 3) return "rd"; - return "th"; -} - -function billTypeToUrlSlug(type: string): string { - const slugMap: Record = { - HR: "house-bill", - S: "senate-bill", - HJRES: "house-joint-resolution", - SJRES: "senate-joint-resolution", - HCONRES: "house-concurrent-resolution", - SCONRES: "senate-concurrent-resolution", - HRES: "house-simple-resolution", - SRES: "senate-simple-resolution", - }; - return slugMap[type.toUpperCase()] ?? `${type.toLowerCase()}-bill`; -} - -function formatBillNumber(type: string, number: string): string { - const prefixMap: Record = { - HR: "H.R.", - S: "S.", - HJRES: "H.J.Res.", - SJRES: "S.J.Res.", - HCONRES: "H.Con.Res.", - SCONRES: "S.Con.Res.", - HRES: "H.Res.", - SRES: "S.Res.", - }; - const prefix = prefixMap[type.toUpperCase()] ?? 
type; - return `${prefix} ${number}`; -} - -function stripHtml(html: string): string { - return html - .replace(/<[^>]+>/g, " ") - .replace(/&/g, "&") - .replace(/</g, "<") - .replace(/>/g, ">") - .replace(/ /g, " ") - .replace(/\s{2,}/g, " ") - .trim(); -} - -async function fetchSummary( - congress: number, - billType: string, - billNumber: string, -): Promise { - try { - const data = await congressFetch<{ summaries: ApiSummary[] }>( - `/bill/${congress}/${billType.toLowerCase()}/${billNumber}/summaries`, - ); - if (!data.summaries?.length) return undefined; - const latest = data.summaries[data.summaries.length - 1]!; - return stripHtml(latest.text).slice(0, 5000); - } catch { - return undefined; - } -} - -async function fetchFullText( - congress: number, - billType: string, - billNumber: string, -): Promise { - try { - const data = await congressFetch<{ textVersions: ApiTextVersion[] }>( - `/bill/${congress}/${billType.toLowerCase()}/${billNumber}/text`, - ); - if (!data.textVersions?.length) return undefined; - - for (const version of [...data.textVersions].reverse()) { - const txtFormat = version.formats.find( - (f) => f.type === "Formatted Text", - ); - if (!txtFormat) continue; - - const res = await fetchWithRetry(txtFormat.url); - const rawText = await res.text(); - if (!rawText) continue; - - let text = stripHtml(rawText); - const words = text.split(/\s+/); - if (words.length > 1000) { - text = words.slice(0, 1000).join(" "); - } - return text.trim() || undefined; - } - } catch { - // Full text is optional - } - return undefined; -} - -// ─── Main ──────────────────────────────────────────────────────────────────── - -async function scrape(config: CongressScraperConfig = {}) { - const { maxBills = 100, congress = 119, chamber = "House" } = config; - - log(NAME, `Starting (congress=${congress}, chamber=${chamber})...`); - resetMetrics(); - - const chamberParam = chamber === "House" ? "house" : "senate"; - - // Step 1: fetch bill listing - const allBills: ApiBillListItem[] = []; - let offset = 0; - const pageSize = 250; - - while (allBills.length < maxBills) { - const remaining = maxBills - allBills.length; - const limit = Math.min(remaining, pageSize); - - const pageData = await congressFetch<{ bills: ApiBillListItem[] }>( - `/bill/${congress}`, - { chamber: chamberParam, limit, offset, sort: "updateDate+desc" }, - ); - - const page = pageData.bills ?? []; - allBills.push(...page); - if (page.length < limit) break; - offset += page.length; - } - - const bills = allBills.slice(0, maxBills); - log(NAME, `Fetched ${bills.length} bills`); - - // Step 2: enrich each bill - for (const item of bills) { - try { - const billType = item.type.toLowerCase(); - const billNumber = item.number; - - const detailData = await congressFetch( - `/bill/${congress}/${billType}/${billNumber}`, - ); - const detail = detailData.bill; - - const formattedBillNumber = formatBillNumber(detail.type, detail.number); - const title = (detail.title ?? "Unknown").slice(0, 250); - - const primarySponsor = detail.sponsors?.[0]; - const sponsor = primarySponsor - ? `${primarySponsor.firstName} ${primarySponsor.lastName} (${primarySponsor.party}-${primarySponsor.state})`.slice( - 0, - 250, - ) - : undefined; - - const status = (detail.latestAction?.text ?? "Unknown").slice(0, 250); - const introducedDate = detail.introducedDate - ? new Date(detail.introducedDate) - : undefined; - const chamberValue = (detail.originChamber ?? 
chamber) as - | "House" - | "Senate"; - const billUrl = `https://www.congress.gov/bill/${congress}${ordinalSuffix(congress)}-congress/${billTypeToUrlSlug(detail.type)}/${billNumber}`; - - const summary = await fetchSummary(congress, billType, billNumber); - const fullText = await fetchFullText(congress, billType, billNumber); - - await upsertContent({ - type: "bill", - data: { - billNumber: formattedBillNumber, - title, - description: summary, - sponsor, - status, - introducedDate, - congress, - chamber: chamberValue, - summary, - fullText, - url: billUrl, - sourceWebsite: "congress.gov", - }, - }); - - log(NAME, `Processed: ${formattedBillNumber} — ${title}`); - } catch (error) { - logError( - NAME, - `Error processing bill ${item.type}${item.number}`, - error, - ); - } - } - - log(NAME, "Completed"); - printMetricsSummary(NAME); -} - -export const congress: Scraper = { - name: NAME, - scrape: () => scrape(), -}; -``` - -- [ ] **Step 2: Verify it compiles** - -Run: `cd apps/scraper && npx tsc --noEmit` - -- [ ] **Step 3: Commit** - -```bash -git add apps/scraper/src/scrapers/congress.ts -git commit -m "refactor(scraper): congress uses shared fetchWithRetry + upsertContent + log" -``` - ---- - -### Task 8: Update `scotus.ts` — use shared utilities - -**Files:** -- Modify: `apps/scraper/src/scrapers/scotus.ts` - -Same pattern as congress: replace local `clFetch` with wrapper around `fetchWithRetry`, switch to `upsertContent`, use `log`/`logError`, export as `Scraper`. - -- [ ] **Step 1: Rewrite `scotus.ts`** - -```ts -import { fetchWithRetry } from "../utils/fetch.js"; -import { log, logError } from "../utils/log.js"; -import { printMetricsSummary, resetMetrics } from "../utils/db/metrics.js"; -import { upsertContent } from "../utils/db/operations.js"; -import type { Scraper } from "../utils/types.js"; - -const CL_BASE = "https://www.courtlistener.com/api/rest/v4"; -const NAME = "SCOTUS"; - -// ─── Config ────────────────────────────────────────────────────────────────── - -interface ScotusScraperConfig { - maxCases?: number; - court?: string; -} - -// ─── API response shapes ───────────────────────────────────────────────────── - -interface ClCluster { - id: number; - absolute_url: string; - case_name: string; - docket_id: number; - date_filed: string | null; - precedential_status: string; - syllabus: string; - sub_opinions: string[]; -} - -interface ClOpinion { - id: number; - plain_text: string; - html: string; - type: string; -} - -interface ClDocket { - id: number; - docket_number: string; - court: string; - date_filed: string | null; - case_name: string; -} - -// ─── Constants ─────────────────────────────────────────────────────────────── - -const COURT_NAMES: Record = { - scotus: "Supreme Court of the United States", - ca1: "1st Circuit Court of Appeals", - ca2: "2nd Circuit Court of Appeals", - ca3: "3rd Circuit Court of Appeals", - ca4: "4th Circuit Court of Appeals", - ca5: "5th Circuit Court of Appeals", - ca6: "6th Circuit Court of Appeals", - ca7: "7th Circuit Court of Appeals", - ca8: "8th Circuit Court of Appeals", - ca9: "9th Circuit Court of Appeals", - ca10: "10th Circuit Court of Appeals", - ca11: "11th Circuit Court of Appeals", - cadc: "D.C. 
Circuit Court of Appeals", -}; - -// ─── Helpers ───────────────────────────────────────────────────────────────── - -function clHeaders(): Record { - const headers: Record = { - Accept: "application/json", - "User-Agent": "billion-scraper/1.0 (contact via github)", - }; - if (process.env.COURTLISTENER_API_KEY) { - headers["Authorization"] = `Token ${process.env.COURTLISTENER_API_KEY}`; - } - return headers; -} - -async function clFetch( - path: string, - params: Record = {}, -): Promise { - const url = new URL(`${CL_BASE}${path}`); - for (const [k, v] of Object.entries(params)) { - url.searchParams.set(k, String(v)); - } - - const res = await fetchWithRetry(url.toString(), { - headers: clHeaders(), - }); - return res.json() as Promise; -} - -function stripHtml(html: string): string { - return html - .replace(/<[^>]+>/g, " ") - .replace(/&/g, "&") - .replace(/</g, "<") - .replace(/>/g, ">") - .replace(/ /g, " ") - .replace(/\s{2,}/g, " ") - .trim(); -} - -function truncateWords(text: string, maxWords: number): string { - const words = text.split(/\s+/); - return words.length > maxWords ? words.slice(0, maxWords).join(" ") : text; -} - -async function fetchOpinionText( - subOpinionUrls: string[], -): Promise { - const fetched: { opinion: ClOpinion; text: string }[] = []; - - for (const url of subOpinionUrls) { - try { - const res = await fetchWithRetry(url, { headers: clHeaders() }); - const opinion = (await res.json()) as ClOpinion; - const text = ( - opinion.plain_text?.trim() || stripHtml(opinion.html ?? "") - ).trim(); - if (text.length > 0) { - fetched.push({ opinion, text }); - } - } catch { - // Skip failed sub-opinions - } - } - - if (fetched.length === 0) return undefined; - - const preferredTypes = new Set(["010combined", "020lead"]); - fetched.sort((a, b) => { - const aPref = preferredTypes.has(a.opinion.type) ? 0 : 1; - const bPref = preferredTypes.has(b.opinion.type) ? 0 : 1; - return aPref - bPref; - }); - - for (const { text } of fetched) { - if (text.length > 200) { - return truncateWords(text, 1000); - } - } - return undefined; -} - -// ─── Main ──────────────────────────────────────────────────────────────────── - -async function scrape(config: ScotusScraperConfig = {}) { - const { maxCases = 50, court = "scotus" } = config; - - const displayName = court === "scotus" ? "SCOTUS" : court.toUpperCase(); - log(displayName, `Starting (court=${court}, maxCases=${maxCases})...`); - resetMetrics(); - - // Step 1: fetch opinion clusters - const allClusters: ClCluster[] = []; - let page = 1; - const pageSize = 100; - - while (allClusters.length < maxCases) { - const pageData = await clFetch<{ - results: ClCluster[]; - next: string | null; - }>("/clusters/", { - court, - order_by: "-date_filed", - page_size: pageSize, - page, - }); - - const results = pageData.results ?? []; - allClusters.push(...results); - if (!pageData.next || results.length < pageSize) break; - page++; - } - - const clusters = allClusters.slice(0, maxCases); - log(displayName, `Fetched ${clusters.length} opinion clusters`); - - // Step 2: process each cluster - for (const cluster of clusters) { - try { - const docket = await clFetch( - `/dockets/${cluster.docket_id}/`, - ); - const docketNumber = docket.docket_number || `CL-${cluster.id}`; - const filedDate = docket.date_filed - ? new Date(docket.date_filed) - : undefined; - const courtCode = docket.court ?? court; - const courtName = COURT_NAMES[courtCode] ?? 
courtCode.toUpperCase(); - - const title = cluster.case_name?.slice(0, 250) || "Unknown Case"; - const status = cluster.precedential_status || "Unknown"; - const caseUrl = `https://www.courtlistener.com${cluster.absolute_url}`; - - const fullText = await fetchOpinionText(cluster.sub_opinions ?? []); - - const description = cluster.syllabus - ? stripHtml(cluster.syllabus).slice(0, 1000) || undefined - : undefined; - - await upsertContent({ - type: "court_case", - data: { - caseNumber: docketNumber, - title, - court: courtName, - filedDate, - description, - status, - fullText, - url: caseUrl, - }, - }); - - log(displayName, `Processed: ${docketNumber} — ${title}`); - } catch (error) { - logError(displayName, `Error processing cluster ${cluster.id}`, error); - } - } - - log(displayName, "Completed"); - printMetricsSummary(displayName); -} - -export const scotus: Scraper = { - name: NAME, - scrape: () => scrape(), -}; -``` - -- [ ] **Step 2: Verify it compiles** - -Run: `cd apps/scraper && npx tsc --noEmit` - -- [ ] **Step 3: Commit** - -```bash -git add apps/scraper/src/scrapers/scotus.ts -git commit -m "refactor(scraper): scotus uses shared fetchWithRetry + upsertContent + log" -``` - ---- - -### Task 9: Rewrite `main.ts` — runner loop - -**Files:** -- Modify: `apps/scraper/src/main.ts` - -- [ ] **Step 1: Rewrite `main.ts`** - -```ts -import { dirname, join } from "path"; -import { fileURLToPath } from "url"; -import dotenv from "dotenv"; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = dirname(__filename); - -dotenv.config({ path: join(__dirname, "../../../.env") }); -dotenv.config({ path: join(__dirname, "../.env") }); - -import { congress } from "./scrapers/congress.js"; -import { govtrack } from "./scrapers/govtrack.js"; -import { scotus } from "./scrapers/scotus.js"; -import { whitehouse } from "./scrapers/whitehouse.js"; -import type { Scraper } from "./utils/types.js"; - -const scrapers: Scraper[] = [govtrack, whitehouse, congress, scotus]; - -async function main() { - const arg = process.argv[2]?.toLowerCase(); - - if (arg && arg !== "all") { - const scraper = scrapers.find((s) => s.name.toLowerCase().replace(/[.\s]/g, "") === arg.replace(/[.\s]/g, "")); - if (!scraper) { - console.error( - `Unknown scraper: "${arg}". Available: ${scrapers.map((s) => s.name).join(", ")}, all`, - ); - process.exit(1); - } - await scraper.scrape(); - } else { - console.log("Running all scrapers...\n"); - for (const scraper of scrapers) { - await scraper.scrape(); - console.log("\n---\n"); - } - console.log("All scrapers completed."); - } -} - -main().catch((error) => { - console.error("Error running scrapers:", error); - process.exit(1); -}); -``` - -- [ ] **Step 2: Verify it compiles** - -Run: `cd apps/scraper && npx tsc --noEmit` - -- [ ] **Step 3: Commit** - -```bash -git add apps/scraper/src/main.ts -git commit -m "refactor(scraper): simplify main.ts to runner loop over Scraper objects" -``` - ---- - -### Task 10: Remove Crawlee + Playwright dependencies - -**Files:** -- Modify: `apps/scraper/package.json` -- Modify: `apps/scraper/tsconfig.json` - -- [ ] **Step 1: Add cheerio dependency, remove crawlee/playwright/@apify/tsconfig** - -Run: -```bash -cd apps/scraper && pnpm remove crawlee playwright @apify/tsconfig && pnpm add cheerio -``` - -- [ ] **Step 2: Update `package.json` description** - -In `apps/scraper/package.json`, change the `description` field from `"This is an example of a Crawlee project."` to `"Government data scraper for Billion app"`. 
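-
-After this edit the field should read as follows (excerpt only; the rest of `package.json` is unchanged, and surrounding punctuation may differ slightly):
-
-```json
-"description": "Government data scraper for Billion app"
-```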
- -- [ ] **Step 3: Verify tsconfig.json** - -The tsconfig extends `../../tooling/typescript/base.json` which is fine — `@apify/tsconfig` was a devDependency, not extended in tsconfig. No tsconfig changes needed. - -- [ ] **Step 4: Verify it compiles and all imports resolve** - -Run: `cd apps/scraper && npx tsc --noEmit` -Expected: Clean compile, no errors - -- [ ] **Step 5: Commit** - -```bash -git add apps/scraper/package.json apps/scraper/tsconfig.json pnpm-lock.yaml -git commit -m "chore(scraper): remove crawlee, playwright, @apify/tsconfig; add cheerio" -``` - ---- - -### Task 11: Update Dockerfile - -**Files:** -- Modify: `Dockerfile.scraper` (repo root) - -The Dockerfile no longer needs Playwright. It also gets simpler since we don't need the Crawlee storage directory. - -- [ ] **Step 1: Update Dockerfile.scraper** - -Replace the entire file: - -```dockerfile -# Build context: repo root -FROM node:20-slim AS builder - -ENV PNPM_HOME="/root/.local/share/pnpm" -ENV PATH="$PNPM_HOME:$PATH" -RUN corepack enable && corepack prepare pnpm@latest --activate - -WORKDIR /app -COPY pnpm-lock.yaml pnpm-workspace.yaml package.json ./ -COPY apps/scraper/package.json ./apps/scraper/package.json -COPY packages/db/package.json ./packages/db/package.json -COPY tooling/typescript/package.json ./tooling/typescript/package.json -RUN pnpm install --frozen-lockfile - -COPY tooling/typescript ./tooling/typescript -COPY packages/db/src ./packages/db/src -COPY packages/db/tsconfig.json ./packages/db/tsconfig.json -WORKDIR /app/packages/db -RUN pnpm exec tsc --emitDeclarationOnly false --skipLibCheck true && \ - find dist -name "*.js" -exec sed -i "s|from '\./\([^']*\)'|from './\1.js'|g" {} + && \ - find dist -name "*.js" -exec sed -i "s|from \"\./\([^\"]*\)\"|from \"./\1.js\"|g" {} + - -COPY apps/scraper/src /app/apps/scraper/src -COPY apps/scraper/tsconfig.json /app/apps/scraper/tsconfig.json -WORKDIR /app/apps/scraper -RUN pnpm run build - -# Final image -FROM node:20-slim - -ENV PNPM_HOME="/root/.local/share/pnpm" -ENV PATH="$PNPM_HOME:$PATH" -RUN apt-get update && apt-get install -y --no-install-recommends procps && rm -rf /var/lib/apt/lists/* -RUN corepack enable && corepack prepare pnpm@latest --activate - -WORKDIR /app -COPY pnpm-lock.yaml pnpm-workspace.yaml package.json ./ -COPY apps/scraper/package.json ./apps/scraper/package.json -COPY packages/db/package.json ./packages/db/package.json -RUN echo "enable-pre-post-scripts=true" >> .npmrc && pnpm install --frozen-lockfile --prod - -COPY --from=builder /app/apps/scraper/dist ./apps/scraper/dist -COPY --from=builder /app/packages/db/dist ./packages/db/dist - -# Rewrite db exports to use compiled dist/ instead of src/ -RUN node -e " \ - const p = require('./packages/db/package.json'); \ - Object.values(p.exports).forEach(e => { e.default = e.default.replace('./src/', './dist/').replace('.ts', '.js'); }); \ - require('fs').writeFileSync('./packages/db/package.json', JSON.stringify(p, null, 2)); \ -" - -WORKDIR /app/apps/scraper -CMD ["pnpm", "run", "start:prod"] -``` - -Note: This is essentially the same Dockerfile — the only real change is that `crawlee` and `playwright` are no longer in `package.json` so they won't be installed. The `.dockerignore` `storage` entry for Crawlee storage is now irrelevant but harmless. 
- -- [ ] **Step 2: Commit** - -```bash -git add Dockerfile.scraper -git commit -m "chore(scraper): update Dockerfile after removing Crawlee/Playwright" -``` - ---- - -### Task 12: Smoke test - -- [ ] **Step 1: Full compile check** - -Run: `cd apps/scraper && npx tsc --noEmit` -Expected: Clean compile, zero errors - -- [ ] **Step 2: Dry run with a single scraper** - -Run: `cd apps/scraper && pnpm run start:dev govtrack` -Expected: Scraper runs, fetches listing page, scrapes bill text pages, logs with `[HH:MM:SS] [GovTrack]` prefix, prints metrics summary. Verify no Crawlee references in output. - -- [ ] **Step 3: Verify no Crawlee imports remain** - -Run: `grep -r "crawlee" apps/scraper/src/` -Expected: No matches - -- [ ] **Step 4: Commit any final fixes if needed** diff --git a/docs/superpowers/specs/2026-03-30-scraper-refactor-design.md b/docs/superpowers/specs/2026-03-30-scraper-refactor-design.md deleted file mode 100644 index 6345e5a..0000000 --- a/docs/superpowers/specs/2026-03-30-scraper-refactor-design.md +++ /dev/null @@ -1,118 +0,0 @@ -# Scraper Architecture Refactor - -## Goal - -Replace Crawlee with a hand-rolled approach to reduce complexity, dependencies, and learning surface while keeping reliability. The result is a simpler, more unified codebase where all scrapers follow the same patterns. - -## What Changes - -### Drop Crawlee + Playwright - -Crawlee is only used by 2 of 4 scrapers (govtrack, whitehouse) for a pattern that amounts to: fetch HTML, parse with Cheerio, follow links. Replace with `fetch` + `cheerio` directly. - -**Removed dependencies:** `crawlee`, `playwright`, `@apify/tsconfig` - -### New: `src/utils/fetch.ts` — `fetchWithRetry()` - -Single shared fetch utility (~30 lines). All four scrapers use this. - -- Configurable max retries (default 3) -- Exponential backoff -- Honors `Retry-After` header -- Retries on 429 and 5xx -- Configurable timeout via `AbortSignal.timeout` (default 30s) -- Returns standard `Response` - -### New: `src/utils/log.ts` — `log(scraperName, message)` - -Thin wrapper over `console.log` that prefixes scraper name + timestamp. Replace all scattered `console.log`/`console.error` calls with this. - -### Changed: `src/utils/db/operations.ts` — Unified `upsertContent()` - -Merge `upsertBill()`, `upsertGovernmentContent()`, `upsertCourtCase()` into a single `upsertContent(type, data)` that switches on content type internally. DB schema stays the same (three separate tables). The shared logic: - -1. Hash content -2. Check if exists + compare hash -3. Conditionally generate AI summary/article/thumbnail -4. Upsert to correct table -5. Generate video - -### Changed: `src/scrapers/govtrack.ts` and `src/scrapers/whitehouse.ts` - -Replace `CheerioCrawler` with direct `fetchWithRetry()` + `cheerio.load()`. Each scraper implements its own fetching pattern (listing page, pagination, detail pages) — no shared crawl abstraction, since the two are different enough that abstracting adds more complexity than it removes. - -### Changed: `src/main.ts` — Runner loop - -```ts -const scrapers: Scraper[] = [congress, govtrack, whitehouse, scotus] - -const selected = parseArgs(process.argv) -for (const scraper of selected) { - resetMetrics() - await scraper.scrape() - printMetricsSummary(scraper.name) -} -``` - -Each scraper conforms to: - -```ts -type Scraper = { - name: string - scrape: (config?) => Promise -} -``` - -Scrapers return `void` because they call `upsertContent()` as they go — no need to buffer all results in memory. 
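-
-For concreteness, here is a minimal sketch of the `fetchWithRetry()` helper described above. It is illustrative only: the option names (`maxRetries`, `timeoutMs`) and the exact backoff curve are assumptions, not the final API.
-
-```ts
-// Sketch of src/utils/fetch.ts: retries 429/5xx with exponential backoff,
-// honors Retry-After, times out each attempt via AbortSignal.timeout,
-// and returns a standard Response.
-interface FetchRetryOptions extends RequestInit {
-  maxRetries?: number; // default: 3 retries after the first attempt (assumed name)
-  timeoutMs?: number; // default: 30s per attempt (assumed name)
-}
-
-export async function fetchWithRetry(
-  url: string,
-  { maxRetries = 3, timeoutMs = 30_000, ...init }: FetchRetryOptions = {},
-): Promise<Response> {
-  for (let attempt = 0; ; attempt++) {
-    const res = await fetch(url, {
-      ...init,
-      signal: AbortSignal.timeout(timeoutMs),
-    });
-
-    const retryable = res.status === 429 || res.status >= 500;
-    if (!retryable || attempt >= maxRetries) return res;
-
-    // Prefer the server's Retry-After (seconds); otherwise back off 1s, 2s, 4s, ...
-    const retryAfter = Number(res.headers.get("retry-after"));
-    const delayMs =
-      Number.isFinite(retryAfter) && retryAfter > 0
-        ? retryAfter * 1000
-        : 2 ** attempt * 1000;
-    await new Promise((resolve) => setTimeout(resolve, delayMs));
-  }
-}
-```
-
-Because it returns a plain `Response`, each scraper keeps full control over whether to call `.text()` (HTML scrapers) or `.json()` (API scrapers) on the result.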
- -## What Stays the Same - -- All AI generation (`src/utils/ai/`) — unchanged -- Google Images API (`src/utils/api/`) — unchanged -- Video operations (`src/utils/db/video-operations.ts`) — unchanged -- DB helpers (`src/utils/db/helpers.ts`) — unchanged -- Metrics (`src/utils/db/metrics.ts`) — unchanged -- Types and hash utilities — unchanged -- `retroactive-videos.ts` — unchanged -- DB schema (three separate tables) — unchanged - -## File Structure - -``` -src/ -├── main.ts # Runner: parse args, loop scrapers, print metrics -├── scrapers/ -│ ├── congress.ts # Congress.gov API -│ ├── govtrack.ts # GovTrack HTML (fetch + cheerio) -│ ├── whitehouse.ts # Whitehouse HTML (fetch + cheerio + turndown) -│ └── scotus.ts # CourtListener API -├── utils/ -│ ├── types.ts -│ ├── hash.ts -│ ├── fetch.ts # NEW -│ ├── log.ts # NEW -│ ├── db/ -│ │ ├── operations.ts # CHANGED: unified upsertContent() -│ │ ├── video-operations.ts -│ │ ├── helpers.ts -│ │ └── metrics.ts -│ ├── api/ -│ │ └── google-images.ts -│ └── ai/ -│ ├── text-generation.ts -│ ├── image-generation.ts -│ ├── image-keywords.ts -│ └── marketing-generation.ts -├── retroactive-videos.ts -``` - -## Resumability - -AI generation is already guarded by content hashing at the DB layer — unchanged content skips all AI calls. This means a crashed scraper can restart from scratch without re-running expensive AI generation. Fetch-level resumability (tracking visited URLs) is out of scope for now but could be added later by persisting a URL set to disk. - -## Out of Scope - -- DB schema changes (merging tables) -- Fetch-level resumability / URL persistence -- Structured/JSON logging -- New scraper sources From 33c3decc4af0e064bda727ad5b1cd452aece95a7 Mon Sep 17 00:00:00 2001 From: ThatXliner Date: Sat, 4 Apr 2026 00:49:40 -0700 Subject: [PATCH 09/11] :bug: fix(storage): defer env read to call time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SUPABASE_STORAGE_BUCKET was captured at import time, before the migration script dotenv.config() could populate it — uploads silently used the default bucket. Co-Authored-By: Claude Opus 4.6 --- packages/db/src/storage.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/packages/db/src/storage.ts b/packages/db/src/storage.ts index fe70ed7..7df55e4 100644 --- a/packages/db/src/storage.ts +++ b/packages/db/src/storage.ts @@ -1,6 +1,8 @@ import { createClient } from "@supabase/supabase-js"; -const BUCKET = process.env.SUPABASE_STORAGE_BUCKET ?? "images"; +function getBucket() { + return process.env.SUPABASE_STORAGE_BUCKET ?? 
"images"; +} function getClient() { const url = process.env.SUPABASE_URL; @@ -27,7 +29,7 @@ export async function uploadImage( ): Promise { const supabase = getClient(); - const { error } = await supabase.storage.from(BUCKET).upload(path, data, { + const { error } = await supabase.storage.from(getBucket()).upload(path, data, { contentType: mimeType, upsert: true, }); @@ -38,7 +40,7 @@ export async function uploadImage( const { data: { publicUrl }, - } = supabase.storage.from(BUCKET).getPublicUrl(path); + } = supabase.storage.from(getBucket()).getPublicUrl(path); return publicUrl; } @@ -49,7 +51,7 @@ export async function uploadImage( */ export async function deleteImage(path: string): Promise { const supabase = getClient(); - const { error } = await supabase.storage.from(BUCKET).remove([path]); + const { error } = await supabase.storage.from(getBucket()).remove([path]); if (error) { throw new Error(`Storage delete failed for ${path}: ${error.message}`); } From 152a24c5338aab0006bf2039dac914eb90dd3582 Mon Sep 17 00:00:00 2001 From: ThatXliner Date: Sat, 4 Apr 2026 00:49:45 -0700 Subject: [PATCH 10/11] :bug: fix(scraper): clear legacy image columns on upload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a new image URL was saved, the old imageData/imageMimeType/ imageWidth/imageHeight columns were left populated — wasting storage and preventing the migration script from skipping them. Also removes unused deleteImage import. Co-Authored-By: Claude Opus 4.6 --- apps/scraper/src/utils/db/video-operations.ts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/apps/scraper/src/utils/db/video-operations.ts b/apps/scraper/src/utils/db/video-operations.ts index 5af9792..079ef08 100644 --- a/apps/scraper/src/utils/db/video-operations.ts +++ b/apps/scraper/src/utils/db/video-operations.ts @@ -5,7 +5,7 @@ import { db } from '@acme/db/client'; import { Video } from '@acme/db/schema'; -import { uploadImage, deleteImage } from '@acme/db/storage'; +import { uploadImage } from '@acme/db/storage'; import { and, eq } from '@acme/db'; import { generateMarketingCopy } from '../ai/marketing-generation.js'; import { generateImage, convertToJpeg } from '../ai/image-generation.js'; @@ -129,7 +129,13 @@ export async function generateVideoForContent( try { await db .update(Video) - .set({ imageUrl }) + .set({ + imageUrl, + imageData: null, + imageMimeType: null, + imageWidth: null, + imageHeight: null, + }) .where(and(eq(Video.contentType, contentType), eq(Video.contentId, contentId))); logger.debug(`Uploaded image to ${storagePath}`); } catch (error) { From 7966e8beb241299d142d1b27cc7bd5d956db8f94 Mon Sep 17 00:00:00 2001 From: ThatXliner Date: Sat, 4 Apr 2026 00:49:50 -0700 Subject: [PATCH 11/11] :pencil2: fix(db): correct schema comment for image fields Co-Authored-By: Claude Opus 4.6 --- packages/db/src/schema.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/db/src/schema.ts b/packages/db/src/schema.ts index aca8228..d6e8fdc 100644 --- a/packages/db/src/schema.ts +++ b/packages/db/src/schema.ts @@ -176,7 +176,7 @@ export const Video = pgTable( title: t.varchar({ length: 25 }).notNull(), // Max 25 chars description: t.text().notNull(), // 50-word catchy headline - // Image storage: source thumbnail URL (scraped) + // Image storage: uploaded image URL and source thumbnail URL (scraped) imageUrl: t.text(), // Public URL of uploaded image thumbnailUrl: t.text(), // URL from source content (scraped)