diff --git a/apps/scraper/README.md b/apps/scraper/README.md index bcea339..8e10b75 100644 --- a/apps/scraper/README.md +++ b/apps/scraper/README.md @@ -25,6 +25,9 @@ Then fill in the values. The ones you need for scraping: | Variable | Required | Where to get it | |---|---|---| | `POSTGRES_URL` | ✅ | Your Supabase project settings | +| `GOOGLE_VERTEX_PROJECT` | ✅ | Your Google Cloud project id for Vertex AI | +| `GOOGLE_VERTEX_LOCATION` | ✅ | Your Vertex AI region, e.g. `us-central1` | +| `GOOGLE_VERTEX_API_KEY` | Optional | Vertex AI Express Mode API key | | `OPENAI_API_KEY` | ✅ | [platform.openai.com](https://platform.openai.com) | | `CONGRESS_API_KEY` | ✅ | Free at [api.congress.gov/sign-up](https://api.congress.gov/sign-up/) | | `COURTLISTENER_API_KEY` | Optional | Free at [courtlistener.com](https://www.courtlistener.com/sign-in/) — only needed for higher rate limits | @@ -82,4 +85,6 @@ All scrapers call into `src/utils/db/operations.ts`. Each time a bill or case is - If it's **new** → saves it and generates an AI article + thumbnail - If the **content changed** → regenerates the article -- If **nothing changed** → skips AI generation entirely (saves API costs) \ No newline at end of file +- If **nothing changed** → backfills any missing AI summary/article/thumbnail fields, otherwise skips AI generation + +Set `SCRAPER_FORCE_AI_REGEN=1` to force a full AI refresh even when the record already has AI content. diff --git a/apps/scraper/package.json b/apps/scraper/package.json index 3dc7f72..14c24fc 100644 --- a/apps/scraper/package.json +++ b/apps/scraper/package.json @@ -6,11 +6,11 @@ "dependencies": { "@acme/db": "workspace:*", "@ai-sdk/google": "^3.0.53", + "@ai-sdk/google-vertex": "^4.0.105", "ai": "^6.0.141", "cheerio": "^1.2.0", "consola": "^3.4.2", "dotenv": "^17.3.1", - "openai": "^6.33.0", "p-limit": "^7.3.0", "sharp": "^0.34.5", "turndown": "^7.2.2", diff --git a/apps/scraper/run.ts b/apps/scraper/run.ts index 32d52e2..dacb9d2 100644 --- a/apps/scraper/run.ts +++ b/apps/scraper/run.ts @@ -28,6 +28,9 @@ printHeader("Environment"); printKeyValue("POSTGRES_URL", check(process.env.POSTGRES_URL)); printKeyValue("PEXELS_API_KEY", check(process.env.PEXELS_API_KEY)); printKeyValue("OPENAI_API_KEY", check(process.env.OPENAI_API_KEY)); +printKeyValue("GOOGLE_VERTEX_PROJECT", check(process.env.GOOGLE_VERTEX_PROJECT)); +printKeyValue("GOOGLE_VERTEX_LOCATION", check(process.env.GOOGLE_VERTEX_LOCATION)); +printKeyValue("GOOGLE_VERTEX_API_KEY", check(process.env.GOOGLE_VERTEX_API_KEY)); printFooter(); // Now import and run main diff --git a/apps/scraper/src/scrapers/federalregister.ts b/apps/scraper/src/scrapers/federalregister.ts index 8565794..751579a 100644 --- a/apps/scraper/src/scrapers/federalregister.ts +++ b/apps/scraper/src/scrapers/federalregister.ts @@ -100,7 +100,7 @@ async function scrape() { title: doc.title, type: contentType, publishedDate, - description: doc.abstract ?? undefined, + description: fullText ? undefined : (doc.abstract ?? undefined), fullText, url: doc.html_url, source: "federalregister.gov", diff --git a/apps/scraper/src/utils/ai/image-generation.ts b/apps/scraper/src/utils/ai/image-generation.ts index 5f68d86..651630f 100644 --- a/apps/scraper/src/utils/ai/image-generation.ts +++ b/apps/scraper/src/utils/ai/image-generation.ts @@ -1,17 +1,16 @@ /** - * AI image generation using OpenAI DALL-E + * AI image generation using Google Vertex AI Imagen 3 * Generates images from text prompts and converts them to JPEG format */ -import OpenAI from 'openai'; +import { generateImage as aiGenerateImage } from 'ai'; +import { vertexProvider } from './provider.js'; import { createLogger } from '../log.js'; -import { trackDalle3Image } from '../costs.js'; +import { trackImagenImage } from '../costs.js'; import { AIRateLimitError, setRateLimitHit } from './text-generation.js'; const logger = createLogger("image"); -const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); - export interface GeneratedImage { data: Buffer; mimeType: string; @@ -27,7 +26,7 @@ async function sleep(ms: number): Promise { } /** - * Generate an image using DALL-E 3 with retry logic for rate limits + * Generate an image using Vertex AI Imagen 3 with retry logic for rate limits * @param prompt - Text description of desired image * @param maxRetries - Maximum number of retry attempts (default: 3) * @returns Generated image as Buffer with metadata, or null if generation fails @@ -43,56 +42,48 @@ export async function generateImage( if (attempt > 0) { logger.warn(`Retry attempt ${attempt}/${maxRetries} for image generation`); } else { - logger.start(`Generating image with DALL-E 3: ${prompt.substring(0, 50)}...`); + logger.start(`Generating image with Imagen 3: ${prompt.substring(0, 50)}...`); } - // DALL-E 3 for quality - const response = await openai.images.generate({ - model: 'dall-e-3', - prompt: `Professional news photography: ${prompt}. Photorealistic, high quality, journalistic style.`, - size: '1024x1024', - quality: 'standard', - response_format: 'url', + const result = await aiGenerateImage({ + model: vertexProvider.image('imagen-3.0-generate-001'), + prompt: `Premium editorial photography: ${prompt}. Cinematic lighting, vibrant color palette, masterpiece composition, 8k resolution, highly detailed, expressive and dynamic.`, + aspectRatio: '1:1', + providerOptions: { + vertex: { sampleCount: 1 }, + }, }); - if (!response.data?.[0]?.url) { - logger.error('No image URL returned from DALL-E'); - return null; - } - - const imageUrl = response.data[0].url; - - // Download image to buffer (URLs expire after 1 hour, need to store permanently) - const imageResponse = await fetch(imageUrl); - if (!imageResponse.ok) { - logger.error(`Failed to download image: ${imageResponse.status}`); - return null; - } - - const buffer = Buffer.from(await imageResponse.arrayBuffer()); + // Imagen returns base64-encoded bytes directly — no URL download needed + const buffer = Buffer.from(result.image.base64, 'base64'); - trackDalle3Image(); + trackImagenImage(); logger.success(`Image generated: ${buffer.length} bytes`); return { data: buffer, - mimeType: 'image/png', // DALL-E returns PNG + mimeType: (result.image as any).mimeType ?? 'image/png', width: 1024, height: 1024, }; } catch (error) { lastError = error instanceof Error ? error : new Error(String(error)); - // Check if error is due to content policy violation (don't retry) - if (lastError.message.includes('content_policy_violation')) { - logger.warn(`Image generation blocked by content filter for prompt: ${prompt.substring(0, 100)}...`); + // Imagen safety filter block (don't retry) + if ( + lastError.message.includes('SAFETY') || + lastError.message.includes('blocked') || + lastError.message.includes('content_filter') + ) { + logger.warn(`Image generation blocked by safety filter for prompt: ${prompt.substring(0, 100)}...`); return null; } - // Check for rate limit errors (429 or rate_limit_exceeded) + // Check for rate limit errors (429 or RESOURCE_EXHAUSTED) const isRateLimitError = - lastError.message.includes('rate_limit_exceeded') || + lastError.message.includes('RESOURCE_EXHAUSTED') || lastError.message.includes('429') || + lastError.message.includes('rate_limit_exceeded') || lastError.message.includes('Rate limit'); if (isRateLimitError && attempt < maxRetries) { diff --git a/apps/scraper/src/utils/ai/image-keywords.ts b/apps/scraper/src/utils/ai/image-keywords.ts index 51ce219..324a5b9 100644 --- a/apps/scraper/src/utils/ai/image-keywords.ts +++ b/apps/scraper/src/utils/ai/image-keywords.ts @@ -1,14 +1,14 @@ /** * AI-powered image keyword generation - * Uses OpenAI to extract visual concepts for image search + * Uses Google Vertex AI to extract visual concepts for image search */ -import { google } from '@ai-sdk/google'; import { generateText, APICallError, RetryError } from 'ai'; import { AIRateLimitError, rateLimitHit, setRateLimitHit } from './text-generation.js'; import { createLogger } from '../log.js'; import { trackGeminiUsage } from '../costs.js'; +import { vertexProvider } from './provider.js'; const logger = createLogger("ai"); @@ -44,20 +44,22 @@ export async function generateImageSearchKeywords( } try { const { text, usage } = await generateText({ - model: google('gemini-2.5-flash'), - prompt: `Given this ${type} title and content, generate 2-4 search keywords for finding relevant stock photos. Focus on concrete, visual, photographic concepts that would actually appear in news photography or documentary images. + model: vertexProvider('gemini-2.5-flash'), + prompt: `Given this ${type} title and content, generate 2-4 search keywords for finding visually striking, high-end editorial stock photos. Focus on dramatic, cinematic, and photographic concepts that feel professional and modern. -GOOD examples (specific, visual, photographic): -- capitol building washington dc -- hospital doctor medical equipment -- construction workers infrastructure -- classroom students education -- solar panels renewable energy +GOOD examples (specific, dynamic, visual): +- dramatic capitol building sunset +- surgical team intense motion +- worker silhouette infrastructure +- vibrant classroom activity +- cinematic solar farm aerial -BAD examples (too abstract, no clear visual): -- government policy legislation -- economic impact financial -- social justice equality +BAD examples (generic, static): +- capitol building +- doctor +- construction site +- students +- solar panels Title: ${title} diff --git a/apps/scraper/src/utils/ai/marketing-generation.ts b/apps/scraper/src/utils/ai/marketing-generation.ts index 59236cb..750ea71 100644 --- a/apps/scraper/src/utils/ai/marketing-generation.ts +++ b/apps/scraper/src/utils/ai/marketing-generation.ts @@ -1,14 +1,14 @@ /** - * AI marketing content generation using OpenAI + * AI marketing content generation using Google Vertex AI * Generates compelling social media titles, descriptions, and image prompts */ -import { google } from "@ai-sdk/google"; import { generateObject, APICallError, RetryError } from "ai"; import { z } from "zod"; import { createLogger } from "../log.js"; import { trackGeminiUsage } from "../costs.js"; import { AIRateLimitError, rateLimitHit, setRateLimitHit } from "./text-generation.js"; +import { vertexProvider } from "./provider.js"; function isRateLimitError(error: unknown): boolean { if (error instanceof APICallError) return error.statusCode === 429; @@ -21,7 +21,7 @@ function isRateLimitError(error: unknown): boolean { const logger = createLogger("ai"); const MarketingCopySchema = z.object({ - title: z.string().max(100), + title: z.string().max(25), // Must match Video.title varchar(25) DB constraint description: z.string(), imagePrompt: z.string(), }); @@ -47,7 +47,7 @@ export async function generateMarketingCopy( logger.start(`Generating marketing copy for: ${articleTitle}`); const { object, usage } = await generateObject({ - model: google("gemini-2.5-flash"), + model: vertexProvider("gemini-2.5-flash"), schema: MarketingCopySchema, prompt: `You are a professional marketing copywriter creating engaging social media content. @@ -55,8 +55,8 @@ Create compelling marketing copy for this ${contentType} to be displayed in a so Requirements: 1. "title": Compelling, attention-grabbing title (MUST be 25 characters or less) -2. "description": Engaging 50-word description that makes people want to learn more. Write in an accessible, conversational tone. -3. "imagePrompt": Detailed prompt for AI image generation (describe a visually striking, photorealistic image that captures the essence of this content) +2. "description": A very short (max 25 words) summary for a mobile feed. Write in simple, plain English (8th-grade level). Focus on the "so what?"—why should a regular person care? No jargon. +3. "imagePrompt": A creative, high-energy, and visually arresting scene description that captures the *essence* of the story. Instead of literal office buildings or meetings, focus on dramatic metaphors, intense human emotion, or dynamic action. Use vivid color descriptions and interesting perspectives (e.g., extreme close-ups, wide cinematic shots, or dramatic low angles). Avoid text, icons, or stereotypical stock photo tropes. Article Title: ${articleTitle} Content Preview: ${articleContent.substring(0, 1000)}`, @@ -75,7 +75,7 @@ Content Preview: ${articleContent.substring(0, 1000)}`, return { title: articleTitle.substring(0, 25), description: articleContent.substring(0, 200) + "...", - imagePrompt: `professional news photography about ${articleTitle}`, + imagePrompt: `A dynamic, cinematic editorial photo about ${articleTitle}. Dramatic lighting, vivid colors.`, }; } } diff --git a/apps/scraper/src/utils/ai/provider.ts b/apps/scraper/src/utils/ai/provider.ts new file mode 100644 index 0000000..0b52fc7 --- /dev/null +++ b/apps/scraper/src/utils/ai/provider.ts @@ -0,0 +1,12 @@ +import { createVertex } from "@ai-sdk/google-vertex"; + +const project = process.env.GOOGLE_VERTEX_PROJECT; +const location = process.env.GOOGLE_VERTEX_LOCATION; +const apiKey = process.env.GOOGLE_VERTEX_API_KEY; + +export const vertexProvider = createVertex({ + ...(project ? { project } : {}), + ...(location ? { location } : {}), + ...(apiKey ? { apiKey } : {}), +}); + diff --git a/apps/scraper/src/utils/ai/text-generation.ts b/apps/scraper/src/utils/ai/text-generation.ts index fd5f776..cc3ec32 100644 --- a/apps/scraper/src/utils/ai/text-generation.ts +++ b/apps/scraper/src/utils/ai/text-generation.ts @@ -1,12 +1,12 @@ /** - * AI text generation utilities using OpenAI + * AI text generation utilities using Google Vertex AI * Generates summaries and full articles from government content */ -import { google } from '@ai-sdk/google'; import { generateText, APICallError, RetryError } from 'ai'; import { createLogger } from '../log.js'; import { trackGeminiUsage } from '../costs.js'; +import { vertexProvider } from './provider.js'; const logger = createLogger("ai"); @@ -53,8 +53,12 @@ export async function generateAISummary( } try { const { text, usage } = await generateText({ - model: google('gemini-2.5-flash'), - prompt: `Generate a concise, engaging summary (max 100 characters) for this government content. Focus on the key action or impact. + model: vertexProvider('gemini-2.5-flash'), + prompt: `You are an expert at simplifying complex government and legal jargon for a general audience. +Generate a very short, punchy summary (max 100 characters) for this content. + +Goal: Tell a regular person "what happened" or "what changed" in one quick sentence. +Style: Use active voice, plain English (8th-grade level), and NO jargon. Focus on the direct impact. Title: ${title} @@ -96,13 +100,13 @@ export async function generateAIArticle( logger.start(`Generating AI article for: ${title}`); const { text, usage } = await generateText({ - model: google('gemini-2.5-flash'), + model: vertexProvider('gemini-2.5-flash'), prompt: `You are an expert at making government and legal content accessible and engaging for everyday people. Transform the following ${type} into a well-structured, markdown-formatted article. **Structure your article with these 4 sections:** ## What This Means For You -Write 2-3 concise sentences (max 150 words) that immediately tell everyday people what this means for their lives. Use plain language, avoid jargon, and focus on direct impact. Make it relatable and concrete. +Write 1-2 very short, punchy sentences (max 50 words) that immediately tell a regular person how this affects their life. Use 5th-8th grade reading level. Completely avoid legal or technical terms. Focus on the "so what?"—the direct, practical result for everyday people. Make it feel human and relevant. ## Overview Provide a balanced, neutral, and informative explanation of what this ${type} is about. Use engaging storytelling elements while remaining objective. Break down complex concepts, define technical terms, and provide context. Make it interesting to read while being thorough. Aim for 200-400 words. diff --git a/apps/scraper/src/utils/costs.ts b/apps/scraper/src/utils/costs.ts index 8e7b46c..bee6205 100644 --- a/apps/scraper/src/utils/costs.ts +++ b/apps/scraper/src/utils/costs.ts @@ -12,8 +12,8 @@ const PRICES = { // Gemini 2.5 Flash — $/1M tokens geminiFlashInput: Number(process.env.GEMINI_FLASH_INPUT_PRICE) || 0.15, geminiFlashOutput: Number(process.env.GEMINI_FLASH_OUTPUT_PRICE) || 0.60, - // DALL-E 3 — $/image (1024x1024, standard) - dalle3Image: Number(process.env.DALLE3_IMAGE_PRICE) || 0.04, + // Imagen 3 — $/image (1:1 aspect ratio) + imagenImage: Number(process.env.IMAGEN_IMAGE_PRICE) || 0.03, // Google Custom Search — $/query (after free tier) googleSearch: Number(process.env.GOOGLE_SEARCH_PRICE) || 0.005, }; @@ -21,14 +21,14 @@ const PRICES = { interface CostState { geminiInputTokens: number; geminiOutputTokens: number; - dalle3Images: number; + imagenImages: number; googleSearches: number; } let state: CostState = { geminiInputTokens: 0, geminiOutputTokens: 0, - dalle3Images: 0, + imagenImages: 0, googleSearches: 0, }; @@ -36,7 +36,7 @@ export function resetCosts(): void { state = { geminiInputTokens: 0, geminiOutputTokens: 0, - dalle3Images: 0, + imagenImages: 0, googleSearches: 0, }; } @@ -49,8 +49,8 @@ export function trackGeminiUsage( state.geminiOutputTokens += outputTokens ?? 0; } -export function trackDalle3Image(): void { - state.dalle3Images++; +export function trackImagenImage(): void { + state.imagenImages++; } export function trackGoogleSearch(): void { @@ -60,10 +60,10 @@ export function trackGoogleSearch(): void { export interface CostSummary { geminiInputTokens: number; geminiOutputTokens: number; - dalle3Images: number; + imagenImages: number; googleSearches: number; geminiCost: number; - dalle3Cost: number; + imagenCost: number; googleSearchCost: number; totalCost: number; } @@ -72,14 +72,14 @@ export function getCostSummary(): CostSummary { const geminiCost = (state.geminiInputTokens / 1_000_000) * PRICES.geminiFlashInput + (state.geminiOutputTokens / 1_000_000) * PRICES.geminiFlashOutput; - const dalle3Cost = state.dalle3Images * PRICES.dalle3Image; + const imagenCost = state.imagenImages * PRICES.imagenImage; const googleSearchCost = state.googleSearches * PRICES.googleSearch; return { ...state, geminiCost, - dalle3Cost, + imagenCost, googleSearchCost, - totalCost: geminiCost + dalle3Cost + googleSearchCost, + totalCost: geminiCost + imagenCost + googleSearchCost, }; } diff --git a/apps/scraper/src/utils/db/helpers.ts b/apps/scraper/src/utils/db/helpers.ts index 385db6e..0d0ce7a 100644 --- a/apps/scraper/src/utils/db/helpers.ts +++ b/apps/scraper/src/utils/db/helpers.ts @@ -3,7 +3,7 @@ * Check for existing records before performing expensive operations */ -import { eq, and, isNull } from '@acme/db'; +import { eq, and, isNull, or } from '@acme/db'; import { db } from '@acme/db/client'; import { Bill, GovernmentContent, CourtCase, Video } from '@acme/db/schema'; import type { ExistingRecordCheck } from '../types.js'; @@ -25,6 +25,7 @@ export async function checkExistingBill( const [existing] = await db .select({ contentHash: Bill.contentHash, + description: Bill.description, aiGeneratedArticle: Bill.aiGeneratedArticle, thumbnailUrl: Bill.thumbnailUrl, }) @@ -39,6 +40,7 @@ export async function checkExistingBill( return { exists: true, contentHash: existing.contentHash, + description: existing.description, hasArticle: !!existing.aiGeneratedArticle, hasThumbnail: !!existing.thumbnailUrl, }; @@ -60,6 +62,7 @@ export async function checkExistingGovernmentContent( const [existing] = await db .select({ contentHash: GovernmentContent.contentHash, + description: GovernmentContent.description, aiGeneratedArticle: GovernmentContent.aiGeneratedArticle, thumbnailUrl: GovernmentContent.thumbnailUrl, }) @@ -74,6 +77,7 @@ export async function checkExistingGovernmentContent( return { exists: true, contentHash: existing.contentHash, + description: existing.description, hasArticle: !!existing.aiGeneratedArticle, hasThumbnail: !!existing.thumbnailUrl, }; @@ -95,6 +99,7 @@ export async function checkExistingCourtCase( const [existing] = await db .select({ contentHash: CourtCase.contentHash, + description: CourtCase.description, aiGeneratedArticle: CourtCase.aiGeneratedArticle, thumbnailUrl: CourtCase.thumbnailUrl, }) @@ -109,6 +114,7 @@ export async function checkExistingCourtCase( return { exists: true, contentHash: existing.contentHash, + description: existing.description, hasArticle: !!existing.aiGeneratedArticle, hasThumbnail: !!existing.thumbnailUrl, }; @@ -141,7 +147,16 @@ export async function findArticlesWithoutVideos( }) .from(Bill) .leftJoin(Video, and(eq(Video.contentType, 'bill'), eq(Video.contentId, Bill.id))) - .where(isNull(Video.id)) + .where( + or( + isNull(Video.id), + and( + eq(Video.contentType, 'bill'), + isNull(Video.imageData), + isNull(Video.thumbnailUrl), + ), + ), + ) .limit(limit); return billsWithoutVideos; @@ -157,7 +172,16 @@ export async function findArticlesWithoutVideos( }) .from(GovernmentContent) .leftJoin(Video, and(eq(Video.contentType, 'government_content'), eq(Video.contentId, GovernmentContent.id))) - .where(isNull(Video.id)) + .where( + or( + isNull(Video.id), + and( + eq(Video.contentType, 'government_content'), + isNull(Video.imageData), + isNull(Video.thumbnailUrl), + ), + ), + ) .limit(limit); return contentWithoutVideos; @@ -172,7 +196,16 @@ export async function findArticlesWithoutVideos( }) .from(CourtCase) .leftJoin(Video, and(eq(Video.contentType, 'court_case'), eq(Video.contentId, CourtCase.id))) - .where(isNull(Video.id)) + .where( + or( + isNull(Video.id), + and( + eq(Video.contentType, 'court_case'), + isNull(Video.imageData), + isNull(Video.thumbnailUrl), + ), + ), + ) .limit(limit); return casesWithoutVideos.map(c => ({ ...c, source: 'court' })); diff --git a/apps/scraper/src/utils/db/metrics.ts b/apps/scraper/src/utils/db/metrics.ts index 7227087..c930929 100644 --- a/apps/scraper/src/utils/db/metrics.ts +++ b/apps/scraper/src/utils/db/metrics.ts @@ -138,8 +138,8 @@ export function printMetricsSummary(scraperName: string): void { const totalTokens = costs.geminiInputTokens + costs.geminiOutputTokens; printKeyValue("Gemini tokens", `${totalTokens.toLocaleString()} (${formatUsd(costs.geminiCost)})`); } - if (costs.dalle3Images > 0) { - printKeyValue("DALL-E 3 images", `${costs.dalle3Images} (${formatUsd(costs.dalle3Cost)})`); + if (costs.imagenImages > 0) { + printKeyValue("Imagen 3 images", `${costs.imagenImages} (${formatUsd(costs.imagenCost)})`); } if (costs.googleSearches > 0) { printKeyValue("Google searches", `${costs.googleSearches} (${formatUsd(costs.googleSearchCost)})`); diff --git a/apps/scraper/src/utils/db/operations.ts b/apps/scraper/src/utils/db/operations.ts index 9eb8b2c..3b67c80 100644 --- a/apps/scraper/src/utils/db/operations.ts +++ b/apps/scraper/src/utils/db/operations.ts @@ -32,6 +32,7 @@ import { tickProgress } from "../progress.js"; import { createLogger } from "../log.js"; const logger = createLogger("db"); +const forceAIRegeneration = process.env.SCRAPER_FORCE_AI_REGEN === "1"; function isUsableText(text: string | undefined | null): text is string { if (!text || text.length < 200) return false; @@ -40,15 +41,21 @@ function isUsableText(text: string | undefined | null): text is string { const lines = text.split("\n"); const boilerplateLines = lines.filter((line) => { const trimmed = line.trim(); - return ( - trimmed === "" || - trimmed.split(/\s+/).length === 1 || - (/[a-zA-Z]/.test(trimmed) && - trimmed === trimmed.toUpperCase() && - trimmed.length > 2) - ); + // Blank lines + if (trimmed === "") return true; + // Single-word lines (section numbers, lone tokens) + if (trimmed.split(/\s+/).length === 1) return true; + // Fully uppercase lines that are NOT legislative section headers + // (e.g. "SEC. 1." or "CHAPTER 2—" are expected in bill text — don't penalise them) + const isAllCaps = + /[a-zA-Z]/.test(trimmed) && + trimmed === trimmed.toUpperCase() && + trimmed.length > 2; + const isLegislativeHeader = /^(SEC\.|SECTION|CHAPTER|TITLE|PART|SUBPART|ART\.|ARTICLE)\s/i.test(trimmed); + return isAllCaps && !isLegislativeHeader; }); - if (boilerplateLines.length / lines.length >= 0.3) return false; + // Raise threshold: bill/order text is legitimately header-heavy (50% instead of 30%) + if (boilerplateLines.length / lines.length >= 0.5) return false; return true; } @@ -127,30 +134,60 @@ export async function upsertContent(input: ContentData) { const fullText = input.data.fullText; const title = input.data.title; const url = input.data.url; + const sourceDescription = input.data.description; const hasUsableText = isUsableText(fullText); + if (!hasUsableText && fullText) { + logger.debug(`${label} fullText failed usability check (too short or boilerplate-heavy) — AI article will be skipped`); + } + const hasSummarySource = Boolean( + fullText || (input.type === "bill" && input.data.summary), + ); + const persistedDescription = existing?.description; + const hasPersistedSummary = Boolean( + (sourceDescription && sourceDescription.trim()) || + (persistedDescription && persistedDescription.trim()), + ); + let shouldGenerateSummary = false; let shouldGenerateArticle = false; let shouldGenerateImage = false; let progressKind: "new" | "changed" | "unchanged"; if (!existing) { + shouldGenerateSummary = !sourceDescription && hasSummarySource; shouldGenerateArticle = hasUsableText; shouldGenerateImage = hasUsableText; incrementNewEntries(); progressKind = "new"; logger.info(`New ${label} detected`); } else if (existing.contentHash !== newContentHash) { - shouldGenerateArticle = hasUsableText; - shouldGenerateImage = !existing.hasThumbnail && hasUsableText; + shouldGenerateSummary = forceAIRegeneration + ? !sourceDescription && hasSummarySource + : !hasPersistedSummary && !sourceDescription && hasSummarySource; + shouldGenerateArticle = forceAIRegeneration + ? hasUsableText + : hasUsableText && !existing.hasArticle; + shouldGenerateImage = + (forceAIRegeneration || !existing.hasThumbnail) && hasUsableText; incrementExistingChanged(); progressKind = "changed"; logger.info(`Content changed for ${label}`); } else { - shouldGenerateArticle = false; - shouldGenerateImage = !existing.hasThumbnail && hasUsableText; + shouldGenerateSummary = forceAIRegeneration + ? !sourceDescription && hasSummarySource + : !hasPersistedSummary && !sourceDescription && hasSummarySource; + shouldGenerateArticle = forceAIRegeneration + ? hasUsableText + : hasUsableText && !existing.hasArticle; + shouldGenerateImage = + (forceAIRegeneration || !existing.hasThumbnail) && hasUsableText; incrementExistingUnchanged(); progressKind = "unchanged"; - logger.debug(`No changes for ${label}, skipping AI generation`); + logger.debug( + shouldGenerateSummary || shouldGenerateArticle || shouldGenerateImage + ? `No raw changes for ${label}, backfilling missing AI content` + : `No changes for ${label}, skipping AI generation`, + ); } // Phase 1: always persist raw content first (no AI fields) @@ -248,7 +285,7 @@ export async function upsertContent(input: ContentData) { // Phase 2: AI enrichment — skipped entirely if rate-limited try { - const existingDescription = input.data.description; + const existingDescription = sourceDescription || persistedDescription; const articleType = input.type === "bill" ? "bill" @@ -261,10 +298,7 @@ export async function upsertContent(input: ContentData) { (async (): Promise => { if (existingDescription) { return existingDescription; - } else if ( - shouldGenerateArticle && - (fullText || (input.type === "bill" && input.data.summary)) - ) { + } else if (shouldGenerateSummary) { const summarySource = input.type === "bill" ? input.data.summary || input.data.fullText || "" @@ -285,8 +319,11 @@ export async function upsertContent(input: ContentData) { articleType, url, ); - incrementAIArticlesGenerated(); - return article; + if (article) { + incrementAIArticlesGenerated(); + return article; + } + logger.warn(`AI article generation returned empty result for ${label}`); } else if (existing?.hasArticle) { logger.debug(`Using existing AI article for ${label}`); } @@ -370,7 +407,10 @@ export async function upsertContent(input: ContentData) { if (error instanceof AIRateLimitError) { logger.warn(`AI rate limit hit — ${label} saved without video, will retry next run`); } else { - throw error; + // Video generation is supplementary — a failure here must not abort + // content processing or propagate the raw DB error (which can contain + // binary image data) up to the scraper's generic error handler + logger.warn(`Video generation failed for ${label} — content was saved successfully: ${error instanceof Error ? error.message : error}`); } } } diff --git a/apps/scraper/src/utils/db/video-operations.ts b/apps/scraper/src/utils/db/video-operations.ts index d4ccca2..55cb8ac 100644 --- a/apps/scraper/src/utils/db/video-operations.ts +++ b/apps/scraper/src/utils/db/video-operations.ts @@ -22,16 +22,25 @@ async function checkExistingVideo( currentContentHash: string, ): Promise<{ exists: boolean; needsRegeneration: boolean } | null> { const [existing] = await db - .select({ sourceContentHash: Video.sourceContentHash }) + .select({ + sourceContentHash: Video.sourceContentHash, + imageData: Video.imageData, + thumbnailUrl: Video.thumbnailUrl, + }) .from(Video) .where(and(eq(Video.contentType, contentType), eq(Video.contentId, contentId))) .limit(1); if (!existing) return null; + // Record needs regeneration if content hash changed OR if it's missing image data entirely + // (neither AI generated nor a scraped fallback) + const isMissingImage = !existing.imageData && !existing.thumbnailUrl; + const needsRegeneration = existing.sourceContentHash !== currentContentHash || isMissingImage; + return { exists: true, - needsRegeneration: existing.sourceContentHash !== currentContentHash, + needsRegeneration, }; } @@ -84,13 +93,17 @@ export async function generateVideoForContent( }; // Upsert video with hybrid image support + // Hard-truncate title to DB constraint (varchar 25) as a safety net in case + // the AI schema validation ever drifts from the DB schema again + const safeTitle = marketingCopy.title.substring(0, 25); + try { await db .insert(Video) .values({ contentType, contentId, - title: marketingCopy.title, + title: safeTitle, description: marketingCopy.description, imageData, imageMimeType, @@ -104,7 +117,7 @@ export async function generateVideoForContent( .onConflictDoUpdate({ target: [Video.contentType, Video.contentId], set: { - title: marketingCopy.title, + title: safeTitle, description: marketingCopy.description, imageData, imageMimeType, @@ -119,11 +132,16 @@ export async function generateVideoForContent( incrementVideosGenerated(); logger.success(`Video generated for ${contentType}:${contentId}`); } catch (error) { - // Sanitize error to avoid logging raw image data - const sanitizedError = error instanceof Error - ? `${error.name}: ${error.message.replace(/image_data[^,]*,/g, 'image_data=,')}` - : 'Unknown database error'; - logger.error(`Failed to insert video for ${contentType}:${contentId}: ${sanitizedError}`); - throw error; + // Build a sanitized error message — the raw DB error embeds binary image + // data as SQL parameter values which floods logs with unicode gibberish + const rawMessage = error instanceof Error ? error.message : String(error); + const sanitizedMessage = rawMessage + // Remove the full query dump (contains binary data as parameter values) + .replace(/Failed query:[\s\S]*/i, 'Failed query: ') + // Belt-and-suspenders: also strip any remaining base64/binary blobs + .replace(/\\x[0-9a-fA-F]{20,}/g, ''); + logger.error(`Failed to upsert video for ${contentType}:${contentId}: ${sanitizedMessage}`); + // Throw a clean error so callers don't re-log the raw binary payload + throw new Error(`Video upsert failed for ${contentType}:${contentId}: ${sanitizedMessage}`); } } diff --git a/apps/scraper/src/utils/types.ts b/apps/scraper/src/utils/types.ts index 21ed8bb..d0b11f5 100644 --- a/apps/scraper/src/utils/types.ts +++ b/apps/scraper/src/utils/types.ts @@ -39,6 +39,7 @@ export interface ScraperMetrics { export interface ExistingRecordCheck { exists: boolean; contentHash?: string; + description?: string | null; hasArticle: boolean; hasThumbnail: boolean; }