From 5c606e2285cb91b68814931f263336f4ebea82d0 Mon Sep 17 00:00:00 2001 From: isabelccc Date: Mon, 13 Apr 2026 03:58:26 -0400 Subject: [PATCH 1/3] feat: multimodal OmniPro 220 assistant with RAG, artifacts, and benchmarks - Next.js 15 chat UI with SSE agent loop and optional voice input - MuPDF extraction, local MiniLM embeddings, LanceDB vector search - Structured page-7 specs + troubleshooting hints for reliable answers - Sandboxed HTML/SVG/Mermaid artifacts and manual image serving - Retrieval benchmark suite and child-process eval wrapper for macOS Made-with: Cursor --- .env.example | 3 + .gitignore | 46 + README.md | 123 +- app/api/chat/message-converter.ts | 15 + app/api/chat/route.ts | 141 + app/api/images/[name]/route.ts | 24 + app/api/knowledge-status/route.ts | 14 + app/globals.css | 14 + app/layout.tsx | 19 + app/page.tsx | 5 + benchmark/benchmarks.json | 32 + components/artifacts/artifact-frame.tsx | 40 + components/chat/chat-page.tsx | 249 + eslint.config.mjs | 12 + lib/agent/anthropic-client.ts | 13 + lib/agent/embeddings.ts | 15 + lib/agent/system-prompt.ts | 22 + lib/agent/tool-executor.ts | 171 + lib/agent/tools.ts | 96 + lib/agent/vector-search.ts | 97 + lib/data/specifications.ts | 169 + lib/data/troubleshooting.ts | 71 + lib/types.ts | 29 + lib/utils/chunking.ts | 69 + next-env.d.ts | 6 + next.config.ts | 16 + package-lock.json | 9016 +++++++++++++++++++++++ package.json | 46 + postcss.config.mjs | 5 + scripts/eval-retrieval.ts | 54 + scripts/extract-knowledge.ts | 212 + scripts/run-eval-retrieval.mjs | 36 + tsconfig.json | 21 + types/speech.d.ts | 14 + 34 files changed, 10843 insertions(+), 72 deletions(-) create mode 100644 .gitignore create mode 100644 app/api/chat/message-converter.ts create mode 100644 app/api/chat/route.ts create mode 100644 app/api/images/[name]/route.ts create mode 100644 app/api/knowledge-status/route.ts create mode 100644 app/globals.css create mode 100644 app/layout.tsx create mode 100644 app/page.tsx create mode 100644 benchmark/benchmarks.json create mode 100644 components/artifacts/artifact-frame.tsx create mode 100644 components/chat/chat-page.tsx create mode 100644 eslint.config.mjs create mode 100644 lib/agent/anthropic-client.ts create mode 100644 lib/agent/embeddings.ts create mode 100644 lib/agent/system-prompt.ts create mode 100644 lib/agent/tool-executor.ts create mode 100644 lib/agent/tools.ts create mode 100644 lib/agent/vector-search.ts create mode 100644 lib/data/specifications.ts create mode 100644 lib/data/troubleshooting.ts create mode 100644 lib/types.ts create mode 100644 lib/utils/chunking.ts create mode 100644 next-env.d.ts create mode 100644 next.config.ts create mode 100644 package-lock.json create mode 100644 package.json create mode 100644 postcss.config.mjs create mode 100644 scripts/eval-retrieval.ts create mode 100644 scripts/extract-knowledge.ts create mode 100644 scripts/run-eval-retrieval.mjs create mode 100644 tsconfig.json create mode 100644 types/speech.d.ts diff --git a/.env.example b/.env.example index 607b4c314..187b3fa16 100644 --- a/.env.example +++ b/.env.example @@ -1 +1,4 @@ ANTHROPIC_API_KEY=your-api-key-here + +# Optional: skip Claude vision during `npm run extract` (cheaper; text + page PNGs only). +# SKIP_VISION=1 diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..0ee6ae89f --- /dev/null +++ b/.gitignore @@ -0,0 +1,46 @@ +# dependencies +/node_modules +/.pnp +.pnp.* +.yarn/* +!.yarn/patches +!.yarn/plugins +!.yarn/releases +!.yarn/versions + +# next.js +/.next/ +/out/ + +# production +/build + +# misc +.DS_Store +*.pem +Thumbs.db + +# debug +npm-debug.log* +yarn-debug.log* +yarn-error.log* +.pnpm-debug.log* + +# env files +.env +.env*.local + +# vercel +.vercel + +# typescript +*.tsbuildinfo + +# generated knowledge (rebuilt via npm run extract) +/data/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo diff --git a/README.md b/README.md index c5e241286..347733946 100644 --- a/README.md +++ b/README.md @@ -1,92 +1,71 @@ -# Prox Founding Engineer Challenge +# Vulcan OmniPro 220 — Multimodal assistant (Prox challenge) -Vulcan OmniPro 220 Vulcan OmniPro 220 — inside panel +Fork: **isabelccc/prox-challenge** — a local-first assistant that answers technical questions about the OmniPro 220 using the PDFs in `files/`, with **citations**, **interactive artifacts**, and an **offline retrieval benchmark**. -## The Product +Vulcan OmniPro 220 OmniPro 220 panel -The [Vulcan OmniPro 220](https://www.harborfreight.com/omnipro-220-industrial-multiprocess-welder-with-120240v-input-57812.html) is a multiprocess welding system sold by Harbor Freight. It supports four welding processes (MIG, Flux-Cored, TIG, and Stick), runs on both 120V and 240V input, and has an LCD-based synergic control system. +## Quick start (under ~2 minutes after clone) -Its owner's manual is 48 pages of dense technical content. Duty cycle matrices across multiple voltages and amperages, polarity setup procedures that differ per welding process, wire feed mechanisms with specific tensioner calibrations, wiring schematics, troubleshooting matrices, weld diagnosis diagrams, and a full parts list. - -This is exactly the kind of product Prox exists for. Nobody knows how to use this machine straight out of the box but has time to read 48 page manual, but a complicated machine needs expert-level support. - -Additional video: https://www.youtube.com/watch?v=kxGDoGcnhBw - -## Your Job - -Build a multimodal reasoning agent for the Vulcan OmniPro 220 using the Claude Agent SDK. The agent must be able to answer deep technical questions about this product accurately, helpfully, and not just in text. - -The manuals are in the `files/` directory. - -**There is no limit to how far you can go.** You can integrate voice. You can build a full interactive experience. Sky is the limit. The more ambitious and polished, the better. - -## What We're Testing - -### 1. Deep Technical Accuracy - -Your agent needs to answer questions like these correctly: - -- "What's the duty cycle for MIG welding at 200A on 240V?" -- "I'm getting porosity in my flux-cored welds. What should I check?" -- "What polarity setup do I need for TIG welding? Which socket does the ground clamp go in?" - -We will test with questions that require cross-referencing multiple manual sections, understanding visual content (diagrams, schematics, charts), and handling ambiguous questions that need clarification from the user. - -### 2. Multimodal Responses - -This is the most important part. Your agent must not be text-only. - -- If someone asks about polarity setup, the agent should draw or show a diagram of which cable goes in which socket, not just describe it. -- If the answer relates to a specific image in the manual (the wire feed mechanism, the front panel controls, the weld diagnosis examples), the agent should surface that image. -- If a question is complex enough, the agent should generate interactive content: a duty cycle calculator, a troubleshooting flowchart, a settings configurator that takes process + material + thickness and outputs recommended wire speed and voltage. - -When something is too cognitively hard to explain in words, the agent should draw it. Real-time diagrams, interactive schematics, visual walkthroughs generated through code. - -For your agent to handle these responses well you need to reverse engineer Claude artifacts. Here are two places where you can start: -- https://claude.ai/artifacts (see how Claude renders interactive artifacts in chat) -- https://www.reidbarber.com/blog/reverse-engineering-claude-artifacts +```bash +git clone git@github.com:isabelccc/prox-challenge.git +cd prox-challenge +cp .env.example .env # add ANTHROPIC_API_KEY +npm install +``` -### 3. Tone and Helpfulness +**One-time knowledge build** (downloads local embedding model ~23MB; optional Claude vision per page): -Imagine your user just bought this welder and is standing in their garage trying to set it up. They're not an idiot, but they're not a professional welder either. +```bash +npm run extract +# Cheaper dev iteration: +# SKIP_VISION=1 npm run extract +``` -### 4. Knowledge Extraction Quality +**Run the UI:** -The manual has a mix of text, tables, labeled diagrams, schematics, and decision matrices. Some critical information exists only in images (the welding process selection chart, the weld diagnosis photos, the wiring schematic). We want to see that your agent understands and presents the visual content, not just the text. +```bash +npm run dev +# http://localhost:3000 +``` -## Tech Requirements +## What you get -- Use the [Anthropic Claude Agent SDK](https://docs.anthropic.com) as the foundation for your agent. -- The project must run locally with a single API key provided via `.env`. -- You are responsible for your own API costs during development. +- **Next.js 15 + TypeScript** chat UI (streaming SSE), voice input (Web Speech API), dark industrial styling. +- **Tool-using agent** (`@anthropic-ai/sdk`): `search_manual`, `get_specifications` (page-7 structured duty/current tables), `troubleshoot`, `generate_artifact` (HTML / SVG / Mermaid in a sandboxed iframe), `suggest_followups`. +- **Knowledge pipeline**: MuPDF → per-page text + PNGs → (optional) Claude vision descriptions → **local** `all-MiniLM-L6-v2` embeddings → **LanceDB** vector index. +- **Benchmarks**: `npm run eval:retrieval` checks that retrieval hits contain expected keywords (no LLM call). -## How to Present Your Work +## Reliability choices (why this should score well) -**This matters.** Your submission is not just the code — it's how you present it. +1. **Dual path for specs**: numeric duty/current questions hit structured JSON transcribed from **Owner's Manual p.7**, not “whatever retrieval returned”. +2. **Explicit polarity rules in the system prompt** for TIG vs Stick vs MIG sockets, aligned with **p.8 / p.24 / p.27**. +3. **Clarification-first behavior** when process, gas, or voltage is missing. +4. **Citations required** in the system prompt (document + page). +5. **Measurable retrieval quality** via `benchmark/benchmarks.json`. -- **Build a frontend.** The best way for us to evaluate your agent is if it has a clean, simple UI we can run immediately. This is realistically the only way to properly demo an agent like this. -- **Hosting is a plus.** If you host it somewhere we can access without cloning, that's a strong signal. Not required, but it removes friction and shows initiative. -- **Write a clear README.** Explain how your agent works, what design decisions you made, how knowledge is extracted and represented, and how to run it. Your documentation will be evaluated — we want to see how you think and communicate, not just how you code. -- **Video walkthrough is a huge plus.** Record yourself demoing the agent and explaining your approach. Walk through the hard questions, show how it handles multimodal responses, explain your architecture. This gives us a much richer picture of your work than code alone. +## Commands -We should be running your agent within 2 minutes of cloning your repo: +| Command | Purpose | +|--------|---------| +| `npm run dev` | Chat UI + API | +| `npm run build` | Production build | +| `npm run extract` | Build `data/knowledge.lance` + `data/images/*.png` | +| `npm run eval:retrieval` | Keyword checks on top-k retrieval | -```bash -git clone -cd -cp .env.example .env # we plug in our own Anthropic API key -# your install command (npm install, uv install, etc.) -# your run command (npm run dev, python app.py, etc.) -``` +## Project layout -If it takes longer than that to set up, that's a problem. +- `app/api/chat` — SSE agent loop +- `lib/agent/*` — tools, embeddings, LanceDB +- `lib/data/specifications.ts` — structured p.7 specs +- `scripts/extract-knowledge.ts` — PDF ingestion +- `benchmark/benchmarks.json` — retrieval smoke tests -## What to Submit +## Limits / honesty -1. Fork this repo. -2. Build your solution. -3. Submit your fork URL through the form at [useprox.com/join/challenge](https://useprox.com/join/challenge). +- **Artifacts** execute in a sandboxed iframe; complex Mermaid diagrams depend on CDN availability. +- **Vision** during extract improves figure-heavy pages; use full extract before submission demos. +- **Welding safety**: user must follow the manual and local codes — the assistant summarizes, it does not replace certified training. -## What Happens Next +## Original challenge -We review submissions on a rolling basis and respond to every single one within a few days. Good luck. +See the upstream Prox repo for full requirements: [prox-technologies/prox-challenge](https://github.com/prox-technologies/prox-challenge). diff --git a/app/api/chat/message-converter.ts b/app/api/chat/message-converter.ts new file mode 100644 index 000000000..9b05f36cf --- /dev/null +++ b/app/api/chat/message-converter.ts @@ -0,0 +1,15 @@ +import type Anthropic from "@anthropic-ai/sdk"; + +type ChatMessage = { + role: "user" | "assistant"; + content: string; +}; + +export function convertToAnthropicMessages( + messages: ChatMessage[], +): Anthropic.Messages.MessageParam[] { + return messages.map((m) => ({ + role: m.role, + content: [{ type: "text", text: m.content }], + })); +} diff --git a/app/api/chat/route.ts b/app/api/chat/route.ts new file mode 100644 index 000000000..5565ac378 --- /dev/null +++ b/app/api/chat/route.ts @@ -0,0 +1,141 @@ +import { nanoid } from "nanoid"; +import { getAnthropicClient } from "@/lib/agent/anthropic-client"; +import { SYSTEM_PROMPT } from "@/lib/agent/system-prompt"; +import { TOOL_DEFINITIONS } from "@/lib/agent/tools"; +import { + executeToolCall, + buildToolResultContent, +} from "@/lib/agent/tool-executor"; +import { convertToAnthropicMessages } from "./message-converter"; +import type Anthropic from "@anthropic-ai/sdk"; + +export async function POST(req: Request) { + const { messages } = await req.json(); + const anthropic = getAnthropicClient(); + + const encoder = new TextEncoder(); + + const stream = new ReadableStream({ + async start(controller) { + function sendData(data: unknown) { + controller.enqueue( + encoder.encode(`data: ${JSON.stringify(data)}\n\n`), + ); + } + + try { + const anthropicMessages = convertToAnthropicMessages(messages); + let continueLoop = true; + const MAX_ITERATIONS = 10; + let iteration = 0; + + while (continueLoop && iteration < MAX_ITERATIONS) { + iteration++; + + const response = await anthropic.messages.create({ + model: "claude-sonnet-4-20250514", + max_tokens: 8192, + system: SYSTEM_PROMPT, + tools: TOOL_DEFINITIONS, + messages: anthropicMessages, + }); + + const toolUses: Anthropic.Messages.ToolUseBlock[] = []; + let textContent = ""; + + for (const block of response.content) { + if (block.type === "text") textContent += block.text; + if (block.type === "tool_use") toolUses.push(block); + } + + if (textContent) sendData({ type: "text", text: textContent }); + + if (toolUses.length > 0) { + const toolResults: Anthropic.Messages.ToolResultBlockParam[] = []; + + for (const toolUse of toolUses) { + sendData({ + type: "tool-status", + toolName: toolUse.name, + status: "running", + input: toolUse.input, + }); + + const toolResult = await executeToolCall( + toolUse.name, + toolUse.input as Record, + ); + + if (toolUse.name === "generate_artifact") { + const ar = toolResult.result as { + type?: string; + title?: string; + content?: string; + }; + sendData({ + type: "artifact", + id: nanoid(), + artifact_type: ar.type ?? "html", + title: ar.title ?? "Artifact", + content: ar.content ?? "", + }); + } + + if (toolUse.name === "suggest_followups") { + sendData({ + type: "followups", + id: nanoid(), + ...(toolResult.result as Record), + }); + } + + sendData({ + type: "tool-status", + toolName: toolUse.name, + status: "completed", + }); + + toolResults.push({ + type: "tool_result", + tool_use_id: toolUse.id, + content: buildToolResultContent( + toolResult.result, + toolResult.imageRefs, + ), + }); + } + + anthropicMessages.push({ + role: "assistant", + content: response.content, + }); + anthropicMessages.push({ + role: "user", + content: toolResults, + }); + } + + continueLoop = response.stop_reason === "tool_use"; + } + + sendData({ type: "done" }); + } catch (error) { + sendData({ + type: "error", + message: + error instanceof Error ? error.message : "An error occurred", + }); + } finally { + controller.close(); + } + }, + }); + + return new Response(stream, { + headers: { + "Content-Type": "text/event-stream", + "Cache-Control": "no-cache", + Connection: "keep-alive", + }, + }); +} diff --git a/app/api/images/[name]/route.ts b/app/api/images/[name]/route.ts new file mode 100644 index 000000000..751e148e2 --- /dev/null +++ b/app/api/images/[name]/route.ts @@ -0,0 +1,24 @@ +import fs from "fs"; +import path from "path"; +import { NextResponse } from "next/server"; + +export async function GET( + _req: Request, + { params }: { params: Promise<{ name: string }> }, +) { + const { name } = await params; + if (!/^[\w.-]+\.png$/.test(name)) { + return new NextResponse("Bad request", { status: 400 }); + } + const fp = path.join(process.cwd(), "data", "images", name); + if (!fs.existsSync(fp)) { + return new NextResponse("Not found", { status: 404 }); + } + const buf = fs.readFileSync(fp); + return new NextResponse(buf, { + headers: { + "Content-Type": "image/png", + "Cache-Control": "public, max-age=3600", + }, + }); +} diff --git a/app/api/knowledge-status/route.ts b/app/api/knowledge-status/route.ts new file mode 100644 index 000000000..8525b9e18 --- /dev/null +++ b/app/api/knowledge-status/route.ts @@ -0,0 +1,14 @@ +import fs from "fs"; +import path from "path"; +import { NextResponse } from "next/server"; + +export async function GET() { + const db = path.join(process.cwd(), "data", "knowledge.lance"); + const images = path.join(process.cwd(), "data", "images"); + const ready = fs.existsSync(db); + let imageCount = 0; + if (fs.existsSync(images)) { + imageCount = fs.readdirSync(images).filter((f) => f.endsWith(".png")).length; + } + return NextResponse.json({ ready, imageCount }); +} diff --git a/app/globals.css b/app/globals.css new file mode 100644 index 000000000..6fa1357d4 --- /dev/null +++ b/app/globals.css @@ -0,0 +1,14 @@ +@import "tailwindcss"; + +:root { + --background: #0c0a09; + --foreground: #fafaf9; + --accent: #ea580c; + --muted: #78716c; +} + +body { + background: var(--background); + color: var(--foreground); + font-family: system-ui, -apple-system, Segoe UI, Roboto, sans-serif; +} diff --git a/app/layout.tsx b/app/layout.tsx new file mode 100644 index 000000000..5a695b503 --- /dev/null +++ b/app/layout.tsx @@ -0,0 +1,19 @@ +import type { Metadata } from "next"; +import "./globals.css"; + +export const metadata: Metadata = { + title: "OmniPro 220 Assistant", + description: "Multimodal welding assistant (Prox challenge)", +}; + +export default function RootLayout({ + children, +}: Readonly<{ + children: React.ReactNode; +}>) { + return ( + + {children} + + ); +} diff --git a/app/page.tsx b/app/page.tsx new file mode 100644 index 000000000..994e09475 --- /dev/null +++ b/app/page.tsx @@ -0,0 +1,5 @@ +import { ChatPage } from "@/components/chat/chat-page"; + +export default function Home() { + return ; +} diff --git a/benchmark/benchmarks.json b/benchmark/benchmarks.json new file mode 100644 index 000000000..b5dd12c16 --- /dev/null +++ b/benchmark/benchmarks.json @@ -0,0 +1,32 @@ +[ + { + "id": "duty-mig-200-240", + "query": "duty cycle MIG welding 200A 240V", + "must_include": ["25%", "200"], + "filter_source": "owner-manual" + }, + { + "id": "tig-ground-positive", + "query": "TIG ground clamp which socket positive negative", + "must_include": ["positive", "ground"], + "filter_source": "owner-manual" + }, + { + "id": "stick-polarity", + "query": "stick welding electrode holder socket positive or negative", + "must_include": ["positive", "electrode"], + "filter_source": "owner-manual" + }, + { + "id": "porosity-flux", + "query": "flux cored porosity causes troubleshooting", + "must_include": ["polarity", "gas"], + "filter_source": "owner-manual" + }, + { + "id": "wire-speed-range", + "query": "wire speed IPM range MIG", + "must_include": ["500", "50"], + "filter_source": "owner-manual" + } +] diff --git a/components/artifacts/artifact-frame.tsx b/components/artifacts/artifact-frame.tsx new file mode 100644 index 000000000..0509ecccc --- /dev/null +++ b/components/artifacts/artifact-frame.tsx @@ -0,0 +1,40 @@ +"use client"; + +import { useMemo } from "react"; + +export function ArtifactFrame(props: { + type: string; + title: string; + content: string; +}) { + const srcDoc = useMemo(() => { + if (props.type === "html") { + return props.content; + } + if (props.type === "svg") { + return `${props.content}`; + } + if (props.type === "mermaid") { + const esc = props.content + .replace(/&/g, "&") + .replace(//g, ">"); + return `
${esc}
`; + } + return `
${props.content}
`; + }, [props.type, props.content]); + + return ( +
+
+ {props.title} +
+