diff --git a/.env.example b/.env.example index 607b4c314..187b3fa16 100644 --- a/.env.example +++ b/.env.example @@ -1 +1,4 @@ ANTHROPIC_API_KEY=your-api-key-here + +# Optional: skip Claude vision during `npm run extract` (cheaper; text + page PNGs only). +# SKIP_VISION=1 diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..0ee6ae89f --- /dev/null +++ b/.gitignore @@ -0,0 +1,46 @@ +# dependencies +/node_modules +/.pnp +.pnp.* +.yarn/* +!.yarn/patches +!.yarn/plugins +!.yarn/releases +!.yarn/versions + +# next.js +/.next/ +/out/ + +# production +/build + +# misc +.DS_Store +*.pem +Thumbs.db + +# debug +npm-debug.log* +yarn-debug.log* +yarn-error.log* +.pnpm-debug.log* + +# env files +.env +.env*.local + +# vercel +.vercel + +# typescript +*.tsbuildinfo + +# generated knowledge (rebuilt via npm run extract) +/data/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo diff --git a/README.md b/README.md index c5e241286..718f6254d 100644 --- a/README.md +++ b/README.md @@ -1,92 +1,62 @@ -# Prox Founding Engineer Challenge +# Vulcan OmniPro 220 — Multimodal assistant -Vulcan OmniPro 220 Vulcan OmniPro 220 — inside panel -## The Product -The [Vulcan OmniPro 220](https://www.harborfreight.com/omnipro-220-industrial-multiprocess-welder-with-120240v-input-57812.html) is a multiprocess welding system sold by Harbor Freight. It supports four welding processes (MIG, Flux-Cored, TIG, and Stick), runs on both 120V and 240V input, and has an LCD-based synergic control system. +Vulcan OmniPro 220 OmniPro 220 panel -Its owner's manual is 48 pages of dense technical content. Duty cycle matrices across multiple voltages and amperages, polarity setup procedures that differ per welding process, wire feed mechanisms with specific tensioner calibrations, wiring schematics, troubleshooting matrices, weld diagnosis diagrams, and a full parts list. +## Quick start (under ~2 minutes after clone) -This is exactly the kind of product Prox exists for. 
Nobody knows how to use this machine straight out of the box but has time to read 48 page manual, but a complicated machine needs expert-level support. - -Additional video: https://www.youtube.com/watch?v=kxGDoGcnhBw - -## Your Job - -Build a multimodal reasoning agent for the Vulcan OmniPro 220 using the Claude Agent SDK. The agent must be able to answer deep technical questions about this product accurately, helpfully, and not just in text. - -The manuals are in the `files/` directory. - -**There is no limit to how far you can go.** You can integrate voice. You can build a full interactive experience. Sky is the limit. The more ambitious and polished, the better. - -## What We're Testing - -### 1. Deep Technical Accuracy - -Your agent needs to answer questions like these correctly: - -- "What's the duty cycle for MIG welding at 200A on 240V?" -- "I'm getting porosity in my flux-cored welds. What should I check?" -- "What polarity setup do I need for TIG welding? Which socket does the ground clamp go in?" - -We will test with questions that require cross-referencing multiple manual sections, understanding visual content (diagrams, schematics, charts), and handling ambiguous questions that need clarification from the user. - -### 2. Multimodal Responses - -This is the most important part. Your agent must not be text-only. - -- If someone asks about polarity setup, the agent should draw or show a diagram of which cable goes in which socket, not just describe it. -- If the answer relates to a specific image in the manual (the wire feed mechanism, the front panel controls, the weld diagnosis examples), the agent should surface that image. -- If a question is complex enough, the agent should generate interactive content: a duty cycle calculator, a troubleshooting flowchart, a settings configurator that takes process + material + thickness and outputs recommended wire speed and voltage. 
- -When something is too cognitively hard to explain in words, the agent should draw it. Real-time diagrams, interactive schematics, visual walkthroughs generated through code. - -For your agent to handle these responses well you need to reverse engineer Claude artifacts. Here are two places where you can start: -- https://claude.ai/artifacts (see how Claude renders interactive artifacts in chat) -- https://www.reidbarber.com/blog/reverse-engineering-claude-artifacts - -### 3. Tone and Helpfulness +```bash +git clone git@github.com:isabelccc/prox-challenge.git +cd prox-challenge +cp .env.example .env # add ANTHROPIC_API_KEY +npm install +``` -Imagine your user just bought this welder and is standing in their garage trying to set it up. They're not an idiot, but they're not a professional welder either. +**One-time knowledge build** (downloads local embedding model ~23MB; optional Claude vision per page): -### 4. Knowledge Extraction Quality +```bash +npm run extract +# Cheaper dev iteration: +# SKIP_VISION=1 npm run extract +``` -The manual has a mix of text, tables, labeled diagrams, schematics, and decision matrices. Some critical information exists only in images (the welding process selection chart, the weld diagnosis photos, the wiring schematic). We want to see that your agent understands and presents the visual content, not just the text. +**Run the UI:** -## Tech Requirements +```bash +npm run dev +# http://localhost:3000 +``` -- Use the [Anthropic Claude Agent SDK](https://docs.anthropic.com) as the foundation for your agent. -- The project must run locally with a single API key provided via `.env`. -- You are responsible for your own API costs during development. +## What you get -## How to Present Your Work +- **Next.js 15 + TypeScript** chat UI (streaming SSE), voice input (Web Speech API), dark industrial styling. 
+- **Tool-using agent** (`@anthropic-ai/sdk`): `search_manual`, `get_specifications` (page-7 structured duty/current tables), `troubleshoot`, `generate_artifact` (HTML / SVG / Mermaid in a sandboxed iframe), `suggest_followups`. +- **Knowledge pipeline**: MuPDF → per-page text + PNGs → (optional) Claude vision descriptions → **local** `all-MiniLM-L6-v2` embeddings → **LanceDB** vector index. +- **Benchmarks**: `npm run eval:retrieval` checks that retrieval hits contain expected keywords (no LLM call). -**This matters.** Your submission is not just the code — it's how you present it. -- **Build a frontend.** The best way for us to evaluate your agent is if it has a clean, simple UI we can run immediately. This is realistically the only way to properly demo an agent like this. -- **Hosting is a plus.** If you host it somewhere we can access without cloning, that's a strong signal. Not required, but it removes friction and shows initiative. -- **Write a clear README.** Explain how your agent works, what design decisions you made, how knowledge is extracted and represented, and how to run it. Your documentation will be evaluated — we want to see how you think and communicate, not just how you code. -- **Video walkthrough is a huge plus.** Record yourself demoing the agent and explaining your approach. Walk through the hard questions, show how it handles multimodal responses, explain your architecture. This gives us a much richer picture of your work than code alone. -We should be running your agent within 2 minutes of cloning your repo: +## Commands -```bash -git clone -cd -cp .env.example .env # we plug in our own Anthropic API key -# your install command (npm install, uv install, etc.) -# your run command (npm run dev, python app.py, etc.) 
-``` +| Command | Purpose | +|--------|---------| +| `npm run dev` | Chat UI + API | +| `npm run build` | Production build | +| `npm run extract` | Build `data/knowledge.lance` + `data/images/*.png` | +| `npm run eval:retrieval` | Keyword checks on top-k retrieval | -If it takes longer than that to set up, that's a problem. +## Project layout -## What to Submit +- `app/api/chat` — SSE agent loop +- `lib/agent/*` — tools, embeddings, LanceDB +- `lib/data/specifications.ts` — structured p.7 specs +- `scripts/extract-knowledge.ts` — PDF ingestion +- `benchmark/benchmarks.json` — retrieval smoke tests -1. Fork this repo. -2. Build your solution. -3. Submit your fork URL through the form at [useprox.com/join/challenge](https://useprox.com/join/challenge). +## Limits / honesty -## What Happens Next +- **Artifacts** execute in a sandboxed iframe; complex Mermaid diagrams depend on CDN availability. +- **Vision** during extract improves figure-heavy pages; use full extract before submission demos. +- **Welding safety**: user must follow the manual and local codes — the assistant summarizes, it does not replace certified training. -We review submissions on a rolling basis and respond to every single one within a few days. Good luck. 
diff --git a/app/api/chat/message-converter.ts b/app/api/chat/message-converter.ts new file mode 100644 index 000000000..9b05f36cf --- /dev/null +++ b/app/api/chat/message-converter.ts @@ -0,0 +1,15 @@ +import type Anthropic from "@anthropic-ai/sdk"; + +/** Chat turn with a role and plain-text content. */ type ChatMessage = { + role: "user" | "assistant"; + content: string; +}; + +/** Maps each chat turn to an Anthropic MessageParam with the same role and a single text content block. */ export function convertToAnthropicMessages( + messages: ChatMessage[], +): Anthropic.Messages.MessageParam[] { + return messages.map((m) => ({ + role: m.role, + content: [{ type: "text", text: m.content }], + })); +} diff --git a/app/api/chat/route.ts b/app/api/chat/route.ts new file mode 100644 index 000000000..5565ac378 --- /dev/null +++ b/app/api/chat/route.ts @@ -0,0 +1,141 @@ +import { nanoid } from "nanoid"; +import { getAnthropicClient } from "@/lib/agent/anthropic-client"; +import { SYSTEM_PROMPT } from "@/lib/agent/system-prompt"; +import { TOOL_DEFINITIONS } from "@/lib/agent/tools"; +import { + executeToolCall, + buildToolResultContent, +} from "@/lib/agent/tool-executor"; +import { convertToAnthropicMessages } from "./message-converter"; +import type Anthropic from "@anthropic-ai/sdk"; + +/** Chat endpoint. Reads messages from the JSON request body and answers with a text/event-stream response. Each event is one data: line holding a JSON object whose type is text, tool-status, artifact, followups, done or error. NOTE(review): req.json() is awaited before the stream and its try/catch exist, so a malformed body is not reported as an SSE error event; messages is also passed on without validation. */ export async function POST(req: Request) { + const { messages } = await req.json(); + const anthropic = getAnthropicClient(); + + const encoder = new TextEncoder(); + + const stream = new ReadableStream({ + async start(controller) { + /* Serializes one event as an SSE data frame and enqueues it on the stream. */ function sendData(data: unknown) { + controller.enqueue( + encoder.encode(`data: ${JSON.stringify(data)}\n\n`), + ); + } + + try { + const anthropicMessages = convertToAnthropicMessages(messages); + let continueLoop = true; + /* Upper bound on model calls per request. NOTE(review): when the bound is reached while the model still asks for tools, the loop exits and done is sent without signalling the cut-off. */ const MAX_ITERATIONS = 10; + let iteration = 0; + + while (continueLoop && iteration < MAX_ITERATIONS) { + iteration++; + + /* One awaited model call per iteration; the reply is read as a whole before anything is sent to the client. */ const response = await anthropic.messages.create({ + model: "claude-sonnet-4-20250514", + max_tokens: 8192, + system: SYSTEM_PROMPT, + tools: TOOL_DEFINITIONS, + messages: anthropicMessages, + }); + + const toolUses: Anthropic.Messages.ToolUseBlock[] = []; + let 
textContent = ""; + + /* Split the reply into concatenated text and the list of tool_use blocks. */ for (const block of response.content) { + if (block.type === "text") textContent += block.text; + if (block.type === "tool_use") toolUses.push(block); + } + + if (textContent) sendData({ type: "text", text: textContent }); + + if (toolUses.length > 0) { + const toolResults: Anthropic.Messages.ToolResultBlockParam[] = []; + + /* Tools run one at a time, in the order they appear in the reply. NOTE(review): a throw from executeToolCall lands in the outer catch and ends the whole response with an error event. */ for (const toolUse of toolUses) { + sendData({ + type: "tool-status", + toolName: toolUse.name, + status: "running", + input: toolUse.input, + }); + + const toolResult = await executeToolCall( + toolUse.name, + toolUse.input as Record, + ); + + /* Artifact results are also pushed to the client as their own event, with defaults for any missing field. */ if (toolUse.name === "generate_artifact") { + const ar = toolResult.result as { + type?: string; + title?: string; + content?: string; + }; + sendData({ + type: "artifact", + id: nanoid(), + artifact_type: ar.type ?? "html", + title: ar.title ?? "Artifact", + content: ar.content ?? "", + }); + } + + /* Follow-up suggestions are spread into a followups event for the client. */ if (toolUse.name === "suggest_followups") { + sendData({ + type: "followups", + id: nanoid(), + ...(toolResult.result as Record), + }); + } + + sendData({ + type: "tool-status", + toolName: toolUse.name, + status: "completed", + }); + + toolResults.push({ + type: "tool_result", + tool_use_id: toolUse.id, + content: buildToolResultContent( + toolResult.result, + toolResult.imageRefs, + ), + }); + } + + /* Append the assistant turn and the matching tool results so the next model call sees them. */ anthropicMessages.push({ + role: "assistant", + content: response.content, + }); + anthropicMessages.push({ + role: "user", + content: toolResults, + }); + } + + /* Loop again only when the model stopped in order to call tools. */ continueLoop = response.stop_reason === "tool_use"; + } + + sendData({ type: "done" }); + } catch (error) { + sendData({ + type: "error", + message: + error instanceof Error ? 
error.message : "An error occurred", + }); + } finally { + /* Always close the stream, on success or failure. */ controller.close(); + } + }, + }); + + return new Response(stream, { + headers: { + "Content-Type": "text/event-stream", + "Cache-Control": "no-cache", + Connection: "keep-alive", + }, + }); +} diff --git a/app/api/images/[name]/route.ts b/app/api/images/[name]/route.ts new file mode 100644 index 000000000..751e148e2 --- /dev/null +++ b/app/api/images/[name]/route.ts @@ -0,0 +1,24 @@ +import fs from "fs"; +import path from "path"; +import { NextResponse } from "next/server"; + +/** Serves one PNG from data/images under the working directory. Names must consist of word characters, dots and hyphens and end in .png, otherwise the reply is 400; a missing file yields 404. NOTE(review): the file is checked and read with synchronous fs calls inside the request handler. */ export async function GET( + _req: Request, + { params }: { params: Promise<{ name: string }> }, +) { + const { name } = await params; + if (!/^[\w.-]+\.png$/.test(name)) { + return new NextResponse("Bad request", { status: 400 }); + } + const fp = path.join(process.cwd(), "data", "images", name); + if (!fs.existsSync(fp)) { + return new NextResponse("Not found", { status: 404 }); + } + const buf = fs.readFileSync(fp); + return new NextResponse(buf, { + headers: { + "Content-Type": "image/png", + "Cache-Control": "public, max-age=3600", + }, + }); +} diff --git a/app/api/knowledge-status/route.ts b/app/api/knowledge-status/route.ts new file mode 100644 index 000000000..8525b9e18 --- /dev/null +++ b/app/api/knowledge-status/route.ts @@ -0,0 +1,14 @@ +import fs from "fs"; +import path from "path"; +import { NextResponse } from "next/server"; + +/** Returns JSON with ready (whether data/knowledge.lance exists) and imageCount (number of .png entries in data/images, 0 when that directory is absent). */ export async function GET() { + const db = path.join(process.cwd(), "data", "knowledge.lance"); + const images = path.join(process.cwd(), "data", "images"); + const ready = fs.existsSync(db); + let imageCount = 0; + if (fs.existsSync(images)) { + imageCount = fs.readdirSync(images).filter((f) => f.endsWith(".png")).length; + } + return NextResponse.json({ ready, imageCount }); +} diff --git a/app/globals.css b/app/globals.css new file mode 100644 index 000000000..6fa1357d4 --- /dev/null +++ b/app/globals.css @@ -0,0 +1,14 @@ +@import "tailwindcss"; + +/* Theme colors exposed as CSS custom properties. */ :root { + --background: #0c0a09; + 
--foreground: #fafaf9; + --accent: #ea580c; + --muted: #78716c; +} + +body { + background: var(--background); + color: var(--foreground); + font-family: system-ui, -apple-system, Segoe UI, Roboto, sans-serif; +} diff --git a/app/layout.tsx b/app/layout.tsx new file mode 100644 index 000000000..5a695b503 --- /dev/null +++ b/app/layout.tsx @@ -0,0 +1,19 @@ +import type { Metadata } from "next"; +import "./globals.css"; + +/** Page title and description, typed as Next.js Metadata. */ export const metadata: Metadata = { + title: "OmniPro 220 Assistant", + description: "Multimodal welding assistant (Prox challenge)", +}; + +/** Root layout component; renders the children it is given. NOTE(review): the markup wrapped around children is not visible in this view. */ export default function RootLayout({ + children, +}: Readonly<{ + children: React.ReactNode; +}>) { + return ( + + {children} + + ); +} diff --git a/app/page.tsx b/app/page.tsx new file mode 100644 index 000000000..994e09475 --- /dev/null +++ b/app/page.tsx @@ -0,0 +1,5 @@ +import { ChatPage } from "@/components/chat/chat-page"; + +/** Home page component. NOTE(review): the returned element is not visible in this view; presumably the imported ChatPage - confirm. */ export default function Home() { + return ; +} diff --git a/benchmark/benchmarks.json b/benchmark/benchmarks.json new file mode 100644 index 000000000..b5dd12c16 --- /dev/null +++ b/benchmark/benchmarks.json @@ -0,0 +1,32 @@ +[ + { + "id": "duty-mig-200-240", + "query": "duty cycle MIG welding 200A 240V", + "must_include": ["25%", "200"], + "filter_source": "owner-manual" + }, + { + "id": "tig-ground-positive", + "query": "TIG ground clamp which socket positive negative", + "must_include": ["positive", "ground"], + "filter_source": "owner-manual" + }, + { + "id": "stick-polarity", + "query": "stick welding electrode holder socket positive or negative", + "must_include": ["positive", "electrode"], + "filter_source": "owner-manual" + }, + { + "id": "porosity-flux", + "query": "flux cored porosity causes troubleshooting", + "must_include": ["polarity", "gas"], + "filter_source": "owner-manual" + }, + { + "id": "wire-speed-range", + "query": "wire speed IPM range MIG", + "must_include": ["500", "50"], + "filter_source": "owner-manual" + } +] diff --git 
a/components/artifacts/artifact-frame.tsx b/components/artifacts/artifact-frame.tsx new file mode 100644 index 000000000..0509ecccc --- /dev/null +++ b/components/artifacts/artifact-frame.tsx @@ -0,0 +1,40 @@ +"use client"; + +import { useMemo } from "react"; + +export function ArtifactFrame(props: { + type: string; + title: string; + content: string; +}) { + const srcDoc = useMemo(() => { + if (props.type === "html") { + return props.content; + } + if (props.type === "svg") { + return `${props.content}`; + } + if (props.type === "mermaid") { + const esc = props.content + .replace(/&/g, "&") + .replace(//g, ">"); + return `
${esc}
`; + } + return `
${props.content}
`; + }, [props.type, props.content]); + + return ( +
+
+ {props.title} +
+