From 78d9eaca90461fd80c840d34b7048f16f8a5fa69 Mon Sep 17 00:00:00 2001 From: Benjamin Shafii Date: Thu, 21 May 2026 21:03:05 -0700 Subject: [PATCH 1/4] feat(app): add voice mode extension --- apps/app/src/app/constants.ts | 11 + apps/app/src/app/lib/openwork-server.ts | 16 + .../domains/session/chat/session-page.tsx | 92 +- .../session/settings/extensions-pane-slot.tsx | 49 ++ .../session/surface/session-surface.tsx | 18 + .../domains/session/voice/voice-panel.tsx | 795 ++++++++++++++++++ .../domains/settings/extension-registry.tsx | 8 + .../settings/openwork-voice-config.tsx | 107 +++ .../app/src/react-app/shell/session-route.tsx | 1 + .../src/react-app/shell/settings-route.tsx | 68 ++ .../app/src/react-app/shell/ui-state-store.ts | 2 +- apps/desktop/electron/main.mjs | 16 + apps/server/src/server.ts | 169 ++++ docs/mcp-ui-control-profile.md | 26 +- evals/README.md | 2 + evals/voice-mode-flows.md | 113 +++ packages/openwork-ui-mcp/index.mjs | 123 +++ 17 files changed, 1611 insertions(+), 5 deletions(-) create mode 100644 apps/app/src/react-app/domains/session/voice/voice-panel.tsx create mode 100644 apps/app/src/react-app/domains/settings/openwork-voice-config.tsx create mode 100644 evals/voice-mode-flows.md diff --git a/apps/app/src/app/constants.ts b/apps/app/src/app/constants.ts index fa52e09c0b..30544f79c7 100644 --- a/apps/app/src/app/constants.ts +++ b/apps/app/src/app/constants.ts @@ -166,6 +166,17 @@ export const MCP_QUICK_CONNECT: McpDirectoryInfo[] = [ iconSrc: "/ext-openai.svg", composerPrompt: "Use the OpenAI Image Gen extension to ", }, + { + id: "openwork-voice", + name: "Voice Mode", + serverName: "openwork-voice", + description: "Talk to OpenWork through a Realtime voice panel that drives the same semantic UI controls as OpenWork UI MCP.", + oauth: false, + kind: "extension", + iconSrc: "/openwork-mark.svg", + composerPrompt: "Use Voice Mode to ", + defaultEnabled: true, + }, { id: "ollama", name: "Ollama", diff --git a/apps/app/src/app/lib/openwork-server.ts b/apps/app/src/app/lib/openwork-server.ts index 9f09379174..570f15146c 100644 --- a/apps/app/src/app/lib/openwork-server.ts +++ b/apps/app/src/app/lib/openwork-server.ts @@ -1388,6 +1388,22 @@ export function createOpenworkServerClient(options: { baseUrl: string; token?: s method: "DELETE", timeoutMs: timeouts.config, }), + + createVoiceRealtimeSession: (payload?: { model?: string }) => + requestJson<{ + ok: true; + clientSecret: string; + expiresAt: number | null; + model: string; + transcriptionModel: string; + tools: string[]; + }>(baseUrl, "/voice/realtime/session", { + token, + hostToken, + method: "POST", + body: payload ?? {}, + timeoutMs: timeouts.config, + }), }; } diff --git a/apps/app/src/react-app/domains/session/chat/session-page.tsx b/apps/app/src/react-app/domains/session/chat/session-page.tsx index 5c3d2acb14..5c6cfdae3d 100644 --- a/apps/app/src/react-app/domains/session/chat/session-page.tsx +++ b/apps/app/src/react-app/domains/session/chat/session-page.tsx @@ -2,9 +2,10 @@ import type { CSSProperties } from "react"; import { useCallback, useEffect, useMemo, useRef, useState } from "react"; import { usePanelRef } from "react-resizable-panels"; -import { FileText, Globe, Settings2, Zap } from "lucide-react"; +import { FileText, Globe, Mic2, Settings2, Zap } from "lucide-react"; import { t } from "../../../../i18n"; +import { OPENWORK_EXTENSION_CATALOG } from "../../../../app/constants"; import { type OpenworkServerClient, type OpenworkServerStatus } from "../../../../app/lib/openwork-server"; import { getDisplaySessionTitle } from "../../../../app/lib/session-title"; import type { BootPhase } from "../../../../app/lib/startup-boot"; @@ -45,7 +46,10 @@ import { isElectronRuntime } from "../../../../app/utils"; import { BrowserPanel } from "../browser/browser-panel"; import { ArtifactPanel } from "../artifacts/artifact-panel"; import { isCollectibleArtifactTarget, isLocalhostBrowserTarget, type OpenTarget } from "../artifacts/open-target"; +import { VoicePanel } from "../voice/voice-panel"; import { useWorkspaceShellLayout } from "../../../shell/workspace-shell-layout"; +import { useControlAction, type OpenworkControlAction } from "../../../shell/control/control-provider"; +import { getExtensionId, isOpenWorkExtensionEnabled, OPENWORK_EXTENSION_STATE_CHANGED } from "../../settings/extension-state"; import { cn } from "@/lib/utils"; const STARTUP_SKELETON_ROWS = [ @@ -123,6 +127,7 @@ export type SessionPageProps = { clientConnected: boolean; openworkServerStatus: OpenworkServerStatus; openworkServerClient: OpenworkServerClient | null; + hostOpenworkServerClient?: OpenworkServerClient | null; openworkServerToken?: string | null; developerMode: boolean; headerStatus: string; @@ -225,6 +230,7 @@ export function SessionPage(props: SessionPageProps) { const [artifactTarget, setArtifactTarget] = useState(null); const [openTargets, setOpenTargets] = useState([]); const [hiddenAccessibleTargetIds, setHiddenAccessibleTargetIds] = useState>(() => new Set()); + const [, setExtensionStateVersion] = useState(0); const loadedHiddenTargetsKeyRef = useRef(null); const accessibleTargets = useMemo( () => openTargets.filter((target) => isTrackableAccessibleTarget(target) && !hiddenAccessibleTargetIds.has(target.id)), @@ -238,6 +244,12 @@ export function SessionPage(props: SessionPageProps) { const browserRailActive = activeSidePanel === "browser"; const artifactRailActive = activeSidePanel === "artifacts"; const extensionsRailActive = activeSidePanel === "extensions"; + const voiceRailActive = activeSidePanel === "voice"; + const voiceExtension = useMemo( + () => OPENWORK_EXTENSION_CATALOG.find((entry) => getExtensionId(entry) === "openwork-voice") ?? null, + [], + ); + const voiceExtensionEnabled = voiceExtension ? isOpenWorkExtensionEnabled(voiceExtension) : false; useReactRenderWatchdog("SessionPage", { selectedSessionId: props.selectedSessionId, @@ -364,6 +376,9 @@ export function SessionPage(props: SessionPageProps) { const openExtensionsRailPane = useCallback(() => { toggleCurrentSidePanel("extensions"); }, [toggleCurrentSidePanel]); + const openVoiceRailPane = useCallback(() => { + toggleCurrentSidePanel("voice"); + }, [toggleCurrentSidePanel]); const removeAccessibleTarget = useCallback((target: OpenTarget) => { setHiddenAccessibleTargetIds((current) => new Set(current).add(target.id)); setArtifactTarget((current) => current?.id === target.id ? null : current); @@ -393,8 +408,50 @@ export function SessionPage(props: SessionPageProps) { window.addEventListener("openwork-close-right-pane", handler); return () => window.removeEventListener("openwork-close-right-pane", handler); }, [setCurrentSidePanel]); + useEffect(() => { + const refresh = () => setExtensionStateVersion((value) => value + 1); + window.addEventListener(OPENWORK_EXTENSION_STATE_CHANGED, refresh); + window.addEventListener("storage", refresh); + return () => { + window.removeEventListener(OPENWORK_EXTENSION_STATE_CHANGED, refresh); + window.removeEventListener("storage", refresh); + }; + }, []); + useEffect(() => { + if (activeSidePanel === "voice" && !voiceExtensionEnabled) { + setCurrentSidePanel(null); + } + }, [activeSidePanel, setCurrentSidePanel, voiceExtensionEnabled]); const [showDelayedSessionLoadingState, setShowDelayedSessionLoadingState] = useState(false); + const openVoicePanelControlAction = useMemo(() => ( + voiceExtensionEnabled && props.selectedSessionId ? { + id: "voice.panel.open", + label: "Open Voice Mode", + description: "Open the Voice Mode right-side panel for the active session.", + sideEffect: "none", + execute: () => { + setCurrentSidePanel("voice"); + return { open: true }; + }, + } : null + ), [props.selectedSessionId, setCurrentSidePanel, voiceExtensionEnabled]); + useControlAction(openVoicePanelControlAction); + + const closeVoicePanelControlAction = useMemo(() => ( + voiceExtensionEnabled && activeSidePanel === "voice" ? { + id: "voice.panel.close", + label: "Close Voice Mode", + description: "Close the Voice Mode right-side panel.", + sideEffect: "none", + execute: () => { + setCurrentSidePanel(null); + return { open: false }; + }, + } : null + ), [activeSidePanel, setCurrentSidePanel, voiceExtensionEnabled]); + useControlAction(closeVoicePanelControlAction); + const selectedSessionTitle = useMemo( () => sessionTitleForId(props.sidebar.workspaceSessionGroups, props.selectedSessionId), [props.selectedSessionId, props.sidebar.workspaceSessionGroups], @@ -441,6 +498,12 @@ export function SessionPage(props: SessionPageProps) { selectedWorkspaceGroupError || ""; const showSelectedWorkspaceError = Boolean(selectedWorkspaceErrorMessage); + const rightPanelDefaultSize = activeSidePanel === "extensions" + ? `${Math.max(browserPanelDefaultWidth, 480)}px` + : activeSidePanel === "voice" + ? `${Math.max(browserPanelDefaultWidth, 380)}px` + : `${browserPanelDefaultWidth}px`; + const rightPanelMinSize = activeSidePanel === "extensions" ? "420px" : activeSidePanel === "voice" ? "360px" : "320px"; const reactSessionBaseUrl = props.opencodeBaseUrl?.trim() ?? ""; const reactSessionToken = @@ -822,8 +885,8 @@ export function SessionPage(props: SessionPageProps) { @@ -831,6 +894,12 @@ export function SessionPage(props: SessionPageProps) {
{props.settingsSlot}
+ ) : activeSidePanel === "voice" ? ( + ) : activeSidePanel === "artifacts" && visibleArtifactTarget && props.openworkServerClient && props.runtimeWorkspaceId ? ( ) : null} + {voiceExtensionEnabled ? ( + + ) : null} + ); +} + +export function VoicePanel(props: VoicePanelProps) { + const panelRef = useRef(null); + const peerRef = useRef(null); + const channelRef = useRef(null); + const streamRef = useRef(null); + const remoteAudioRef = useRef(null); + const timelineEndRef = useRef(null); + const assistantBufferRef = useRef(""); + const responseInProgressRef = useRef(false); + const pendingResponseRef = useRef(false); + const micMutedRef = useRef(false); + const [status, setStatus] = useState("idle"); + const [statusText, setStatusText] = useState("Ready for voice control."); + const [micMuted, setMicMuted] = useState(false); + const [entries, setEntries] = useState([]); + const [textCommand, setTextCommand] = useState(""); + const [latestUserTranscript, setLatestUserTranscript] = useState(""); + const [assistantPreview, setAssistantPreview] = useState(""); + const [expandedEntries, setExpandedEntries] = useState>(() => new Set()); + const connected = status === "listening" || status === "speaking" || status === "muted"; + + useEffect(() => { + micMutedRef.current = micMuted; + }, [micMuted]); + + const addEntry = useCallback((role: VoiceTimelineEntry["role"], text: string, options: { toolName?: string; error?: boolean } = {}) => { + const trimmed = text.trim(); + if ((role === "user" || role === "assistant") && !trimmed) return; + setEntries((current) => [ + ...current, + { + id: `voice-${Date.now()}-${current.length}`, + role, + text: trimmed || options.toolName || "Tool call", + toolName: options.toolName, + error: options.error, + at: Date.now(), + }, + ].slice(-120)); + }, []); + + const setRuntimeStatus = useCallback((nextStatus: VoiceStatus, text?: string) => { + setStatus(nextStatus); + setStatusText(text ?? ( + nextStatus === "connecting" ? "Connecting to OpenAI Realtime..." : + nextStatus === "listening" ? "Listening. Ask OpenWork to act." : + nextStatus === "speaking" ? "OpenWork is speaking..." : + nextStatus === "muted" ? "Connected, microphone muted." : + nextStatus === "error" ? "Voice Mode needs attention." : + "Ready for voice control." + )); + }, []); + + const disconnectRealtime = useCallback((silent = false) => { + try { streamRef.current?.getTracks().forEach((track) => track.stop()); } catch {} + streamRef.current = null; + try { channelRef.current?.close(); } catch {} + channelRef.current = null; + try { peerRef.current?.close(); } catch {} + peerRef.current = null; + try { remoteAudioRef.current?.remove(); } catch {} + remoteAudioRef.current = null; + assistantBufferRef.current = ""; + responseInProgressRef.current = false; + pendingResponseRef.current = false; + setMicMuted(false); + setAssistantPreview(""); + setRuntimeStatus("idle"); + if (!silent) addEntry("system", "Voice session stopped."); + recordInspectorEvent("voice.disconnected", { sessionId: props.sessionId }); + }, [addEntry, props.sessionId, setRuntimeStatus]); + + useEffect(() => { + timelineEndRef.current?.scrollIntoView({ block: "end", behavior: "smooth" }); + }, [entries.length, assistantPreview]); + + const toggleEntryExpanded = useCallback((id: string) => { + setExpandedEntries((current) => { + const next = new Set(current); + if (next.has(id)) next.delete(id); + else next.add(id); + return next; + }); + }, []); + + const requestRealtimeResponse = useCallback((channel: RTCDataChannel, deferIfBusy = true) => { + if (responseInProgressRef.current) { + if (deferIfBusy) pendingResponseRef.current = true; + return false; + } + responseInProgressRef.current = true; + channel.send(JSON.stringify({ type: "response.create", response: { output_modalities: ["audio"] } })); + return true; + }, []); + + const handleRealtimeMessage = useCallback(async (raw: string) => { + let event: unknown = null; + try { + event = JSON.parse(raw); + } catch { + return; + } + const type = readString(event, "type"); + + if (type === "input_audio_buffer.speech_started") { + setRuntimeStatus("listening", "Hearing you..."); + return; + } + if (type === "response.created") { + responseInProgressRef.current = true; + pendingResponseRef.current = false; + return; + } + if (type === "conversation.item.input_audio_transcription.completed") { + const transcript = readString(event, "transcript").trim(); + if (transcript && isMeaningfulTranscript(transcript)) { + setLatestUserTranscript(transcript); + addEntry("user", transcript); + recordInspectorEvent("voice.transcript", { sessionId: props.sessionId, transcript }); + } + return; + } + if (type === "response.output_text.delta" || type === "response.output_audio_transcript.delta" || type === "response.audio_transcript.delta") { + const delta = readString(event, "delta"); + assistantBufferRef.current += delta; + setAssistantPreview(assistantBufferRef.current.trim()); + setRuntimeStatus("speaking"); + return; + } + if (type === "response.function_call_arguments.done") { + const toolName = readString(event, "name") || "tool"; + const callId = readString(event, "call_id"); + const args = parseJsonRecord(readString(event, "arguments")); + addEntry("tool", toolName, { toolName }); + const output = await executeOpenWorkTool(toolName, args); + if (isRecord(output) && output.ok === false) { + const error = typeof output.error === "string" ? output.error : "Tool failed."; + addEntry("tool", error, { toolName, error: true }); + } + const channel = channelRef.current; + if (!callId || !channel || channel.readyState !== "open") return; + channel.send(JSON.stringify({ + type: "conversation.item.create", + item: { type: "function_call_output", call_id: callId, output: safeJson(output) }, + })); + requestRealtimeResponse(channel); + return; + } + if (type === "response.done") { + const text = assistantBufferRef.current.trim(); + if (text) addEntry("assistant", text); + assistantBufferRef.current = ""; + setAssistantPreview(""); + responseInProgressRef.current = false; + const channel = channelRef.current; + if (pendingResponseRef.current && channel?.readyState === "open") { + pendingResponseRef.current = false; + requestRealtimeResponse(channel, false); + } else { + setRuntimeStatus(micMutedRef.current ? "muted" : "listening"); + } + return; + } + if (type === "error") { + responseInProgressRef.current = false; + const error = readRecord(event, "error"); + const message = typeof error.message === "string" ? error.message : "Realtime returned an error."; + addEntry("system", message, { error: true }); + setRuntimeStatus("error", message); + } + }, [addEntry, props.sessionId, requestRealtimeResponse, setRuntimeStatus]); + + const connectRealtime = useCallback(async (audioInput = true) => { + const client = props.client; + if (!client) throw new Error("OpenWork host connection is not ready."); + if (audioInput && !navigator.mediaDevices?.getUserMedia) throw new Error("Microphone capture is unavailable in this runtime."); + + disconnectRealtime(true); + setRuntimeStatus("connecting", "Minting Realtime session..."); + const realtimeSession = await client.createVoiceRealtimeSession(); + + const peer = new RTCPeerConnection(); + peerRef.current = peer; + if (audioInput) { + setRuntimeStatus("connecting", "Requesting microphone..."); + const stream = await navigator.mediaDevices.getUserMedia({ + audio: { echoCancellation: true, noiseSuppression: true, autoGainControl: true }, + }); + streamRef.current = stream; + for (const track of stream.getAudioTracks()) { + peer.addTrack(track, stream); + } + } else { + peer.addTransceiver("audio", { direction: "recvonly" }); + } + + const audio = document.createElement("audio"); + audio.autoplay = true; + audio.style.display = "none"; + document.body.appendChild(audio); + remoteAudioRef.current = audio; + peer.ontrack = (event) => { + audio.srcObject = event.streams[0] ?? null; + }; + + const channel = peer.createDataChannel("oai-events"); + channelRef.current = channel; + channel.addEventListener("message", (event) => void handleRealtimeMessage(String(event.data))); + channel.addEventListener("close", () => { + if (channelRef.current === channel) setRuntimeStatus("idle"); + }); + + const offer = await peer.createOffer(); + await peer.setLocalDescription(offer); + if (!offer.sdp) throw new Error("Realtime offer did not include SDP."); + + setRuntimeStatus("connecting", "Opening voice channel..."); + const sdpResponse = await desktopFetch("https://api.openai.com/v1/realtime/calls", { + method: "POST", + headers: { Authorization: `Bearer ${realtimeSession.clientSecret}`, "Content-Type": "application/sdp" }, + body: offer.sdp, + }); + if (!sdpResponse.ok) { + const detail = await sdpResponse.text().catch(() => ""); + throw new Error(`OpenAI Realtime SDP failed: ${sdpResponse.status} ${detail}`.trim()); + } + await peer.setRemoteDescription({ type: "answer", sdp: await sdpResponse.text() }); + await waitForDataChannelOpen(channel); + setRuntimeStatus("listening", audioInput ? undefined : "Connected. Send a typed voice command."); + addEntry("system", `Realtime connected with ${realtimeSession.model} and ${realtimeSession.tools.length} OpenWork tools.`); + recordInspectorEvent("voice.connected", { sessionId: props.sessionId, model: realtimeSession.model }); + }, [addEntry, disconnectRealtime, handleRealtimeMessage, props.client, props.sessionId, setRuntimeStatus]); + + const startVoice = useCallback(async () => { + try { + await connectRealtime(true); + return true; + } catch (error) { + disconnectRealtime(true); + const message = error instanceof Error ? error.message : String(error); + setRuntimeStatus("error", message); + addEntry("system", message, { error: true }); + return { ok: false, error: message }; + } + }, [addEntry, connectRealtime, disconnectRealtime, setRuntimeStatus]); + + const stopVoice = useCallback(() => { + disconnectRealtime(); + return true; + }, [disconnectRealtime]); + + const toggleMic = useCallback(() => { + const nextMuted = !micMutedRef.current; + micMutedRef.current = nextMuted; + setMicMuted(nextMuted); + streamRef.current?.getAudioTracks().forEach((track) => { + track.enabled = !nextMuted; + }); + setRuntimeStatus(nextMuted ? "muted" : "listening"); + return { muted: nextMuted }; + }, [setRuntimeStatus]); + + const sendTextCommand = useCallback(async (text: string) => { + const value = text.trim(); + if (!value) return { ok: false, error: "Text command required." }; + if (!channelRef.current || channelRef.current.readyState !== "open") { + try { + await connectRealtime(false); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + setRuntimeStatus("error", message); + addEntry("system", message, { error: true }); + return { ok: false, error: message }; + } + } + const channel = channelRef.current; + if (!channel || channel.readyState !== "open") return { ok: false, error: "Realtime channel is not open." }; + addEntry("user", value); + channel.send(JSON.stringify({ + type: "conversation.item.create", + item: { type: "message", role: "user", content: [{ type: "input_text", text: value }] }, + })); + requestRealtimeResponse(channel); + return { ok: true, text: value }; + }, [addEntry, connectRealtime, requestRealtimeResponse, setRuntimeStatus]); + + const injectAudio = useCallback(async (args: unknown) => { + const audio = voiceAudioArgument(args); + if (!audio) return { ok: false, error: "pcm16Base64 audio is required." }; + if (!channelRef.current || channelRef.current.readyState !== "open") { + const started = await startVoice(); + if (isRecord(started) && started.ok === false) return started; + } + const channel = channelRef.current; + if (!channel || channel.readyState !== "open") return { ok: false, error: "Realtime channel is not open." }; + addEntry("system", "Injected deterministic audio into the Realtime input buffer."); + channel.send(JSON.stringify({ type: "input_audio_buffer.append", audio })); + channel.send(JSON.stringify({ type: "input_audio_buffer.commit" })); + requestRealtimeResponse(channel); + return { ok: true, bytesBase64: audio.length }; + }, [addEntry, requestRealtimeResponse, startVoice]); + + const injectTranscript = useCallback(async (args: unknown) => { + const text = voiceTextArgument(args); + setLatestUserTranscript(text); + addEntry("user", text); + window.dispatchEvent(new CustomEvent("openwork:voice-transcript", { detail: { text } })); + recordInspectorEvent("voice.inject_transcript", { sessionId: props.sessionId, text }); + return { ok: true, transcript: text }; + }, [addEntry, props.sessionId]); + + useEffect(() => { + const dispose = publishInspectorSlice("voice", () => ({ + sessionId: props.sessionId, + status, + statusText, + connected, + micMuted, + latestUserTranscript, + assistantPreview, + textCommandLength: textCommand.length, + timeline: entries.slice(-12).map((entry) => ({ + role: entry.role, + text: entry.text, + toolName: entry.toolName, + error: entry.error === true, + at: entry.at, + })), + })); + return dispose; + }, [assistantPreview, connected, entries, latestUserTranscript, micMuted, props.sessionId, status, statusText, textCommand.length]); + + useEffect(() => () => disconnectRealtime(true), [disconnectRealtime]); + + const startAction = useMemo(() => ({ + id: "voice.start", + label: "Start Voice Mode", + description: "Connect the Voice Mode panel to OpenAI Realtime and start listening.", + sideEffect: "external", + disabled: !props.client || connected || status === "connecting", + targetRef: panelRef, + execute: startVoice, + }), [connected, props.client, startVoice, status]); + useControlAction(startAction); + + const stopAction = useMemo(() => ({ + id: "voice.stop", + label: "Stop Voice Mode", + description: "Disconnect the active Voice Mode Realtime session.", + sideEffect: "external", + disabled: !connected, + targetRef: panelRef, + execute: stopVoice, + }), [connected, stopVoice]); + useControlAction(stopAction); + + const muteAction = useMemo(() => ({ + id: "voice.toggle_mute", + label: micMuted ? "Unmute Voice Mode" : "Mute Voice Mode", + description: "Toggle the microphone track without closing the Realtime session.", + sideEffect: "none", + disabled: !connected, + targetRef: panelRef, + execute: toggleMic, + }), [connected, micMuted, toggleMic]); + useControlAction(muteAction); + + const injectTranscriptAction = useMemo(() => ({ + id: "voice.inject_transcript", + label: "Inject a voice transcript", + description: "Deterministic eval hook: add a transcript to Voice Mode and place it in the composer.", + sideEffect: "mutation", + requiresArgs: true, + args: [{ name: "text", type: "string", required: true, description: "Transcript text to inject." }], + previewArgs: { text: DEFAULT_TEXT_COMMAND }, + targetRef: panelRef, + execute: injectTranscript, + }), [injectTranscript]); + useControlAction(injectTranscriptAction); + + const sendTextAction = useMemo(() => ({ + id: "voice.send_text", + label: "Send text through Voice Mode", + description: "Send a deterministic text command through the active OpenAI Realtime voice session.", + sideEffect: "external", + requiresArgs: true, + args: [{ name: "text", type: "string", required: true, description: "Text command to send through the Realtime model." }], + previewArgs: { text: DEFAULT_TEXT_COMMAND }, + targetRef: panelRef, + execute: (args) => sendTextCommand(voiceTextArgument(args)), + }), [sendTextCommand]); + useControlAction(sendTextAction); + + const injectAudioAction = useMemo(() => ({ + id: "voice.inject_audio", + label: "Inject voice audio", + description: "Deterministic eval hook: send PCM16 audio through the active OpenAI Realtime input buffer.", + sideEffect: "external", + requiresArgs: true, + args: [{ name: "pcm16Base64", type: "string", required: true, description: "Base64 encoded PCM16 mono audio." }], + targetRef: panelRef, + execute: injectAudio, + }), [injectAudio]); + useControlAction(injectAudioAction); + + const statusAction = useMemo(() => ({ + id: "voice.status", + label: "Read Voice Mode status", + description: "Return the Voice Mode runtime state for tests and agents.", + sideEffect: "none", + execute: () => ({ status, statusText, connected, micMuted, latestUserTranscript, assistantPreview }), + }), [assistantPreview, connected, latestUserTranscript, micMuted, status, statusText]); + useControlAction(statusAction); + + return ( +
+
+
+
+ + + Voice Mode +
+
Realtime voice over OpenWork UI MCP controls
+
+ +
+ + +
+ + +
+
{statusText}
+
+ Say things like "type a follow-up", "send it", or "read the latest session message". +
+
+ + {entries.length === 0 && !assistantPreview ? ( +
+ {VOICE_SUGGESTIONS.map((suggestion) => ( + + ))} +
+ ) : null} + +
+ + + +
+ + {!props.client ? ( + + + Host connection required + + + Voice Mode needs the local OpenWork server so it can mint short-lived Realtime client secrets without exposing your API key to the renderer. + + + ) : null} + + {assistantPreview ? ( + + +
+ +
+
+
Rendering response
+
+ {assistantPreview} +
+
+
+
+ ) : null} + + + + + + Typed voice command + + + +