From 0d5b822149d424f02e70c5521d8df2fc06cc7e9f Mon Sep 17 00:00:00 2001 From: yuhan Date: Sat, 4 Apr 2026 12:53:16 +0000 Subject: [PATCH 01/21] feat(voice): pluggable voice backend with Gemini Live API support Add a strategy-based voice backend architecture that allows switching between ElevenLabs ConvAI and Gemini Live API via VOICE_BACKEND env var. - Shared: VoiceBackendType, Gemini Live config builder, tool definitions - Hub: GET /voice/backend discovery, POST /voice/gemini-token with proxy support - Web: GeminiLiveVoiceSession (WebSocket + AudioWorklet audio pipeline), VoiceBackendSession dynamic switcher with React.lazy() code splitting, Gemini tool adapter bridging existing client tools - Tests: hub route tests, pcmUtils round-trip tests, toolAdapter tests - Zero changes to existing ElevenLabs code paths --- hub/src/web/routes/voice.test.ts | 94 +++++ hub/src/web/routes/voice.ts | 33 +- shared/src/voice.ts | 72 ++++ web/src/api/client.ts | 17 + web/src/api/voice.ts | 42 +++ web/src/components/SessionChat.tsx | 6 +- web/src/realtime/GeminiLiveVoiceSession.tsx | 351 ++++++++++++++++++ web/src/realtime/VoiceBackendSession.tsx | 44 +++ web/src/realtime/gemini/audioPlayer.ts | 75 ++++ web/src/realtime/gemini/audioRecorder.ts | 98 +++++ .../realtime/gemini/pcm-recorder.worklet.ts | 35 ++ web/src/realtime/gemini/pcmUtils.test.ts | 60 +++ web/src/realtime/gemini/pcmUtils.ts | 39 ++ web/src/realtime/gemini/toolAdapter.test.ts | 28 ++ web/src/realtime/gemini/toolAdapter.ts | 70 ++++ web/src/realtime/index.ts | 4 +- web/tsconfig.json | 3 +- 17 files changed, 1065 insertions(+), 6 deletions(-) create mode 100644 hub/src/web/routes/voice.test.ts create mode 100644 web/src/realtime/GeminiLiveVoiceSession.tsx create mode 100644 web/src/realtime/VoiceBackendSession.tsx create mode 100644 web/src/realtime/gemini/audioPlayer.ts create mode 100644 web/src/realtime/gemini/audioRecorder.ts create mode 100644 web/src/realtime/gemini/pcm-recorder.worklet.ts create mode 100644 
web/src/realtime/gemini/pcmUtils.test.ts create mode 100644 web/src/realtime/gemini/pcmUtils.ts create mode 100644 web/src/realtime/gemini/toolAdapter.test.ts create mode 100644 web/src/realtime/gemini/toolAdapter.ts diff --git a/hub/src/web/routes/voice.test.ts b/hub/src/web/routes/voice.test.ts new file mode 100644 index 0000000000..a7553ff866 --- /dev/null +++ b/hub/src/web/routes/voice.test.ts @@ -0,0 +1,94 @@ +import { describe, test, expect, beforeEach, afterEach } from 'bun:test' +import { Hono } from 'hono' +import type { WebAppEnv } from '../middleware/auth' +import { createVoiceRoutes } from './voice' + +function createApp() { + const app = new Hono() + app.route('/api', createVoiceRoutes()) + return app +} + +describe('GET /api/voice/backend', () => { + const originalEnv = process.env.VOICE_BACKEND + + afterEach(() => { + if (originalEnv === undefined) { + delete process.env.VOICE_BACKEND + } else { + process.env.VOICE_BACKEND = originalEnv + } + }) + + test('returns elevenlabs by default', async () => { + delete process.env.VOICE_BACKEND + const app = createApp() + const res = await app.request('/api/voice/backend') + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('elevenlabs') + }) + + test('returns gemini-live when configured', async () => { + process.env.VOICE_BACKEND = 'gemini-live' + const app = createApp() + const res = await app.request('/api/voice/backend') + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('gemini-live') + }) + + test('falls back to elevenlabs for unknown values', async () => { + process.env.VOICE_BACKEND = 'unknown-backend' + const app = createApp() + const res = await app.request('/api/voice/backend') + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('elevenlabs') + }) +}) + +describe('POST /api/voice/gemini-token', () => { + const 
origGemini = process.env.GEMINI_API_KEY + const origGoogle = process.env.GOOGLE_API_KEY + + afterEach(() => { + if (origGemini === undefined) delete process.env.GEMINI_API_KEY + else process.env.GEMINI_API_KEY = origGemini + if (origGoogle === undefined) delete process.env.GOOGLE_API_KEY + else process.env.GOOGLE_API_KEY = origGoogle + }) + + test('returns 400 when no API key configured', async () => { + delete process.env.GEMINI_API_KEY + delete process.env.GOOGLE_API_KEY + const app = createApp() + const res = await app.request('/api/voice/gemini-token', { method: 'POST' }) + expect(res.status).toBe(400) + const body = await res.json() as { allowed: boolean; error: string } + expect(body.allowed).toBe(false) + expect(body.error).toContain('not configured') + }) + + test('returns GEMINI_API_KEY when set', async () => { + process.env.GEMINI_API_KEY = 'test-gemini-key' + delete process.env.GOOGLE_API_KEY + const app = createApp() + const res = await app.request('/api/voice/gemini-token', { method: 'POST' }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; apiKey: string } + expect(body.allowed).toBe(true) + expect(body.apiKey).toBe('test-gemini-key') + }) + + test('falls back to GOOGLE_API_KEY', async () => { + delete process.env.GEMINI_API_KEY + process.env.GOOGLE_API_KEY = 'test-google-key' + const app = createApp() + const res = await app.request('/api/voice/gemini-token', { method: 'POST' }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; apiKey: string } + expect(body.allowed).toBe(true) + expect(body.apiKey).toBe('test-google-key') + }) +}) diff --git a/hub/src/web/routes/voice.ts b/hub/src/web/routes/voice.ts index 1a55f83639..8ba2c7b11f 100644 --- a/hub/src/web/routes/voice.ts +++ b/hub/src/web/routes/voice.ts @@ -4,8 +4,10 @@ import type { WebAppEnv } from '../middleware/auth' import { ELEVENLABS_API_BASE, VOICE_AGENT_NAME, - buildVoiceAgentConfig + buildVoiceAgentConfig, + 
DEFAULT_VOICE_BACKEND } from '@hapi/protocol/voice' +import type { VoiceBackendType } from '@hapi/protocol/voice' const tokenRequestSchema = z.object({ customAgentId: z.string().optional(), @@ -116,6 +118,35 @@ async function getOrCreateAgentId(apiKey: string): Promise { export function createVoiceRoutes(): Hono { const app = new Hono() + // Return the configured voice backend type + app.get('/voice/backend', (c) => { + const raw = process.env.VOICE_BACKEND + const backend: VoiceBackendType = + raw === 'gemini-live' ? 'gemini-live' : DEFAULT_VOICE_BACKEND + return c.json({ backend }) + }) + + // Get Gemini API key for Gemini Live voice sessions + // Gemini Live API does not support ephemeral tokens, so we proxy the key. + // The key is short-lived in the browser session and never persisted client-side. + app.post('/voice/gemini-token', async (c) => { + const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY + if (!apiKey) { + return c.json({ + allowed: false, + error: 'Gemini API key not configured (set GEMINI_API_KEY or GOOGLE_API_KEY)' + }, 400) + } + + return c.json({ + allowed: true, + apiKey, + // Optional overrides for proxy/relay setups + wsUrl: process.env.GEMINI_LIVE_WS_URL || undefined, + baseUrl: process.env.GEMINI_API_BASE || undefined + }) + }) + // Get ElevenLabs ConvAI conversation token app.post('/voice/token', async (c) => { const json = await c.req.json().catch(() => null) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 6751f0eba4..abd3b35381 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -255,3 +255,75 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { } } } + +export type VoiceBackendType = 'elevenlabs' | 'gemini-live' + +export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'elevenlabs' + +export const GEMINI_LIVE_MODEL = 'gemini-3.1-flash-live-preview' + +export interface VoiceToolDefinition { + name: string + description: string + parameters: { + type: 'object' + required: string[] + 
properties: Record + } +} + +type VoiceToolSource = Pick<(typeof VOICE_TOOLS)[number], 'name' | 'description' | 'parameters'> + +function cloneVoiceToolDefinition(tool: VoiceToolSource): VoiceToolDefinition { + const properties: VoiceToolDefinition['parameters']['properties'] = {} + + for (const [key, value] of Object.entries(tool.parameters.properties)) { + properties[key] = { + type: value.type, + description: value.description + } + } + + return { + name: tool.name, + description: tool.description, + parameters: { + type: 'object', + required: [...tool.parameters.required], + properties + } + } +} + +export const VOICE_TOOL_DEFINITIONS: VoiceToolDefinition[] = VOICE_TOOLS.map(cloneVoiceToolDefinition) + +export type GeminiLiveFunctionDeclaration = VoiceToolDefinition + +export interface GeminiLiveConfig { + model: string + systemInstruction: string + tools: Array<{ + functionDeclarations: GeminiLiveFunctionDeclaration[] + }> + responseModalities: ['AUDIO'] +} + +export function buildGeminiLiveFunctionDeclarations(): GeminiLiveFunctionDeclaration[] { + return VOICE_TOOLS.map(cloneVoiceToolDefinition) +} + +export function buildGeminiLiveConfig(): GeminiLiveConfig { + return { + model: GEMINI_LIVE_MODEL, + systemInstruction: VOICE_SYSTEM_PROMPT, + tools: [ + { + functionDeclarations: buildGeminiLiveFunctionDeclarations() + } + ], + responseModalities: ['AUDIO'] + } +} diff --git a/web/src/api/client.ts b/web/src/api/client.ts index 7f1083c8af..cd651e3f70 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -443,4 +443,21 @@ export class ApiClient { body: JSON.stringify(options || {}) }) } + + async fetchVoiceBackend(): Promise<{ backend: string }> { + return await this.request('/api/voice/backend') + } + + async fetchGeminiToken(): Promise<{ + allowed: boolean + apiKey?: string + wsUrl?: string + baseUrl?: string + error?: string + }> { + return await this.request('/api/voice/gemini-token', { + method: 'POST', + body: JSON.stringify({}) + }) + } } 
diff --git a/web/src/api/voice.ts b/web/src/api/voice.ts index 66cee443f1..c79605434c 100644 --- a/web/src/api/voice.ts +++ b/web/src/api/voice.ts @@ -15,6 +15,7 @@ import { VOICE_AGENT_NAME, buildVoiceAgentConfig } from '@hapi/protocol/voice' +import type { VoiceBackendType } from '@hapi/protocol/voice' export interface VoiceTokenResponse { allowed: boolean @@ -160,3 +161,44 @@ export async function createOrUpdateHapiAgent(apiKey: string): Promise { + try { + const result = await api.fetchVoiceBackend() + const backend = result.backend === 'gemini-live' ? 'gemini-live' : 'elevenlabs' + return { backend } as VoiceBackendResponse + } catch { + return { backend: 'elevenlabs' } + } +} + +/** + * Fetch a Gemini API key from the hub for Gemini Live voice sessions. + */ +export async function fetchGeminiToken(api: ApiClient): Promise { + try { + return await api.fetchGeminiToken() + } catch (error) { + return { + allowed: false, + error: error instanceof Error ? error.message : 'Network error' + } + } +} diff --git a/web/src/components/SessionChat.tsx b/web/src/components/SessionChat.tsx index 2a60c62b29..62ea61a14c 100644 --- a/web/src/components/SessionChat.tsx +++ b/web/src/components/SessionChat.tsx @@ -27,7 +27,7 @@ import { TeamPanel } from '@/components/TeamPanel' import { usePlatform } from '@/hooks/usePlatform' import { useSessionActions } from '@/hooks/mutations/useSessionActions' import { useVoiceOptional } from '@/lib/voice-context' -import { RealtimeVoiceSession, registerSessionStore, registerVoiceHooksStore, voiceHooks } from '@/realtime' +import { VoiceBackendSession, registerSessionStore, registerVoiceHooksStore, voiceHooks } from '@/realtime' import { isRemoteTerminalSupported } from '@/utils/terminalSupport' export function SessionChat(props: { @@ -429,9 +429,9 @@ export function SessionChat(props: { - {/* Voice session component - renders nothing but initializes ElevenLabs */} + {/* Voice session component - renders nothing but initializes voice 
backend */} {voice && ( - { + cleanup() + state.statusCallback?.('connecting') + + // Get API key from hub + const tokenResp = await fetchGeminiToken(this.api) + if (!tokenResp.allowed || !tokenResp.apiKey) { + const msg = tokenResp.error ?? 'Gemini API key not available' + state.statusCallback?.('error', msg) + throw new Error(msg) + } + state.apiKey = tokenResp.apiKey + state.wsBaseUrl = tokenResp.wsUrl || null + + // Request microphone + let permissionStream: MediaStream | null = null + try { + permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true }) + } catch (error) { + state.statusCallback?.('error', 'Microphone permission denied') + throw error + } finally { + permissionStream?.getTracks().forEach((t) => t.stop()) + } + + // Connect WebSocket + const wsBase = state.wsBaseUrl || DEFAULT_GEMINI_LIVE_WS_BASE + const wsUrl = `${wsBase}?key=${encodeURIComponent(state.apiKey)}` + const ws = new WebSocket(wsUrl) + state.ws = ws + + return new Promise((resolve, reject) => { + let setupDone = false + + ws.onopen = () => { + if (DEBUG) console.log('[GeminiLive] WebSocket connected, sending setup') + + const liveConfig = buildGeminiLiveConfig() + const setupMessage = { + setup: { + model: `models/${liveConfig.model}`, + generationConfig: { + responseModalities: ['AUDIO'], + speechConfig: { + voiceConfig: { + prebuiltVoiceConfig: { voiceName: 'Aoede' } + } + } + }, + systemInstruction: { + parts: [{ text: liveConfig.systemInstruction }] + }, + tools: liveConfig.tools.map((t) => ({ + functionDeclarations: t.functionDeclarations.map((fd) => ({ + name: fd.name, + description: fd.description, + parameters: fd.parameters + })) + })) + } + } + + ws.send(JSON.stringify(setupMessage)) + } + + ws.onmessage = async (event) => { + let data: Record + try { + if (event.data instanceof Blob) { + const text = await event.data.text() + data = JSON.parse(text) as Record + } else { + data = JSON.parse(event.data as string) as Record + } + } catch { + if (DEBUG) 
console.warn('[GeminiLive] Failed to parse message') + return + } + + // Setup complete + if (data.setupComplete && !setupDone) { + setupDone = true + if (DEBUG) console.log('[GeminiLive] Setup complete') + state.statusCallback?.('connected') + + // Start audio capture + startAudioCapture() + + // Send initial context + first message prompt + sendClientContent(config.initialContext + ? `[Context] ${config.initialContext}\n\nPlease greet the user briefly.` + : 'Please greet the user briefly.') + + resolve() + return + } + + // Server content (audio / text / turn complete) + const serverContent = data.serverContent as { + modelTurn?: { parts?: Array<{ inlineData?: { data: string; mimeType: string }; text?: string }> } + turnComplete?: boolean + } | undefined + + if (serverContent) { + if (serverContent.modelTurn?.parts) { + for (const part of serverContent.modelTurn.parts) { + if (part.inlineData?.data) { + state.player?.enqueue(part.inlineData.data) + } + if (part.text && DEBUG) { + console.log('[GeminiLive] Text:', part.text) + } + } + } + if (serverContent.turnComplete && DEBUG) { + console.log('[GeminiLive] Turn complete') + } + } + + // Tool calls + const toolCall = data.toolCall as { + functionCalls?: Array<{ name: string; args: Record; id: string }> + } | undefined + + if (toolCall?.functionCalls && toolCall.functionCalls.length > 0) { + if (DEBUG) console.log('[GeminiLive] Tool calls:', toolCall.functionCalls.map((c) => c.name)) + + const responses = await handleGeminiFunctionCalls( + toolCall.functionCalls as GeminiFunctionCall[] + ) + + // Send tool responses back + if (state.ws?.readyState === WebSocket.OPEN) { + state.ws.send(JSON.stringify({ + toolResponse: { + functionResponses: responses.map((r) => ({ + id: r.id, + name: r.name, + response: r.response + })) + } + })) + } + } + } + + ws.onerror = (event) => { + console.error('[GeminiLive] WebSocket error:', event) + if (!setupDone) { + state.statusCallback?.('error', 'WebSocket connection failed') + 
reject(new Error('WebSocket connection failed')) + } + } + + ws.onclose = (event) => { + if (DEBUG) console.log('[GeminiLive] WebSocket closed:', event.code, event.reason) + cleanup() + resetRealtimeSessionState() + state.statusCallback?.('disconnected') + } + }) + } + + async endSession(): Promise { + cleanup() + resetRealtimeSessionState() + state.statusCallback?.('disconnected') + } + + sendTextMessage(message: string): void { + sendClientContent(message) + } + + sendContextualUpdate(update: string): void { + // Send as a system-like context message + sendClientContent(`[System Context Update] ${update}`) + } +} + +function sendClientContent(text: string): void { + if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return + state.ws.send(JSON.stringify({ + clientContent: { + turns: [{ role: 'user', parts: [{ text }] }], + turnComplete: true + } + })) +} + +function sendAudioChunk(base64Pcm: string): void { + if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return + state.ws.send(JSON.stringify({ + realtimeInput: { + mediaChunks: [{ + mimeType: 'audio/pcm;rate=16000', + data: base64Pcm + }] + } + })) +} + +function startAudioCapture(): void { + state.player = new GeminiAudioPlayer() + state.recorder = new GeminiAudioRecorder() + + state.recorder.start( + (pcm16Chunk) => sendAudioChunk(pcm16Chunk), + (error) => { + console.error('[GeminiLive] Audio capture error:', error) + state.statusCallback?.('error', 'Microphone error') + } + ) +} + +// --- React component --- + +export interface GeminiLiveVoiceSessionProps { + api: ApiClient + micMuted?: boolean + onStatusChange?: StatusCallback + getSession?: (sessionId: string) => Session | null + sendMessage?: (sessionId: string, message: string) => void + approvePermission?: (sessionId: string, requestId: string) => Promise + denyPermission?: (sessionId: string, requestId: string) => Promise +} + +export function GeminiLiveVoiceSession({ + api, + micMuted = false, + onStatusChange, + getSession, + 
sendMessage, + approvePermission, + denyPermission +}: GeminiLiveVoiceSessionProps) { + const hasRegistered = useRef(false) + + // Store status callback + useEffect(() => { + state.statusCallback = onStatusChange || null + return () => { state.statusCallback = null } + }, [onStatusChange]) + + // Register session store for client tools + useEffect(() => { + if (getSession && sendMessage && approvePermission && denyPermission) { + registerSessionStore({ + getSession: (sessionId: string) => + getSession(sessionId) as { agentState?: { requests?: Record } } | null, + sendMessage, + approvePermission, + denyPermission + }) + } + }, [getSession, sendMessage, approvePermission, denyPermission]) + + // Register voice session once + useEffect(() => { + if (!hasRegistered.current) { + try { + registerVoiceSession(new GeminiLiveVoiceSessionImpl(api)) + hasRegistered.current = true + } catch (error) { + console.error('[GeminiLive] Failed to register voice session:', error) + } + } + }, [api]) + + // Sync mic mute state + useEffect(() => { + if (state.recorder) { + state.recorder.setMuted(micMuted) + } + }, [micMuted]) + + // Handle barge-in: clear audio queue when user starts speaking + const handleBargeIn = useCallback(() => { + if (state.player?.isPlaying()) { + state.player.clearQueue() + } + }, []) + + // Cleanup on unmount + useEffect(() => { + return () => { + cleanup() + } + }, []) + + return null +} diff --git a/web/src/realtime/VoiceBackendSession.tsx b/web/src/realtime/VoiceBackendSession.tsx new file mode 100644 index 0000000000..66e24d6abe --- /dev/null +++ b/web/src/realtime/VoiceBackendSession.tsx @@ -0,0 +1,44 @@ +import { lazy, Suspense, useEffect, useState } from 'react' +import { RealtimeVoiceSession } from './RealtimeVoiceSession' +import type { RealtimeVoiceSessionProps } from './RealtimeVoiceSession' +import type { GeminiLiveVoiceSessionProps } from './GeminiLiveVoiceSession' +import { fetchVoiceBackend } from '@/api/voice' +import type { ApiClient } from 
'@/api/client' +import type { VoiceBackendType } from '@hapi/protocol/voice' + +// Lazy-load Gemini session to avoid bundling @google/genai when using ElevenLabs +const GeminiLiveVoiceSession = lazy(() => + import('./GeminiLiveVoiceSession').then((m) => ({ default: m.GeminiLiveVoiceSession })) +) + +export type VoiceBackendSessionProps = RealtimeVoiceSessionProps & { + api: ApiClient +} + +/** + * Dynamically selects the voice session component based on the hub's configured backend. + * Queries GET /voice/backend once on mount and renders the appropriate component. + */ +export function VoiceBackendSession(props: VoiceBackendSessionProps) { + const [backend, setBackend] = useState(null) + + useEffect(() => { + let cancelled = false + fetchVoiceBackend(props.api).then((resp) => { + if (!cancelled) setBackend(resp.backend) + }) + return () => { cancelled = true } + }, [props.api]) + + if (!backend) return null + + if (backend === 'gemini-live') { + return ( + + + + ) + } + + return +} diff --git a/web/src/realtime/gemini/audioPlayer.ts b/web/src/realtime/gemini/audioPlayer.ts new file mode 100644 index 0000000000..23d1d341e4 --- /dev/null +++ b/web/src/realtime/gemini/audioPlayer.ts @@ -0,0 +1,75 @@ +import { base64ToArrayBuffer, pcm16ToFloat32 } from './pcmUtils'; + +export class GeminiAudioPlayer { + private audioContext: AudioContext; + private ownsContext: boolean; + private lastEndTime: number = 0; + private activeSources: AudioBufferSourceNode[] = []; + + constructor(audioContext?: AudioContext) { + if (audioContext) { + this.audioContext = audioContext; + this.ownsContext = false; + } else { + this.audioContext = new AudioContext({ sampleRate: 24000 }); + this.ownsContext = true; + } + this.lastEndTime = this.audioContext.currentTime; + } + + enqueue(base64Pcm: string): void { + if (this.audioContext.state === 'suspended') { + this.audioContext.resume(); + } + + const arrayBuffer = base64ToArrayBuffer(base64Pcm); + const float32Data = 
pcm16ToFloat32(arrayBuffer); + + if (float32Data.length === 0) return; + + const audioBuffer = this.audioContext.createBuffer(1, float32Data.length, 24000); + audioBuffer.copyToChannel(new Float32Array(float32Data), 0); + + const source = this.audioContext.createBufferSource(); + source.buffer = audioBuffer; + source.connect(this.audioContext.destination); + + const startTime = Math.max(this.audioContext.currentTime, this.lastEndTime); + + source.onended = () => { + const index = this.activeSources.indexOf(source); + if (index > -1) { + this.activeSources.splice(index, 1); + } + }; + + source.start(startTime); + this.activeSources.push(source); + + this.lastEndTime = startTime + audioBuffer.duration; + } + + clearQueue(): void { + this.activeSources.forEach(source => { + try { + source.stop(); + } catch (e) { + // Ignore if already stopped + } + source.disconnect(); + }); + this.activeSources = []; + this.lastEndTime = this.audioContext.currentTime; + } + + isPlaying(): boolean { + return this.lastEndTime > this.audioContext.currentTime; + } + + dispose(): void { + this.clearQueue(); + if (this.ownsContext && this.audioContext.state !== 'closed') { + this.audioContext.close(); + } + } +} diff --git a/web/src/realtime/gemini/audioRecorder.ts b/web/src/realtime/gemini/audioRecorder.ts new file mode 100644 index 0000000000..8a66bc3af3 --- /dev/null +++ b/web/src/realtime/gemini/audioRecorder.ts @@ -0,0 +1,98 @@ +import workletUrl from './pcm-recorder.worklet.ts?url'; +import { float32ToPcm16, arrayBufferToBase64 } from './pcmUtils'; + +export class GeminiAudioRecorder { + private audioContext: AudioContext | null = null; + private mediaStream: MediaStream | null = null; + private sourceNode: MediaStreamAudioSourceNode | null = null; + private workletNode: AudioWorkletNode | null = null; + private scriptNode: ScriptProcessorNode | null = null; + + async start(onChunk: (base64Pcm: string) => void, onError?: (error: Error) => void): Promise { + try { + this.mediaStream = 
await navigator.mediaDevices.getUserMedia({ + audio: { sampleRate: 16000, channelCount: 1 } + }); + + this.mediaStream.getTracks().forEach((track) => { + track.onended = () => { + if (onError) onError(new Error('Microphone disconnected')); + }; + }); + + this.audioContext = new AudioContext({ sampleRate: 16000 }); + if (this.audioContext.state === 'suspended') { + await this.audioContext.resume(); + } + + this.sourceNode = this.audioContext.createMediaStreamSource(this.mediaStream); + + try { + await this.audioContext.audioWorklet.addModule(workletUrl); + this.workletNode = new AudioWorkletNode(this.audioContext, 'pcm-recorder-processor'); + this.workletNode.port.onmessage = (event) => { + const pcm16 = float32ToPcm16(event.data.samples); + const base64 = arrayBufferToBase64(pcm16); + onChunk(base64); + }; + this.sourceNode.connect(this.workletNode); + } catch (e) { + console.warn('AudioWorklet failed, falling back to ScriptProcessorNode', e); + this.scriptNode = this.audioContext.createScriptProcessor(4096, 1, 1); + this.scriptNode.onaudioprocess = (event) => { + const inputData = event.inputBuffer.getChannelData(0); + const pcm16 = float32ToPcm16(new Float32Array(inputData)); + const base64 = arrayBufferToBase64(pcm16); + onChunk(base64); + }; + this.sourceNode.connect(this.scriptNode); + this.scriptNode.connect(this.audioContext.destination); + } + } catch (e) { + if (onError) onError(e instanceof Error ? 
e : new Error(String(e))); + throw e; + } + } + + stop(): void { + if (this.mediaStream) { + this.mediaStream.getTracks().forEach(track => { + track.onended = null; + track.stop(); + }); + this.mediaStream = null; + } + + if (this.scriptNode) { + this.scriptNode.disconnect(); + this.scriptNode = null; + } + + if (this.workletNode) { + this.workletNode.disconnect(); + this.workletNode = null; + } + + if (this.sourceNode) { + this.sourceNode.disconnect(); + this.sourceNode = null; + } + + if (this.audioContext) { + this.audioContext.close(); + this.audioContext = null; + } + } + + setMuted(muted: boolean): void { + if (this.mediaStream) { + this.mediaStream.getAudioTracks().forEach(track => { + track.enabled = !muted; + }); + } + } + + dispose(): void { + this.stop(); + } +} diff --git a/web/src/realtime/gemini/pcm-recorder.worklet.ts b/web/src/realtime/gemini/pcm-recorder.worklet.ts new file mode 100644 index 0000000000..404f65445b --- /dev/null +++ b/web/src/realtime/gemini/pcm-recorder.worklet.ts @@ -0,0 +1,35 @@ +// AudioWorklet processor runs in a separate scope with its own globals. +// These declarations satisfy TypeScript without pulling in DOM lib types. 
+declare class AudioWorkletProcessor { + readonly port: MessagePort + constructor() +} +declare function registerProcessor(name: string, ctor: new () => AudioWorkletProcessor): void + +class PcmRecorderProcessor extends AudioWorkletProcessor { + private buffer: Float32Array; + private bufferSize = 4096; + private bufferIndex = 0; + + constructor() { + super(); + this.buffer = new Float32Array(this.bufferSize); + } + + process(inputs: Float32Array[][]): boolean { + const input = inputs[0]; + if (input && input.length > 0) { + const channel = input[0]; + for (let i = 0; i < channel.length; i++) { + this.buffer[this.bufferIndex++] = channel[i]; + if (this.bufferIndex >= this.bufferSize) { + this.port.postMessage({ samples: this.buffer.slice() }); + this.bufferIndex = 0; + } + } + } + return true; + } +} + +registerProcessor('pcm-recorder-processor', PcmRecorderProcessor); diff --git a/web/src/realtime/gemini/pcmUtils.test.ts b/web/src/realtime/gemini/pcmUtils.test.ts new file mode 100644 index 0000000000..2e0be05c3f --- /dev/null +++ b/web/src/realtime/gemini/pcmUtils.test.ts @@ -0,0 +1,60 @@ +import { describe, test, expect } from 'bun:test' +import { + float32ToPcm16, + pcm16ToFloat32, + arrayBufferToBase64, + base64ToArrayBuffer +} from './pcmUtils' + +describe('pcmUtils', () => { + describe('float32ToPcm16 / pcm16ToFloat32 round-trip', () => { + test('preserves signal within quantization error', () => { + const input = new Float32Array([0, 0.5, -0.5, 1.0, -1.0]) + const pcm16 = float32ToPcm16(input) + const output = pcm16ToFloat32(pcm16) + + expect(output.length).toBe(input.length) + for (let i = 0; i < input.length; i++) { + expect(Math.abs(output[i] - input[i])).toBeLessThan(0.001) + } + }) + + test('clamps values outside [-1, 1]', () => { + const input = new Float32Array([2.0, -2.0]) + const pcm16 = float32ToPcm16(input) + const output = pcm16ToFloat32(pcm16) + + expect(Math.abs(output[0] - 1.0)).toBeLessThan(0.001) + expect(Math.abs(output[1] - 
(-1.0))).toBeLessThan(0.001) + }) + + test('handles empty input', () => { + const input = new Float32Array(0) + const pcm16 = float32ToPcm16(input) + expect(pcm16.byteLength).toBe(0) + const output = pcm16ToFloat32(pcm16) + expect(output.length).toBe(0) + }) + }) + + describe('arrayBufferToBase64 / base64ToArrayBuffer round-trip', () => { + test('preserves binary data', () => { + const original = new Uint8Array([0, 1, 127, 128, 255]) + const base64 = arrayBufferToBase64(original.buffer) + const restored = new Uint8Array(base64ToArrayBuffer(base64)) + + expect(restored.length).toBe(original.length) + for (let i = 0; i < original.length; i++) { + expect(restored[i]).toBe(original[i]) + } + }) + + test('handles empty buffer', () => { + const empty = new ArrayBuffer(0) + const base64 = arrayBufferToBase64(empty) + expect(base64).toBe('') + const restored = base64ToArrayBuffer(base64) + expect(restored.byteLength).toBe(0) + }) + }) +}) diff --git a/web/src/realtime/gemini/pcmUtils.ts b/web/src/realtime/gemini/pcmUtils.ts new file mode 100644 index 0000000000..67e2928fc0 --- /dev/null +++ b/web/src/realtime/gemini/pcmUtils.ts @@ -0,0 +1,39 @@ +export function float32ToPcm16(samples: Float32Array): ArrayBuffer { + const buffer = new ArrayBuffer(samples.length * 2); + const view = new DataView(buffer); + for (let i = 0; i < samples.length; i++) { + let s = Math.max(-1, Math.min(1, samples[i])); + s = s < 0 ? s * 0x8000 : s * 0x7FFF; + view.setInt16(i * 2, s, true); + } + return buffer; +} + +export function pcm16ToFloat32(buffer: ArrayBuffer): Float32Array { + const int16Array = new Int16Array(buffer); + const float32Array = new Float32Array(int16Array.length); + for (let i = 0; i < int16Array.length; i++) { + const s = int16Array[i]; + float32Array[i] = s < 0 ? 
s / 0x8000 : s / 0x7FFF; + } + return float32Array; +} + +export function arrayBufferToBase64(buffer: ArrayBuffer): string { + let binary = ''; + const bytes = new Uint8Array(buffer); + const len = bytes.byteLength; + for (let i = 0; i < len; i++) { + binary += String.fromCharCode(bytes[i]); + } + return btoa(binary); +} + +export function base64ToArrayBuffer(base64: string): ArrayBuffer { + const binary = atob(base64); + const bytes = new Uint8Array(binary.length); + for (let i = 0; i < binary.length; i++) { + bytes[i] = binary.charCodeAt(i); + } + return bytes.buffer; +} diff --git a/web/src/realtime/gemini/toolAdapter.test.ts b/web/src/realtime/gemini/toolAdapter.test.ts new file mode 100644 index 0000000000..5d98d6d4d0 --- /dev/null +++ b/web/src/realtime/gemini/toolAdapter.test.ts @@ -0,0 +1,28 @@ +import { describe, test, expect } from 'bun:test' +import { handleGeminiFunctionCall, handleGeminiFunctionCalls } from './toolAdapter' +import type { GeminiFunctionCall } from './toolAdapter' + +describe('toolAdapter', () => { + test('returns error for unknown tool', async () => { + const call: GeminiFunctionCall = { + name: 'unknownTool', + args: {}, + id: 'call-1' + } + const resp = await handleGeminiFunctionCall(call) + expect(resp.name).toBe('unknownTool') + expect(resp.id).toBe('call-1') + expect(resp.response.result).toContain('unknown tool') + }) + + test('handles multiple calls in parallel', async () => { + const calls: GeminiFunctionCall[] = [ + { name: 'unknownA', args: {}, id: 'a' }, + { name: 'unknownB', args: {}, id: 'b' } + ] + const responses = await handleGeminiFunctionCalls(calls) + expect(responses.length).toBe(2) + expect(responses[0].id).toBe('a') + expect(responses[1].id).toBe('b') + }) +}) diff --git a/web/src/realtime/gemini/toolAdapter.ts b/web/src/realtime/gemini/toolAdapter.ts new file mode 100644 index 0000000000..dd44e4fb11 --- /dev/null +++ b/web/src/realtime/gemini/toolAdapter.ts @@ -0,0 +1,70 @@ +import { realtimeClientTools } from 
'../realtimeClientTools'
+
+/**
+ * Gemini Live API function call from server.
+ * Matches the `toolCall` shape in a BidiGenerateContent serverMessage.
+ */
+export interface GeminiFunctionCall {
+  name: string
+  args: Record<string, unknown>
+  id: string
+}
+
+/**
+ * Response sent back to Gemini Live via `toolResponse`.
+ */
+export interface GeminiFunctionResponse {
+  name: string
+  id: string
+  response: { result: string }
+}
+
+type ClientToolHandler = (parameters: unknown) => Promise<string>
+
+const toolHandlers: Record<string, ClientToolHandler> = {
+  messageCodingAgent: realtimeClientTools.messageCodingAgent,
+  processPermissionRequest: realtimeClientTools.processPermissionRequest
+}
+
+/**
+ * Execute a Gemini Live function call using the existing client tool handlers.
+ * Returns a GeminiFunctionResponse ready to send back over the WebSocket.
+ */
+export async function handleGeminiFunctionCall(
+  call: GeminiFunctionCall
+): Promise<GeminiFunctionResponse> {
+  const handler = toolHandlers[call.name]
+
+  if (!handler) {
+    return {
+      name: call.name,
+      id: call.id,
+      response: { result: `error (unknown tool: ${call.name})` }
+    }
+  }
+
+  try {
+    const result = await handler(call.args)
+    return {
+      name: call.name,
+      id: call.id,
+      response: { result }
+    }
+  } catch (error) {
+    const message = error instanceof Error ? error.message : 'unknown error'
+    return {
+      name: call.name,
+      id: call.id,
+      response: { result: `error (${message})` }
+    }
+  }
+}
+
+/**
+ * Process multiple function calls in parallel and return all responses.
+ */
+export async function handleGeminiFunctionCalls(
+  calls: GeminiFunctionCall[]
+): Promise<GeminiFunctionResponse[]> {
+  return Promise.all(calls.map(handleGeminiFunctionCall))
+}
diff --git a/web/src/realtime/index.ts b/web/src/realtime/index.ts
index a7fa2fbe99..68e1f7aaa9 100644
--- a/web/src/realtime/index.ts
+++ b/web/src/realtime/index.ts
@@ -15,8 +15,10 @@ export {
 // Client tools
 export { realtimeClientTools, registerSessionStore } from './realtimeClientTools'
 
-// Voice session component
+// Voice session components
 export { RealtimeVoiceSession, type RealtimeVoiceSessionProps } from './RealtimeVoiceSession'
+export { GeminiLiveVoiceSession, type GeminiLiveVoiceSessionProps } from './GeminiLiveVoiceSession'
+export { VoiceBackendSession, type VoiceBackendSessionProps } from './VoiceBackendSession'
 
 // Voice hooks
 export { voiceHooks, registerVoiceHooksStore } from './hooks/voiceHooks'
diff --git a/web/tsconfig.json b/web/tsconfig.json
index 8b0682a4bb..de7bcdca50 100644
--- a/web/tsconfig.json
+++ b/web/tsconfig.json
@@ -11,5 +11,6 @@
       "@/*": ["./src/*"]
     }
   },
-  "include": ["src"]
+  "include": ["src"],
+  "exclude": ["src/**/*.test.ts", "src/**/*.test.tsx", "src/**/*.spec.ts", "src/**/*.spec.tsx"]
 }

From 09520c2fe1c897357d17c54ddd744c3e75b3293f Mon Sep 17 00:00:00 2001
From: yuhan
Date: Sat, 4 Apr 2026 12:57:06 +0000
Subject: [PATCH 02/21] feat(voice): switch default language to Chinese

- System prompt instructs assistant to respond in Mandarin
- First message changed to Chinese greeting
- ElevenLabs language set to 'zh'
---
 shared/src/voice.ts | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/shared/src/voice.ts b/shared/src/voice.ts
index abd3b35381..a316750b09 100644
--- a/shared/src/voice.ts
+++ b/shared/src/voice.ts
@@ -136,9 +136,17 @@ For builds, tests, or large file operations:
 - Treat garbled input as phonetic hints and ask for clarification
 - Correct yourself immediately if you realize you made an error
 - Keep conversations forward-moving
with fresh insights -- Assume a technical software developer audience` +- Assume a technical software developer audience -export const VOICE_FIRST_MESSAGE = "Hey! Hapi here." +# Language + +IMPORTANT: Always respond in Chinese (Mandarin). Use natural spoken Chinese. +- Greet users in Chinese +- Summarize technical content in Chinese +- Use English only for proper nouns, tool names, and code identifiers +- Keep the same warm, concise conversational style in Chinese` + +export const VOICE_FIRST_MESSAGE = "嗨!我是 Hapi 语音助手,有什么可以帮你的?" export const VOICE_TOOLS = [ { @@ -223,7 +231,7 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { conversation_config: { agent: { first_message: VOICE_FIRST_MESSAGE, - language: 'en', + language: 'zh', prompt: { prompt: VOICE_SYSTEM_PROMPT, llm: 'gemini-2.5-flash', From 99a60e51eb6a19905dc53d32cbbcb93335132678 Mon Sep 17 00:00:00 2001 From: yuhan Date: Sat, 4 Apr 2026 13:11:23 +0000 Subject: [PATCH 03/21] fix(voice): use inline Blob URL for AudioWorklet instead of ?url import Vite inlined the worklet as a data URI with wrong MIME type (video/mp2t) and uncompiled TypeScript, causing AudioWorklet.addModule() to fail. Use Blob URL with plain JS source instead. --- web/src/realtime/gemini/audioRecorder.ts | 38 ++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/web/src/realtime/gemini/audioRecorder.ts b/web/src/realtime/gemini/audioRecorder.ts index 8a66bc3af3..b1c01a7c47 100644 --- a/web/src/realtime/gemini/audioRecorder.ts +++ b/web/src/realtime/gemini/audioRecorder.ts @@ -1,6 +1,37 @@ -import workletUrl from './pcm-recorder.worklet.ts?url'; import { float32ToPcm16, arrayBufferToBase64 } from './pcmUtils'; +// Inline worklet source to avoid Vite bundling issues with ?url imports. +// AudioWorklet.addModule() requires a URL to valid JS, so we create a Blob URL. 
+const WORKLET_SOURCE = ` +class PcmRecorderProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this.buffer = new Float32Array(4096); + this.idx = 0; + } + process(inputs) { + const input = inputs[0]; + if (input && input.length > 0) { + const channel = input[0]; + for (let i = 0; i < channel.length; i++) { + this.buffer[this.idx++] = channel[i]; + if (this.idx >= 4096) { + this.port.postMessage({ samples: this.buffer.slice() }); + this.idx = 0; + } + } + } + return true; + } +} +registerProcessor('pcm-recorder-processor', PcmRecorderProcessor); +`; + +function createWorkletUrl(): string { + const blob = new Blob([WORKLET_SOURCE], { type: 'application/javascript' }); + return URL.createObjectURL(blob); +} + export class GeminiAudioRecorder { private audioContext: AudioContext | null = null; private mediaStream: MediaStream | null = null; @@ -28,7 +59,10 @@ export class GeminiAudioRecorder { this.sourceNode = this.audioContext.createMediaStreamSource(this.mediaStream); try { + const workletUrl = createWorkletUrl(); await this.audioContext.audioWorklet.addModule(workletUrl); + URL.revokeObjectURL(workletUrl); + this.workletNode = new AudioWorkletNode(this.audioContext, 'pcm-recorder-processor'); this.workletNode.port.onmessage = (event) => { const pcm16 = float32ToPcm16(event.data.samples); @@ -37,7 +71,7 @@ export class GeminiAudioRecorder { }; this.sourceNode.connect(this.workletNode); } catch (e) { - console.warn('AudioWorklet failed, falling back to ScriptProcessorNode', e); + console.warn('[GeminiLive] AudioWorklet failed, falling back to ScriptProcessorNode', e); this.scriptNode = this.audioContext.createScriptProcessor(4096, 1, 1); this.scriptNode.onaudioprocess = (event) => { const inputData = event.inputBuffer.getChannelData(0); From 47494549a28bd99fcbfcb3a1c382e6847a103b07 Mon Sep 17 00:00:00 2001 From: yuhan Date: Sat, 4 Apr 2026 13:31:02 +0000 Subject: [PATCH 04/21] fix(voice): switch to gemini-2.5-flash-native-audio-latest model 
gemini-3.1-flash-live-preview does not accept clientContent text input, only audio input. gemini-2.5-flash-native-audio-latest supports both. --- shared/src/voice.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index a316750b09..26c9714a10 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -268,7 +268,7 @@ export type VoiceBackendType = 'elevenlabs' | 'gemini-live' export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'elevenlabs' -export const GEMINI_LIVE_MODEL = 'gemini-3.1-flash-live-preview' +export const GEMINI_LIVE_MODEL = 'gemini-2.5-flash-native-audio-latest' export interface VoiceToolDefinition { name: string From 964ea2e8fe1965130a4f890a7e700792eef02830 Mon Sep 17 00:00:00 2001 From: yuhan Date: Sat, 4 Apr 2026 14:07:59 +0000 Subject: [PATCH 05/21] feat(voice): add Qwen Realtime as third voice backend option - Shared: add 'qwen-realtime' backend type, model/voice constants - Hub: POST /voice/qwen-token route (DASHSCOPE_API_KEY / QWEN_API_KEY) - Web: QwenVoiceSession using DashScope Realtime WebSocket API (OpenAI-compatible protocol: session.update, input_audio_buffer, response.audio.delta, function calling via conversation.item.create) - VoiceBackendSession: lazy-load Qwen component - Tests: qwen-token route tests (3 cases) Switch via VOICE_BACKEND=qwen-realtime + DASHSCOPE_API_KEY=xxx --- hub/src/web/routes/voice.test.ts | 54 ++++ hub/src/web/routes/voice.ts | 21 +- shared/src/voice.ts | 5 +- web/src/api/client.ts | 12 + web/src/api/voice.ts | 25 +- web/src/realtime/QwenVoiceSession.tsx | 377 +++++++++++++++++++++++ web/src/realtime/VoiceBackendSession.tsx | 14 +- web/src/realtime/index.ts | 1 + 8 files changed, 504 insertions(+), 5 deletions(-) create mode 100644 web/src/realtime/QwenVoiceSession.tsx diff --git a/hub/src/web/routes/voice.test.ts b/hub/src/web/routes/voice.test.ts index a7553ff866..da989374c2 100644 --- a/hub/src/web/routes/voice.test.ts +++ 
b/hub/src/web/routes/voice.test.ts @@ -38,6 +38,15 @@ describe('GET /api/voice/backend', () => { expect(body.backend).toBe('gemini-live') }) + test('returns qwen-realtime when configured', async () => { + process.env.VOICE_BACKEND = 'qwen-realtime' + const app = createApp() + const res = await app.request('/api/voice/backend') + expect(res.status).toBe(200) + const body = await res.json() as { backend: string } + expect(body.backend).toBe('qwen-realtime') + }) + test('falls back to elevenlabs for unknown values', async () => { process.env.VOICE_BACKEND = 'unknown-backend' const app = createApp() @@ -92,3 +101,48 @@ describe('POST /api/voice/gemini-token', () => { expect(body.apiKey).toBe('test-google-key') }) }) + +describe('POST /api/voice/qwen-token', () => { + const origDash = process.env.DASHSCOPE_API_KEY + const origQwen = process.env.QWEN_API_KEY + + afterEach(() => { + if (origDash === undefined) delete process.env.DASHSCOPE_API_KEY + else process.env.DASHSCOPE_API_KEY = origDash + if (origQwen === undefined) delete process.env.QWEN_API_KEY + else process.env.QWEN_API_KEY = origQwen + }) + + test('returns 400 when no API key configured', async () => { + delete process.env.DASHSCOPE_API_KEY + delete process.env.QWEN_API_KEY + const app = createApp() + const res = await app.request('/api/voice/qwen-token', { method: 'POST' }) + expect(res.status).toBe(400) + const body = await res.json() as { allowed: boolean; error: string } + expect(body.allowed).toBe(false) + expect(body.error).toContain('not configured') + }) + + test('returns DASHSCOPE_API_KEY when set', async () => { + process.env.DASHSCOPE_API_KEY = 'test-dash-key' + delete process.env.QWEN_API_KEY + const app = createApp() + const res = await app.request('/api/voice/qwen-token', { method: 'POST' }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; apiKey: string } + expect(body.allowed).toBe(true) + expect(body.apiKey).toBe('test-dash-key') + }) + + test('falls back 
to QWEN_API_KEY', async () => { + delete process.env.DASHSCOPE_API_KEY + process.env.QWEN_API_KEY = 'test-qwen-key' + const app = createApp() + const res = await app.request('/api/voice/qwen-token', { method: 'POST' }) + expect(res.status).toBe(200) + const body = await res.json() as { allowed: boolean; apiKey: string } + expect(body.allowed).toBe(true) + expect(body.apiKey).toBe('test-qwen-key') + }) +}) diff --git a/hub/src/web/routes/voice.ts b/hub/src/web/routes/voice.ts index 8ba2c7b11f..f71b652116 100644 --- a/hub/src/web/routes/voice.ts +++ b/hub/src/web/routes/voice.ts @@ -122,7 +122,9 @@ export function createVoiceRoutes(): Hono { app.get('/voice/backend', (c) => { const raw = process.env.VOICE_BACKEND const backend: VoiceBackendType = - raw === 'gemini-live' ? 'gemini-live' : DEFAULT_VOICE_BACKEND + raw === 'gemini-live' ? 'gemini-live' + : raw === 'qwen-realtime' ? 'qwen-realtime' + : DEFAULT_VOICE_BACKEND return c.json({ backend }) }) @@ -147,6 +149,23 @@ export function createVoiceRoutes(): Hono { }) }) + // Get Qwen (DashScope) API key for Qwen Realtime voice sessions + app.post('/voice/qwen-token', async (c) => { + const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY + if (!apiKey) { + return c.json({ + allowed: false, + error: 'DashScope API key not configured (set DASHSCOPE_API_KEY or QWEN_API_KEY)' + }, 400) + } + + return c.json({ + allowed: true, + apiKey, + wsUrl: process.env.QWEN_REALTIME_WS_URL || undefined + }) + }) + // Get ElevenLabs ConvAI conversation token app.post('/voice/token', async (c) => { const json = await c.req.json().catch(() => null) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 26c9714a10..6da0669d14 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -264,7 +264,10 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { } } -export type VoiceBackendType = 'elevenlabs' | 'gemini-live' +export type VoiceBackendType = 'elevenlabs' | 'gemini-live' | 'qwen-realtime' + +export 
const QWEN_REALTIME_MODEL = 'qwen3.5-omni-plus-realtime' +export const QWEN_REALTIME_VOICE = 'Cherry' export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'elevenlabs' diff --git a/web/src/api/client.ts b/web/src/api/client.ts index cd651e3f70..b199870258 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -448,6 +448,18 @@ export class ApiClient { return await this.request('/api/voice/backend') } + async fetchQwenToken(): Promise<{ + allowed: boolean + apiKey?: string + wsUrl?: string + error?: string + }> { + return await this.request('/api/voice/qwen-token', { + method: 'POST', + body: JSON.stringify({}) + }) + } + async fetchGeminiToken(): Promise<{ allowed: boolean apiKey?: string diff --git a/web/src/api/voice.ts b/web/src/api/voice.ts index c79605434c..5e532eec3f 100644 --- a/web/src/api/voice.ts +++ b/web/src/api/voice.ts @@ -164,6 +164,27 @@ export async function createOrUpdateHapiAgent(apiKey: string): Promise { + try { + return await api.fetchQwenToken() + } catch (error) { + return { + allowed: false, + error: error instanceof Error ? error.message : 'Network error' + } + } +} + export interface VoiceBackendResponse { backend: VoiceBackendType } @@ -182,7 +203,9 @@ export interface GeminiTokenResponse { export async function fetchVoiceBackend(api: ApiClient): Promise { try { const result = await api.fetchVoiceBackend() - const backend = result.backend === 'gemini-live' ? 'gemini-live' : 'elevenlabs' + const backend = result.backend === 'gemini-live' ? 'gemini-live' + : result.backend === 'qwen-realtime' ? 
'qwen-realtime' + : 'elevenlabs' return { backend } as VoiceBackendResponse } catch { return { backend: 'elevenlabs' } diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx new file mode 100644 index 0000000000..0debd14bf8 --- /dev/null +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -0,0 +1,377 @@ +import { useEffect, useRef, useCallback } from 'react' +import { registerVoiceSession, resetRealtimeSessionState } from './RealtimeSession' +import { registerSessionStore } from './realtimeClientTools' +import { fetchQwenToken } from '@/api/voice' +import { GeminiAudioRecorder } from './gemini/audioRecorder' +import { GeminiAudioPlayer } from './gemini/audioPlayer' +import { realtimeClientTools } from './realtimeClientTools' +import { + QWEN_REALTIME_MODEL, + QWEN_REALTIME_VOICE, + VOICE_SYSTEM_PROMPT, + VOICE_TOOL_DEFINITIONS +} from '@hapi/protocol/voice' +import type { VoiceSession, VoiceSessionConfig, StatusCallback } from './types' +import type { ApiClient } from '@/api/client' +import type { Session } from '@/types/api' + +const DEBUG = import.meta.env.DEV + +// DashScope Realtime WebSocket endpoint +const DEFAULT_QWEN_WS_BASE = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime' + +interface QwenState { + ws: WebSocket | null + recorder: GeminiAudioRecorder | null + player: GeminiAudioPlayer | null + statusCallback: StatusCallback | null + apiKey: string | null + wsBaseUrl: string | null +} + +const state: QwenState = { + ws: null, + recorder: null, + player: null, + statusCallback: null, + apiKey: null, + wsBaseUrl: null +} + +let eventCounter = 0 +function nextEventId(): string { + return `evt_${++eventCounter}` +} + +function cleanup() { + if (state.recorder) { + state.recorder.dispose() + state.recorder = null + } + if (state.player) { + state.player.dispose() + state.player = null + } + if (state.ws) { + if (state.ws.readyState === WebSocket.OPEN || state.ws.readyState === WebSocket.CONNECTING) { + state.ws.close() + } + 
state.ws = null
+  }
+}
+
+function sendEvent(type: string, payload?: Record<string, unknown>): void {
+  if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return
+  state.ws.send(JSON.stringify({
+    event_id: nextEventId(),
+    type,
+    ...payload
+  }))
+}
+
+class QwenVoiceSessionImpl implements VoiceSession {
+  private api: ApiClient
+
+  constructor(api: ApiClient) {
+    this.api = api
+  }
+
+  async startSession(config: VoiceSessionConfig): Promise<void> {
+    cleanup()
+    state.statusCallback?.('connecting')
+
+    // Get API key from hub
+    const tokenResp = await fetchQwenToken(this.api)
+    if (!tokenResp.allowed || !tokenResp.apiKey) {
+      const msg = tokenResp.error ?? 'DashScope API key not available'
+      state.statusCallback?.('error', msg)
+      throw new Error(msg)
+    }
+    state.apiKey = tokenResp.apiKey
+    state.wsBaseUrl = tokenResp.wsUrl || null
+
+    // Request microphone
+    let permissionStream: MediaStream | null = null
+    try {
+      permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true })
+    } catch (error) {
+      state.statusCallback?.('error', 'Microphone permission denied')
+      throw error
+    } finally {
+      permissionStream?.getTracks().forEach((t) => t.stop())
+    }
+
+    // Connect WebSocket
+    // DashScope uses Authorization header, but browser WebSocket doesn't support custom headers.
+    // Use URL query param for API key (DashScope also supports this).
+    const wsBase = state.wsBaseUrl || DEFAULT_QWEN_WS_BASE
+    const model = QWEN_REALTIME_MODEL
+    const wsUrl = `${wsBase}?model=${encodeURIComponent(model)}&api-key=${encodeURIComponent(state.apiKey)}`
+    const ws = new WebSocket(wsUrl)
+    state.ws = ws
+
+    return new Promise<void>((resolve, reject) => {
+      let sessionCreated = false
+
+      ws.onopen = () => {
+        if (DEBUG) console.log('[Qwen] WebSocket connected')
+      }
+
+      ws.onmessage = async (event) => {
+        let data: Record<string, unknown>
+        try {
+          data = JSON.parse(event.data as string) as Record<string, unknown>
+        } catch {
+          if (DEBUG) console.warn('[Qwen] Failed to parse message')
+          return
+        }
+
+        const eventType = data.type as string
+
+        // Session created - send configuration
+        if (eventType === 'session.created' && !sessionCreated) {
+          sessionCreated = true
+          if (DEBUG) console.log('[Qwen] Session created')
+
+          // Build tools config
+          const tools = VOICE_TOOL_DEFINITIONS.map((td) => ({
+            type: 'function' as const,
+            name: td.name,
+            description: td.description,
+            parameters: td.parameters
+          }))
+
+          // Send session.update with full configuration
+          const instructions = config.initialContext
+            ? `${VOICE_SYSTEM_PROMPT}\n\n[Current Context]\n${config.initialContext}`
+            : VOICE_SYSTEM_PROMPT
+
+          sendEvent('session.update', {
+            session: {
+              modalities: ['text', 'audio'],
+              voice: QWEN_REALTIME_VOICE,
+              input_audio_format: 'pcm',
+              output_audio_format: 'pcm',
+              instructions,
+              temperature: 0.7,
+              turn_detection: {
+                type: 'server_vad',
+                threshold: 0.5,
+                silence_duration_ms: 800,
+                prefix_padding_ms: 300
+              },
+              tools,
+              tool_choice: 'auto'
+            }
+          })
+          return
+        }
+
+        // Session updated - ready to go
+        if (eventType === 'session.updated') {
+          if (DEBUG) console.log('[Qwen] Session configured')
+          state.statusCallback?.('connected')
+          startAudioCapture()
+          resolve()
+          return
+        }
+
+        // Audio output streaming
+        if (eventType === 'response.audio.delta') {
+          const delta = data.delta as string
+          if (delta) {
+            state.player?.enqueue(delta)
+          }
+          return
+        }
+
+        // Text transcript (for debug)
+        if (eventType === 'response.audio_transcript.delta' && DEBUG) {
+          console.log('[Qwen] Transcript:', data.delta)
+          return
+        }
+
+        // Function call complete
+        if (eventType === 'response.function_call_arguments.done') {
+          const callId = data.call_id as string
+          const fnName = data.name as string
+          const argsStr = data.arguments as string
+
+          if (DEBUG) console.log('[Qwen] Tool call:', fnName, argsStr)
+
+          let args: Record<string, unknown> = {}
+          try { args = JSON.parse(argsStr) } catch { /* empty */ }
+
+          // Execute the tool
+          const handler = fnName === 'messageCodingAgent'
+            ? realtimeClientTools.messageCodingAgent
+            : fnName === 'processPermissionRequest'
+              ? realtimeClientTools.processPermissionRequest
+              : null
+
+          const result = handler
+            ? await handler(args)
+            : `error (unknown tool: ${fnName})`
+
+          // Send function result back
+          sendEvent('conversation.item.create', {
+            item: {
+              type: 'function_call_output',
+              call_id: callId,
+              output: typeof result === 'string' ? result : JSON.stringify(result)
+            }
+          })
+          // Trigger model to continue
+          sendEvent('response.create')
+          return
+        }
+
+        // VAD: user started speaking - barge-in
+        if (eventType === 'input_audio_buffer.speech_started') {
+          if (state.player?.isPlaying()) {
+            state.player.clearQueue()
+          }
+          return
+        }
+
+        // Response done
+        if (eventType === 'response.done' && DEBUG) {
+          const resp = data.response as Record<string, unknown> | undefined
+          const usage = resp?.usage as Record<string, unknown> | undefined
+          if (usage) console.log('[Qwen] Usage:', usage)
+          return
+        }
+
+        // Error
+        if (eventType === 'error') {
+          const err = data.error as { message?: string } | undefined
+          console.error('[Qwen] Server error:', err?.message || data)
+          return
+        }
+      }
+
+      ws.onerror = (event) => {
+        console.error('[Qwen] WebSocket error:', event)
+        if (!sessionCreated) {
+          state.statusCallback?.('error', 'WebSocket connection failed')
+          reject(new Error('WebSocket connection failed'))
+        }
+      }
+
+      ws.onclose = (event) => {
+        if (DEBUG) console.log('[Qwen] WebSocket closed:', event.code, event.reason)
+        cleanup()
+        resetRealtimeSessionState()
+        state.statusCallback?.('disconnected')
+      }
+    })
+  }
+
+  async endSession(): Promise<void> {
+    cleanup()
+    resetRealtimeSessionState()
+    state.statusCallback?.('disconnected')
+  }
+
+  sendTextMessage(message: string): void {
+    // Send text as a user message via conversation.item.create
+    sendEvent('conversation.item.create', {
+      item: {
+        type: 'message',
+        role: 'user',
+        content: [{ type: 'input_text', text: message }]
+      }
+    })
+    sendEvent('response.create')
+  }
+
+  sendContextualUpdate(update: string): void {
+    // Send context as a system-like user message
+    sendEvent('conversation.item.create', {
+      item: {
+        type: 'message',
+        role: 'user',
+        content: [{ type: 'input_text', text: `[System Context Update] ${update}` }]
+      }
+    })
+  }
+}
+
+function startAudioCapture(): void {
+  state.player = new GeminiAudioPlayer()
+  state.recorder = new GeminiAudioRecorder()
+
+  state.recorder.start(
+    (base64Pcm) => {
+      sendEvent('input_audio_buffer.append', { audio: base64Pcm })
+    },
+    (error) => {
+      console.error('[Qwen] Audio capture error:', error)
+      state.statusCallback?.('error', 'Microphone error')
+    }
+  )
+}
+
+// --- React component ---
+
+export interface QwenVoiceSessionProps {
+  api: ApiClient
+  micMuted?: boolean
+  onStatusChange?: StatusCallback
+  getSession?: (sessionId: string) => Session | null
+  sendMessage?: (sessionId: string, message: string) => void
+  approvePermission?: (sessionId: string, requestId: string) => Promise<void>
+  denyPermission?: (sessionId: string, requestId: string) => Promise<void>
+}
+
+export function QwenVoiceSession({
+  api,
+  micMuted = false,
+  onStatusChange,
+  getSession,
+  sendMessage,
+  approvePermission,
+  denyPermission
+}: QwenVoiceSessionProps) {
+  const hasRegistered = useRef(false)
+
+  useEffect(() => {
+    state.statusCallback = onStatusChange || null
+    return () => { state.statusCallback = null }
+  }, [onStatusChange])
+
+  useEffect(() => {
+    if (getSession && sendMessage && approvePermission && denyPermission) {
+      registerSessionStore({
+        getSession: (sessionId: string) =>
+          getSession(sessionId) as { agentState?: { requests?: Record<string, unknown> } } | null,
+        sendMessage,
+        approvePermission,
+        denyPermission
+      })
+    }
+  }, [getSession, sendMessage, approvePermission, denyPermission])
+
+  useEffect(() => {
+    if (!hasRegistered.current) {
+      try {
+        registerVoiceSession(new QwenVoiceSessionImpl(api))
+        hasRegistered.current = true
+      } catch (error) {
+        console.error('[Qwen] Failed to register voice session:', error)
+      }
+    }
+  }, [api])
+
+  useEffect(() => {
+    if (state.recorder) {
+      state.recorder.setMuted(micMuted)
+    }
+  }, [micMuted])
+
+  useEffect(() => {
+    return () => { cleanup() }
+  }, [])
+
+  return null
+}
diff --git a/web/src/realtime/VoiceBackendSession.tsx b/web/src/realtime/VoiceBackendSession.tsx
index 66e24d6abe..c23dfa1509 100644
--- a/web/src/realtime/VoiceBackendSession.tsx
+++ 
b/web/src/realtime/VoiceBackendSession.tsx @@ -1,15 +1,17 @@ import { lazy, Suspense, useEffect, useState } from 'react' import { RealtimeVoiceSession } from './RealtimeVoiceSession' import type { RealtimeVoiceSessionProps } from './RealtimeVoiceSession' -import type { GeminiLiveVoiceSessionProps } from './GeminiLiveVoiceSession' import { fetchVoiceBackend } from '@/api/voice' import type { ApiClient } from '@/api/client' import type { VoiceBackendType } from '@hapi/protocol/voice' -// Lazy-load Gemini session to avoid bundling @google/genai when using ElevenLabs +// Lazy-load alternative backends to avoid bundling when using ElevenLabs const GeminiLiveVoiceSession = lazy(() => import('./GeminiLiveVoiceSession').then((m) => ({ default: m.GeminiLiveVoiceSession })) ) +const QwenVoiceSession = lazy(() => + import('./QwenVoiceSession').then((m) => ({ default: m.QwenVoiceSession })) +) export type VoiceBackendSessionProps = RealtimeVoiceSessionProps & { api: ApiClient @@ -40,5 +42,13 @@ export function VoiceBackendSession(props: VoiceBackendSessionProps) { ) } + if (backend === 'qwen-realtime') { + return ( + + + + ) + } + return } diff --git a/web/src/realtime/index.ts b/web/src/realtime/index.ts index 68e1f7aaa9..1e080123b5 100644 --- a/web/src/realtime/index.ts +++ b/web/src/realtime/index.ts @@ -18,6 +18,7 @@ export { realtimeClientTools, registerSessionStore } from './realtimeClientTools // Voice session components export { RealtimeVoiceSession, type RealtimeVoiceSessionProps } from './RealtimeVoiceSession' export { GeminiLiveVoiceSession, type GeminiLiveVoiceSessionProps } from './GeminiLiveVoiceSession' +export { QwenVoiceSession, type QwenVoiceSessionProps } from './QwenVoiceSession' export { VoiceBackendSession, type VoiceBackendSessionProps } from './VoiceBackendSession' // Voice hooks From a2977485fd0675ee8fcd1c8466a38e1916727820 Mon Sep 17 00:00:00 2001 From: yuhan Date: Sat, 4 Apr 2026 16:04:34 +0000 Subject: [PATCH 06/21] fix(pwa): add skipWaiting + 
clientsClaim for immediate SW activation Without this, new deployments required users to close all tabs before the updated Service Worker would activate and serve new assets. --- bun.lock | 4 ++++ web/src/sw.ts | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/bun.lock b/bun.lock index f31e993406..c71cac240c 100644 --- a/bun.lock +++ b/bun.lock @@ -1062,6 +1062,10 @@ "@twsxtd/hapi-linux-x64": ["@twsxtd/hapi-linux-x64@0.16.7", "", { "os": "linux", "cpu": "x64", "bin": { "hapi": "bin/hapi" } }, "sha512-JuqgwJev9bHg57EqS+pGWXJ5tBtV3Xm5MFmoMNWXLuRVegNrWTO5WJHRsPH5XIItXtam5/aThKy73WEaTde4IA=="], + "@twsxtd/hapi-linux-x64": ["@twsxtd/hapi-linux-x64@0.16.5", "", { "os": "linux", "cpu": "x64", "bin": { "hapi": "bin/hapi" } }, "sha512-Cdo2B/BCnDJRkkGHxMo8UVVcKXJyuS7bnr+JtJL4f6kp7o8T2T9o/85S05Z/TF8k5dAxz2Np8rpJgmhBd/1boA=="], + + "@twsxtd/hapi-win32-x64": ["@twsxtd/hapi-win32-x64@0.16.5", "", { "os": "win32", "cpu": "x64", "bin": { "hapi": "bin/hapi.exe" } }, "sha512-jnl5zxT2AIslIy2X9jYklTUIob/Om8RlPIx2QbobkYP5+mMpPWUpJsBFx1NdN8FDMApcVQ/tqh6v2d72m16rDA=="], + "@types/aria-query": ["@types/aria-query@5.0.4", "", {}, "sha512-rfT93uj5s0PRL7EzccGMs3brplhcrghnDoV26NqKhCAS1hVo+WdNsPvE/yb6ilfr5hi2MEk6d5EWJTKdxg8jVw=="], "@types/babel__core": ["@types/babel__core@7.20.5", "", { "dependencies": { "@babel/parser": "^7.20.7", "@babel/types": "^7.20.7", "@types/babel__generator": "*", "@types/babel__template": "*", "@types/babel__traverse": "*" } }, "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA=="], diff --git a/web/src/sw.ts b/web/src/sw.ts index ebe55dc0a7..732ebef299 100644 --- a/web/src/sw.ts +++ b/web/src/sw.ts @@ -21,6 +21,10 @@ type PushPayload = { } } +// Activate new SW immediately without waiting for all tabs to close +self.addEventListener('install', () => { self.skipWaiting() }) +self.addEventListener('activate', (event) => { event.waitUntil(self.clients.claim()) }) + precacheAndRoute(self.__WB_MANIFEST) registerRoute( 
From 7a57ba31e895f6dd1abd5f3ef391eda1c43a5221 Mon Sep 17 00:00:00 2001
From: yuhan
Date: Sat, 4 Apr 2026 16:47:16 +0000
Subject: [PATCH 07/21] fix(voice): Qwen WebSocket proxy + switch to
 qwen3-omni-flash-realtime

- Hub: add WebSocket proxy at /api/voice/qwen-ws that injects
  Authorization header (browser WebSocket can't set custom headers)
- Switch from qwen3.5-omni-plus-realtime (access denied / invite-only)
  to qwen3-omni-flash-realtime (publicly available)
- Frontend connects via Hub proxy instead of direct to DashScope
---
 hub/src/web/server.ts                 | 108 ++++++++++++++++++++++++--
 shared/src/voice.ts                   |   2 +-
 web/src/realtime/QwenVoiceSession.tsx |  13 ++--
 3 files changed, 110 insertions(+), 13 deletions(-)

diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts
index b4dbf4eb5e..180e4e2f07 100644
--- a/hub/src/web/server.ts
+++ b/hub/src/web/server.ts
@@ -21,8 +21,60 @@ import { createPushRoutes } from './routes/push'
 import { createVoiceRoutes } from './routes/voice'
 import type { SSEManager } from '../sse/sseManager'
 import type { VisibilityTracker } from '../visibility/visibilityTracker'
-import type { Server as BunServer } from 'bun'
+import type { Server as BunServer, ServerWebSocket } from 'bun'
 import type { Server as SocketEngine } from '@socket.io/bun-engine'
+
+// Qwen Realtime WebSocket proxy — bridges browser (no custom headers) to DashScope (requires Authorization header)
+function createQwenProxyWebSocketHandler() {
+  const QWEN_WS_BASE = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime'
+  // Map browser WS → upstream WS
+  const upstreamMap = new WeakMap<ServerWebSocket<unknown>, WebSocket>()
+
+  return {
+    open(clientWs: ServerWebSocket<unknown>) {
+      const data = clientWs.data as { apiKey: string; model: string }
+      const upstreamUrl = `${process.env.QWEN_REALTIME_WS_URL || QWEN_WS_BASE}?model=${encodeURIComponent(data.model)}`
+
+      const upstream = new WebSocket(upstreamUrl, {
+        headers: { 'Authorization': `Bearer ${data.apiKey}` }
+      } as unknown as string[])
+
+      upstreamMap.set(clientWs, upstream)
+
+      upstream.onopen = () => {
+        // Connection ready — upstream will send session.created
+      }
+      upstream.onmessage = (event) => {
+        // Forward upstream → client
+        try {
+          if (clientWs.readyState === 1) {
+            clientWs.send(typeof event.data === 'string' ? event.data : new Uint8Array(event.data as ArrayBuffer))
+          }
+        } catch { /* client gone */ }
+      }
+      upstream.onerror = () => {
+        try { clientWs.close(1011, 'Upstream error') } catch { /* */ }
+      }
+      upstream.onclose = (event) => {
+        try { clientWs.close(event.code, event.reason) } catch { /* */ }
+        upstreamMap.delete(clientWs)
+      }
+    },
+    message(clientWs: ServerWebSocket<unknown>, message: string | ArrayBuffer | Uint8Array) {
+      const upstream = upstreamMap.get(clientWs)
+      if (upstream?.readyState === WebSocket.OPEN) {
+        upstream.send(typeof message === 'string' ? message : message)
+      }
+    },
+    close(clientWs: ServerWebSocket<unknown>, code: number, reason: string) {
+      const upstream = upstreamMap.get(clientWs)
+      if (upstream) {
+        try { upstream.close(code, reason) } catch { /* */ }
+        upstreamMap.delete(clientWs)
+      }
+    }
+  }
+}
 import type { WebSocketData } from '@socket.io/bun-engine'
 import { loadEmbeddedAssetMap, type EmbeddedWebAsset } from './embeddedAssets'
 import { isBunCompiled } from '../utils/bunCompiled'
@@ -230,16 +282,62 @@ export async function startWebServer(options: {
 
   const socketHandler = options.socketEngine.handler()
 
-  const server = Bun.serve({
+  // Wrap socket.io websocket handler to also support Qwen Realtime proxy
+  const originalWsHandler = socketHandler.websocket
+  const qwenProxyHandler = createQwenProxyWebSocketHandler()
+
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  const server = (Bun.serve as any)({
     hostname: configuration.listenHost,
     port: configuration.listenPort,
     idleTimeout: Math.max(30, socketHandler.idleTimeout),
     maxRequestBodySize: Math.max(socketHandler.maxRequestBodySize, 68 * 1024 * 1024),
-    websocket: socketHandler.websocket,
-    fetch: (req, 
server) => { + websocket: { + ...originalWsHandler, + open(ws: unknown) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean }> + if (wsAny.data?._qwenProxy) { + qwenProxyHandler.open(wsAny) + } else { + originalWsHandler.open?.(ws as never) + } + }, + message(ws: unknown, message: unknown) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean }> + if (wsAny.data?._qwenProxy) { + qwenProxyHandler.message(wsAny, message as string) + } else { + originalWsHandler.message?.(ws as never, message as never) + } + }, + close(ws: unknown, code: number, reason: string) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean }> + if (wsAny.data?._qwenProxy) { + qwenProxyHandler.close(wsAny, code, reason) + } else { + originalWsHandler.close?.(ws as never, code as never, reason as never) + } + } + }, + fetch: (req: Request, server: { upgrade: (req: Request, opts?: unknown) => boolean }) => { const url = new URL(req.url) if (url.pathname.startsWith('/socket.io/')) { - return socketHandler.fetch(req, server) + return socketHandler.fetch(req, server as never) + } + // Qwen Realtime WebSocket proxy + if (url.pathname === '/api/voice/qwen-ws') { + const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY + const model = url.searchParams.get('model') || 'qwen3.5-omni-plus-realtime' + if (!apiKey) { + return new Response('DashScope API key not configured', { status: 400 }) + } + const upgraded = (server as unknown as { upgrade: (req: Request, opts: unknown) => boolean }).upgrade(req, { + data: { _qwenProxy: true, apiKey, model } + }) + if (!upgraded) { + return new Response('WebSocket upgrade failed', { status: 500 }) + } + return undefined as unknown as Response } return app.fetch(req) } diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 6da0669d14..27539d477f 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -266,7 +266,7 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { export type VoiceBackendType = 
'elevenlabs' | 'gemini-live' | 'qwen-realtime' -export const QWEN_REALTIME_MODEL = 'qwen3.5-omni-plus-realtime' +export const QWEN_REALTIME_MODEL = 'qwen3-omni-flash-realtime' export const QWEN_REALTIME_VOICE = 'Cherry' export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'elevenlabs' diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 0debd14bf8..3c9808c6f8 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -17,8 +17,7 @@ import type { Session } from '@/types/api' const DEBUG = import.meta.env.DEV -// DashScope Realtime WebSocket endpoint -const DEFAULT_QWEN_WS_BASE = 'wss://dashscope.aliyuncs.com/api-ws/v1/realtime' +// Qwen WebSocket connects via Hub proxy (browser can't set Authorization header) interface QwenState { ws: WebSocket | null @@ -101,12 +100,12 @@ class QwenVoiceSessionImpl implements VoiceSession { permissionStream?.getTracks().forEach((t) => t.stop()) } - // Connect WebSocket - // DashScope uses Authorization header, but browser WebSocket doesn't support custom headers. - // Use URL query param for API key (DashScope also supports this). - const wsBase = state.wsBaseUrl || DEFAULT_QWEN_WS_BASE + // Connect via Hub WebSocket proxy (DashScope requires Authorization header, + // which browser WebSocket API doesn't support) + const protocol = window.location.protocol === 'https:' ? 
'wss:' : 'ws:' + const proxyBase = state.wsBaseUrl || `${protocol}//${window.location.host}` const model = QWEN_REALTIME_MODEL - const wsUrl = `${wsBase}?model=${encodeURIComponent(model)}&api-key=${encodeURIComponent(state.apiKey)}` + const wsUrl = `${proxyBase}/api/voice/qwen-ws?model=${encodeURIComponent(model)}` const ws = new WebSocket(wsUrl) state.ws = ws From 8f8526871f043de39b2e78ea4b5d2d8cb3e77a65 Mon Sep 17 00:00:00 2001 From: yuhan Date: Sun, 5 Apr 2026 04:42:12 +0000 Subject: [PATCH 08/21] fix(voice): switch default TTS to qwen-realtime + increase socket buffer - Change DEFAULT_VOICE_BACKEND from elevenlabs to qwen-realtime - Change QWEN_REALTIME_VOICE from Cherry to Mia - Increase maxHttpBufferSize to 55MB to match upload limit --- hub/src/socket/server.ts | 1 + shared/src/voice.ts | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hub/src/socket/server.ts b/hub/src/socket/server.ts index 19086a95b4..ce3639dd95 100644 --- a/hub/src/socket/server.ts +++ b/hub/src/socket/server.ts @@ -63,6 +63,7 @@ export function createSocketServer(deps: SocketServerDeps): { const engine = new Engine({ path: '/socket.io/', cors: corsOptions, + maxHttpBufferSize: 55 * 1024 * 1024, // 55MB to match upload limit allowRequest: async (req) => { const origin = req.headers.get('origin') if (!origin || allowAllOrigins || corsOrigins.includes(origin)) { diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 27539d477f..f844b08d23 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -267,9 +267,9 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { export type VoiceBackendType = 'elevenlabs' | 'gemini-live' | 'qwen-realtime' export const QWEN_REALTIME_MODEL = 'qwen3-omni-flash-realtime' -export const QWEN_REALTIME_VOICE = 'Cherry' +export const QWEN_REALTIME_VOICE = 'Mia' -export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'elevenlabs' +export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'qwen-realtime' export const 
GEMINI_LIVE_MODEL = 'gemini-2.5-flash-native-audio-latest' From 0cfe63e8169d47c7052fcf4d884a6be8491945b0 Mon Sep 17 00:00:00 2001 From: yuhan Date: Sun, 5 Apr 2026 13:05:37 +0000 Subject: [PATCH 09/21] fix(voice): fix Gemini Live barge-in and tool call issues - Mute mic while model is speaking to prevent noise-triggered barge-in - Skip audio sending during model turn to avoid false interrupts - Remove greeting prompt that blocked subsequent tool calls - Add critical tool usage instruction at top of system prompt - Always log tool call and messageCodingAgent events (not just DEV mode) --- .claude/plan/hapi-web-loading-fix.md | 95 +++++ .claude/team-plan/pluggable-voice-backend.md | 394 +++++++++++++++++++ shared/src/voice.ts | 8 +- web/src/realtime/GeminiLiveVoiceSession.tsx | 34 +- web/src/realtime/realtimeClientTools.ts | 6 +- 5 files changed, 523 insertions(+), 14 deletions(-) create mode 100644 .claude/plan/hapi-web-loading-fix.md create mode 100644 .claude/team-plan/pluggable-voice-backend.md diff --git a/.claude/plan/hapi-web-loading-fix.md b/.claude/plan/hapi-web-loading-fix.md new file mode 100644 index 0000000000..9cf5da8939 --- /dev/null +++ b/.claude/plan/hapi-web-loading-fix.md @@ -0,0 +1,95 @@ +# 📋 实施计划:Hapi Web 加载失败 + 语音后端修复 + +## 诊断结论 + +### 根因分析 + +| 问题 | 根因 | 严重性 | +|------|------|--------| +| Web 版本加载不了 | Hub 进程未重启,运行的是旧环境变量 + Service Worker 缓存旧资源 | Critical | +| 更改语音选项后出问题 | `~/.hapi/env` 修改后 Hub 不会热加载,需要重启 | Critical | +| 数据库是否分了版本 | **只有一个数据库** `~/.hapi/hapi.db`,无 dev/prod 分离,排除此问题 | ✅ 已排除 | + +### 关键证据 + +1. **Hub 进程**: PID 44317, 启动于 **4/3 16:09** +2. **env 文件**: 最后修改于 **4/5 06:13** (Hub 启动后 2 天) +3. **环境变量不同步**: + - `~/.hapi/env` 中 `VOICE_BACKEND=gemini-live` + - 运行中 Hub 实际返回 `{"backend":"qwen-realtime"}`(因为 Hub 进程的 process.env 中没有 `VOICE_BACKEND`,回退到 `DEFAULT_VOICE_BACKEND = 'qwen-realtime'`) +4. **Web 静态文件**: 所有资源返回 200,HTML/JS/CSS 正常可达 +5. 
**数据库**: 单一 SQLite `~/.hapi/hapi.db`,schema v6,WAL 模式正常 + +### 用户需求更新 + +用户明确表示 **想用 Gemini TTS**,需要将 `VOICE_BACKEND` 设为 `gemini-live`。 + +--- + +## 任务类型 +- [x] 后端 (→ Hub 重启 + env 修复) +- [x] 前端 (→ Service Worker 清理 + 确认 Gemini Live 组件正常) + +## 技术方案 + +**核心修复**: 重启 Hub 进程使其加载最新的 `~/.hapi/env` 环境变量。 + +**辅助修复**: 清理 `web/dist` 中的旧构建产物,确保 Service Worker 不缓存过期资源。 + +--- + +## 实施步骤 + +### Step 1: 确认并修复 env 配置 +- 文件: `/home/ubuntu/.hapi/env` +- 确保 `VOICE_BACKEND=gemini-live`(用户要用 Gemini TTS) +- 确保 `GEMINI_API_KEY` 已配置 +- 预期产物: env 文件就绪 + +### Step 2: 清理 web 构建产物 +- 删除 `/home/ubuntu/hapi/web/dist/` 并重新构建 +- 命令: `cd /home/ubuntu/hapi/web && rm -rf dist && bun run build` +- 预期产物: 干净的 `web/dist/` 目录 + +### Step 3: 重启 Hub 进程 +- 停止当前 Hub (PID 44317) +- 重新启动 Hub,使其读取最新 env +- 命令: `hapi runner restart` 或手动 kill + 启动 +- 预期产物: Hub 进程以新 env 运行 + +### Step 4: 验证修复 +- 调用 `GET /api/voice/backend` 确认返回 `gemini-live` +- 访问 `https://ccg.aimo3d.org/` 确认页面加载正常 +- 测试 Gemini Live 语音功能 +- 预期产物: Web 正常加载 + 语音后端为 Gemini + +### Step 5: (可选) Service Worker 客户端清理 +- 如果用户浏览器仍显示旧内容,需要: + - 清除浏览器 Service Worker 缓存 + - 或强制刷新 (Ctrl+Shift+R) +- `sw.ts` 已有 `skipWaiting + clientsClaim`,重建后应自动更新 + +--- + +## 关键文件 + +| 文件 | 操作 | 说明 | +|------|------|------| +| `~/.hapi/env` | 确认 | VOICE_BACKEND=gemini-live | +| `web/dist/` | 重建 | 清理旧构建产物 | +| Hub 进程 (PID 44317) | 重启 | 加载最新 env | +| `shared/src/voice.ts:272` | 无需修改 | DEFAULT_VOICE_BACKEND 仅作 fallback | +| `hub/src/web/routes/voice.ts:122-128` | 无需修改 | 逻辑正确,只需 env 生效 | +| `~/.hapi/hapi.db` | 无操作 | 唯一数据库,无需修改 | + +## 风险与缓解 + +| 风险 | 缓解措施 | +|------|----------| +| 重启 Hub 会中断活跃 Claude 会话 | 会话可通过 `--resume` 恢复 | +| Gemini API Key 可能无效/过期 | Step 4 验证 token 端点 | +| 浏览器 SW 缓存未更新 | skipWaiting 机制 + 手动清除指引 | + +## SESSION_ID(供 /ccg:execute 使用) +- CODEX_SESSION: N/A(诊断任务,未调用) +- GEMINI_SESSION: N/A(诊断任务,未调用) diff --git a/.claude/team-plan/pluggable-voice-backend.md b/.claude/team-plan/pluggable-voice-backend.md new file mode 100644 index 0000000000..52e203ab83 --- 
/dev/null +++ b/.claude/team-plan/pluggable-voice-backend.md @@ -0,0 +1,394 @@ +# Team Plan: Pluggable Voice Backend (ElevenLabs + Gemini Live) + +## Overview + +Refactor Hapi voice assistant into a pluggable architecture (Strategy Pattern) supporting ElevenLabs ConvAI (default) and Gemini Live API backends, switchable via `VOICE_BACKEND` env var. Minimize upstream file changes to reduce git pull conflicts. + +## Codex Analysis Summary + +- Existing `VoiceSession` interface + `registerVoiceSession()` is already a Strategy injection point +- `VOICE_BACKEND` should be resolved at runtime via hub API (not Vite env), since web frontend has no runtime env mechanism +- `sendContextualUpdate` has no Gemini Live equivalent; must approximate via `send_realtime_input` for incremental updates, `send_client_content` for initial context seeding +- Ephemeral tokens use `v1alpha` endpoint; regular API key uses `v1beta` — hub must handle this divergence +- Tool calling in Gemini Live requires synchronous `sendToolResponse`; existing `processPermissionRequest` involves async network calls — keep responses short +- Hidden coupling: `VoiceSessionConfig.language` is typed as `ElevenLabsLanguage` (types.ts:1) +- Settings page language list is ElevenLabs-specific (functions named `getElevenLabsSupportedLanguages`) + +## Gemini Analysis Summary + +- Proposed transparent proxy component pattern: `RealtimeVoiceSession` becomes a switcher +- Audio pipeline: capture via `getUserMedia` + `AudioWorkletNode` for 16kHz downsampling → PCM16 → base64 → WebSocket; playback via `AudioContext(24000)` with scheduled buffer queue +- Tool adapter needed: `getFunctionDeclarations()` maps existing client tools to Gemini format, `handleToolCall()` bridges execution +- Client VAD + server VAD hybrid for barge-in: clear playback queue immediately on interruption +- Settings page needs conditional rendering based on active backend +- No changes needed to `SessionChat.tsx`, `ComposerButtons.tsx`, 
`HappyThread.tsx` — they only consume abstract `useVoice()` status + +## Functional Review Findings (v2) + +### Critical Gaps Fixed +- C1: AudioWorklet processor file was missing → added to Task 4 +- C2: Token expiry/reconnect not handled → added to Task 2 + Task 6 +- C3: Session switching routes tool calls to wrong session → auto-stop on session switch +- C4: Voice component lifecycle / unmount cleanup → unified dispose() path in Task 6 + +### High Gaps Fixed +- H1: Mobile AudioContext blocked by autoplay → AudioContext created in user gesture handler +- H2: GEMINI_API_KEY missing behavior undefined → explicit error contract +- H3: Tool calling multi-call/timeout → serial execution + per-call timeout +- H4: React Strict Mode double-mount → useEffect cleanup +- H5: Voice button available before backend loads → voiceReady state gating + +### Medium Gaps (addressed in Task 8/9) +- M1: Bundle size → React.lazy() dynamic import +- M2: Settings page not adapted → conditional rendering +- M3: No tests → added Task 8 +- M4: No docs update → added Task 9 + +## Technical Decisions + +| Decision | Choice | Rationale | +|----------|--------|-----------| +| Backend discovery | Hub runtime API (`GET /voice/backend`) | Web has no runtime env; avoids Vite rebuild to switch | +| Wrapper location | New `VoiceBackendSession.tsx` | Original `RealtimeVoiceSession.tsx` untouched = zero upstream conflict | +| Audio processing | Separate `gemini/` subdirectory | Isolate complexity; testable independently | +| Tool bridge | Adapter in `gemini/toolAdapter.ts` | Reuse existing `realtimeClientTools` without modification | +| Language type | Keep `ElevenLabsLanguage` for now | Gemini ignores language pref initially; refactor later to avoid upstream diff | +| Token flow | Hub creates ephemeral token for both backends | Never expose long-lived API keys to browser | +| Session switch | Auto-stop voice on session change | Prevents tool calls routing to wrong session | +| Gemini code loading | 
React.lazy() dynamic import | Zero bundle impact when using ElevenLabs | +| AudioContext creation | Synchronous in user gesture handler | Required for iOS/Android autoplay policy | + +## Task List + +### Task 1: Shared Voice Config Extension +- **Type**: Backend (shared) +- **File scope**: + - `shared/src/voice.ts` (modify — append new exports) +- **Dependencies**: None +- **Implementation steps**: + 1. Add `VoiceBackendType = 'elevenlabs' | 'gemini-live'` and `DEFAULT_VOICE_BACKEND = 'elevenlabs'` + 2. Add `GEMINI_LIVE_MODEL = 'gemini-3.1-flash-live-preview'` constant + 3. Extract `VOICE_TOOL_DEFINITIONS` from existing `VOICE_TOOLS` — neutral format, single source of truth + 4. Add `buildGeminiLiveFunctionDeclarations()` — converts `VOICE_TOOL_DEFINITIONS` to Gemini `{ name, description, parameters }` schema format + 5. Add `buildGeminiLiveConfig()` — returns `{ model, systemInstruction: VOICE_SYSTEM_PROMPT, tools: [{ functionDeclarations }], responseModalities: ['AUDIO'] }` for `ai.live.connect()` + 6. Keep `buildVoiceAgentConfig()` untouched for ElevenLabs +- **Acceptance**: Both config builders produce valid configs; existing ElevenLabs flow unaffected; `VOICE_TOOL_DEFINITIONS` is the single source for both backends + +### Task 2: Hub Backend Discovery + Token Route +- **Type**: Backend (hub) +- **File scope**: + - `hub/src/web/routes/voice.ts` (modify — add routes, refactor handler) + - `hub/package.json` (modify — add `@google/genai`) +- **Dependencies**: Task 1 +- **Implementation steps**: + 1. Add `resolveVoiceBackend()`: reads `VOICE_BACKEND` env, validates against `VoiceBackendType`, defaults to `elevenlabs` + 2. Add `GET /voice/backend` route: + - Success: `{ allowed: true, backend: VoiceBackendType }` + - Failure (missing key): `{ allowed: false, backend: VoiceBackendType, code: 'missing_elevenlabs_api_key' | 'missing_gemini_api_key', error: string }` + - Validates that the required API key exists for the configured backend + 3. 
Add `issueGeminiLiveToken()`: + - Read `GEMINI_API_KEY ?? GOOGLE_API_KEY`; if missing, return `{ allowed: false, code: 'missing_gemini_api_key' }` + - Use `@google/genai` SDK to create ephemeral token + - Return `{ allowed: true, backend: 'gemini-live', token, model: GEMINI_LIVE_MODEL, apiVersion: 'v1alpha', expiresAt: number }` + - Never cache Gemini tokens (they expire in ~60s) + 4. Refactor `POST /voice/token` handler: + - Branch on `resolveVoiceBackend()` — `elevenlabs` uses existing logic unchanged, `gemini-live` calls `issueGeminiLiveToken()` + - Discriminated union response type + 5. Error contract: all failure responses use `{ allowed: false, backend, code, error }` shape with appropriate HTTP status codes + 6. Add `@google/genai` to `hub/package.json` +- **Acceptance**: `GET /voice/backend` returns correct backend + allowed status; `POST /voice/token` returns valid token with `expiresAt` for Gemini; missing API key returns structured error; ElevenLabs path unchanged + +### Task 3: Web API Types + Client Functions +- **Type**: Frontend (web) +- **File scope**: + - `web/src/api/voice.ts` (modify — add types and fetch functions) + - `web/src/api/client.ts` (modify — add fetchVoiceBackend method) +- **Dependencies**: Task 2 +- **Implementation steps**: + 1. Add `VoiceBackendResponse` type: + ```ts + | { allowed: true; backend: VoiceBackendType } + | { allowed: false; backend: VoiceBackendType; code: string; error: string } + ``` + 2. Extend `VoiceTokenResponse` as discriminated union: + ```ts + | { allowed: true; backend: 'elevenlabs'; token: string; agentId: string } + | { allowed: true; backend: 'gemini-live'; token: string; model: string; apiVersion: string; expiresAt: number } + | { allowed: false; backend: string; code: string; error: string } + ``` + 3. Add `fetchVoiceBackend(api)` function with module-level cache (cache only successful responses; invalidate on error) + 4. Add `fetchVoiceBackend()` method to `ApiClient` class + 5. 
Update `fetchVoiceToken()` to handle union response +- **Acceptance**: Type-safe API calls for both backends; cached backend discovery; failed responses not cached + +### Task 4: Gemini Audio Pipeline +- **Type**: Frontend (web) +- **File scope** (all new files): + - `web/src/realtime/gemini/pcmUtils.ts` + - `web/src/realtime/gemini/pcm-recorder.worklet.ts` + - `web/src/realtime/gemini/audioRecorder.ts` + - `web/src/realtime/gemini/audioPlayer.ts` +- **Dependencies**: None (can parallel with Task 1-3) +- **Implementation steps**: + 1. `pcmUtils.ts`: Pure utility functions: + - `float32ToPcm16(samples: Float32Array): ArrayBuffer` + - `pcm16ToFloat32(buffer: ArrayBuffer): Float32Array` + - `arrayBufferToBase64(buffer: ArrayBuffer): string` + - `base64ToArrayBuffer(base64: string): ArrayBuffer` + 2. `pcm-recorder.worklet.ts`: AudioWorklet processor: + - Extends `AudioWorkletProcessor` + - `process()` method: accumulate Float32 samples into chunks (e.g., 4096 samples), post to main thread via `port.postMessage()` + - Register as `'pcm-recorder-processor'` + - Must be loadable via Vite: `import workletUrl from './pcm-recorder.worklet.ts?url'` + 3. `audioRecorder.ts`: class `GeminiAudioRecorder`: + - `start(onChunk: (base64Pcm: string) => void)`: + - `getUserMedia({ audio: { sampleRate: 16000, channelCount: 1 } })` + - Create `AudioContext({ sampleRate: 16000 })` + - `audioContext.audioWorklet.addModule(workletUrl)` + - Connect MediaStreamSource → AudioWorkletNode + - Worklet messages → `float32ToPcm16()` → `arrayBufferToBase64()` → `onChunk()` + - `stop()`: stop all tracks, disconnect nodes, close AudioContext + - `setMuted(muted: boolean)`: toggle `MediaStreamTrack.enabled` + - `dispose()`: idempotent full cleanup, safe to call multiple times + - Listen for `MediaStreamTrack.onended` (device unplugged) → invoke error callback + - **Fallback**: if `audioWorklet.addModule()` fails, fall back to `ScriptProcessorNode` (deprecated but wider support) + 4. 
`audioPlayer.ts`: class `GeminiAudioPlayer`: + - `constructor(audioContext?: AudioContext)`: use provided AudioContext or create new at 24kHz; maintain playback queue with scheduled end times + - `enqueue(base64Pcm: string)`: decode → create `AudioBufferSourceNode` → schedule at `max(audioContext.currentTime, lastEndTime)` → update `lastEndTime` + - `clearQueue()`: stop all scheduled sources immediately (for barge-in); reset `lastEndTime` + - `isPlaying(): boolean`: check if audio is currently being output + - `dispose()`: stop all, close AudioContext if we own it + - Handle Chrome tab backgrounding: detect `audioContext.state === 'suspended'` → attempt `resume()` → if blocked, notify via callback +- **Acceptance**: Recorder produces 16kHz PCM16 base64 chunks; Player plays 24kHz PCM16 smoothly without clicks; clearQueue stops immediately; device unplug detected; fallback for no-AudioWorklet browsers + +### Task 5: Gemini Tool Adapter +- **Type**: Frontend (web) +- **File scope** (new file): + - `web/src/realtime/gemini/toolAdapter.ts` +- **Dependencies**: Task 1 (for VOICE_TOOL_DEFINITIONS) +- **Implementation steps**: + 1. `getGeminiFunctionDeclarations()`: import `VOICE_TOOL_DEFINITIONS` from shared (single source of truth), map to Gemini schema format — no separate declaration, no schema drift risk + 2. `handleGeminiToolCalls(functionCalls, clientTools)`: + - Process calls **serially** (one at a time, in order) + - For each call: lookup function name in `realtimeClientTools`, execute with args, collect result + - **Preserve call IDs**: each `FunctionResponse` must include the matching `id` from the `FunctionCall` + - **Per-call timeout**: wrap each execution in a 30s timeout; return `'error (timeout)'` on expiry + - **Error isolation**: tool failure returns error string as response, never throws, never crashes session + - Return `FunctionResponse[]` array + 3. 
`validateToolArgs(name: string, args: unknown): boolean`: basic validation that required params exist +- **Acceptance**: Function declarations derived from single source; tool calls route correctly; call IDs preserved in responses; timeout works; errors don't crash session + +### Task 6: GeminiLiveVoiceSession Implementation +- **Type**: Frontend (web) +- **File scope** (new file): + - `web/src/realtime/GeminiLiveVoiceSession.tsx` + - `web/package.json` (modify — add `@google/genai`) +- **Dependencies**: Task 3, Task 4, Task 5 +- **Implementation steps**: + 1. Create `GeminiLiveVoiceSessionImpl` class implementing `VoiceSession` interface: + - **`startSession(config)`**: + - Fetch token from hub via `fetchVoiceToken(api)` + - Build config via `buildGeminiLiveConfig()` from shared + - Call `ai.live.connect({ model, config, callbacks })` with ephemeral token + - Start audio recorder → pipe chunks to live session via `sendRealtimeInput()` + - Seed initial context via `session.sendClientContent()` (one-time) + - Set status 'connected' + - **`endSession()`**: call `dispose()` (see below) + - **`sendTextMessage(message)`**: send as realtime text input to live session + - **`sendContextualUpdate(update)`**: send as realtime text input with `[CONTEXT UPDATE] ` prefix + - **`dispose(reason?: string)`**: single idempotent teardown path: + - Stop recorder (releases mic) + - Clear + dispose player + - Close live session WebSocket + - Reset all internal state + - Safe to call from any failure branch, unmount, session switch, or error + - **Reconnect logic**: + - On WebSocket close/error: if `reason !== 'user-initiated'`, attempt reconnect + - Fetch fresh token from hub (old one expired) + - Recreate live session with new token + - Reseed context via `sendClientContent()` + - Max 3 reconnect attempts with exponential backoff (1s, 3s, 9s) + - After 3 failures: set status 'error', show error in VoiceErrorBanner + 2. 
Create `GeminiLiveVoiceSession` React component: + - Props: same as `RealtimeVoiceSessionProps` + - **On mount**: instantiate impl, register via `registerVoiceSession()`, register session store + - **useEffect cleanup**: call `dispose('unmount')` — handles React Strict Mode double-mount correctly + - Handle `micMuted` prop: delegate to `recorder.setMuted()` — if recorder not yet started, store as pending state applied on recorder start + - Wire live session callbacks: + - `onopen` → status 'connected' + - `onclose` → attempt reconnect or status 'disconnected' + - `onerror` → status 'error' with message + - `onmessage`: dispatch by type: + - Audio data → `player.enqueue(base64)` + - Tool call → `toolAdapter.handleGeminiToolCalls()` → `session.sendToolResponse()` + - Text → log/ignore (voice session doesn't render text) + - **Barge-in**: when server signals user is speaking (or audio input detected while player active) → `player.clearQueue()` + - **AudioContext creation**: create AudioContext **synchronously in startSession**, which is called from user click handler → satisfies mobile autoplay policy + - Share AudioContext between recorder and player where sample rates allow (otherwise separate contexts) + - Render nothing (same as ElevenLabs version) + 3. 
Add `@google/genai` to `web/package.json` +- **Acceptance**: Full voice conversation works; tool calls execute correctly with preserved IDs; mic mute works (including pending state); barge-in clears playback; reconnect works on token expiry/WebSocket drop; dispose is idempotent; no resource leaks on unmount; works on mobile (AudioContext in gesture) + +### Task 7: Voice Backend Switcher + Integration +- **Type**: Frontend (web) +- **File scope**: + - `web/src/realtime/VoiceBackendSession.tsx` (new) + - `web/src/realtime/index.ts` (modify — add export) + - `web/src/components/SessionChat.tsx` (modify — change import + JSX, add auto-stop) + - `web/src/lib/voice-context.tsx` (modify — add voiceReady state) +- **Dependencies**: Task 6 +- **Implementation steps**: + 1. Create `VoiceBackendSession.tsx`: + - Props: same as `RealtimeVoiceSessionProps` + `api: ApiClient` + - On mount: call `fetchVoiceBackend(api)` (cached), store result in state + - Render: + - Loading (no backend yet): return null + - `backend === 'gemini-live'` → `React.lazy(() => import('./GeminiLiveVoiceSession'))` wrapped in `` + - Default → `` + - `allowed === false` → return null (voice not available) + 2. Update `web/src/lib/voice-context.tsx`: + - Add `voiceReady: boolean` to context (default false) + - Set `voiceReady = true` after backend discovery completes with `allowed: true` + - Expose `voiceReady` in `useVoice()` return + - Voice button disabled until `voiceReady === true` + 3. 
Update `web/src/components/SessionChat.tsx`: + - Change import: `RealtimeVoiceSession` → `VoiceBackendSession` + - Change JSX: ` k !== 'serverContent' || !('modelTurn' in (data.serverContent as Record || {}))) + if (!data.serverContent) { + console.log('[GeminiLive] Message:', msgKeys.join(', '), JSON.stringify(data).slice(0, 200)) + } + // Setup complete if (data.setupComplete && !setupDone) { setupDone = true @@ -146,10 +154,10 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { // Start audio capture startAudioCapture() - // Send initial context + first message prompt - sendClientContent(config.initialContext - ? `[Context] ${config.initialContext}\n\nPlease greet the user briefly.` - : 'Please greet the user briefly.') + // Send initial context if available (no greeting to preserve tool call ability) + if (config.initialContext) { + sendClientContent(`[Context] ${config.initialContext}`) + } resolve() return @@ -163,17 +171,25 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { if (serverContent) { if (serverContent.modelTurn?.parts) { + // Model is generating — mute mic to prevent barge-in from noise + if (!state.modelSpeaking) { + state.modelSpeaking = true + state.recorder?.setMuted(true) + } for (const part of serverContent.modelTurn.parts) { if (part.inlineData?.data) { state.player?.enqueue(part.inlineData.data) } - if (part.text && DEBUG) { + if (part.text) { console.log('[GeminiLive] Text:', part.text) } } } - if (serverContent.turnComplete && DEBUG) { + if (serverContent.turnComplete) { console.log('[GeminiLive] Turn complete') + // Model done — unmute mic for next user turn + state.modelSpeaking = false + state.recorder?.setMuted(false) } } @@ -183,7 +199,7 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { } | undefined if (toolCall?.functionCalls && toolCall.functionCalls.length > 0) { - if (DEBUG) console.log('[GeminiLive] Tool calls:', toolCall.functionCalls.map((c) => c.name)) + console.log('[GeminiLive] Tool 
calls:', toolCall.functionCalls.map((c) => c.name)) const responses = await handleGeminiFunctionCalls( toolCall.functionCalls as GeminiFunctionCall[] @@ -249,6 +265,8 @@ function sendClientContent(text: string): void { function sendAudioChunk(base64Pcm: string): void { if (!state.ws || state.ws.readyState !== WebSocket.OPEN) return + // Don't send audio while model is speaking + if (state.modelSpeaking) return state.ws.send(JSON.stringify({ realtimeInput: { mediaChunks: [{ diff --git a/web/src/realtime/realtimeClientTools.ts b/web/src/realtime/realtimeClientTools.ts index a2490ac816..962898c569 100644 --- a/web/src/realtime/realtimeClientTools.ts +++ b/web/src/realtime/realtimeClientTools.ts @@ -45,10 +45,8 @@ export const realtimeClientTools = { return 'error (session store not available)' } - if (VOICE_CONFIG.ENABLE_DEBUG_LOGGING) { - console.log('[Voice] messageCodingAgent called with:', message) - console.log('[Voice] Sending message to session:', sessionId) - } + console.log('[Voice] messageCodingAgent called with:', message) + console.log('[Voice] Sending message to session:', sessionId) sessionStore.sendMessage(sessionId, message) return "sent [DO NOT say anything else, simply say 'sent']" From bde20fad269c66de00c21ce34c4a93fa6f00aaa6 Mon Sep 17 00:00:00 2001 From: yuhan Date: Sun, 5 Apr 2026 13:18:25 +0000 Subject: [PATCH 10/21] fix(voice): greeting via system prompt to preserve tool call ability clientContent greeting creates a conversation turn that pushes the model into "chat mode", breaking subsequent tool calls. Instead, instruct the model to greet naturally when the user speaks first. --- shared/src/voice.ts | 6 +++++- web/src/realtime/GeminiLiveVoiceSession.tsx | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 7b092e001f..b2b6f74e72 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -148,7 +148,11 @@ IMPORTANT: Always respond in Chinese (Mandarin). 
Use natural spoken Chinese. - Greet users in Chinese - Summarize technical content in Chinese - Use English only for proper nouns, tool names, and code identifiers -- Keep the same warm, concise conversational style in Chinese` +- Keep the same warm, concise conversational style in Chinese + +# First Interaction + +When the user speaks to you for the first time, begin your response with a brief greeting (e.g. "你好!") before addressing their request. If their first message is a coding request, greet briefly AND call the tool — do both.` export const VOICE_FIRST_MESSAGE = "嗨!我是 Hapi 语音助手,有什么可以帮你的?" diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index eff1d0aebf..16974c7bd3 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -154,7 +154,7 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { // Start audio capture startAudioCapture() - // Send initial context if available (no greeting to preserve tool call ability) + // Send initial context if available (no clientContent greeting — it breaks tool calls) if (config.initialContext) { sendClientContent(`[Context] ${config.initialContext}`) } From f5cbd0ef2dc0b7c1a22402c6c8a9d376893083ab Mon Sep 17 00:00:00 2001 From: yuhan Date: Tue, 21 Apr 2026 08:55:35 +0000 Subject: [PATCH 11/21] =?UTF-8?q?fix(voice):=20address=20PR=20review=20?= =?UTF-8?q?=E2=80=94=20add=20JWT=20auth=20to=20WS=20proxy,=20stop=20leakin?= =?UTF-8?q?g=20API=20keys?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add JWT token verification to /api/voice/gemini-ws and /api/voice/qwen-ws WebSocket endpoints before upgrading, preventing unauthenticated access to server-side API credentials (BLOCKER) - Revert DEFAULT_VOICE_BACKEND to 'elevenlabs' so existing installs that only configured ElevenLabs are not broken (MAJOR) - Remove raw DashScope API key from /voice/qwen-token response; the hub 
proxy handles the key server-side, so the browser never needs it (MAJOR) - Update frontend to pass JWT via ?token= query param on WS connections - Change composer send shortcut from Enter to Ctrl/Cmd+Enter --- hub/src/web/routes/voice.ts | 19 ++-- hub/src/web/server.ts | 98 +++++++++++++++++-- shared/src/voice.ts | 2 +- web/src/api/client.ts | 6 +- web/src/api/voice.ts | 1 - .../AssistantChat/HappyComposer.tsx | 18 ++-- web/src/realtime/GeminiLiveVoiceSession.tsx | 17 +++- web/src/realtime/QwenVoiceSession.tsx | 9 +- 8 files changed, 138 insertions(+), 32 deletions(-) diff --git a/hub/src/web/routes/voice.ts b/hub/src/web/routes/voice.ts index f71b652116..a3f1ac8fcb 100644 --- a/hub/src/web/routes/voice.ts +++ b/hub/src/web/routes/voice.ts @@ -140,16 +140,21 @@ export function createVoiceRoutes(): Hono { }, 400) } + // Use server-side WS proxy to avoid region restrictions. + // The proxy at /api/voice/gemini-ws handles the API key server-side. + const publicUrl = process.env.HAPI_PUBLIC_URL || `http://localhost:${process.env.HAPI_LISTEN_PORT || '24888'}` + const wsProxyUrl = publicUrl.replace(/^http/, 'ws') + '/api/voice/gemini-ws' + return c.json({ allowed: true, - apiKey, - // Optional overrides for proxy/relay setups - wsUrl: process.env.GEMINI_LIVE_WS_URL || undefined, + apiKey: 'proxied', // Dummy — key is handled server-side + wsUrl: process.env.GEMINI_LIVE_WS_URL || wsProxyUrl, baseUrl: process.env.GEMINI_API_BASE || undefined }) }) - // Get Qwen (DashScope) API key for Qwen Realtime voice sessions + // Check Qwen (DashScope) availability for Qwen Realtime voice sessions + // The actual API key is never sent to the browser — it stays server-side in the WS proxy. 
app.post('/voice/qwen-token', async (c) => { const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY if (!apiKey) { @@ -159,10 +164,12 @@ export function createVoiceRoutes(): Hono { }, 400) } + const publicUrl = process.env.HAPI_PUBLIC_URL || `http://localhost:${process.env.HAPI_LISTEN_PORT || '24888'}` + const wsProxyUrl = publicUrl.replace(/^http/, 'ws') + '/api/voice/qwen-ws' + return c.json({ allowed: true, - apiKey, - wsUrl: process.env.QWEN_REALTIME_WS_URL || undefined + wsUrl: process.env.QWEN_REALTIME_WS_URL || wsProxyUrl }) }) diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index 180e4e2f07..b4efa1b49b 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -23,6 +23,54 @@ import type { SSEManager } from '../sse/sseManager' import type { VisibilityTracker } from '../visibility/visibilityTracker' import type { Server as BunServer, ServerWebSocket } from 'bun' import type { Server as SocketEngine } from '@socket.io/bun-engine' +import { jwtVerify } from 'jose' + +// Gemini Live WebSocket proxy — relays browser WS to Google, bypassing region restrictions +function createGeminiProxyWebSocketHandler() { + const GEMINI_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' + const upstreamMap = new WeakMap, WebSocket>() + + return { + open(clientWs: ServerWebSocket) { + const data = clientWs.data as { _geminiProxy: boolean; apiKey: string } + const upstreamUrl = `${process.env.GEMINI_LIVE_WS_URL || GEMINI_WS_BASE}?key=${encodeURIComponent(data.apiKey)}` + + const upstream = new WebSocket(upstreamUrl) + upstreamMap.set(clientWs, upstream) + + upstream.onopen = () => { + // Ready — client will send setup message + } + upstream.onmessage = (event) => { + try { + if (clientWs.readyState === 1) { + clientWs.send(typeof event.data === 'string' ? 
event.data : new Uint8Array(event.data as ArrayBuffer)) + } + } catch { /* client gone */ } + } + upstream.onerror = () => { + try { clientWs.close(1011, 'Upstream error') } catch { /* */ } + } + upstream.onclose = (event) => { + try { clientWs.close(event.code, event.reason) } catch { /* */ } + upstreamMap.delete(clientWs) + } + }, + message(clientWs: ServerWebSocket, message: string | ArrayBuffer | Uint8Array) { + const upstream = upstreamMap.get(clientWs) + if (upstream?.readyState === WebSocket.OPEN) { + upstream.send(typeof message === 'string' ? message : message) + } + }, + close(clientWs: ServerWebSocket, code: number, reason: string) { + const upstream = upstreamMap.get(clientWs) + if (upstream) { + try { upstream.close(code, reason) } catch { /* */ } + upstreamMap.delete(clientWs) + } + } + } +} // Qwen Realtime WebSocket proxy — bridges browser (no custom headers) to DashScope (requires Authorization header) function createQwenProxyWebSocketHandler() { @@ -284,6 +332,7 @@ export async function startWebServer(options: { // Wrap socket.io websocket handler to also support Qwen Realtime proxy const originalWsHandler = socketHandler.websocket + const geminiProxyHandler = createGeminiProxyWebSocketHandler() const qwenProxyHandler = createQwenProxyWebSocketHandler() // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -295,35 +344,70 @@ export async function startWebServer(options: { websocket: { ...originalWsHandler, open(ws: unknown) { - const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean }> - if (wsAny.data?._qwenProxy) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean; _geminiProxy?: boolean }> + if (wsAny.data?._geminiProxy) { + geminiProxyHandler.open(wsAny) + } else if (wsAny.data?._qwenProxy) { qwenProxyHandler.open(wsAny) } else { originalWsHandler.open?.(ws as never) } }, message(ws: unknown, message: unknown) { - const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean }> - if (wsAny.data?._qwenProxy) { + const 
wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean; _geminiProxy?: boolean }> + if (wsAny.data?._geminiProxy) { + geminiProxyHandler.message(wsAny, message as string) + } else if (wsAny.data?._qwenProxy) { qwenProxyHandler.message(wsAny, message as string) } else { originalWsHandler.message?.(ws as never, message as never) } }, close(ws: unknown, code: number, reason: string) { - const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean }> - if (wsAny.data?._qwenProxy) { + const wsAny = ws as ServerWebSocket<{ _qwenProxy?: boolean; _geminiProxy?: boolean }> + if (wsAny.data?._geminiProxy) { + geminiProxyHandler.close(wsAny, code, reason) + } else if (wsAny.data?._qwenProxy) { qwenProxyHandler.close(wsAny, code, reason) } else { originalWsHandler.close?.(ws as never, code as never, reason as never) } } }, - fetch: (req: Request, server: { upgrade: (req: Request, opts?: unknown) => boolean }) => { + fetch: async (req: Request, server: { upgrade: (req: Request, opts?: unknown) => boolean }) => { const url = new URL(req.url) if (url.pathname.startsWith('/socket.io/')) { return socketHandler.fetch(req, server as never) } + + // Voice WebSocket proxies — require JWT auth via query param + // (browser WebSocket API cannot set custom headers) + if (url.pathname === '/api/voice/gemini-ws' || url.pathname === '/api/voice/qwen-ws') { + const token = url.searchParams.get('token') + if (!token) { + return new Response('Missing authorization token', { status: 401 }) + } + try { + await jwtVerify(token, options.jwtSecret, { algorithms: ['HS256'] }) + } catch { + return new Response('Invalid token', { status: 401 }) + } + } + + // Gemini Live WebSocket proxy + if (url.pathname === '/api/voice/gemini-ws') { + const apiKey = process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY + if (!apiKey) { + return new Response('Gemini API key not configured', { status: 400 }) + } + const upgraded = (server as unknown as { upgrade: (req: Request, opts: unknown) => boolean }).upgrade(req, 
{ + data: { _geminiProxy: true, apiKey } + }) + if (!upgraded) { + return new Response('WebSocket upgrade failed', { status: 500 }) + } + return undefined as unknown as Response + } // Qwen Realtime WebSocket proxy if (url.pathname === '/api/voice/qwen-ws') { const apiKey = process.env.DASHSCOPE_API_KEY || process.env.QWEN_API_KEY diff --git a/shared/src/voice.ts b/shared/src/voice.ts index b2b6f74e72..ad99ce2d7d 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -277,7 +277,7 @@ export type VoiceBackendType = 'elevenlabs' | 'gemini-live' | 'qwen-realtime' export const QWEN_REALTIME_MODEL = 'qwen3-omni-flash-realtime' export const QWEN_REALTIME_VOICE = 'Mia' -export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'gemini-live' +export const DEFAULT_VOICE_BACKEND: VoiceBackendType = 'elevenlabs' export const GEMINI_LIVE_MODEL = 'gemini-2.5-flash-native-audio-latest' diff --git a/web/src/api/client.ts b/web/src/api/client.ts index b199870258..ff76fe89a9 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -444,13 +444,17 @@ export class ApiClient { }) } + /** Return the current auth token (for WebSocket query-param auth). */ + getAuthToken(): string | null { + return this.getToken ? this.getToken() : this.token + } + async fetchVoiceBackend(): Promise<{ backend: string }> { return await this.request('/api/voice/backend') } async fetchQwenToken(): Promise<{ allowed: boolean - apiKey?: string wsUrl?: string error?: string }> { diff --git a/web/src/api/voice.ts b/web/src/api/voice.ts index 5e532eec3f..3105c4ab95 100644 --- a/web/src/api/voice.ts +++ b/web/src/api/voice.ts @@ -166,7 +166,6 @@ export async function createOrUpdateHapiAgent(apiKey: string): Promise 0) { + if (key === 'Enter' && suggestions.length > 0 && !e.ctrlKey && !e.metaKey) { e.preventDefault() const indexToSelect = selectedIndex >= 0 ? 
selectedIndex : 0 handleSuggestionSelect(indexToSelect) return } - // Only plain Enter (no modifiers) sends; other modifier combos are ignored - if (key === 'Enter') { + // Ctrl+Enter (Windows/Linux) or Cmd+Enter (Mac) sends the message + if (key === 'Enter' && (e.ctrlKey || e.metaKey)) { e.preventDefault() - if (!e.ctrlKey && !e.altKey && !e.metaKey && canSend) { + if (canSend) { api.composer().send() setShowContinueHint(false) } return } + // Plain Enter inserts a newline (default textarea behavior) + if (key === 'Enter') { + return + } + if (suggestions.length > 0) { if (key === 'ArrowUp') { e.preventDefault() diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 16974c7bd3..556fc76492 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -11,7 +11,7 @@ import type { ApiClient } from '@/api/client' import type { Session } from '@/types/api' import type { GeminiFunctionCall } from './gemini/toolAdapter' -const DEBUG = import.meta.env.DEV +const DEBUG = true // Default Gemini Live WebSocket API endpoint (Google direct) const DEFAULT_GEMINI_LIVE_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' @@ -65,9 +65,12 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { state.statusCallback?.('connecting') // Get API key from hub + console.log('[GeminiLive] Fetching token...') const tokenResp = await fetchGeminiToken(this.api) + console.log('[GeminiLive] Token response:', { allowed: tokenResp.allowed, hasKey: !!tokenResp.apiKey, error: tokenResp.error }) if (!tokenResp.allowed || !tokenResp.apiKey) { const msg = tokenResp.error ?? 
'Gemini API key not available' + console.error('[GeminiLive] Token failed:', msg) state.statusCallback?.('error', msg) throw new Error(msg) } @@ -75,19 +78,27 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { state.wsBaseUrl = tokenResp.wsUrl || null // Request microphone + console.log('[GeminiLive] Requesting microphone...') let permissionStream: MediaStream | null = null try { permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true }) + console.log('[GeminiLive] Microphone granted') } catch (error) { + console.error('[GeminiLive] Microphone denied:', error) state.statusCallback?.('error', 'Microphone permission denied') throw error } finally { permissionStream?.getTracks().forEach((t) => t.stop()) } - // Connect WebSocket + // Connect WebSocket — use proxy URL if provided (avoids region restrictions) const wsBase = state.wsBaseUrl || DEFAULT_GEMINI_LIVE_WS_BASE - const wsUrl = `${wsBase}?key=${encodeURIComponent(state.apiKey)}` + const isProxy = !!state.wsBaseUrl + const authToken = this.api.getAuthToken() || '' + const wsUrl = isProxy + ? `${wsBase}${wsBase.includes('?') ? '&' : '?'}token=${encodeURIComponent(authToken)}` + : `${wsBase}?key=${encodeURIComponent(state.apiKey)}` + console.log('[GeminiLive] Connecting WebSocket to:', wsBase, isProxy ? '(proxied)' : '(direct)') const ws = new WebSocket(wsUrl) state.ws = ws diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 3c9808c6f8..d6958bd930 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -79,14 +79,14 @@ class QwenVoiceSessionImpl implements VoiceSession { cleanup() state.statusCallback?.('connecting') - // Get API key from hub + // Check Qwen availability (hub no longer sends the raw API key) const tokenResp = await fetchQwenToken(this.api) - if (!tokenResp.allowed || !tokenResp.apiKey) { + if (!tokenResp.allowed) { const msg = tokenResp.error ?? 
'DashScope API key not available' state.statusCallback?.('error', msg) throw new Error(msg) } - state.apiKey = tokenResp.apiKey + state.apiKey = null // key stays server-side state.wsBaseUrl = tokenResp.wsUrl || null // Request microphone @@ -105,7 +105,8 @@ class QwenVoiceSessionImpl implements VoiceSession { const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:' const proxyBase = state.wsBaseUrl || `${protocol}//${window.location.host}` const model = QWEN_REALTIME_MODEL - const wsUrl = `${proxyBase}/api/voice/qwen-ws?model=${encodeURIComponent(model)}` + const authToken = this.api.getAuthToken() || '' + const wsUrl = `${proxyBase}/api/voice/qwen-ws?model=${encodeURIComponent(model)}&token=${encodeURIComponent(authToken)}` const ws = new WebSocket(wsUrl) state.ws = ws From e32c1f651abcd691638a0e3f93942903848ea02b Mon Sep 17 00:00:00 2001 From: yuhan Date: Tue, 21 Apr 2026 09:29:51 +0000 Subject: [PATCH 12/21] fix(voice): gate voice button on backend discovery readiness VoiceBackendSession now fires onReadyChange(true) after backend discovery completes. SessionChat disables the voice toggle until ready, preventing silent drops when the user taps before registerVoiceSession() has run. 
--- web/src/components/SessionChat.tsx | 6 ++++-- web/src/realtime/VoiceBackendSession.tsx | 13 ++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/web/src/components/SessionChat.tsx b/web/src/components/SessionChat.tsx index 62ea61a14c..40489d809a 100644 --- a/web/src/components/SessionChat.tsx +++ b/web/src/components/SessionChat.tsx @@ -80,6 +80,7 @@ export function SessionChat(props: { // Voice assistant integration const voice = useVoiceOptional() + const [voiceBackendReady, setVoiceBackendReady] = useState(false) // Register session store for voice client tools useEffect(() => { @@ -423,8 +424,8 @@ export function SessionChat(props: { autocompleteSuggestions={props.autocompleteSuggestions} voiceStatus={voice?.status} voiceMicMuted={voice?.micMuted} - onVoiceToggle={voice ? handleVoiceToggle : undefined} - onVoiceMicToggle={voice ? handleVoiceMicToggle : undefined} + onVoiceToggle={voice && voiceBackendReady ? handleVoiceToggle : undefined} + onVoiceMicToggle={voice && voiceBackendReady ? 
handleVoiceMicToggle : undefined} /> @@ -435,6 +436,7 @@ export function SessionChat(props: { api={props.api} micMuted={voice.micMuted} onStatusChange={voice.setStatus} + onReadyChange={setVoiceBackendReady} /> )} diff --git a/web/src/realtime/VoiceBackendSession.tsx b/web/src/realtime/VoiceBackendSession.tsx index c23dfa1509..00c67a5557 100644 --- a/web/src/realtime/VoiceBackendSession.tsx +++ b/web/src/realtime/VoiceBackendSession.tsx @@ -15,6 +15,7 @@ const QwenVoiceSession = lazy(() => export type VoiceBackendSessionProps = RealtimeVoiceSessionProps & { api: ApiClient + onReadyChange?: (ready: boolean) => void } /** @@ -27,10 +28,16 @@ export function VoiceBackendSession(props: VoiceBackendSessionProps) { useEffect(() => { let cancelled = false fetchVoiceBackend(props.api).then((resp) => { - if (!cancelled) setBackend(resp.backend) + if (!cancelled) { + setBackend(resp.backend) + props.onReadyChange?.(true) + } }) - return () => { cancelled = true } - }, [props.api]) + return () => { + cancelled = true + props.onReadyChange?.(false) + } + }, [props.api]) // eslint-disable-line react-hooks/exhaustive-deps if (!backend) return null From fbf315bf8a078906968b9f8cf8e4691d3a57db5e Mon Sep 17 00:00:00 2001 From: yuhan Date: Tue, 21 Apr 2026 09:48:40 +0000 Subject: [PATCH 13/21] fix(voice): fix Qwen URL duplication, defer ready until session registered, update tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix BLOCKER: Qwen proxy URL was concatenated twice — wsBaseUrl from hub already contains /api/voice/qwen-ws, so the client no longer appends it again - Fix MAJOR: onReadyChange now fires from onRegistered callback inside each backend component (after registerVoiceSession completes), not from VoiceBackendSession after discovery. 
This ensures lazy-loaded Gemini/Qwen chunks have fully mounted before the voice button enables - Fix MAJOR: update voice.test.ts to assert new proxy contract — gemini-token returns { apiKey: 'proxied', wsUrl }, qwen-token returns { allowed, wsUrl } with no apiKey field --- hub/src/web/routes/voice.test.ts | 26 ++++++++++++--------- web/src/realtime/GeminiLiveVoiceSession.tsx | 5 +++- web/src/realtime/QwenVoiceSession.tsx | 11 ++++++--- web/src/realtime/RealtimeVoiceSession.tsx | 3 +++ web/src/realtime/VoiceBackendSession.tsx | 18 +++++++------- 5 files changed, 40 insertions(+), 23 deletions(-) diff --git a/hub/src/web/routes/voice.test.ts b/hub/src/web/routes/voice.test.ts index da989374c2..f2eb2444ba 100644 --- a/hub/src/web/routes/voice.test.ts +++ b/hub/src/web/routes/voice.test.ts @@ -1,4 +1,4 @@ -import { describe, test, expect, beforeEach, afterEach } from 'bun:test' +import { describe, test, expect, afterEach } from 'bun:test' import { Hono } from 'hono' import type { WebAppEnv } from '../middleware/auth' import { createVoiceRoutes } from './voice' @@ -79,15 +79,16 @@ describe('POST /api/voice/gemini-token', () => { expect(body.error).toContain('not configured') }) - test('returns GEMINI_API_KEY when set', async () => { + test('returns proxied wsUrl when GEMINI_API_KEY is set', async () => { process.env.GEMINI_API_KEY = 'test-gemini-key' delete process.env.GOOGLE_API_KEY const app = createApp() const res = await app.request('/api/voice/gemini-token', { method: 'POST' }) expect(res.status).toBe(200) - const body = await res.json() as { allowed: boolean; apiKey: string } + const body = await res.json() as { allowed: boolean; apiKey: string; wsUrl: string } expect(body.allowed).toBe(true) - expect(body.apiKey).toBe('test-gemini-key') + expect(body.apiKey).toBe('proxied') + expect(body.wsUrl).toContain('/api/voice/gemini-ws') }) test('falls back to GOOGLE_API_KEY', async () => { @@ -96,9 +97,10 @@ describe('POST /api/voice/gemini-token', () => { const app = 
createApp() const res = await app.request('/api/voice/gemini-token', { method: 'POST' }) expect(res.status).toBe(200) - const body = await res.json() as { allowed: boolean; apiKey: string } + const body = await res.json() as { allowed: boolean; apiKey: string; wsUrl: string } expect(body.allowed).toBe(true) - expect(body.apiKey).toBe('test-google-key') + expect(body.apiKey).toBe('proxied') + expect(body.wsUrl).toContain('/api/voice/gemini-ws') }) }) @@ -124,15 +126,16 @@ describe('POST /api/voice/qwen-token', () => { expect(body.error).toContain('not configured') }) - test('returns DASHSCOPE_API_KEY when set', async () => { + test('returns wsUrl when DASHSCOPE_API_KEY is set (no raw key exposed)', async () => { process.env.DASHSCOPE_API_KEY = 'test-dash-key' delete process.env.QWEN_API_KEY const app = createApp() const res = await app.request('/api/voice/qwen-token', { method: 'POST' }) expect(res.status).toBe(200) - const body = await res.json() as { allowed: boolean; apiKey: string } + const body = await res.json() as { allowed: boolean; wsUrl: string } expect(body.allowed).toBe(true) - expect(body.apiKey).toBe('test-dash-key') + expect(body.wsUrl).toContain('/api/voice/qwen-ws') + expect(body).not.toHaveProperty('apiKey') }) test('falls back to QWEN_API_KEY', async () => { @@ -141,8 +144,9 @@ describe('POST /api/voice/qwen-token', () => { const app = createApp() const res = await app.request('/api/voice/qwen-token', { method: 'POST' }) expect(res.status).toBe(200) - const body = await res.json() as { allowed: boolean; apiKey: string } + const body = await res.json() as { allowed: boolean; wsUrl: string } expect(body.allowed).toBe(true) - expect(body.apiKey).toBe('test-qwen-key') + expect(body.wsUrl).toContain('/api/voice/qwen-ws') + expect(body).not.toHaveProperty('apiKey') }) }) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 556fc76492..f99c8ba561 100644 --- 
a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -307,6 +307,7 @@ export interface GeminiLiveVoiceSessionProps { api: ApiClient micMuted?: boolean onStatusChange?: StatusCallback + onRegistered?: () => void getSession?: (sessionId: string) => Session | null sendMessage?: (sessionId: string, message: string) => void approvePermission?: (sessionId: string, requestId: string) => Promise @@ -317,6 +318,7 @@ export function GeminiLiveVoiceSession({ api, micMuted = false, onStatusChange, + onRegistered, getSession, sendMessage, approvePermission, @@ -349,11 +351,12 @@ export function GeminiLiveVoiceSession({ try { registerVoiceSession(new GeminiLiveVoiceSessionImpl(api)) hasRegistered.current = true + onRegistered?.() } catch (error) { console.error('[GeminiLive] Failed to register voice session:', error) } } - }, [api]) + }, [api]) // eslint-disable-line react-hooks/exhaustive-deps // Sync mic mute state useEffect(() => { diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index d6958bd930..6cd993d875 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -103,10 +103,12 @@ class QwenVoiceSessionImpl implements VoiceSession { // Connect via Hub WebSocket proxy (DashScope requires Authorization header, // which browser WebSocket API doesn't support) const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:' - const proxyBase = state.wsBaseUrl || `${protocol}//${window.location.host}` + const defaultProxyUrl = `${protocol}//${window.location.host}/api/voice/qwen-ws` + const proxyUrl = state.wsBaseUrl || defaultProxyUrl const model = QWEN_REALTIME_MODEL const authToken = this.api.getAuthToken() || '' - const wsUrl = `${proxyBase}/api/voice/qwen-ws?model=${encodeURIComponent(model)}&token=${encodeURIComponent(authToken)}` + const separator = proxyUrl.includes('?') ? '&' : '?' 
+ const wsUrl = `${proxyUrl}${separator}model=${encodeURIComponent(model)}&token=${encodeURIComponent(authToken)}` const ws = new WebSocket(wsUrl) state.ws = ws @@ -318,6 +320,7 @@ export interface QwenVoiceSessionProps { api: ApiClient micMuted?: boolean onStatusChange?: StatusCallback + onRegistered?: () => void getSession?: (sessionId: string) => Session | null sendMessage?: (sessionId: string, message: string) => void approvePermission?: (sessionId: string, requestId: string) => Promise @@ -328,6 +331,7 @@ export function QwenVoiceSession({ api, micMuted = false, onStatusChange, + onRegistered, getSession, sendMessage, approvePermission, @@ -357,11 +361,12 @@ export function QwenVoiceSession({ try { registerVoiceSession(new QwenVoiceSessionImpl(api)) hasRegistered.current = true + onRegistered?.() } catch (error) { console.error('[Qwen] Failed to register voice session:', error) } } - }, [api]) + }, [api]) // eslint-disable-line react-hooks/exhaustive-deps useEffect(() => { if (state.recorder) { diff --git a/web/src/realtime/RealtimeVoiceSession.tsx b/web/src/realtime/RealtimeVoiceSession.tsx index fff9b7b44b..7bfac5953a 100644 --- a/web/src/realtime/RealtimeVoiceSession.tsx +++ b/web/src/realtime/RealtimeVoiceSession.tsx @@ -126,6 +126,7 @@ export interface RealtimeVoiceSessionProps { api: ApiClient micMuted?: boolean onStatusChange?: StatusCallback + onRegistered?: () => void getSession?: (sessionId: string) => Session | null sendMessage?: (sessionId: string, message: string) => void approvePermission?: (sessionId: string, requestId: string) => Promise @@ -136,6 +137,7 @@ export function RealtimeVoiceSession({ api, micMuted: micMutedProp = false, onStatusChange, + onRegistered, getSession, sendMessage, approvePermission, @@ -231,6 +233,7 @@ export function RealtimeVoiceSession({ try { registerVoiceSession(new RealtimeVoiceSessionImpl(api)) hasRegistered.current = true + onRegistered?.() } catch (error) { console.error('[Voice] Failed to register voice 
session:', error) } diff --git a/web/src/realtime/VoiceBackendSession.tsx b/web/src/realtime/VoiceBackendSession.tsx index 00c67a5557..b990d07bd7 100644 --- a/web/src/realtime/VoiceBackendSession.tsx +++ b/web/src/realtime/VoiceBackendSession.tsx @@ -1,4 +1,4 @@ -import { lazy, Suspense, useEffect, useState } from 'react' +import { lazy, Suspense, useCallback, useEffect, useState } from 'react' import { RealtimeVoiceSession } from './RealtimeVoiceSession' import type { RealtimeVoiceSessionProps } from './RealtimeVoiceSession' import { fetchVoiceBackend } from '@/api/voice' @@ -21,6 +21,7 @@ export type VoiceBackendSessionProps = RealtimeVoiceSessionProps & { /** * Dynamically selects the voice session component based on the hub's configured backend. * Queries GET /voice/backend once on mount and renders the appropriate component. + * Only signals readiness after the selected backend has mounted and registered its session. */ export function VoiceBackendSession(props: VoiceBackendSessionProps) { const [backend, setBackend] = useState(null) @@ -28,10 +29,7 @@ export function VoiceBackendSession(props: VoiceBackendSessionProps) { useEffect(() => { let cancelled = false fetchVoiceBackend(props.api).then((resp) => { - if (!cancelled) { - setBackend(resp.backend) - props.onReadyChange?.(true) - } + if (!cancelled) setBackend(resp.backend) }) return () => { cancelled = true @@ -39,12 +37,16 @@ export function VoiceBackendSession(props: VoiceBackendSessionProps) { } }, [props.api]) // eslint-disable-line react-hooks/exhaustive-deps + const handleRegistered = useCallback(() => { + props.onReadyChange?.(true) + }, [props.onReadyChange]) + if (!backend) return null if (backend === 'gemini-live') { return ( - + ) } @@ -52,10 +54,10 @@ export function VoiceBackendSession(props: VoiceBackendSessionProps) { if (backend === 'qwen-realtime') { return ( - + ) } - return + return } From c68366a5c0278a12a325b69cdda7fbbdf16c920f Mon Sep 17 00:00:00 2001 From: yuhan Date: Tue, 21 Apr 
2026 10:02:13 +0000 Subject: [PATCH 14/21] fix(voice): use request origin for proxy URL, connect worklet to sink, serialize tool calls - Fix BLOCKER: derive wsUrl from request origin instead of hard-coded localhost:24888 fallback, so remote browsers connect back to the hub. HAPI_PUBLIC_URL still overrides when set. - Fix MAJOR: connect AudioWorklet node to a silent GainNode sink so the audio graph pulls frames and port.onmessage fires correctly. - Fix MAJOR: replace Promise.all with sequential for-loop in handleGeminiFunctionCalls to prevent racing on shared session state (e.g. duplicate processPermissionRequest resolutions). --- hub/src/web/routes/voice.ts | 8 ++++++-- web/src/realtime/gemini/audioRecorder.ts | 7 +++++++ web/src/realtime/gemini/toolAdapter.ts | 10 ++++++++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/hub/src/web/routes/voice.ts b/hub/src/web/routes/voice.ts index a3f1ac8fcb..875b8034c2 100644 --- a/hub/src/web/routes/voice.ts +++ b/hub/src/web/routes/voice.ts @@ -142,7 +142,10 @@ export function createVoiceRoutes(): Hono { // Use server-side WS proxy to avoid region restrictions. // The proxy at /api/voice/gemini-ws handles the API key server-side. - const publicUrl = process.env.HAPI_PUBLIC_URL || `http://localhost:${process.env.HAPI_LISTEN_PORT || '24888'}` + // Derive wsUrl from the request origin so remote browsers connect back to the hub, + // not to localhost. HAPI_PUBLIC_URL overrides when set (e.g. behind a reverse proxy). 
+ const requestOrigin = new URL(c.req.url).origin + const publicUrl = process.env.HAPI_PUBLIC_URL || requestOrigin const wsProxyUrl = publicUrl.replace(/^http/, 'ws') + '/api/voice/gemini-ws' return c.json({ @@ -164,7 +167,8 @@ export function createVoiceRoutes(): Hono { }, 400) } - const publicUrl = process.env.HAPI_PUBLIC_URL || `http://localhost:${process.env.HAPI_LISTEN_PORT || '24888'}` + const requestOrigin = new URL(c.req.url).origin + const publicUrl = process.env.HAPI_PUBLIC_URL || requestOrigin const wsProxyUrl = publicUrl.replace(/^http/, 'ws') + '/api/voice/qwen-ws' return c.json({ diff --git a/web/src/realtime/gemini/audioRecorder.ts b/web/src/realtime/gemini/audioRecorder.ts index b1c01a7c47..98813212a0 100644 --- a/web/src/realtime/gemini/audioRecorder.ts +++ b/web/src/realtime/gemini/audioRecorder.ts @@ -69,7 +69,14 @@ export class GeminiAudioRecorder { const base64 = arrayBufferToBase64(pcm16); onChunk(base64); }; + // Connect source → worklet → silent sink → destination. + // The downstream connection is required so the audio graph pulls + // frames through the worklet node and port.onmessage fires. + const sink = this.audioContext.createGain(); + sink.gain.value = 0; this.sourceNode.connect(this.workletNode); + this.workletNode.connect(sink); + sink.connect(this.audioContext.destination); } catch (e) { console.warn('[GeminiLive] AudioWorklet failed, falling back to ScriptProcessorNode', e); this.scriptNode = this.audioContext.createScriptProcessor(4096, 1, 1); diff --git a/web/src/realtime/gemini/toolAdapter.ts b/web/src/realtime/gemini/toolAdapter.ts index dd44e4fb11..dbf4dee9c9 100644 --- a/web/src/realtime/gemini/toolAdapter.ts +++ b/web/src/realtime/gemini/toolAdapter.ts @@ -61,10 +61,16 @@ export async function handleGeminiFunctionCall( } /** - * Process multiple function calls in parallel and return all responses. + * Process multiple function calls sequentially to avoid racing on shared + * session state (e.g. 
processPermissionRequest resolving the same pending + * request twice when calls run in parallel). */ export async function handleGeminiFunctionCalls( calls: GeminiFunctionCall[] ): Promise { - return Promise.all(calls.map(handleGeminiFunctionCall)) + const responses: GeminiFunctionResponse[] = [] + for (const call of calls) { + responses.push(await handleGeminiFunctionCall(call)) + } + return responses } From 759bf35961d361808eb3b3a8085275e26f5f8ce8 Mon Sep 17 00:00:00 2001 From: yuhan Date: Tue, 21 Apr 2026 11:00:40 +0000 Subject: [PATCH 15/21] fix(voice): create playback AudioContext in user gesture for mobile autoplay Create and resume the playback AudioContext at the start of startSession(), while still inside the user's click/tap gesture. Pass the pre-created context to GeminiAudioPlayer so mobile browsers (iOS Safari) don't block audio output due to autoplay policy. Applies to both Gemini Live and Qwen Realtime backends. --- web/src/realtime/GeminiLiveVoiceSession.tsx | 11 ++++++++--- web/src/realtime/QwenVoiceSession.tsx | 11 ++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index f99c8ba561..a67f130408 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -64,6 +64,11 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { cleanup() state.statusCallback?.('connecting') + // Create playback AudioContext immediately while still inside the user + // gesture (click/tap). Mobile browsers require this for autoplay policy. 
+ const playbackContext = new AudioContext({ sampleRate: 24000 }) + await playbackContext.resume() + // Get API key from hub console.log('[GeminiLive] Fetching token...') const tokenResp = await fetchGeminiToken(this.api) @@ -163,7 +168,7 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { state.statusCallback?.('connected') // Start audio capture - startAudioCapture() + startAudioCapture(playbackContext) // Send initial context if available (no clientContent greeting — it breaks tool calls) if (config.initialContext) { @@ -288,8 +293,8 @@ function sendAudioChunk(base64Pcm: string): void { })) } -function startAudioCapture(): void { - state.player = new GeminiAudioPlayer() +function startAudioCapture(playbackContext: AudioContext): void { + state.player = new GeminiAudioPlayer(playbackContext) state.recorder = new GeminiAudioRecorder() state.recorder.start( diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 6cd993d875..3bdb813f0f 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -79,6 +79,11 @@ class QwenVoiceSessionImpl implements VoiceSession { cleanup() state.statusCallback?.('connecting') + // Create playback AudioContext immediately while still inside the user + // gesture (click/tap). Mobile browsers require this for autoplay policy. 
+ const playbackContext = new AudioContext({ sampleRate: 24000 }) + await playbackContext.resume() + // Check Qwen availability (hub no longer sends the raw API key) const tokenResp = await fetchQwenToken(this.api) if (!tokenResp.allowed) { @@ -173,7 +178,7 @@ class QwenVoiceSessionImpl implements VoiceSession { if (eventType === 'session.updated') { if (DEBUG) console.log('[Qwen] Session configured') state.statusCallback?.('connected') - startAudioCapture() + startAudioCapture(playbackContext) resolve() return } @@ -299,8 +304,8 @@ class QwenVoiceSessionImpl implements VoiceSession { } } -function startAudioCapture(): void { - state.player = new GeminiAudioPlayer() +function startAudioCapture(playbackContext: AudioContext): void { + state.player = new GeminiAudioPlayer(playbackContext) state.recorder = new GeminiAudioRecorder() state.recorder.start( From 74aa4c2fc960ce30719610ece83ae4e7f2857851 Mon Sep 17 00:00:00 2001 From: yuhan Date: Tue, 21 Apr 2026 11:13:55 +0000 Subject: [PATCH 16/21] fix(voice): store playback AudioContext in state and close on cleanup Move playback AudioContext into module state so cleanup() can close it on failed starts and normal stop/start cycles. Prevents orphaned AudioContext leaks that would exhaust mobile browser limits after repeated retries. 
--- web/src/realtime/GeminiLiveVoiceSession.tsx | 13 ++++++++++--- web/src/realtime/QwenVoiceSession.tsx | 13 ++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index a67f130408..3a81e97300 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -20,6 +20,7 @@ interface GeminiLiveState { ws: WebSocket | null recorder: GeminiAudioRecorder | null player: GeminiAudioPlayer | null + playbackContext: AudioContext | null statusCallback: StatusCallback | null apiKey: string | null wsBaseUrl: string | null @@ -30,6 +31,7 @@ const state: GeminiLiveState = { ws: null, recorder: null, player: null, + playbackContext: null, statusCallback: null, apiKey: null, wsBaseUrl: null, @@ -45,6 +47,10 @@ function cleanup() { state.player.dispose() state.player = null } + if (state.playbackContext && state.playbackContext.state !== 'closed') { + void state.playbackContext.close() + } + state.playbackContext = null if (state.ws) { if (state.ws.readyState === WebSocket.OPEN || state.ws.readyState === WebSocket.CONNECTING) { state.ws.close() @@ -66,8 +72,9 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { // Create playback AudioContext immediately while still inside the user // gesture (click/tap). Mobile browsers require this for autoplay policy. - const playbackContext = new AudioContext({ sampleRate: 24000 }) - await playbackContext.resume() + // Store in state so cleanup() can close it on failure or stop. + state.playbackContext = new AudioContext({ sampleRate: 24000 }) + await state.playbackContext.resume() // Get API key from hub console.log('[GeminiLive] Fetching token...') @@ -168,7 +175,7 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { state.statusCallback?.('connected') // Start audio capture - startAudioCapture(playbackContext) + startAudioCapture(state.playbackContext!) 
// Send initial context if available (no clientContent greeting — it breaks tool calls) if (config.initialContext) { diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 3bdb813f0f..4c9ab1d48c 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -23,6 +23,7 @@ interface QwenState { ws: WebSocket | null recorder: GeminiAudioRecorder | null player: GeminiAudioPlayer | null + playbackContext: AudioContext | null statusCallback: StatusCallback | null apiKey: string | null wsBaseUrl: string | null @@ -32,6 +33,7 @@ const state: QwenState = { ws: null, recorder: null, player: null, + playbackContext: null, statusCallback: null, apiKey: null, wsBaseUrl: null @@ -51,6 +53,10 @@ function cleanup() { state.player.dispose() state.player = null } + if (state.playbackContext && state.playbackContext.state !== 'closed') { + void state.playbackContext.close() + } + state.playbackContext = null if (state.ws) { if (state.ws.readyState === WebSocket.OPEN || state.ws.readyState === WebSocket.CONNECTING) { state.ws.close() @@ -81,8 +87,9 @@ class QwenVoiceSessionImpl implements VoiceSession { // Create playback AudioContext immediately while still inside the user // gesture (click/tap). Mobile browsers require this for autoplay policy. - const playbackContext = new AudioContext({ sampleRate: 24000 }) - await playbackContext.resume() + // Store in state so cleanup() can close it on failure or stop. + state.playbackContext = new AudioContext({ sampleRate: 24000 }) + await state.playbackContext.resume() // Check Qwen availability (hub no longer sends the raw API key) const tokenResp = await fetchQwenToken(this.api) @@ -178,7 +185,7 @@ class QwenVoiceSessionImpl implements VoiceSession { if (eventType === 'session.updated') { if (DEBUG) console.log('[Qwen] Session configured') state.statusCallback?.('connected') - startAudioCapture(playbackContext) + startAudioCapture(state.playbackContext!) 
resolve() return } From 296dc85a328ff0339286bbb238bfe843d6754bf8 Mon Sep 17 00:00:00 2001 From: yuhan Date: Tue, 21 Apr 2026 11:26:38 +0000 Subject: [PATCH 17/21] fix(pwa,voice): remove forced SW activation, guard voice debug logs - Remove skipWaiting + clientsClaim from service worker to prevent breaking lazy-loaded voice chunks in already-open tabs after deploy. New SW now waits for all tabs to close before activating. - Wrap messageCodingAgent console.log calls with VOICE_CONFIG debug guard to stop logging user prompts and session IDs in production. --- web/src/realtime/realtimeClientTools.ts | 6 ++++-- web/src/sw.ts | 7 +++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/web/src/realtime/realtimeClientTools.ts b/web/src/realtime/realtimeClientTools.ts index 962898c569..a2490ac816 100644 --- a/web/src/realtime/realtimeClientTools.ts +++ b/web/src/realtime/realtimeClientTools.ts @@ -45,8 +45,10 @@ export const realtimeClientTools = { return 'error (session store not available)' } - console.log('[Voice] messageCodingAgent called with:', message) - console.log('[Voice] Sending message to session:', sessionId) + if (VOICE_CONFIG.ENABLE_DEBUG_LOGGING) { + console.log('[Voice] messageCodingAgent called with:', message) + console.log('[Voice] Sending message to session:', sessionId) + } sessionStore.sendMessage(sessionId, message) return "sent [DO NOT say anything else, simply say 'sent']" diff --git a/web/src/sw.ts b/web/src/sw.ts index 732ebef299..62be9dd29d 100644 --- a/web/src/sw.ts +++ b/web/src/sw.ts @@ -21,10 +21,9 @@ type PushPayload = { } } -// Activate new SW immediately without waiting for all tabs to close -self.addEventListener('install', () => { self.skipWaiting() }) -self.addEventListener('activate', (event) => { event.waitUntil(self.clients.claim()) }) - +// Let the new service worker wait until all tabs close before activating. +// Immediate skipWaiting + clientsClaim can break lazy-loaded chunks (e.g. 
voice) +// when the old app shell requests hashes that the new precache no longer serves. precacheAndRoute(self.__WB_MANIFEST) registerRoute( From f108c0f2d7d4af4f49e32cd11e72d79d92b515ea Mon Sep 17 00:00:00 2001 From: yuhan Date: Tue, 21 Apr 2026 13:28:40 +0000 Subject: [PATCH 18/21] fix(voice): reject startup promise on early WS close, fix debug flag - Gemini: reject the startup promise in onclose when setup hasn't completed, preventing the UI from hanging in 'connecting' state - Qwen: reject on both server 'error' events and early onclose during handshake, with proper cleanup and status callback - Change Gemini DEBUG from hardcoded true to import.meta.env.DEV to stop logging connection state and tool calls in production --- web/src/realtime/GeminiLiveVoiceSession.tsx | 8 +++++++- web/src/realtime/QwenVoiceSession.tsx | 14 +++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 3a81e97300..0d663695a7 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -11,7 +11,7 @@ import type { ApiClient } from '@/api/client' import type { Session } from '@/types/api' import type { GeminiFunctionCall } from './gemini/toolAdapter' -const DEBUG = true +const DEBUG = import.meta.env.DEV // Default Gemini Live WebSocket API endpoint (Google direct) const DEFAULT_GEMINI_LIVE_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' @@ -255,6 +255,12 @@ class GeminiLiveVoiceSessionImpl implements VoiceSession { if (DEBUG) console.log('[GeminiLive] WebSocket closed:', event.code, event.reason) cleanup() resetRealtimeSessionState() + if (!setupDone) { + const message = event.reason || 'WebSocket closed before setup completed' + state.statusCallback?.('error', message) + reject(new Error(message)) + return + } 
state.statusCallback?.('disconnected') } }) diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 4c9ab1d48c..167e4dd8b4 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -259,7 +259,13 @@ class QwenVoiceSessionImpl implements VoiceSession { // Error if (eventType === 'error') { const err = data.error as { message?: string } | undefined - console.error('[Qwen] Server error:', err?.message || data) + const message = err?.message || 'Realtime session setup failed' + console.error('[Qwen] Server error:', message) + state.statusCallback?.('error', message) + if (!sessionCreated) { + reject(new Error(message)) + ws.close() + } return } } @@ -276,6 +282,12 @@ class QwenVoiceSessionImpl implements VoiceSession { if (DEBUG) console.log('[Qwen] WebSocket closed:', event.code, event.reason) cleanup() resetRealtimeSessionState() + if (!sessionCreated) { + const message = event.reason || 'WebSocket closed before setup completed' + state.statusCallback?.('error', message) + reject(new Error(message)) + return + } state.statusCallback?.('disconnected') } }) From 7111b67f34a3e31d3af2aad2ce7b7bf016575469 Mon Sep 17 00:00:00 2001 From: yuhan Date: Wed, 22 Apr 2026 02:11:59 +0000 Subject: [PATCH 19/21] fix(voice): queue Gemini proxy messages during connect, fix Qwen startup hang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gemini proxy: buffer client messages (especially the setup frame) in a pending queue while the upstream WebSocket is still CONNECTING, then flush on open. Previously the setup frame was silently dropped. Qwen: rename sessionCreated → sessionReady and only set it after session.updated arrives. If the server fails between session.created and session.updated the promise now correctly rejects instead of hanging forever. 
--- hub/src/web/server.ts | 16 +++++++++++++++- web/src/realtime/QwenVoiceSession.tsx | 12 ++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/hub/src/web/server.ts b/hub/src/web/server.ts index b4efa1b49b..3272902be6 100644 --- a/hub/src/web/server.ts +++ b/hub/src/web/server.ts @@ -29,17 +29,24 @@ import { jwtVerify } from 'jose' function createGeminiProxyWebSocketHandler() { const GEMINI_WS_BASE = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' const upstreamMap = new WeakMap, WebSocket>() + const pendingMap = new WeakMap, Array>() return { open(clientWs: ServerWebSocket) { const data = clientWs.data as { _geminiProxy: boolean; apiKey: string } const upstreamUrl = `${process.env.GEMINI_LIVE_WS_URL || GEMINI_WS_BASE}?key=${encodeURIComponent(data.apiKey)}` + const pending: Array = [] + pendingMap.set(clientWs, pending) const upstream = new WebSocket(upstreamUrl) upstreamMap.set(clientWs, upstream) upstream.onopen = () => { - // Ready — client will send setup message + // Flush any messages queued while upstream was connecting (e.g. setup frame) + for (const queued of pending.splice(0)) { + upstream.send(typeof queued === 'string' ? queued : queued) + } + pendingMap.delete(clientWs) } upstream.onmessage = (event) => { try { @@ -49,9 +56,11 @@ function createGeminiProxyWebSocketHandler() { } catch { /* client gone */ } } upstream.onerror = () => { + pendingMap.delete(clientWs) try { clientWs.close(1011, 'Upstream error') } catch { /* */ } } upstream.onclose = (event) => { + pendingMap.delete(clientWs) try { clientWs.close(event.code, event.reason) } catch { /* */ } upstreamMap.delete(clientWs) } @@ -60,10 +69,15 @@ function createGeminiProxyWebSocketHandler() { const upstream = upstreamMap.get(clientWs) if (upstream?.readyState === WebSocket.OPEN) { upstream.send(typeof message === 'string' ? 
message : message) + } else if (upstream?.readyState === WebSocket.CONNECTING) { + // Queue messages until upstream opens (critical for the setup frame) + const pending = pendingMap.get(clientWs) + if (pending) pending.push(message) } }, close(clientWs: ServerWebSocket, code: number, reason: string) { const upstream = upstreamMap.get(clientWs) + pendingMap.delete(clientWs) if (upstream) { try { upstream.close(code, reason) } catch { /* */ } upstreamMap.delete(clientWs) diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 167e4dd8b4..4c168a61ef 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -125,7 +125,7 @@ class QwenVoiceSessionImpl implements VoiceSession { state.ws = ws return new Promise((resolve, reject) => { - let sessionCreated = false + let sessionReady = false ws.onopen = () => { if (DEBUG) console.log('[Qwen] WebSocket connected') @@ -143,8 +143,7 @@ class QwenVoiceSessionImpl implements VoiceSession { const eventType = data.type as string // Session created - send configuration - if (eventType === 'session.created' && !sessionCreated) { - sessionCreated = true + if (eventType === 'session.created' && !sessionReady) { if (DEBUG) console.log('[Qwen] Session created') // Build tools config @@ -183,6 +182,7 @@ class QwenVoiceSessionImpl implements VoiceSession { // Session updated - ready to go if (eventType === 'session.updated') { + sessionReady = true if (DEBUG) console.log('[Qwen] Session configured') state.statusCallback?.('connected') startAudioCapture(state.playbackContext!) 
@@ -262,7 +262,7 @@ class QwenVoiceSessionImpl implements VoiceSession { const message = err?.message || 'Realtime session setup failed' console.error('[Qwen] Server error:', message) state.statusCallback?.('error', message) - if (!sessionCreated) { + if (!sessionReady) { reject(new Error(message)) ws.close() } @@ -272,7 +272,7 @@ class QwenVoiceSessionImpl implements VoiceSession { ws.onerror = (event) => { console.error('[Qwen] WebSocket error:', event) - if (!sessionCreated) { + if (!sessionReady) { state.statusCallback?.('error', 'WebSocket connection failed') reject(new Error('WebSocket connection failed')) } @@ -282,7 +282,7 @@ class QwenVoiceSessionImpl implements VoiceSession { if (DEBUG) console.log('[Qwen] WebSocket closed:', event.code, event.reason) cleanup() resetRealtimeSessionState() - if (!sessionCreated) { + if (!sessionReady) { const message = event.reason || 'WebSocket closed before setup completed' state.statusCallback?.('error', message) reject(new Error(message)) From 5c6048854fde0ffd700b98303b0a050e8e44aeab Mon Sep 17 00:00:00 2001 From: yuhan Date: Wed, 22 Apr 2026 03:28:45 +0000 Subject: [PATCH 20/21] fix(voice): pin browser wsUrl to proxy, separate ElevenLabs language from Gemini/Qwen Security: hub token endpoints now always return the proxy URL (/api/voice/*-ws) regardless of GEMINI_LIVE_WS_URL or QWEN_REALTIME_WS_URL env vars. Those env vars are upstream-only and never exposed to the browser, preventing JWT leakage to external endpoints. Language: extracted the Chinese language block from VOICE_SYSTEM_PROMPT into VOICE_CHINESE_LANGUAGE_BLOCK, appended only by Gemini and Qwen backends. ElevenLabs keeps a neutral English base prompt and controls language via its own language field, preserving user language preference. 
--- hub/src/web/routes/voice.ts | 4 ++-- shared/src/voice.ts | 23 +++++++++++++++-------- web/src/realtime/QwenVoiceSession.tsx | 6 ++++-- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/hub/src/web/routes/voice.ts b/hub/src/web/routes/voice.ts index 875b8034c2..a0863fa45e 100644 --- a/hub/src/web/routes/voice.ts +++ b/hub/src/web/routes/voice.ts @@ -151,7 +151,7 @@ export function createVoiceRoutes(): Hono { return c.json({ allowed: true, apiKey: 'proxied', // Dummy — key is handled server-side - wsUrl: process.env.GEMINI_LIVE_WS_URL || wsProxyUrl, + wsUrl: wsProxyUrl, // Always proxy — env WS URLs are upstream-only (server-side) baseUrl: process.env.GEMINI_API_BASE || undefined }) }) @@ -173,7 +173,7 @@ export function createVoiceRoutes(): Hono { return c.json({ allowed: true, - wsUrl: process.env.QWEN_REALTIME_WS_URL || wsProxyUrl + wsUrl: wsProxyUrl // Always proxy — env WS URLs are upstream-only (server-side) }) }) diff --git a/shared/src/voice.ts b/shared/src/voice.ts index ad99ce2d7d..2c2124f876 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -142,19 +142,26 @@ For builds, tests, or large file operations: - Keep conversations forward-moving with fresh insights - Assume a technical software developer audience +# First Interaction + +When the user speaks to you for the first time, begin your response with a brief greeting before addressing their request. If their first message is a coding request, greet briefly AND call the tool — do both.` + +/** + * Additional language block appended to VOICE_SYSTEM_PROMPT for Gemini/Qwen + * backends (which don't have a separate language field like ElevenLabs). + */ +export const VOICE_CHINESE_LANGUAGE_BLOCK = ` + # Language IMPORTANT: Always respond in Chinese (Mandarin). Use natural spoken Chinese. 
- Greet users in Chinese - Summarize technical content in Chinese - Use English only for proper nouns, tool names, and code identifiers -- Keep the same warm, concise conversational style in Chinese - -# First Interaction - -When the user speaks to you for the first time, begin your response with a brief greeting (e.g. "你好!") before addressing their request. If their first message is a coding request, greet briefly AND call the tool — do both.` +- Keep the same warm, concise conversational style in Chinese` -export const VOICE_FIRST_MESSAGE = "嗨!我是 Hapi 语音助手,有什么可以帮你的?" +/** ElevenLabs first message — language controlled by ElevenLabs language field */ +export const VOICE_FIRST_MESSAGE = "Hey! Hapi here — what can I help you with?" export const VOICE_TOOLS = [ { @@ -239,7 +246,7 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { conversation_config: { agent: { first_message: VOICE_FIRST_MESSAGE, - language: 'zh', + language: 'en', prompt: { prompt: VOICE_SYSTEM_PROMPT, llm: 'gemini-2.5-flash', @@ -337,7 +344,7 @@ export function buildGeminiLiveFunctionDeclarations(): GeminiLiveFunctionDeclara export function buildGeminiLiveConfig(): GeminiLiveConfig { return { model: GEMINI_LIVE_MODEL, - systemInstruction: VOICE_SYSTEM_PROMPT, + systemInstruction: VOICE_SYSTEM_PROMPT + VOICE_CHINESE_LANGUAGE_BLOCK, tools: [ { functionDeclarations: buildGeminiLiveFunctionDeclarations() diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 4c168a61ef..727f272c94 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -9,6 +9,7 @@ import { QWEN_REALTIME_MODEL, QWEN_REALTIME_VOICE, VOICE_SYSTEM_PROMPT, + VOICE_CHINESE_LANGUAGE_BLOCK, VOICE_TOOL_DEFINITIONS } from '@hapi/protocol/voice' import type { VoiceSession, VoiceSessionConfig, StatusCallback } from './types' @@ -155,9 +156,10 @@ class QwenVoiceSessionImpl implements VoiceSession { })) // Send session.update with full configuration + 
const basePrompt = VOICE_SYSTEM_PROMPT + VOICE_CHINESE_LANGUAGE_BLOCK const instructions = config.initialContext - ? `${VOICE_SYSTEM_PROMPT}\n\n[Current Context]\n${config.initialContext}` - : VOICE_SYSTEM_PROMPT + ? `${basePrompt}\n\n[Current Context]\n${config.initialContext}` + : basePrompt sendEvent('session.update', { session: { From aa9802da917a47c8e54045de4e9709b4646b8aed Mon Sep 17 00:00:00 2001 From: yuhan Date: Wed, 22 Apr 2026 03:46:04 +0000 Subject: [PATCH 21/21] fix(voice): apply initial mic mute state after recorder starts Both Gemini and Qwen backends now persist micMuted to module-level state and reapply it immediately after startAudioCapture() creates the recorder. Previously the React mute effect could run while the recorder was still null, causing audio to be captured even when the UI showed the mic as muted. --- web/src/realtime/GeminiLiveVoiceSession.tsx | 12 ++++++++++-- web/src/realtime/QwenVoiceSession.tsx | 11 ++++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/web/src/realtime/GeminiLiveVoiceSession.tsx b/web/src/realtime/GeminiLiveVoiceSession.tsx index 0d663695a7..a0461e092f 100644 --- a/web/src/realtime/GeminiLiveVoiceSession.tsx +++ b/web/src/realtime/GeminiLiveVoiceSession.tsx @@ -25,6 +25,7 @@ interface GeminiLiveState { apiKey: string | null wsBaseUrl: string | null modelSpeaking: boolean + micMuted: boolean } const state: GeminiLiveState = { @@ -35,7 +36,8 @@ const state: GeminiLiveState = { statusCallback: null, apiKey: null, wsBaseUrl: null, - modelSpeaking: false + modelSpeaking: false, + micMuted: false } function cleanup() { @@ -317,6 +319,11 @@ function startAudioCapture(playbackContext: AudioContext): void { state.statusCallback?.('error', 'Microphone error') } ) + + // Apply initial mute state — the React effect may have run before the recorder existed + if (state.micMuted) { + state.recorder.setMuted(true) + } } // --- React component --- @@ -376,8 +383,9 @@ export function GeminiLiveVoiceSession({ } 
}, [api]) // eslint-disable-line react-hooks/exhaustive-deps - // Sync mic mute state + // Sync mic mute state — also persist to module state so startAudioCapture can apply it useEffect(() => { + state.micMuted = micMuted if (state.recorder) { state.recorder.setMuted(micMuted) } diff --git a/web/src/realtime/QwenVoiceSession.tsx b/web/src/realtime/QwenVoiceSession.tsx index 727f272c94..f0624cc2da 100644 --- a/web/src/realtime/QwenVoiceSession.tsx +++ b/web/src/realtime/QwenVoiceSession.tsx @@ -28,6 +28,7 @@ interface QwenState { statusCallback: StatusCallback | null apiKey: string | null wsBaseUrl: string | null + micMuted: boolean } const state: QwenState = { @@ -37,7 +38,8 @@ const state: QwenState = { playbackContext: null, statusCallback: null, apiKey: null, - wsBaseUrl: null + wsBaseUrl: null, + micMuted: false } let eventCounter = 0 @@ -338,6 +340,11 @@ function startAudioCapture(playbackContext: AudioContext): void { state.statusCallback?.('error', 'Microphone error') } ) + + // Apply initial mute state — the React effect may have run before the recorder existed + if (state.micMuted) { + state.recorder.setMuted(true) + } } // --- React component --- @@ -394,7 +401,9 @@ export function QwenVoiceSession({ } }, [api]) // eslint-disable-line react-hooks/exhaustive-deps + // Sync mic mute state — also persist to module state so startAudioCapture can apply it useEffect(() => { + state.micMuted = micMuted if (state.recorder) { state.recorder.setMuted(micMuted) }