diff --git a/frontend-nextjs/app/components/CreateCharacter/BuildDashboard.tsx b/frontend-nextjs/app/components/CreateCharacter/BuildDashboard.tsx index 9494dea5..618a6966 100644 --- a/frontend-nextjs/app/components/CreateCharacter/BuildDashboard.tsx +++ b/frontend-nextjs/app/components/CreateCharacter/BuildDashboard.tsx @@ -205,7 +205,7 @@ const SettingsDashboard: React.FC = ({ const [audioElement, setAudioElement] = useState(null); const [showVoiceCloneModal, setShowVoiceCloneModal] = useState<{ - provider: "elevenlabs" | "hume"; + provider: "elevenlabs" | "hume" | "60db"; title: string; voiceInputLabel: string; voiceInputPlaceholder: string; @@ -412,6 +412,16 @@ const SettingsDashboard: React.FC = ({ Eleven Labs Agent + diff --git a/frontend-nextjs/app/components/CreateCharacter/VoiceCloneModal.tsx b/frontend-nextjs/app/components/CreateCharacter/VoiceCloneModal.tsx index bd45f2e8..d5283251 100644 --- a/frontend-nextjs/app/components/CreateCharacter/VoiceCloneModal.tsx +++ b/frontend-nextjs/app/components/CreateCharacter/VoiceCloneModal.tsx @@ -18,7 +18,7 @@ interface VoiceCloneModalProps { onSuccess?: () => void; selectedUser: IUser; voiceCloneModalProps: { - provider: "elevenlabs" | "hume"; + provider: "elevenlabs" | "hume" | "60db"; title: string; voiceInputLabel: string; voiceInputPlaceholder: string; diff --git a/server/cloudflare/models/sixtydb.ts b/server/cloudflare/models/sixtydb.ts new file mode 100644 index 00000000..eb417e18 --- /dev/null +++ b/server/cloudflare/models/sixtydb.ts @@ -0,0 +1,100 @@ +/** + * 60db.ai backend for the Cloudflare worker / Durable Object. + * + * Cloudflare deployments use Workers AI bindings for STT/LLM/TTS by default. + * The cleanest way to slot 60db in without rebuilding the whole DO pipeline + * is to expose a one-shot REST synthesizer that the existing TTS module can + * delegate to when TTS_BACKEND=60db. 
+ *
+ * If you also want 60db STT/LLM in Cloudflare, the full bidirectional
+ * implementation lives in server/deno/models/sixtydb.ts and is straight-
+ * forward to port (Cloudflare Workers do support outbound WebSockets via
+ * the WebSocket API in Durable Objects).
+ *
+ * Docs: https://docs.60db.ai/api-reference/tts/text-to-speech
+ */
+
+import type { Env } from "../src/types";
+
+const SYNTHESIZE_URL = "https://api.60db.ai/tts-synthesize";
+const DEFAULT_VOICE_ID = "fbb75ed2-975a-40c7-9e06-38e30524a9a1"; // Zara
+
+const AUDIO_SAMPLE_RATE = 24_000;
+
+interface SynthesizeResponse {
+  success?: boolean;
+  message?: string;
+  audio_base64?: string;
+  sample_rate?: number;
+  duration_seconds?: number;
+  encoding?: string;
+  output_format?: string;
+}
+
+/** Decodes base64 into raw bytes via atob; fine for small payloads, stream larger ones. */
+function decodeBase64(b64: string): Uint8Array {
+  const binary = atob(b64);
+  const bytes = new Uint8Array(binary.length);
+  for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
+  return bytes;
+}
+
+/**
+ * Synthesizes `text` through 60db's one-shot REST endpoint and returns the audio as a Response.
+ * Throws when SIXTYDB_API_KEY is unset, the upstream call fails, or no audio is returned.
+ */
+export async function synthesizeSpeechWith60db(
+  env: Env,
+  text: string,
+): Promise<Response> {
+  const apiKey = env.SIXTYDB_API_KEY?.trim();
+  if (!apiKey) throw new Error("SIXTYDB_API_KEY is missing");
+
+  const voiceId = env.SIXTYDB_VOICE_ID?.trim() || DEFAULT_VOICE_ID;
+
+  const upstream = await fetch(SYNTHESIZE_URL, {
+    method: "POST",
+    headers: {
+      "Authorization": `Bearer ${apiKey}`,
+      "Content-Type": "application/json",
+      "Accept": "application/json",
+    },
+    body: JSON.stringify({
+      text,
+      voice_id: voiceId,
+      enhance: true,
+      speed: 1.0,
+      stability: 50,
+      similarity: 75,
+      // WAV is requested unconditionally: nothing here retries with another format.
+      output_format: "wav",
+    }),
+  });
+
+  if (!upstream.ok) {
+    const detail = await upstream.text().catch(() => "");
+    throw new Error(`60db /tts-synthesize HTTP ${upstream.status}: ${detail.slice(0, 200)}`);
+  }
+
+  const body = (await upstream.json()) as SynthesizeResponse;
+  if (!body.audio_base64) {
+    throw new Error(`60db returned no audio: ${body.message ?? "unknown"}`);
+  }
+
+  const audio = decodeBase64(body.audio_base64);
+
+  // Hand BodyInit an ArrayBuffer (not a typed-array view) to dodge the
+  // Uint8Array generic incompatibility in TS 5.7+ libs.
+  const arrayBuffer = audio.buffer.slice(
+    audio.byteOffset,
+    audio.byteOffset + audio.byteLength,
+  ) as ArrayBuffer;
+
+  return new Response(arrayBuffer, {
+    headers: {
+      "Content-Type": "audio/wav",
+      "X-Sample-Rate": String(body.sample_rate ?? AUDIO_SAMPLE_RATE),
+      "X-Encoding": body.encoding ?? "linear16",
+    },
+  });
+}
diff --git a/server/cloudflare/src/types.ts b/server/cloudflare/src/types.ts
index 16776572..2748e71b 100644
--- a/server/cloudflare/src/types.ts
+++ b/server/cloudflare/src/types.ts
@@ -6,5 +6,10 @@ export interface Env {
   ELATO_OPENAI_MODEL?: string;
   ELATO_OPENAI_SYSTEM_PROMPT?: string;
   ELATO_OPENAI_FIRST_MESSAGE?: string;
+  // 60db.ai integration — set TTS_BACKEND="60db" to swap synthesizeSpeech()
+  // from Workers AI / Deepgram Aura to 60db's REST /tts-synthesize.
+  TTS_BACKEND?: "workers-ai" | "60db";
+  SIXTYDB_API_KEY?: string;
+  SIXTYDB_VOICE_ID?: string;
   ElatoVoiceSession: DurableObjectNamespace;
 }
diff --git a/server/deno/main.ts b/server/deno/main.ts
index 170bd157..0ed61a92 100644
--- a/server/deno/main.ts
+++ b/server/deno/main.ts
@@ -18,6 +18,7 @@ import { connectToGemini } from "./models/gemini.ts";
 import { connectToElevenLabs } from "./models/elevenlabs.ts";
 import { connectToHume } from "./models/hume.ts";
 import { connectToGrok } from "./models/grok.ts";
+import { connectTo60db } from "./models/sixtydb.ts";
 
 const server = createServer();
 
@@ -97,6 +98,9 @@ wss.on("connection", async (ws: WSWebSocket, payload: IPayload) => {
       case "hume":
         await connectToHume(providerArgs);
         break;
+      case "60db":
+        await connectTo60db(providerArgs);
+        break;
       default:
         throw new Error(`Unknown provider: ${provider}`);
     }
diff --git a/server/deno/models/sixtydb.ts b/server/deno/models/sixtydb.ts
new file mode 100644
index 00000000..72b4fc93
--- /dev/null
+++ b/server/deno/models/sixtydb.ts
@@ -0,0 +1,583 @@
+/**
+ * 60db.ai voice pipeline provider.
+ *
+ * Unlike the other providers in this folder (openai/gemini/grok/elevenlabs/hume),
+ * 60db ships its STT, LLM, and TTS as three separate services. This file
+ * orchestrates them into the same realtime conversational surface the other
+ * providers expose:
+ *
+ *   ESP32 PCM16 @ 24kHz ──┐
+ *                         ├─▶ wss://api.60db.ai/ws/stt  (linear PCM frames)
+ *                         │     ↳ transcription (interim + final)
+ *                         │     ↳ speech_started → BARGE-IN: cancel TTS, reset Opus
+ *                         │
+ *   final transcript   ───┼─▶ POST /v1/chat/completions  (SSE stream)
+ *                         │     ↳ delta tokens
+ *                         │
+ *   assistant text       ─┴─▶ wss://api.60db.ai/ws/tts  (create→send→flush)
+ *                               ↳ audio_chunk (base64 PCM)
+ *                               ↳ Opus → ESP32 (24 kHz requested, no resample)
+ *
+ * Docs:
+ *   https://docs.60db.ai/api-reference/websocket/stt
+ *   https://docs.60db.ai/api-reference/llm/chat-completion
+ *   https://docs.60db.ai/api-reference/websocket/tts
+ */
+
+import { Buffer } from "node:buffer";
+import type { RawData } from "npm:@types/ws";
+import { addConversation, getDeviceInfo } from "../supabase.ts";
+import {
+  createOpusPacketizer,
+  isDev,
+  SAMPLE_RATE,
+  sixtyDbApiKey,
+} from "../utils.ts";
+
+// ---------- 60db endpoints ----------------------------------------------------
+
+const BASE_HTTP = "https://api.60db.ai";
+const BASE_WS = "wss://api.60db.ai";
+const STT_WS_PATH = "/ws/stt";
+const TTS_WS_PATH = "/ws/tts";
+const CHAT_URL = `${BASE_HTTP}/v1/chat/completions`;
+
+// 60db STT supports linear PCM at 16/24/44.1/48 kHz. We feed at 16k (smaller
+// frames, faster transcription, plenty for speech). ESP32 PCM @ 24k → 16k.
+const STT_SAMPLE_RATE = 16_000;
+
+// 60db TTS speaks PCM via the WebSocket. We ask for 24 kHz so it matches the
+// firmware rate and skips resampling on the way out (only base64 decode).
+const TTS_SAMPLE_RATE = 24_000;
+const TTS_ENCODING = "LINEAR16" as const;
+
+// Default voice when the personality doesn't carry a 60db UUID.
+const DEFAULT_60DB_VOICE_ID = "fbb75ed2-975a-40c7-9e06-38e30524a9a1"; // "Zara"
+
+// ---------- helpers -----------------------------------------------------------
+
+/**
+ * Linear-interpolation PCM16 mono resampler.
+ * Cheapest acceptable for speech in [8k..48k]; identical strategy to
+ * elevenlabs.ts so the audio quality bar matches across providers.
+ * Works on whole 16-bit samples: a dangling odd byte is ignored.
+ */
+function resamplePcm16Mono(
+  inputBytes: Buffer,
+  fromRate: number,
+  toRate: number,
+): Buffer {
+  if (fromRate === toRate || inputBytes.length === 0) return inputBytes;
+  // Floor to whole samples so readInt16LE never runs past the buffer end.
+  const inputSamples = Math.floor(inputBytes.length / 2);
+  if (inputSamples === 0) return Buffer.alloc(0);
+  const outputSamples = Math.max(1, Math.floor((inputSamples * toRate) / fromRate));
+  const output = Buffer.alloc(outputSamples * 2);
+  for (let i = 0; i < outputSamples; i++) {
+    const srcPos = (i * fromRate) / toRate;
+    const l = Math.floor(srcPos);
+    const r = Math.min(l + 1, inputSamples - 1);
+    const t = srcPos - l;
+    const a = inputBytes.readInt16LE(l * 2);
+    const b = inputBytes.readInt16LE(r * 2);
+    output.writeInt16LE(Math.round(a + (b - a) * t), i * 2);
+  }
+  return output;
+}
+
+/** Builds a 60db WebSocket URL for `path`, passing the key as the `apiKey` query parameter. */
+function buildWsUrl(path: string, apiKey: string): string {
+  const u = new URL(BASE_WS + path);
+  u.searchParams.set("apiKey", apiKey);
+  return u.toString();
+}
+
+/** True when `value` is a non-empty string in 8-4-4-4-12 hex UUID form. */
+function isUuid(value: string | undefined | null): value is string {
+  return !!value && /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i.test(value);
+}
+
+/** Uses the personality's configured voice id when it is a UUID, else the default 60db voice. */
+function resolveVoiceId(personality: any): string {
+  const cfg = personality?.voice?.config?.config_id ?? personality?.oai_voice;
+  return isUuid(cfg) ? cfg : DEFAULT_60DB_VOICE_ID;
+}
+
+/** Sends `payload` (JSON-encoded unless already a string) if the socket is open; never throws. */
+function safeSend(socket: WebSocket | null, payload: unknown) {
+  if (!socket || socket.readyState !== WebSocket.OPEN) return;
+  try {
+    socket.send(typeof payload === "string" ? payload : JSON.stringify(payload));
+  } catch (err) {
+    console.error("60db: send failed", err);
+  }
+}
+
+// ---------- main entrypoint ---------------------------------------------------
+
+/**
+ * Bridges one ESP32 WebSocket session to 60db: device audio is forwarded to the
+ * STT socket, each final transcript drives a streamed chat completion, and the
+ * reply text is synthesized over the TTS socket and sent back as Opus packets.
+ * Notifies the device and throws when SIXTYDB_API_KEY is not configured.
+ */
+export const connectTo60db = async ({
+  ws,
+  payload,
+  connectionPcmFile,
+  firstMessage,
+  systemPrompt,
+  closeHandler,
+}: ProviderArgs) => {
+  const apiKey = sixtyDbApiKey;
+  if (!apiKey) {
+    ws.send(JSON.stringify({ type: "server", msg: "AUTH.ERROR" }));
+    throw new Error("SIXTYDB_API_KEY is missing");
+  }
+
+  const { user, supabase } = payload;
+  const voiceId = resolveVoiceId(user.personality);
+
+  // Outbound Opus packetizer to the ESP32.
+  const opus = createOpusPacketizer((packet) => ws.send(packet));
+
+  // Conversation memory we maintain locally — 60db's chat endpoint supports
+  // save_chat + chat_id, but keeping it local keeps the provider behaviour
+  // identical to the others (they all reload history from Supabase).
+  const messages: Array<{ role: "system" | "user" | "assistant"; content: string }> = [
+    { role: "system", content: systemPrompt },
+  ];
+
+  let sttSocket: WebSocket | null = null;
+  let ttsSocket: WebSocket | null = null;
+  let isSttReady = false;
+  let isTtsReady = false;
+  let activeTtsContextId: string | null = null;
+  let assistantTurnInProgress = false;
+  let pendingTtsText = ""; // buffer for tokens that arrive before TTS ctx is ready
+  let llmAbort: AbortController | null = null; // cancels the in-flight chat-completion stream
+  let cleanedUp = false; // set once cleanup() has run
+
+  // Queue early ESP32 audio while the upstream STT is still connecting.
+  const audioQueue: Buffer[] = [];
+
+  // ── ESP32 → server ─────────────────────────────────────────────────────
+  const handleClientMessage = async (data: any, isBinary: boolean) => {
+    try {
+      if (isBinary) {
+        if (isDev && connectionPcmFile) await connectionPcmFile.write(data);
+        const pcmAt24k = Buffer.from(data);
+        const pcmForStt = resamplePcm16Mono(pcmAt24k, SAMPLE_RATE, STT_SAMPLE_RATE);
+
+        if (isSttReady && sttSocket) {
+          // Send as a JSON `audio` frame; 60db also accepts raw binary
+          // (telephony μ-law) but we're on the browser/linear PCM path.
+          safeSend(sttSocket, {
+            type: "audio",
+            audio: pcmForStt.toString("base64"),
+            encoding: "linear",
+            sample_rate: STT_SAMPLE_RATE,
+            timestamp: Date.now(),
+          });
+        } else {
+          audioQueue.push(pcmForStt);
+        }
+      } else {
+        const message = JSON.parse(data.toString("utf-8"));
+        if (message.type === "instruction") {
+          if (message.msg === "INTERRUPT") {
+            bargeIn("client INTERRUPT");
+          } else if (message.msg === "END_SESSION") {
+            console.log("60db: end session requested by ESP32");
+            sttSocket?.close();
+            ttsSocket?.close();
+          }
+        }
+      }
+    } catch (err) {
+      console.error("60db: error handling client message", err);
+    }
+  };
+
+  ws.on("message", (data: any, isBinary: boolean) => {
+    handleClientMessage(data, isBinary);
+  });
+
+  ws.on("error", (error: any) => {
+    console.error("60db: ESP32 WebSocket error", error);
+    cleanup();
+  });
+
+  ws.on("close", async (code: number, reason: string) => {
+    console.log(`60db: ESP32 ws closed code=${code} reason=${reason}`);
+    try {
+      await closeHandler();
+    } catch (err) {
+      console.error("60db: closeHandler failed", err);
+    } finally {
+      cleanup(); // must run even when closeHandler rejects
+      if (isDev && connectionPcmFile) {
+        connectionPcmFile.close();
+      }
+    }
+  });
+
+  // ── shared cleanup + barge-in ──────────────────────────────────────────
+  const cleanup = () => {
+    // "error" and "close" can both fire for one session; tear down only once.
+    if (cleanedUp) return;
+    cleanedUp = true;
+    llmAbort?.abort(); // stop streaming a reply nobody will hear
+    llmAbort = null;
+    opus.close();
+    try { sttSocket?.close(); } catch { /* noop */ }
+    try { ttsSocket?.close(); } catch { /* noop */ }
+    sttSocket = null;
+    ttsSocket = null;
+    isSttReady = false;
+    isTtsReady = false;
+  };
+
+  /**
+   * Barge-in: user spoke while the assistant was talking. Cancel the
+   * in-flight TTS context and LLM stream, reset the Opus encoder, drop any
+   * buffered assistant text. The next final transcript starts a fresh turn.
+   */
+  const bargeIn = (reason: string) => {
+    if (!assistantTurnInProgress && !activeTtsContextId && !llmAbort) return;
+    console.log(`60db: barge-in (${reason})`);
+    if (ttsSocket && activeTtsContextId) {
+      safeSend(ttsSocket, { close_context: { context_id: activeTtsContextId } });
+    }
+    // Abort the LLM stream as well; otherwise it keeps generating tokens for
+    // a reply the user has just interrupted.
+    llmAbort?.abort();
+    llmAbort = null;
+    activeTtsContextId = null;
+    assistantTurnInProgress = false;
+    pendingTtsText = "";
+    opus.reset();
+  };
+
+  // ── 60db TTS: open WS, manage contexts ─────────────────────────────────
+  const openTtsSocket = () => {
+    const url = buildWsUrl(TTS_WS_PATH, apiKey);
+    ttsSocket = new WebSocket(url);
+    ttsSocket.binaryType = "arraybuffer";
+
+    ttsSocket.onopen = () => console.log("60db: TTS ws opened");
+    ttsSocket.onerror = (e) => console.error("60db: TTS ws error", e);
+    ttsSocket.onclose = () => {
+      console.log("60db: TTS ws closed");
+      isTtsReady = false;
+    };
+
+    ttsSocket.onmessage = async (event) => {
+      let frame: any;
+      try {
+        frame = JSON.parse(typeof event.data === "string" ? event.data : Buffer.from(event.data as ArrayBuffer).toString("utf-8"));
+      } catch (err) {
+        console.error("60db: TTS bad frame", err);
+        return;
+      }
+
+      if (frame.connection_established) {
+        isTtsReady = true;
+        console.log("60db: TTS connection_established", frame.connection_established);
+        return;
+      }
+      if (frame.context_created) {
+        // Drain any tokens that arrived before context was ready.
+        if (pendingTtsText && activeTtsContextId) {
+          safeSend(ttsSocket, {
+            send_text: { context_id: activeTtsContextId, text: pendingTtsText },
+          });
+          pendingTtsText = "";
+        }
+        return;
+      }
+      if (frame.audio_chunk?.audioContent) {
+        // No active context means the turn was barged-in (or never started):
+        // late audio from it must not be played over the user.
+        if (!activeTtsContextId) return;
+        const audio = Buffer.from(frame.audio_chunk.audioContent, "base64");
+        // We asked for 24kHz LINEAR16 — no resample needed → straight to Opus.
+        if (!assistantTurnInProgress) {
+          assistantTurnInProgress = true;
+          opus.reset();
+          ws.send(JSON.stringify({ type: "server", msg: "RESPONSE.CREATED" }));
+        }
+        opus.push(audio);
+        return;
+      }
+      if (frame.flush_completed) {
+        opus.flush(true);
+        if (assistantTurnInProgress) {
+          try {
+            const device = await getDeviceInfo(supabase, user.user_id);
+            ws.send(JSON.stringify({
+              type: "server",
+              msg: "RESPONSE.COMPLETE",
+              volume_control: device?.volume ?? 100,
+            }));
+          } catch (err) {
+            console.error("60db: getDeviceInfo failed", err);
+            ws.send(JSON.stringify({ type: "server", msg: "RESPONSE.COMPLETE" }));
+          }
+        }
+        assistantTurnInProgress = false;
+        return;
+      }
+      if (frame.context_closed) {
+        if (frame.context_closed.context_id === activeTtsContextId) {
+          activeTtsContextId = null;
+        }
+        return;
+      }
+      if (frame.error) {
+        console.error("60db: TTS error frame", frame.error);
+        return;
+      }
+    };
+  };
+
+  const startTtsContext = (): string => {
+    const ctxId = crypto.randomUUID();
+    activeTtsContextId = ctxId;
+    // Text still buffered here belongs to an earlier turn; never replay it.
+    pendingTtsText = "";
+    safeSend(ttsSocket, {
+      create_context: {
+        context_id: ctxId,
+        voice_id: voiceId,
+        audio_config: {
+          audio_encoding: TTS_ENCODING,
+          sample_rate_hertz: TTS_SAMPLE_RATE,
+        },
+        speed: 1.0,
+        stability: 50,
+        similarity: 75,
+      },
+    });
+    return ctxId;
+  };
+
+  const pushTtsTokens = (text: string) => {
+    if (!text) return;
+    if (!activeTtsContextId || !isTtsReady) {
+      pendingTtsText += text;
+      return;
+    }
+    safeSend(ttsSocket, {
+      send_text: { context_id: activeTtsContextId, text },
+    });
+  };
+
+  const flushTtsContext = () => {
+    if (!activeTtsContextId) return;
+    const ctxId = activeTtsContextId;
+    safeSend(ttsSocket, { flush_context: { context_id: ctxId } });
+    // We deliberately don't close_context here so the same WS can be
+    // reused for the next assistant turn (cheaper than reconnecting).
+  };
+
+  // ── 60db LLM: SSE chat-completions stream ──────────────────────────────
+  const runAssistantTurn = async (userText: string) => {
+    if (assistantTurnInProgress) {
+      // Already responding — let barge-in handle this case via the STT path.
+      return;
+    }
+    // A reply is still being generated but is not audible yet: cancel it
+    // rather than interleave two token streams into the TTS socket.
+    if (llmAbort) bargeIn("superseded by newer transcript");
+    const abort = new AbortController();
+    llmAbort = abort;
+
+    messages.push({ role: "user", content: userText });
+    addConversation(supabase, "user", userText, user);
+
+    const ctxId = startTtsContext();
+
+    let assistantText = "";
+    try {
+      const res = await fetch(CHAT_URL, {
+        method: "POST",
+        signal: abort.signal,
+        headers: {
+          "Authorization": `Bearer ${apiKey}`,
+          "Content-Type": "application/json",
+          "Accept": "text/event-stream",
+        },
+        body: JSON.stringify({
+          model: "60db-tiny",
+          messages,
+          stream: true,
+          save_chat: false,
+        }),
+      });
+
+      if (!res.ok || !res.body) {
+        throw new Error(`60db chat HTTP ${res.status}`);
+      }
+
+      const reader = res.body.getReader();
+      const decoder = new TextDecoder();
+      let buffer = "";
+
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        buffer += decoder.decode(value, { stream: true });
+
+        // SSE frames are separated by a blank line (LF or CRLF); each frame
+        // has one or more `data: …` lines.
+        const frames = buffer.split(/\r?\n\r?\n/);
+        buffer = frames.pop() ?? "";
+        for (const frame of frames) {
+          for (const line of frame.split("\n")) {
+            if (!line.startsWith("data:")) continue;
+            const payloadStr = line.slice(5).trim();
+            if (!payloadStr || payloadStr === "[DONE]") continue;
+            try {
+              const evt = JSON.parse(payloadStr);
+              const delta = evt?.choices?.[0]?.delta?.content
+                ?? evt?.choices?.[0]?.message?.content
+                ?? "";
+              if (delta) {
+                assistantText += delta;
+                pushTtsTokens(delta);
+              }
+            } catch (err) {
+              console.warn("60db: SSE parse error", err, payloadStr);
+            }
+          }
+        }
+      }
+    } catch (err) {
+      if (abort.signal.aborted) {
+        // Cancelled by barge-in or cleanup: not a failure, so no spoken apology.
+        console.log("60db: LLM stream cancelled");
+      } else {
+        console.error("60db: LLM stream failed", err);
+        // Fallback: speak a generic apology so the toy isn't silent.
+        const fallback = "Sorry — I had a connection hiccup.";
+        assistantText = fallback;
+        pushTtsTokens(fallback);
+      }
+    } finally {
+      if (llmAbort === abort) llmAbort = null;
+    }
+
+    // End of assistant text → trigger synthesis flush.
+    if (assistantText) {
+      messages.push({ role: "assistant", content: assistantText });
+      addConversation(supabase, "assistant", assistantText, user);
+    }
+    // Flush only the context this turn created; after a barge-in the active
+    // context is gone or already belongs to a newer turn.
+    if (activeTtsContextId === ctxId) flushTtsContext();
+  };
+
+  // ── 60db STT: open WS, drive the conversation ──────────────────────────
+  const openSttSocket = () => {
+    const url = buildWsUrl(STT_WS_PATH, apiKey);
+    sttSocket = new WebSocket(url);
+
+    sttSocket.onopen = () => {
+      console.log("60db: STT ws opened");
+      // Configure the session. continuous_mode keeps it alive between
+      // utterances (required for voicebot UX).
+      safeSend(sttSocket, {
+        type: "start",
+        languages: null, // auto-detect across 39 langs
+        config: {
+          encoding: "linear",
+          sample_rate: STT_SAMPLE_RATE,
+          utterance_end_ms: 500,
+          continuous_mode: true,
+          interim_results_frequency: 300,
+          audio_enhancement: "adaptive",
+          no_speech_threshold: 0.60,
+        },
+      });
+    };
+
+    sttSocket.onerror = (e) => console.error("60db: STT ws error", e);
+
+    sttSocket.onclose = (e) => {
+      console.log(`60db: STT ws closed code=${e.code} reason=${e.reason}`);
+      isSttReady = false;
+      // Cascade: closing STT ends the session for this device.
+      try { ws.close(); } catch { /* noop */ }
+    };
+
+    sttSocket.onmessage = (event) => {
+      let frame: any;
+      try {
+        frame = JSON.parse(typeof event.data === "string" ? event.data : Buffer.from(event.data as ArrayBuffer).toString("utf-8"));
+      } catch (err) {
+        console.error("60db: STT bad frame", err);
+        return;
+      }
+
+      if (frame.type === "connected") {
+        console.log("60db: STT connected", frame.server_info?.model);
+        isSttReady = true;
+        // Drain queued audio captured while STT was opening.
+        for (const buf of audioQueue.splice(0)) {
+          safeSend(sttSocket, {
+            type: "audio",
+            audio: buf.toString("base64"),
+            encoding: "linear",
+            sample_rate: STT_SAMPLE_RATE,
+            timestamp: Date.now(),
+          });
+        }
+        // Kick off the first assistant turn so the toy greets the user
+        // (matches the OpenAI/Gemini/ElevenLabs flow).
+        // NOTE(review): nothing waits for the TTS socket here. If it is not open
+        // yet, safeSend drops create_context and the greeting is never spoken —
+        // confirm whether the greeting should be deferred until TTS is ready.
+        if (firstMessage?.trim()) {
+          runAssistantTurn(firstMessage);
+        }
+        return;
+      }
+
+      if (frame.type === "speech_started") {
+        // User began speaking → barge-in if we're mid-response.
+        if (assistantTurnInProgress) bargeIn("speech_started");
+        return;
+      }
+
+      if (frame.type === "transcription") {
+        // Only act on canonical (speech_final) results with real text.
+        if (frame.speech_final && frame.text?.trim()) {
+          console.log("60db: user said", frame.text);
+          runAssistantTurn(frame.text.trim());
+        }
+        return;
+      }
+
+      if (frame.type === "session_stopped") {
+        console.log("60db: STT session_stopped", frame.billing_summary);
+        return;
+      }
+
+      if (frame.type === "error") {
+        console.error("60db: STT error", frame.error);
+        return;
+      }
+    };
+  };
+
+  // ── kick everything off ────────────────────────────────────────────────
+  try {
+    openTtsSocket();
+    openSttSocket();
+  } catch (err) {
+    console.error("60db: failed to open upstream sockets", err);
+    ws.send(JSON.stringify({ type: "server", msg: "RESPONSE.ERROR" }));
+    cleanup();
+  }
+};
diff --git a/server/deno/types.d.ts b/server/deno/types.d.ts
index 39383705..21f716c9 100644
--- a/server/deno/types.d.ts
+++ b/server/deno/types.d.ts
@@ -26,7 +26,7 @@ declare global {
     user_code: string;
   }
 
-  type ModelProvider = "openai" | "gemini" | "elevenlabs" | "hume" | "grok";
+  type ModelProvider = "openai" | "gemini" | "elevenlabs" | "hume" | "grok" | "60db";
 
   type GrokVoice =
     | "Ara"
diff --git a/server/deno/utils.ts b/server/deno/utils.ts
index ac2a230b..205c89fd 100644
--- a/server/deno/utils.ts
+++ b/server/deno/utils.ts
@@ -106,6 +106,7 @@ export const geminiApiKey =
Deno.env.get("GEMINI_API_KEY");
 export const elevenLabsApiKey = Deno.env.get("ELEVENLABS_API_KEY");
 export const humeApiKey = Deno.env.get('HUME_API_KEY');
 export const xaiApiKey = Deno.env.get('XAI_API_KEY');
+export const sixtyDbApiKey = Deno.env.get("SIXTYDB_API_KEY");
 
 export { encoder, FRAME_SIZE };
 
diff --git a/server/fastapi/models/llm/sixtydb.py b/server/fastapi/models/llm/sixtydb.py
new file mode 100644
index 00000000..3c791f73
--- /dev/null
+++ b/server/fastapi/models/llm/sixtydb.py
@@ -0,0 +1,14 @@
+"""60db LLM provider.
+
+60db exposes an OpenAI-compatible /v1/chat/completions endpoint, so when
+mounted as the LLM, the cleanest Pipecat integration is OpenAILLMService
+pointed at base_url=https://api.60db.ai/v1 with model="60db-tiny".
+
+Until that wiring lands directly, the autodiscovered factory surfaces a
+clear NotImplementedError. The full 60db pipeline (STT+LLM+TTS) is wired
+end-to-end in server/deno/models/sixtydb.ts.
+"""
+
+from models._autodiscover import autodiscovered_provider_factory
+
+create_service = autodiscovered_provider_factory("Sixtydb", "LLM")
diff --git a/server/fastapi/models/providers.py b/server/fastapi/models/providers.py
index 6eae8316..ad96b960 100644
--- a/server/fastapi/models/providers.py
+++ b/server/fastapi/models/providers.py
@@ -52,6 +52,14 @@ class ProviderSpec:
             env=("XAI_API_KEY",),
             description="xAI Grok via Pipecat.",
         ),
+        "60db": ProviderSpec(
+            name="60db",
+            category="llm",
+            module="models.llm.sixtydb",
+            env=("SIXTYDB_API_KEY",),
+            aliases=("sixtydb",),
+            description="60db.ai chat completions (OpenAI-compatible, model 60db-tiny).",
+        ),
     },
     "stt": {
         "deepgram": ProviderSpec(
@@ -67,6 +75,14 @@ class ProviderSpec:
             module="models.stt.whisper",
             description="Local Whisper transcription service with no external API key.",
         ),
+        "60db": ProviderSpec(
+            name="60db",
+            category="stt",
+            module="models.stt.sixtydb",
+            env=("SIXTYDB_API_KEY",),
+            aliases=("sixtydb",),
+            description="60db.ai WebSocket STT — 39 languages, native Indic support.",
+        ),
     },
     "tts": {
         "elevenlabs": ProviderSpec(
@@ -97,6 +113,14 @@ class ProviderSpec:
             env=("OPENAI_API_KEY",),
             description="OpenAI text-to-speech service.",
         ),
+        "60db": ProviderSpec(
+            name="60db",
+            category="tts",
+            module="models.tts.sixtydb",
+            env=("SIXTYDB_API_KEY",),
+            aliases=("sixtydb",),
+            description="60db.ai WebSocket TTS — low-cost, native Indic voices.",
+        ),
     },
 }
 
diff --git a/server/fastapi/models/stt/sixtydb.py b/server/fastapi/models/stt/sixtydb.py
new file mode 100644
index 00000000..c8007690
--- /dev/null
+++ b/server/fastapi/models/stt/sixtydb.py
@@ -0,0 +1,14 @@
+"""60db STT provider.
+
+Pipecat doesn't ship a stock 60db service yet, so this stub surfaces a
+clear NotImplementedError via the autodiscovery loader. When a Pipecat
+service named SixtydbSTTService lands (or a custom class is added under
+pipecat.services.sixtydb), this file picks it up automatically.
+
+For the Deno server, a fully-orchestrated 60db pipeline already exists at
+server/deno/models/sixtydb.ts.
+"""
+
+from models._autodiscover import autodiscovered_provider_factory
+
+create_service = autodiscovered_provider_factory("Sixtydb", "STT")
diff --git a/server/fastapi/models/tts/sixtydb.py b/server/fastapi/models/tts/sixtydb.py
new file mode 100644
index 00000000..6eef8275
--- /dev/null
+++ b/server/fastapi/models/tts/sixtydb.py
@@ -0,0 +1,9 @@
+"""60db TTS provider.
+
+See models/stt/sixtydb.py for the stub-vs-Pipecat note. The Deno server
+holds the full 60db WS implementation in server/deno/models/sixtydb.ts.
+"""
+
+from models._autodiscover import autodiscovered_provider_factory
+
+create_service = autodiscovered_provider_factory("Sixtydb", "TTS")
diff --git a/supabase/migrations/20260608184500_add_sixtydb_provider.sql b/supabase/migrations/20260608184500_add_sixtydb_provider.sql
new file mode 100644
index 00000000..d2d4c09a
--- /dev/null
+++ b/supabase/migrations/20260608184500_add_sixtydb_provider.sql
@@ -0,0 +1,9 @@
+-- Extend the personalities.provider CHECK constraint to accept '60db'.
+-- The full STT + LLM + TTS pipeline lives in server/deno/models/sixtydb.ts; the Cloudflare
+-- module is a one-shot TTS helper and the FastAPI {stt,tts,llm}/sixtydb.py modules are stubs.
+
+ALTER TABLE personalities DROP CONSTRAINT IF EXISTS personalities_provider_check;
+
+ALTER TABLE personalities
+  ADD CONSTRAINT personalities_provider_check
+  CHECK (provider IN ('openai', 'gemini', 'grok', 'elevenlabs', 'hume', '60db'));