Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
node_modules/
npm-debug.log*
*.log
esp-web-tools-logs.txt

# Python caches / virtualenv
__pycache__/
.pytest_cache/
.venv/
.venv-platformio/
.venv-vllm/
.venv-qwen-tts/
tts-worker/.venv/
Expand Down
91 changes: 91 additions & 0 deletions face-app/dist/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { startFaceWebSocketServer } from './ws_server.js';
import { createTtsController } from './tts_controller.js';
import { createTtsAudioStore } from './tts_audio_store.js';
import { loadFaceAppConfig } from './config_loader.js';
import { resolveBrowserAudioMaxChannels } from './browser_audio_config.js';
import { createOperatorAsrProxy } from './operator_asr_proxy.js';
Expand Down Expand Up @@ -66,6 +67,32 @@ function writeJson(response, statusCode, payload) {
response.end(JSON.stringify(payload));
}

async function readJsonRequestBody(request, { maxBytes = 32_768 } = {}) {
const chunks = [];
let byteLength = 0;
for await (const chunk of request) {
byteLength += chunk.length;
if (byteLength > maxBytes) {
const error = new Error('request_body_too_large');
error.code = 'request_body_too_large';
throw error;
}
chunks.push(chunk);
}
if (byteLength === 0) {
const error = new Error('empty_body');
error.code = 'empty_body';
throw error;
}
try {
return JSON.parse(Buffer.concat(chunks, byteLength).toString('utf8'));
} catch {
const error = new Error('invalid_json');
error.code = 'invalid_json';
throw error;
}
}

function normalizeOptionalString(value) {
if (typeof value !== 'string') {
return null;
Expand Down Expand Up @@ -237,11 +264,22 @@ const operatorAsrProxy = createOperatorAsrProxy({
modelJa: process.env.MH_OPERATOR_ASR_MODEL_JA ?? '',
modelEn: process.env.MH_OPERATOR_ASR_MODEL_EN ?? '',
requestTimeoutMs: Number.isNaN(operatorAsrTimeoutMs) ? 20_000 : operatorAsrTimeoutMs,
onBargeIn: (reason) => {
// Lazily resolved: ttsController is created after this proxy.
if (ttsController && typeof ttsController.flushForBargeIn === 'function') {
Promise.resolve(ttsController.flushForBargeIn(reason)).catch((error) => {
console.error(`[face-app] tts barge-in flush failed: ${error.message}`);
});
}
},
log: console
});
let operatorRealtimeAsrProxy = null;

let ttsController = null;
const ttsAudioStore = createTtsAudioStore({
ttlMs: Number.parseInt(process.env.MH_TTS_AUDIO_REF_TTL_MS ?? '60000', 10)
});

function normalizeSayPayload(payload) {
const normalized = { ...payload };
Expand Down Expand Up @@ -394,6 +432,54 @@ const server = await startFaceWebSocketServer({
});
return true;
}
if (parsedUrl.pathname === '/api/operator/response') {
if (request.method !== 'POST') {
writeJson(response, 405, {
ok: false,
error: 'method_not_allowed'
});
return true;
}
let payload = null;
try {
payload = await readJsonRequestBody(request);
} catch (error) {
writeJson(response, error.code === 'request_body_too_large' ? 413 : 400, {
ok: false,
error: error.code ?? 'invalid_request_body'
});
return true;
}
if (!payload || payload.type !== 'operator_response') {
writeJson(response, 400, {
ok: false,
error: 'invalid_operator_response'
});
return true;
}
if (typeof payload.value !== 'string' || payload.value.trim() === '') {
writeJson(response, 400, {
ok: false,
error: 'empty_value'
});
return true;
}
const normalized = {
...payload,
v: payload.v ?? 1,
type: 'operator_response',
session_id: normalizeSessionId(payload),
response_kind: typeof payload.response_kind === 'string' ? payload.response_kind : 'text',
value: payload.value.trim(),
source: typeof payload.source === 'string' && payload.source.trim() !== '' ? payload.source.trim() : 'http',
ts: Date.now()
};
server.broadcast(normalized);
writeJson(response, 202, {
ok: true
});
return true;
}
if (parsedUrl.pathname === '/api/operator/ui-config') {
writeJson(response, 200, {
ok: true,
Expand All @@ -417,6 +503,9 @@ const server = await startFaceWebSocketServer({
});
return true;
}
if (ttsAudioStore.handleHttpRequest(request, response)) {
return true;
}
return operatorAsrProxy.handleHttpRequest(request, response);
},
log: console
Expand Down Expand Up @@ -461,9 +550,11 @@ if (ttsEnabled) {
broadcast(payload) {
return server.broadcast(payload);
},
audioStore: ttsAudioStore,
defaultTtlMs: faceConfig.tts.defaultTtlMs,
autoInterruptAfterMs: faceConfig.tts.autoInterruptAfterMs,
qwenBoundarySpeaker: process.env.MH_QWEN_TTS_BOUNDARY_SPEAKER ?? 'Ono_Anna',
maxChunkChars: Number.parseInt(process.env.MH_TTS_CHUNK_MAX_CHARS ?? '120', 10),
gateConfig: faceConfig.speechGate,
workerCwd: repoRoot,
workerEnv: {
Expand Down
13 changes: 13 additions & 0 deletions face-app/dist/operator_asr_proxy.js
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ export function createOperatorAsrProxy(options = {}) {
const modelEn = asNonEmptyString(options.modelEn);
const modelJa = asNonEmptyString(options.modelJa);
const fetchImpl = typeof options.fetchImpl === 'function' ? options.fetchImpl : globalThis.fetch;
const onBargeIn = typeof options.onBargeIn === 'function' ? options.onBargeIn : null;

if (typeof fetchImpl !== 'function') {
throw new Error('fetch API is unavailable for operator ASR proxy');
Expand All @@ -150,6 +151,18 @@ export function createOperatorAsrProxy(options = {}) {
return true;
}

// The user has taken the turn (PTT released, audio incoming):
// stop any in-flight/queued agent speech so it does not talk over
// the transcript that is about to be delivered. Cross-transport
// (Atom posts here too; it has no usable WebSocket client).
if (onBargeIn) {
try {
onBargeIn('operator_ptt');
} catch (error) {
log.error(`[face-app] operator ASR barge-in handler failed: ${error.message}`);
}
}

const requestedLanguage = normalizeLanguage(
parsedUrl.searchParams.get('lang') ?? parsedUrl.searchParams.get('languageHint') ?? 'en',
'en'
Expand Down
212 changes: 212 additions & 0 deletions face-app/dist/tts_audio_store.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import { randomUUID } from 'node:crypto';

function toNow(options) {
return typeof options.now === 'function' ? options.now : () => Date.now();
}

function normalizeTtlMs(value, fallbackMs = 60_000) {
if (!Number.isInteger(value)) {
return fallbackMs;
}
return Math.max(1, value);
}

function normalizeMimeType(value) {
return typeof value === 'string' && value.trim() !== '' ? value.trim() : 'audio/wav';
}

function parseWavDurationMs(buffer, sampleRateFallback = null) {
if (!Buffer.isBuffer(buffer) || buffer.length < 44) {
return null;
}
if (buffer.toString('ascii', 0, 4) !== 'RIFF' || buffer.toString('ascii', 8, 12) !== 'WAVE') {
return null;
}

let sampleRate = Number.isInteger(sampleRateFallback) && sampleRateFallback > 0 ? sampleRateFallback : null;
let byteRate = null;
let dataBytes = null;
let offset = 12;

while (offset + 8 <= buffer.length) {
const chunkId = buffer.toString('ascii', offset, offset + 4);
const chunkSize = buffer.readUInt32LE(offset + 4);
const chunkStart = offset + 8;
if (chunkStart + chunkSize > buffer.length) {
break;
}

if (chunkId === 'fmt ' && chunkSize >= 16) {
sampleRate = buffer.readUInt32LE(chunkStart + 4);
byteRate = buffer.readUInt32LE(chunkStart + 8);
} else if (chunkId === 'data') {
dataBytes = chunkSize;
break;
}

offset = chunkStart + chunkSize + (chunkSize % 2);
}

if (Number.isInteger(byteRate) && byteRate > 0 && Number.isInteger(dataBytes)) {
return Math.round((dataBytes / byteRate) * 1000);
}
if (Number.isInteger(sampleRate) && sampleRate > 0 && Number.isInteger(dataBytes)) {
return Math.round((dataBytes / (sampleRate * 2)) * 1000);
}
return null;
}

function writeJson(response, statusCode, payload) {
response.writeHead(statusCode, {
'content-type': 'application/json; charset=utf-8',
'cache-control': 'no-store'
});
response.end(JSON.stringify(payload));
}

export function createTtsAudioStore(options = {}) {
const now = toNow(options);
const ttlMs = normalizeTtlMs(options.ttlMs, 60_000);
const entries = new Map();

function prune(atMs = now()) {
for (const [id, entry] of entries) {
if (atMs > entry.expiresAt) {
entries.delete(id);
}
}
}

function putAudio({
audioBase64,
mimeType = 'audio/wav',
sampleRate = null,
sessionId = '-',
utteranceId = null,
generation = null,
messageId = null,
revision = null,
agentId = null,
agentLabel = null
}) {
if (typeof audioBase64 !== 'string' || audioBase64.trim() === '') {
return null;
}
prune();

const audio = Buffer.from(audioBase64, 'base64');
const id = randomUUID();
const createdAt = now();
const entry = {
id,
audio,
mimeType: normalizeMimeType(mimeType),
sampleRate: Number.isInteger(sampleRate) ? sampleRate : null,
byteLength: audio.length,
durationMs: parseWavDurationMs(audio, sampleRate),
sessionId,
utteranceId,
generation,
messageId,
revision,
agentId,
agentLabel,
createdAt,
expiresAt: createdAt + ttlMs
};
entries.set(id, entry);
return entry;
}

function get(id) {
prune();
const entry = entries.get(id);
if (!entry) {
return null;
}
if (now() > entry.expiresAt) {
entries.delete(id);
return null;
}
return entry;
}

function deleteAudio(id) {
return entries.delete(id);
}

function clear() {
entries.clear();
}

function toReferencePayload(entry, { basePath = '/api/tts/audio' } = {}) {
if (!entry) {
return null;
}
const url = `${basePath}/${entry.id}.wav`;
return {
v: 1,
type: 'tts_audio_ref',
session_id: entry.sessionId,
...(entry.agentId ? { agent_id: entry.agentId } : {}),
...(entry.agentLabel ? { agent_label: entry.agentLabel } : {}),
utterance_id: entry.utteranceId,
generation: entry.generation,
message_id: entry.messageId,
revision: entry.revision,
mime_type: entry.mimeType,
sample_rate: entry.sampleRate,
byte_length: entry.byteLength,
duration_ms: entry.durationMs,
expires_at: entry.expiresAt,
url,
ts: now()
};
}

function handleHttpRequest(request, response) {
const parsedUrl = new URL(request.url ?? '/', 'http://127.0.0.1');
const match = parsedUrl.pathname.match(/^\/api\/tts\/audio\/([0-9a-fA-F-]+)\.wav$/);
if (!match) {
return false;
}
if (request.method !== 'GET' && request.method !== 'HEAD') {
writeJson(response, 405, { ok: false, error: 'method_not_allowed' });
return true;
}

const entry = get(match[1]);
if (!entry) {
writeJson(response, 410, { ok: false, error: 'audio_expired' });
return true;
}

response.writeHead(200, {
'content-type': entry.mimeType,
'content-length': String(entry.byteLength),
'cache-control': 'no-store',
'x-utterance-id': entry.utteranceId ?? '',
'x-generation': Number.isInteger(entry.generation) ? String(entry.generation) : ''
});
if (request.method === 'HEAD') {
response.end();
return true;
}
response.end(entry.audio);
return true;
}

return {
putAudio,
get,
deleteAudio,
prune,
clear,
toReferencePayload,
handleHttpRequest,
size() {
prune();
return entries.size;
}
};
}
Loading
Loading