diff --git a/CHANGELOG.md b/CHANGELOG.md index a1a9088..7c9acfd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ ## 0.6.1 (2026-05-12) +### Fixed — ElevenLabs WS TTS firstMessage audio garbled on Twilio (outbound audio path ignored the adapter's output_format auto-flip to ulaw_8000) + +`ElevenLabsWebSocketTTS()` constructed without an explicit `outputFormat` defaults to `pcm_16000`. `StreamHandler.initPipeline` (TS) / `_init_pipeline` (Py) then calls `setTelephonyCarrier('twilio')`, which auto-flips the adapter's wire format to `ulaw_8000` (a perf win: ElevenLabs encodes μ-law server-side, saving an SDK-side resample + mulaw transcode). The flip was wired correctly in the adapter, but the outbound audio path was never told. `encodePipelineAudio` (TS) and `TwilioAudioSender.send_audio` (Py) both unconditionally treated TTS chunks as PCM16 16 kHz — feeding the μ-law bytes through a 16 kHz → 8 kHz resampler and then `pcm16_to_mulaw` produced loud garbled hiss on the wire. Audible as a heavily crackling first message (reported as "gracchia molto", Italian for "it crackles a lot") on the user's earpiece in live Twilio + pipeline calls; subsequent turns went through the same broken path, so the artefact persisted whenever the auto-flip was active. + +Fix: after `setTelephonyCarrier(carrier)` returns, the stream handler queries the TTS adapter's effective `outputFormat` / `output_format`. When it reads `'ulaw_8000'` AND the carrier is Twilio, TS sets a new `ttsIsMulaw8k` flag so `encodePipelineAudio` base64-encodes the chunk bytes directly; Python sets `audio_sender._input_is_mulaw_8k = True` so `TwilioAudioSender.send_audio` skips its PCM16 path. Mirrors the existing ElevenLabs ConvAI native μ-law fast-path. The user's explicit `outputFormat` always wins (the adapter's carrier hint is a no-op when the caller passed an explicit value), and Telnyx (which auto-flips to `pcm_16000`) is unaffected — the default path was already correct for it. Files: `libraries/typescript/src/stream-handler.ts`, `libraries/python/getpatter/stream_handler.py`. 
Coverage: `libraries/typescript/tests/unit/stream-handler.test.ts` (3 new cases — pass-through, default path, construction default) + `libraries/python/tests/unit/test_twilio_audio_sender_mulaw_passthrough.py` (2 new cases covering construction-time pass-through and runtime flip of the `_input_is_mulaw_8k` flag). + ### Changed — `StreamHandler` adopt-capability check now uses duck typing The TS realtime adopt branch in `stream-handler.ts` previously relied on `this.adapter instanceof OpenAIRealtimeAdapter` to gate the prewarm-handoff path. Switched to a duck-type check (`typeof adapter.adoptWebSocket === 'function'`) so the generic stream-handler module stays provider-agnostic on this hot path and matches the Python handler's `getattr(self._adapter, "adopt_websocket", None)` shape. Files: `libraries/typescript/src/stream-handler.ts`. diff --git a/libraries/python/getpatter/stream_handler.py b/libraries/python/getpatter/stream_handler.py index 520fc44..08ecd2a 100644 --- a/libraries/python/getpatter/stream_handler.py +++ b/libraries/python/getpatter/stream_handler.py @@ -1951,6 +1951,29 @@ async def start(self) -> None: exc_info=True, ) + # After the carrier hint, the adapter may have flipped its wire + # format to the carrier-native codec. When that codec is μ-law @ + # 8 kHz AND the carrier is Twilio (mulaw on the wire), flip the + # audio_sender into pass-through mode so it forwards the bytes + # as-is instead of resampling them as PCM16 16 kHz. Without this, + # ``TwilioAudioSender.send_audio`` would interpret the μ-law + # bytes as PCM16 samples, resample, and re-encode to μ-law — + # producing the loud, garbled hiss reported on the firstMessage + # live path with ``ElevenLabsWebSocketTTS()`` (defaults) + Twilio. + # Mirrors the ConvAI ``_native_mulaw_8k`` pattern earlier in the + # file and the TS ``ttsIsMulaw8k`` flag in ``stream-handler.ts``. 
+ if self._tts is not None and self._for_twilio: + effective_format = getattr(self._tts, "output_format", None) + if effective_format == "ulaw_8000" and hasattr( + self.audio_sender, "_input_is_mulaw_8k" + ): + self.audio_sender._input_is_mulaw_8k = True # type: ignore[attr-defined] + logger.debug( + "pipeline mode: TTS native μ-law 8 kHz fast-path enabled " + "(adapter output_format=ulaw_8000, carrier=twilio); " + "skipping outbound PCM16→8k resample + mulaw transcode" + ) + if self._stt is None: logger.warning("Pipeline mode: no STT configured") if self._tts is None: diff --git a/libraries/python/tests/unit/test_twilio_audio_sender_mulaw_passthrough.py b/libraries/python/tests/unit/test_twilio_audio_sender_mulaw_passthrough.py new file mode 100644 index 0000000..40c90a8 --- /dev/null +++ b/libraries/python/tests/unit/test_twilio_audio_sender_mulaw_passthrough.py @@ -0,0 +1,98 @@ +"""Unit tests for the TwilioAudioSender μ-law pass-through fast-path. + +Guards the 0.6.1 firstMessage audio glitch fix: + +When ``ElevenLabsWebSocketTTS()`` is constructed without an explicit +``output_format`` and paired with a Twilio carrier, +``set_telephony_carrier('twilio')`` auto-flips the wire format to +``ulaw_8000``. The TTS adapter then streams μ-law bytes, but +``TwilioAudioSender.send_audio`` always assumed PCM16 16 kHz — resampling ++ re-encoding the μ-law bytes produced loud garbled hiss on the wire +(the "gracchia molto" firstMessage report). + +The fix in ``StreamHandler._init_pipeline`` flips +``audio_sender._input_is_mulaw_8k = True`` after carrier negotiation when +the adapter reports ``output_format == 'ulaw_8000'``. These tests cover +the mutation contract that path relies on: construction-time pass-through +and runtime flip both correctly bypass the PCM16 transcoding pipeline. 
+""" + +from __future__ import annotations + +import base64 +import json +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from getpatter.telephony.twilio import TwilioAudioSender + + +@pytest.mark.unit +class TestTwilioAudioSenderMulawPassthrough: + """``TwilioAudioSender._input_is_mulaw_8k`` controls the transcode path.""" + + async def test_input_is_mulaw_flag_pass_through(self) -> None: + """When constructed with ``input_is_mulaw_8k=True`` the sender + forwards bytes as-is — no resample, no transcode. Used by the + ElevenLabs ConvAI path and by the 0.6.1 firstMessage fix when + the ElevenLabs WS TTS adapter auto-flips to ``ulaw_8000`` for + Twilio. + """ + ws = AsyncMock() + ws.send_text = AsyncMock() + sender = TwilioAudioSender(ws, stream_sid="MZ_test", input_is_mulaw_8k=True) + + mulaw_bytes = b"\x7f\x80\xff\x00\x7e\x81" + await sender.send_audio(mulaw_bytes) + + ws.send_text.assert_awaited_once() + payload = json.loads(ws.send_text.call_args[0][0]) + assert payload["event"] == "media" + decoded = base64.b64decode(payload["media"]["payload"]) + assert decoded == mulaw_bytes # byte-for-byte forwarded + + async def test_input_is_mulaw_flip_at_runtime(self) -> None: + """``StreamHandler._init_pipeline`` flips the flag in place after + the TTS adapter's ``set_telephony_carrier`` returns. This test + guards that mutation contract: a sender constructed in default + (PCM16) mode keeps producing transcoded output until the flag + flips, then becomes a pure pass-through. Mirrors the in-place + mutation already used for ElevenLabs ConvAI native μ-law. 
+ """ + ws = AsyncMock() + ws.send_text = AsyncMock() + with ( + patch( + "getpatter.audio.transcoding.pcm16_to_mulaw", + side_effect=lambda x: b"X" * len(x), + create=True, + ), + patch( + "getpatter.audio.transcoding.create_resampler_16k_to_8k", + return_value=MagicMock( + process=MagicMock(side_effect=lambda x: x), + flush=MagicMock(return_value=b""), + ), + create=True, + ), + ): + sender = TwilioAudioSender(ws, stream_sid="MZ_test") + + # Pre-flip: PCM16 bytes get transcoded through the mock pipeline. + # 2-byte aligned chunk → resampler returns it unchanged → mulaw + # mock returns "XX". The decoded payload must be the mock output. + await sender.send_audio(b"\x00\x00") + payload_before = json.loads(ws.send_text.call_args[0][0]) + assert base64.b64decode(payload_before["media"]["payload"]) == b"XX" + + # Flip the flag in place — same shape as the production fix in + # ``StreamHandler._init_pipeline`` after ``set_telephony_carrier``. + sender._input_is_mulaw_8k = True + + # Post-flip: bytes flow through untouched. The mock transcode + # helpers must not be invoked. We assert this via the payload + # bytes matching the input exactly. + await sender.send_audio(b"\xff\x7f\x80") + payload_after = json.loads(ws.send_text.call_args[0][0]) + assert base64.b64decode(payload_after["media"]["payload"]) == b"\xff\x7f\x80" diff --git a/libraries/typescript/src/stream-handler.ts b/libraries/typescript/src/stream-handler.ts index 0030038..df02e92 100644 --- a/libraries/typescript/src/stream-handler.ts +++ b/libraries/typescript/src/stream-handler.ts @@ -719,6 +719,17 @@ export class StreamHandler { // Created lazily on first use; reset() on call end. private readonly inboundResampler: StatefulResampler = createResampler8kTo16k(); private readonly outboundResampler: StatefulResampler = createResampler16kTo8k(); + // Pipeline-mode TTS native-mulaw fast-path. 
Set to ``true`` in + // ``initPipeline`` when the TTS adapter's effective ``outputFormat`` is + // ``ulaw_8000`` AND the telephony carrier is Twilio (mulaw 8 kHz on the + // wire). In that case ``encodePipelineAudio`` skips the PCM16 → 8 kHz + // resample + mulaw transcode and just base64-encodes the bytes as-is — + // the adapter already speaks the carrier's native codec server-side. + // + // Without this flag the PCM16-resample path would misinterpret μ-law + // bytes as int16 samples (a perceptible, loud, garbled hiss — the bug + // this fix targets on the ElevenLabs WS + Twilio firstMessage path). + private ttsIsMulaw8k: boolean = false; private readonly history: ReturnType; private readonly metricsAcc: CallMetricsAccumulator; @@ -1372,20 +1383,34 @@ export class StreamHandler { // --------------------------------------------------------------------------- /** - * Encode a PCM 16kHz audio chunk for the telephony provider. + * Encode a TTS audio chunk for the telephony provider. * * Both Twilio and Telnyx negotiate PCMU (mulaw) 8 kHz on the bidirectional * media stream — Twilio always, and Telnyx because ``streaming_start`` - * (server.ts) requests ``stream_bidirectional_codec=PCMU`` at 8 kHz. So - * the wire format for both providers is mulaw 8 kHz; we resample 16 kHz - * PCM16 → 8 kHz then encode to mulaw. Mirrors the Python pipeline path - * (libraries/python/getpatter/handlers/telnyx_handler.py::TelnyxAudioSender). + * (server.ts) requests ``stream_bidirectional_codec=PCMU`` at 8 kHz. * - * Maintains a 1-byte carry across calls so unaligned HTTP chunks from - * streaming TTS providers never byte-swap the PCM16 samples downstream. + * Two paths depending on what the TTS adapter is emitting: + * + * - ``ttsIsMulaw8k=true`` (native fast-path): the adapter already streams + * μ-law @ 8 kHz (e.g. ElevenLabs WS with ``output_format=ulaw_8000`` + * auto-flipped by ``setTelephonyCarrier('twilio')``). We base64-encode + * the bytes as-is — no resample, no transcode. 
Treating these bytes as + * PCM16 would cause the resampler + ``pcm16ToMulaw`` to misinterpret + * them as int16 samples and emit loud garbled hiss on the wire. + * + * - default: chunks are PCM16 @ 16 kHz; resample 16 kHz → 8 kHz and + * encode to μ-law. A 1-byte alignment carry guards against unaligned + * HTTP chunks from streaming TTS providers. + * + * Mirrors the Python pipeline path ( + * ``libraries/python/getpatter/telephony/twilio.py::TwilioAudioSender``). */ - private encodePipelineAudio(pcm16k: Buffer): string { - const aligned = this.alignPcm16(pcm16k); + private encodePipelineAudio(chunk: Buffer): string { + if (this.ttsIsMulaw8k) { + // Adapter speaks the carrier's native codec — pass bytes through. + return chunk.toString('base64'); + } + const aligned = this.alignPcm16(chunk); if (aligned.length === 0) return ''; const pcm8k = this.outboundResampler.process(aligned); const mulaw = pcm16ToMulaw(pcm8k); @@ -1517,6 +1542,29 @@ export class StreamHandler { getLogger().debug(`TTS setTelephonyCarrier failed (${label}): ${String(e)}`); } } + + // After the carrier hint, the adapter may have flipped its wire + // format to the carrier-native codec. When that codec is μ-law @ + // 8 kHz AND the carrier is Twilio (mulaw on the wire), enable the + // pass-through fast-path in ``encodePipelineAudio``. Without this, + // ``encodePipelineAudio`` would interpret the μ-law bytes as PCM16 + // samples, resample, and re-encode to μ-law — producing the loud, + // garbled hiss reported on the firstMessage live path with + // ``ElevenLabsWebSocketTTS()`` (defaults) + Twilio. + const formatAware = this.tts as unknown as { outputFormat?: string }; + const effectiveFormat = + typeof formatAware.outputFormat === 'string' ? 
formatAware.outputFormat : null; + const carrier = this.deps.bridge.telephonyProvider; + if (effectiveFormat === 'ulaw_8000' && carrier === 'twilio') { + this.ttsIsMulaw8k = true; + getLogger().debug( + `pipeline mode (${label}): TTS native μ-law 8 kHz fast-path enabled ` + + `(adapter outputFormat=ulaw_8000, carrier=twilio); skipping ` + + `outbound PCM16→8k resample + mulaw transcode`, + ); + } else { + this.ttsIsMulaw8k = false; + } } if (!this.stt) { diff --git a/libraries/typescript/tests/unit/stream-handler.test.ts b/libraries/typescript/tests/unit/stream-handler.test.ts index d685dc2..fabf2fb 100644 --- a/libraries/typescript/tests/unit/stream-handler.test.ts +++ b/libraries/typescript/tests/unit/stream-handler.test.ts @@ -915,4 +915,77 @@ describe('StreamHandler', () => { expect(p.pendingMarks.length).toBe(0); }); }); + + // --------------------------------------------------------------------------- + // Native μ-law 8 kHz pass-through (TTS adapter auto-flipped to ulaw_8000) + // --------------------------------------------------------------------------- + // + // Guards the 0.6.1 fix: when ``ElevenLabsWebSocketTTS()`` is constructed + // without an explicit ``outputFormat`` and paired with a Twilio carrier, + // ``setTelephonyCarrier('twilio')`` auto-flips ``outputFormat`` to + // ``ulaw_8000``. The TTS adapter therefore streams μ-law bytes, but + // ``encodePipelineAudio`` always interpreted bytes as PCM16 16 kHz — + // resampling + re-encoding to μ-law produced loud garbled hiss on the + // wire (the firstMessage audio glitch report). The fix wires a + // ``ttsIsMulaw8k`` pass-through flag so the encoder forwards bytes + // as-is when the adapter speaks the carrier's native codec. 
+ describe('encodePipelineAudio — native mulaw 8 kHz pass-through', () => { + interface EncodePriv { + ttsIsMulaw8k: boolean; + encodePipelineAudio(chunk: Buffer): string; + } + + it('forwards mulaw bytes as base64 when ttsIsMulaw8k=true', () => { + const h = new StreamHandler( + makeDeps(), + makeMockWs(), + '+15551111111', + '+15552222222', + ); + const p = h as unknown as EncodePriv; + p.ttsIsMulaw8k = true; + + // Well-defined mulaw samples; pass-through must echo them byte-for-byte. + const mulaw = Buffer.from([0x7f, 0x80, 0xff, 0x00, 0x7e, 0x81]); + const encoded = p.encodePipelineAudio(mulaw); + + expect(encoded).toBe(mulaw.toString('base64')); + // No PCM16 carry should accumulate on the pass-through path. + const carryView = h as unknown as { ttsByteCarry: Buffer | null }; + expect(carryView.ttsByteCarry).toBeNull(); + }); + + it('resamples + transcodes when ttsIsMulaw8k=false (default path)', () => { + const h = new StreamHandler( + makeDeps(), + makeMockWs(), + '+15551111111', + '+15552222222', + ); + const p = h as unknown as EncodePriv; + expect(p.ttsIsMulaw8k).toBe(false); + + // Aligned PCM16 16 kHz chunk → non-trivial mulaw output. The exact + // bytes depend on the resampler; we only assert that the output is + // non-empty AND differs from the pass-through interpretation, which + // is enough to catch a regression in the pass-through wiring. + const pcm16 = Buffer.alloc(64); + for (let i = 0; i < 32; i++) pcm16.writeInt16LE(i * 256, i * 2); + const encoded = p.encodePipelineAudio(pcm16); + + expect(encoded.length).toBeGreaterThan(0); + expect(encoded).not.toBe(pcm16.toString('base64')); + }); + + it('initialises ttsIsMulaw8k=false at construction time', () => { + const h = new StreamHandler( + makeDeps(), + makeMockWs(), + '+15551111111', + '+15552222222', + ); + const p = h as unknown as EncodePriv; + expect(p.ttsIsMulaw8k).toBe(false); + }); + }); });