Merged
25 commits
8358066
feat: support custom audio track input
ao-anam Jan 28, 2026
1705430
chore: fix comment
ao-anam Jan 28, 2026
fd925db
Merge branch 'main' into feat/support-audio-track-input
sebvanleuven Jan 28, 2026
36614ff
direct audio samples input
sebvanleuven Jan 29, 2026
0bb9746
fix lint
sebvanleuven Jan 29, 2026
7489cd5
flush audio buffer at first recv() and keep last samples so we can st…
sebvanleuven Jan 29, 2026
07aff77
return error when sample rate does not exists
sebvanleuven Jan 29, 2026
d72d809
replace queue with buffer, while until enough data is captured, flush…
sebvanleuven Jan 29, 2026
c65d1ea
remove flush on connection state changes
sebvanleuven Jan 29, 2026
cae2fb8
undo logger->print for example
sebvanleuven Jan 29, 2026
c18f145
nit comment
sebvanleuven Jan 29, 2026
91fd677
lintr
sebvanleuven Jan 29, 2026
7f70447
add warning + lint
sebvanleuven Feb 3, 2026
be63a71
ruff format
sebvanleuven Feb 3, 2026
f1444f5
flush on first recv not required after checking for connection state
sebvanleuven Feb 3, 2026
6acc28a
add num_channel and sample rate check
sebvanleuven Feb 3, 2026
54d09c4
lint
sebvanleuven Feb 3, 2026
56a1d5c
remove disable_input_audio and always set to sendrecv
sebvanleuven Feb 3, 2026
69197df
fix typo
sebvanleuven Feb 3, 2026
4109286
remove elaborate logging
sebvanleuven Feb 3, 2026
4dd7589
Merge branch 'feat/support-audio-track-input' of github.com:anam-org/…
sebvanleuven Feb 3, 2026
b34f07a
fix unit test and update dependencies
sebvanleuven Feb 3, 2026
76452d7
remove disable_input_audio from examples
sebvanleuven Feb 3, 2026
4ff60a1
readme updated
sebvanleuven Feb 3, 2026
a63666a
linter
sebvanleuven Feb 3, 2026
23 changes: 16 additions & 7 deletions README.md
@@ -69,12 +69,13 @@ asyncio.run(main())
- 💬 **Two-way communication** - Send text messages (like transcribed user speech) and receive generated responses
- 📝 **Real-time transcriptions** - Receive incremental message stream events for user and persona text as it's generated
- 📚 **Message history tracking** - Automatic conversation history with incremental updates
- 🎤 **Audio-passthrough** - Send TTS generated audio input and receive rendered synchronized audio/video avatar
- 🤖 **Audio-passthrough** - Send TTS generated audio input and receive rendered synchronized audio/video avatar
- 🗣️ **Direct text-to-speech** - Send text directly to TTS for immediate speech output (bypasses LLM processing)
- 🎯 **Async iterator API** - Clean, Pythonic async/await patterns for continuous stream of audio/video frames
- 🎤 **Real-time user audio input** - Send raw audio samples (e.g. from microphone) to Anam for processing (turnkey solution: STT → LLM → TTS → Face)
- 📡 **Async iterator API** - Clean, Pythonic async/await patterns for continuous stream of audio/video frames
- 🎯 **Event-driven API** - Simple decorator-based event handlers for discrete events
- 📝 **Fully typed** - Complete type hints for IDE support
- 🔒 **Server-side ready** - Designed for server-side Python applications (e.g. for use in a web application)
- 🔒 **Server-side ready** - Designed for server-side Python applications (e.g. for backend pipelines)

## API Reference

@@ -101,9 +102,6 @@ client = AnamClient(
voice_id="emma",
language_code="en",
),
options=ClientOptions(
disable_input_audio=True, # Don't capture microphone
),
)
```

@@ -126,6 +124,17 @@ async with client.connect() as session:
# Both streams run concurrently
await asyncio.gather(process_video(), process_audio())
```
### User Audio Input

User audio input is real-time audio, such as microphone capture.
User audio must be 16-bit PCM samples, mono or stereo, at any sample rate; the sample rate must be provided so the audio can be processed correctly.
The audio is forwarded in real time as a WebRTC audio track. To reduce latency, any audio provided before the WebRTC audio track is created is dropped.
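Since the input must be 16-bit PCM, microphone samples captured as floats need to be converted before sending. A minimal sketch of that conversion; the commented usage assumes a `session` object exposing the `send_user_audio` method added in this PR, and the exact public entry point may differ:

```python
import struct

def float_to_pcm16(samples: list[float]) -> bytes:
    """Convert float samples in [-1.0, 1.0] to 16-bit little-endian PCM bytes."""
    return b"".join(
        struct.pack("<h", int(max(-1.0, min(1.0, s)) * 32767)) for s in samples
    )

# Hypothetical usage (names are assumptions, not the SDK's documented API):
# session.send_user_audio(float_to_pcm16(mic_chunk), sample_rate=48000, num_channels=1)
```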

### TTS audio (Audio Passthrough)

TTS audio is generated by a TTS engine and should be provided in chunks through the `send_audio_chunk` method. The audio can be passed as a byte array or as a base64-encoded string (the SDK converts byte arrays to base64). The audio is ingested by the backend at maximum bandwidth. The sample rate and channel count must be provided through the `AgentAudioInputConfig` object.

For best performance, we suggest using 24kHz mono audio. The provided audio is returned in sync with the avatar without any resampling. Sample rates lower than 24kHz will result in poor avatar performance; sample rates higher than 24kHz may increase latency without any noticeable improvement in audio quality.
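Because the samples are 16-bit and may be stereo, chunks should be split on whole-frame boundaries (one frame = one sample per channel) so a sample is never cut in half. A minimal sketch; the commented usage assumes a `session` object and the `send_audio_chunk` signature, both of which may differ from the actual SDK surface:

```python
import base64

BYTES_PER_SAMPLE = 2  # 16-bit PCM

def chunk_pcm(audio: bytes, num_channels: int, chunk_frames: int = 1024):
    """Yield chunks of `audio` aligned to whole frames."""
    step = BYTES_PER_SAMPLE * num_channels * chunk_frames
    for offset in range(0, len(audio), step):
        yield audio[offset : offset + step]

# Hypothetical usage (names are assumptions):
# for chunk in chunk_pcm(tts_pcm, num_channels=1):
#     await session.send_audio_chunk(base64.b64encode(chunk).decode("ascii"))
```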

### Events

@@ -313,7 +322,6 @@ from anam import ClientOptions
options = ClientOptions(
api_base_url="https://api.anam.ai", # API base URL
api_version="v1", # API version
disable_input_audio=False, # Disable microphone input
ice_servers=None, # Custom ICE servers
)
```
@@ -359,6 +367,7 @@ except AnamError as e:
- `aiohttp` - HTTP client
- `websockets` - WebSocket client
- `numpy` - Array handling
- `pyav` - Video and audio handling

Optional for display utilities:
- `opencv-python` - Video display
2 changes: 1 addition & 1 deletion examples/avatar_audio_passthrough.py
@@ -198,7 +198,7 @@ def main() -> None:
client = AnamClient(
api_key=api_key,
persona_config=persona_config,
options=ClientOptions(disable_input_audio=True, api_base_url=api_base_url),
options=ClientOptions(api_base_url=api_base_url),
)

# Create display and audio player
2 changes: 1 addition & 1 deletion examples/persona_interactive_video.py
@@ -294,7 +294,7 @@ def main() -> None:
client = AnamClient(
api_key=api_key,
persona_config=persona_config,
options=ClientOptions(disable_input_audio=False, api_base_url=api_base_url),
options=ClientOptions(api_base_url=api_base_url),
)

# Create display and audio player
2 changes: 1 addition & 1 deletion examples/save_recording.py
@@ -131,7 +131,7 @@ async def main() -> None:
client = AnamClient(
api_key=api_key,
persona_id=persona_id,
options=ClientOptions(disable_input_audio=True, api_base_url=api_base_url),
options=ClientOptions(api_base_url=api_base_url),
)

# Register connection event handler
2 changes: 1 addition & 1 deletion examples/text_to_video.py
@@ -268,7 +268,7 @@ async def text_to_video(
client = AnamClient(
api_key=api_key,
persona_config=persona_config,
options=ClientOptions(disable_input_audio=True, api_base_url=api_base_url),
options=ClientOptions(api_base_url=api_base_url),
)

# Temp files for video and audio (keep extensions for format detection)
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -29,6 +29,7 @@ dependencies = [
"websockets>=12.0",
"numpy>=1.26.0",
"python-dotenv>=1.2.1",
"av>=16.0.1",
]

[project.optional-dependencies]
@@ -84,6 +85,7 @@ disallow_incomplete_defs = true

[tool.pytest.ini_options]
testpaths = ["tests"]
pythonpath = ["src"]

[tool.semantic_release]
version_toml = ["pyproject.toml:project.version"]
2 changes: 2 additions & 0 deletions src/anam/__init__.py
@@ -39,6 +39,7 @@ async def consume_audio():
For more information, see https://docs.anam.ai
"""

from av.audio.frame import AudioFrame
from av.video.frame import VideoFrame

from ._agent_audio_input_stream import AgentAudioInputStream
@@ -71,6 +72,7 @@ async def consume_audio():
"AgentAudioInputConfig",
"AgentAudioInputStream",
"AnamEvent",
"AudioFrame",
"ClientOptions",
"ConnectionClosedCode",
"Message",
74 changes: 65 additions & 9 deletions src/anam/_streaming.py
@@ -21,6 +21,7 @@

from ._agent_audio_input_stream import AgentAudioInputStream
from ._signalling import SignalAction, SignallingClient
from ._user_audio_input_track import UserAudioInputTrack
from .types import AgentAudioInputConfig, SessionInfo

logger = logging.getLogger(__name__)
@@ -40,7 +41,6 @@ def __init__(
on_message: Callable[[dict[str, Any]], Awaitable[None]] | None = None,
on_connection_established: Callable[[], Awaitable[None]] | None = None,
on_connection_closed: Callable[[str, str | None], Awaitable[None]] | None = None,
disable_input_audio: bool = False,
custom_ice_servers: list[dict[str, Any]] | None = None,
):
"""Initialize the streaming client.
@@ -50,7 +50,6 @@
on_message: Callback for data channel messages.
on_connection_established: Callback when connected.
on_connection_closed: Callback when disconnected.
disable_input_audio: If True, don't send microphone audio.
custom_ice_servers: Custom ICE servers (optional).
"""
self._session_info = session_info
@@ -62,7 +61,6 @@
self._on_connection_closed = on_connection_closed

# Configuration
self._disable_input_audio = disable_input_audio
self._ice_servers = custom_ice_servers or session_info.ice_servers

# State
@@ -75,6 +73,8 @@ def __init__(
self._audio_track: MediaStreamTrack | None = None
self._is_connected = False
self._agent_audio_input_stream: AgentAudioInputStream | None = None
self._user_audio_input_track: UserAudioInputTrack | None = None
self._audio_transceiver = None # Store transceiver for lazy track creation

async def connect(self, timeout: float = 30.0) -> None:
"""Start the streaming connection.
@@ -322,12 +322,10 @@ def on_track(track: MediaStreamTrack) -> None:
# Video: receive only
self._peer_connection.addTransceiver("video", direction="recvonly")

# Audio: send/receive or receive only
if self._disable_input_audio:
self._peer_connection.addTransceiver("audio", direction="recvonly")
else:
self._peer_connection.addTransceiver("audio", direction="sendrecv")
# Note: Audio input track would be added here if needed
# Audio: send/receive (track created lazily when first audio arrives via send_user_audio())
self._audio_transceiver = self._peer_connection.addTransceiver(
"audio", direction="sendrecv"
)

logger.debug("Peer connection initialized")

@@ -642,6 +640,17 @@ async def close(self) -> None:
finally:
self._signalling_client = None

# Close user audio input track before closing peer connection
# This clears the audio queue and prevents recv() from generating more frames
if self._user_audio_input_track:
try:
self._user_audio_input_track.close()
logger.debug("Closed user audio input track")
except Exception as e:
logger.warning("Error closing user audio input track: %s", e)
finally:
self._user_audio_input_track = None

# Close peer connection
if self._peer_connection:
try:
@@ -654,6 +663,53 @@
self._is_connected = False
logger.info("Streaming client closed")

def send_user_audio(
self,
audio_bytes: bytes,
sample_rate: int,
num_channels: int,
) -> None:
"""Send raw user audio samples to Anam for processing.

This method accepts 16-bit PCM samples and adds them to the audio buffer for transmission via WebRTC.
The audio track is created lazily when first audio arrives.
Audio is only added to the buffer after the connection is established, to avoid accumulating stale audio.

Args:
audio_bytes: Raw audio data (16-bit PCM).
sample_rate: Sample rate of the input audio (Hz).
num_channels: Number of channels in the input audio (1=mono, 2=stereo).

Raises:
RuntimeError: If peer connection is not initialized.
"""
if not self._peer_connection:
raise RuntimeError("Peer connection not initialized. Call connect() first.")
if num_channels != 1 and num_channels != 2:
raise RuntimeError("Invalid number of channels. Must be 1 or 2.")

# Create track lazily when first audio arrives
if self._user_audio_input_track is None:
logger.info(
f"Creating user audio input track: sample_rate={sample_rate}Hz, "
f"channels={num_channels}"
)

self._user_audio_input_track = UserAudioInputTrack(sample_rate, num_channels)

# Add track to transceiver (lazy track creation)
if self._audio_transceiver and self._audio_transceiver.sender:
try:
self._audio_transceiver.sender.replaceTrack(self._user_audio_input_track)
logger.info("Added user audio track to transceiver")
except Exception as e:
raise RuntimeError(f"Failed to add user audio track: {e}") from e
else:
raise RuntimeError("Audio transceiver not properly initialized")
if self._peer_connection.connectionState == "connected":
# Avoid accumulating stale audio, only queue audio when connection is established.
self._user_audio_input_track.add_audio_samples(audio_bytes, sample_rate, num_channels)

def __del__(self) -> None:
"""Cleanup on destruction to prevent warnings."""
# Clear peer connection reference if close() wasn't called explicitly.