speechmatics · J-Jaywalker · Jun 3, 2025 · May 2, 2025 · May 2, 2025 · May 2, 2025
diff --git a/.gitignore b/.gitignore
@@ -87,6 +87,7 @@ target/
 # pyenv
 .python-version
 venv
+.venv
 
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,11 +4,17 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [4.0.1] - 2025-04-30
+
+### Added
+
+- Support for real-time (RT) multichannel transcription and channel diarization
 
 ## [3.0.4] - 2025-04-16
 
 - Support for new parameters `prefer_current_speaker` and `speaker_sensitivity` in Speaker Diarization
 
+
 ## [3.0.3] - 2025-03-03
 
 ### Added

diff --git a/Makefile b/Makefile
@@ -9,6 +9,11 @@ lint:
 	black --check --diff $(SOURCES)
 	ruff $(SOURCES)
 
+.PHONY: lint-fix
+lint-fix:
+	black $(SOURCES)
+	ruff --fix $(SOURCES)
+
 .PHONY: format
 format:
 	black $(SOURCES)

diff --git a/speechmatics/adapters.py b/speechmatics/adapters.py
@@ -2,7 +2,7 @@
 """
 Functions for converting our JSON transcription results to other formats.
 """
-from typing import Any, List
+from typing import Any, List, Optional
 
 
 def get_txt_translation(translations: List[dict]):
@@ -32,15 +32,17 @@ def get_txt_translation(translations: List[dict]):
 def convert_to_txt(
     tokens: List[dict],
     language: str,
-    language_pack_info: dict = None,
+    language_pack_info: Optional[dict] = None,
     speaker_labels: bool = True,
+    channel: Optional[str] = None,
 ) -> str:
     """
     Convert a set of transcription result tokens to a plain text format.
 
     :param tokens: the transcription results.
     :param language_pack_info: information about the language pack.
     :param speaker_labels: whether or not to output speaker labels in the text.
+    :param channel: the channel name (if multichannel).
     :return: the plain text as a string.
     """
     # Although we should get word_delimiter from language_pack_info, we still want sensible
@@ -64,6 +66,8 @@ def convert_to_txt(
             texts.append(f"SPEAKER: {current_speaker}\n")
         texts.append(join_tokens(group, word_delimiter=word_delimiter))
         texts.append("\n")
+    if texts and channel:
+        texts.insert(0, f"{channel}: ")
 
     return "".join(texts).rstrip()
 

diff --git a/speechmatics/cli.py b/speechmatics/cli.py
@@ -111,6 +111,25 @@ def parse_word_replacements(replacement_words_filepath) -> List[Dict]:
     return replacement_words
 
 
+def parse_multichannel_args(multichannel_args: str) -> List[str]:
+    """
+    Parses a comma-separated list of channel labels from the command line.
+
+    :param multichannel_args: Comma-separated channel labels, e.g. "left,right".
+    :type multichannel_args: str
+    :return: The labels in the order given, with surrounding whitespace removed.
+    :rtype: List[str]
+    :raises SystemExit: If a label is empty or appears more than once.
+    """
+    # str.split(",") cannot fail; empty or repeated labels are the real errors.
+    channels = [label.strip() for label in multichannel_args.split(",")]
+    if not all(channels) or len(set(channels)) != len(channels):
+        raise SystemExit(
+            f"Invalid format for multichannel arguments: '{multichannel_args}'. Expected format <channel_1>,<channel_2>"
+        )
+    return channels
+
+
 def parse_additional_vocab(additional_vocab_filepath):
     """
     Parses an additional vocab list from a file.
@@ -329,6 +348,11 @@ def get_transcription_config(
             "Using additional vocab from file %s", args["additional_vocab_file"]
         )
 
+    if args.get("multichannel"):
+        multichannel_args = parse_multichannel_args(args["multichannel"])
+        config["channel_diarization_labels"] = multichannel_args
+        LOGGER.info(f"Using multchannel mode with channels {multichannel_args}")
+
     if args.get("additional_vocab"):
         if not config.get("additional_vocab"):
             config["additional_vocab"] = args["additional_vocab"]
@@ -518,24 +542,32 @@ def add_printing_handlers(
         api.add_event_handler(
             ServerMessageType.AudioAdded, lambda *args: print_symbol("-")
         )
+        api.add_event_handler(
+            ServerMessageType.ChannelAudioAdded, lambda *args: print_symbol("=")
+        )
         api.add_event_handler(
             ServerMessageType.AddPartialTranscript, lambda *args: print_symbol(".")
         )
         api.add_event_handler(
             ServerMessageType.AddTranscript, lambda *args: print_symbol("|")
         )
         api.add_middleware(ClientMessageType.AddAudio, lambda *args: print_symbol("+"))
+        api.add_middleware(
+            ClientMessageType.AddChannelAudio, lambda *args: print_symbol("x")
+        )
 
     def partial_transcript_handler(message):
         # "\n" does not appear in partial transcripts
         if print_json:
             print(json.dumps(message))
             return
+
         plaintext = speechmatics.adapters.convert_to_txt(
             message["results"],
             api.transcription_config.language,
             language_pack_info=api.get_language_pack_info(),
             speaker_labels=True,
+            channel=get_channel(message),
         )
         if plaintext:
             sys.stderr.write(f"{escape_seq}{plaintext}\r")
@@ -545,16 +577,24 @@ def transcript_handler(message):
         if print_json:
             print(json.dumps(message))
             return
+
         plaintext = speechmatics.adapters.convert_to_txt(
             message["results"],
             api.transcription_config.language,
             language_pack_info=api.get_language_pack_info(),
             speaker_labels=True,
+            channel=get_channel(message),
         )
         if plaintext:
             sys.stdout.write(f"{escape_seq}{plaintext}\n")
         transcripts.text += plaintext
 
+    def get_channel(message):
+        """Return the channel of the first result that has one, else None."""
+        # Results without a "channel" key are skipped.
+        tagged = (r["channel"] for r in message["results"] if "channel" in r)
+        return next(tagged, None)
+
     def audio_event_handler(message):
         if print_json:
             print(json.dumps(message))
@@ -759,12 +799,23 @@ def rt_main(args):
         translation_config=transcription_config.translation_config,
     )
 
-    def run(stream):
+    def run(stream=None, channel_stream_pairs=None):
         try:
+            # Pass in either stream or channel_stream_pairs depending on what != None
+            # Dynamically construct the args based on the input
+            args_list = [transcription_config]
+            if stream is not None:
+                args_list.append(stream)
+            elif channel_stream_pairs is not None:
+                args_list.append(None)  # This skips the stream argument
+                args_list.append(channel_stream_pairs)
+            else:
+                raise SystemExit(
+                    "Neither stream nor channel_stream_pairs were provided."
+                )
             api.run_synchronously(
-                stream,
-                transcription_config,
-                get_audio_settings(args),
+                *args_list,
+                audio_settings=get_audio_settings(args),
                 from_cli=True,
                 extra_headers=extra_headers,
             )
@@ -773,11 +824,39 @@ def run(stream):
             LOGGER.warning("Keyboard interrupt received.")
 
     if args["files"][0] == "-":
-        run(sys.stdin.buffer)
+        if transcription_config.channel_diarization_labels:
+            raise SystemExit(
+                "Channel diarization is not yet supported when reading from stdin."
+            )
+        run(stream=sys.stdin.buffer)
     else:
-        for filename in args["files"]:
-            with open(filename, "rb") as audio_file:
-                run(audio_file)
+        # Check we have the right diarization type
+        if transcription_config.channel_diarization_labels:
+            if (
+                transcription_config.diarization != "channel"
+                and transcription_config.diarization != "channel_and_speaker"
+            ):
+                raise SystemExit(
+                    "Multichannel DZ type must be 'channel' or 'channel_and_speaker'."
+                )
+
+            num_channels = len(transcription_config.channel_diarization_labels)
+            if len(args["files"]) != num_channels:
+                raise SystemExit(
+                    f"Number of files: ({len(args['files'])}) must match number of channels: ({num_channels})."
+                )
+
+            channel_stream_pairs = {}
+            for i in range(num_channels):
+                # Here the order matters, as stream positions and diarization labels correspond to one another.
+                channel_name = transcription_config.channel_diarization_labels[i]
+                channel_stream_pairs[channel_name] = args["files"][i]
+            run(channel_stream_pairs=channel_stream_pairs)
+
+        else:
+            for filename in args["files"]:
+                with open(filename, "rb") as audio_file:
+                    run(stream=audio_file)
 
 
 def batch_main(args):

diff --git a/speechmatics/cli_parser.py b/speechmatics/cli_parser.py
@@ -315,6 +315,15 @@ def get_arg_parser():
         default=None,
         help=("Comma-separated list of expected languages for language identification"),
     )
+    config_parser.add_argument(
+        "--multichannel",
+        metavar="CHANNELS",
+        help=(
+            "Enables multichannel mode and specifies channels. "
+            "Pass channels as a comma-separated string, e.g.: <CHANNEL_1>,<CHANNEL_2>. "
+            "The number of channels specified must match the number of input files."
+        ),
+    )
 
     # Parent parser for batch summarize argument
     batch_summarization_parser = argparse.ArgumentParser(add_help=False)
@@ -547,7 +556,11 @@ def get_arg_parser():
 
     rt_transcribe_command_parser.add_argument(
         "--diarization",
-        choices=["none", "speaker"],
+        choices=[
+            "none",
+            "speaker",
+            "channel",
+        ],
         help="Which type of diarization to use.",
     )
-Original file line number
+Diff line change
@@ Expand Up / @@ -87,6 +87,7 @@ target/ @@
     # pyenv
     .python-version
     venv
+    .venv
     # pipenv
     #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
@@ Expand Down @@