Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ target/
# pyenv
.python-version
venv
.venv

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,17 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [4.0.1] - 2025-04-30

### Added

- Support for real-time (RT) multichannel transcription and channel diarization (DZ)

## [3.0.4] - 2025-04-16

### Added

- Support for new parameters `prefer_current_speaker` and `speaker_sensitivity` in Speaker Diarization


## [3.0.3] - 2025-03-03

### Added
Expand Down
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ lint:
black --check --diff $(SOURCES)
ruff $(SOURCES)

.PHONY: lint-fix
lint-fix:
black $(SOURCES)
Comment thread
giorgosHadji marked this conversation as resolved.
ruff --fix $(SOURCES)
Comment thread
J-Jaywalker marked this conversation as resolved.

.PHONY: format
format:
black $(SOURCES)
Expand Down
8 changes: 6 additions & 2 deletions speechmatics/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
Functions for converting our JSON transcription results to other formats.
"""
from typing import Any, List
from typing import Any, List, Optional


def get_txt_translation(translations: List[dict]):
Expand Down Expand Up @@ -32,15 +32,17 @@ def get_txt_translation(translations: List[dict]):
def convert_to_txt(
tokens: List[dict],
language: str,
language_pack_info: dict = None,
language_pack_info: Optional[dict] = None,
speaker_labels: bool = True,
channel: Optional[str] = None,
) -> str:
"""
Convert a set of transcription result tokens to a plain text format.

:param tokens: the transcription results.
:param language_pack_info: information about the language pack.
:param speaker_labels: whether or not to output speaker labels in the text.
:param channel: the channel name (if multichannel).
:return: the plain text as a string.
"""
# Although we should get word_delimiter from language_pack_info, we still want sensible
Expand All @@ -64,6 +66,8 @@ def convert_to_txt(
texts.append(f"SPEAKER: {current_speaker}\n")
texts.append(join_tokens(group, word_delimiter=word_delimiter))
texts.append("\n")
if texts and channel:
texts.insert(0, f"{channel}: ")
Comment thread
J-Jaywalker marked this conversation as resolved.

return "".join(texts).rstrip()

Expand Down
95 changes: 87 additions & 8 deletions speechmatics/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,25 @@ def parse_word_replacements(replacement_words_filepath) -> List[Dict]:
return replacement_words


def parse_multichannel_args(multichannel_args: str) -> List[str]:
    """
    Parses multichannel arguments from the command line.

    Channel names are separated by commas. Whitespace surrounding each
    name is ignored, and the order of the names is preserved.

    :param multichannel_args: Comma-separated channel names,
        e.g. ``"channel_1,channel_2"``.
    :type multichannel_args: str

    :return: A list of channels to be used.
    :rtype: List[str]

    :raises SystemExit: If the arguments are not formatted properly
        (not a string, or containing an empty channel name).
    """
    error_message = (
        f"Invalid format for multichannel arguments: '{multichannel_args}'. "
        "Expected format <channel_1>,<channel_2>"
    )

    if not isinstance(multichannel_args, str):
        raise SystemExit(error_message)

    # str.split never raises on malformed input, so validate explicitly:
    # inputs such as "a,,b", "a," or "   " would otherwise produce empty
    # channel names.
    channels = [channel.strip() for channel in multichannel_args.split(",")]
    if not all(channels):
        raise SystemExit(error_message)

    return channels


def parse_additional_vocab(additional_vocab_filepath):
"""
Parses an additional vocab list from a file.
Expand Down Expand Up @@ -329,6 +348,11 @@ def get_transcription_config(
"Using additional vocab from file %s", args["additional_vocab_file"]
)

if args.get("multichannel"):
    # The parsed channel names are stored in the config as the channel
    # diarization labels.
    multichannel_args = parse_multichannel_args(args["multichannel"])
    config["channel_diarization_labels"] = multichannel_args
    LOGGER.info("Using multichannel mode with channels %s", multichannel_args)

if args.get("additional_vocab"):
if not config.get("additional_vocab"):
config["additional_vocab"] = args["additional_vocab"]
Expand Down Expand Up @@ -518,24 +542,32 @@ def add_printing_handlers(
api.add_event_handler(
ServerMessageType.AudioAdded, lambda *args: print_symbol("-")
)
api.add_event_handler(
ServerMessageType.ChannelAudioAdded, lambda *args: print_symbol("=")
)
api.add_event_handler(
ServerMessageType.AddPartialTranscript, lambda *args: print_symbol(".")
)
api.add_event_handler(
ServerMessageType.AddTranscript, lambda *args: print_symbol("|")
)
api.add_middleware(ClientMessageType.AddAudio, lambda *args: print_symbol("+"))
api.add_middleware(
ClientMessageType.AddChannelAudio, lambda *args: print_symbol("x")
)

def partial_transcript_handler(message):
# "\n" does not appear in partial transcripts
if print_json:
print(json.dumps(message))
return

plaintext = speechmatics.adapters.convert_to_txt(
message["results"],
api.transcription_config.language,
language_pack_info=api.get_language_pack_info(),
speaker_labels=True,
channel=get_channel(message),
)
if plaintext:
sys.stderr.write(f"{escape_seq}{plaintext}\r")
Expand All @@ -545,16 +577,24 @@ def transcript_handler(message):
if print_json:
print(json.dumps(message))
return

plaintext = speechmatics.adapters.convert_to_txt(
message["results"],
api.transcription_config.language,
language_pack_info=api.get_language_pack_info(),
speaker_labels=True,
channel=get_channel(message),
)
if plaintext:
sys.stdout.write(f"{escape_seq}{plaintext}\n")
transcripts.text += plaintext

def get_channel(message):
return next(
(result["channel"] for result in message["results"] if "channel" in result),
None,
)

def audio_event_handler(message):
if print_json:
print(json.dumps(message))
Expand Down Expand Up @@ -759,12 +799,23 @@ def rt_main(args):
translation_config=transcription_config.translation_config,
)

def run(stream):
def run(stream=None, channel_stream_pairs=None):
try:
# Pass in either stream or channel_stream_pairs depending on what != None
# Dynamically construct the args based on the input
args_list = [transcription_config]
if stream is not None:
args_list.append(stream)
elif channel_stream_pairs is not None:
args_list.append(None) # This skips the stream argument
Comment thread
giorgosHadji marked this conversation as resolved.
args_list.append(channel_stream_pairs)
else:
raise SystemExit(
"Neither stream nor channel_stream_pairs were provided."
Comment thread
J-Jaywalker marked this conversation as resolved.
)
api.run_synchronously(
stream,
transcription_config,
get_audio_settings(args),
*args_list,
audio_settings=get_audio_settings(args),
from_cli=True,
extra_headers=extra_headers,
)
Expand All @@ -773,11 +824,39 @@ def run(stream):
LOGGER.warning("Keyboard interrupt received.")

if args["files"][0] == "-":
run(sys.stdin.buffer)
if transcription_config.channel_diarization_labels:
raise SystemExit(
"Channel diarization is not yet supported when reading from stdin."
)
run(stream=sys.stdin.buffer)
else:
for filename in args["files"]:
with open(filename, "rb") as audio_file:
run(audio_file)
# Check we have the right diarization type
if transcription_config.channel_diarization_labels:
if (
transcription_config.diarization != "channel"
and transcription_config.diarization != "channel_and_speaker"
):
raise SystemExit(
"Multichannel DZ type must be 'channel' or 'channel_and_speaker'."
)

num_channels = len(transcription_config.channel_diarization_labels)
if len(args["files"]) != num_channels:
raise SystemExit(
f"Number of files: ({len(args['files'])}) must match number of channels: ({num_channels})."
Comment thread
J-Jaywalker marked this conversation as resolved.
)

channel_stream_pairs = {}
for i in range(num_channels):
# Here the order matters, as stream positions and diarization labels correspond to one another.
channel_name = transcription_config.channel_diarization_labels[i]
channel_stream_pairs[channel_name] = args["files"][i]
Comment thread
J-Jaywalker marked this conversation as resolved.
run(channel_stream_pairs=channel_stream_pairs)

else:
for filename in args["files"]:
with open(filename, "rb") as audio_file:
run(stream=audio_file)


def batch_main(args):
Expand Down
15 changes: 14 additions & 1 deletion speechmatics/cli_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,15 @@ def get_arg_parser():
default=None,
help=("Comma-separated list of expected languages for language identification"),
)
config_parser.add_argument(
"--multichannel",
metavar="CHANNELS",
help=(
"Enables multichannel mode and specifies channels. "
Comment thread
J-Jaywalker marked this conversation as resolved.
"Pass channels as a comma-separated string, e.g.: <CHANNEL_1>,<CHANNEL_2>. "
"The number of channels specified must match the number of input files."
),
)

# Parent parser for batch summarize argument
batch_summarization_parser = argparse.ArgumentParser(add_help=False)
Expand Down Expand Up @@ -547,7 +556,11 @@ def get_arg_parser():

rt_transcribe_command_parser.add_argument(
"--diarization",
choices=["none", "speaker"],
choices=[
"none",
"speaker",
"channel",
],
help="Which type of diarization to use.",
)

Expand Down
Loading