From 032aa87bbc1627673575b4d35f6472b1016f3c74 Mon Sep 17 00:00:00 2001 From: abhi12-ayalur Date: Tue, 28 Nov 2023 02:03:36 +0000 Subject: [PATCH] v1 dubbing --- dubbing/README.md | 68 ++++++ dubbing/main.py | 345 ++++++++++++++++++++++++++++++ dubbing/utils.py | 343 +++++++++++++++++++++++++++++ text_to_speech/elevenlabs/main.py | 10 +- 4 files changed, 764 insertions(+), 2 deletions(-) create mode 100644 dubbing/README.md create mode 100644 dubbing/main.py create mode 100644 dubbing/utils.py diff --git a/dubbing/README.md b/dubbing/README.md new file mode 100644 index 0000000..4352df4 --- /dev/null +++ b/dubbing/README.md @@ -0,0 +1,68 @@ +# Dubbing + +Take a video or audio sample of a single person speaking and dub the audio in any language of your choosing. You can also lipsync the dub on the source material if it is a video. + +**Note:** The processing time depends on the length of the input and, for a video, also on its resolution, but a general rule of thumb is that it takes 1 second per second of audio, and 12 seconds to generate a video with lipsyncing with default settings. + +Some options to toggle include whether or not to refine the input and output audio, hyperparameters for speech generation, whether to downsample the video, and whether to lipsync the output. + +For generating the speech, we give the user an option to pick between the open source [xtts](https://www.sievedata.com/functions/sieve/xtts-v1), [ElevenLabs](https://www.sievedata.com/functions/sieve/elevenlabs_speech_synthesis), or [Play.ht](https://www.sievedata.com/functions/sieve/playht_speech_synthesis) Text-to-Speech models. The ElevenLabs and Play.ht models are recommended for better quality but require an API key. + +If you are using the ElevenLabs or Play.ht models, you can also choose to either clone the voice from the audio within the video itself or use a `voice_id` that you have either already created or that is available on one of the platforms. 
If you choose to clone the voice, you can also choose to delete the voice after use. + +You must enter the API keys and user IDs mentioned in the speech_synthesis functions under the ["Secrets" tab](https://www.sievedata.com/dashboard/settings/secrets) in your account settings if you want to use the API variants. + +Tips for the input audio: +- Ensure there is only 1 speaker. Some noise is OK; you can toggle the refinement options to denoise the audio if you would like. + +## Options + +- `source_file`: An audio or video input file to dub. +- `target_language`: The language to which the audio will be translated. Default is "spanish". +- `tts_model`: The Text-to-Speech model to use. Supported models are "xtts", "elevenlabs", and "playht". "elevenlabs" or "playht" are recommended for better quality but require an ElevenLabs or PlayHT API key. +- `speech_stability`: A value between 0 and 1. Decreasing this value can make the speech more expressive, with output varying between re-generations. However, it can also lead to instabilities. +- `speech_similarity_boost`: A value between 0 and 1. Lower values are recommended if there are background artifacts present in the generated speech. +- `voice_id`: The ID of the voice to use. If none is set, the voice will be cloned from the source audio and used. This is only applicable if the `tts_model` is set to "elevenlabs" or "playht". +- `cleanup_voice_id`: Whether to delete the voice after use. This is only applicable if the `tts_model` is set to "elevenlabs" or "playht". +- `refine_source_audio`: Whether to refine the source audio using sieve/audio_enhancement. +- `low_resolution`: Whether to reduce the resolution of an input video to half of the original on each axis. Significantly speeds up inference. Defaults to False. Only applicable for video inputs. +- `low_fps`: Whether to reduce the fps of an input video to half of the original. Significantly speeds up inference. Defaults to False. 
Only applicable for video inputs. +- `enable_lipsyncing`: Whether to enable lip-syncing on the original video to the dubbed audio. Defaults to True. Only applicable for video inputs. Otherwise, audio is returned. + + +Currently, the languages supported end to end are: +- English +- Spanish +- Chinese +- French +- Italian +- Portuguese +- Polish +- Turkish +- Russian +- Dutch +- Czech +- German +- Arabic + +If using the Eleven Labs model for voice cloning, it additionally supports these languages: + +- Korean +- Swedish +- Indonesian +- Vietnamese +- Filipino +- Ukrainian +- Greek +- Finnish +- Romanian +- Danish +- Bulgarian +- Malay +- Hungarian +- Norwegian +- Slovak +- Croatian +- Classic Arabic +- Tamil +- Hindi diff --git a/dubbing/main.py b/dubbing/main.py new file mode 100644 index 0000000..fac6d36 --- /dev/null +++ b/dubbing/main.py @@ -0,0 +1,345 @@ +import sieve +from utils import ( + trim_silence_from_audio_loaded, + convert_to_format, + extract_audio_from_video +) + +audio_metadata = sieve.Metadata( + title="Dubbing", + description="Translate any video or audio to several languages", + tags=["Audio", "Video", "Lip-Syncing", "Translation", "Speech", "TTS", "Voice Cloning"], + readme=open("README.md", "r").read(), +) + + +@sieve.function( + name="dubbing", + system_packages=[ + "ffmpeg", + "rubberband-cli" + ], + python_packages=[ + "librosa", + "soundfile", + "moviepy", + "pydub", + "pyrubberband", + ], + metadata=audio_metadata, + environment_variables=[ + sieve.Env(name="ELEVEN_LABS_API_KEY", description="API key for ElevenLabs", default=""), + sieve.Env(name="PLAYHT_API_KEY", description="API key for ElevenLabs", default=""), + sieve.Env(name="PLAYHT_API_USER_ID", description="API user ID for ElevenLabs", default="") + ], +) +def dubbing( + source_file: sieve.File, + target_language: str = "spanish", + tts_model: str = "xtts", + voice_id: str = "", + cleanup_voice_id: bool = False, + low_resolution: bool = False, + low_fps: bool = False, + 
enable_lipsyncing: bool = True, +): + """ + :param source_file: An audio or video input file to dub. + :param target_language: The language to which the audio will be translated. Default is "spanish". + :param tts_model: The Text-to-Speech model to use. Supported models are "xtts", "elevenlabs", and "playht". "elevenlabs" or "playht" are recommended for better quality but requires an ElevenLabs API key. + :param voice_id: The ID of the voice to use. If none are set, the voice will be cloned from the source audio and used. This is only applicable if the `tts_model` is set to "elevenlabs" or "playht". + :param cleanup_voice_id: Whether to delete the voice after use. This is only applicable if the `tts_model` is set to "elevenlabs" or "playht". + :param low_resolution: Whether to reduce the resolution of an input video to half of the original on each axis. Significantly speeds up inference. Defaults to False. Only applicable for video inputs. + :param low_fps: Whether to reduce the fps of an input video to half of the original. Significantly speeds up inference. Defaults to False. Only applicable for video inputs. + :param enable_lipsyncing: Whether to enable lip-syncing on the original video to the dubbed audio. Defaults to True. Only applicable for video inputs. Otherwise, audio is returned. + :return: An audio file dubbed in the target language. 
+ """ + import os + import time + + source_file_path = source_file.path + + video_extensions = set(["mp4", "avi", "mkv", "flv", "mov", "wmv", "webm"]) + + audio_extensions = set(["wav", "mp3", "ogg", "flac", "m4a"]) + + source_path_extension = os.path.splitext(source_file_path)[1][1:].lower() + + is_video = False + + # Validate source file + if source_path_extension in video_extensions: + print("Source file detected as video") + is_video = True + source_video = sieve.Video(path=source_file_path) + input_audio_path = "/tmp/input.wav" + extract_audio_from_video(source_video.path, input_audio_path) + source_audio = sieve.Audio(path=input_audio_path) + elif source_path_extension in audio_extensions: + print("Source file detected as audio") + is_video = False + source_audio = sieve.Audio(path=source_file_path) + else: + raise ValueError( + f"Unsupported file extension: {source_path_extension}. Please use one of the following: {video_extensions.union(audio_extensions)}. This function supports video and audio files." + ) + + target_language = target_language.lower() + + source_audio = convert_to_format(source_audio, "source_audio.wav", "wav") + + # Refine source_audio + start_time = time.time() + source_audio = sieve.function.get("sieve/audio_enhancement").run(source_audio, filter_type="noise", enhance_speed_boost=True) + print(f"Time taken to refine source audio: {time.time() - start_time} seconds") + + start_time = time.time() + text_info = sieve.function.get("sieve/speech_transcriber").run(source_audio) + out = list(text_info) + segments = [] + for i in range(len(out)): + segments.extend(out[i]["segments"]) + + print(f"Time taken to get text info: {time.time() - start_time} seconds") + language_code = out[0]["language_code"] + + try: + source_language = LANGUAGE_CODE_MAP[language_code] + except KeyError: + raise ValueError( + f"Unsupported language code: {language_code}. 
Please use one of the following: {list(LANGUAGE_CODE_MAP.keys())}" + ) + + # Translate text from english to another + start_time = time.time() + language_translator = sieve.function.get("sieve/seamless_text2text") + translations = [] + translation_coroutines = [] + for segment in segments: + translation = language_translator.push( + segment["text"], source_language, target_language + ) + translation_coroutines.append(translation) + for translation in translation_coroutines: + translations.append(translation.result()) + + print(f"Time taken to translate text: {time.time() - start_time} seconds") + + concat_translations = " ".join(translations) + + # TTS using audio as source + tts_model_str = tts_model + start_time = time.time() + target_audios = [] + tts_coroutines = [] + + # TTS Hyperparameters, to autoconfigure later. + speech_stability: float = 0.5 + speech_similarity_boost: float = 0.63 + + if tts_model_str == "xtts": + tts_model = sieve.function.get(f"sieve/xtts") + for i, segment in enumerate(segments): + if target_language in INVERSE_LANGUAGE_CODE_MAP: + language_code = INVERSE_LANGUAGE_CODE_MAP[target_language] + else: + language_code = target_language + print("Language code not found in map for language, using language code as is") + tts = tts_model.push( + translations[i], + source_audio, + stability=speech_stability, + similarity_boost=speech_similarity_boost, + language_code=language_code, + ) + tts_coroutines.append(tts) + elif tts_model_str == "elevenlabs": + tts_model = sieve.function.get(f"sieve/elevenlabs_speech_synthesis") + if len(voice_id) == 0: + # clone voice + cloning_model = sieve.function.get("sieve/elevenlabs_voice_cloning") + voice_cloning = cloning_model.run(source_audio) + print(voice_cloning) + voice_id = voice_cloning["voice_id"] + if voice_id and len(voice_id) > 0: + tts = tts_model.push( + concat_translations, + voice_id=voice_id, + stability=speech_stability, + similarity_boost=speech_similarity_boost + ) + else: + tts = 
tts_model.push( + concat_translations, + stability=speech_stability, + similarity_boost=speech_similarity_boost + ) + + tts_coroutines.append(tts) + + if cleanup_voice_id: + # delete voice + cloning_model = sieve.function.get("sieve/elevenlabs_voice_cloning") + cloning_model.run(source_audio, delete_voice_id=voice_id) + elif tts_model_str == "playht": + tts_model = sieve.function.get(f"sieve/playht_speech_synthesis") + if len(voice_id) == 0: + # clone voice + cloning_model = sieve.function.get("sieve/playht_voice_cloning") + voice_cloning = cloning_model.run(source_audio) + print(voice_cloning) + voice_id = voice_cloning["id"] + for i, segment in enumerate(segments): + if voice_id and len(voice_id) > 0: + tts = tts_model.push( + translations[i], + voice=voice_id, + ) + else: + tts = tts_model.push( + translations[i], + ) + + tts_coroutines.append(tts) + + if cleanup_voice_id: + # delete voice + cloning_model = sieve.function.get("sieve/playht_voice_cloning") + cloning_model.run(source_audio, delete_voice_id=voice_id) + else: + raise ValueError(f"Unsupported TTS model: {tts_model_str}. 
Please use one of the following: xtts, elevenlabs, playht") + for tts in tts_coroutines: + target_audios.append(tts.result()) + print(f"Time taken for TTS: {time.time() - start_time} seconds") + + # Combine target audios with gaps + from pydub import AudioSegment + + combined_audio = AudioSegment.empty() + for i, target_audio in enumerate(target_audios): + # Trim silence from target_audio + start_time = time.time() + target_audio_path = target_audio.path + # Convert the audio to wav if it is not a wav + if not target_audio_path.endswith('.wav'): + new_path = os.path.splitext(target_audio_path)[0] + '.wav' + convert_to_format(target_audio, new_path, 'wav') + target_audio_path = new_path + segment_audio = AudioSegment.from_wav(target_audio_path) + trimmed_audio = trim_silence_from_audio_loaded(segment_audio) + if i < len(segments) - 1: + try: + gap_duration = ( + segments[i + 1]["start"] - segments[i]["words"][-1]["end"] + ) * 1000 # Convert to milliseconds + except KeyError: + print( + f"KeyError at index {i} of segments. Using default gap duration of 0.05 seconds." 
+ ) + gap_duration = 0 + gap = AudioSegment.silent(duration=gap_duration) + combined_audio += trimmed_audio + gap + else: + combined_audio += trimmed_audio + combined_audio.export("combined_audio.wav", format="wav") + target_audio = sieve.Audio(path="combined_audio.wav") + + # Lip-syncing if enable_lipsyncing is True and the input is a video + if is_video and enable_lipsyncing: + print("Running Lip-syncing...") + out_video = sieve.function.get("sieve/video_retalking").run( + source_video, + target_audio, + low_resolution, + low_fps, + ) + + return out_video + + return target_audio + + +LANGUAGE_CODE_MAP = { + "en": "english", + "es": "spanish", + "fr": "french", + "de": "german", + "it": "italian", + "pt": "portuguese", + "ru": "russian", + "ja": "japanese", + "ko": "korean", + "zh": "chinese", + "ar": "arabic", + "hi": "hindi", + "nl": "dutch", + "sv": "swedish", + "fi": "finnish", + "da": "danish", + "pl": "polish", + "hu": "hungarian", + "el": "greek", + "tr": "turkish", + "he": "hebrew", + "id": "indonesian", + "ms": "malay", + "th": "thai", + "vi": "vietnamese", + "cs": "czech", + "ro": "romanian", + "uk": "ukrainian", + "fa": "persian", + "af": "afrikaans", + "sw": "swahili", + "no": "norwegian", + "et": "estonian", + "lt": "lithuanian", + "lv": "latvian", + "sl": "slovenian", + "sk": "slovak", + "hr": "croatian", + "sr": "serbian", + "mk": "macedonian", + "bs": "bosnian", + "sq": "albanian", + "cy": "welsh", + "ga": "irish", + "mt": "maltese", + "is": "icelandic", + "tl": "filipino", + "yo": "yoruba", + "ig": "igbo", + "ha": "hausa", + "zu": "zulu", + "xh": "xhosa", + "st": "sesotho", + "so": "somali", + "am": "amharic", + "ne": "nepali", + "bn": "bengali", + "pa": "punjabi", + "gu": "gujarati", + "or": "odia", + "ta": "tamil", + "te": "telugu", + "kn": "kannada", + "ml": "malayalam", + "si": "sinhala", + "my": "burmese", + "ka": "georgian", + "hy": "armenian", + "kk": "kazakh", + "uz": "uzbek", + "mn": "mongolian", + "ky": "kyrgyz", + "tg": "tajik", + "tk": 
"turkmen", + "ps": "pashto", + "sd": "sindhi", + "ur": "urdu", + "yi": "yiddish", + "la": "latin", +} + +INVERSE_LANGUAGE_CODE_MAP = {v: k for k, v in LANGUAGE_CODE_MAP.items()} + diff --git a/dubbing/utils.py b/dubbing/utils.py new file mode 100644 index 0000000..6ac9253 --- /dev/null +++ b/dubbing/utils.py @@ -0,0 +1,343 @@ +import sieve + + +def trim_audio_into_snippets(audio_path: str, timestamps: list) -> list: + """ + Trims an audio file into different snippets based on given timestamps. + + Parameters: + - audio_path: path to the input audio file. + - timestamps: list of tuples, each containing start and end timestamps in seconds. + + Returns: + - List of paths to each of the audio snippets. + """ + from pydub import AudioSegment + from pydub.utils import make_chunks + + # Load the audio file + audio = AudioSegment.from_file(audio_path) + + # Initialize list to store paths to audio snippets + snippet_paths = [] + + # Loop over each timestamp pair and trim the audio + for i, (start, end) in enumerate(timestamps): + # Convert timestamps from seconds to milliseconds + start_ms = start * 1000 + end_ms = end * 1000 + + # Trim the audio + snippet = audio[start_ms:end_ms] + + # Save the snippet to a new file + snippet_path = f"snippet_{i}.wav" + snippet.export(snippet_path, format="wav") + + # Add the snippet path to the list + snippet_paths.append(snippet_path) + + return snippet_paths + + +def convert_to_format( + audio: sieve.Audio, output_path: str, output_format: str = "mp3" +) -> sieve.Audio: + """ + Converts a video or audio file to an mp3 or wav file. + + Parameters: + - input_path: path to the input video or audio file. + - output_path: path to save the output audio file. + - output_format: format of the output audio file. Default is "mp3". 
+ + Returns: + - new sieve.Audio + """ + from pydub import AudioSegment + import os + + input_path = audio.path + + if output_format not in ["mp3", "wav"]: + raise ValueError( + f"Unsupported output format: {output_format}. Please use one of the following: mp3, wav" + ) + + if os.path.exists(output_path): + os.remove(output_path) + + if os.path.splitext(input_path)[1] == ".mp3" and output_format == "mp3": + return audio + elif os.path.splitext(input_path)[1] == ".wav" and output_format == "wav": + return audio + + # Load the input file + audio_input = AudioSegment.from_file(input_path) + + # Export the audio in the desired format + audio_input.export(output_path, format=output_format) + + return sieve.Audio(path=output_path) + + +def match_audio_length( + source_audio_path: str, + target_audio_path: str, + output_path: str = "matched_audio.wav", +) -> str: + """ + Stretches or shrinks the source audio to match the length of the target audio. + + Parameters: + - source_audio_path: path to the source audio file. + - target_audio_path: path to the target audio file. + - output_path: path to save the matched audio. Default is "matched_audio.wav". + + Returns: + - Path to the matched audio file. 
+ """ + import librosa + import pyrubberband as pyrb + import soundfile as sf + + # Load the audio files + source_audio, sr_source = librosa.load(source_audio_path, sr=None) + target_audio, sr_target = librosa.load(target_audio_path, sr=None) + + print(f"Source audio length: {len(source_audio) / sr_source} seconds") + print(f"Target audio length: {len(target_audio) / sr_target} seconds") + + # Calculate the speed factor + speed_factor = len(target_audio) / len(source_audio) + + print(f"Speed factor: {speed_factor}") + + # Stretch or shrink the source audio + matched_audio = pyrb.time_stretch(source_audio, sr_source, 1 / speed_factor) + + # Export the matched audio + sf.write(output_path, matched_audio, sr_source, format="wav") + + return output_path + + +def match_audio_length_from_segment(source_audio, target_length_ms: int): + """ + Stretches or shrinks the source audio to match the target length. + + Parameters: + - source_audio: AudioSegment of the source audio file. + - target_length_ms: target length in milliseconds. + + Returns: + - AudioSegment of the matched audio. 
+ """ + import librosa + import pyrubberband as pyrb + import numpy as np + from pydub import AudioSegment + + # Convert source_audio to numpy array + source_audio_array = np.array(source_audio.get_array_of_samples()) + + source_audio_length_s = len(source_audio_array) / source_audio.frame_rate + target_audio_length_s = target_length_ms / 1000 + + print(f"Source audio length: {source_audio_length_s} seconds") + print(f"Target audio length: {target_audio_length_s} seconds") + + # Calculate the speed factor + speed_factor = target_audio_length_s / source_audio_length_s + + print(f"Speed factor: {speed_factor}") + + # Stretch or shrink the source audio + matched_audio_array = pyrb.time_stretch( + source_audio_array, source_audio.frame_rate, 1 / speed_factor + ) + + # Convert matched_audio_array back to AudioSegment + matched_audio = AudioSegment( + matched_audio_array.tobytes(), + frame_rate=source_audio.frame_rate, + sample_width=source_audio.sample_width, + channels=source_audio.channels, + ) + + return matched_audio + + +def speed_up_audio( + audio_path: str, speed: float = 1.25, output_path: str = "sped_up_audio.wav" +) -> str: + """ + Speeds up an audio file by a given factor without changing the pitch. + + Parameters: + - audio_path: path to the input audio file. + - speed: factor to speed up the audio by. Default is 1.25. + - output_path: path to save the sped-up audio. Default is "sped_up_audio.wav". + + Returns: + - Path to the sped-up audio file. + """ + from pydub import AudioSegment + from pydub.playback import play + + # Load the audio file + audio = AudioSegment.from_file(audio_path) + + # Speed up the audio + sped_up_audio = audio.speedup(playback_speed=speed) + + # Export the sped-up audio + sped_up_audio.export(output_path, format="wav") + + return output_path + + +def trim_audio_into_snippets(audio_path: str, timestamps: list) -> list: + """ + Trims an audio file into different snippets based on given timestamps. 
+ + Parameters: + - audio_path: path to the input audio file. + - timestamps: list of tuples, each containing start and end timestamps in seconds. + + Returns: + - List of paths to each of the audio snippets. + """ + from pydub import AudioSegment + from pydub.utils import make_chunks + + # Load the audio file + audio = AudioSegment.from_file(audio_path) + + # Initialize list to store paths to audio snippets + snippet_paths = [] + + # Loop over each timestamp pair and trim the audio + for i, (start, end) in enumerate(timestamps): + # Convert timestamps from seconds to milliseconds + start_ms = start * 1000 + end_ms = end * 1000 + + # Trim the audio + snippet = audio[start_ms:end_ms] + + # Save the snippet to a new file + snippet_path = f"snippet_{i}.wav" + snippet.export(snippet_path, format="wav") + + # Add the snippet path to the list + snippet_paths.append(snippet_path) + + return snippet_paths + + +def extract_audio_from_video(video_path: str, output_path: str): + """ + Extracts audio from a video and saves it to an output path. + + Parameters: + - video_path: path to the input video file. + - output_path: path to save the extracted audio. + """ + from moviepy.editor import VideoFileClip + + # Load the video file + clip = VideoFileClip(video_path) + + # Extract audio + audio = clip.audio + + # Save audio to output path + audio.write_audiofile(output_path) + + +def trim_silence_from_video(video_path, output_path, silence_thresh=-50.0): + """ + Trims silence from the beginning and end of a video. + + Parameters: + - video_path: path to the input video file. + - output_path: path to save the trimmed video. + - silence_thresh: threshold in dB. Anything quieter than this will be considered silence. + - chunk_size: how long to analyze sound for (in ms). 
+ """ + + from moviepy.editor import VideoFileClip + from pydub import AudioSegment + from pydub.silence import detect_nonsilent + + # Load the video file + clip = VideoFileClip(video_path) + + # Convert video audio to pydub's AudioSegment format + audio = AudioSegment.from_file(video_path, codec="aac") + + # Detect non-silent chunks + non_silence_ranges = detect_nonsilent( + audio, min_silence_len=1, silence_thresh=silence_thresh + ) + + # If there are non-silent chunks, trim the video file based on the first and last non-silent chunk + if non_silence_ranges: + start_trim = max( + 0, non_silence_ranges[0][0] - 250 + ) # Add a quarter second buffer to the start, if possible + end_trim = min( + len(audio), non_silence_ranges[-1][1] + 250 + ) # Add a quarter second buffer to the end, if possible + trimmed_clip = clip.subclip( + start_trim / 1000.0, end_trim / 1000.0 + ) # Convert ms to seconds + trimmed_clip.write_videofile(output_path, codec="libx264", audio_codec="aac") + else: + print("All audio is silent. Nothing to trim!") + + +def trim_silence_from_audio(audio_path, output_path, silence_thresh=-50.0): + """ + Trims silence from the beginning and end of an audio. + + Parameters: + - audio_path: path to the input audio file. + - output_path: path to save the trimmed audio. + - silence_thresh: threshold in dB. Anything quieter than this will be considered silence. 
+ """ + + from pydub import AudioSegment + + # Load the audio file + audio = AudioSegment.from_file(audio_path, format="wav") + + # Detect non-silent chunks + out = trim_silence_from_audio_loaded(audio, silence_thresh=silence_thresh) + + # Export the trimmed audio + out.export(output_path, format="wav") + + +def trim_silence_from_audio_loaded(audio, silence_thresh=-50.0, buffer_ms=250): + from pydub.silence import detect_nonsilent + + # Detect non-silent chunks + non_silence_ranges = detect_nonsilent( + audio, min_silence_len=1, silence_thresh=silence_thresh + ) + + # If there are non-silent chunks, trim the audio file based on the first and last non-silent chunk + if non_silence_ranges: + start_trim = max( + 0, non_silence_ranges[0][0] - buffer_ms + ) # Add a quarter second buffer to the start, if possible + end_trim = min( + len(audio), non_silence_ranges[-1][1] + buffer_ms + ) # Add a quarter second buffer to the end, if possible + trimmed_audio = audio[start_trim:end_trim] + return trimmed_audio + else: + print("All audio is silent. Nothing to trim!") + return audio diff --git a/text_to_speech/elevenlabs/main.py b/text_to_speech/elevenlabs/main.py index 2220ad5..d8e58e2 100644 --- a/text_to_speech/elevenlabs/main.py +++ b/text_to_speech/elevenlabs/main.py @@ -124,12 +124,18 @@ def clone_audio( shutil.rmtree(temp_dir) + if response.status_code != 200: + raise ValueError(f"Could not clone voice. API Response:\n {response.text}") + + if "voice_id" not in response.json(): + raise ValueError(f"Could not clone voice. Please verify that you have not hit a limit on the number of voices you can create. API Response:\n {response.text}") + try: return response.json() except Exception as e: print(response.text) - raise ValueError("Could not generate voice", response.text) + raise ValueError(f"Could not generate voice. 
API Response: {response.text}") synthesis_metadata = sieve.Metadata( description="Text to speech using ElevenLabs", @@ -144,7 +150,7 @@ def clone_audio( @sieve.function( name="elevenlabs_speech_synthesis", system_packages=["ffmpeg"], - environment_variables=[ + environment_variables=[ sieve.Env(name="ELEVEN_LABS_API_KEY", description="API key for ElevenLabs") ], metadata=synthesis_metadata