From 032aa87bbc1627673575b4d35f6472b1016f3c74 Mon Sep 17 00:00:00 2001
From: abhi12-ayalur <abhinav.ayalur@gmail.com>
Date: Tue, 28 Nov 2023 02:03:36 +0000
Subject: [PATCH] v1 dubbing

---
 dubbing/README.md                 |  68 ++++++
 dubbing/main.py                   | 345 ++++++++++++++++++++++++++++++
 dubbing/utils.py                  | 343 +++++++++++++++++++++++++++++
 text_to_speech/elevenlabs/main.py |  10 +-
 4 files changed, 764 insertions(+), 2 deletions(-)
 create mode 100644 dubbing/README.md
 create mode 100644 dubbing/main.py
 create mode 100644 dubbing/utils.py

diff --git a/dubbing/README.md b/dubbing/README.md
new file mode 100644
index 0000000..4352df4
--- /dev/null
+++ b/dubbing/README.md
@@ -0,0 +1,68 @@
+# Dubbing
+
+Take a video or audio sample of a single person speaking and dub the audio in any language of your choosing. You can also lipsync the dub on the source material if it is a video.
+
+**Note:** The processing time depends on the length of the input and, for a video, also on its resolution. As a general rule of thumb, it takes 1 second per second of audio, and 12 seconds to generate a video with lipsyncing with default settings.
+
+Some options to toggle include whether or not to refine the input and output audio, hyperparameters for speech generation, whether to downsample the video, and whether to lipsync the output.
+
+For generating the speech, we give the user an option to pick between the open source [xtts](https://www.sievedata.com/functions/sieve/xtts-v1), [ElevenLabs](https://www.sievedata.com/functions/sieve/elevenlabs_speech_synthesis), or [Play.ht](https://www.sievedata.com/functions/sieve/playht_speech_synthesis) Text-to-Speech models. The ElevenLabs and Play.ht models are recommended for better quality but require an API key.
+
+If you are using the ElevenLabs or Play.ht models, you can also choose to either clone the voice from the audio within the video itself or use a `voice_id` that you have already created or that is available on one of the platforms. If you choose to clone the voice, you can also choose to delete the voice after use.
+
+You must enter the API keys and user IDs mentioned in the speech_synthesis functions under the ["Secrets" tab](https://www.sievedata.com/dashboard/settings/secrets) in your account settings if you want to use the API variants.
+
+Tips for the input audio:
+- Ensure there is only 1 speaker. Some noise is ok; you can toggle the refinement options to denoise the audio if you would like.
+
+## Options
+
+- `source_file`: An audio or video input file to dub.
+- `target_language`: The language to which the audio will be translated. Default is "spanish".
+- `tts_model`: The Text-to-Speech model to use. Supported models are "xtts", "elevenlabs", and "playht". "elevenlabs" or "playht" are recommended for better quality but require an ElevenLabs or PlayHT API key.
+- `speech_stability`: A value between 0 and 1. Increasing this value can make the speech more expressive with output varying between re-generations. However, it can also lead to instabilities.
+- `speech_similarity_boost`: A value between 0 and 1. Lower values are recommended if there are background artifacts present in the generated speech.
+- `voice_id`: The ID of the voice to use. If none are set, the voice will be cloned from the source audio and used. This is only applicable if the `tts_model` is set to "elevenlabs" or "playht".
+- `cleanup_voice_id`: Whether to delete the voice after use. This is only applicable if the `tts_model` is set to "elevenlabs" or "playht".
+- `refine_source_audio`: Whether to refine the source audio using sieve/audio_enhancement.
+- `low_resolution`: Whether to reduce the resolution of an input video to half of the original on each axis. Significantly speeds up inference. Defaults to False. Only applicable for video inputs.
+- `low_fps`: Whether to reduce the fps of an input video to half of the original. Significantly speeds up inference. Defaults to False. Only applicable for video inputs.
+- `enable_lipsyncing`: Whether to enable lip-syncing on the original video to the dubbed audio. Defaults to True. Only applicable for video inputs. Otherwise, audio is returned.
+
+
+Currently, the languages supported end to end are: 
+- English
+- Spanish
+- Chinese
+- French
+- Italian
+- Portuguese
+- Polish
+- Turkish
+- Russian
+- Dutch
+- Czech
+- German
+- Arabic
+
+If using the ElevenLabs model for voice cloning, it additionally supports these languages:
+
+- Korean
+- Swedish
+- Indonesian
+- Vietnamese
+- Filipino
+- Ukrainian
+- Greek
+- Finnish
+- Romanian
+- Danish
+- Bulgarian
+- Malay
+- Hungarian
+- Norwegian
+- Slovak
+- Croatian
+- Classic Arabic
+- Tamil
+- Hindi
diff --git a/dubbing/main.py b/dubbing/main.py
new file mode 100644
index 0000000..fac6d36
--- /dev/null
+++ b/dubbing/main.py
@@ -0,0 +1,345 @@
+import sieve
+from utils import (
+    trim_silence_from_audio_loaded,
+    convert_to_format,
+    extract_audio_from_video
+)
+
+with open("README.md", "r") as _readme_file:  # closes the README handle once it has been read
+    audio_metadata = sieve.Metadata(
+        title="Dubbing",
+        description="Translate any video or audio to several languages",
+        tags=["Audio", "Video", "Lip-Syncing", "Translation", "Speech", "TTS", "Voice Cloning"],
+        readme=_readme_file.read())
+
+
+@sieve.function(
+    name="dubbing",
+    system_packages=[
+        "ffmpeg",
+        "rubberband-cli"
+    ],
+    python_packages=[
+        "librosa",
+        "soundfile",
+        "moviepy",
+        "pydub",
+        "pyrubberband",
+    ],
+    metadata=audio_metadata,
+    environment_variables=[
+        sieve.Env(name="ELEVEN_LABS_API_KEY", description="API key for ElevenLabs", default=""),
+        sieve.Env(name="PLAYHT_API_KEY", description="API key for PlayHT", default=""),
+        sieve.Env(name="PLAYHT_API_USER_ID", description="API user ID for PlayHT", default="")
+    ],
+)
+def dubbing(
+    source_file: sieve.File,
+    target_language: str = "spanish",
+    tts_model: str = "xtts",
+    voice_id: str = "",
+    cleanup_voice_id: bool = False,
+    low_resolution: bool = False,
+    low_fps: bool = False,
+    enable_lipsyncing: bool = True,
+):
+    """
+    :param source_file: An audio or video input file to dub.
+    :param target_language: The language to which the audio will be translated. Default is "spanish".
+    :param tts_model: The Text-to-Speech model to use. Supported models are "xtts", "elevenlabs", and "playht". "elevenlabs" or "playht" are recommended for better quality but require an ElevenLabs or PlayHT API key.
+    :param voice_id: The ID of the voice to use. If none are set, the voice will be cloned from the source audio and used. This is only applicable if the `tts_model` is set to "elevenlabs" or "playht".
+    :param cleanup_voice_id: Whether to delete the voice after use. This is only applicable if the `tts_model` is set to "elevenlabs" or "playht".
+    :param low_resolution: Whether to reduce the resolution of an input video to half of the original on each axis. Significantly speeds up inference. Defaults to False. Only applicable for video inputs.
+    :param low_fps: Whether to reduce the fps of an input video to half of the original. Significantly speeds up inference. Defaults to False. Only applicable for video inputs.
+    :param enable_lipsyncing: Whether to enable lip-syncing on the original video to the dubbed audio. Defaults to True. Only applicable for video inputs. Otherwise, audio is returned.
+    :return: A lip-synced video if the input is a video and `enable_lipsyncing` is True, otherwise an audio file dubbed in the target language.
+    """
+    import os
+    import time
+
+    from pydub import AudioSegment
+
+    # Fail fast on an unsupported TTS model, before any remote function is called.
+    tts_model_str = tts_model
+    if tts_model_str not in ("xtts", "elevenlabs", "playht"):
+        raise ValueError(f"Unsupported TTS model: {tts_model_str}. Please use one of the following: xtts, elevenlabs, playht")
+
+    source_file_path = source_file.path
+
+    video_extensions = {"mp4", "avi", "mkv", "flv", "mov", "wmv", "webm"}
+    audio_extensions = {"wav", "mp3", "ogg", "flac", "m4a"}
+
+    source_path_extension = os.path.splitext(source_file_path)[1][1:].lower()
+
+    is_video = False
+
+    # Validate source file
+    if source_path_extension in video_extensions:
+        print("Source file detected as video")
+        is_video = True
+        source_video = sieve.Video(path=source_file_path)
+        input_audio_path = "/tmp/input.wav"
+        extract_audio_from_video(source_video.path, input_audio_path)
+        source_audio = sieve.Audio(path=input_audio_path)
+    elif source_path_extension in audio_extensions:
+        print("Source file detected as audio")
+        source_audio = sieve.Audio(path=source_file_path)
+    else:
+        raise ValueError(
+            f"Unsupported file extension: {source_path_extension}. Please use one of the following: {video_extensions.union(audio_extensions)}. This function supports video and audio files."
+        )
+
+    target_language = target_language.lower()
+
+    source_audio = convert_to_format(source_audio, "source_audio.wav", "wav")
+
+    # Refine source_audio
+    start_time = time.time()
+    source_audio = sieve.function.get("sieve/audio_enhancement").run(source_audio, filter_type="noise", enhance_speed_boost=True)
+    print(f"Time taken to refine source audio: {time.time() - start_time} seconds")
+
+    # Transcribe the refined audio; each item of the output carries its own list of segments.
+    start_time = time.time()
+    text_info = sieve.function.get("sieve/speech_transcriber").run(source_audio)
+    out = list(text_info)
+    segments = []
+    for chunk in out:
+        segments.extend(chunk["segments"])
+    print(f"Time taken to get text info: {time.time() - start_time} seconds")
+    if not out:
+        raise ValueError("The transcriber returned no output for the source file, so there is nothing to dub.")
+    language_code = out[0]["language_code"]
+
+    try:
+        source_language = LANGUAGE_CODE_MAP[language_code]
+    except KeyError:
+        raise ValueError(
+            f"Unsupported language code: {language_code}. Please use one of the following: {list(LANGUAGE_CODE_MAP.keys())}"
+        )
+
+    # Translate each segment from the detected source language to the target language.
+    # Every job is pushed before any result is awaited, then results are collected in order.
+    start_time = time.time()
+    language_translator = sieve.function.get("sieve/seamless_text2text")
+    translation_coroutines = [
+        language_translator.push(segment["text"], source_language, target_language)
+        for segment in segments
+    ]
+    translations = [translation.result() for translation in translation_coroutines]
+
+    print(f"Time taken to translate text: {time.time() - start_time} seconds")
+
+    concat_translations = " ".join(translations)
+
+    # TTS using audio as source
+    start_time = time.time()
+    tts_coroutines = []
+
+    # TTS Hyperparameters, to autoconfigure later.
+    speech_stability: float = 0.5
+    speech_similarity_boost: float = 0.63
+
+    # Name of the voice-cloning function used to delete the voice after TTS (None: no cleanup).
+    voice_cleanup_function = None
+
+    if tts_model_str == "xtts":
+        tts_model = sieve.function.get("sieve/xtts")
+        # The target language is the same for every segment, so resolve its code once.
+        if target_language in INVERSE_LANGUAGE_CODE_MAP:
+            language_code = INVERSE_LANGUAGE_CODE_MAP[target_language]
+        else:
+            language_code = target_language
+            print("Language code not found in map for language, using language code as is")
+        for translation in translations:
+            tts = tts_model.push(
+                translation,
+                source_audio,
+                stability=speech_stability,
+                similarity_boost=speech_similarity_boost,
+                language_code=language_code,
+            )
+            tts_coroutines.append(tts)
+    elif tts_model_str == "elevenlabs":
+        tts_model = sieve.function.get("sieve/elevenlabs_speech_synthesis")
+        if not voice_id:
+            # clone voice
+            cloning_model = sieve.function.get("sieve/elevenlabs_voice_cloning")
+            voice_cloning = cloning_model.run(source_audio)
+            print(voice_cloning)
+            voice_id = voice_cloning["voice_id"]
+        if voice_id:
+            tts = tts_model.push(
+                concat_translations,
+                voice_id=voice_id,
+                stability=speech_stability,
+                similarity_boost=speech_similarity_boost
+            )
+        else:
+            tts = tts_model.push(
+                concat_translations,
+                stability=speech_stability,
+                similarity_boost=speech_similarity_boost
+            )
+        tts_coroutines.append(tts)
+        if cleanup_voice_id:
+            voice_cleanup_function = "sieve/elevenlabs_voice_cloning"
+    elif tts_model_str == "playht":
+        tts_model = sieve.function.get("sieve/playht_speech_synthesis")
+        if not voice_id:
+            # clone voice
+            cloning_model = sieve.function.get("sieve/playht_voice_cloning")
+            voice_cloning = cloning_model.run(source_audio)
+            print(voice_cloning)
+            voice_id = voice_cloning["id"]
+        for translation in translations:
+            if voice_id:
+                tts = tts_model.push(
+                    translation,
+                    voice=voice_id,
+                )
+            else:
+                tts = tts_model.push(
+                    translation,
+                )
+            tts_coroutines.append(tts)
+        if cleanup_voice_id:
+            voice_cleanup_function = "sieve/playht_voice_cloning"
+
+    # Collect the TTS results before deleting the voice: the pushed jobs are only known to be
+    # finished once `result()` returns, and they may need the voice until then.
+    try:
+        target_audios = [tts.result() for tts in tts_coroutines]
+    finally:
+        if voice_cleanup_function and voice_id:
+            # delete voice
+            cloning_model = sieve.function.get(voice_cleanup_function)
+            cloning_model.run(source_audio, delete_voice_id=voice_id)
+    print(f"Time taken for TTS: {time.time() - start_time} seconds")
+
+    # Combine target audios with gaps
+    combined_audio = AudioSegment.empty()
+    for i, target_audio in enumerate(target_audios):
+        target_audio_path = target_audio.path
+        # Convert the audio to wav if it is not a wav
+        if not target_audio_path.endswith('.wav'):
+            new_path = os.path.splitext(target_audio_path)[0] + '.wav'
+            convert_to_format(target_audio, new_path, 'wav')
+            target_audio_path = new_path
+        segment_audio = AudioSegment.from_wav(target_audio_path)
+        # Trim silence from target_audio
+        trimmed_audio = trim_silence_from_audio_loaded(segment_audio)
+        # Only add a gap between two generated audios (a single concatenated audio gets none).
+        if i < len(target_audios) - 1 and i < len(segments) - 1:
+            try:
+                gap_duration = (
+                    segments[i + 1]["start"] - segments[i]["words"][-1]["end"]
+                ) * 1000  # Convert to milliseconds
+            except (KeyError, IndexError):
+                print(f"Missing timing data at index {i} of segments. Using default gap duration of 0 seconds.")
+                gap_duration = 0
+            # Overlapping timestamps give a negative gap; clamp it so the silence length is never negative.
+            gap = AudioSegment.silent(duration=max(0, gap_duration))
+            combined_audio += trimmed_audio + gap
+        else:
+            combined_audio += trimmed_audio
+    combined_audio.export("combined_audio.wav", format="wav")
+    target_audio = sieve.Audio(path="combined_audio.wav")
+
+    # Lip-syncing if enable_lipsyncing is True and the input is a video
+    if is_video and enable_lipsyncing:
+        print("Running Lip-syncing...")
+        out_video = sieve.function.get("sieve/video_retalking").run(
+            source_video,
+            target_audio,
+            low_resolution,
+            low_fps,
+        )
+        return out_video
+
+    return target_audio
+
+
+LANGUAGE_CODE_MAP = {  # two-letter language code -> lowercase English language name
+    "en": "english",
+    "es": "spanish",
+    "fr": "french",
+    "de": "german",
+    "it": "italian",
+    "pt": "portuguese",
+    "ru": "russian",
+    "ja": "japanese",
+    "ko": "korean",
+    "zh": "chinese",
+    "ar": "arabic",
+    "hi": "hindi",
+    "nl": "dutch",
+    "sv": "swedish",
+    "fi": "finnish",
+    "da": "danish",
+    "pl": "polish",
+    "hu": "hungarian",
+    "el": "greek",
+    "tr": "turkish",
+    "he": "hebrew",
+    "id": "indonesian",
+    "ms": "malay",
+    "th": "thai",
+    "vi": "vietnamese",
+    "cs": "czech",
+    "ro": "romanian",
+    "uk": "ukrainian",
+    "fa": "persian",
+    "af": "afrikaans",
+    "sw": "swahili",
+    "no": "norwegian",
+    "et": "estonian",
+    "lt": "lithuanian",
+    "lv": "latvian",
+    "sl": "slovenian",
+    "sk": "slovak",
+    "hr": "croatian",
+    "sr": "serbian",
+    "mk": "macedonian",
+    "bs": "bosnian",
+    "sq": "albanian",
+    "cy": "welsh",
+    "ga": "irish",
+    "mt": "maltese",
+    "is": "icelandic",
+    "tl": "filipino",
+    "yo": "yoruba",
+    "ig": "igbo",
+    "ha": "hausa",
+    "zu": "zulu",
+    "xh": "xhosa",
+    "st": "sesotho",
+    "so": "somali",
+    "am": "amharic",
+    "ne": "nepali",
+    "bn": "bengali",
+    "pa": "punjabi",
+    "gu": "gujarati",
+    "or": "odia",
+    "ta": "tamil",
+    "te": "telugu",
+    "kn": "kannada",
+    "ml": "malayalam",
+    "si": "sinhala",
+    "my": "burmese",
+    "ka": "georgian",
+    "hy": "armenian",
+    "kk": "kazakh",
+    "uz": "uzbek",
+    "mn": "mongolian",
+    "ky": "kyrgyz",
+    "tg": "tajik",
+    "tk": "turkmen",
+    "ps": "pashto",
+    "sd": "sindhi",
+    "ur": "urdu",
+    "yi": "yiddish",
+    "la": "latin",
+}
+# Reverse lookup built from the map above: language name -> language code.
+INVERSE_LANGUAGE_CODE_MAP = {v: k for k, v in LANGUAGE_CODE_MAP.items()}
+
diff --git a/dubbing/utils.py b/dubbing/utils.py
new file mode 100644
index 0000000..6ac9253
--- /dev/null
+++ b/dubbing/utils.py
@@ -0,0 +1,343 @@
+import sieve
+
+
+def trim_audio_into_snippets(audio_path: str, timestamps: list) -> list:
+    """
+    Trims an audio file into different snippets based on given timestamps.
+
+    NOTE(review): this module defines `trim_audio_into_snippets` a second time
+    further down; at import time that later definition replaces this one.
+
+    Parameters:
+    - audio_path: path to the input audio file.
+    - timestamps: list of (start, end) tuples in seconds, one tuple per snippet.
+
+    Returns:
+    - List of paths to the snippets; snippet `i` is written to `snippet_{i}.wav`.
+    """
+    from pydub import AudioSegment
+
+    # Load the audio file
+    audio = AudioSegment.from_file(audio_path)
+
+    # Paths of the exported snippets, in the order of `timestamps`
+    snippet_paths = []
+
+    # Loop over each timestamp pair and trim the audio
+    for i, (start, end) in enumerate(timestamps):
+        # The timestamps are in seconds; the audio is sliced in milliseconds
+        start_ms = start * 1000
+        end_ms = end * 1000
+
+        # Trim the audio
+        snippet = audio[start_ms:end_ms]
+
+        # Save the snippet to its own wav file and remember where it went
+        snippet_path = f"snippet_{i}.wav"
+        snippet.export(snippet_path, format="wav")
+        snippet_paths.append(snippet_path)
+
+    return snippet_paths
+
+
+def convert_to_format(
+    audio: sieve.Audio, output_path: str, output_format: str = "mp3"
+) -> sieve.Audio:
+    """
+    Converts a video or audio file to an mp3 or wav file.
+
+    If the input file already has the requested extension it is returned as is,
+    and nothing is written to or removed from `output_path`.
+
+    Parameters:
+    - audio: sieve.Audio whose `path` points to the input video or audio file.
+    - output_path: path to save the output audio file. An existing file is replaced.
+    - output_format: format of the output audio file, "mp3" or "wav". Default is "mp3".
+
+    Returns:
+    - new sieve.Audio for `output_path`, or the input `audio` if no conversion is needed.
+
+    Raises:
+    - ValueError: if `output_format` is neither "mp3" nor "wav".
+    """
+    from pydub import AudioSegment
+    import os
+
+    if output_format not in ["mp3", "wav"]:
+        raise ValueError(f"Unsupported output format: {output_format}. Please use one of the following: mp3, wav")
+
+    # Handle the no-op case before touching `output_path`: removing it first would delete
+    # the input itself whenever `output_path` and the input path are the same file.
+    input_path = audio.path
+    if os.path.splitext(input_path)[1] == f".{output_format}":
+        return audio
+
+    # Load the input before removing a stale output, for the same reason
+    audio_input = AudioSegment.from_file(input_path)
+    if os.path.exists(output_path):
+        os.remove(output_path)
+    # Export the audio in the desired format
+    audio_input.export(output_path, format=output_format)
+    return sieve.Audio(path=output_path)
+
+
+def match_audio_length(
+    source_audio_path: str,
+    target_audio_path: str,
+    output_path: str = "matched_audio.wav",
+) -> str:
+    """
+    Stretches or shrinks the source audio to match the length of the target audio.
+
+    Parameters:
+    - source_audio_path: path to the source audio file.
+    - target_audio_path: path to the target audio file.
+    - output_path: path to save the matched audio. Default is "matched_audio.wav".
+
+    Returns:
+    - Path to the matched audio file.
+    """
+    import librosa
+    import pyrubberband as pyrb
+    import soundfile as sf
+
+    # Load the audio files
+    source_audio, sr_source = librosa.load(source_audio_path, sr=None)
+    target_audio, sr_target = librosa.load(target_audio_path, sr=None)
+
+    # Compare durations rather than sample counts: the files may have different sample rates.
+    source_length_s = len(source_audio) / sr_source
+    target_length_s = len(target_audio) / sr_target
+    print(f"Source audio length: {source_length_s} seconds")
+    print(f"Target audio length: {target_length_s} seconds")
+
+    speed_factor = target_length_s / source_length_s
+    print(f"Speed factor: {speed_factor}")
+
+    # Stretch or shrink the source audio
+    matched_audio = pyrb.time_stretch(source_audio, sr_source, 1 / speed_factor)
+
+    # Export the matched audio
+    sf.write(output_path, matched_audio, sr_source, format="wav")
+    return output_path
+
+
+def match_audio_length_from_segment(source_audio, target_length_ms: int):
+    """
+    Stretches or shrinks the source audio to match the target length.
+
+    Parameters:
+    - source_audio: AudioSegment of the source audio file.
+    - target_length_ms: target length in milliseconds.
+
+    Returns:
+    - AudioSegment of the matched audio.
+    """
+    import pyrubberband as pyrb
+    import numpy as np
+    from pydub import AudioSegment
+
+    # The samples of all channels are interleaved, so reshape them to (frames, channels).
+    source_audio_array = np.array(source_audio.get_array_of_samples())
+    frames = source_audio_array.reshape((-1, source_audio.channels))
+
+    source_audio_length_s = len(frames) / source_audio.frame_rate
+    target_audio_length_s = target_length_ms / 1000
+    print(f"Source audio length: {source_audio_length_s} seconds")
+    print(f"Target audio length: {target_audio_length_s} seconds")
+
+    # Calculate the speed factor
+    speed_factor = target_audio_length_s / source_audio_length_s
+    print(f"Speed factor: {speed_factor}")
+
+    # Stretch or shrink the source audio as floats scaled to [-1, 1), so the scale
+    # of the samples is known when they are converted back below.
+    full_scale = float(2 ** (8 * source_audio.sample_width - 1))
+    stretched = pyrb.time_stretch(
+        frames / full_scale, source_audio.frame_rate, 1 / speed_factor
+    )
+
+    # Convert back to the integer sample type of the source: the raw bytes must match `sample_width`.
+    stretched = np.clip(np.rint(stretched * full_scale), -full_scale, full_scale - 1)
+    matched_audio = AudioSegment(
+        stretched.astype(source_audio_array.dtype).tobytes(),
+        frame_rate=source_audio.frame_rate,
+        sample_width=source_audio.sample_width,
+        channels=source_audio.channels,
+    )
+    return matched_audio
+
+
+def speed_up_audio(
+    audio_path: str, speed: float = 1.25, output_path: str = "sped_up_audio.wav"
+) -> str:
+    """
+    Speeds up an audio file by a given factor without changing the pitch.
+
+    The result is always exported as wav, whatever the extension of `output_path`.
+
+    Parameters:
+    - audio_path: path to the input audio file.
+    - speed: factor to speed up the audio by. Default is 1.25.
+    - output_path: path to save the sped-up audio. Default is "sped_up_audio.wav".
+
+    Returns:
+    - Path to the sped-up audio file.
+    """
+    from pydub import AudioSegment
+
+    # Load the audio file
+    audio = AudioSegment.from_file(audio_path)
+
+    # Speed up the audio
+    sped_up_audio = audio.speedup(playback_speed=speed)
+    # Export the sped-up audio
+    sped_up_audio.export(output_path, format="wav")
+
+    return output_path
+
+
+def trim_audio_into_snippets(audio_path: str, timestamps: list) -> list:
+    """
+    Trims an audio file into different snippets based on given timestamps.
+
+    Every snippet is exported as a wav file named `snippet_<index>.wav`, where
+    `<index>` is the position of its timestamp pair in `timestamps`.
+
+    Parameters:
+    - audio_path: path to the input audio file.
+    - timestamps: list of tuples, each containing start and end timestamps in seconds.
+
+    Returns:
+    - List of paths to each of the audio snippets, in the order of `timestamps`.
+    """
+    from pydub import AudioSegment
+
+    seconds_to_ms = 1000
+
+    # Load the audio file
+    audio = AudioSegment.from_file(audio_path)
+
+    # Initialize list to store paths to audio snippets
+    snippet_paths = []
+
+    # Loop over each timestamp pair and trim the audio
+    for index, (start_s, end_s) in enumerate(timestamps):
+        # Convert the timestamps to milliseconds and trim the audio
+        snippet = audio[start_s * seconds_to_ms:end_s * seconds_to_ms]
+
+        # Save the snippet to a new file
+        snippet_path = f"snippet_{index}.wav"
+        snippet.export(snippet_path, format="wav")
+
+        # Add the snippet path to the list
+        snippet_paths.append(snippet_path)
+
+    return snippet_paths
+
+
+def extract_audio_from_video(video_path: str, output_path: str):
+    """
+    Extracts audio from a video and saves it to an output path.
+
+    Parameters:
+    - video_path: path to the input video file.
+    - output_path: path to save the extracted audio.
+
+    Raises ValueError if the video has no audio track.
+    """
+    from moviepy.editor import VideoFileClip
+    clip = VideoFileClip(video_path)
+    try:
+        if clip.audio is None:
+            raise ValueError(f"No audio track found in video: {video_path}")
+        clip.audio.write_audiofile(output_path)
+    finally:
+        clip.close()  # release the resources held by the clip, even on failure
+
+
+def trim_silence_from_video(video_path, output_path, silence_thresh=-50.0):
+    """
+    Trims silence from the beginning and end of a video.
+
+    If no non-silent audio is found, a message is printed and no file is written.
+
+    Parameters:
+    - video_path: path to the input video file.
+    - output_path: path to save the trimmed video.
+    - silence_thresh: threshold in dB. Anything quieter than this will be considered silence.
+    """
+
+    from moviepy.editor import VideoFileClip
+    from pydub import AudioSegment
+    from pydub.silence import detect_nonsilent
+
+    # Convert video audio to pydub's AudioSegment format
+    audio = AudioSegment.from_file(video_path, codec="aac")
+
+    # Detect non-silent chunks
+    non_silence_ranges = detect_nonsilent(
+        audio, min_silence_len=1, silence_thresh=silence_thresh
+    )
+
+    if not non_silence_ranges:
+        print("All audio is silent. Nothing to trim!")
+        return
+
+    # Trim based on the first and last non-silent chunk, with a quarter second
+    # buffer on each side where possible (all values in milliseconds)
+    start_trim = max(0, non_silence_ranges[0][0] - 250)
+    end_trim = min(len(audio), non_silence_ranges[-1][1] + 250)
+
+    # Load the video file; `finally` closes it even if writing the output fails
+    clip = VideoFileClip(video_path)
+    try:
+        trimmed_clip = clip.subclip(start_trim / 1000.0, end_trim / 1000.0)  # Convert ms to seconds
+        trimmed_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
+    finally:
+        clip.close()
+
+
+def trim_silence_from_audio(audio_path, output_path, silence_thresh=-50.0):
+    """
+    Trims silence from the beginning and end of an audio.
+
+    The input is read as wav and the trimmed result is exported as wav.
+
+    Parameters:
+    - audio_path: path to the input audio file.
+    - output_path: path to save the trimmed audio.
+    - silence_thresh: threshold in dB. Anything quieter than this will be considered silence.
+    """
+
+    from pydub import AudioSegment
+
+    # Load the audio file
+    loaded = AudioSegment.from_file(audio_path, format="wav")
+
+    # Trim the silence and export the trimmed audio
+    trimmed = trim_silence_from_audio_loaded(loaded, silence_thresh=silence_thresh)
+    trimmed.export(output_path, format="wav")
+
+
+def trim_silence_from_audio_loaded(audio, silence_thresh=-50.0, buffer_ms=250):
+    """
+    Trims leading and trailing silence from an already loaded audio.
+
+    Parameters:
+    - audio: loaded audio; it is passed to pydub's `detect_nonsilent` and sliced.
+    - silence_thresh: silence threshold handed to `detect_nonsilent`.
+    - buffer_ms: amount kept before the first and after the last non-silent chunk, if possible.
+
+    Returns:
+    - The trimmed audio, or the unchanged input if no non-silent chunk is found.
+    """
+    from pydub.silence import detect_nonsilent
+
+    ranges = detect_nonsilent(audio, min_silence_len=1, silence_thresh=silence_thresh)
+    if not ranges:
+        print("All audio is silent. Nothing to trim!")
+        return audio
+
+    first_start, last_end = ranges[0][0], ranges[-1][1]
+    return audio[max(0, first_start - buffer_ms):min(len(audio), last_end + buffer_ms)]
diff --git a/text_to_speech/elevenlabs/main.py b/text_to_speech/elevenlabs/main.py
index 2220ad5..d8e58e2 100644
--- a/text_to_speech/elevenlabs/main.py
+++ b/text_to_speech/elevenlabs/main.py
@@ -124,12 +124,18 @@ def clone_audio(
 
     shutil.rmtree(temp_dir)
 
+    if response.status_code != 200:
+        raise ValueError(f"Could not clone voice. API Response:\n {response.text}")
+    
+    if "voice_id" not in response.json():
+        raise ValueError(f"Could not clone voice. Please verify that you have not hit a limit on the number of voices you can create. API Response:\n {response.text}")
+
     try:
         return response.json()
     
     except Exception as e:
         print(response.text)
-        raise ValueError("Could not generate voice", response.text)
+        raise ValueError(f"Could not generate voice. API Response: {response.text}")
 
 synthesis_metadata = sieve.Metadata(
     description="Text to speech using ElevenLabs",
@@ -144,7 +150,7 @@ def clone_audio(
 @sieve.function(
     name="elevenlabs_speech_synthesis",
     system_packages=["ffmpeg"],
-    environment_variables=[
+    environment_variables=[ 
         sieve.Env(name="ELEVEN_LABS_API_KEY", description="API key for ElevenLabs")
     ],
     metadata=synthesis_metadata