"""Dictate an OpenAI-style chat transcript in two cloned voices.

For every message an LLM is asked for a fitting emotion and pace, the text is
synthesised through the ``sieve/tts`` function using a per-role reference
voice, and the resulting clips are joined into a single WAV file.
"""
import os
import random
import re
import shutil

import sieve
from langchain.chat_models import ChatOpenAI
from pydub import AudioSegment

# Directory holding the reference recordings (``<name>.wav``) used for cloning.
VOICE_DIR = "voices"

# Emotion labels offered to the LLM; "normal" doubles as the fallback.
EMOTIONS = [
    "normal",
    "anger",
    "curiosity",
    "positivity",
    "surprise",
    "sadness",
]

# Pace labels offered to the LLM; "normal" doubles as the fallback.
PACES = [
    "normal",
    "fast",
    "slow",
]


llm = ChatOpenAI(temperature=0.7, model_name="gpt-3.5-turbo")


def _build_prompt(message):
    """Return the prompt asking the LLM for an emotion and a pace for *message*."""
    # The joins live outside the f-string: a backslash inside an f-string
    # replacement field is a SyntaxError on every Python before 3.12.
    emotion_list = "\n- ".join(EMOTIONS)
    pace_list = "\n- ".join(PACES)
    return f"""
Which of the emotions:
- {emotion_list}

And paces:
- {pace_list}

Is most appropriate to read this:
{message}
"""


def parse_emotion_pace(text):
    """Ask the LLM which emotion and pace suit *text*.

    :param text: the message that will be read aloud
    :return: ``{"emotion": ..., "pace": ...}`` with values taken from
        ``EMOTIONS`` and ``PACES``; both fall back to ``"normal"`` when the
        reply names no other option
    """
    response = llm.invoke(_build_prompt(text)).content

    # Tokenise on letters only so punctuation and newlines around a label
    # ("anger,", "fast.") do not prevent a match.
    words = re.findall(r"[a-z]+", response.lower())

    emotion = "normal"
    pace = "normal"

    for word in words:
        # "normal" is in both lists and is already the default; letting it
        # match would overwrite a real choice made for the other dimension.
        if word == "normal":
            continue
        if word in EMOTIONS:
            emotion = word
        if word in PACES:
            pace = word

    return {"emotion": emotion, "pace": pace}


def intonate(messages):
    """Yield a copy of each message extended with ``emotion`` and ``pace`` keys.

    :param messages: iterable of dicts carrying the spoken text under
        ``"message"``
    """
    for message in messages:
        yield {**message, **parse_emotion_pace(message["message"])}


def dictate_messages(
    messages: list,
    alice: str = "obama",
    bob: str = "neil_degrasse",
):
    """Synthesise every non-system message and yield the clips in order.

    :param messages: list of ``{"role": ..., "message": ...}`` dicts
    :param alice: name of the reference voice (``<VOICE_DIR>/<alice>.wav``)
        used for ``"user"`` messages
    :param bob: name of the reference voice used for ``"assistant"`` messages
    :return: generator over the results of the ``sieve/tts`` jobs, one per
        spoken message, in conversation order
    :raises KeyError: for a role other than ``"system"``, ``"user"`` or
        ``"assistant"``
    """
    tts = sieve.function.get("sieve/tts")
    backend = "cartesia-voice-cloning"

    ref_voices = {
        "user": sieve.File(path=os.path.join(VOICE_DIR, alice + ".wav")),
        "assistant": sieve.File(path=os.path.join(VOICE_DIR, bob + ".wav")),
    }

    # System messages are never spoken; dropping them before intonation
    # avoids paying for an LLM call whose result would be thrown away.
    spoken = [message for message in messages if message["role"] != "system"]

    # Push every job before waiting on any of them so synthesis overlaps.
    voice_jobs = [
        tts.push(
            voice=backend,
            text=message["message"],
            reference_audio=ref_voices[message["role"]],
            emotion=message["emotion"],
            pace=message["pace"],
        )
        for message in intonate(spoken)
    ]

    for job in voice_jobs:
        yield job.result()


def _natural_key(name):
    """Sort key ordering embedded numbers numerically (audio_2 before audio_10)."""
    # re.split with a capture group alternates text / digits, so odd
    # positions are always the numeric runs.
    return [
        int(part) if index % 2 else part
        for index, part in enumerate(re.split(r"(\d+)", name))
    ]


def concatenate_audio_files(directory):
    """Join every ``.wav`` file in *directory* into ``concatenated.wav``.

    Files are taken in natural order of their names, and a pause of roughly
    half a second follows each clip.

    :param directory: folder containing the clips to join
    :return: ``sieve.File`` pointing at the combined recording, which is
        written to the current working directory
    """
    print(f"Concatenating audio files in {directory}")
    output_file = "concatenated.wav"

    # A plain lexicographic sort would put audio_10.wav before audio_2.wav
    # and scramble any conversation with more than ten clips.
    audio_files = sorted(
        (name for name in os.listdir(directory) if name.endswith(".wav")),
        key=_natural_key,
    )

    combined = AudioSegment.empty()

    for file_name in audio_files:
        audio = AudioSegment.from_wav(os.path.join(directory, file_name))

        # add pause of around half a second; noise of 50ms makes it sound more natural
        noise = random.gauss(0, 50)
        pause = AudioSegment.silent(duration=500 + noise)

        combined += audio + pause

    combined.export(output_file, format="wav")

    return sieve.File(path=output_file)


def main(messages):
    """Dictate *messages* and return the combined recording.

    The ``convo`` working directory is wiped and recreated on every run, then
    filled with one ``audio_<i>.wav`` clip per spoken message.

    :param messages: list of ``{"role": ..., "message": ...}`` dicts
    :return: the ``sieve.File`` produced by ``concatenate_audio_files``
    """
    convo_dir = "convo"

    shutil.rmtree(convo_dir, ignore_errors=True)
    os.makedirs(convo_dir, exist_ok=True)
    for i, audio in enumerate(dictate_messages(messages)):
        # Build the path from convo_dir so the directory name lives in one place.
        target = os.path.join(convo_dir, f"audio_{i}.wav")
        shutil.move(audio.path, target)
        print(f"Audio {i} saved to {target}")

    return concatenate_audio_files(convo_dir)


if __name__ == "__main__":

    messages = [
        {"role": "user", "message": "Hello, how are you?"},
        {"role": "assistant", "message": "I'm doing well, thank you for asking."},
        {"role": "user", "message": "What is the capital of France?"},
        {"role": "assistant", "message": "The capital of France is Paris."},
    ]

    main(messages)
(podcast-esque) from research papers and books. + +From the web page: +> Illuminate is an experimental technology that uses AI to adapt content to your learning preferences. Illuminate generates audio with two AI-generated voices in conversation, discussing the key points of select papers. Illuminate is currently optimized for published computer science academic papers. +> As an experimental product, the generated audio with two AI-generated voices in conversation may not always perfectly capture the nuances of the original research papers. Please be aware that there may be occasional errors or inconsistencies and that we are continually iterating to improve the user experience. + +While they don't go into detail on its inner workings, it's reasonable to believe that it's a pipeline of some sort with two parts: +- A text-based language model pipeline to extract information and write a transcript +- An audio pipeline to read the transcript and make a dialogue + +The text-based pipeline would be responsible for taking in a PDF or ebook, and extracting key information using a technique like RAG, then generating talking points and dialogue from that information. + +The audio examples on the Illuminate web page tend to follow an interview-style format along the lines of +- introduce the content +- question +- in-depth response +- reaction or comment, follow-up question + +The examples are pleasant, factual, and use analogies to explain the content in a pleasing way. Designing a language model program to turn raw information into a human-like dialogue is a tricky task, and outside the scope of this post! + +Here we're going to attack the second part of the pipeline: generating spoken dialogue from a written conversation. At the end of the day, you'll probably end up with an OpenAI chat schema with a conversation in it. 
"""Download the reference voice clips used for voice cloning.

Each entry of ``ALL_VOICES`` names a YouTube video and a start offset; the
video is fetched through the ``sieve/youtube_to_mp4`` function and a short
excerpt is saved as ``<VOICE_DIR>/<name>.wav``.
"""
import json
import os
import shutil
import subprocess
import tempfile

import sieve


# Directory the reference recordings are written to (relative to the CWD).
VOICE_DIR = "voices"

# "time" is the offset, in seconds, at which the excerpt starts in the video.
ALL_VOICES = [
    {
        "name": "neil_degrasse",
        "link": "https://www.youtube.com/watch?v=JtahB1-MNvk",
        "time": 2*60 + 14
    },
    {
        "name": "obama",
        "link": "https://www.youtube.com/watch?v=X15o2sG8HF4",
        "time": 1*60 + 5
    },
    {
        "name": "oprah",
        "link": "https://www.youtube.com/watch?v=Nk3s5SvIQ7o",
        "time": 1*60 + 40
    },
]


def save_voice(yt_link: str, name: str):
    """Download *yt_link* and store the video as ``<VOICE_DIR>/<name>.mp4``.

    :param yt_link: URL of the YouTube video
    :param name: base name of the saved file
    """
    os.makedirs(VOICE_DIR, exist_ok=True)

    youtube_dl = sieve.function.get("sieve/youtube_to_mp4")
    video = youtube_dl.run(yt_link, resolution="lowest-available")

    shutil.move(video.path, os.path.join(VOICE_DIR, name + ".mp4"))


def trim_video_to_wav(video: sieve.File, start_time: int, duration: int = 60):
    """Extract an audio excerpt of *video* into a temporary WAV file.

    :param video: the downloaded video
    :param start_time: offset in seconds at which the excerpt begins
    :param duration: length of the excerpt in seconds
    :return: ``sieve.File`` pointing at the temporary 44.1 kHz stereo WAV
    :raises subprocess.CalledProcessError: if ffmpeg exits with an error
    """
    # mkstemp is the public way to reserve a unique temporary name; the
    # descriptor is closed immediately because ffmpeg writes the file itself.
    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    command = [
        'ffmpeg',
        '-y',  # the placeholder created by mkstemp must be overwritten
        '-ss', str(start_time),
        '-t', str(duration),
        '-i', video.path,
        '-nostdin',
        '-loglevel', 'error',
        '-vn',  # Disable video output
        '-acodec', 'pcm_s16le',  # WAV format
        '-ar', '44100',  # Audio sampling rate
        '-ac', '2',  # Stereo
        output_path
    ]

    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError):
        # Do not leave the empty placeholder behind when ffmpeg fails or is missing.
        os.remove(output_path)
        raise

    return sieve.File(path=output_path)


def get_voices():
    """Download every voice in ``ALL_VOICES`` that is not already on disk.

    All download jobs are pushed first and awaited afterwards so they can
    run concurrently.
    """
    os.makedirs(VOICE_DIR, exist_ok=True)

    youtube_dl = sieve.function.get("sieve/youtube_to_mp4")

    # Keep each job next to the voice it belongs to: zipping the job list
    # against ALL_VOICES would pair jobs with the wrong voices as soon as an
    # earlier voice is skipped because its file already exists.
    pending = []
    for voice in ALL_VOICES:
        if os.path.exists(os.path.join(VOICE_DIR, voice["name"] + ".wav")):
            print(f"Voice {voice['name']} already exists")
            continue

        job = youtube_dl.push(
            voice["link"],
            resolution="lowest-available"
        )
        pending.append((voice, job))
        print(f"Downloading {voice['name']} voice")

    for voice, job in pending:
        video = job.result()

        audio = trim_video_to_wav(video, voice["time"])

        shutil.move(audio.path, os.path.join(VOICE_DIR, voice["name"] + ".wav"))
        print(f"Saved {voice['name']} voice to {VOICE_DIR}")


if __name__ == "__main__":
    get_voices()
sieve.function.get("sieve/sam2") @@ -90,7 +91,7 @@ def segment(file: sieve.File, object_name: str): file=file, prompts=[sam_prompt], model_type="tiny", - debug_masks=False + debug_masks=return_mp4 ) return sam_out