Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions illuminate/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
This has not been turned into a Sieve function yet. It dictates an OpenAI-style chat in two given voices, with contextual emotion and pacing.


Setup:
```
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
python scripts/get_voices.py
```

Run local example:
```
python dictate.py
```
147 changes: 147 additions & 0 deletions illuminate/dictate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import os
import shutil
import random

import sieve
from langchain.chat_models import ChatOpenAI
from pydub import AudioSegment

# Directory holding the reference voice recordings, one "<name>.wav" per voice.
VOICE_DIR = "voices"

# Emotion labels the LLM reply is matched against.
EMOTIONS = ["normal", "anger", "curiosity", "positivity", "surprise", "sadness"]

# Pace labels the LLM reply is matched against.
PACES = ["normal", "fast", "slow"]


# Shared chat model used to pick an emotion and pace for each message.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)


def parse_emotion_pace(text):
    """Ask the LLM which emotion and pace best suit reading ``text`` aloud.

    The reply is scanned for the labels listed in ``EMOTIONS`` and ``PACES``.
    If a category is mentioned several times the last mention wins; if it is
    not mentioned at all the label stays ``"normal"``.

    :param text: the message that will be narrated
    :return: dict with the keys ``"emotion"`` and ``"pace"``
    """
    default = "normal"

    # Build the bullet lists outside the f-string: a backslash inside an
    # f-string expression is a SyntaxError before Python 3.12.
    emotion_options = "\n- ".join(EMOTIONS)
    pace_options = "\n- ".join(PACES)
    prompt = (
        "\n"
        "Which of the emotions:\n"
        f"- {emotion_options}\n"
        "\n"
        "And paces:\n"
        f"- {pace_options}\n"
        "\n"
        "Is most appropriate to read this:\n"
        f"{text}\n"
        "\n"
        "Reply with just one emotion and one pace.\n"
    )
    response = llm.invoke(prompt).content

    # Split on any whitespace (not only single spaces) and trim surrounding
    # punctuation so replies like "Emotion: anger,\nPace: fast." still match.
    words = [
        word.strip(".,;:!?\"'()[]{}*_-`")
        for word in response.lower().split()
    ]

    emotion = default
    pace = default

    for word in words:
        # "normal" is both an emotion and a pace, so seeing it says nothing
        # about which category was meant. It is already the default, so skip
        # it rather than let it overwrite a label found earlier.
        if word == default:
            continue
        if word in EMOTIONS:
            emotion = word
        elif word in PACES:
            pace = word

    return {"emotion": emotion, "pace": pace}


def intonate(messages):
    """Lazily annotate each message with an emotion and a pace.

    :param messages: iterable of dicts that each carry a ``"message"`` text
    :return: generator of new dicts holding the original keys plus the
        entries returned by ``parse_emotion_pace`` for that text
    """
    yield from (
        {**entry, **parse_emotion_pace(entry['message'])}
        for entry in messages
    )


def dictate_messages(
    messages: list,
    alice: str = "obama",
    bob: str = "neil_degrasse",
):
    """Narrate a chat transcript with one reference voice per speaker.

    ``"user"`` messages are read with the ``alice`` voice and ``"assistant"``
    messages with the ``bob`` voice; ``"system"`` messages are skipped.

    :param messages: chat messages, each a dict with ``"role"`` and ``"message"``
    :param alice: name of the ``.wav`` file in ``VOICE_DIR`` used for the user
    :param bob: name of the ``.wav`` file in ``VOICE_DIR`` used for the assistant
    :return: generator yielding the result of each ``sieve/tts`` job, in
        message order
    :raises KeyError: if a message has a role other than ``"user"``,
        ``"assistant"`` or ``"system"``
    """
    tts = sieve.function.get("sieve/tts")
    backend = "cartesia-voice-cloning"

    ref_voices = {
        "user": sieve.File(path=os.path.join(VOICE_DIR, alice + ".wav")),
        "assistant": sieve.File(path=os.path.join(VOICE_DIR, bob + ".wav")),
    }

    # Drop system messages *before* intonating: they are never spoken, so
    # classifying them would only spend an LLM call on a discarded result.
    spoken = (message for message in messages if message["role"] != "system")

    # Push every job before reading any result, presumably so that the
    # narrations run concurrently instead of one after another.
    voice_jobs = [
        tts.push(
            voice=backend,
            text=message["message"],
            reference_audio=ref_voices[message["role"]],
            emotion=message["emotion"],
            pace=message["pace"],
        )
        for message in intonate(spoken)
    ]

    for job in voice_jobs:
        yield job.result()


def _natural_sort_key(filename):
    """Return a sort key that orders embedded numbers by value, not by text."""
    import re

    # re.split with a capturing group alternates text, digits, text, ... so
    # every odd index is a run of digits.
    parts = re.split(r"([0-9]+)", filename)
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]


def concatenate_audio_files(directory):
    """Join every ``.wav`` file in ``directory`` into ``concatenated.wav``.

    Files are joined in natural order (``audio_2.wav`` before
    ``audio_10.wav``), each followed by a pause of roughly half a second.

    :param directory: folder containing the ``.wav`` clips to join
    :return: ``sieve.File`` pointing at the exported ``concatenated.wav``
    """
    print(f"Concatenating audio files in {directory}")
    output_file = "concatenated.wav"

    # A plain string sort would place "audio_10.wav" before "audio_2.wav" and
    # scramble any conversation with more than ten clips.
    audio_files = sorted(
        (name for name in os.listdir(directory) if name.endswith('.wav')),
        key=_natural_sort_key,
    )

    combined = AudioSegment.empty()

    for file in audio_files:
        audio = AudioSegment.from_wav(os.path.join(directory, file))

        # add pause of around half a second; noise of 50ms makes it sound more natural
        noise = random.gauss(0, 50)
        pause = AudioSegment.silent(duration=500 + noise)

        combined += audio + pause

    combined.export(output_file, format="wav")

    return sieve.File(path=output_file)


def main(messages):
    """Narrate ``messages`` clip by clip and return the joined recording.

    The ``convo`` directory is emptied first, each narrated clip is moved
    into it as ``audio_<i>.wav``, and the clips are then concatenated.

    :param messages: chat messages passed on to ``dictate_messages``
    :return: whatever ``concatenate_audio_files`` returns for ``convo``
    """
    convo_dir = "convo"

    # Start every run from an empty output directory.
    shutil.rmtree(convo_dir, ignore_errors=True)
    os.makedirs(convo_dir, exist_ok=True)

    narrated_clips = dictate_messages(messages)
    for index, clip in enumerate(narrated_clips):
        destination = os.path.join("convo", f"audio_{index}.wav")
        shutil.move(clip.path, destination)
        print(f"Audio {index} saved to convo/audio_{index}.wav")

    return concatenate_audio_files(convo_dir)


if __name__ == "__main__":
    # Small two-speaker exchange used as the local example run.
    example_turns = [
        ("user", "Hello, how are you?"),
        ("assistant", "I'm doing well, thank you for asking."),
        ("user", "What is the capital of France?"),
        ("assistant", "The capital of France is Paris."),
    ]
    messages = [{"role": role, "message": text} for role, text in example_turns]

    main(messages)



52 changes: 52 additions & 0 deletions illuminate/post.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@


Recently, Google released [Illuminate](https://illuminate.google.com/home), an experimental tool for generating engaging audio content (podcast-esque) from research papers and books.

From the web page:
> Illuminate is an experimental technology that uses AI to adapt content to your learning preferences. Illuminate generates audio with two AI-generated voices in conversation, discussing the key points of select papers. Illuminate is currently optimized for published computer science academic papers.
> As an experimental product, the generated audio with two AI-generated voices in conversation may not always perfectly capture the nuances of the original research papers. Please be aware that there may be occasional errors or inconsistencies and that we are continually iterating to improve the user experience.

While they don't go into detail on its inner workings, it's reasonable to believe that it's a pipeline of some sort with two parts:
- A text-based language model pipeline to extract information and write a transcript
- An audio pipeline to read the transcript and make a dialogue

The text-based pipeline would be responsible for taking in a PDF or ebook, and extracting key information using a technique like RAG, then generating talking points and dialogue from that information.

The audio examples on the Illuminate web page tend to follow an interview-style format along the lines of
- introduce the content
- question
- in-depth response
- reaction or comment, follow-up question

The examples are pleasant, factual, and use analogies to explain the content in a pleasing way. Designing a language model program to turn raw information into a human-like dialogue is a tricky task, and outside of the scope of this post!

Here we're going to attack the second part of the pipeline: generating spoken dialogue from the written conversation. At the end of the day, you'll probably end up with a conversation in an OpenAI-style chat schema. Something that looks like this:
```python
messages = [
{"role": "user", "message": "Hello, how are you?"},
{"role": "assistant", "message": "I'm doing well, thank you for asking."},
{"role": "user", "message": "What is the capital of France?"},
{"role": "assistant", "message": "The capital of France is Paris."},
]
```

The task is to generate spoken audio with two voices from this dialogue. The easy part of the task is to
- clone two voices
- read "user" messages in one voice, read "assistant" messages in the other
- add pauses in between dictations

The harder part of the task is to _contextually_ adjust
- intonation
- speed
- pauses

We'll accomplish this super easily with `sieve/tts`, which narrates text given a reference voice, and gives you control over emotion, pacing, and other granular parameters.

For each message, we'll
- predict a reasonable emotion and pace with an LLM






7 changes: 7 additions & 0 deletions illuminate/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
langchain
langchain-community
pypdf
openai
sievedata
tiktoken
pydub
93 changes: 93 additions & 0 deletions illuminate/scripts/get_voices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import sieve
import os
import shutil
import subprocess
import tempfile
import json


# Directory the trimmed reference recordings are written to.
VOICE_DIR = "voices"

# One entry per voice: its name, the source video, and the offset in seconds
# at which the extracted clip starts.
ALL_VOICES = [
    {"name": "neil_degrasse", "link": "https://www.youtube.com/watch?v=JtahB1-MNvk", "time": 2 * 60 + 14},
    {"name": "obama", "link": "https://www.youtube.com/watch?v=X15o2sG8HF4", "time": 1 * 60 + 5},
    {"name": "oprah", "link": "https://www.youtube.com/watch?v=Nk3s5SvIQ7o", "time": 1 * 60 + 40},
]


def save_voice(yt_link: str, name: str):
    """Download a YouTube video and store it as ``<name>.mp4`` in ``VOICE_DIR``.

    :param yt_link: URL of the video to download
    :param name: file name (without extension) to save the video under
    """
    os.makedirs(VOICE_DIR, exist_ok=True)

    downloader = sieve.function.get("sieve/youtube_to_mp4")
    downloaded = downloader.run(yt_link, resolution="lowest-available")

    target_path = os.path.join(VOICE_DIR, name + ".mp4")
    shutil.move(downloaded.path, target_path)


def trim_video_to_wav(video: sieve.File, start_time: int, duration: int = 60):
    """Extract part of a video's audio track into a temporary WAV file.

    :param video: file whose ``path`` points at the video to read
    :param start_time: offset in seconds at which the clip starts
    :param duration: length of the clip in seconds (60 by default)
    :return: ``sieve.File`` pointing at the 44.1 kHz stereo 16-bit WAV clip
    :raises subprocess.CalledProcessError: if ffmpeg exits with an error
    """
    # mkstemp reserves a unique path through the public tempfile API; only
    # the path is needed, so the descriptor is closed right away.
    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    command = [
        'ffmpeg',
        '-y',  # the placeholder created by mkstemp already exists
        '-ss', str(start_time),
        '-t', str(duration),
        '-i', video.path,
        '-nostdin',
        '-loglevel', 'error',
        '-vn',  # Disable video output
        '-acodec', 'pcm_s16le',  # WAV format
        '-ar', '44100',  # Audio sampling rate
        '-ac', '2',  # Stereo
        output_path,
    ]

    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError):
        # Don't leave a stray temp file behind when ffmpeg fails or is missing.
        if os.path.exists(output_path):
            os.remove(output_path)
        raise

    return sieve.File(path=output_path)


def get_voices():
    """Download and trim the reference clip for every voice in ``ALL_VOICES``.

    Voices whose ``.wav`` file already exists in ``VOICE_DIR`` are skipped.
    All downloads are pushed first; each result is then trimmed with
    ``trim_video_to_wav`` and moved to ``<VOICE_DIR>/<name>.wav``.
    """
    os.makedirs(VOICE_DIR, exist_ok=True)

    youtube_dl = sieve.function.get("sieve/youtube_to_mp4")

    # Keep each job together with the voice it belongs to. Zipping the job
    # list against ALL_VOICES would pair jobs with the wrong voices as soon
    # as an earlier voice is skipped.
    pending = []
    for voice in ALL_VOICES:
        wav_path = os.path.join(VOICE_DIR, voice["name"] + ".wav")
        if os.path.exists(wav_path):
            print(f"Voice {voice['name']} already exists")
            continue

        job = youtube_dl.push(
            voice["link"],
            resolution="lowest-available"
        )
        pending.append((voice, wav_path, job))
        print(f"Downloading {voice['name']} voice")

    for voice, wav_path, job in pending:
        video = job.result()

        audio = trim_video_to_wav(video, voice["time"])

        shutil.move(audio.path, wav_path)
        print(f"Saved {voice['name']} voice to {VOICE_DIR}")



# Script entry point: fetch every reference voice that is not on disk yet.
if __name__ == "__main__":
    get_voices()
5 changes: 3 additions & 2 deletions text_to_segment/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,11 @@ def get_object_bbox(image: sieve.File, object_name: str):
],
metadata=metadata
)
def segment(file: sieve.File, object_name: str):
def segment(file: sieve.File, object_name: str, return_mp4: bool = False):
"""
:param file: photo or video to segment
:param object_name: the object you wish to segment
:param return_mp4: if True, return only an MP4 video of the segmentation masks
"""
sam = sieve.function.get("sieve/sam2")

Expand All @@ -90,7 +91,7 @@ def segment(file: sieve.File, object_name: str):
file=file,
prompts=[sam_prompt],
model_type="tiny",
debug_masks=False
debug_masks=return_mp4
)

return sam_out
Expand Down