-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexample.py
More file actions
63 lines (52 loc) · 2.41 KB
/
example.py
File metadata and controls
63 lines (52 loc) · 2.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from ebook_reader_supertonic import SupertonicTTS, VOICE_STYLES, MIN_SPEED, MAX_SPEED, MIN_STEPS, MAX_STEPS
import numpy as np
def main():
"""
This example demonstrates how to use the ebook_reader_supertonic package with various settings.
"""
# 1. Initialize the engine
# This will automatically download models to ~/.cache/ebook_reader_supertonic if not present.
engine = SupertonicTTS()
# 2. Basic Synthesis parameters
text = "Supertonic TTS is a high-quality flow-matching based speech synthesis system."
# Selecting a voice style (you can use VOICE_STYLES[i].id or just strings like 'F1', 'M3')
voice_choice = VOICE_STYLES[4] # F5: Professional and attractive
print(f"Synthesizing using: {voice_choice.name} ({voice_choice.description})")
# 3. Call synthesize
# Parameters:
# - text: The string to speak
# - voice: Voice ID (F1-F5, M1-M5)
# - steps: Number of diffusion steps (Higher = better quality, Lower = faster)
# - speed: Speech rate (lower is slower, higher is faster)
audio, sample_rate, word_timestamps = engine.synthesize(
text=text,
voice=voice_choice.id,
steps=MAX_STEPS, # Using 14 steps for best quality
speed=1.0 # Normal speed
)
# 4. Understanding the Output
# - audio: A NumPy array (float32) of normalized audio samples (-1 to 1).
# - sample_rate: The sampling rate (always 44100 Hz for Supertonic).
# - word_timestamps: A list of dicts: [{'word': str, 'start': float, 'end': float}]
duration = len(audio) / sample_rate
print(f"\n--- Output Info ---")
print(f"Audio Buffer Type: {type(audio)}")
print(f"Sample Rate: {sample_rate} Hz")
print(f"Total Duration: {duration:.2f} seconds")
# 5. Extracting Word Metadata
print("\n--- Word-by-Word Timestamps ---")
for ts in word_timestamps:
word = ts['word']
start = ts['start']
end = ts['end']
word_duration = end - start
print(f"[{start:5.2f}s -> {end:5.2f}s] {word:12} (Length: {word_duration:.2f}s)")
# 6. Saving the output
output_file = "comprehensive_example.wav"
engine.save_wav(audio, output_file)
print(f"\nSaved audio to: {output_file}")
# 7. Speed limits example
print(f"\nRecommended speed range: {MIN_SPEED} to {MAX_SPEED}")
print(f"Recommended steps range: {MIN_STEPS} to {MAX_STEPS}")
if __name__ == "__main__":
main()