-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
160 lines (141 loc) · 5.81 KB
/
main.py
File metadata and controls
160 lines (141 loc) · 5.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import json
from gtts import gTTS
from io import BytesIO
from pydub import AudioSegment
from pydub.playback import play
import requests
import argparse
import queue
import sys
import sounddevice as sd
import google.generativeai as genai
import io
import threading
from vosk import Model, KaldiRecognizer
# State shared between the audio-input callback and the rest of the program.
# tts_playing_event is set while text_to_speech() is running; the callback
# checks it and skips queueing microphone blocks during that time.
tts_playing_event: threading.Event = threading.Event()
# q carries the raw audio blocks (as bytes) that the callback captured and the
# main loop consumes.
q: queue.Queue = queue.Queue()
def int_or_str(text):
    """Argparse ``type=`` helper: return an int if *text* parses as one.

    Returns ``int(text)`` when the conversion succeeds; when ``int`` raises
    ``ValueError`` the original *text* is returned unchanged.
    """
    try:
        parsed = int(text)
    except ValueError:
        return text
    return parsed
def callback(indata, frames, time, status):
    """Handle one audio block; called (from a separate thread) per block.

    A truthy *status* is printed to stderr. The block is copied to ``bytes``
    and pushed onto the global queue ``q`` unless ``tts_playing_event`` is
    set, in which case the block is dropped. *frames* and *time* are unused.
    """
    if status:
        print(status, file=sys.stderr)
    if tts_playing_event.is_set():
        # Speech playback is in progress: do not queue this block.
        return
    q.put(bytes(indata))
def load_config(config_file):
    """Load configuration settings from a JSON file.

    Args:
        config_file: Path of the JSON file to read.

    Returns:
        The decoded JSON document (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than the platform's locale encoding,
    # so non-ASCII values in the config are read the same way everywhere.
    with open(config_file, "r", encoding="utf-8") as file:
        return json.load(file)
def initialize_ai_model(api_key):
    """Configure the ``genai`` client with *api_key* and build the chat model.

    Returns a ``genai.GenerativeModel`` for 'gemini-1.5-flash-latest' whose
    system instruction is the "Luna" phone-assistant prompt below.
    """
    # Persona prompt passed to the model as its system instruction.
    luna_prompt = (
        "You are Luna, an assistant to take phone calls from customers for <REDACTED> "
        "Your job is to greet customers, understand their needs, provide troubleshooting help if possible, and offer information about additional services, "
        "especially mobile plans with <REDACTED>. When encountering situations where immediate assistance isn’t possible, you will pass the call to the senior team. "
        "Keep your responses casual and human-like, including natural speech patterns and pauses. Now you are ready to take calls. Keep responses short and sweet."
    )
    genai.configure(api_key=api_key)
    return genai.GenerativeModel(
        model_name='gemini-1.5-flash-latest',
        system_instruction=luna_prompt,
    )
def process_text(chat, text):
    """Send *text* to *chat* and return the ``text`` attribute of the reply.

    *chat* is any object exposing ``send_message(text)``; in this file it is
    the session returned by ``model.start_chat``.
    """
    return chat.send_message(text).text
def text_to_speech(text, api_key, voice_id):
    """Convert *text* to speech with the ElevenLabs API and play it.

    ``tts_playing_event`` is set for the duration of the request and the
    playback so the audio callback stops queueing microphone input, and it
    is always cleared again on the way out, even when the request, the audio
    decoding or the playback raises.

    Args:
        text: The text to synthesize.
        api_key: ElevenLabs API key, sent in the ``xi-api-key`` header.
        voice_id: ElevenLabs voice identifier, used in the request URL.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        Any exception raised while decoding or playing the audio also
        propagates to the caller.
    """
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
    payload = {
        "text": text,
        "model_id": "eleven_turbo_v2",
        "voice_settings": {
            "stability": 0.30,
            "similarity_boost": 0.75,
            "style": 1.0,
            "use_speaker_boost": True,
        },
    }
    headers = {
        "Content-Type": "application/json",
        "xi-api-key": api_key,
    }

    tts_playing_event.set()
    try:
        # (connect, read) timeout: without one, requests can block forever on
        # a stalled connection while the microphone stays muted.
        response = requests.post(
            url, json=payload, headers=headers, timeout=(10, 60)
        )
        if response.status_code == 200:
            audio = AudioSegment.from_file(io.BytesIO(response.content))
            play(audio)
        else:
            print(f"Failed to generate speech: {response.status_code} - {response.text}")
    finally:
        # Must run on every path; otherwise the callback would keep dropping
        # audio for the rest of the process.
        tts_playing_event.clear()
def main():
    """Run the voice assistant: microphone -> Vosk -> AI chat -> speech.

    Parses the command-line options, loads the JSON config (API keys and an
    optional voice id), starts a chat session, then reads audio blocks from
    the global queue ``q`` in an endless loop. Each time the recognizer
    reports a completed utterance with non-empty text, that text is sent to
    the chat model and the reply is spoken via ``text_to_speech``. When
    ``--filename`` is given, every audio block taken from the queue is also
    appended to that file.

    Exits with status 0 on Ctrl+C or after ``--list-devices``; any other
    exception inside the audio loop exits with a ``Type: message`` string.
    """
    parser = argparse.ArgumentParser(description="Voice Interaction AI")
    parser.add_argument("-c", "--config", type=str, default="config.json", help="Path to the config file")
    parser.add_argument("-f", "--filename", type=str, metavar="FILENAME", help="Audio file to store recording to")
    parser.add_argument("-d", "--device", type=int_or_str, help="Input device (numeric ID or substring)")
    parser.add_argument("-r", "--samplerate", type=int, help="Sampling rate")
    parser.add_argument("-m", "--model", type=str, help="Language model; e.g. en-us, fr, nl; default is en-us")
    parser.add_argument("-l", "--list-devices", action="store_true", help="Show list of audio devices and exit")
    args = parser.parse_args()
    # Gradio - python to UI
    if args.list_devices:
        print(sd.query_devices())
        sys.exit(0)

    # Load configuration
    config = load_config(args.config)
    google_api_key = config["google_api_key"]
    elevenlabs_api_key = config["elevenlabs_api_key"]
    elevenlabs_voice_id = config.get("elevenlabs_voice_id", "Iu3tg76F3g64V36OrFVV")

    # Initialize the AI model
    model = initialize_ai_model(google_api_key)
    chat = model.start_chat(history=[])

    # Bind before the try block so the finally clause can always test it,
    # even when device lookup or model loading fails before the file is opened.
    dump_fn = None
    try:
        if args.samplerate is None:
            # Fall back to the input device's default sampling rate.
            device_info = sd.query_devices(args.device, "input")
            args.samplerate = int(device_info["default_samplerate"])

        vosk_model = Model(lang="en-us" if args.model is None else args.model)

        if args.filename:
            dump_fn = open(args.filename, "wb")

        with sd.RawInputStream(samplerate=args.samplerate, blocksize=8000, device=args.device, dtype="int16", channels=1, callback=callback):
            print("#" * 80)
            print("Press Ctrl+C to stop the recording")
            print("#" * 80)
            rec = KaldiRecognizer(vosk_model, args.samplerate)
            while True:
                data = q.get()
                if rec.AcceptWaveform(data):
                    result_string = rec.Result()
                    print(result_string)
                    try:
                        result_dict = json.loads(result_string)
                    except json.JSONDecodeError:
                        # Treat an unparsable result as "nothing recognized".
                        result_dict = {}
                    text_data = result_dict.get("text", "")
                    if text_data:
                        print("Speech Text:", text_data)
                        ai_response = process_text(chat, text_data)
                        print("AI response:", ai_response)
                        text_to_speech(ai_response, elevenlabs_api_key, elevenlabs_voice_id)
                # Record every block taken from the queue, recognized or not.
                if dump_fn is not None:
                    dump_fn.write(data)
    except KeyboardInterrupt:
        print("\nDone")
        sys.exit(0)
    except Exception as e:
        sys.exit(f"{type(e).__name__}: {e}")
    finally:
        if dump_fn is not None:
            dump_fn.close()
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()