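"""Voice Agent Live: a Gradio app that records speech, transcribes it with
Groq's Whisper endpoint, asks a Llama chat model for a one-sentence reply, and
speaks the answer back with Edge TTS. A FastRTC WebRTC stream is used when the
library is available; otherwise the app falls back to a record/upload UI.
Requires a GROQ_API_KEY environment variable. Run locally with: python app.py
"""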
import gradio as gr
import os
import numpy as np
import librosa
import asyncio
import edge_tts
import soundfile as sf
from groq import Groq
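# FastRTC is optional: it provides the real-time WebRTC voice path. If the
# import fails, the app falls back to the record/upload interface defined below.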
try:
    from fastrtc import WebRTC, ReplyOnPause, get_hf_turn_credentials
    FASTRTC_AVAILABLE = True
except ImportError:
    FASTRTC_AVAILABLE = False
    print("FastRTC not available, using fallback UI")
# Initialize the Groq client (reads GROQ_API_KEY from the environment)
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
async def text_to_speech_logic(text):
    # Synthesize the reply with Edge TTS, then reload it as a 16 kHz array
    communicate = edge_tts.Communicate(text, "en-US-AndrewNeural")
    await communicate.save("temp_op.mp3")
    audio, sr = librosa.load("temp_op.mp3", sr=16000)
    # Ensure audio is in the correct shape (1, samples) for FastRTC
    if len(audio.shape) == 1:
        audio = audio.reshape(1, -1)
    return sr, audio
def process_audio(audio: tuple[int, np.ndarray]):
    sr, y = audio
    # FastRTC audio can be (samples, channels); downmix to mono (samples,)
    if len(y.shape) > 1:
        y = y.mean(axis=1)
    sf.write("input.wav", y, sr)
    # Transcribe the utterance with Groq's Whisper endpoint
    with open("input.wav", "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=("input.wav", file.read()),
            model="whisper-large-v3-turbo",
        )
    # Ask the chat model for a short reply
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {"role": "system", "content": "You are a concise voice assistant. Give 1-sentence answers."},
            {"role": "user", "content": transcription.text},
        ],
    )
    reply_text = response.choices[0].message.content
    # Yield a single (sample_rate, audio) chunk back to the WebRTC stream
    yield asyncio.run(text_to_speech_logic(reply_text))
# Fallback function for the regular record/upload interface
def process_audio_file(audio_file):
    if audio_file is None:
        return None, "Please record or upload audio"
    # Load audio at 16 kHz and write it to a temporary WAV file
    y, sr = librosa.load(audio_file, sr=16000)
    sf.write("input.wav", y, sr)
    # Transcribe
    with open("input.wav", "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=("input.wav", file.read()),
            model="whisper-large-v3-turbo",
        )
    # Get response
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {"role": "system", "content": "You are a concise voice assistant. Give 1-sentence answers."},
            {"role": "user", "content": transcription.text},
        ],
    )
    reply_text = response.choices[0].message.content
    # Generate speech; text_to_speech_logic saves the file returned below
    asyncio.run(text_to_speech_logic(reply_text))
    return "temp_op.mp3", f"**You said:** {transcription.text}\n\n**Assistant:** {reply_text}"
# Create the interface
with gr.Blocks(title="Voice Agent Live") as demo:
    gr.Markdown("# 🎙️ Voice Agent Live")
    gr.Markdown("Speak to the AI assistant and get voice responses!")

    if FASTRTC_AVAILABLE:
        gr.Markdown("### Real-time Voice Chat (WebRTC)")
        try:
            webrtc_comp = WebRTC(
                label="Voice Chat",
                mode="send-receive",
                modality="audio",
                rtc_configuration=get_hf_turn_credentials(),
            )
            webrtc_comp.stream(
                fn=ReplyOnPause(process_audio),
                inputs=[webrtc_comp],
                outputs=[webrtc_comp],
            )
        except Exception as e:
            gr.Markdown(f"⚠️ WebRTC Error: {str(e)}")
            gr.Markdown("### Using fallback mode below")
            FASTRTC_AVAILABLE = False

    if not FASTRTC_AVAILABLE:
        gr.Markdown("### Voice Chat (Record/Upload)")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or Upload Audio",
                )
                submit_btn = gr.Button("🎤 Process Audio", variant="primary", size="lg")
            with gr.Column():
                audio_output = gr.Audio(label="Assistant Response", type="filepath")
                text_output = gr.Markdown()
        submit_btn.click(
            fn=process_audio_file,
            inputs=[audio_input],
            outputs=[audio_output, text_output],
        )
        gr.Examples(
            examples=[],
            inputs=audio_input,
            label="Try recording your voice!",
        )

if __name__ == "__main__":
    demo.launch(share=False)