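"""Voice Agent Live: a Gradio app that records speech, transcribes it with
Groq's Whisper endpoint, asks a Llama chat model for a one-sentence reply, and
speaks the answer back with Edge TTS. A FastRTC WebRTC stream is used when the
library is available; otherwise the app falls back to a record/upload UI.
Requires a GROQ_API_KEY environment variable. Run locally with: python app.py
"""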
import gradio as gr
import os
import numpy as np
import librosa
import asyncio
import edge_tts
import soundfile as sf
from groq import Groq
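# FastRTC is optional: it provides the real-time WebRTC voice path. If the
# import fails, the app falls back to the record/upload interface defined below.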
try:
    from fastrtc import WebRTC, ReplyOnPause, get_hf_turn_credentials
    FASTRTC_AVAILABLE = True
except ImportError:
    FASTRTC_AVAILABLE = False
    print("FastRTC not available, using fallback UI")
# Initialize the Groq client (reads GROQ_API_KEY from the environment)
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
async def text_to_speech_logic(text):
    # Synthesize the reply with Edge TTS, then reload it as a 16 kHz array
    communicate = edge_tts.Communicate(text, "en-US-AndrewNeural")
    await communicate.save("temp_op.mp3")
    audio, sr = librosa.load("temp_op.mp3", sr=16000)
    # Ensure audio is in the correct shape (1, samples) for FastRTC
    if len(audio.shape) == 1:
        audio = audio.reshape(1, -1)
    return sr, audio
def process_audio(audio: tuple[int, np.ndarray]):
    sr, y = audio
    # FastRTC audio can be (samples, channels); downmix to mono (samples,)
    if len(y.shape) > 1:
        y = y.mean(axis=1)
    sf.write("input.wav", y, sr)
    # Transcribe the utterance with Groq's Whisper endpoint
    with open("input.wav", "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=("input.wav", file.read()),
            model="whisper-large-v3-turbo",
        )
    # Ask the chat model for a short reply
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {"role": "system", "content": "You are a concise voice assistant. Give 1-sentence answers."},
            {"role": "user", "content": transcription.text},
        ],
    )
    reply_text = response.choices[0].message.content
    # Yield a single (sample_rate, audio) chunk back to the WebRTC stream
    yield asyncio.run(text_to_speech_logic(reply_text))
# Fallback function for the regular record/upload interface
def process_audio_file(audio_file):
    if audio_file is None:
        return None, "Please record or upload audio"
    # Load audio at 16 kHz and write it to a temporary WAV file
    y, sr = librosa.load(audio_file, sr=16000)
    sf.write("input.wav", y, sr)
    # Transcribe
    with open("input.wav", "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=("input.wav", file.read()),
            model="whisper-large-v3-turbo",
        )
    # Get response
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {"role": "system", "content": "You are a concise voice assistant. Give 1-sentence answers."},
            {"role": "user", "content": transcription.text},
        ],
    )
    reply_text = response.choices[0].message.content
    # Generate speech; text_to_speech_logic saves the file returned below
    asyncio.run(text_to_speech_logic(reply_text))
    return "temp_op.mp3", f"**You said:** {transcription.text}\n\n**Assistant:** {reply_text}"
# Create the interface
with gr.Blocks(title="Voice Agent Live") as demo:
    gr.Markdown("# 🎙️ Voice Agent Live")
    gr.Markdown("Speak to the AI assistant and get voice responses!")

    if FASTRTC_AVAILABLE:
        gr.Markdown("### Real-time Voice Chat (WebRTC)")
        try:
            webrtc_comp = WebRTC(
                label="Voice Chat",
                mode="send-receive",
                modality="audio",
                rtc_configuration=get_hf_turn_credentials(),
            )
            webrtc_comp.stream(
                fn=ReplyOnPause(process_audio),
                inputs=[webrtc_comp],
                outputs=[webrtc_comp],
            )
        except Exception as e:
            gr.Markdown(f"⚠️ WebRTC Error: {str(e)}")
            gr.Markdown("### Using fallback mode below")
            FASTRTC_AVAILABLE = False

    if not FASTRTC_AVAILABLE:
        gr.Markdown("### Voice Chat (Record/Upload)")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or Upload Audio",
                )
                submit_btn = gr.Button("🎤 Process Audio", variant="primary", size="lg")
            with gr.Column():
                audio_output = gr.Audio(label="Assistant Response", type="filepath")
                text_output = gr.Markdown()
        submit_btn.click(
            fn=process_audio_file,
            inputs=[audio_input],
            outputs=[audio_output, text_output],
        )
        gr.Examples(
            examples=[],
            inputs=audio_input,
            label="Try recording your voice!",
        )

if __name__ == "__main__":
    demo.launch(share=False)