import gradio as gr
import os
import numpy as np
import librosa
import asyncio
import edge_tts
import soundfile as sf
from groq import Groq

try:
    from fastrtc import WebRTC, ReplyOnPause, get_hf_turn_credentials
    FASTRTC_AVAILABLE = True
except ImportError:
    FASTRTC_AVAILABLE = False
    print("FastRTC not available, using fallback UI")

# Initialize the Groq client; GROQ_API_KEY must be set in the environment
# (e.g. as a Space secret)
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

async def text_to_speech_logic(text):
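    """Synthesize `text` with edge-tts (en-US-AndrewNeural voice), save it to
    temp_op.mp3, and return (sample_rate, audio) shaped (1, samples) for FastRTC."""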
    communicate = edge_tts.Communicate(text, "en-US-AndrewNeural")
    await communicate.save("temp_op.mp3")
    audio, sr = librosa.load("temp_op.mp3", sr=16000)
    # Ensure audio is in the correct shape (1, samples) for FastRTC
    if len(audio.shape) == 1:
        audio = audio.reshape(1, -1)
    return sr, audio

def process_audio(audio: tuple[int, np.ndarray]):
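    """FastRTC handler: transcribe the incoming (sample_rate, samples) audio with
    Groq Whisper, get a one-sentence reply from the LLM, and return it as speech."""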
    sr, y = audio
    # FastRTC audio can be (samples, channels), we need (samples,)
    if len(y.shape) > 1:
        y = y.mean(axis=1)
    sf.write("input.wav", y, sr)
    # Transcribe with Whisper on Groq
    with open("input.wav", "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=("input.wav", file.read()),
            model="whisper-large-v3-turbo",
        )
    # Generate a short reply with the LLM
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {"role": "system", "content": "You are a concise voice assistant. Give 1-sentence answers."},
            {"role": "user", "content": transcription.text}
        ]
    )
    reply_text = response.choices[0].message.content
    # Synthesize the reply and return (sample_rate, audio) to FastRTC
    return asyncio.run(text_to_speech_logic(reply_text))

# Fallback function for regular audio interface
def process_audio_file(audio_file):
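    """Fallback handler for the record/upload UI: returns the path to the
    synthesized reply plus a markdown transcript of the exchange."""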
    if audio_file is None:
        return None, "Please record or upload audio"
    # Load audio
    y, sr = librosa.load(audio_file, sr=16000)
    sf.write("input.wav", y, sr)
    # Transcribe
    with open("input.wav", "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=("input.wav", file.read()),
            model="whisper-large-v3-turbo",
        )
    # Get response
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=[
            {"role": "system", "content": "You are a concise voice assistant. Give 1-sentence answers."},
            {"role": "user", "content": transcription.text}
        ]
    )
    reply_text = response.choices[0].message.content
    # Generate speech; text_to_speech_logic writes temp_op.mp3, which is
    # returned below as a file path
    asyncio.run(text_to_speech_logic(reply_text))
    return "temp_op.mp3", f"**You said:** {transcription.text}\n\n**Assistant:** {reply_text}"

# Create the interface
with gr.Blocks(title="Voice Agent Live") as demo:
    gr.Markdown("# 🎙️ Voice Agent Live")
    gr.Markdown("Speak to the AI assistant and get voice responses!")
    if FASTRTC_AVAILABLE:
        gr.Markdown("### Real-time Voice Chat (WebRTC)")
        try:
            webrtc_comp = WebRTC(
                label="Voice Chat",
                mode="send-receive",
                modality="audio",
                rtc_configuration=get_hf_turn_credentials()
            )
            webrtc_comp.stream(
                fn=ReplyOnPause(process_audio),
                inputs=[webrtc_comp],
                outputs=[webrtc_comp]
            )
        except Exception as e:
            gr.Markdown(f"⚠️ WebRTC Error: {str(e)}")
            gr.Markdown("### Using fallback mode below")
            FASTRTC_AVAILABLE = False
    if not FASTRTC_AVAILABLE:
        gr.Markdown("### Voice Chat (Record/Upload)")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or Upload Audio"
                )
                submit_btn = gr.Button("🎤 Process Audio", variant="primary", size="lg")
            with gr.Column():
                audio_output = gr.Audio(label="Assistant Response", type="filepath")
                text_output = gr.Markdown()
        submit_btn.click(
            fn=process_audio_file,
            inputs=[audio_input],
            outputs=[audio_output, text_output]
        )
        gr.Examples(
            examples=[],
            inputs=audio_input,
            label="Try recording your voice!"
        )

if __name__ == "__main__":
    demo.launch(share=False)
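
# Local usage (a sketch, assuming this file is saved as app.py and the Groq key
# is exported in the shell; Gradio serves on its default port, 7860):
#   GROQ_API_KEY=... python app.py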