import os

import gradio as gr
from gradio_client import Client

from test_overrides import _get_param_examples, _override_params

_DESCRIPTION = """
Proxy to [xVASynth Gradio Space](https://huggingface.co/spaces/Pendrokar/xVASynth-TTS), as that space uses a Python version that MCP does not support.

You can add this TTS as a tool using the "Use via API or MCP" instructions within the footer of this page.
"""

voice_models = [
    ("👩 #ex04", "x_ex04"),
    ("🧑 #ex01", "x_ex01"),
    ("👱‍♀️ 🇬🇧 #92", "ccby_nvidia_hifi_92_F"),
    ("👨‍🦳 #6671", "ccby_nvidia_hifi_6671_M"),
]
voice_models_more = [
    ("👸 #ex02", "x_ex02"),
    ("👨‍🦱 #ex03", "x_ex03"),
    ("🧔 #6670", "ccby_nvidia_hifi_6670_M"),
    ("👨‍🦲 #9017", "ccby_nvidia_hifi_9017_M"),
    ("🧑 #6097", "ccby_nvidia_hifi_6097_M"),
    ("👩‍🦱 #12787", "ccby_nvidia_hifi_12787_F"),
    ("👵 #11614", "ccby_nv_hifi_11614_F"),
    ("👩‍🦰 #8051", "ccby_nvidia_hifi_8051_F"),
    ("👩‍🦳 #11697", "ccby_nvidia_hifi_11697_F"),
    ("👩‍🦲 #9136", "ccby_nvidia_hifi_9136_F"),
    # v2 model for Lojban, predating the multilingual capabilities of xVASynth v3
    ("♟ Lojban", "x_selpahi"),
]

# Order ranked by similarity to English, due to xVASynth's use of ARPAbet instead of IPA
languages = [
    ("🇺🇸 EN", "en"),
    ("🇩🇪 DE", "de"),
    ("🇪🇸 ES", "es"),
    ("🇮🇳 HI", "hi"),
    ("🇨🇳 ZH", "zh"),
]
languages_more = [
    ("🇳🇱 NL", "nl"),
    ("🇧🇷 PT", "pt"),
    ("🇮🇹 IT", "it"),
    ("🇵🇱 PL", "pl"),
    ("🇷🇴 RO", "ro"),
    ("🇸🇪 SV", "sv"),
    ("🇩🇰 DA", "da"),
    ("🇫🇮 FI", "fi"),
    ("🇭🇺 HU", "hu"),
    ("🇬🇷 EL", "el"),
    ("🇫🇷 FR", "fr"),
    ("🇷🇺 RU", "ru"),
    ("🇺🇦 UA", "uk"),
    ("🇹🇷 TR", "tr"),
    ("🇸🇦 AR", "ar"),
    ("🇯🇵 JP", "jp"),
    ("🇰🇷 KO", "ko"),
    ("🇻🇳 VI", "vi"),
    ("🇻🇦 LA", "la"),
    ("🇳🇬 YO", "yo"),
    ("Swahili", "sw"),
    ("Hausa", "ha"),
    ("Wolof", "wo"),
]
lojban_lang = [
    # There is no ISO 639-1 code for Lojban; "jb" is used here as a UI shorthand
    # (the ISO 639-3 code "jbo" is substituted when calling the upstream space)
    ("♟ Lojban", "jb"),
]

# Translated from English by DeepMind's Gemini Pro
default_text = {
    "ar": "هذا هو صوتي.",
    "da": "Sådan lyder min stemme.",
    "de": "So klingt meine Stimme.",
    "el": "Έτσι ακούγεται η φωνή μου.",
    "en": "This is what my voice sounds like.",
    "es": "Así suena mi voz.",
    "fi": "Näin ääneni kuulostaa.",
    "fr": "Voici à quoi ressemble ma voix.",
    "ha": "Wannan ne muryata ke.",
    "hi": "यह मेरी आवाज़ कैसी लगती है।",
    "hu": "Így hangzik a hangom.",
    "it": "Così suona la mia voce.",
    "jb": ".i ca'e gusni",
    "jp": "これが私の声です。",
    "ko": "여기 제 목소리가 어떤지 들어보세요.",
    "la": "Haec est vox mea sonans.",
    "nl": "Dit is hoe mijn stem klinkt.",
    "pl": "Tak brzmi mój głos.",
    "pt": "É assim que minha voz soa.",
    "ro": "Așa sună vocea mea.",
    "ru": "Вот как звучит мой голос.",
    "sv": "Såhär låter min röst.",
    "sw": "Baba, yetu, yetu, uliye. Mbinguni, yetu, yetu. Amiiinaa!!",  # "Baba Yetu", the Civilization IV theme
    "tr": "Benim sesimin sesi böyle.",
    "uk": "Ось як звучить мій голос.",
    "vi": "Đây là giọng nói của tôi.",
    "wo": "Ndox li neen xewnaal ma.",
    "yo": "Ìyí ni ohùn mi ńlá.",
    "zh": "这是我的声音。",
}
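# A lightweight consistency check added here as a sketch: it assumes every
# language code offered in the UI lists above should have a default sentence
# in default_text, and fails loudly at import time if one is missing.
for _label, _code in [*languages, *languages_more, *lojban_lang]:
    assert _code in default_text, f"missing default text for {_code}"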
# Component defaults
input_textbox_init = {
    'label': "Input Text",
    'value': "This is what my voice sounds like.",
    'info': "Also accepts ARPAbet symbols placed within {} brackets.",
    'lines': 1,
    'max_lines': 5,
    'autofocus': True,
    'interactive': False,
}
pacing_slider_init = {
    'value': 1.0,
    'minimum': 0.5,
    'maximum': 2.0,
    'step': 0.1,
    'label': "Duration",
    'interactive': False,
}
pitch_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0.5,
    'step': 0.05,
    'label': "Pitch",
    'visible': False,
    'interactive': False,
}
energy_slider_init = {
    'minimum': 0.1,
    'maximum': 1.0,
    'value': 1.0,
    'step': 0.05,
    'label': "Energy",
    'visible': False,
    'interactive': False,
}
anger_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😠 Anger",
    'info': "Tread lightly beyond 0.9",
    'interactive': False,
}
happy_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😃 Happiness",
    'info': "Tread lightly beyond 0.7",
    'interactive': False,
}
sad_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😭 Sadness",
    'info': "Duration increased when beyond 0.2",
    'interactive': False,
}
surprise_slider_init = {
    'minimum': 0,
    'maximum': 1.0,
    'value': 0,
    'step': 0.05,
    'label': "😮 Surprise",
    'info': "Oversaturates Happiness when beyond 0.3",
    'interactive': False,
}
voice_radio_init = {
    'choices': [*voice_models, (f'+{len(voice_models_more)}', 'more')],
    'value': "ccby_nvidia_hifi_6671_M",
    'label': "Voice",
    'info': "Fine-tuned voice model",
    'interactive': False,
}
deepmoji_checkbox_init = {
    'label': "Use DeepMoji",
    'info': "Auto adjust emotional values for English",
    'value': True,
    'interactive': False,
}
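# The *_init dicts above are unpacked straight into the Gradio components
# below, keeping all component defaults in one place, e.g.:
#
#   pacing_slider = gr.Slider(**pacing_slider_init)
#
# Note that voice_radio_init keeps a collapsed choice list ending in a
# "+N"/"more" expander entry, while this proxy interface renders the full
# voice list instead.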
class BlocksDemo:
    def __init__(self):
        self.block = self.create_interface()

    def create_interface(self):
        with gr.Blocks(
            css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}"
        ) as demo:
            gr.Markdown("# xVASynth TTS - MCP Proxy")
            gr.Markdown(value=_DESCRIPTION)

            with gr.Row():  # main row for inputs and language selection
                with gr.Column():  # input column
                    input_textbox = gr.Textbox(**input_textbox_init)
                    language_radio = gr.Radio(
                        [*languages, *languages_more, *lojban_lang],
                        interactive=False,
                    )
                    with gr.Row():
                        with gr.Column():
                            en_examples_dropdown = gr.Dropdown(interactive=False)
                        with gr.Column():
                            pacing_slider = gr.Slider(**pacing_slider_init)
                with gr.Column():  # control column
                    voice_radio = gr.Radio(
                        [*voice_models, *voice_models_more],
                        interactive=False,
                    )
                    pitch_slider = gr.Slider(**pitch_slider_init)
                    energy_slider = gr.Slider(**energy_slider_init)

            with gr.Row():  # row for emotion controls
                with gr.Column():
                    anger_slider = gr.Slider(**anger_slider_init)
                    sad_slider = gr.Slider(**sad_slider_init)
                with gr.Column():
                    happy_slider = gr.Slider(**happy_slider_init)
                    surprise_slider = gr.Slider(**surprise_slider_init)
                    deepmoji_checkbox = gr.Checkbox(**deepmoji_checkbox_init)

            gen_audio = gr.Button("generate", variant="primary", visible=False)
            gen_lojban = gr.Button("generate_lojban", variant="primary", visible=False)
            output_wav = gr.Audio(
                label="22kHz audio output",
                type="filepath",
                editable=False,
                autoplay=True,
                visible=False,
            )
            output_arpabet = gr.HTML(label="ARPAbet")

            gen_audio.click(
                fn=self.generate,
                inputs=[
                    input_textbox,
                    voice_radio,
                    language_radio,
                    pacing_slider,
                    anger_slider,
                    happy_slider,
                    sad_slider,
                    surprise_slider,
                    deepmoji_checkbox,
                ],
                outputs=[
                    output_wav,
                    output_arpabet,
                    anger_slider,
                    happy_slider,
                    sad_slider,
                    surprise_slider,
                    gr.Textbox(visible=False),  # xVAServer response
                ],
            )
            gen_lojban.click(
                fn=self.lojban,
                inputs=[
                    input_textbox,
                    voice_radio,
                    language_radio,
                    pacing_slider,
                    anger_slider,
                    happy_slider,
                    sad_slider,
                    surprise_slider,
                    deepmoji_checkbox,
                ],
                outputs=[
                    output_wav,
                    output_arpabet,
                    anger_slider,
                    happy_slider,
                    sad_slider,
                    surprise_slider,
                    gr.Textbox(visible=False),  # xVAServer response
                ],
            )
        return demo

    def generate(
        self,
        input_text: str = "This is what my voice sounds like.",
        voice: str = "ccby_nvidia_hifi_6670_M",
        lang: str = "en",
        pacing: float = 1.0,
        anger: float = 0.0,
        happy: float = 0.0,
        sad: float = 0.0,
        surprise: float = 0.0,
        deepmoji_checked: bool = True,
    ):
        """
        Convert the text to speech using xVASynth (v3) xVAPitch models.
        Sensitive to maxed-out emotional values.

        Args:
            input_text: string from which to create the audio
            voice: Literal['x_ex04', 'x_ex01', 'ccby_nvidia_hifi_92_F', 'ccby_nvidia_hifi_6671_M', 'x_ex02', 'x_ex03', 'ccby_nvidia_hifi_6670_M', 'ccby_nvidia_hifi_9017_M', 'ccby_nvidia_hifi_6097_M', 'ccby_nvidia_hifi_12787_F', 'ccby_nv_hifi_11614_F', 'ccby_nvidia_hifi_8051_F', 'ccby_nvidia_hifi_11697_F', 'ccby_nvidia_hifi_9136_F']; _M/_F marks a male/female voice; x_ex04/x_ex02 are American female voices; x_ex03/x_ex01 are American male voices
            lang: Literal['en', 'de', 'es', 'it', 'fr', 'ru', 'tr', 'la', 'ro', 'da', 'vi', 'ha', 'nl', 'zh', 'ar', 'uk', 'hi', 'ko', 'pl', 'sw', 'fi', 'hu', 'pt', 'yo', 'sv', 'el', 'wo', 'jp']; the language of input_text
            pacing: float (numeric value between 0.5 and 2.0); duration; 1.0 is the default
            anger: float (numeric value between 0 and 1.0); 😠 Anger
            happy: float (numeric value between 0 and 1.0); 😃 Happiness
            sad: float (numeric value between 0 and 1.0); 😭 Sadness
            surprise: float (numeric value between 0 and 1.0); 😮 Surprise
            deepmoji_checked: bool; use the DeepMoji model to parse English text and further amplify the emotional values

        Returns:
            Tuple of (output_audio_path, arpabet_html, final_anger_ratio, final_happiness_ratio, final_sadness_ratio, final_surprise_ratio, response), where output_audio_path is the filepath of the output audio
        """
        model = "Pendrokar/xVASynth-TTS"
        client = Client(model, hf_token=os.getenv('HF_TOKEN'))
        endpoints = client.view_api(
            all_endpoints=True, print_info=False, return_format='dict'
        )
        api_name = '/predict'

        # pull the upstream endpoint's example values to use as a parameter template
        end_parameters = _get_param_examples(
            endpoints['named_endpoints'][api_name]['parameters']
        )
        print(end_parameters)

        # override some or all default parameters
        space_inputs = _override_params(end_parameters, model)
        space_inputs[0] = input_text
        space_inputs[1] = voice
        space_inputs[2] = lang
        space_inputs[3] = pacing
        space_inputs[6] = anger
        space_inputs[7] = happy
        space_inputs[8] = sad
        space_inputs[9] = surprise
        space_inputs[10] = deepmoji_checked
        print(space_inputs)

        result = client.predict(
            *space_inputs,
            api_name=api_name
        )
        return result
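    # A hedged client-side sketch (illustration only, not executed): once this
    # app is running, the endpoint wired to gen_audio.click can be called via
    # gradio_client. The "/generate" name assumes Gradio's default of naming
    # endpoints after the bound function, and the URL is a placeholder:
    #
    #   from gradio_client import Client
    #   proxy = Client("http://127.0.0.1:7860/")
    #   wav_path, arpabet_html, *emotion_ratios = proxy.predict(
    #       "Hello there.", "ccby_nvidia_hifi_6671_M", "en", 1.0,
    #       0.0, 0.0, 0.0, 0.0, True,
    #       api_name="/generate",
    #   )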
    def lojban(
        self,
        input_text: str = "coi rodo",
        voice: str = "x_selpahi",
        lang: str = "jb",
        pacing: float = 1.0,
        anger: float = 0.0,
        happy: float = 0.0,
        sad: float = 0.0,
        surprise: float = 0.0,
        deepmoji_checked: bool = True,
    ):
        """
        Convert the Lojban text to speech using xVASynth (v2) FastPitch 1.1 models.
        The emotion and DeepMoji parameters are accepted for interface parity but are ignored by the v2 model.

        Args:
            input_text: string from which to create the audio
            voice: Literal['x_selpahi']; the only viable voice model filename
            lang: Literal['jb']; the language of input_text
            pacing: float (numeric value between 0.5 and 2.0); duration; 1.0 is the default

        Returns:
            Tuple of (output_audio_path, arpabet_html, response), where output_audio_path is the filepath of the output audio
        """
        model = "Pendrokar/xVASynth-TTS"
        client = Client(model, hf_token=os.getenv('HF_TOKEN'))
        endpoints = client.view_api(
            all_endpoints=True, print_info=False, return_format='dict'
        )
        api_name = '/predict'

        # pull the upstream endpoint's example values to use as a parameter template
        end_parameters = _get_param_examples(
            endpoints['named_endpoints'][api_name]['parameters']
        )
        print(end_parameters)

        # override some or all default parameters
        space_inputs = _override_params(end_parameters, model)
        space_inputs[0] = input_text
        space_inputs[1] = voice
        # the upstream space expects the ISO 639-3 code "jbo" rather than the
        # "jb" shorthand used by the language radio above
        space_inputs[2] = 'jbo'
        space_inputs[3] = pacing
        print(space_inputs)

        result = client.predict(
            *space_inputs,
            api_name=api_name
        )
        return result


demo = BlocksDemo()
demo.block.launch(show_api=True, show_error=True, mcp_server=True)
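# A hedged sketch of registering this proxy with an MCP client: per the Gradio
# docs, launching with mcp_server=True exposes an MCP (SSE) endpoint at
# <app-url>/gradio_api/mcp/sse. The server name and URL below are placeholders:
#
#   {
#     "mcpServers": {
#       "xvasynth": {
#         "url": "http://127.0.0.1:7860/gradio_api/mcp/sse"
#       }
#     }
#   }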