ASureevaA committed
Commit fb68e9f · Parent: 0ef80ce
fix image q

app.py CHANGED
@@ -296,7 +296,7 @@ def synthesize_speech(text_value: str, model_key: str):
     )
     return file_object.name
 
-    raise ValueError(f"Неизвестная
+    raise ValueError(f"Неизвестная модель: {model_key}")
 
 
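The hunk above fixes the unknown-model fallback in synthesize_speech (the removed line is truncated in this view). For context, a minimal sketch of the dispatch-and-fallback shape that makes the trailing `raise` reachable only when no model branch matches; the branch, checkpoint, and writer here are assumptions, not app.py's actual code:

```python
# Hypothetical sketch only: app.py's real synthesize_speech differs.
import tempfile

import soundfile as sf
from transformers import pipeline

def synthesize_speech(text_value: str, model_key: str) -> str:
    if model_key == "vits-ljs":  # assumed key, mirroring the TTS tab's Dropdown
        tts = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
        speech = tts(text_value)  # {"audio": ndarray, "sampling_rate": int}
        file_object = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        sf.write(file_object.name, speech["audio"].squeeze(), speech["sampling_rate"])
        return file_object.name

    # Reached only when no branch above handled model_key -- the fixed line.
    raise ValueError(f"Неизвестная модель: {model_key}")
```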
@@ -599,35 +599,30 @@ def parse_point_coordinates_text(coordinates_text: str) -> List[List[int]]:
 
 def build_interface():
     with gr.Blocks(title="Multimodal AI Demo", theme=gr.themes.Soft()) as demo_block:
-        gr.Markdown("
-        gr.Markdown(
-            "Демонстрация различных задач компьютерного зрения "
-            "и обработки звука с использованием Hugging Face Transformers",
-        )
+        gr.Markdown("# AI модели")
 
         with gr.Tab("Классификация аудио"):
-            gr.Markdown("##
+            gr.Markdown("## Классификация аудио")
             with gr.Row():
+                audio_input_component = gr.Audio(
+                    label="Загрузите аудиофайл",
+                    type="filepath",
+                )
+                audio_model_selector = gr.Dropdown(
+                    choices=["audio_classifier", "emotion_classifier"],
+                    label="Выберите модель",
+                    value="audio_classifier",
+                    info=(
+                        "audio_classifier - общая классификация (курс)\n"
+                        "emotion_classifier - эмоции в речи"
+                    ),
+                )
+                audio_classify_button = gr.Button("Применить")
+
+            audio_output_component = gr.Textbox(
+                label="Результаты классификации",
+                lines=10,
+            )
 
             audio_classify_button.click(
                 fn=classify_audio_file,
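A handler compatible with this tab's wiring takes the audio filepath and model key and returns formatted scores. A hedged sketch; the diff names only the keys, so the key-to-checkpoint mapping is assumed:

```python
# Sketch only: checkpoints are assumptions, not taken from app.py.
from transformers import pipeline

_AUDIO_MODELS = {
    "audio_classifier": "MIT/ast-finetuned-audioset-10-10-0.4593",  # assumed
    "emotion_classifier": "superb/hubert-large-superb-er",          # assumed
}

def classify_audio_file(audio_path: str, model_key: str) -> str:
    classifier = pipeline("audio-classification", model=_AUDIO_MODELS[model_key])
    predictions = classifier(audio_path)  # list of {"label": str, "score": float}
    return "\n".join(f"{p['label']}: {p['score']:.3f}" for p in predictions)
```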
@@ -635,25 +630,24 @@ def build_interface():
                 outputs=audio_output_component,
             )
 
-        with gr.Tab("Zero-Shot аудио
-            gr.Markdown("## Zero-Shot
+        with gr.Tab("Zero-Shot аудио"):
+            gr.Markdown("## Zero-Shot аудио классификатор")
             with gr.Row():
+                clap_audio_input_component = gr.Audio(
+                    label="Загрузите аудиофайл",
+                    type="filepath",
+                )
+                clap_label_texts_component = gr.Textbox(
+                    label="Кандидатные метки (через запятую)",
+                    placeholder="лай собаки, шум дождя, музыка, разговор",
+                    lines=2,
+                )
+                clap_button = gr.Button("Применить")
+
+            clap_output_component = gr.Textbox(
+                label="Результаты zero-shot классификации",
+                lines=10,
+            )
 
             clap_button.click(
                 fn=classify_audio_zero_shot_clap,
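This tab feeds a filepath plus a comma-separated label string into classify_audio_zero_shot_clap. A minimal sketch, assuming the laion/clap-htsat-unfused checkpoint (the diff does not name one):

```python
# Sketch only: the CLAP checkpoint is an assumption.
from transformers import pipeline

def classify_audio_zero_shot_clap(audio_path: str, labels_text: str) -> str:
    labels = [label.strip() for label in labels_text.split(",") if label.strip()]
    clap = pipeline("zero-shot-audio-classification", model="laion/clap-htsat-unfused")
    results = clap(audio_path, candidate_labels=labels)
    return "\n".join(f"{r['label']}: {r['score']:.3f}" for r in results)
```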
@@ -662,28 +656,27 @@ def build_interface():
             )
 
         with gr.Tab("Распознавание речи"):
-            gr.Markdown("##
+            gr.Markdown("## Распознавание речи")
             with gr.Row():
+                asr_audio_input_component = gr.Audio(
+                    label="Загрузите аудио с речью",
+                    type="filepath",
+                )
+                asr_model_selector = gr.Dropdown(
+                    choices=["whisper", "wav2vec2"],
+                    label="Выберите модель",
+                    value="whisper",
+                    info=(
+                        "whisper - distil-whisper/distil-small.en (курс),\n"
+                        "wav2vec2 - openai/whisper-small"
+                    ),
+                )
+                asr_button = gr.Button("Применить")
+
+            asr_output_component = gr.Textbox(
+                label="Транскрипция",
+                lines=5,
+            )
 
             asr_button.click(
                 fn=recognize_speech,
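A matching ASR handler is a thin wrapper over the speech-recognition pipeline. A hedged sketch; the mapping below mirrors the Dropdown's info text as committed (including its surprising pairing of the wav2vec2 key with openai/whisper-small):

```python
# Sketch only: mapping mirrors the info text above, not app.py itself.
from transformers import pipeline

_ASR_MODELS = {
    "whisper": "distil-whisper/distil-small.en",
    "wav2vec2": "openai/whisper-small",  # as listed in the committed info text
}

def recognize_speech(audio_path: str, model_key: str) -> str:
    asr = pipeline("automatic-speech-recognition", model=_ASR_MODELS[model_key])
    return asr(audio_path)["text"]
```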
@@ -693,27 +686,26 @@ def build_interface():
         with gr.Tab("Синтез речи"):
             gr.Markdown("## Text-to-Speech")
             with gr.Row():
+                tts_text_component = gr.Textbox(
+                    label="Введите текст для синтеза",
+                    placeholder="Введите текст на русском или английском языке...",
+                    lines=3,
+                )
+                tts_model_selector = gr.Dropdown(
+                    choices=["vits-ljs", "Google TTS"],
+                    label="Выберите модель",
+                    value="vits-ljs",
+                    info=(
+                        "kakao-enterprise/vits-ljs\n"
+                        "Google TTS"
+                    ),
+                )
+                tts_button = gr.Button("Применить")
+
+            tts_audio_output_component = gr.Audio(
+                label="Синтезированная речь",
+                type="filepath",
+            )
 
             tts_button.click(
                 fn=synthesize_speech,
@@ -722,30 +714,29 @@ def build_interface():
             )
 
         with gr.Tab("Детекция объектов"):
-            gr.Markdown("##
+            gr.Markdown("## Детекция объектов")
             with gr.Row():
+                object_input_image = gr.Image(
+                    label="Загрузите изображение",
+                    type="pil",
+                )
+                object_model_selector = gr.Dropdown(
+                    choices=[
+                        "object_detection_conditional_detr",
+                        "object_detection_yolos_small",
+                    ],
+                    label="Модель",
+                    value="object_detection_conditional_detr",
+                    info=(
+                        "object_detection_conditional_detr - microsoft/conditional-detr-resnet-50\n"
+                        "object_detection_yolos_small - hustvl/yolos-small"
+                    ),
+                )
+                object_detect_button = gr.Button("Применить")
+
+            object_output_image = gr.Image(
+                label="Результат",
+            )
 
             object_detect_button.click(
                 fn=detect_objects_on_image,
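The detection handler takes a PIL image in and returns an annotated PIL image. A hedged sketch using the two checkpoints the Dropdown's info text names; the handler name, signature, and drawing style are assumptions:

```python
# Sketch only: handler shape and drawing style are assumptions.
from PIL import Image, ImageDraw
from transformers import pipeline

_DETECTORS = {
    "object_detection_conditional_detr": "microsoft/conditional-detr-resnet-50",
    "object_detection_yolos_small": "hustvl/yolos-small",
}

def detect_objects_on_image(image: Image.Image, model_key: str) -> Image.Image:
    detector = pipeline("object-detection", model=_DETECTORS[model_key])
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)
    for det in detector(annotated):
        box = det["box"]  # {"xmin", "ymin", "xmax", "ymax"} in pixels
        draw.rectangle((box["xmin"], box["ymin"], box["xmax"], box["ymax"]),
                       outline="red", width=3)
        draw.text((box["xmin"], box["ymin"]),
                  f"{det['label']} {det['score']:.2f}", fill="red")
    return annotated
```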
@@ -754,18 +745,17 @@ def build_interface():
             )
 
         with gr.Tab("Сегментация"):
-            gr.Markdown("##
+            gr.Markdown("## Сегментация")
             with gr.Row():
+                segmentation_input_image = gr.Image(
+                    label="Загрузите изображение",
+                    type="pil",
+                )
+                segmentation_button = gr.Button("Применить")
+
+            segmentation_output_image = gr.Image(
+                label="Маска",
+            )
 
             segmentation_button.click(
                 fn=segment_image,
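The segmentation tab wires a single image to a single mask output. A minimal sketch; the diff does not name a segmentation checkpoint, so the one below is an assumption:

```python
# Sketch only: the checkpoint is an assumption, not taken from app.py.
from PIL import Image
from transformers import pipeline

def segment_image(image: Image.Image) -> Image.Image:
    segmenter = pipeline("image-segmentation",
                         model="nvidia/segformer-b0-finetuned-ade-512-512")
    segments = segmenter(image)  # list of {"mask": PIL.Image, "label": str, ...}
    return segments[0]["mask"] if segments else image
```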
@@ -773,19 +763,19 @@ def build_interface():
                 outputs=segmentation_output_image,
             )
 
-        with gr.Tab("Глубина
-            gr.Markdown("## Depth Estimation
+        with gr.Tab("Глубина"):
+            gr.Markdown("## Глубина (Depth Estimation)")
             with gr.Row():
+                depth_input_image = gr.Image(
+                    label="Загрузите изображение",
+                    type="pil",
+                )
+                depth_button = gr.Button("Применить")
+
+            depth_output_image = gr.Image(
+                label="Глубины",
+            )
 
             depth_button.click(
                 fn=estimate_image_depth,
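Depth estimation maps image in to depth-map image out, which the transformers pipeline returns directly. A hedged sketch, assuming the Intel/dpt-large checkpoint (the diff names none):

```python
# Sketch only: Intel/dpt-large is an assumed checkpoint.
from PIL import Image
from transformers import pipeline

def estimate_image_depth(image: Image.Image) -> Image.Image:
    depth_estimator = pipeline("depth-estimation", model="Intel/dpt-large")
    return depth_estimator(image)["depth"]  # PIL image visualizing per-pixel depth
```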
@@ -794,31 +784,30 @@ def build_interface():
             )
 
         with gr.Tab("Описание изображений"):
-            gr.Markdown("##
+            gr.Markdown("## Описание изображений")
             with gr.Row():
+                caption_input_image = gr.Image(
+                    label="Загрузите изображение",
+                    type="pil",
+                )
+                caption_model_selector = gr.Dropdown(
+                    choices=[
+                        "captioning_blip_base",
+                        "captioning_blip_large",
+                    ],
+                    label="Модель",
+                    value="captioning_blip_base",
+                    info=(
+                        "captioning_blip_base - Salesforce/blip-image-captioning-base (курс)\n"
+                        "captioning_blip_large - Salesforce/blip-image-captioning-large"
+                    ),
+                )
+                caption_button = gr.Button("Применить")
+
+            caption_output_text = gr.Textbox(
+                label="Описание изображения",
+                lines=3,
+            )
 
             caption_button.click(
                 fn=generate_image_caption,
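Captioning reduces to the image-to-text pipeline over the two BLIP checkpoints the info text lists. A hedged sketch; the function body is an assumption:

```python
# Sketch only: mapping mirrors the Dropdown's info text above.
from PIL import Image
from transformers import pipeline

_CAPTION_MODELS = {
    "captioning_blip_base": "Salesforce/blip-image-captioning-base",
    "captioning_blip_large": "Salesforce/blip-image-captioning-large",
}

def generate_image_caption(image: Image.Image, model_key: str) -> str:
    captioner = pipeline("image-to-text", model=_CAPTION_MODELS[model_key])
    return captioner(image)[0]["generated_text"]
```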
@@ -829,34 +818,33 @@ def build_interface():
         with gr.Tab("Визуальные вопросы"):
             gr.Markdown("## Visual Question Answering")
             with gr.Row():
+                vqa_input_image = gr.Image(
+                    label="Загрузите изображение",
+                    type="pil",
+                )
+                vqa_question_text = gr.Textbox(
+                    label="Вопрос",
+                    placeholder="Вопрос",
+                    lines=2,
+                )
+                vqa_model_selector = gr.Dropdown(
+                    choices=[
+                        "vqa_blip_base",
+                        "vqa_vilt_b32",
+                    ],
+                    label="Модель",
+                    value="vqa_blip_base",
+                    info=(
+                        "vqa_blip_base - Salesforce/blip-vqa-base (курс)\n"
+                        "vqa_vilt_b32 - dandelin/vilt-b32-finetuned-vqa"
+                    ),
+                )
+                vqa_button = gr.Button("Ответить на вопрос")
+
+            vqa_output_text = gr.Textbox(
+                label="Ответ",
+                lines=3,
+            )
 
             vqa_button.click(
                 fn=answer_visual_question,
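The VQA handler takes (image, question, model key) and returns a short answer string. A hedged sketch over the visual-question-answering pipeline; only the checkpoints come from the info text above:

```python
# Sketch only: handler shape is an assumption.
from PIL import Image
from transformers import pipeline

_VQA_MODELS = {
    "vqa_blip_base": "Salesforce/blip-vqa-base",
    "vqa_vilt_b32": "dandelin/vilt-b32-finetuned-vqa",
}

def answer_visual_question(image: Image.Image, question: str, model_key: str) -> str:
    vqa = pipeline("visual-question-answering", model=_VQA_MODELS[model_key])
    return vqa(image=image, question=question)[0]["answer"]
```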
@@ -865,36 +853,35 @@ def build_interface():
             )
 
         with gr.Tab("Zero-Shot классификация"):
-            gr.Markdown("## Zero-Shot
+            gr.Markdown("## Zero-Shot классификация")
             with gr.Row():
+                zero_shot_input_image = gr.Image(
+                    label="Загрузите изображение",
+                    type="pil",
+                )
+                zero_shot_classes_text = gr.Textbox(
+                    label="Классы для классификации (через запятую)",
+                    placeholder="человек, машина, дерево, здание, животное",
+                    lines=2,
+                )
+                clip_model_selector = gr.Dropdown(
+                    choices=[
+                        "clip_large_patch14",
+                        "clip_base_patch32",
+                    ],
+                    label="Модель",
+                    value="clip_large_patch14",
+                    info=(
+                        "clip_large_patch14 - openai/clip-vit-large-patch14 (курс)\n"
+                        "clip_base_patch32 - openai/clip-vit-base-patch32"
+                    ),
+                )
+                zero_shot_button = gr.Button("Применить")
+
+            zero_shot_output_text = gr.Textbox(
+                label="Результаты",
+                lines=10,
+            )
 
             zero_shot_button.click(
                 fn=perform_zero_shot_classification,
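Zero-shot image classification parses the comma-separated class list and scores it with CLIP. A hedged sketch; checkpoints come from the info text, everything else is assumed:

```python
# Sketch only: handler shape is an assumption.
from PIL import Image
from transformers import pipeline

_CLIP_MODELS = {
    "clip_large_patch14": "openai/clip-vit-large-patch14",
    "clip_base_patch32": "openai/clip-vit-base-patch32",
}

def perform_zero_shot_classification(image: Image.Image, classes_text: str,
                                     model_key: str) -> str:
    labels = [c.strip() for c in classes_text.split(",") if c.strip()]
    classifier = pipeline("zero-shot-image-classification", model=_CLIP_MODELS[model_key])
    results = classifier(image, candidate_labels=labels)
    return "\n".join(f"{r['label']}: {r['score']:.3f}" for r in results)
```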
@@ -903,40 +890,40 @@ def build_interface():
             )
 
         with gr.Tab("Поиск изображений"):
-            gr.Markdown("##
+            gr.Markdown("## Поиск изображений")
             with gr.Row():
+                retrieval_dir = gr.File(
+                    label="Загрузите папку с изображениями",
+                    file_count="directory",
+                    file_types=["image"],
+                    type="filepath",
+                )
+                retrieval_query_text = gr.Textbox(
+                    label="Текстовый запрос",
+                    placeholder="описание того, что вы ищете...",
+                    lines=2,
+                )
+                retrieval_clip_selector = gr.Dropdown(
+                    choices=[
+                        "clip_large_patch14",
+                        "clip_base_patch32",
+                    ],
+                    label="Модель",
+                    value="clip_large_patch14",
+                    info=(
+                        "clip_large_patch14 - openai/clip-vit-large-patch14 (курс)\n"
+                        "clip_base_patch32 - openai/clip-vit-base-patch32 (альтернатива)"
+                    ),
+                )
+                retrieval_button = gr.Button("Поиск")
+
+            retrieval_output_text = gr.Textbox(
+                label="Результат",
+            )
+            retrieval_output_image = gr.Image(
+                label="Наиболее подходящее изображение",
+            )
 
             retrieval_button.click(
                 fn=retrieve_best_image,
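Retrieval scores every uploaded image against the text query with CLIP and returns the best match for the two outputs above. A hedged sketch; the function shape is an assumption, and gr.File with file_count="directory" delivers a list of file paths:

```python
# Sketch only: scoring shape and default checkpoint are assumptions.
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

def retrieve_best_image(file_paths: list[str], query: str, model_key: str):
    model_id = "openai/clip-vit-large-patch14"  # assumed default checkpoint
    model = CLIPModel.from_pretrained(model_id)
    processor = CLIPProcessor.from_pretrained(model_id)

    images = [Image.open(p).convert("RGB") for p in file_paths]
    inputs = processor(text=[query], images=images, return_tensors="pt", padding=True)
    with torch.no_grad():
        scores = model(**inputs).logits_per_image.squeeze(1)  # one score per image
    best = int(scores.argmax())
    return file_paths[best], images[best]
```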
@@ -948,9 +935,9 @@ def build_interface():
         gr.Markdown("### Задачи:")
         gr.Markdown(
             """
-            - Аудио:
-            - Компьютерное зрение: детекция объектов,
-            - Мультимодальные задачи:
+            - Аудио: классификация, распознавание речи, синтез речи
+            - Компьютерное зрение: детекция объектов, сегментация, оценка глубины, генерация описаний изображений
+            - Мультимодальные задачи: вопросы к изображению, zero-shot классификация изображений, поиск по изображениям по текстовому запросу
             """
         )
     return demo_block