Spaces:
Running
Running
command : always-prompt mode (#383)
Browse files- examples/command/command.cpp +117 -1
examples/command/command.cpp
CHANGED
|
@@ -11,6 +11,8 @@
|
|
| 11 |
#include <SDL.h>
|
| 12 |
#include <SDL_audio.h>
|
| 13 |
|
|
|
|
|
|
|
| 14 |
#include <cassert>
|
| 15 |
#include <cstdio>
|
| 16 |
#include <fstream>
|
|
@@ -25,7 +27,7 @@
|
|
| 25 |
struct whisper_params {
|
| 26 |
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
| 27 |
int32_t prompt_ms = 5000;
|
| 28 |
-
int32_t command_ms =
|
| 29 |
int32_t capture_id = -1;
|
| 30 |
int32_t max_tokens = 32;
|
| 31 |
int32_t audio_ctx = 0;
|
|
@@ -43,6 +45,7 @@ struct whisper_params {
|
|
| 43 |
std::string model = "models/ggml-base.en.bin";
|
| 44 |
std::string fname_out;
|
| 45 |
std::string commands;
|
|
|
|
| 46 |
};
|
| 47 |
|
| 48 |
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
|
@@ -71,6 +74,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 71 |
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
| 72 |
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
|
| 73 |
else if (arg == "-cmd" || arg == "--commands") { params.commands = argv[++i]; }
|
|
|
|
| 74 |
else {
|
| 75 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 76 |
whisper_print_usage(argc, argv, params);
|
|
@@ -103,6 +107,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|
| 103 |
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
| 104 |
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
| 105 |
fprintf(stderr, " -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n", params.commands.c_str());
|
|
|
|
| 106 |
fprintf(stderr, "\n");
|
| 107 |
}
|
| 108 |
|
|
@@ -837,6 +842,115 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
|
|
| 837 |
return 0;
|
| 838 |
}
|
| 839 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 840 |
int main(int argc, char ** argv) {
|
| 841 |
whisper_params params;
|
| 842 |
|
|
@@ -892,6 +1006,8 @@ int main(int argc, char ** argv) {
|
|
| 892 |
|
| 893 |
if (!params.commands.empty()) {
|
| 894 |
ret_val = process_command_list(ctx, audio, params);
|
|
|
|
|
|
|
| 895 |
} else {
|
| 896 |
ret_val = process_general_transcription(ctx, audio, params);
|
| 897 |
}
|
|
|
|
| 11 |
#include <SDL.h>
|
| 12 |
#include <SDL_audio.h>
|
| 13 |
|
| 14 |
+
#include <iostream>
|
| 15 |
+
#include <sstream>
|
| 16 |
#include <cassert>
|
| 17 |
#include <cstdio>
|
| 18 |
#include <fstream>
|
|
|
|
| 27 |
struct whisper_params {
|
| 28 |
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
| 29 |
int32_t prompt_ms = 5000;
|
| 30 |
+
int32_t command_ms = 8000;
|
| 31 |
int32_t capture_id = -1;
|
| 32 |
int32_t max_tokens = 32;
|
| 33 |
int32_t audio_ctx = 0;
|
|
|
|
| 45 |
std::string model = "models/ggml-base.en.bin";
|
| 46 |
std::string fname_out;
|
| 47 |
std::string commands;
|
| 48 |
+
std::string prompt;
|
| 49 |
};
|
| 50 |
|
| 51 |
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
|
|
|
| 74 |
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
| 75 |
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
|
| 76 |
else if (arg == "-cmd" || arg == "--commands") { params.commands = argv[++i]; }
|
| 77 |
+
else if (arg == "-p" || arg == "--prompt") { params.prompt = argv[++i]; }
|
| 78 |
else {
|
| 79 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 80 |
whisper_print_usage(argc, argv, params);
|
|
|
|
| 107 |
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
| 108 |
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
| 109 |
fprintf(stderr, " -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n", params.commands.c_str());
|
| 110 |
+
fprintf(stderr, " -p, --prompt [%-7s] the required activation prompt\n", params.prompt.c_str());
|
| 111 |
fprintf(stderr, "\n");
|
| 112 |
}
|
| 113 |
|
|
|
|
| 842 |
return 0;
|
| 843 |
}
|
| 844 |
|
| 845 |
+
|
| 846 |
+
// always prompt mode
|
| 847 |
+
// transcribe the voice into text after valid prompt
|
| 848 |
+
int always_prompt_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params ¶ms) {
|
| 849 |
+
bool is_running = true;
|
| 850 |
+
bool ask_prompt = true;
|
| 851 |
+
|
| 852 |
+
float prob = 0.0f;
|
| 853 |
+
|
| 854 |
+
std::vector<float> pcmf32_cur;
|
| 855 |
+
|
| 856 |
+
const std::string k_prompt = params.prompt;
|
| 857 |
+
|
| 858 |
+
std::vector<std::string> words;
|
| 859 |
+
|
| 860 |
+
std::istringstream iss(k_prompt);
|
| 861 |
+
std::string word;
|
| 862 |
+
|
| 863 |
+
while (iss >> word) {
|
| 864 |
+
words.push_back(word);
|
| 865 |
+
}
|
| 866 |
+
|
| 867 |
+
int k_prompt_length = words.size();
|
| 868 |
+
|
| 869 |
+
// main loop
|
| 870 |
+
while (is_running) {
|
| 871 |
+
// handle Ctrl + C
|
| 872 |
+
{
|
| 873 |
+
SDL_Event event;
|
| 874 |
+
while (SDL_PollEvent(&event)) {
|
| 875 |
+
switch (event.type) {
|
| 876 |
+
case SDL_QUIT:
|
| 877 |
+
{
|
| 878 |
+
is_running = false;
|
| 879 |
+
} break;
|
| 880 |
+
default:
|
| 881 |
+
break;
|
| 882 |
+
}
|
| 883 |
+
}
|
| 884 |
+
|
| 885 |
+
if (!is_running) {
|
| 886 |
+
return 0;
|
| 887 |
+
}
|
| 888 |
+
}
|
| 889 |
+
|
| 890 |
+
// delay
|
| 891 |
+
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
| 892 |
+
|
| 893 |
+
if (ask_prompt) {
|
| 894 |
+
fprintf(stdout, "\n");
|
| 895 |
+
fprintf(stdout, "%s: The prompt is: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
|
| 896 |
+
fprintf(stdout, "\n");
|
| 897 |
+
|
| 898 |
+
ask_prompt = false;
|
| 899 |
+
}
|
| 900 |
+
|
| 901 |
+
{
|
| 902 |
+
audio.get(2000, pcmf32_cur);
|
| 903 |
+
|
| 904 |
+
if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
|
| 905 |
+
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
|
| 906 |
+
|
| 907 |
+
int64_t t_ms = 0;
|
| 908 |
+
|
| 909 |
+
// detect the commands
|
| 910 |
+
audio.get(params.command_ms, pcmf32_cur);
|
| 911 |
+
|
| 912 |
+
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
|
| 913 |
+
|
| 914 |
+
std::istringstream iss(txt);
|
| 915 |
+
std::string word;
|
| 916 |
+
std::string prompt;
|
| 917 |
+
std::string command;
|
| 918 |
+
int i = 0;
|
| 919 |
+
int command_length = 0;
|
| 920 |
+
while (iss >> word) {
|
| 921 |
+
if (i == k_prompt_length - 1) {
|
| 922 |
+
prompt += word + ' ';
|
| 923 |
+
break;
|
| 924 |
+
}
|
| 925 |
+
prompt += word + ' ';
|
| 926 |
+
i++;
|
| 927 |
+
}
|
| 928 |
+
while (iss >> word) {
|
| 929 |
+
command += word + ' ';
|
| 930 |
+
command_length++;
|
| 931 |
+
}
|
| 932 |
+
|
| 933 |
+
const float sim = similarity(prompt, k_prompt);
|
| 934 |
+
|
| 935 |
+
//debug
|
| 936 |
+
//fprintf(stdout, "command size: %i\n", command_length);
|
| 937 |
+
|
| 938 |
+
|
| 939 |
+
if ((sim > 0.7f) && (command_length >0)){
|
| 940 |
+
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
|
| 941 |
+
}
|
| 942 |
+
|
| 943 |
+
fprintf(stdout, "\n");
|
| 944 |
+
|
| 945 |
+
|
| 946 |
+
audio.clear();
|
| 947 |
+
}
|
| 948 |
+
}
|
| 949 |
+
}
|
| 950 |
+
|
| 951 |
+
return 0;
|
| 952 |
+
}
|
| 953 |
+
|
| 954 |
int main(int argc, char ** argv) {
|
| 955 |
whisper_params params;
|
| 956 |
|
|
|
|
| 1006 |
|
| 1007 |
if (!params.commands.empty()) {
|
| 1008 |
ret_val = process_command_list(ctx, audio, params);
|
| 1009 |
+
} else if (!params.prompt.empty()) {
|
| 1010 |
+
ret_val = always_prompt_transcription(ctx, audio, params);
|
| 1011 |
} else {
|
| 1012 |
ret_val = process_general_transcription(ctx, audio, params);
|
| 1013 |
}
|