dnhkng committed on
Commit
d11f3b5
·
unverified ·
1 Parent(s): 728fcbe

command : always-prompt mode (#383)

Browse files
Files changed (1) hide show
  1. examples/command/command.cpp +117 -1
examples/command/command.cpp CHANGED
@@ -11,6 +11,8 @@
11
  #include <SDL.h>
12
  #include <SDL_audio.h>
13
 
 
 
14
  #include <cassert>
15
  #include <cstdio>
16
  #include <fstream>
@@ -25,7 +27,7 @@
25
  struct whisper_params {
26
  int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
27
  int32_t prompt_ms = 5000;
28
- int32_t command_ms = 4000;
29
  int32_t capture_id = -1;
30
  int32_t max_tokens = 32;
31
  int32_t audio_ctx = 0;
@@ -43,6 +45,7 @@ struct whisper_params {
43
  std::string model = "models/ggml-base.en.bin";
44
  std::string fname_out;
45
  std::string commands;
 
46
  };
47
 
48
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -71,6 +74,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
71
  else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
72
  else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
73
  else if (arg == "-cmd" || arg == "--commands") { params.commands = argv[++i]; }
 
74
  else {
75
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
76
  whisper_print_usage(argc, argv, params);
@@ -103,6 +107,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
103
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
104
  fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
105
  fprintf(stderr, " -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n", params.commands.c_str());
 
106
  fprintf(stderr, "\n");
107
  }
108
 
@@ -837,6 +842,115 @@ int process_general_transcription(struct whisper_context * ctx, audio_async &aud
837
  return 0;
838
  }
839
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
840
  int main(int argc, char ** argv) {
841
  whisper_params params;
842
 
@@ -892,6 +1006,8 @@ int main(int argc, char ** argv) {
892
 
893
  if (!params.commands.empty()) {
894
  ret_val = process_command_list(ctx, audio, params);
 
 
895
  } else {
896
  ret_val = process_general_transcription(ctx, audio, params);
897
  }
 
11
  #include <SDL.h>
12
  #include <SDL_audio.h>
13
 
14
+ #include <iostream>
15
+ #include <sstream>
16
  #include <cassert>
17
  #include <cstdio>
18
  #include <fstream>
 
27
  struct whisper_params {
28
  int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
29
  int32_t prompt_ms = 5000;
30
+ int32_t command_ms = 8000;
31
  int32_t capture_id = -1;
32
  int32_t max_tokens = 32;
33
  int32_t audio_ctx = 0;
 
45
  std::string model = "models/ggml-base.en.bin";
46
  std::string fname_out;
47
  std::string commands;
48
+ std::string prompt;
49
  };
50
 
51
  void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
 
74
  else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
75
  else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
76
  else if (arg == "-cmd" || arg == "--commands") { params.commands = argv[++i]; }
77
+ else if (arg == "-p" || arg == "--prompt") { params.prompt = argv[++i]; }
78
  else {
79
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
80
  whisper_print_usage(argc, argv, params);
 
107
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
108
  fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
109
  fprintf(stderr, " -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n", params.commands.c_str());
110
+ fprintf(stderr, " -p, --prompt [%-7s] the required activation prompt\n", params.prompt.c_str());
111
  fprintf(stderr, "\n");
112
  }
113
 
 
842
  return 0;
843
  }
844
 
845
+
846
+ // always prompt mode
847
+ // transcribe the voice into text after valid prompt
848
+ int always_prompt_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
849
+ bool is_running = true;
850
+ bool ask_prompt = true;
851
+
852
+ float prob = 0.0f;
853
+
854
+ std::vector<float> pcmf32_cur;
855
+
856
+ const std::string k_prompt = params.prompt;
857
+
858
+ std::vector<std::string> words;
859
+
860
+ std::istringstream iss(k_prompt);
861
+ std::string word;
862
+
863
+ while (iss >> word) {
864
+ words.push_back(word);
865
+ }
866
+
867
+ int k_prompt_length = words.size();
868
+
869
+ // main loop
870
+ while (is_running) {
871
+ // handle Ctrl + C
872
+ {
873
+ SDL_Event event;
874
+ while (SDL_PollEvent(&event)) {
875
+ switch (event.type) {
876
+ case SDL_QUIT:
877
+ {
878
+ is_running = false;
879
+ } break;
880
+ default:
881
+ break;
882
+ }
883
+ }
884
+
885
+ if (!is_running) {
886
+ return 0;
887
+ }
888
+ }
889
+
890
+ // delay
891
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
892
+
893
+ if (ask_prompt) {
894
+ fprintf(stdout, "\n");
895
+ fprintf(stdout, "%s: The prompt is: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
896
+ fprintf(stdout, "\n");
897
+
898
+ ask_prompt = false;
899
+ }
900
+
901
+ {
902
+ audio.get(2000, pcmf32_cur);
903
+
904
+ if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
905
+ fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
906
+
907
+ int64_t t_ms = 0;
908
+
909
+ // detect the commands
910
+ audio.get(params.command_ms, pcmf32_cur);
911
+
912
+ const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
913
+
914
+ std::istringstream iss(txt);
915
+ std::string word;
916
+ std::string prompt;
917
+ std::string command;
918
+ int i = 0;
919
+ int command_length = 0;
920
+ while (iss >> word) {
921
+ if (i == k_prompt_length - 1) {
922
+ prompt += word + ' ';
923
+ break;
924
+ }
925
+ prompt += word + ' ';
926
+ i++;
927
+ }
928
+ while (iss >> word) {
929
+ command += word + ' ';
930
+ command_length++;
931
+ }
932
+
933
+ const float sim = similarity(prompt, k_prompt);
934
+
935
+ //debug
936
+ //fprintf(stdout, "command size: %i\n", command_length);
937
+
938
+
939
+ if ((sim > 0.7f) && (command_length >0)){
940
+ fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
941
+ }
942
+
943
+ fprintf(stdout, "\n");
944
+
945
+
946
+ audio.clear();
947
+ }
948
+ }
949
+ }
950
+
951
+ return 0;
952
+ }
953
+
954
  int main(int argc, char ** argv) {
955
  whisper_params params;
956
 
 
1006
 
1007
  if (!params.commands.empty()) {
1008
  ret_val = process_command_list(ctx, audio, params);
1009
+ } else if (!params.prompt.empty()) {
1010
+ ret_val = always_prompt_transcription(ctx, audio, params);
1011
  } else {
1012
  ret_val = process_general_transcription(ctx, audio, params);
1013
  }