Spaces:
Running
Running
refactoring : more readable code
Browse files
- .gitignore +1 -0
- README.md +0 -1
- bindings/javascript/emscripten.cpp +8 -8
- examples/bench/bench.cpp +10 -11
- examples/main/main.cpp +80 -104
- examples/stream/stream.cpp +108 -134
- examples/talk.wasm/emscripten.cpp +12 -12
- examples/whisper.objc/whisper.objc/ViewController.m +8 -8
- examples/whisper.wasm/CMakeLists.txt +1 -0
- whisper.cpp +51 -51
- whisper.h +1 -1
.gitignore
CHANGED
|
@@ -17,6 +17,7 @@ bench
|
|
| 17 |
sync.sh
|
| 18 |
compile_commands.json
|
| 19 |
|
|
|
|
| 20 |
examples/whisper.objc/whisper.objc.xcodeproj/xcshareddata
|
| 21 |
examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
|
| 22 |
examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
|
|
|
|
| 17 |
sync.sh
|
| 18 |
compile_commands.json
|
| 19 |
|
| 20 |
+
examples/arm_neon.h
|
| 21 |
examples/whisper.objc/whisper.objc.xcodeproj/xcshareddata
|
| 22 |
examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
|
| 23 |
examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
|
README.md
CHANGED
|
@@ -99,7 +99,6 @@ usage: ./main [options] file0.wav file1.wav ...
|
|
| 99 |
|
| 100 |
options:
|
| 101 |
-h, --help show this help message and exit
|
| 102 |
-
-s SEED, --seed SEED RNG seed (default: -1)
|
| 103 |
-t N, --threads N number of threads to use during computation (default: 4)
|
| 104 |
-p N, --processors N number of processors to use during computation (default: 1)
|
| 105 |
-ot N, --offset-t N time offset in milliseconds (default: 0)
|
|
|
|
| 99 |
|
| 100 |
options:
|
| 101 |
-h, --help show this help message and exit
|
|
|
|
| 102 |
-t N, --threads N number of threads to use during computation (default: 4)
|
| 103 |
-p N, --processors N number of processors to use during computation (default: 1)
|
| 104 |
-ot N, --offset-t N time offset in milliseconds (default: 0)
|
bindings/javascript/emscripten.cpp
CHANGED
|
@@ -46,14 +46,14 @@ EMSCRIPTEN_BINDINGS(whisper) {
|
|
| 46 |
|
| 47 |
struct whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
|
| 48 |
|
| 49 |
-
params.print_realtime
|
| 50 |
-
params.print_progress
|
| 51 |
-
params.print_timestamps
|
| 52 |
-
params.
|
| 53 |
-
params.translate
|
| 54 |
-
params.language
|
| 55 |
-
params.n_threads
|
| 56 |
-
params.offset_ms
|
| 57 |
|
| 58 |
std::vector<float> pcmf32;
|
| 59 |
const int n = audio["length"].as<int>();
|
|
|
|
| 46 |
|
| 47 |
struct whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
|
| 48 |
|
| 49 |
+
params.print_realtime = true;
|
| 50 |
+
params.print_progress = false;
|
| 51 |
+
params.print_timestamps = true;
|
| 52 |
+
params.print_special = false;
|
| 53 |
+
params.translate = translate;
|
| 54 |
+
params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
|
| 55 |
+
params.n_threads = std::min(8, (int) std::thread::hardware_concurrency());
|
| 56 |
+
params.offset_ms = 0;
|
| 57 |
|
| 58 |
std::vector<float> pcmf32;
|
| 59 |
const int n = audio["length"].as<int>();
|
examples/bench/bench.cpp
CHANGED
|
@@ -6,9 +6,9 @@
|
|
| 6 |
|
| 7 |
// command-line parameters
|
| 8 |
struct whisper_params {
|
| 9 |
-
int32_t n_threads
|
| 10 |
|
| 11 |
-
std::string model
|
| 12 |
};
|
| 13 |
|
| 14 |
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
|
@@ -17,14 +17,13 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 17 |
for (int i = 1; i < argc; i++) {
|
| 18 |
std::string arg = argv[i];
|
| 19 |
|
| 20 |
-
if (arg == "-
|
| 21 |
-
params.n_threads = std::stoi(argv[++i]);
|
| 22 |
-
} else if (arg == "-m" || arg == "--model") {
|
| 23 |
-
params.model = argv[++i];
|
| 24 |
-
} else if (arg == "-h" || arg == "--help") {
|
| 25 |
whisper_print_usage(argc, argv, params);
|
| 26 |
exit(0);
|
| 27 |
-
}
|
|
|
|
|
|
|
|
|
|
| 28 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 29 |
whisper_print_usage(argc, argv, params);
|
| 30 |
exit(0);
|
|
@@ -39,9 +38,9 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
|
| 39 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 40 |
fprintf(stderr, "\n");
|
| 41 |
fprintf(stderr, "options:\n");
|
| 42 |
-
fprintf(stderr, " -h, --help
|
| 43 |
-
fprintf(stderr, " -t N, --threads N
|
| 44 |
-
fprintf(stderr, " -m FNAME, --model FNAME
|
| 45 |
fprintf(stderr, "\n");
|
| 46 |
}
|
| 47 |
|
|
|
|
| 6 |
|
| 7 |
// command-line parameters
|
| 8 |
struct whisper_params {
|
| 9 |
+
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
| 10 |
|
| 11 |
+
std::string model = "models/ggml-base.en.bin";
|
| 12 |
};
|
| 13 |
|
| 14 |
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
|
|
|
| 17 |
for (int i = 1; i < argc; i++) {
|
| 18 |
std::string arg = argv[i];
|
| 19 |
|
| 20 |
+
if (arg == "-h" || arg == "--help") {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
whisper_print_usage(argc, argv, params);
|
| 22 |
exit(0);
|
| 23 |
+
}
|
| 24 |
+
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
| 25 |
+
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
| 26 |
+
else {
|
| 27 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 28 |
whisper_print_usage(argc, argv, params);
|
| 29 |
exit(0);
|
|
|
|
| 38 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 39 |
fprintf(stderr, "\n");
|
| 40 |
fprintf(stderr, "options:\n");
|
| 41 |
+
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
| 42 |
+
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
| 43 |
+
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
| 44 |
fprintf(stderr, "\n");
|
| 45 |
}
|
| 46 |
|
examples/main/main.cpp
CHANGED
|
@@ -48,7 +48,6 @@ void replace_all(std::string & s, const std::string & search, const std::string
|
|
| 48 |
|
| 49 |
// command-line parameters
|
| 50 |
struct whisper_params {
|
| 51 |
-
int32_t seed = -1; // RNG seed, not used currently
|
| 52 |
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
| 53 |
int32_t n_processors = 1;
|
| 54 |
int32_t offset_t_ms = 0;
|
|
@@ -59,15 +58,15 @@ struct whisper_params {
|
|
| 59 |
|
| 60 |
float word_thold = 0.01f;
|
| 61 |
|
| 62 |
-
bool speed_up
|
| 63 |
-
bool translate
|
| 64 |
-
bool output_txt
|
| 65 |
-
bool output_vtt
|
| 66 |
-
bool output_srt
|
| 67 |
-
bool output_wts
|
| 68 |
-
bool
|
| 69 |
-
bool print_colors
|
| 70 |
-
bool no_timestamps
|
| 71 |
|
| 72 |
std::string language = "en";
|
| 73 |
std::string model = "models/ggml-base.en.bin";
|
|
@@ -86,57 +85,31 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 86 |
continue;
|
| 87 |
}
|
| 88 |
|
| 89 |
-
if (arg == "-
|
| 90 |
-
params.seed = std::stoi(argv[++i]);
|
| 91 |
-
} else if (arg == "-t" || arg == "--threads") {
|
| 92 |
-
params.n_threads = std::stoi(argv[++i]);
|
| 93 |
-
} else if (arg == "-p" || arg == "--processors") {
|
| 94 |
-
params.n_processors = std::stoi(argv[++i]);
|
| 95 |
-
} else if (arg == "-ot" || arg == "--offset-t") {
|
| 96 |
-
params.offset_t_ms = std::stoi(argv[++i]);
|
| 97 |
-
} else if (arg == "-on" || arg == "--offset-n") {
|
| 98 |
-
params.offset_n = std::stoi(argv[++i]);
|
| 99 |
-
} else if (arg == "-d" || arg == "--duration") {
|
| 100 |
-
params.duration_ms = std::stoi(argv[++i]);
|
| 101 |
-
} else if (arg == "-mc" || arg == "--max-context") {
|
| 102 |
-
params.max_context = std::stoi(argv[++i]);
|
| 103 |
-
} else if (arg == "-ml" || arg == "--max-len") {
|
| 104 |
-
params.max_len = std::stoi(argv[++i]);
|
| 105 |
-
} else if (arg == "-wt" || arg == "--word-thold") {
|
| 106 |
-
params.word_thold = std::stof(argv[++i]);
|
| 107 |
-
} else if (arg == "-su" || arg == "--speed-up") {
|
| 108 |
-
params.speed_up = true;
|
| 109 |
-
} else if (arg == "-tr" || arg == "--translate") {
|
| 110 |
-
params.translate = true;
|
| 111 |
-
} else if (arg == "-l" || arg == "--language") {
|
| 112 |
-
params.language = argv[++i];
|
| 113 |
-
if (whisper_lang_id(params.language.c_str()) == -1) {
|
| 114 |
-
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
| 115 |
-
whisper_print_usage(argc, argv, params);
|
| 116 |
-
exit(0);
|
| 117 |
-
}
|
| 118 |
-
} else if (arg == "-otxt" || arg == "--output-txt") {
|
| 119 |
-
params.output_txt = true;
|
| 120 |
-
} else if (arg == "-ovtt" || arg == "--output-vtt") {
|
| 121 |
-
params.output_vtt = true;
|
| 122 |
-
} else if (arg == "-osrt" || arg == "--output-srt") {
|
| 123 |
-
params.output_srt = true;
|
| 124 |
-
} else if (arg == "-owts" || arg == "--output-words") {
|
| 125 |
-
params.output_wts = true;
|
| 126 |
-
} else if (arg == "-ps" || arg == "--print_special") {
|
| 127 |
-
params.print_special_tokens = true;
|
| 128 |
-
} else if (arg == "-pc" || arg == "--print_colors") {
|
| 129 |
-
params.print_colors = true;
|
| 130 |
-
} else if (arg == "-nt" || arg == "--no_timestamps") {
|
| 131 |
-
params.no_timestamps = true;
|
| 132 |
-
} else if (arg == "-m" || arg == "--model") {
|
| 133 |
-
params.model = argv[++i];
|
| 134 |
-
} else if (arg == "-f" || arg == "--file") {
|
| 135 |
-
params.fname_inp.push_back(argv[++i]);
|
| 136 |
-
} else if (arg == "-h" || arg == "--help") {
|
| 137 |
whisper_print_usage(argc, argv, params);
|
| 138 |
exit(0);
|
| 139 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 141 |
whisper_print_usage(argc, argv, params);
|
| 142 |
exit(0);
|
|
@@ -151,28 +124,27 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
|
| 151 |
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
| 152 |
fprintf(stderr, "\n");
|
| 153 |
fprintf(stderr, "options:\n");
|
| 154 |
-
fprintf(stderr, " -h, --help
|
| 155 |
-
fprintf(stderr, " -
|
| 156 |
-
fprintf(stderr, " -
|
| 157 |
-
fprintf(stderr, " -
|
| 158 |
-
fprintf(stderr, " -
|
| 159 |
-
fprintf(stderr, " -
|
| 160 |
-
fprintf(stderr, " -
|
| 161 |
-
fprintf(stderr, " -
|
| 162 |
-
fprintf(stderr, " -
|
| 163 |
-
fprintf(stderr, " -
|
| 164 |
-
fprintf(stderr, " -
|
| 165 |
-
fprintf(stderr, " -
|
| 166 |
-
fprintf(stderr, " -
|
| 167 |
-
fprintf(stderr, " -
|
| 168 |
-
fprintf(stderr, " -
|
| 169 |
-
fprintf(stderr, " -
|
| 170 |
-
fprintf(stderr, " -
|
| 171 |
-
fprintf(stderr, " -
|
| 172 |
-
fprintf(stderr, " -
|
| 173 |
-
fprintf(stderr, " -
|
| 174 |
-
fprintf(stderr, " -
|
| 175 |
-
fprintf(stderr, " -f FNAME, --file FNAME input WAV file path\n");
|
| 176 |
fprintf(stderr, "\n");
|
| 177 |
}
|
| 178 |
|
|
@@ -191,7 +163,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
|
|
| 191 |
if (params.no_timestamps) {
|
| 192 |
if (params.print_colors) {
|
| 193 |
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
| 194 |
-
if (params.
|
| 195 |
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
| 196 |
if (id >= whisper_token_eot(ctx)) {
|
| 197 |
continue;
|
|
@@ -217,7 +189,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
|
|
| 217 |
if (params.print_colors) {
|
| 218 |
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
|
| 219 |
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
| 220 |
-
if (params.
|
| 221 |
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
| 222 |
if (id >= whisper_token_eot(ctx)) {
|
| 223 |
continue;
|
|
@@ -428,16 +400,18 @@ int main(int argc, char ** argv) {
|
|
| 428 |
return 1;
|
| 429 |
}
|
| 430 |
|
| 431 |
-
if (params.seed < 0) {
|
| 432 |
-
params.seed = time(NULL);
|
| 433 |
-
}
|
| 434 |
-
|
| 435 |
if (params.fname_inp.empty()) {
|
| 436 |
fprintf(stderr, "error: no input files specified\n");
|
| 437 |
whisper_print_usage(argc, argv, params);
|
| 438 |
return 2;
|
| 439 |
}
|
| 440 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 441 |
// whisper init
|
| 442 |
|
| 443 |
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
|
@@ -474,6 +448,8 @@ int main(int argc, char ** argv) {
|
|
| 474 |
fprintf(stderr, "error: failed to open WAV file from stdin\n");
|
| 475 |
return 4;
|
| 476 |
}
|
|
|
|
|
|
|
| 477 |
}
|
| 478 |
else if (drwav_init_file(&wav, fname_inp.c_str(), NULL) == false) {
|
| 479 |
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
|
|
@@ -495,7 +471,7 @@ int main(int argc, char ** argv) {
|
|
| 495 |
return 7;
|
| 496 |
}
|
| 497 |
|
| 498 |
-
|
| 499 |
|
| 500 |
std::vector<int16_t> pcm16;
|
| 501 |
pcm16.resize(n*wav.channels);
|
|
@@ -547,22 +523,22 @@ int main(int argc, char ** argv) {
|
|
| 547 |
{
|
| 548 |
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 549 |
|
| 550 |
-
wparams.print_realtime
|
| 551 |
-
wparams.print_progress
|
| 552 |
-
wparams.print_timestamps
|
| 553 |
-
wparams.
|
| 554 |
-
wparams.translate
|
| 555 |
-
wparams.language
|
| 556 |
-
wparams.n_threads
|
| 557 |
-
wparams.n_max_text_ctx
|
| 558 |
-
wparams.offset_ms
|
| 559 |
-
wparams.duration_ms
|
| 560 |
-
|
| 561 |
-
wparams.token_timestamps
|
| 562 |
-
wparams.thold_pt
|
| 563 |
-
wparams.max_len
|
| 564 |
-
|
| 565 |
-
wparams.speed_up
|
| 566 |
|
| 567 |
// this callback is called on each new segment
|
| 568 |
if (!wparams.print_realtime) {
|
|
|
|
| 48 |
|
| 49 |
// command-line parameters
|
| 50 |
struct whisper_params {
|
|
|
|
| 51 |
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
| 52 |
int32_t n_processors = 1;
|
| 53 |
int32_t offset_t_ms = 0;
|
|
|
|
| 58 |
|
| 59 |
float word_thold = 0.01f;
|
| 60 |
|
| 61 |
+
bool speed_up = false;
|
| 62 |
+
bool translate = false;
|
| 63 |
+
bool output_txt = false;
|
| 64 |
+
bool output_vtt = false;
|
| 65 |
+
bool output_srt = false;
|
| 66 |
+
bool output_wts = false;
|
| 67 |
+
bool print_special = false;
|
| 68 |
+
bool print_colors = false;
|
| 69 |
+
bool no_timestamps = false;
|
| 70 |
|
| 71 |
std::string language = "en";
|
| 72 |
std::string model = "models/ggml-base.en.bin";
|
|
|
|
| 85 |
continue;
|
| 86 |
}
|
| 87 |
|
| 88 |
+
if (arg == "-h" || arg == "--help") {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
whisper_print_usage(argc, argv, params);
|
| 90 |
exit(0);
|
| 91 |
+
}
|
| 92 |
+
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
| 93 |
+
else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
|
| 94 |
+
else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(argv[++i]); }
|
| 95 |
+
else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(argv[++i]); }
|
| 96 |
+
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
|
| 97 |
+
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
|
| 98 |
+
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
| 99 |
+
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
| 100 |
+
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 101 |
+
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 102 |
+
else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
|
| 103 |
+
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
|
| 104 |
+
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
| 105 |
+
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
| 106 |
+
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 107 |
+
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
| 108 |
+
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
|
| 109 |
+
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
| 110 |
+
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
| 111 |
+
else if (arg == "-f" || arg == "--file") { params.fname_inp.push_back(argv[++i]); }
|
| 112 |
+
else {
|
| 113 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 114 |
whisper_print_usage(argc, argv, params);
|
| 115 |
exit(0);
|
|
|
|
| 124 |
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
| 125 |
fprintf(stderr, "\n");
|
| 126 |
fprintf(stderr, "options:\n");
|
| 127 |
+
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
| 128 |
+
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
| 129 |
+
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
|
| 130 |
+
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
|
| 131 |
+
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
|
| 132 |
+
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
| 133 |
+
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
| 134 |
+
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
| 135 |
+
fprintf(stderr, " -wt N, --word-thold N [%-7f] word timestamp probability threshold\n", params.word_thold);
|
| 136 |
+
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 137 |
+
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 138 |
+
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
| 139 |
+
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
| 140 |
+
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
| 141 |
+
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
| 142 |
+
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 143 |
+
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
| 144 |
+
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
|
| 145 |
+
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
| 146 |
+
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
| 147 |
+
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
|
|
|
| 148 |
fprintf(stderr, "\n");
|
| 149 |
}
|
| 150 |
|
|
|
|
| 163 |
if (params.no_timestamps) {
|
| 164 |
if (params.print_colors) {
|
| 165 |
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
| 166 |
+
if (params.print_special == false) {
|
| 167 |
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
| 168 |
if (id >= whisper_token_eot(ctx)) {
|
| 169 |
continue;
|
|
|
|
| 189 |
if (params.print_colors) {
|
| 190 |
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
|
| 191 |
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
| 192 |
+
if (params.print_special == false) {
|
| 193 |
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
| 194 |
if (id >= whisper_token_eot(ctx)) {
|
| 195 |
continue;
|
|
|
|
| 400 |
return 1;
|
| 401 |
}
|
| 402 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
if (params.fname_inp.empty()) {
|
| 404 |
fprintf(stderr, "error: no input files specified\n");
|
| 405 |
whisper_print_usage(argc, argv, params);
|
| 406 |
return 2;
|
| 407 |
}
|
| 408 |
|
| 409 |
+
if (whisper_lang_id(params.language.c_str()) == -1) {
|
| 410 |
+
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
| 411 |
+
whisper_print_usage(argc, argv, params);
|
| 412 |
+
exit(0);
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
// whisper init
|
| 416 |
|
| 417 |
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
|
|
|
| 448 |
fprintf(stderr, "error: failed to open WAV file from stdin\n");
|
| 449 |
return 4;
|
| 450 |
}
|
| 451 |
+
|
| 452 |
+
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
|
| 453 |
}
|
| 454 |
else if (drwav_init_file(&wav, fname_inp.c_str(), NULL) == false) {
|
| 455 |
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
|
|
|
|
| 471 |
return 7;
|
| 472 |
}
|
| 473 |
|
| 474 |
+
const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
|
| 475 |
|
| 476 |
std::vector<int16_t> pcm16;
|
| 477 |
pcm16.resize(n*wav.channels);
|
|
|
|
| 523 |
{
|
| 524 |
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 525 |
|
| 526 |
+
wparams.print_realtime = false;
|
| 527 |
+
wparams.print_progress = false;
|
| 528 |
+
wparams.print_timestamps = !params.no_timestamps;
|
| 529 |
+
wparams.print_special = params.print_special;
|
| 530 |
+
wparams.translate = params.translate;
|
| 531 |
+
wparams.language = params.language.c_str();
|
| 532 |
+
wparams.n_threads = params.n_threads;
|
| 533 |
+
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
| 534 |
+
wparams.offset_ms = params.offset_t_ms;
|
| 535 |
+
wparams.duration_ms = params.duration_ms;
|
| 536 |
+
|
| 537 |
+
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
| 538 |
+
wparams.thold_pt = params.word_thold;
|
| 539 |
+
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
| 540 |
+
|
| 541 |
+
wparams.speed_up = params.speed_up;
|
| 542 |
|
| 543 |
// this callback is called on each new segment
|
| 544 |
if (!wparams.print_realtime) {
|
examples/stream/stream.cpp
CHANGED
|
@@ -4,11 +4,6 @@
|
|
| 4 |
|
| 5 |
#include "whisper.h"
|
| 6 |
|
| 7 |
-
// third-party utilities
|
| 8 |
-
// use your favorite implementations
|
| 9 |
-
#define DR_WAV_IMPLEMENTATION
|
| 10 |
-
#include "dr_wav.h"
|
| 11 |
-
|
| 12 |
#include <SDL.h>
|
| 13 |
#include <SDL_audio.h>
|
| 14 |
|
|
@@ -35,7 +30,6 @@ std::string to_timestamp(int64_t t) {
|
|
| 35 |
|
| 36 |
// command-line parameters
|
| 37 |
struct whisper_params {
|
| 38 |
-
int32_t seed = -1; // RNG seed, not used currently
|
| 39 |
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
| 40 |
int32_t step_ms = 3000;
|
| 41 |
int32_t length_ms = 10000;
|
|
@@ -43,11 +37,11 @@ struct whisper_params {
|
|
| 43 |
int32_t max_tokens = 32;
|
| 44 |
int32_t audio_ctx = 0;
|
| 45 |
|
| 46 |
-
bool speed_up
|
| 47 |
-
bool translate
|
| 48 |
-
bool no_context
|
| 49 |
-
bool
|
| 50 |
-
bool no_timestamps
|
| 51 |
|
| 52 |
std::string language = "en";
|
| 53 |
std::string model = "models/ggml-base.en.bin";
|
|
@@ -60,45 +54,24 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|
| 60 |
for (int i = 1; i < argc; i++) {
|
| 61 |
std::string arg = argv[i];
|
| 62 |
|
| 63 |
-
if (arg == "-
|
| 64 |
-
params.seed = std::stoi(argv[++i]);
|
| 65 |
-
} else if (arg == "-t" || arg == "--threads") {
|
| 66 |
-
params.n_threads = std::stoi(argv[++i]);
|
| 67 |
-
} else if (arg == "--step") {
|
| 68 |
-
params.step_ms = std::stoi(argv[++i]);
|
| 69 |
-
} else if (arg == "--length") {
|
| 70 |
-
params.length_ms = std::stoi(argv[++i]);
|
| 71 |
-
} else if (arg == "-c" || arg == "--capture") {
|
| 72 |
-
params.capture_id = std::stoi(argv[++i]);
|
| 73 |
-
} else if (arg == "-mt" || arg == "--max_tokens") {
|
| 74 |
-
params.max_tokens = std::stoi(argv[++i]);
|
| 75 |
-
} else if (arg == "-ac" || arg == "--audio_ctx") {
|
| 76 |
-
params.audio_ctx = std::stoi(argv[++i]);
|
| 77 |
-
} else if (arg == "-su" || arg == "--speed-up") {
|
| 78 |
-
params.speed_up = true;
|
| 79 |
-
} else if (arg == "-tr" || arg == "--translate") {
|
| 80 |
-
params.translate = true;
|
| 81 |
-
} else if (arg == "-kc" || arg == "--keep-context") {
|
| 82 |
-
params.no_context = false;
|
| 83 |
-
} else if (arg == "-l" || arg == "--language") {
|
| 84 |
-
params.language = argv[++i];
|
| 85 |
-
if (whisper_lang_id(params.language.c_str()) == -1) {
|
| 86 |
-
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
| 87 |
-
whisper_print_usage(argc, argv, params);
|
| 88 |
-
exit(0);
|
| 89 |
-
}
|
| 90 |
-
} else if (arg == "-ps" || arg == "--print_special") {
|
| 91 |
-
params.print_special_tokens = true;
|
| 92 |
-
} else if (arg == "-nt" || arg == "--no_timestamps") {
|
| 93 |
-
params.no_timestamps = true;
|
| 94 |
-
} else if (arg == "-m" || arg == "--model") {
|
| 95 |
-
params.model = argv[++i];
|
| 96 |
-
} else if (arg == "-f" || arg == "--file") {
|
| 97 |
-
params.fname_out = argv[++i];
|
| 98 |
-
} else if (arg == "-h" || arg == "--help") {
|
| 99 |
whisper_print_usage(argc, argv, params);
|
| 100 |
exit(0);
|
| 101 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 103 |
whisper_print_usage(argc, argv, params);
|
| 104 |
exit(0);
|
|
@@ -113,22 +86,20 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
|
| 113 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 114 |
fprintf(stderr, "\n");
|
| 115 |
fprintf(stderr, "options:\n");
|
| 116 |
-
fprintf(stderr, " -h, --help
|
| 117 |
-
fprintf(stderr, " -
|
| 118 |
-
fprintf(stderr, "
|
| 119 |
-
fprintf(stderr, " --
|
| 120 |
-
fprintf(stderr, "
|
| 121 |
-
fprintf(stderr, " -
|
| 122 |
-
fprintf(stderr, " -
|
| 123 |
-
fprintf(stderr, " -
|
| 124 |
-
fprintf(stderr, " -
|
| 125 |
-
fprintf(stderr, " -
|
| 126 |
-
fprintf(stderr, " -
|
| 127 |
-
fprintf(stderr, " -
|
| 128 |
-
fprintf(stderr, " -
|
| 129 |
-
fprintf(stderr, " -
|
| 130 |
-
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
|
| 131 |
-
fprintf(stderr, " -f FNAME, --file FNAME text output file name (default: no output to file)\n");
|
| 132 |
fprintf(stderr, "\n");
|
| 133 |
}
|
| 134 |
|
|
@@ -144,56 +115,51 @@ bool audio_sdl_init(const int capture_id) {
|
|
| 144 |
return false;
|
| 145 |
}
|
| 146 |
|
| 147 |
-
|
| 148 |
-
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
|
| 149 |
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
|
| 155 |
-
|
| 156 |
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
}
|
| 163 |
}
|
| 164 |
}
|
| 165 |
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
SDL_AudioSpec capture_spec_obtained;
|
| 169 |
|
| 170 |
-
|
| 171 |
-
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
}
|
| 195 |
}
|
| 196 |
-
|
| 197 |
|
| 198 |
return true;
|
| 199 |
}
|
|
@@ -207,10 +173,6 @@ int main(int argc, char ** argv) {
|
|
| 207 |
return 1;
|
| 208 |
}
|
| 209 |
|
| 210 |
-
if (params.seed < 0) {
|
| 211 |
-
params.seed = time(NULL);
|
| 212 |
-
}
|
| 213 |
-
|
| 214 |
// init audio
|
| 215 |
|
| 216 |
if (!audio_sdl_init(params.capture_id)) {
|
|
@@ -218,6 +180,12 @@ int main(int argc, char ** argv) {
|
|
| 218 |
return 1;
|
| 219 |
}
|
| 220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
// whisper init
|
| 222 |
|
| 223 |
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
|
@@ -276,16 +244,22 @@ int main(int argc, char ** argv) {
|
|
| 276 |
|
| 277 |
// main audio loop
|
| 278 |
while (is_running) {
|
| 279 |
-
//
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
}
|
| 290 |
}
|
| 291 |
|
|
@@ -327,22 +301,22 @@ int main(int argc, char ** argv) {
|
|
| 327 |
{
|
| 328 |
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 329 |
|
| 330 |
-
wparams.print_progress
|
| 331 |
-
wparams.
|
| 332 |
-
wparams.print_realtime
|
| 333 |
-
wparams.print_timestamps
|
| 334 |
-
wparams.translate
|
| 335 |
-
wparams.no_context
|
| 336 |
-
wparams.single_segment
|
| 337 |
-
wparams.max_tokens
|
| 338 |
-
wparams.language
|
| 339 |
-
wparams.n_threads
|
| 340 |
-
|
| 341 |
-
wparams.audio_ctx
|
| 342 |
-
wparams.speed_up
|
| 343 |
-
|
| 344 |
-
wparams.prompt_tokens
|
| 345 |
-
wparams.prompt_n_tokens
|
| 346 |
|
| 347 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
| 348 |
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
|
|
|
| 4 |
|
| 5 |
#include "whisper.h"
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
#include <SDL.h>
|
| 8 |
#include <SDL_audio.h>
|
| 9 |
|
|
|
|
| 30 |
|
| 31 |
// command-line parameters
|
| 32 |
struct whisper_params {
|
|
|
|
| 33 |
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
| 34 |
int32_t step_ms = 3000;
|
| 35 |
int32_t length_ms = 10000;
|
|
|
|
| 37 |
int32_t max_tokens = 32;
|
| 38 |
int32_t audio_ctx = 0;
|
| 39 |
|
| 40 |
+
bool speed_up = false;
|
| 41 |
+
bool translate = false;
|
| 42 |
+
bool no_context = true;
|
| 43 |
+
bool print_special = false;
|
| 44 |
+
bool no_timestamps = true;
|
| 45 |
|
| 46 |
std::string language = "en";
|
| 47 |
std::string model = "models/ggml-base.en.bin";
|
|
|
|
| 54 |
for (int i = 1; i < argc; i++) {
|
| 55 |
std::string arg = argv[i];
|
| 56 |
|
| 57 |
+
if (arg == "-h" || arg == "--help") {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
whisper_print_usage(argc, argv, params);
|
| 59 |
exit(0);
|
| 60 |
+
}
|
| 61 |
+
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
| 62 |
+
else if ( arg == "--step") { params.step_ms = std::stoi(argv[++i]); }
|
| 63 |
+
else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); }
|
| 64 |
+
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
|
| 65 |
+
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
|
| 66 |
+
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
| 67 |
+
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
| 68 |
+
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
| 69 |
+
else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
|
| 70 |
+
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 71 |
+
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
| 72 |
+
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
| 73 |
+
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
|
| 74 |
+
else {
|
| 75 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 76 |
whisper_print_usage(argc, argv, params);
|
| 77 |
exit(0);
|
|
|
|
| 86 |
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
| 87 |
fprintf(stderr, "\n");
|
| 88 |
fprintf(stderr, "options:\n");
|
| 89 |
+
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
| 90 |
+
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
| 91 |
+
fprintf(stderr, " --step N [%-7d] audio step size in milliseconds\n", params.step_ms);
|
| 92 |
+
fprintf(stderr, " --length N [%-7d] audio length in milliseconds\n", params.length_ms);
|
| 93 |
+
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
|
| 94 |
+
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
|
| 95 |
+
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
| 96 |
+
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
| 97 |
+
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
| 98 |
+
fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true");
|
| 99 |
+
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 100 |
+
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
| 101 |
+
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
| 102 |
+
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
|
|
|
|
|
|
| 103 |
fprintf(stderr, "\n");
|
| 104 |
}
|
| 105 |
|
|
|
|
| 115 |
return false;
|
| 116 |
}
|
| 117 |
|
| 118 |
+
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
|
|
|
|
| 119 |
|
| 120 |
+
if (SDL_Init(SDL_INIT_AUDIO) < 0) {
|
| 121 |
+
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
|
| 122 |
+
return (1);
|
| 123 |
+
}
|
| 124 |
|
| 125 |
+
SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
|
| 126 |
|
| 127 |
+
{
|
| 128 |
+
int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
|
| 129 |
+
fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
|
| 130 |
+
for (int i = 0; i < nDevices; i++) {
|
| 131 |
+
fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
|
|
|
|
| 132 |
}
|
| 133 |
}
|
| 134 |
|
| 135 |
+
SDL_AudioSpec capture_spec_requested;
|
| 136 |
+
SDL_AudioSpec capture_spec_obtained;
|
|
|
|
| 137 |
|
| 138 |
+
SDL_zero(capture_spec_requested);
|
| 139 |
+
SDL_zero(capture_spec_obtained);
|
| 140 |
|
| 141 |
+
capture_spec_requested.freq = WHISPER_SAMPLE_RATE;
|
| 142 |
+
capture_spec_requested.format = AUDIO_F32;
|
| 143 |
+
capture_spec_requested.channels = 1;
|
| 144 |
+
capture_spec_requested.samples = 1024;
|
| 145 |
|
| 146 |
+
if (capture_id >= 0) {
|
| 147 |
+
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
|
| 148 |
+
g_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
| 149 |
+
} else {
|
| 150 |
+
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
|
| 151 |
+
g_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
| 152 |
+
}
|
| 153 |
+
if (!g_dev_id_in) {
|
| 154 |
+
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
|
| 155 |
+
g_dev_id_in = 0;
|
| 156 |
+
} else {
|
| 157 |
+
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, g_dev_id_in);
|
| 158 |
+
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
|
| 159 |
+
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format, capture_spec_requested.format);
|
| 160 |
+
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels, capture_spec_requested.channels);
|
| 161 |
+
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
|
|
|
|
| 162 |
}
|
|
|
|
| 163 |
|
| 164 |
return true;
|
| 165 |
}
|
|
|
|
| 173 |
return 1;
|
| 174 |
}
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
// init audio
|
| 177 |
|
| 178 |
if (!audio_sdl_init(params.capture_id)) {
|
|
|
|
| 180 |
return 1;
|
| 181 |
}
|
| 182 |
|
| 183 |
+
if (whisper_lang_id(params.language.c_str()) == -1) {
|
| 184 |
+
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
| 185 |
+
whisper_print_usage(argc, argv, params);
|
| 186 |
+
exit(0);
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
// whisper init
|
| 190 |
|
| 191 |
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
|
|
|
| 244 |
|
| 245 |
// main audio loop
|
| 246 |
while (is_running) {
|
| 247 |
+
// handle Ctrl + C
|
| 248 |
+
{
|
| 249 |
+
SDL_Event event;
|
| 250 |
+
while (SDL_PollEvent(&event)) {
|
| 251 |
+
switch (event.type) {
|
| 252 |
+
case SDL_QUIT:
|
| 253 |
+
{
|
| 254 |
+
is_running = false;
|
| 255 |
+
} break;
|
| 256 |
+
default:
|
| 257 |
+
break;
|
| 258 |
+
}
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
if (!is_running) {
|
| 262 |
+
break;
|
| 263 |
}
|
| 264 |
}
|
| 265 |
|
|
|
|
| 301 |
{
|
| 302 |
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 303 |
|
| 304 |
+
wparams.print_progress = false;
|
| 305 |
+
wparams.print_special = params.print_special;
|
| 306 |
+
wparams.print_realtime = false;
|
| 307 |
+
wparams.print_timestamps = !params.no_timestamps;
|
| 308 |
+
wparams.translate = params.translate;
|
| 309 |
+
wparams.no_context = true;
|
| 310 |
+
wparams.single_segment = true;
|
| 311 |
+
wparams.max_tokens = params.max_tokens;
|
| 312 |
+
wparams.language = params.language.c_str();
|
| 313 |
+
wparams.n_threads = params.n_threads;
|
| 314 |
+
|
| 315 |
+
wparams.audio_ctx = params.audio_ctx;
|
| 316 |
+
wparams.speed_up = params.speed_up;
|
| 317 |
+
|
| 318 |
+
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
|
| 319 |
+
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
|
| 320 |
|
| 321 |
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
| 322 |
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
examples/talk.wasm/emscripten.cpp
CHANGED
|
@@ -51,15 +51,15 @@ void talk_main(size_t index) {
|
|
| 51 |
|
| 52 |
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
|
| 53 |
|
| 54 |
-
wparams.n_threads
|
| 55 |
-
wparams.offset_ms
|
| 56 |
-
wparams.translate
|
| 57 |
-
wparams.no_context
|
| 58 |
-
wparams.single_segment
|
| 59 |
-
wparams.print_realtime
|
| 60 |
-
wparams.print_progress
|
| 61 |
-
wparams.print_timestamps
|
| 62 |
-
wparams.
|
| 63 |
|
| 64 |
wparams.max_tokens = 32;
|
| 65 |
wparams.audio_ctx = 768; // partial encoder context for better performance
|
|
@@ -75,9 +75,9 @@ void talk_main(size_t index) {
|
|
| 75 |
// whisper context
|
| 76 |
auto & ctx = g_contexts[index];
|
| 77 |
|
| 78 |
-
const int64_t step_samples
|
| 79 |
-
const int64_t step_ms = (step_samples*1000)/WHISPER_SAMPLE_RATE;
|
| 80 |
const int64_t window_samples = 9*WHISPER_SAMPLE_RATE;
|
|
|
|
| 81 |
|
| 82 |
auto t_last = std::chrono::high_resolution_clock::now();
|
| 83 |
|
|
@@ -111,7 +111,7 @@ void talk_main(size_t index) {
|
|
| 111 |
pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
|
| 112 |
}
|
| 113 |
|
| 114 |
-
// if energy in during last second is above threshold, then skip
|
| 115 |
{
|
| 116 |
float energy_all = 0.0f;
|
| 117 |
float energy_1s = 0.0f;
|
|
|
|
| 51 |
|
| 52 |
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
|
| 53 |
|
| 54 |
+
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
|
| 55 |
+
wparams.offset_ms = 0;
|
| 56 |
+
wparams.translate = false;
|
| 57 |
+
wparams.no_context = true;
|
| 58 |
+
wparams.single_segment = true;
|
| 59 |
+
wparams.print_realtime = false;
|
| 60 |
+
wparams.print_progress = false;
|
| 61 |
+
wparams.print_timestamps = true;
|
| 62 |
+
wparams.print_special = false;
|
| 63 |
|
| 64 |
wparams.max_tokens = 32;
|
| 65 |
wparams.audio_ctx = 768; // partial encoder context for better performance
|
|
|
|
| 75 |
// whisper context
|
| 76 |
auto & ctx = g_contexts[index];
|
| 77 |
|
| 78 |
+
const int64_t step_samples = 2*WHISPER_SAMPLE_RATE;
|
|
|
|
| 79 |
const int64_t window_samples = 9*WHISPER_SAMPLE_RATE;
|
| 80 |
+
const int64_t step_ms = (step_samples*1000)/WHISPER_SAMPLE_RATE;
|
| 81 |
|
| 82 |
auto t_last = std::chrono::high_resolution_clock::now();
|
| 83 |
|
|
|
|
| 111 |
pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
|
| 112 |
}
|
| 113 |
|
| 114 |
+
// VAD: if energy during the last second is above threshold, then skip
|
| 115 |
{
|
| 116 |
float energy_all = 0.0f;
|
| 117 |
float energy_1s = 0.0f;
|
examples/whisper.objc/whisper.objc/ViewController.m
CHANGED
|
@@ -161,14 +161,14 @@ void AudioInputCallback(void * inUserData,
|
|
| 161 |
// run the model
|
| 162 |
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 163 |
|
| 164 |
-
params.print_realtime
|
| 165 |
-
params.print_progress
|
| 166 |
-
params.print_timestamps
|
| 167 |
-
params.
|
| 168 |
-
params.translate
|
| 169 |
-
params.language
|
| 170 |
-
params.n_threads
|
| 171 |
-
params.offset_ms
|
| 172 |
|
| 173 |
CFTimeInterval startTime = CACurrentMediaTime();
|
| 174 |
|
|
|
|
| 161 |
// run the model
|
| 162 |
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
| 163 |
|
| 164 |
+
params.print_realtime = true;
|
| 165 |
+
params.print_progress = false;
|
| 166 |
+
params.print_timestamps = true;
|
| 167 |
+
params.print_special = false;
|
| 168 |
+
params.translate = false;
|
| 169 |
+
params.language = "en";
|
| 170 |
+
params.n_threads = 4;
|
| 171 |
+
params.offset_ms = 0;
|
| 172 |
|
| 173 |
CFTimeInterval startTime = CACurrentMediaTime();
|
| 174 |
|
examples/whisper.wasm/CMakeLists.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
set(TARGET whisper.wasm)
|
| 2 |
|
| 3 |
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
|
|
|
|
| 4 |
configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/whisper.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/whisper.js COPYONLY)
|
|
|
|
| 1 |
set(TARGET whisper.wasm)
|
| 2 |
|
| 3 |
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
|
| 4 |
+
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
|
| 5 |
configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/whisper.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/whisper.js COPYONLY)
|
whisper.cpp
CHANGED
|
@@ -2389,92 +2389,92 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
|
|
| 2389 |
case WHISPER_SAMPLING_GREEDY:
|
| 2390 |
{
|
| 2391 |
result = {
|
| 2392 |
-
/*.strategy
|
| 2393 |
|
| 2394 |
-
/*.n_threads
|
| 2395 |
-
/*.n_max_text_ctx
|
| 2396 |
-
/*.offset_ms
|
| 2397 |
-
/*.duration_ms
|
| 2398 |
|
| 2399 |
-
/*.translate
|
| 2400 |
-
/*.no_context
|
| 2401 |
-
/*.single_segment
|
| 2402 |
-
/*.
|
| 2403 |
-
/*.print_progress
|
| 2404 |
-
/*.print_realtime
|
| 2405 |
-
/*.print_timestamps
|
| 2406 |
|
| 2407 |
-
/*.token_timestamps
|
| 2408 |
-
/*.thold_pt
|
| 2409 |
-
/*.thold_ptsum
|
| 2410 |
-
/*.max_len
|
| 2411 |
-
/*.max_tokens
|
| 2412 |
|
| 2413 |
-
/*.speed_up
|
| 2414 |
-
/*.audio_ctx
|
| 2415 |
|
| 2416 |
-
/*.prompt_tokens
|
| 2417 |
-
/*.prompt_n_tokens
|
| 2418 |
|
| 2419 |
-
/*.language
|
| 2420 |
|
| 2421 |
-
/*.greedy
|
| 2422 |
/*.n_past =*/ 0,
|
| 2423 |
},
|
| 2424 |
|
| 2425 |
-
/*.beam_search
|
| 2426 |
/*.n_past =*/ -1,
|
| 2427 |
/*.beam_width =*/ -1,
|
| 2428 |
/*.n_best =*/ -1,
|
| 2429 |
},
|
| 2430 |
|
| 2431 |
-
/*.new_segment_callback
|
| 2432 |
/*.new_segment_callback_user_data =*/ nullptr,
|
| 2433 |
};
|
| 2434 |
} break;
|
| 2435 |
case WHISPER_SAMPLING_BEAM_SEARCH:
|
| 2436 |
{
|
| 2437 |
result = {
|
| 2438 |
-
/*.strategy
|
| 2439 |
|
| 2440 |
-
/*.n_threads
|
| 2441 |
-
/*.n_max_text_ctx
|
| 2442 |
-
/*.offset_ms
|
| 2443 |
-
/*.duration_ms
|
| 2444 |
|
| 2445 |
-
/*.translate
|
| 2446 |
-
/*.no_context
|
| 2447 |
-
/*.single_segment
|
| 2448 |
-
/*.
|
| 2449 |
-
/*.print_progress
|
| 2450 |
-
/*.print_realtime
|
| 2451 |
-
/*.print_timestamps
|
| 2452 |
|
| 2453 |
-
/*.token_timestamps
|
| 2454 |
-
/*.thold_pt
|
| 2455 |
-
/*.thold_ptsum
|
| 2456 |
-
/*.max_len
|
| 2457 |
-
/*.max_tokens
|
| 2458 |
|
| 2459 |
-
/*.speed_up
|
| 2460 |
-
/*.audio_ctx
|
| 2461 |
|
| 2462 |
-
/*.prompt_tokens
|
| 2463 |
-
/*.prompt_n_tokens
|
| 2464 |
|
| 2465 |
-
/*.language
|
| 2466 |
|
| 2467 |
-
/*.greedy
|
| 2468 |
/*.n_past =*/ -1,
|
| 2469 |
},
|
| 2470 |
|
| 2471 |
-
/*.beam_search
|
| 2472 |
/*.n_past =*/ 0,
|
| 2473 |
/*.beam_width =*/ 10,
|
| 2474 |
/*.n_best =*/ 5,
|
| 2475 |
},
|
| 2476 |
|
| 2477 |
-
/*.new_segment_callback
|
| 2478 |
/*.new_segment_callback_user_data =*/ nullptr,
|
| 2479 |
};
|
| 2480 |
} break;
|
|
@@ -2762,7 +2762,7 @@ int whisper_full(
|
|
| 2762 |
// ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
|
| 2763 |
// ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
|
| 2764 |
|
| 2765 |
-
if (params.
|
| 2766 |
} else {
|
| 2767 |
text += whisper_token_to_str(ctx, tokens_cur[i].id);
|
| 2768 |
}
|
|
|
|
| 2389 |
case WHISPER_SAMPLING_GREEDY:
|
| 2390 |
{
|
| 2391 |
result = {
|
| 2392 |
+
/*.strategy =*/ WHISPER_SAMPLING_GREEDY,
|
| 2393 |
|
| 2394 |
+
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
|
| 2395 |
+
/*.n_max_text_ctx =*/ 16384,
|
| 2396 |
+
/*.offset_ms =*/ 0,
|
| 2397 |
+
/*.duration_ms =*/ 0,
|
| 2398 |
|
| 2399 |
+
/*.translate =*/ false,
|
| 2400 |
+
/*.no_context =*/ false,
|
| 2401 |
+
/*.single_segment =*/ false,
|
| 2402 |
+
/*.print_special =*/ false,
|
| 2403 |
+
/*.print_progress =*/ true,
|
| 2404 |
+
/*.print_realtime =*/ false,
|
| 2405 |
+
/*.print_timestamps =*/ true,
|
| 2406 |
|
| 2407 |
+
/*.token_timestamps =*/ false,
|
| 2408 |
+
/*.thold_pt =*/ 0.01f,
|
| 2409 |
+
/*.thold_ptsum =*/ 0.01f,
|
| 2410 |
+
/*.max_len =*/ 0,
|
| 2411 |
+
/*.max_tokens =*/ 0,
|
| 2412 |
|
| 2413 |
+
/*.speed_up =*/ false,
|
| 2414 |
+
/*.audio_ctx =*/ 0,
|
| 2415 |
|
| 2416 |
+
/*.prompt_tokens =*/ nullptr,
|
| 2417 |
+
/*.prompt_n_tokens =*/ 0,
|
| 2418 |
|
| 2419 |
+
/*.language =*/ "en",
|
| 2420 |
|
| 2421 |
+
/*.greedy =*/ {
|
| 2422 |
/*.n_past =*/ 0,
|
| 2423 |
},
|
| 2424 |
|
| 2425 |
+
/*.beam_search =*/ {
|
| 2426 |
/*.n_past =*/ -1,
|
| 2427 |
/*.beam_width =*/ -1,
|
| 2428 |
/*.n_best =*/ -1,
|
| 2429 |
},
|
| 2430 |
|
| 2431 |
+
/*.new_segment_callback =*/ nullptr,
|
| 2432 |
/*.new_segment_callback_user_data =*/ nullptr,
|
| 2433 |
};
|
| 2434 |
} break;
|
| 2435 |
case WHISPER_SAMPLING_BEAM_SEARCH:
|
| 2436 |
{
|
| 2437 |
result = {
|
| 2438 |
+
/*.strategy =*/ WHISPER_SAMPLING_BEAM_SEARCH,
|
| 2439 |
|
| 2440 |
+
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
|
| 2441 |
+
/*.n_max_text_ctx =*/ 16384,
|
| 2442 |
+
/*.offset_ms =*/ 0,
|
| 2443 |
+
/*.duration_ms =*/ 0,
|
| 2444 |
|
| 2445 |
+
/*.translate =*/ false,
|
| 2446 |
+
/*.no_context =*/ false,
|
| 2447 |
+
/*.single_segment =*/ false,
|
| 2448 |
+
/*.print_special =*/ false,
|
| 2449 |
+
/*.print_progress =*/ true,
|
| 2450 |
+
/*.print_realtime =*/ false,
|
| 2451 |
+
/*.print_timestamps =*/ true,
|
| 2452 |
|
| 2453 |
+
/*.token_timestamps =*/ false,
|
| 2454 |
+
/*.thold_pt =*/ 0.01f,
|
| 2455 |
+
/*.thold_ptsum =*/ 0.01f,
|
| 2456 |
+
/*.max_len =*/ 0,
|
| 2457 |
+
/*.max_tokens =*/ 0,
|
| 2458 |
|
| 2459 |
+
/*.speed_up =*/ false,
|
| 2460 |
+
/*.audio_ctx =*/ 0,
|
| 2461 |
|
| 2462 |
+
/*.prompt_tokens =*/ nullptr,
|
| 2463 |
+
/*.prompt_n_tokens =*/ 0,
|
| 2464 |
|
| 2465 |
+
/*.language =*/ "en",
|
| 2466 |
|
| 2467 |
+
/*.greedy =*/ {
|
| 2468 |
/*.n_past =*/ -1,
|
| 2469 |
},
|
| 2470 |
|
| 2471 |
+
/*.beam_search =*/ {
|
| 2472 |
/*.n_past =*/ 0,
|
| 2473 |
/*.beam_width =*/ 10,
|
| 2474 |
/*.n_best =*/ 5,
|
| 2475 |
},
|
| 2476 |
|
| 2477 |
+
/*.new_segment_callback =*/ nullptr,
|
| 2478 |
/*.new_segment_callback_user_data =*/ nullptr,
|
| 2479 |
};
|
| 2480 |
} break;
|
|
|
|
| 2762 |
// ctx->vocab.id_to_token[tokens_cur[i].id].c_str(), tokens_cur[i].p,
|
| 2763 |
// ctx->vocab.id_to_token[tokens_cur[i].tid].c_str(), tokens_cur[i].pt);
|
| 2764 |
|
| 2765 |
+
if (params.print_special == false && tokens_cur[i].id >= whisper_token_eot(ctx)) {
|
| 2766 |
} else {
|
| 2767 |
text += whisper_token_to_str(ctx, tokens_cur[i].id);
|
| 2768 |
}
|
whisper.h
CHANGED
|
@@ -192,7 +192,7 @@ extern "C" {
|
|
| 192 |
bool translate;
|
| 193 |
bool no_context;
|
| 194 |
bool single_segment; // force single segment output (useful for streaming)
|
| 195 |
-
bool
|
| 196 |
bool print_progress;
|
| 197 |
bool print_realtime;
|
| 198 |
bool print_timestamps;
|
|
|
|
| 192 |
bool translate;
|
| 193 |
bool no_context;
|
| 194 |
bool single_segment; // force single segment output (useful for streaming)
|
| 195 |
+
bool print_special;
|
| 196 |
bool print_progress;
|
| 197 |
bool print_realtime;
|
| 198 |
bool print_timestamps;
|