Dmitry Atamanov committed on
Commit
7a280a4
·
unverified ·
1 Parent(s): 45399ad

examples : use miniaudio for direct decoding flac, mp3, ogg and wav (#2759)

Browse files
Makefile CHANGED
@@ -18,17 +18,6 @@ samples:
18
  @wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
19
  @wget --quiet --show-progress -O samples/a13.mp3 https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3
20
  @wget --quiet --show-progress -O samples/diffusion2023-07-03.flac https://archive.org/download/diffusion2023-07-03/diffusion2023-07-03.flac
21
- @echo "Converting to 16-bit WAV ..."
22
- @ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
23
- @ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
24
- @ffmpeg -loglevel -0 -y -i samples/hp0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/hp0.wav
25
- @rm samples/*.ogg
26
- @ffmpeg -loglevel -0 -y -i samples/mm1.wav -ar 16000 -ac 1 -c:a pcm_s16le samples/mm0.wav
27
- @rm samples/mm1.wav
28
- @ffmpeg -loglevel -0 -y -i samples/a13.mp3 -ar 16000 -ac 1 -c:a pcm_s16le -ss 00:00:00 -to 00:00:30 samples/a13.wav
29
- @rm samples/a13.mp3
30
- @ffmpeg -loglevel -0 -y -i samples/diffusion2023-07-03.flac -ar 16000 -ac 1 -c:a pcm_s16le samples/diffusion2023-07-03.wav
31
- @rm samples/diffusion2023-07-03.flac
32
 
33
  #
34
  # Models
@@ -59,7 +48,7 @@ tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 larg
59
  @echo "Running $@ on all samples in ./samples ..."
60
  @echo "==============================================="
61
  @echo ""
62
- @for f in samples/*.wav; do \
63
  echo "----------------------------------------------" ; \
64
  echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
65
  echo "----------------------------------------------" ; \
 
18
  @wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
19
  @wget --quiet --show-progress -O samples/a13.mp3 https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3
20
  @wget --quiet --show-progress -O samples/diffusion2023-07-03.flac https://archive.org/download/diffusion2023-07-03/diffusion2023-07-03.flac
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  #
23
  # Models
 
48
  @echo "Running $@ on all samples in ./samples ..."
49
  @echo "==============================================="
50
  @echo ""
51
+ @for f in samples/*$(.flac .mp3 .ogg .wav); do \
52
  echo "----------------------------------------------" ; \
53
  echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
54
  echo "----------------------------------------------" ; \
examples/addon.node/addon.cpp CHANGED
@@ -171,8 +171,8 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
171
 
172
  // read the input audio file if params.pcmf32 is not provided
173
  if (params.pcmf32.empty()) {
174
- if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
175
- fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
176
  continue;
177
  }
178
  } else {
 
171
 
172
  // read the input audio file if params.pcmf32 is not provided
173
  if (params.pcmf32.empty()) {
174
+ if (!::read_audio_data(fname_inp, pcmf32, pcmf32s, params.diarize)) {
175
+ fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
176
  continue;
177
  }
178
  } else {
examples/cli/cli.cpp CHANGED
@@ -199,7 +199,8 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
199
 
200
  static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
201
  fprintf(stderr, "\n");
202
- fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
 
203
  fprintf(stderr, "\n");
204
  fprintf(stderr, "options:\n");
205
  fprintf(stderr, " -h, --help [default] show this help message and exit\n");
@@ -244,7 +245,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
244
  fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false");
245
  fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt (max n_text_ctx/2 tokens)\n", params.prompt.c_str());
246
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
247
- fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
248
  fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
249
  fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
250
  fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
@@ -1069,8 +1070,8 @@ int main(int argc, char ** argv) {
1069
  std::vector<float> pcmf32; // mono-channel F32 PCM
1070
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
1071
 
1072
- if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
1073
- fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
1074
  continue;
1075
  }
1076
 
 
199
 
200
  static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
201
  fprintf(stderr, "\n");
202
+ fprintf(stderr, "usage: %s [options] file0 file1 ...\n", argv[0]);
203
+ fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n");
204
  fprintf(stderr, "\n");
205
  fprintf(stderr, "options:\n");
206
  fprintf(stderr, " -h, --help [default] show this help message and exit\n");
 
245
  fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false");
246
  fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt (max n_text_ctx/2 tokens)\n", params.prompt.c_str());
247
  fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
248
+ fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input audio file path\n", "");
249
  fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
250
  fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
251
  fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
 
1070
  std::vector<float> pcmf32; // mono-channel F32 PCM
1071
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
1072
 
1073
+ if (!::read_audio_data(fname_inp, pcmf32, pcmf32s, params.diarize)) {
1074
+ fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
1075
  continue;
1076
  }
1077
 
examples/common.cpp CHANGED
@@ -4,8 +4,17 @@
4
 
5
  // third-party utilities
6
  // use your favorite implementations
7
- #define DR_WAV_IMPLEMENTATION
8
- #include "dr_wav.h"
 
 
 
 
 
 
 
 
 
9
 
10
  #include <cmath>
11
  #include <cstring>
@@ -639,111 +648,95 @@ bool is_wav_buffer(const std::string buf) {
639
  return true;
640
  }
641
 
642
- bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
643
- drwav wav;
644
- std::vector<uint8_t> wav_data; // used for pipe input from stdin or ffmpeg decoding output
 
 
 
 
 
645
 
646
  if (fname == "-") {
647
- {
648
- #ifdef _WIN32
649
- _setmode(_fileno(stdin), _O_BINARY);
650
- #endif
651
-
652
- uint8_t buf[1024];
653
- while (true)
654
- {
655
- const size_t n = fread(buf, 1, sizeof(buf), stdin);
656
- if (n == 0) {
657
- break;
658
- }
659
- wav_data.insert(wav_data.end(), buf, buf + n);
660
- }
661
- }
662
 
663
- if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
664
- fprintf(stderr, "error: failed to open WAV file from stdin\n");
665
- return false;
666
- }
 
 
 
 
 
 
 
 
 
667
 
668
- fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
 
 
 
669
  }
670
  else if (is_wav_buffer(fname)) {
671
- if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
672
- fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
673
- return false;
674
- }
 
675
  }
676
- else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
677
  #if defined(WHISPER_FFMPEG)
678
- if (ffmpeg_decode_audio(fname, wav_data) != 0) {
679
- fprintf(stderr, "error: failed to ffmpeg decode '%s' \n", fname.c_str());
680
- return false;
681
- }
682
- if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
683
- fprintf(stderr, "error: failed to read wav data as wav \n");
684
- return false;
685
- }
 
 
 
686
  #else
687
- fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
688
- return false;
689
- #endif
690
- }
691
 
692
- if (wav.channels != 1 && wav.channels != 2) {
693
- fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
694
- drwav_uninit(&wav);
695
- return false;
696
  }
697
 
698
- if (stereo && wav.channels != 2) {
699
- fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
700
- drwav_uninit(&wav);
701
- return false;
702
- }
703
 
704
- if (wav.sampleRate != COMMON_SAMPLE_RATE) {
705
- fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
706
- drwav_uninit(&wav);
707
- return false;
708
- }
709
 
710
- if (wav.bitsPerSample != 16) {
711
- fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
712
- drwav_uninit(&wav);
713
- return false;
714
  }
715
 
716
- const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8);
717
 
718
- std::vector<int16_t> pcm16;
719
- pcm16.resize(n*wav.channels);
720
- drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
721
- drwav_uninit(&wav);
722
 
723
- // convert to mono, float
724
- pcmf32.resize(n);
725
- if (wav.channels == 1) {
726
- for (uint64_t i = 0; i < n; i++) {
727
- pcmf32[i] = float(pcm16[i])/32768.0f;
728
- }
729
- } else {
730
- for (uint64_t i = 0; i < n; i++) {
731
- pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
732
- }
733
  }
734
 
735
  if (stereo) {
736
- // convert to stereo, float
737
- pcmf32s.resize(2);
738
-
739
- pcmf32s[0].resize(n);
740
- pcmf32s[1].resize(n);
741
- for (uint64_t i = 0; i < n; i++) {
742
- pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
743
- pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
744
- }
745
  }
746
 
 
 
747
  return true;
748
  }
749
 
@@ -909,3 +902,6 @@ bool speak_with_file(const std::string & command, const std::string & text, cons
909
  }
910
  return true;
911
  }
 
 
 
 
4
 
5
  // third-party utilities
6
  // use your favorite implementations
7
+ #define STB_VORBIS_HEADER_ONLY
8
+ #include "stb_vorbis.c" /* Enables Vorbis decoding. */
9
+
10
+ #define MA_NO_DEVICE_IO
11
+ #define MA_NO_THREADING
12
+ #define MA_NO_ENCODING
13
+ #define MA_NO_GENERATION
14
+ #define MA_NO_RESOURCE_MANAGER
15
+ #define MA_NO_NODE_GRAPH
16
+ #define MINIAUDIO_IMPLEMENTATION
17
+ #include "miniaudio.h"
18
 
19
  #include <cmath>
20
  #include <cstring>
 
648
  return true;
649
  }
650
 
651
+ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
652
+ std::vector<uint8_t> audio_data; // used for pipe input from stdin or ffmpeg decoding output
653
+
654
+ ma_result result;
655
+ ma_decoder_config decoder_config;
656
+ ma_decoder decoder;
657
+
658
+ decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, COMMON_SAMPLE_RATE);
659
 
660
  if (fname == "-") {
661
+ #ifdef _WIN32
662
+ _setmode(_fileno(stdin), _O_BINARY);
663
+ #endif
 
 
 
 
 
 
 
 
 
 
 
 
664
 
665
+ uint8_t buf[1024];
666
+ while (true)
667
+ {
668
+ const size_t n = fread(buf, 1, sizeof(buf), stdin);
669
+ if (n == 0) {
670
+ break;
671
+ }
672
+ audio_data.insert(audio_data.end(), buf, buf + n);
673
+ }
674
+
675
+ if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) {
676
+
677
+ fprintf(stderr, "Error: failed to open audio data from stdin (%s)\n", ma_result_description(result));
678
 
679
+ return false;
680
+ }
681
+
682
+ fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, audio_data.size());
683
  }
684
  else if (is_wav_buffer(fname)) {
685
+ if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) {
686
+ fprintf(stderr, "Error: failed to open audio data from fname buffer (%s)\n", ma_result_description(result));
687
+
688
+ return false;
689
+ }
690
  }
691
+ else if ((result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder)) != MA_SUCCESS) {
692
  #if defined(WHISPER_FFMPEG)
693
+ if (ffmpeg_decode_audio(fname, audio_data) != 0) {
694
+ fprintf(stderr, "error: failed to ffmpeg decode '%s'\n", fname.c_str());
695
+
696
+ return false;
697
+ }
698
+
699
+ if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) {
700
+ fprintf(stderr, "error: failed to read audio data as wav (%s)\n", ma_result_description(result));
701
+
702
+ return false;
703
+ }
704
  #else
705
+ fprintf(stderr, "error: failed to open '%s' file (%s)\n", fname.c_str(), ma_result_description(result));
 
 
 
706
 
707
+ return false;
708
+ #endif
 
 
709
  }
710
 
711
+ ma_uint64 frame_count;
712
+ ma_uint64 frames_read;
 
 
 
713
 
714
+ if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
715
+ fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));
 
 
 
716
 
717
+ return false;
 
 
 
718
  }
719
 
720
+ pcmf32.resize(stereo ? frame_count*2 : frame_count);
721
 
722
+ if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
723
+ fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));
 
 
724
 
725
+ return false;
 
 
 
 
 
 
 
 
 
726
  }
727
 
728
  if (stereo) {
729
+ pcmf32s.resize(2);
730
+ pcmf32s[0].resize(frame_count);
731
+ pcmf32s[1].resize(frame_count);
732
+ for (uint64_t i = 0; i < frame_count; i++) {
733
+ pcmf32s[0][i] = pcmf32[2*i];
734
+ pcmf32s[1][i] = pcmf32[2*i + 1];
735
+ }
 
 
736
  }
737
 
738
+ ma_decoder_uninit(&decoder);
739
+
740
  return true;
741
  }
742
 
 
902
  }
903
  return true;
904
  }
905
+
906
+ #undef STB_VORBIS_HEADER_ONLY
907
+ #include "stb_vorbis.c"
examples/common.h CHANGED
@@ -143,7 +143,7 @@ bool is_wav_buffer(const std::string buf);
143
  // fname can be a buffer of WAV data instead of a filename
144
  // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
145
  // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
146
- bool read_wav(
147
  const std::string & fname,
148
  std::vector<float> & pcmf32,
149
  std::vector<std::vector<float>> & pcmf32s,
 
143
  // fname can be a buffer of WAV data instead of a filename
144
  // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
145
  // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
146
+ bool read_audio_data(
147
  const std::string & fname,
148
  std::vector<float> & pcmf32,
149
  std::vector<std::vector<float>> & pcmf32s,
examples/dr_wav.h DELETED
The diff for this file is too large to render. See raw diff
 
examples/generate-karaoke.sh CHANGED
@@ -41,20 +41,17 @@ fi
41
  # record some raw audio
42
  sox -d rec.wav
43
 
44
- # resample to 16kHz
45
- ffmpeg -y -i ./rec.wav -ar 16000 -ac 1 -c:a pcm_s16le ./rec16.wav > /dev/null 2>&1
46
-
47
  # run Whisper
48
  echo "Processing ..."
49
- ${executable} -m models/ggml-base.en.bin rec16.wav -owts > /dev/null 2>&1
50
 
51
  # generate Karaoke video
52
  echo "Generating video ..."
53
- source rec16.wav.wts > /dev/null 2>&1
54
 
55
  # play the video
56
  echo "Playing ./rec16.wav.mp4 ..."
57
- ffplay -loglevel 0 -autoexit ./rec16.wav.mp4
58
 
59
  echo "Done"
60
  exit 0
 
41
  # record some raw audio
42
  sox -d rec.wav
43
 
 
 
 
44
  # run Whisper
45
  echo "Processing ..."
46
+ ${executable} -m models/ggml-base.en.bin rec.wav -owts > /dev/null 2>&1
47
 
48
  # generate Karaoke video
49
  echo "Generating video ..."
50
+ source rec.wav.wts > /dev/null 2>&1
51
 
52
  # play the video
53
  echo "Playing ./rec16.wav.mp4 ..."
54
+ ffplay -loglevel 0 -autoexit ./rec.wav.mp4
55
 
56
  echo "Done"
57
  exit 0
examples/miniaudio.h ADDED
The diff for this file is too large to render. See raw diff
 
examples/server/server.cpp CHANGED
@@ -722,8 +722,8 @@ int main(int argc, char ** argv) {
722
  return;
723
  }
724
 
725
- // read wav content into pcmf32
726
- if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
727
  {
728
  fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
729
  const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
@@ -734,10 +734,10 @@ int main(int argc, char ** argv) {
734
  // remove temp file
735
  std::remove(temp_filename.c_str());
736
  } else {
737
- if (!::read_wav(audio_file.content, pcmf32, pcmf32s, params.diarize))
738
  {
739
- fprintf(stderr, "error: failed to read WAV file\n");
740
- const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
741
  res.set_content(error_resp, "application/json");
742
  return;
743
  }
 
722
  return;
723
  }
724
 
725
+ // read audio content into pcmf32
726
+ if (!::read_audio_data(temp_filename, pcmf32, pcmf32s, params.diarize))
727
  {
728
  fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
729
  const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
 
734
  // remove temp file
735
  std::remove(temp_filename.c_str());
736
  } else {
737
+ if (!::read_audio_data(audio_file.content, pcmf32, pcmf32s, params.diarize))
738
  {
739
+ fprintf(stderr, "error: failed to read audio data\n");
740
+ const std::string error_resp = "{\"error\":\"failed to read audio data\"}";
741
  res.set_content(error_resp, "application/json");
742
  return;
743
  }
examples/stb_vorbis.c ADDED
The diff for this file is too large to render. See raw diff