Spaces:

natasa365
/

whisper.cpp

Running

App Files Files Community

KitaitiMakoto committed on Jun 4

Commit

acad667

unverified ·

1 Parent(s): 17ba7f5

ruby : Add parallel transcription support (#3222)

Browse files

* Fix indentation of code sample in document comment

* Make Whisper::Context#transcribe able to run non-parallel

* Add test for Whisper::Context#transcribe with parallel option

* Follow signature API change of Context#transcribe

* Remove useless variable assignment

* Move simple usage up in README

* Add need help section in README

* Add document on Context#transcribe's parallel option in README

* Update date

* Fix signature of Context.new

* Make Context#transcribe accept n_processors option

* Make test follow #transcribe's change

* Make RBS follow #transcribe's change

* Add document for #transcribe's n_processors option

* Rename test directory so that Rake tasks' default setting is used

Files changed (20) hide show

bindings/ruby/README.md +28 -11
bindings/ruby/Rakefile +5 -7
bindings/ruby/ext/ruby_whisper.c +2 -0
bindings/ruby/ext/ruby_whisper_context.c +5 -0
bindings/ruby/ext/ruby_whisper_transcribe.cpp +10 -5
bindings/ruby/sig/whisper.rbs +37 -37
bindings/ruby/{tests → test}/helper.rb +0 -0
bindings/ruby/{tests → test}/jfk_reader/.gitignore +0 -0
bindings/ruby/{tests → test}/jfk_reader/extconf.rb +0 -0
bindings/ruby/{tests → test}/jfk_reader/jfk_reader.c +0 -0
bindings/ruby/{tests → test}/test_callback.rb +0 -0
bindings/ruby/{tests → test}/test_error.rb +0 -0
bindings/ruby/{tests → test}/test_model.rb +0 -0
bindings/ruby/{tests → test}/test_package.rb +0 -0
bindings/ruby/{tests → test}/test_params.rb +0 -0
bindings/ruby/{tests → test}/test_segment.rb +0 -0
bindings/ruby/{tests → test}/test_vad.rb +0 -0
bindings/ruby/{tests → test}/test_vad_params.rb +0 -0
bindings/ruby/{tests → test}/test_whisper.rb +18 -0
bindings/ruby/whispercpp.gemspec +2 -2

bindings/ruby/README.md CHANGED Viewed

@@ -70,17 +70,6 @@ end
 Some models are prepared up-front:
-```ruby
-base_en = Whisper::Model.pre_converted_models["base.en"]
-whisper = Whisper::Context.new(base_en)
-```
-At first time you use a model, it is downloaded automatically. After that, downloaded cached file is used. To clear cache, call `#clear_cache`:
-```ruby
-Whisper::Model.pre_converted_models["base"].clear_cache
-```
 You also can use shorthand for pre-converted models:
 ```ruby
@@ -105,6 +94,19 @@ puts Whisper::Model.pre_converted_models.keys
 #   :
 ```
 You can also use local model files you prepared:
 ```ruby
@@ -163,6 +165,16 @@ For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisp
 API
 ---
 ### Segments ###
 Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`:
@@ -297,6 +309,11 @@ First call of `rake test` builds an extension and downloads a model for testing.
 If something seems wrong on build, running `rake clean` solves some cases.
 License
 -------

 Some models are prepared up-front:
 You also can use shorthand for pre-converted models:
 ```ruby
 #   :
 ```
+You can also retrieve each model:
+```ruby
+base_en = Whisper::Model.pre_converted_models["base.en"]
+whisper = Whisper::Context.new(base_en)
+```
+The first time you use a model, it is downloaded automatically. After that, the cached file is used. To clear the cache, call `#clear_cache`:
+```ruby
+Whisper::Model.pre_converted_models["base"].clear_cache
+```
 You can also use local model files you prepared:
 ```ruby
 API
 ---
+### Transcription ###
+By default, `Whisper::Context#transcribe` works in a single thread. You can make it work in parallel by passing the `n_processors` option:
+```ruby
+whisper.transcribe("path/to/audio.wav", params, n_processors: Etc.nprocessors)
+```
+Note that transcription accuracy may occasionally be lower when it runs in parallel.
 ### Segments ###
 Once `Whisper::Context#transcribe` is called, you can retrieve segments by `#each_segment`:
 If something seems wrong on build, running `rake clean` solves some cases.
+### Need help ###
+* Windows support
+* Refinement of C/C++ code, especially memory management
 License
 -------

bindings/ruby/Rakefile CHANGED Viewed

@@ -67,17 +67,15 @@ file LIB_FILE => [SO_FILE, "lib"] do |t|
 end
 CLEAN.include LIB_FILE
-Rake::TestTask.new do |t|
-  t.test_files = FileList["tests/test_*.rb"]
-end
-TEST_MEMORY_VIEW = "tests/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
-file TEST_MEMORY_VIEW => "tests/jfk_reader/jfk_reader.c" do |t|
-  chdir "tests/jfk_reader" do
     ruby "extconf.rb"
     sh "make"
   end
 end
-CLEAN.include "tests/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"
 task test: [LIB_FILE, TEST_MEMORY_VIEW]

 end
 CLEAN.include LIB_FILE
+Rake::TestTask.new
+TEST_MEMORY_VIEW = "test/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
+file TEST_MEMORY_VIEW => "test/jfk_reader/jfk_reader.c" do |t|
+  chdir "test/jfk_reader" do
     ruby "extconf.rb"
     sh "make"
   end
 end
+CLEAN.include "test/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"
 task test: [LIB_FILE, TEST_MEMORY_VIEW]

bindings/ruby/ext/ruby_whisper.c CHANGED Viewed

@@ -24,6 +24,7 @@ ID id_URI;
 ID id_pre_converted_models;
 ID id_coreml_compiled_models;
 ID id_cache;
 static bool is_log_callback_finalized = false;
@@ -142,6 +143,7 @@ void Init_whisper() {
   id_pre_converted_models = rb_intern("pre_converted_models");
   id_coreml_compiled_models = rb_intern("coreml_compiled_models");
   id_cache = rb_intern("cache");
   mWhisper = rb_define_module("Whisper");
   mVAD = rb_define_module_under(mWhisper, "VAD");

 ID id_pre_converted_models;
 ID id_coreml_compiled_models;
 ID id_cache;
+ID id_n_processors;
 static bool is_log_callback_finalized = false;
   id_pre_converted_models = rb_intern("pre_converted_models");
   id_coreml_compiled_models = rb_intern("coreml_compiled_models");
   id_cache = rb_intern("cache");
+  id_n_processors = rb_intern("n_processors");
   mWhisper = rb_define_module("Whisper");
   mVAD = rb_define_module_under(mWhisper, "VAD");

bindings/ruby/ext/ruby_whisper_context.c CHANGED Viewed

@@ -13,6 +13,7 @@ extern ID id_URI;
 extern ID id_pre_converted_models;
 extern ID id_coreml_compiled_models;
 extern ID id_cache;
 extern VALUE cContext;
 extern VALUE eError;
@@ -24,6 +25,8 @@ extern VALUE rb_whisper_model_s_new(VALUE context);
 extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
 extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
 static void
 ruby_whisper_free(ruby_whisper *rw)
 {
@@ -633,6 +636,8 @@ init_ruby_whisper_context(VALUE *mWhisper)
 {
   cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);
   rb_define_alloc_func(cContext, ruby_whisper_allocate);
   rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);

 extern ID id_pre_converted_models;
 extern ID id_coreml_compiled_models;
 extern ID id_cache;
+extern ID id_n_processors;
 extern VALUE cContext;
 extern VALUE eError;
 extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
 extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
+ID transcribe_option_names[1];
 static void
 ruby_whisper_free(ruby_whisper *rw)
 {
 {
   cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);
+  transcribe_option_names[0] = id_n_processors;
   rb_define_alloc_func(cContext, ruby_whisper_allocate);
   rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);

bindings/ruby/ext/ruby_whisper_transcribe.cpp CHANGED Viewed

@@ -13,6 +13,7 @@ extern const rb_data_type_t ruby_whisper_params_type;
 extern ID id_to_s;
 extern ID id_call;
 extern void
 prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
@@ -34,9 +35,14 @@ VALUE
 ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
   ruby_whisper *rw;
   ruby_whisper_params *rwp;
-  VALUE wave_file_path, blk, params;
-  rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
   TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
   TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
@@ -66,7 +72,7 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
   prepare_transcription(rwp, &self);
-  if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
     fprintf(stderr, "failed to process audio\n");
     return self;
   }
@@ -76,9 +82,8 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
     const char * text = whisper_full_get_segment_text(rw->context, i);
     output = rb_str_concat(output, rb_str_new2(text));
   }
-  VALUE idCall = id_call;
   if (blk != Qnil) {
-    rb_funcall(blk, idCall, 1, output);
   }
   return self;
 }

 extern ID id_to_s;
 extern ID id_call;
+extern ID transcribe_option_names[1];
 extern void
 prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
 ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
   ruby_whisper *rw;
   ruby_whisper_params *rwp;
+  VALUE wave_file_path, blk, params, kws;
+  VALUE opts[1];
+  rb_scan_args_kw(RB_SCAN_ARGS_LAST_HASH_KEYWORDS, argc, argv, "2:&", &wave_file_path, &params, &kws, &blk);
+  rb_get_kwargs(kws, transcribe_option_names, 0, 1, opts);
+  int n_processors = opts[0] == Qundef ? 1 : NUM2INT(opts[0]);
   TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
   TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
   prepare_transcription(rwp, &self);
+  if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) {
     fprintf(stderr, "failed to process audio\n");
     return self;
   }
     const char * text = whisper_full_get_segment_text(rw->context, i);
     output = rb_str_concat(output, rb_str_new2(text));
   }
   if (blk != Qnil) {
+    rb_funcall(blk, id_call, 1, output);
   }
   return self;
 }

bindings/ruby/sig/whisper.rbs CHANGED Viewed

@@ -25,19 +25,19 @@ module Whisper
   def self.system_info_str: () -> String
   class Context
-    def self.new: (path | ::URI::HTTP) -> instance
     # transcribe a single file
     # can emit to a block results
     #
-    #   params = Whisper::Params.new
-    #   params.duration = 60_000
-    #   whisper.transcribe "path/to/audio.wav", params do |text|
-    #     puts text
-    #   end
     #
-    def transcribe: (string, Params) -> self
-                  | (string, Params) { (String) -> void } -> self
     def model_n_vocab: () -> Integer
     def model_n_audio_ctx: () -> Integer
@@ -50,16 +50,16 @@ module Whisper
     # Yields each Whisper::Segment:
     #
-    #   whisper.transcribe("path/to/audio.wav", params)
-    #   whisper.each_segment do |segment|
-    #     puts segment.text
-    #   end
     #
     # Returns an Enumerator if no block given:
     #
-    #   whisper.transcribe("path/to/audio.wav", params)
-    #   enum = whisper.each_segment
-    #   enum.to_a # => [#<Whisper::Segment>, ...]
     #
     def each_segment: { (Segment) -> void } -> void
                     | () -> Enumerator[Segment]
@@ -74,25 +74,25 @@ module Whisper
     # Start time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
     #
-    #   full_get_segment_t0(3) # => 1668 (16680 ms)
     #
     def full_get_segment_t0: (Integer) -> Integer
     # End time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
     #
-    #   full_get_segment_t1(3) # => 1668 (16680 ms)
     #
     def full_get_segment_t1: (Integer) -> Integer
     # Whether the next segment indexed by +segment_index+ is predicated as a speaker turn.
     #
-    #   full_get_segment_speacker_turn_next(3) # => true
     #
     def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
     # Text of a segment indexed by +segment_index+.
     #
-    #   full_get_segment_text(3) # => "ask not what your country can do for you, ..."
     #
     def full_get_segment_text: (Integer) -> String
@@ -282,9 +282,9 @@ module Whisper
     # Sets new segment callback, called for every newly generated text segment.
     #
-    #   params.new_segment_callback = ->(context, _, n_new, user_data) {
-    #     # ...
-    #   }
     #
     def new_segment_callback=: (new_segment_callback) -> new_segment_callback
     def new_segment_callback: () -> (new_segment_callback | nil)
@@ -297,9 +297,9 @@ module Whisper
     # Sets progress callback, called on each progress update.
     #
-    #   params.new_segment_callback = ->(context, _, progress, user_data) {
-    #     # ...
-    #   }
     #
     # +progress+ is an Integer between 0 and 100.
     #
@@ -327,9 +327,9 @@ module Whisper
     # Sets abort callback, called to check if the process should be aborted.
     #
-    #   params.abort_callback = ->(user_data) {
-    #     # ...
-    #   }
     #
     #
     def abort_callback=: (abort_callback) -> abort_callback
@@ -358,9 +358,9 @@ module Whisper
     # Hook called on new segment. Yields each Whisper::Segment.
     #
-    #   whisper.on_new_segment do |segment|
-    #     # ...
-    #   end
     #
     def on_new_segment: { (Segment) -> void } -> void
@@ -374,13 +374,13 @@ module Whisper
     # Call block to determine whether abort or not. Return +true+ when you want to abort.
     #
-    #   params.abort_on do
-    #     if some_condition
-    #       true # abort
-    #     else
-    #       false # continue
     #     end
-    #   end
     #
     def abort_on: { (Object user_data) -> boolish } -> void
   end

   def self.system_info_str: () -> String
   class Context
+    def self.new: (String | path | ::URI::HTTP) -> instance
     # Transcribes a single file.
     # Yields the resulting text to the block, if one is given.
     #
+    #     params = Whisper::Params.new
+    #     params.duration = 60_000
+    #     whisper.transcribe "path/to/audio.wav", params do |text|
+    #       puts text
+    #     end
     #
+    def transcribe: (string, Params, ?n_processors: Integer) -> self
+                  | (string, Params, ?n_processors: Integer) { (String) -> void } -> self
     def model_n_vocab: () -> Integer
     def model_n_audio_ctx: () -> Integer
     # Yields each Whisper::Segment:
     #
+    #     whisper.transcribe("path/to/audio.wav", params)
+    #     whisper.each_segment do |segment|
+    #       puts segment.text
+    #     end
     #
     # Returns an Enumerator if no block given:
     #
+    #     whisper.transcribe("path/to/audio.wav", params)
+    #     enum = whisper.each_segment
+    #     enum.to_a # => [#<Whisper::Segment>, ...]
     #
     def each_segment: { (Segment) -> void } -> void
                     | () -> Enumerator[Segment]
     # Start time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
     #
+    #     full_get_segment_t0(3) # => 1668 (16680 ms)
     #
     def full_get_segment_t0: (Integer) -> Integer
     # End time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
     #
+    #     full_get_segment_t1(3) # => 1668 (16680 ms)
     #
     def full_get_segment_t1: (Integer) -> Integer
     # Whether the next segment indexed by +segment_index+ is predicted as a speaker turn.
     #
+    #     full_get_segment_speaker_turn_next(3) # => true
     #
     def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
     # Text of a segment indexed by +segment_index+.
     #
+    #     full_get_segment_text(3) # => "ask not what your country can do for you, ..."
     #
     def full_get_segment_text: (Integer) -> String
     # Sets new segment callback, called for every newly generated text segment.
     #
+    #     params.new_segment_callback = ->(context, _, n_new, user_data) {
+    #       # ...
+    #     }
     #
     def new_segment_callback=: (new_segment_callback) -> new_segment_callback
     def new_segment_callback: () -> (new_segment_callback | nil)
     # Sets progress callback, called on each progress update.
     #
+    #     params.progress_callback = ->(context, _, progress, user_data) {
+    #       # ...
+    #     }
     #
     # +progress+ is an Integer between 0 and 100.
     #
     # Sets abort callback, called to check if the process should be aborted.
     #
+    #     params.abort_callback = ->(user_data) {
+    #       # ...
+    #     }
     #
     #
     def abort_callback=: (abort_callback) -> abort_callback
     # Hook called on new segment. Yields each Whisper::Segment.
     #
+    #     whisper.on_new_segment do |segment|
+    #       # ...
+    #     end
     #
     def on_new_segment: { (Segment) -> void } -> void
     # Call block to determine whether abort or not. Return +true+ when you want to abort.
     #
+    #     params.abort_on do
+    #       if some_condition
+    #         true # abort
+    #       else
+    #         false # continue
+    #       end
     #     end
     #
     def abort_on: { (Object user_data) -> boolish } -> void
   end

bindings/ruby/{tests → test}/helper.rb RENAMED Viewed

File without changes

bindings/ruby/{tests → test}/jfk_reader/.gitignore RENAMED Viewed

File without changes

bindings/ruby/{tests → test}/jfk_reader/extconf.rb RENAMED Viewed

File without changes

bindings/ruby/{tests → test}/jfk_reader/jfk_reader.c RENAMED Viewed

File without changes

bindings/ruby/{tests → test}/test_callback.rb RENAMED Viewed

File without changes

bindings/ruby/{tests → test}/test_error.rb RENAMED Viewed

File without changes

bindings/ruby/{tests → test}/test_model.rb RENAMED Viewed

File without changes

bindings/ruby/{tests → test}/test_package.rb RENAMED Viewed

File without changes

bindings/ruby/{tests → test}/test_params.rb RENAMED Viewed

File without changes

bindings/ruby/{tests → test}/test_segment.rb RENAMED Viewed

File without changes

bindings/ruby/{tests → test}/test_vad.rb RENAMED Viewed

File without changes

bindings/ruby/{tests → test}/test_vad_params.rb RENAMED Viewed

File without changes

bindings/ruby/{tests → test}/test_whisper.rb RENAMED Viewed

@@ -20,6 +20,24 @@ class TestWhisper < TestBase
     }
   end
   sub_test_case "After transcription" do
     def test_full_n_segments
       assert_equal 1, whisper.full_n_segments

     }
   end
+  def test_transcribe_non_parallel
+    @whisper = Whisper::Context.new("base.en")
+    params  = Whisper::Params.new
+    @whisper.transcribe(AUDIO, params, n_processors: 1) {|text|
+      assert_match(/ask not what your country can do for you, ask what you can do for your country/, text)
+    }
+  end
+  def test_transcribe_n_processors
+    @whisper = Whisper::Context.new("base.en")
+    params  = Whisper::Params.new
+    @whisper.transcribe(AUDIO, params, n_processors: 4) {|text|
+      assert_match(/ask not what your country can do for you[,.] ask what you can do for your country/i, text)
+    }
+  end
   sub_test_case "After transcription" do
     def test_full_n_segments
       assert_equal 1, whisper.full_n_segments

bindings/ruby/whispercpp.gemspec CHANGED Viewed

@@ -4,7 +4,7 @@ Gem::Specification.new do |s|
   s.name    = "whispercpp"
   s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
   s.version = '1.3.3'
-  s.date    = '2025-06-01'
   s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
   s.email   = '[email protected]'
   s.extra_rdoc_files = ['LICENSE', 'README.md']
@@ -21,7 +21,7 @@ Gem::Specification.new do |s|
               }
   s.summary = %q{Ruby whisper.cpp bindings}
-  s.test_files = s.files.select {|file| file.start_with? "tests/"}
   s.extensions << 'ext/extconf.rb'
   s.required_ruby_version = '>= 3.1.0'

   s.name    = "whispercpp"
   s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
   s.version = '1.3.3'
+  s.date    = '2025-06-03'
   s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
   s.email   = '[email protected]'
   s.extra_rdoc_files = ['LICENSE', 'README.md']
               }
   s.summary = %q{Ruby whisper.cpp bindings}
+  s.test_files = s.files.select {|file| file.start_with? "test/"}
   s.extensions << 'ext/extconf.rb'
   s.required_ruby_version = '>= 3.1.0'