danbev committed on
Commit f198f02 · unverified · 1 Parent(s): 7953154

vad : revisit timestamp alignment/mapping (#3173)


* vad : revisit timestamp alignment/mapping

This commit improves the timestamp alignment by introducing a mapping
table, adding intermediate reference points for longer segments, and
using binary search for lookups.

The motivation for these changes is to address issues with the current
solution, where zero-length segments are possible, and also to improve
the precision of the VAD timestamps.

Refs: https://github.com/ggml-org/whisper.cpp/issues/3162
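
As a stand-alone illustration of the approach, here is a minimal sketch of the lookup (simplified from the change in the diff below; the names `map_point` and `map_time` are made up for illustration, and timestamps are in centiseconds). Each VAD segment contributes boundary entries, plus intermediate entries for longer segments, to a table sorted by processed time; a query is resolved with binary search and linear interpolation between the two surrounding entries.

#include <algorithm>
#include <cstdint>
#include <vector>

struct map_point {
    int64_t processed; // timestamp in the VAD-filtered audio
    int64_t original;  // corresponding timestamp in the original audio
};

// Map a timestamp from the filtered audio back to the original audio.
static int64_t map_time(int64_t t, const std::vector<map_point> & table) {
    if (table.empty())                return t;
    if (t <= table.front().processed) return table.front().original;
    if (t >= table.back().processed)  return table.back().original;

    // First entry with processed >= t (the table is sorted by processed time).
    auto hi = std::lower_bound(table.begin(), table.end(), t,
        [](const map_point & p, int64_t v) { return p.processed < v; });
    if (hi->processed == t) {
        return hi->original;
    }

    // Linear interpolation between the two surrounding entries.
    auto lo = hi - 1;
    const int64_t d_proc = hi->processed - lo->processed;
    const int64_t d_orig = hi->original  - lo->original;
    return lo->original + (t - lo->processed) * d_orig / d_proc;
}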

* vad : use uint64_t for time mapping

This commit changes the type of the `processed_time` and `original_time`
fields in the `vad_time_mapping` struct from `double` to `uint64_t`.

The motivation for this change is to improve precision, avoid
floating-point inaccuracies, and be consistent with other parts of
the code base that use `uint64_t` for time representation.

This is part of a refactoring where I'm also going to change the
`vad_segment_info` struct to use `uint64_t` for the start and end times.
This is the reason for the not-so-pleasant conversions and casts in the
code at the moment.

* vad : change vad_segment_info and whisper_vad_segment to use uint64_t

* vad : use int64_t instead of uint64_t for timestamps

To be consistent with other timestamps in the codebase.

* vad : add centisecond conversion functions
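
For reference, these are the helpers added in the diff below: they convert between sample counts and centiseconds, rounding to the nearest unit (`WHISPER_SAMPLE_RATE` is 16000, so one centisecond corresponds to 160 samples):

static int cs_to_samples(int64_t cs) {
    return (int)((cs / 100.0) * WHISPER_SAMPLE_RATE + 0.5);
}

static int64_t samples_to_cs(int samples) {
    return (int64_t)((samples / (double)WHISPER_SAMPLE_RATE) * 100.0 + 0.5);
}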

* vad : extract vad processing from whisper_full_with_state

This commit extracts the VAD processing from the
`whisper_full_with_state` function into the `whisper_full` and
`whisper_full_parallel` functions.

The motivation for this is that I did not take into account that when
`whisper_full_parallel` is called with `n_processors > 1`, the VAD
processing would not be applied correctly. Instead, the VAD processing
should be done prior to the parallel processing in the case of
`whisper_full_parallel`.
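
A minimal caller-side sketch of the effect (assuming the `vad` and `vad_model_path` fields of `whisper_full_params` from the VAD feature, an initialized `ctx`, and 16 kHz mono samples in `pcm`; includes omitted). Both entry points now run VAD once up front, so the returned timestamps refer to the original audio whether or not `n_processors > 1`:

whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
params.vad            = true;
params.vad_model_path = "models/silero-vad.bin"; // hypothetical model path

// whisper_full_parallel(ctx, params, pcm.data(), (int)pcm.size(), 4) follows the
// same path: VAD runs once, then the filtered samples are split across processors.
if (whisper_full(ctx, params, pcm.data(), (int)pcm.size()) == 0) {
    for (int i = 0; i < whisper_full_n_segments(ctx); ++i) {
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i); // centiseconds in the original audio
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
        printf("[%6.2f - %6.2f] %s\n", t0/100.0, t1/100.0, whisper_full_get_segment_text(ctx, i));
    }
}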

* vad : remove filtered_n_samples from whisper_vad

This commit removes the parameter `filtered_n_samples` from the
`whisper_vad` function signature and its usages, as it is no longer
needed: the filtered samples are now passed as a vector (previously a
`float *`), so the count is available from the vector itself.

The motivation for this is to simplify the usage of this function.
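
The resulting call pattern (as used by `whisper_full` in the diff below); the sample count is simply the size of the returned vector:

std::vector<float> vad_samples;
if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
    return -1; // VAD failed
}
samples   = vad_samples.data();
n_samples = vad_samples.size(); // previously returned via the filtered_n_samples out-parameter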

* vad : remove vad_mapping_table_initialized flag

* vad : fix leaning (none) of pointer/references

Files changed (1)
  1. src/whisper.cpp +195 -144
src/whisper.cpp CHANGED
@@ -859,6 +859,11 @@ struct whisper_aheads_masks {
     ggml_backend_buffer_t buffer = nullptr;
 };
 
+struct vad_time_mapping {
+    int64_t processed_time; // Time in processed (VAD) audio
+    int64_t original_time;  // Corresponding time in original audio
+};
+
 struct whisper_state {
     int64_t t_sample_us = 0;
     int64_t t_encode_us = 0;
@@ -948,13 +953,15 @@ struct whisper_state {
     whisper_vad_context * vad_context = nullptr;
 
     struct vad_segment_info {
-        float orig_start;
-        float orig_end;
-        float vad_start;
-        float vad_end;
+        int64_t orig_start;
+        int64_t orig_end;
+        int64_t vad_start;
+        int64_t vad_end;
     };
     std::vector<vad_segment_info> vad_segments;
     bool has_vad_segments = false;
+
+    std::vector<vad_time_mapping> vad_mapping_table;
 };
 
 struct whisper_context {
@@ -4407,8 +4414,8 @@ struct whisper_vad_model {
 };
 
 struct whisper_vad_segment {
-    float start; // Start time in seconds
-    float end;   // End time in seconds
+    int64_t start;
+    int64_t end;
 };
 
 struct whisper_vad_segments {
@@ -4456,6 +4463,15 @@ struct whisper_vad_params whisper_vad_default_params(void) {
     return result;
 }
 
+// Time conversion utility functions for whisper VAD
+static int cs_to_samples(int64_t cs) {
+    return (int)((cs / 100.0) * WHISPER_SAMPLE_RATE + 0.5);
+}
+
+static int64_t samples_to_cs(int samples) {
+    return (int64_t)((samples / (double)WHISPER_SAMPLE_RATE) * 100.0 + 0.5);
+}
+
 static bool weight_buft_supported(const whisper_vad_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
     bool op_supported = true;
 
@@ -5400,12 +5416,12 @@ struct whisper_vad_segments * whisper_vad_segments_from_probs(
                 (speeches[i].end + speech_pad_samples) : audio_length_samples;
         }
 
-        // Convert from samples to seconds and copy to final segments
-        segments[i].start = (float)speeches[i].start / sample_rate;
-        segments[i].end   = (float)speeches[i].end / sample_rate;
+        // Convert from samples to centiseconds
+        segments[i].start = samples_to_cs(speeches[i].start);
+        segments[i].end   = samples_to_cs(speeches[i].end);
 
         WHISPER_LOG_INFO("%s: VAD segment %d: start = %.2f, end = %.2f (duration: %.2f)\n",
-                __func__, i, segments[i].start, segments[i].end, segments[i].end - segments[i].start);
+                __func__, i, segments[i].start/100.0, segments[i].end/100.0, (segments[i].end - segments[i].start)/100.0);
     }
 
     whisper_vad_segments * vad_segments = new whisper_vad_segments;
@@ -6602,10 +6618,13 @@ static bool whisper_vad(
         struct whisper_full_params params,
         const float * samples,
         int n_samples,
-        std::vector<float> & filtered_samples,
-        int & filtered_n_samples) {
-    WHISPER_LOG_INFO("%s: VAD is enabled, processing speach segments only\n", __func__);
-    filtered_n_samples = 0;
+        std::vector<float> & filtered_samples) {
+    WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
+    int filtered_n_samples = 0;
+
+    // Clear any existing mapping table
+    state->vad_mapping_table.clear();
+    state->has_vad_segments = false;
 
     if (state->vad_context == nullptr) {
         struct whisper_vad_context_params vad_ctx_params = whisper_vad_default_context_params();
@@ -6627,13 +6646,17 @@ static bool whisper_vad(
     ctx->state->vad_segments.clear();
     ctx->state->vad_segments.reserve(vad_segments->data.size());
 
+    // Initialize the time mapping table
+    state->vad_mapping_table.clear();
+    state->vad_mapping_table.reserve(vad_segments->data.size() * 4);
+
     WHISPER_LOG_INFO("%s: detected %d speech segments\n", __func__, (int)vad_segments->data.size());
     float overlap_seconds = vad_params.samples_overlap;
     int overlap_samples = overlap_seconds * WHISPER_SAMPLE_RATE;
 
     for (int i = 0; i < (int)vad_segments->data.size(); i++) {
-        int segment_start_samples = vad_segments->data[i].start * WHISPER_SAMPLE_RATE;
-        int segment_end_samples = vad_segments->data[i].end * WHISPER_SAMPLE_RATE;
+        int segment_start_samples = cs_to_samples(vad_segments->data[i].start);
+        int segment_end_samples = cs_to_samples(vad_segments->data[i].end);
 
         if (i < (int)vad_segments->data.size() - 1) {
             segment_end_samples += overlap_samples;
@@ -6642,9 +6665,9 @@ static bool whisper_vad(
         filtered_n_samples += (segment_end_samples - segment_start_samples);
 
         WHISPER_LOG_INFO("%s: Including segment %d: %.2f - %.2f (duration: %.2f)\n",
-                __func__, i, vad_segments->data[i].start,
-                vad_segments->data[i].end + (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0),
-                (vad_segments->data[i].end - vad_segments->data[i].start) +
+                __func__, i, vad_segments->data[i].start/100.0,
+                (vad_segments->data[i].end/100.0 + (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0)),
+                (vad_segments->data[i].end - vad_segments->data[i].start)/100.0 +
                 (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0));
     }
 
@@ -6666,8 +6689,8 @@ static bool whisper_vad(
 
     int offset = 0;
     for (int i = 0; i < (int)vad_segments->data.size(); i++) {
-        int segment_start_samples = vad_segments->data[i].start * WHISPER_SAMPLE_RATE;
-        int segment_end_samples = vad_segments->data[i].end * WHISPER_SAMPLE_RATE;
+        int segment_start_samples = cs_to_samples(vad_segments->data[i].start);
+        int segment_end_samples = cs_to_samples(vad_segments->data[i].end);
 
         if (i < (int)vad_segments->data.size() - 1) {
             segment_end_samples += overlap_samples;
@@ -6676,18 +6699,47 @@ static bool whisper_vad(
         segment_start_samples = std::min(segment_start_samples, n_samples - 1);
         segment_end_samples = std::min(segment_end_samples, n_samples);
         int segment_length = segment_end_samples - segment_start_samples;
-
         if (segment_length > 0) {
             whisper_state::vad_segment_info segment;
 
             segment.orig_start = vad_segments->data[i].start;
             segment.orig_end = vad_segments->data[i].end;
 
-            segment.vad_start = offset / (float)WHISPER_SAMPLE_RATE;
-            segment.vad_end = (offset + segment_length) / (float)WHISPER_SAMPLE_RATE;
+            segment.vad_start = samples_to_cs(offset);
+            segment.vad_end = samples_to_cs(offset + segment_length);
+
+            // Add segment boundaries to mapping table
+            vad_time_mapping start_mapping = {segment.vad_start, segment.orig_start};
+            vad_time_mapping end_mapping = {segment.vad_end, segment.orig_end};
+
+            state->vad_mapping_table.push_back(start_mapping);
+            state->vad_mapping_table.push_back(end_mapping);
+
+            // Add intermediate points for longer segments to improve interpolation accuracy
+            const int64_t min_segment_length = 100; // 1 second
+            const int64_t point_interval = 20; // Add a point every 200ms
+
+            if (segment.vad_end - segment.vad_start > min_segment_length) {
+                int64_t segment_duration = segment.vad_end - segment.vad_start;
+                int num_points = (int)(segment_duration / point_interval) - 1;
+
+                for (int j = 1; j <= num_points; j++) {
+                    int64_t vad_time = segment.vad_start + j * point_interval;
+
+                    if (vad_time >= segment.vad_end) continue;
+
+                    int64_t vad_elapsed = vad_time - segment.vad_start;
+                    int64_t vad_total = segment.vad_end - segment.vad_start;
+                    int64_t orig_total = segment.orig_end - segment.orig_start;
+                    int64_t orig_time = segment.orig_start + (vad_elapsed * orig_total) / vad_total;
+
+                    vad_time_mapping intermediate_mapping = {vad_time, orig_time};
+                    state->vad_mapping_table.push_back(intermediate_mapping);
+                }
+            }
 
             WHISPER_LOG_INFO("%s: vad_segment_info: orig_start: %.2f, orig_end: %.2f, vad_start: %.2f, vad_end: %.2f\n",
-                    __func__, segment.orig_start, segment.orig_end, segment.vad_start, segment.vad_end);
+                    __func__, segment.orig_start/100.0, segment.orig_end/100.0, segment.vad_start/100.0, segment.vad_end/100.0);
             ctx->state->vad_segments.push_back(segment);
 
             // Copy this speech segment
@@ -6696,6 +6748,17 @@ static bool whisper_vad(
 
             // Add silence after this segment (except after the last segment)
             if (i < (int)vad_segments->data.size() - 1) {
+                // Calculate the start and end time of the silence gap in processed audio
+                int64_t silence_start_vad = samples_to_cs(offset);
+                int64_t silence_end_vad = samples_to_cs(offset + silence_samples);
+                // Calculate the corresponding original times
+                int64_t orig_silence_start = segment.orig_end;
+                int64_t orig_silence_end = vad_segments->data[i+1].start;
+
+                // Add mapping points for silence boundaries
+                state->vad_mapping_table.push_back({silence_start_vad, orig_silence_start});
+                state->vad_mapping_table.push_back({silence_end_vad, orig_silence_end});
+
                 // Fill with zeros (silence)
                 memset(filtered_samples.data() + offset, 0, silence_samples * sizeof(float));
                 offset += silence_samples;
@@ -6703,6 +6766,24 @@ static bool whisper_vad(
         }
     }
 
+    // Sort the mapping table by processed time
+    std::sort(state->vad_mapping_table.begin(), state->vad_mapping_table.end(),
+        [](const vad_time_mapping & a, const vad_time_mapping & b) {
+            return a.processed_time < b.processed_time;
+        });
+
+    // Remove any duplicate processed times to ensure monotonicity which is
+    // needed for binary search and interpolation later.
+    if (!state->vad_mapping_table.empty()) {
+        auto last = std::unique(state->vad_mapping_table.begin(), state->vad_mapping_table.end(),
+            [](const vad_time_mapping & a, const vad_time_mapping & b) {
+                return a.processed_time == b.processed_time;
+            });
+        state->vad_mapping_table.erase(last, state->vad_mapping_table.end());
+    }
+
+    WHISPER_LOG_INFO("%s: Created time mapping table with %d points\n", __func__, (int)state->vad_mapping_table.size());
+
     filtered_n_samples = offset;
     WHISPER_LOG_INFO("%s: Reduced audio from %d to %d samples (%.1f%% reduction)\n",
             __func__, n_samples, filtered_n_samples, 100.0f * (1.0f - (float)filtered_n_samples / n_samples));
@@ -6722,27 +6803,9 @@ int whisper_full_with_state(
 
     result_all.clear();
 
-    const float * process_samples = samples;
-    int n_process_samples = n_samples;
-    std::vector<float> vad_samples;
-
-    if (params.vad) {
-        WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
-        int vad_n_samples;
-        if (!whisper_vad(ctx, state, params, samples, n_samples, vad_samples, vad_n_samples)) {
-            WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
-            return -1;
-        }
-        if (vad_n_samples == 0) {
-            return 0;
-        }
-        process_samples = vad_samples.data();
-        n_process_samples = vad_n_samples;
-    }
-
-    if (n_process_samples > 0) {
+    if (n_samples > 0) {
         // compute log mel spectrogram
-        if (whisper_pcm_to_mel_with_state(ctx, state, process_samples, n_process_samples, params.n_threads) != 0) {
+        if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
             WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
             return -2;
         }
@@ -7652,6 +7715,20 @@ int whisper_full(
         struct whisper_full_params params,
         const float * samples,
         int n_samples) {
+
+    std::vector<float> vad_samples;
+    if (params.vad) {
+        WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
+        if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
+            WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
+            return -1;
+        }
+        if (vad_samples.empty()) {
+            return 0;
+        }
+        samples = vad_samples.data();
+        n_samples = vad_samples.size();
+    }
     return whisper_full_with_state(ctx, ctx->state, params, samples, n_samples);
 }
 
@@ -7661,9 +7738,24 @@ int whisper_full_parallel(
         const float * samples,
         int n_samples,
         int n_processors) {
+
     if (n_processors == 1) {
         return whisper_full(ctx, params, samples, n_samples);
     }
+
+    std::vector<float> vad_samples;
+    if (params.vad) {
+        WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
+        if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
+            WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
+            return -1;
+        }
+        if (vad_samples.empty()) {
+            return 0;
+        }
+        samples = vad_samples.data();
+        n_samples = vad_samples.size();
+    }
     int ret = 0;
 
     // prepare separate states for each thread
@@ -7786,130 +7878,89 @@ int whisper_full_lang_id(struct whisper_context * ctx) {
     return ctx->state->lang_id;
 }
 
-int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
-    // If VAD wasn't used, return the original timestamp
-    if (!state->has_vad_segments || state->vad_segments.empty()) {
-        return state->result_all[i_segment].t0;
+static int64_t map_processed_to_original_time(int64_t processed_time, const std::vector<vad_time_mapping> & mapping_table) {
+    if (mapping_table.empty()) {
+        return processed_time;
     }
 
-    // Get the start timestamp produced by whisper_full. whisper_full processes
-    // only the speech segments in this case so we need to map these timestamps
-    // back to the original audio.
-    float t0 = state->result_all[i_segment].t0 / 100.0f;
+    if (processed_time <= mapping_table.front().processed_time) {
+        return mapping_table.front().original_time; // Before first mapping point
+    }
 
-    // Find which VAD segment this timestamp belongs.
-    // TODO(danbev) This could be optimized by using a binary search if the number
-    // of segments exceed a certain limit. Also we might be able to assume that
-    // the access pattern is sequential and optimized for that too.
-    for (size_t i = 0; i < state->vad_segments.size(); i++) {
-        const auto & segment = state->vad_segments[i];
+    if (processed_time >= mapping_table.back().processed_time) {
+        return mapping_table.back().original_time; // After last mapping point
+    }
 
-        // Check if the timestamp falls within this segment.
-        if (t0 >= segment.vad_start && t0 <= segment.vad_end) {
-            float proportion = 0.0f;
-            if (segment.vad_end > segment.vad_start) {
-                proportion = (t0 - segment.vad_start) / (segment.vad_end - segment.vad_start);
-            }
-            float orig_t0 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
-            return (int64_t)(orig_t0 * 100);
+    // Binary search over the time map that finds the first entry that has a
+    // processed time greater than or equal to the current processed time.
+    auto upper = std::lower_bound(mapping_table.begin(), mapping_table.end(), processed_time,
+        [](const vad_time_mapping & entry, int64_t time) {
+            return entry.processed_time < time;
         }
+    );
+
+    // If exact match found
+    if (upper->processed_time == processed_time) {
+        return upper->original_time;
     }
 
-    // Check if the timestamp falls between two segments.
-    for (size_t i = 0; i < state->vad_segments.size() - 1; i++) {
-        const auto & curr = state->vad_segments[i];
-        const auto & next = state->vad_segments[i + 1];
+    // Need to interpolate between two points
+    auto lower = upper - 1;
 
-        if (t0 > curr.vad_end && t0 < next.vad_start) {
-            // Calculate how far we are through the gap as a proportion
-            float gap_proportion = 0.0f;
-            if (next.vad_start > curr.vad_end) {
-                gap_proportion = (t0 - curr.vad_end) / (next.vad_start - curr.vad_end);
-            }
-            float orig_t0 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
-            return (int64_t)(orig_t0 * 100);
-        }
-    }
+    int64_t processed_diff = upper->processed_time - lower->processed_time;
+    int64_t original_diff = upper->original_time - lower->original_time;
+    int64_t offset = processed_time - lower->processed_time;
 
-    // Handle the case where the timestamp is after the last segment.
-    if (t0 > state->vad_segments.back().vad_end) {
-        // For timestamps after the last segment, add the extra time to the end of the last segment
-        const auto& last = state->vad_segments.back();
-        // Calculate how far beyond the last segment
-        float extra_time = t0 - last.vad_end;
-        // Add this extra time to the original end time
-        float orig_t0 = last.orig_end + extra_time;
-        return (int64_t)(orig_t0 * 100);
+    if (processed_diff == 0) {
+        return lower->original_time;
     }
 
-    WHISPER_LOG_WARN("%s: Could not map t0 = %f to a VAD segment\n", __func__, t0);
-    return t0;
+    // Perform linear interpolation
+    return lower->original_time + (offset * original_diff) / processed_diff;
 }
 
-int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
-    return whisper_full_get_segment_t0_from_state(ctx->state, i_segment);
+// Function to get the starting timestamp of a segment
+int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
+    // If VAD wasn't used, return the original timestamp
+    if (!state->has_vad_segments || state->vad_mapping_table.empty()) {
+        return state->result_all[i_segment].t0;
+    }
+
+    // Get the processed timestamp
+    int64_t t0 = state->result_all[i_segment].t0;
+
+    // Map to original time using the mapping table
+    return map_processed_to_original_time(t0, state->vad_mapping_table);
 }
 
+// Function to get the ending timestamp of a segment
 int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment) {
     // If VAD wasn't used, return the original timestamp
-    if (!state->has_vad_segments || state->vad_segments.empty()) {
+    if (!state->has_vad_segments || state->vad_mapping_table.empty()) {
        return state->result_all[i_segment].t1;
     }
 
-    // Get the end timestamp produced by whisper_full. whisper_full processes
-    // only the speech segments in this case so we need to map these timestamps
-    // back to the original audio.
-    float t1 = state->result_all[i_segment].t1 / 100.0f;
-
-    // Find which VAD segment this timestamp belongs.
-    // TODO(danbev) This could be optimized by using a binary search if the number
-    // of segments exceed a certain limit. Also we might be able to assume that
-    // the access pattern is sequential and optimized for that too.
-    for (size_t i = 0; i < state->vad_segments.size(); i++) {
-        const auto& segment = state->vad_segments[i];
-
-        // Check if the timestamp falls within this segment.
-        if (t1 >= segment.vad_start && t1 <= segment.vad_end) {
-            // Calculate the proportion through the filtered segment.
-            float proportion = 0.0f;
-            if (segment.vad_end > segment.vad_start) {
-                proportion = (t1 - segment.vad_start) / (segment.vad_end - segment.vad_start);
-            }
-            float orig_t1 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
-            return (int64_t)(orig_t1 * 100);
-        }
-    }
+    // Get the processed timestamp
+    int64_t t1 = state->result_all[i_segment].t1;
 
-    // Check if the timestamp falls between two segments.
-    for (size_t i = 0; i < state->vad_segments.size() - 1; i++) {
-        const auto & curr = state->vad_segments[i];
-        const auto & next = state->vad_segments[i + 1];
+    // Map to original time using the mapping table
+    int64_t orig_t1 = map_processed_to_original_time(t1, state->vad_mapping_table);
 
-        if (t1 > curr.vad_end && t1 < next.vad_start) {
-            // Calculate how far we are through the gap as a proportion
-            float gap_proportion = 0.0f;
-            if (next.vad_start > curr.vad_end) {
-                gap_proportion = (t1 - curr.vad_end) / (next.vad_start - curr.vad_end);
-            }
-            // Map to the corresponding position in the original gap
-            float orig_t1 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
-            return (int64_t)(orig_t1 * 100);
-        }
-    }
+    // Get the corresponding t0 for this segment
+    int64_t orig_t0 = whisper_full_get_segment_t0_from_state(state, i_segment);
 
-    // Handle the case where the timestamp is after the last segment
-    if (t1 > state->vad_segments.back().vad_end) {
-        // For the last segment, use the end of the last VAD segment
-        const auto& last = state->vad_segments.back();
-        // Calculate how far beyond the last segment
-        float extra_time = t1 - last.vad_end;
-        // Add this extra time to the original end time
-        float orig_t1 = last.orig_end + extra_time;
-        return (int64_t)(orig_t1 * 100);
+    // Ensure minimum duration to prevent zero-length segments
+    const int64_t min_duration = 10; // 10ms minimum
+    if (orig_t1 - orig_t0 < min_duration) {
+        orig_t1 = orig_t0 + min_duration;
     }
 
-    WHISPER_LOG_WARN("%s: Could not map t1 = %f to a VAD segment\n", __func__, t1);
-    return t1;
+    return orig_t1;
+}
+
+
+int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
+    return whisper_full_get_segment_t0_from_state(ctx->state, i_segment);
 }
 
 int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment) {