vad : revisit timestamp alignment/mapping (#3173)
* vad : revisit timestamp alignment/mapping
This commit improves the timestamp alignment by introducing a mapping
table, adding intermediate reference points for longer segments, and
using binary search for lookups.
The motivation for these changes is to address issues with the current
solution, where zero-length segments are possible, and also to improve
the precision of the VAD timestamps. (A standalone sketch of this
mapping scheme follows these commit notes.)
Refs: https://github.com/ggml-org/whisper.cpp/issues/3162
* vad : use uint64_t for time mapping
This commit changes the type of the `processed_time` and `original_time`
fields in the `vad_time_mapping` struct from `double` to `uint64_t`.
The motivation for this change is to improve precision, avoid
floating-point inaccuracies, and be consistent with other parts of the
code base that use `uint64_t` for time representation.
This is part of a refactoring where I'm also going to change the
`vad_segment_info` struct to use `uint64_t` for the start and end times.
This is the reason for the somewhat unpleasant conversions and casts in
the code at the moment.
* vad : change vad_segment_info and whisper_vad_segment to use uint64_t
* vad : use int64_t instead of uint64_t for timestamps
To be consistent with other timestamps in the codebase.
* vad : add centisecond conversion functions
* vad : extract vad processing from whisper_full_with_state
This commit extracts the VAD processing from the
`whisper_full_with_state` function into the `whisper_full` and
`whisper_full_parallel` functions.
The motivation for this is that I did not take into account that when
`whisper_full_parallel` is called with `n_processors > 1`, the VAD
processing would not be applied correctly. Instead, the VAD processing
should be done before the parallel processing in the case of
`whisper_full_parallel`.
* vad : remove filtered_n_samples from whisper_vad
This commit removes the parameter `filtered_n_samples` from the
`whisper_vad` function signature and its usages, as it is no longer
needed now that the filtered samples are stored in a vector (previously
a `float *`).
The motivation for this is to simplify the usage of this function.
* vad : remove vad_mapping_table_initialized flag
* vad : fix leaning (none) of pointer/references
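
For readers skimming the diff, here is a small, self-contained sketch of the mapping-table idea from the first note above. This is an illustration, not part of the patch: the struct and the lookup/interpolation logic mirror the `vad_time_mapping` / `map_processed_to_original_time` code in the diff below, while `main()`, the helper name, and the hard-coded mapping values are made up. Times are in centiseconds (1/100 s), the unit whisper.cpp uses for segment timestamps.

```cpp
// Standalone sketch of the VAD time-mapping lookup (illustrative only).
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct vad_time_mapping {
    int64_t processed_time; // time in the VAD-filtered (processed) audio
    int64_t original_time;  // corresponding time in the original audio
};

static int64_t map_processed_to_original(int64_t processed_time, const std::vector<vad_time_mapping> & table) {
    if (table.empty())                                  return processed_time;
    if (processed_time <= table.front().processed_time) return table.front().original_time;
    if (processed_time >= table.back().processed_time)  return table.back().original_time;

    // binary search: first entry with processed_time >= the query
    auto upper = std::lower_bound(table.begin(), table.end(), processed_time,
        [](const vad_time_mapping & e, int64_t t) { return e.processed_time < t; });
    if (upper->processed_time == processed_time) {
        return upper->original_time;
    }

    // linear interpolation between the two neighbouring mapping points
    auto lower = upper - 1;
    const int64_t proc_diff = upper->processed_time - lower->processed_time;
    const int64_t orig_diff = upper->original_time  - lower->original_time;
    return lower->original_time + (processed_time - lower->processed_time) * orig_diff / proc_diff;
}

int main() {
    // example table (already sorted and de-duplicated, as the patch does):
    //   speech segment A : processed 0.00-2.00 s -> original  5.00- 7.00 s
    //   inserted silence : processed 2.00-2.10 s -> original  7.00-10.00 s
    //   speech segment B : processed 2.10-3.60 s -> original 10.00-11.50 s
    const std::vector<vad_time_mapping> table = {
        {0, 500}, {200, 700}, {210, 1000}, {360, 1150},
    };

    for (int64_t t : {0, 100, 205, 285, 400}) {
        printf("processed %5.2f s -> original %5.2f s\n",
               t / 100.0, map_processed_to_original(t, table) / 100.0);
    }
    return 0;
}
```

The real implementation additionally sorts the table and removes duplicate `processed_time` entries before lookups, since monotonicity is required for the binary search; see `whisper_vad()` in the diff.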
src/whisper.cpp (+195, -144)

@@ -859,6 +859,11 @@ struct whisper_aheads_masks {
     ggml_backend_buffer_t buffer = nullptr;
 };

+struct vad_time_mapping {
+    int64_t processed_time; // Time in processed (VAD) audio
+    int64_t original_time;  // Corresponding time in original audio
+};
+
 struct whisper_state {
     int64_t t_sample_us = 0;
     int64_t t_encode_us = 0;
@@ -948,13 +953,15 @@ struct whisper_state {
     whisper_vad_context * vad_context = nullptr;

     struct vad_segment_info {
-
-
-
-
+        int64_t orig_start;
+        int64_t orig_end;
+        int64_t vad_start;
+        int64_t vad_end;
     };
     std::vector<vad_segment_info> vad_segments;
     bool has_vad_segments = false;
+
+    std::vector<vad_time_mapping> vad_mapping_table;
 };

 struct whisper_context {
@@ -4407,8 +4414,8 @@ struct whisper_vad_model {
 };

 struct whisper_vad_segment {
-
-
+    int64_t start;
+    int64_t end;
 };

 struct whisper_vad_segments {
@@ -4456,6 +4463,15 @@ struct whisper_vad_params whisper_vad_default_params(void) {
     return result;
 }

+// Time conversion utility functions for whisper VAD
+static int cs_to_samples(int64_t cs) {
+    return (int)((cs / 100.0) * WHISPER_SAMPLE_RATE + 0.5);
+}
+
+static int64_t samples_to_cs(int samples) {
+    return (int64_t)((samples / (double)WHISPER_SAMPLE_RATE) * 100.0 + 0.5);
+}
+
 static bool weight_buft_supported(const whisper_vad_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
     bool op_supported = true;

@@ -5400,12 +5416,12 @@ struct whisper_vad_segments * whisper_vad_segments_from_probs(
                 (speeches[i].end + speech_pad_samples) : audio_length_samples;
         }

-        // Convert from samples to
-        segments[i].start = (
-        segments[i].end = (
+        // Convert from samples to centiseconds
+        segments[i].start = samples_to_cs(speeches[i].start);
+        segments[i].end = samples_to_cs(speeches[i].end);

         WHISPER_LOG_INFO("%s: VAD segment %d: start = %.2f, end = %.2f (duration: %.2f)\n",
-                __func__, i, segments[i].start, segments[i].end, segments[i].end - segments[i].start);
+                __func__, i, segments[i].start/100.0, segments[i].end/100.0, (segments[i].end - segments[i].start)/100.0);
     }

     whisper_vad_segments * vad_segments = new whisper_vad_segments;
@@ -6602,10 +6618,13 @@ static bool whisper_vad(
         struct whisper_full_params params,
         const float * samples,
         int n_samples,
-        std::vector<float> & filtered_samples
-
-
-
+        std::vector<float> & filtered_samples) {
+    WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
+    int filtered_n_samples = 0;
+
+    // Clear any existing mapping table
+    state->vad_mapping_table.clear();
+    state->has_vad_segments = false;

     if (state->vad_context == nullptr) {
         struct whisper_vad_context_params vad_ctx_params = whisper_vad_default_context_params();
@@ -6627,13 +6646,17 @@ static bool whisper_vad(
     ctx->state->vad_segments.clear();
     ctx->state->vad_segments.reserve(vad_segments->data.size());

+    // Initialize the time mapping table
+    state->vad_mapping_table.clear();
+    state->vad_mapping_table.reserve(vad_segments->data.size() * 4);
+
     WHISPER_LOG_INFO("%s: detected %d speech segments\n", __func__, (int)vad_segments->data.size());
     float overlap_seconds = vad_params.samples_overlap;
     int overlap_samples = overlap_seconds * WHISPER_SAMPLE_RATE;

     for (int i = 0; i < (int)vad_segments->data.size(); i++) {
-        int segment_start_samples = vad_segments->data[i].start
-        int segment_end_samples = vad_segments->data[i].end
+        int segment_start_samples = cs_to_samples(vad_segments->data[i].start);
+        int segment_end_samples = cs_to_samples(vad_segments->data[i].end);

         if (i < (int)vad_segments->data.size() - 1) {
             segment_end_samples += overlap_samples;
@@ -6642,9 +6665,9 @@ static bool whisper_vad(
         filtered_n_samples += (segment_end_samples - segment_start_samples);

         WHISPER_LOG_INFO("%s: Including segment %d: %.2f - %.2f (duration: %.2f)\n",
-                __func__, i, vad_segments->data[i].start,
-                vad_segments->data[i].end + (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0),
-                (vad_segments->data[i].end - vad_segments->data[i].start) +
+                __func__, i, vad_segments->data[i].start/100.0,
+                (vad_segments->data[i].end/100.0 + (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0)),
+                (vad_segments->data[i].end - vad_segments->data[i].start)/100.0 +
                 (i < (int)vad_segments->data.size() - 1 ? overlap_seconds : 0));
     }

@@ -6666,8 +6689,8 @@ static bool whisper_vad(

     int offset = 0;
     for (int i = 0; i < (int)vad_segments->data.size(); i++) {
-        int segment_start_samples = vad_segments->data[i].start
-        int segment_end_samples = vad_segments->data[i].end
+        int segment_start_samples = cs_to_samples(vad_segments->data[i].start);
+        int segment_end_samples = cs_to_samples(vad_segments->data[i].end);

         if (i < (int)vad_segments->data.size() - 1) {
             segment_end_samples += overlap_samples;
@@ -6676,18 +6699,47 @@ static bool whisper_vad(
         segment_start_samples = std::min(segment_start_samples, n_samples - 1);
         segment_end_samples = std::min(segment_end_samples, n_samples);
         int segment_length = segment_end_samples - segment_start_samples;
-
         if (segment_length > 0) {
             whisper_state::vad_segment_info segment;

             segment.orig_start = vad_segments->data[i].start;
             segment.orig_end = vad_segments->data[i].end;

-            segment.vad_start = offset
-            segment.vad_end = (offset + segment_length)
+            segment.vad_start = samples_to_cs(offset);
+            segment.vad_end = samples_to_cs(offset + segment_length);
+
+            // Add segment boundaries to mapping table
+            vad_time_mapping start_mapping = {segment.vad_start, segment.orig_start};
+            vad_time_mapping end_mapping = {segment.vad_end, segment.orig_end};
+
+            state->vad_mapping_table.push_back(start_mapping);
+            state->vad_mapping_table.push_back(end_mapping);
+
+            // Add intermediate points for longer segments to improve interpolation accuracy
+            const int64_t min_segment_length = 100; // 1 second
+            const int64_t point_interval = 20; // Add a point every 200ms
+
+            if (segment.vad_end - segment.vad_start > min_segment_length) {
+                int64_t segment_duration = segment.vad_end - segment.vad_start;
+                int num_points = (int)(segment_duration / point_interval) - 1;
+
+                for (int j = 1; j <= num_points; j++) {
+                    int64_t vad_time = segment.vad_start + j * point_interval;
+
+                    if (vad_time >= segment.vad_end) continue;
+
+                    int64_t vad_elapsed = vad_time - segment.vad_start;
+                    int64_t vad_total = segment.vad_end - segment.vad_start;
+                    int64_t orig_total = segment.orig_end - segment.orig_start;
+                    int64_t orig_time = segment.orig_start + (vad_elapsed * orig_total) / vad_total;
+
+                    vad_time_mapping intermediate_mapping = {vad_time, orig_time};
+                    state->vad_mapping_table.push_back(intermediate_mapping);
+                }
+            }

             WHISPER_LOG_INFO("%s: vad_segment_info: orig_start: %.2f, orig_end: %.2f, vad_start: %.2f, vad_end: %.2f\n",
-                    __func__, segment.orig_start, segment.orig_end, segment.vad_start, segment.vad_end);
+                    __func__, segment.orig_start/100.0, segment.orig_end/100.0, segment.vad_start/100.0, segment.vad_end/100.0);
             ctx->state->vad_segments.push_back(segment);

             // Copy this speech segment
@@ -6696,6 +6748,17 @@ static bool whisper_vad(

             // Add silence after this segment (except after the last segment)
             if (i < (int)vad_segments->data.size() - 1) {
+                // Calculate the start and end time of the silence gap in processed audio
+                int64_t silence_start_vad = samples_to_cs(offset);
+                int64_t silence_end_vad = samples_to_cs(offset + silence_samples);
+                // Calculate the corresponding original times
+                int64_t orig_silence_start = segment.orig_end;
+                int64_t orig_silence_end = vad_segments->data[i+1].start;
+
+                // Add mapping points for silence boundaries
+                state->vad_mapping_table.push_back({silence_start_vad, orig_silence_start});
+                state->vad_mapping_table.push_back({silence_end_vad, orig_silence_end});
+
                 // Fill with zeros (silence)
                 memset(filtered_samples.data() + offset, 0, silence_samples * sizeof(float));
                 offset += silence_samples;
@@ -6703,6 +6766,24 @@ static bool whisper_vad(
         }
     }

+    // Sort the mapping table by processed time
+    std::sort(state->vad_mapping_table.begin(), state->vad_mapping_table.end(),
+        [](const vad_time_mapping& a, const vad_time_mapping& b) {
+            return a.processed_time < b.processed_time;
+        });
+
+    // Remove any duplicate processed times to ensure monotonicity which is
+    // needed for binary search and interpolation later.
+    if (!state->vad_mapping_table.empty()) {
+        auto last = std::unique(state->vad_mapping_table.begin(), state->vad_mapping_table.end(),
+            [](const vad_time_mapping& a, const vad_time_mapping& b) {
+                return a.processed_time == b.processed_time;
+            });
+        state->vad_mapping_table.erase(last, state->vad_mapping_table.end());
+    }
+
+    WHISPER_LOG_INFO("%s: Created time mapping table with %d points\n", __func__, (int)state->vad_mapping_table.size());
+
     filtered_n_samples = offset;
     WHISPER_LOG_INFO("%s: Reduced audio from %d to %d samples (%.1f%% reduction)\n",
             __func__, n_samples, filtered_n_samples, 100.0f * (1.0f - (float)filtered_n_samples / n_samples));
@@ -6722,27 +6803,9 @@ int whisper_full_with_state(

     result_all.clear();

-
-    int n_process_samples = n_samples;
-    std::vector<float> vad_samples;
-
-    if (params.vad) {
-        WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
-        int vad_n_samples;
-        if (!whisper_vad(ctx, state, params, samples, n_samples, vad_samples, vad_n_samples)) {
-            WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
-            return -1;
-        }
-        if (vad_n_samples == 0) {
-            return 0;
-        }
-        process_samples = vad_samples.data();
-        n_process_samples = vad_n_samples;
-    }
-
-    if (n_process_samples > 0) {
+    if (n_samples > 0) {
         // compute log mel spectrogram
-        if (whisper_pcm_to_mel_with_state(ctx, state,
+        if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
             WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
             return -2;
         }
@@ -7652,6 +7715,20 @@ int whisper_full(
         struct whisper_full_params params,
         const float * samples,
         int n_samples) {
+
+    std::vector<float> vad_samples;
+    if (params.vad) {
+        WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
+        if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
+            WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
+            return -1;
+        }
+        if (vad_samples.empty()) {
+            return 0;
+        }
+        samples = vad_samples.data();
+        n_samples = vad_samples.size();
+    }
     return whisper_full_with_state(ctx, ctx->state, params, samples, n_samples);
 }

@@ -7661,9 +7738,24 @@ int whisper_full_parallel(
         const float * samples,
         int n_samples,
         int n_processors) {
+
     if (n_processors == 1) {
         return whisper_full(ctx, params, samples, n_samples);
     }
+
+    std::vector<float> vad_samples;
+    if (params.vad) {
+        WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
+        if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
+            WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
+            return -1;
+        }
+        if (vad_samples.empty()) {
+            return 0;
+        }
+        samples = vad_samples.data();
+        n_samples = vad_samples.size();
+    }
     int ret = 0;

     // prepare separate states for each thread
@@ -7786,130 +7878,89 @@ int whisper_full_lang_id(struct whisper_context * ctx) {
     return ctx->state->lang_id;
 }

-int64_t
-
-
-        return state->result_all[i_segment].t0;
+static int64_t map_processed_to_original_time(int64_t processed_time, const std::vector<vad_time_mapping> & mapping_table) {
+    if (mapping_table.empty()) {
+        return processed_time;
     }

-
-
-
-    float t0 = state->result_all[i_segment].t0 / 100.0f;
+    if (processed_time <= mapping_table.front().processed_time) {
+        return mapping_table.front().original_time; // Before first mapping point
+    }

-
-
-
-    // the access pattern is sequential and optimized for that too.
-    for (size_t i = 0; i < state->vad_segments.size(); i++) {
-        const auto & segment = state->vad_segments[i];
+    if (processed_time >= mapping_table.back().processed_time) {
+        return mapping_table.back().original_time; // After last mapping point
+    }

-
-
-
-
-
-            }
-            float orig_t0 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
-            return (int64_t)(orig_t0 * 100);
+    // Binary search over the time map that finds the first entry that has a
+    // processed time greater than or equal to the current processed time.
+    auto upper = std::lower_bound(mapping_table.begin(), mapping_table.end(), processed_time,
+        [](const vad_time_mapping & entry, int64_t time) {
+            return entry.processed_time < time;
         }
+    );
+
+    // If exact match found
+    if (upper->processed_time == processed_time) {
+        return upper->original_time;
     }

-    //
-
-        const auto & curr = state->vad_segments[i];
-        const auto & next = state->vad_segments[i + 1];
+    // Need to interpolate between two points
+    auto lower = upper - 1;

-
-
-
-            if (next.vad_start > curr.vad_end) {
-                gap_proportion = (t0 - curr.vad_end) / (next.vad_start - curr.vad_end);
-            }
-            float orig_t0 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
-            return (int64_t)(orig_t0 * 100);
-        }
-    }
+    int64_t processed_diff = upper->processed_time - lower->processed_time;
+    int64_t original_diff = upper->original_time - lower->original_time;
+    int64_t offset = processed_time - lower->processed_time;

-
-
-    // For timestamps after the last segment, add the extra time to the end of the last segment
-    const auto& last = state->vad_segments.back();
-    // Calculate how far beyond the last segment
-    float extra_time = t0 - last.vad_end;
-    // Add this extra time to the original end time
-    float orig_t0 = last.orig_end + extra_time;
-    return (int64_t)(orig_t0 * 100);
+    if (processed_diff == 0) {
+        return lower->original_time;
+    }

-
-    return
+    // Perform linear interpolation
+    return lower->original_time + (offset * original_diff) / processed_diff;
 }

-
-
+// Function to get the starting timestamp of a segment
+int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
+    // If VAD wasn't used, return the original timestamp
+    if (!state->has_vad_segments || state->vad_mapping_table.empty()) {
+        return state->result_all[i_segment].t0;
+    }
+
+    // Get the processed timestamp
+    int64_t t0 = state->result_all[i_segment].t0;
+
+    // Map to original time using the mapping table
+    return map_processed_to_original_time(t0, state->vad_mapping_table);
 }

+// Function to get the ending timestamp of a segment
 int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment) {
     // If VAD wasn't used, return the original timestamp
-    if (!state->has_vad_segments || state->
+    if (!state->has_vad_segments || state->vad_mapping_table.empty()) {
         return state->result_all[i_segment].t1;
     }

-    // Get the
-
-    // back to the original audio.
-    float t1 = state->result_all[i_segment].t1 / 100.0f;
-
-    // Find which VAD segment this timestamp belongs.
-    // TODO(danbev) This could be optimized by using a binary search if the number
-    // of segments exceed a certain limit. Also we might be able to assume that
-    // the access pattern is sequential and optimized for that too.
-    for (size_t i = 0; i < state->vad_segments.size(); i++) {
-        const auto& segment = state->vad_segments[i];
-
-        // Check if the timestamp falls within this segment.
-        if (t1 >= segment.vad_start && t1 <= segment.vad_end) {
-            // Calculate the proportion through the filtered segment.
-            float proportion = 0.0f;
-            if (segment.vad_end > segment.vad_start) {
-                proportion = (t1 - segment.vad_start) / (segment.vad_end - segment.vad_start);
-            }
-            float orig_t1 = segment.orig_start + proportion * (segment.orig_end - segment.orig_start);
-            return (int64_t)(orig_t1 * 100);
-        }
-    }
+    // Get the processed timestamp
+    int64_t t1 = state->result_all[i_segment].t1;

-    //
-
-        const auto & curr = state->vad_segments[i];
-        const auto & next = state->vad_segments[i + 1];
+    // Map to original time using the mapping table
+    int64_t orig_t1 = map_processed_to_original_time(t1, state->vad_mapping_table);

-
-
-            float gap_proportion = 0.0f;
-            if (next.vad_start > curr.vad_end) {
-                gap_proportion = (t1 - curr.vad_end) / (next.vad_start - curr.vad_end);
-            }
-            // Map to the corresponding position in the original gap
-            float orig_t1 = curr.orig_end + gap_proportion * (next.orig_start - curr.orig_end);
-            return (int64_t)(orig_t1 * 100);
-        }
-    }
+    // Get the corresponding t0 for this segment
+    int64_t orig_t0 = whisper_full_get_segment_t0_from_state(state, i_segment);

-    //
-
-
-
-    // Calculate how far beyond the last segment
-    float extra_time = t1 - last.vad_end;
-    // Add this extra time to the original end time
-    float orig_t1 = last.orig_end + extra_time;
-    return (int64_t)(orig_t1 * 100);
+    // Ensure minimum duration to prevent zero-length segments
+    const int64_t min_duration = 10; // 10ms minimum
+    if (orig_t1 - orig_t0 < min_duration) {
+        orig_t1 = orig_t0 + min_duration;
     }

-
-
+    return orig_t1;
+}
+
+
+int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
+    return whisper_full_get_segment_t0_from_state(ctx->state, i_segment);
 }

 int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment) {