ggerganov commited on
Commit
b0c2b16
·
unverified ·
1 Parent(s): 2deaecf

whisper : by default disable non-speech tokens suppression (#473)

Browse files

This seems to be causing hallucinations in the end of the audio, e.g.:

"Thank you for listening"
"Amen"
..

Files changed (1) hide show
  1. whisper.cpp +10 -16
whisper.cpp CHANGED
@@ -2936,7 +2936,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
2936
  /*.language =*/ "en",
2937
 
2938
  /*.suppress_blank =*/ true,
2939
- /*.suppress_non_speech_tokens =*/true,
2940
 
2941
  /*.temperature =*/ 0.0f,
2942
  /*.max_initial_ts =*/ 1.0f,
@@ -3078,8 +3078,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len, bool
3078
  return res;
3079
  }
3080
 
3081
- static const std::vector<std::string> non_speech_tokens
3082
- {
3083
  "\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
3084
  "_", "`", "{", "|", "}", "~", "γ€Œ", "」", "γ€Ž", "』", "<<", ">>", "<<<", ">>>", "--",
3085
  "---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "β™ͺβ™ͺ",
@@ -3149,26 +3148,21 @@ static void whisper_process_logits(
3149
 
3150
  // suppress non-speech tokens
3151
  // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
3152
- if (params.suppress_non_speech_tokens)
3153
- {
3154
- for (const std::string &token : non_speech_tokens)
3155
- {
3156
- std::string suppress_tokens[] = {token, " " + token};
3157
- for (const std::string &suppress_token : suppress_tokens)
3158
- {
3159
- if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end())
3160
- {
3161
  logits[vocab.token_to_id.at(suppress_token)] = -INFINITY;
3162
  }
3163
  }
3164
  }
 
3165
  // allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
3166
- if (vocab.token_to_id.find(" -") != vocab.token_to_id.end())
3167
- {
3168
  logits[vocab.token_to_id.at(" -")] = -INFINITY;
3169
  }
3170
- if (vocab.token_to_id.find(" '") != vocab.token_to_id.end())
3171
- {
3172
  logits[vocab.token_to_id.at(" '")] = -INFINITY;
3173
  }
3174
  }
 
2936
  /*.language =*/ "en",
2937
 
2938
  /*.suppress_blank =*/ true,
2939
+ /*.suppress_non_speech_tokens =*/ false,
2940
 
2941
  /*.temperature =*/ 0.0f,
2942
  /*.max_initial_ts =*/ 1.0f,
 
3078
  return res;
3079
  }
3080
 
3081
+ static const std::vector<std::string> non_speech_tokens = {
 
3082
  "\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
3083
  "_", "`", "{", "|", "}", "~", "γ€Œ", "」", "γ€Ž", "』", "<<", ">>", "<<<", ">>>", "--",
3084
  "---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "β™ͺβ™ͺ",
 
3148
 
3149
  // suppress non-speech tokens
3150
  // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
3151
+ if (params.suppress_non_speech_tokens) {
3152
+ for (const std::string & token : non_speech_tokens) {
3153
+ const std::string suppress_tokens[] = {token, " " + token};
3154
+ for (const std::string & suppress_token : suppress_tokens) {
3155
+ if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end()) {
 
 
 
 
3156
  logits[vocab.token_to_id.at(suppress_token)] = -INFINITY;
3157
  }
3158
  }
3159
  }
3160
+
3161
  // allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
3162
+ if (vocab.token_to_id.find(" -") != vocab.token_to_id.end()) {
 
3163
  logits[vocab.token_to_id.at(" -")] = -INFINITY;
3164
  }
3165
+ if (vocab.token_to_id.find(" '") != vocab.token_to_id.end()) {
 
3166
  logits[vocab.token_to_id.at(" '")] = -INFINITY;
3167
  }
3168
  }