AsukaMinato committed
Commit 3ebe4d6 · unverified · 1 Parent(s): fece054

minor : improve C++ and Python style (#768)


* use some STL functions
* use self.field rather than setattr, use pathlib.Path (see the sketch below)
* recover some formatting
* const some iterators
* Keep the original
* 2-space indentation
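
The pathlib change in the two converter scripts swaps string concatenation for Path objects composed with the "/" operator. A minimal sketch of the pattern, with a placeholder directory name that is not from the repo:

from pathlib import Path

dir_model = Path("models/whisper-base")   # placeholder directory

# old style: build the path by string concatenation
p_old = str(dir_model) + "/vocab.json"

# new style: compose with "/" and open files via Path.open()
p_new = dir_model / "vocab.json"

assert p_new.as_posix() == "models/whisper-base/vocab.json"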

models/convert-h5-to-ggml.py CHANGED
@@ -23,6 +23,7 @@ import json
 import code
 import torch
 import numpy as np
+from pathlib import Path
 
 from transformers import WhisperForConditionalGeneration
 
@@ -75,16 +76,13 @@ if len(sys.argv) < 4:
     print("Usage: convert-h5-to-ggml.py dir_model path-to-whisper-repo dir-output [use-f32]\n")
     sys.exit(1)
 
-dir_model = sys.argv[1]
-dir_whisper = sys.argv[2]
-dir_out = sys.argv[3]
+dir_model = Path(sys.argv[1])
+dir_whisper = Path(sys.argv[2])
+dir_out = Path(sys.argv[3])
 
-with open(dir_model + "/vocab.json", "r", encoding="utf8") as f:
-    encoder = json.load(f)
-with open(dir_model + "/added_tokens.json", "r", encoding="utf8") as f:
-    encoder_added = json.load(f)
-with open(dir_model + "/config.json", "r", encoding="utf8") as f:
-    hparams = json.load(f)
+encoder = json.load((dir_model / "vocab.json").open("r", encoding="utf8"))
+encoder_added = json.load((dir_model / "added_tokens.json").open( "r", encoding="utf8"))
+hparams = json.load((dir_model / "config.json").open("r", encoding="utf8") )
 
 model = WhisperForConditionalGeneration.from_pretrained(dir_model)
 
@@ -96,16 +94,15 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
 
 dir_tokenizer = dir_model
 
-fname_out = dir_out + "/ggml-model.bin"
+fname_out = dir_out / "ggml-model.bin"
 
-with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f:
-    tokens = json.load(f)
+tokens = json.load(open(dir_tokenizer / "vocab.json", "r", encoding="utf8"))
 
 # use 16-bit or 32-bit floats
 use_f16 = True
 if len(sys.argv) > 4:
     use_f16 = False
-    fname_out = dir_out + "/ggml-model-f32.bin"
+    fname_out = dir_out / "ggml-model-f32.bin"
 
 fout = open(fname_out, "wb")
 
@@ -171,10 +168,9 @@ for name in list_vars.keys():
     data = data.astype(np.float16)
 
     # reshape conv bias from [n] to [n, 1]
-    if name == "encoder.conv1.bias" or \
-       name == "encoder.conv2.bias":
+    if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
         data = data.reshape(data.shape[0], 1)
-        print("  Reshaped variable: " + name + " to shape: ", data.shape)
+        print("  Reshaped variable: " , name , " to shape: ", data.shape)
 
     n_dims = len(data.shape)
     print(name, n_dims, data.shape)
@@ -182,7 +178,7 @@ for name in list_vars.keys():
     # looks like the whisper models are in f16 by default
     # so we need to convert the small tensors to f32 until we fully support f16 in ggml
     # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 1;
+    ftype = 1
     if use_f16:
         if n_dims < 2 or \
            name == "encoder.conv1.bias" or \
@@ -197,16 +193,16 @@ for name in list_vars.keys():
             ftype = 0
 
     # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype))
+    str_ = name.encode('utf-8')
+    fout.write(struct.pack("iii", n_dims, len(str_), ftype))
     for i in range(n_dims):
         fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
+    fout.write(str_)
 
     # data
     data.tofile(fout)
 
 fout.close()
 
-print("Done. Output file: " + fname_out)
+print("Done. Output file: " , fname_out)
 print("")
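
The str to str_ rename in the header-writing loop (together with the dropped semicolons) avoids shadowing Python's built-in str for the rest of the loop body. A small illustrative sketch; the tensor name and shape values are made up:

import struct

name = "encoder.conv1.weight"            # made-up tensor name

str = name.encode('utf-8')               # old spelling: shadows the built-in str()
try:
    str(123)                             # now fails: 'bytes' object is not callable
except TypeError as err:
    print("shadowed built-in:", err)
del str                                  # restore the built-in

str_ = name.encode('utf-8')              # committed spelling: no clash, no semicolon
header = struct.pack("iii", 2, len(str_), 1)   # n_dims, name length, ftype
print(len(header), str_)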
models/convert-pt-to-ggml.py CHANGED
@@ -40,7 +40,7 @@ import code
 import torch
 import numpy as np
 import base64
-
+from pathlib import Path
 #from transformers import GPTJForCausalLM
 #from transformers import GPT2TokenizerFast
 
@@ -194,17 +194,17 @@ if len(sys.argv) < 4:
     print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n")
     sys.exit(1)
 
-fname_inp = sys.argv[1]
-dir_whisper = sys.argv[2]
-dir_out = sys.argv[3]
+fname_inp = Path(sys.argv[1])
+dir_whisper = Path(sys.argv[2])
+dir_out = Path(sys.argv[3])
 
 # try to load PyTorch binary data
 try:
     model_bytes = open(fname_inp, "rb").read()
     with io.BytesIO(model_bytes) as fp:
         checkpoint = torch.load(fp, map_location="cpu")
-except:
-    print("Error: failed to load PyTorch model file: %s" % fname_inp)
+except Exception:
+    print("Error: failed to load PyTorch model file:" , fname_inp)
     sys.exit(1)
 
 hparams = checkpoint["dims"]
@@ -218,17 +218,17 @@ list_vars = checkpoint["model_state_dict"]
 
 # load mel filters
 n_mels = hparams["n_mels"]
-with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f:
+with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
     filters = torch.from_numpy(f[f"mel_{n_mels}"])
 #print (filters)
 
 #code.interact(local=locals())
 
 multilingual = hparams["n_vocab"] == 51865
-tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
+tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
 
 # output in the same directory as the model
-fname_out = dir_out + "/ggml-model.bin"
+fname_out = dir_out / "ggml-model.bin"
 
 with open(tokenizer, "rb") as f:
     contents = f.read()
@@ -238,9 +238,9 @@ with open(tokenizer, "rb") as f:
 use_f16 = True
 if len(sys.argv) > 4:
     use_f16 = False
-    fname_out = dir_out + "/ggml-model-f32.bin"
+    fname_out = dir_out / "ggml-model-f32.bin"
 
-fout = open(fname_out, "wb")
+fout = fname_out.open("wb")
 
 fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
 fout.write(struct.pack("i", hparams["n_vocab"]))
@@ -273,20 +273,19 @@ for key in tokens:
 
 for name in list_vars.keys():
     data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
+    print("Processing variable: " , name , " with shape: ", data.shape)
 
     # reshape conv bias from [n] to [n, 1]
-    if name == "encoder.conv1.bias" or \
-       name == "encoder.conv2.bias":
+    if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
         data = data.reshape(data.shape[0], 1)
-        print("  Reshaped variable: " + name + " to shape: ", data.shape)
+        print(f"  Reshaped variable: {name} to shape: ", data.shape)
 
-    n_dims = len(data.shape);
+    n_dims = len(data.shape)
 
     # looks like the whisper models are in f16 by default
     # so we need to convert the small tensors to f32 until we fully support f16 in ggml
     # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 1;
+    ftype = 1
     if use_f16:
         if n_dims < 2 or \
            name == "encoder.conv1.bias" or \
@@ -307,16 +306,16 @@ for name in list_vars.keys():
     # data = data.transpose()
 
     # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype))
+    str_ = name.encode('utf-8')
+    fout.write(struct.pack("iii", n_dims, len(str_), ftype))
     for i in range(n_dims):
         fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
+    fout.write(str_)
 
     # data
     data.tofile(fout)
 
 fout.close()
 
-print("Done. Output file: " + fname_out)
+print("Done. Output file: " , fname_out)
 print("")
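
The other behavioural nuance in convert-pt-to-ggml.py is except: becoming except Exception:. The bare form also traps KeyboardInterrupt and SystemExit, so interrupting a slow torch.load() would be misreported as a corrupt model file. A hedged sketch of the difference, using a raised KeyboardInterrupt as a stand-in for Ctrl-C:

# old style: a bare except swallows the interrupt and blames the model file
try:
    raise KeyboardInterrupt              # stand-in for Ctrl-C during a slow load
except:
    print("old style: misreported as a load error")

# new style: only real errors are caught; the interrupt propagates to the caller
try:
    try:
        raise KeyboardInterrupt
    except Exception:
        print("never reached")
except KeyboardInterrupt:
    print("new style: the interrupt reaches the caller")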
models/convert-whisper-to-coreml.py CHANGED
@@ -20,7 +20,7 @@ def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict,
     """
     for k in state_dict:
         is_attention = all(substr in k for substr in ['attn', '.weight'])
-        is_mlp = any([k.endswith(s) for s in ['mlp.0.weight', 'mlp.2.weight']])
+        is_mlp = any(k.endswith(s) for s in ['mlp.0.weight', 'mlp.2.weight'])
 
         if (is_attention or is_mlp) and len(state_dict[k].shape) == 2:
             state_dict[k] = state_dict[k][:, :, None, None]
@@ -42,11 +42,10 @@ class LayerNormANE(LayerNormANEBase):
 class MultiHeadAttentionANE(MultiHeadAttention):
     def __init__(self, n_state: int, n_head: int):
         super().__init__(n_state, n_head)
-
-        setattr(self, 'query', nn.Conv2d(n_state, n_state, kernel_size=1))
-        setattr(self, 'key', nn.Conv2d(n_state, n_state, kernel_size=1, bias=False))
-        setattr(self, 'value', nn.Conv2d(n_state, n_state, kernel_size=1))
-        setattr(self, 'out', nn.Conv2d(n_state, n_state, kernel_size=1))
+        self.query = nn.Conv2d(n_state, n_state, kernel_size=1)
+        self.key = nn.Conv2d(n_state, n_state, kernel_size=1, bias=False)
+        self.value = nn.Conv2d(n_state, n_state, kernel_size=1)
+        self.out = nn.Conv2d(n_state, n_state, kernel_size=1)
 
     def forward(self,
                 x: Tensor,
@@ -104,30 +103,28 @@ class MultiHeadAttentionANE(MultiHeadAttention):
 class ResidualAttentionBlockANE(ResidualAttentionBlock):
     def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
         super().__init__(n_state, n_head, cross_attention)
-
-        setattr(self, 'attn', MultiHeadAttentionANE(n_state, n_head))
-        setattr(self, 'attn_ln', LayerNormANE(n_state))
-
-        setattr(self, 'cross_attn', MultiHeadAttentionANE(n_state, n_head) if cross_attention else None)
-        setattr(self, 'cross_attn_ln', LayerNormANE(n_state) if cross_attention else None)
+        self.attn = MultiHeadAttentionANE(n_state, n_head)
+        self.attn_ln = LayerNormANE(n_state)
+        self.cross_attn = MultiHeadAttentionANE(n_state, n_head) if cross_attention else None
+        self.cross_attn_ln = LayerNormANE(n_state) if cross_attention else None
 
         n_mlp = n_state * 4
-        setattr(self, 'mlp', nn.Sequential(
+        self.mlp = nn.Sequential(
             nn.Conv2d(n_state, n_mlp, kernel_size=1),
             nn.GELU(),
             nn.Conv2d(n_mlp, n_state, kernel_size=1)
-        ))
-        setattr(self, 'mlp_ln', LayerNormANE(n_state))
+        )
+        self.mlp_ln = LayerNormANE(n_state)
 
 
 class AudioEncoderANE(AudioEncoder):
     def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
         super().__init__(n_mels, n_ctx, n_state, n_head, n_layer)
 
-        setattr(self, 'blocks', nn.ModuleList(
+        self.blocks = nn.ModuleList(
             [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
-        ))
-        setattr(self, 'ln_post', LayerNormANE(n_state))
+        )
+        self.ln_post = LayerNormANE(n_state)
 
     def forward(self, x: Tensor):
         """
@@ -168,10 +165,10 @@ class TextDecoderANE(TextDecoder):
     def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
         super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer)
 
-        setattr(self, 'blocks', nn.ModuleList(
+        self.blocks= nn.ModuleList(
             [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
-        ))
-        setattr(self, 'ln', LayerNormANE(n_state))
+        )
+        self.ln= LayerNormANE(n_state)
 
     def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
         """
@@ -213,20 +210,20 @@ class WhisperANE(Whisper):
     def __init__(self, dims: ModelDimensions):
         super().__init__(dims)
 
-        setattr(self, 'encoder', AudioEncoderANE(
+        self.encoder = AudioEncoderANE(
             self.dims.n_mels,
             self.dims.n_audio_ctx,
             self.dims.n_audio_state,
             self.dims.n_audio_head,
             self.dims.n_audio_layer,
-        ))
-        setattr(self, 'decoder', TextDecoderANE(
+        )
+        self.decoder = TextDecoderANE(
             self.dims.n_vocab,
             self.dims.n_text_ctx,
             self.dims.n_text_state,
             self.dims.n_text_head,
             self.dims.n_text_layer,
-        ))
+        )
 
         self._register_load_state_dict_pre_hook(linear_to_conv2d_map)
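
The setattr calls removed above were equivalent to plain attribute assignment: on an nn.Module both spellings route through Module.__setattr__, so the Conv2d layers are registered as submodules either way. A minimal sketch with a toy module that is not from the repo:

import torch.nn as nn

class Tiny(nn.Module):
    def __init__(self):
        super().__init__()
        setattr(self, 'a', nn.Conv2d(4, 4, kernel_size=1))   # old spelling
        self.b = nn.Conv2d(4, 4, kernel_size=1)              # committed spelling

m = Tiny()
# both layers are registered and visible to named_children()/parameters()
print(sorted(name for name, _ in m.named_children()))        # ['a', 'b']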
 
whisper.cpp CHANGED
@@ -2356,11 +2356,7 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float>
                 sum += fft_out[k] * filters.data[j * n_fft + k];
             }
 
-            if (sum < 1e-10) {
-                sum = 1e-10;
-            }
-
-            sum = log10(sum);
+            sum = log10(std::max(sum, 1e-10));
 
             mel.data[j * mel.n_len + i] = sum;
         }
@@ -2602,7 +2598,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
 }
 
 struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
-    whisper_model_loader loader = {};
 
     fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
 
@@ -2612,22 +2607,27 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
         return nullptr;
     }
 
-    loader.context = &fin;
-
-    loader.read = [](void * ctx, void * output, size_t read_size) {
-        std::ifstream * fin = (std::ifstream*)ctx;
-        fin->read((char *)output, read_size);
-        return read_size;
-    };
-
-    loader.eof = [](void * ctx) {
-        std::ifstream * fin = (std::ifstream*)ctx;
-        return fin->eof();
-    };
-
-    loader.close = [](void * ctx) {
-        std::ifstream * fin = (std::ifstream*)ctx;
-        fin->close();
+    whisper_model_loader loader = {
+        .context = &fin,
+
+        .read =
+            [](void *ctx, void *output, size_t read_size) {
+                std::ifstream *fin = (std::ifstream *)ctx;
+                fin->read((char *)output, read_size);
+                return read_size;
+            },
+
+        .eof =
+            [](void *ctx) {
+                std::ifstream *fin = (std::ifstream *)ctx;
+                return fin->eof();
+            },
+
+        .close =
+            [](void *ctx) {
+                std::ifstream *fin = (std::ifstream *)ctx;
+                fin->close();
+            }
     };
 
     auto ctx = whisper_init_no_state(&loader);
@@ -2647,30 +2647,34 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t
     };
 
     buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
-    whisper_model_loader loader = {};
 
     fprintf(stderr, "%s: loading model from buffer\n", __func__);
 
-    loader.context = &ctx;
-
-    loader.read = [](void * ctx, void * output, size_t read_size) {
-        buf_context * buf = reinterpret_cast<buf_context *>(ctx);
-
-        size_t size_to_copy = buf->current_offset + read_size < buf->size ? read_size : buf->size - buf->current_offset;
-
-        memcpy(output, buf->buffer + buf->current_offset, size_to_copy);
-        buf->current_offset += size_to_copy;
-
-        return size_to_copy;
-    };
-
-    loader.eof = [](void * ctx) {
-        buf_context * buf = reinterpret_cast<buf_context *>(ctx);
-
-        return buf->current_offset >= buf->size;
-    };
-
-    loader.close = [](void * /*ctx*/) { };
+    whisper_model_loader loader = {
+        .context = &ctx,
+
+        .read =
+            [](void *ctx, void *output, size_t read_size) {
+                buf_context *buf = reinterpret_cast<buf_context *>(ctx);
+
+                size_t size_to_copy = buf->current_offset + read_size < buf->size
+                                          ? read_size
+                                          : buf->size - buf->current_offset;
+
+                memcpy(output, buf->buffer + buf->current_offset, size_to_copy);
+                buf->current_offset += size_to_copy;
+
+                return size_to_copy;
+            },
+
+        .eof =
+            [](void *ctx) {
+                buf_context *buf = reinterpret_cast<buf_context *>(ctx);
+
+                return buf->current_offset >= buf->size;
+            },
+
+        .close = [](void * /*ctx*/) {}};
 
     return whisper_init_no_state(&loader);
 }
@@ -2909,7 +2913,6 @@ int whisper_lang_id(const char * lang) {
         fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang);
         return -1;
     }
-
     return g_lang.at(lang).first;
 }
 
@@ -3303,15 +3306,15 @@ static void whisper_exp_compute_token_level_timestamps(
 
 // trim from start (in place)
 static inline void ltrim(std::string &s) {
-    s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
-        return !std::isspace(ch);
+    s.erase(s.begin(), std::find_if_not(s.begin(), s.end(), [](unsigned char ch) {
+        return std::isspace(ch);
     }));
 }
 
 // trim from end (in place)
 static inline void rtrim(std::string &s) {
-    s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
-        return !std::isspace(ch);
+    s.erase(std::find_if_not(s.rbegin(), s.rend(), [](unsigned char ch) {
+        return std::isspace(ch);
     }).base(), s.end());
 }