ggerganov committed
Commit 1ffa4c8 (unverified)
1 Parent(s): 23d667f

models : simplify the conversion script


"transformers" dependency is not actually needed

Files changed (1):
  1. models/convert-pt-to-ggml.py (+23 -23)
models/convert-pt-to-ggml.py CHANGED
@@ -40,8 +40,8 @@ import code
 import torch
 import numpy as np
 
-from transformers import GPTJForCausalLM
-from transformers import GPT2TokenizerFast
+#from transformers import GPTJForCausalLM
+#from transformers import GPT2TokenizerFast
 
 # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 LANGUAGES = {
@@ -146,25 +146,25 @@ LANGUAGES = {
     "su": "sundanese",
 }
 
-# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
-def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
-    os.environ["TOKENIZERS_PARALLELISM"] = "false"
-    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
-    tokenizer = GPT2TokenizerFast.from_pretrained(path)
-
-    specials = [
-        "<|startoftranscript|>",
-        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
-        "<|translate|>",
-        "<|transcribe|>",
-        "<|startoflm|>",
-        "<|startofprev|>",
-        "<|nocaptions|>",
-        "<|notimestamps|>",
-    ]
-
-    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
-    return tokenizer
+## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
+#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
+#    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+#    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
+#    tokenizer = GPT2TokenizerFast.from_pretrained(path)
+#
+#    specials = [
+#        "<|startoftranscript|>",
+#        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+#        "<|translate|>",
+#        "<|transcribe|>",
+#        "<|startoflm|>",
+#        "<|startofprev|>",
+#        "<|nocaptions|>",
+#        "<|notimestamps|>",
+#    ]
+#
+#    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
+#    return tokenizer
 
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 def bytes_to_unicode():
@@ -224,12 +224,12 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
 #code.interact(local=locals())
 
 multilingual = hparams["n_vocab"] == 51865
-tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
+dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
 
+#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
 #print(tokenizer)
 #print(tokenizer.name_or_path)
 #print(len(tokenizer.additional_special_tokens))
-dir_tokenizer = tokenizer.name_or_path
 
 # output in the same directory as the model
 fname_out = dir_out + "/ggml-model.bin"
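
The change works because the tokenizer shipped in the openai/whisper checkout is just a set of plain files under whisper/assets, so the converter only has to resolve that directory; nothing in it requires GPT2TokenizerFast or GPTJForCausalLM. Below is a minimal sketch of the idea, assuming the unchanged remainder of the script reads the BPE vocab from vocab.json with the standard json module (that part is not shown in this diff) and using a hypothetical checkout path.

    import json
    import os

    dir_whisper  = "/path/to/openai-whisper"   # hypothetical checkout location
    multilingual = True                         # hparams["n_vocab"] == 51865 in the script

    # same idiom as in the diff; equivalent to: "multilingual" if multilingual else "gpt2"
    dir_tokenizer = os.path.join(dir_whisper, "whisper/assets",
                                 multilingual and "multilingual" or "gpt2")

    # the BPE vocab is a plain JSON mapping of token string -> id, so it can be
    # enumerated without instantiating a transformers tokenizer
    with open(os.path.join(dir_tokenizer, "vocab.json"), "r", encoding="utf8") as f:
        tokens = json.load(f)

    print(len(tokens), "base tokens loaded from", dir_tokenizer)

The special tokens that build_tokenizer() used to register (<|startoftranscript|>, the per-language tags, <|notimestamps|>, and so on) sit at fixed ids above the base vocabulary in Whisper, so dropping the helper does not remove any information the conversion needs.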