Spaces:
Sleeping
Sleeping
models : simplify the conversion script
Browse files
The "transformers" dependency is not actually needed.
- models/convert-pt-to-ggml.py +23 -23
models/convert-pt-to-ggml.py
CHANGED
|
@@ -40,8 +40,8 @@ import code
|
|
| 40 |
import torch
|
| 41 |
import numpy as np
|
| 42 |
|
| 43 |
-
from transformers import GPTJForCausalLM
|
| 44 |
-
from transformers import GPT2TokenizerFast
|
| 45 |
|
| 46 |
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
|
| 47 |
LANGUAGES = {
|
|
@@ -146,25 +146,25 @@ LANGUAGES = {
|
|
| 146 |
"su": "sundanese",
|
| 147 |
}
|
| 148 |
|
| 149 |
-
|
| 150 |
-
def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
|
| 151 |
-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 152 |
-
path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
|
| 153 |
-
tokenizer = GPT2TokenizerFast.from_pretrained(path)
|
| 154 |
-
|
| 155 |
-
specials = [
|
| 156 |
-
"<|startoftranscript|>",
|
| 157 |
-
*[f"<|{lang}|>" for lang in LANGUAGES.keys()],
|
| 158 |
-
"<|translate|>",
|
| 159 |
-
"<|transcribe|>",
|
| 160 |
-
"<|startoflm|>",
|
| 161 |
-
"<|startofprev|>",
|
| 162 |
-
"<|nocaptions|>",
|
| 163 |
-
"<|notimestamps|>",
|
| 164 |
-
]
|
| 165 |
-
|
| 166 |
-
tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
|
| 167 |
-
return tokenizer
|
| 168 |
|
| 169 |
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
| 170 |
def bytes_to_unicode():
|
|
@@ -224,12 +224,12 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
|
|
| 224 |
#code.interact(local=locals())
|
| 225 |
|
| 226 |
multilingual = hparams["n_vocab"] == 51865
|
| 227 |
-
|
| 228 |
|
|
|
|
| 229 |
#print(tokenizer)
|
| 230 |
#print(tokenizer.name_or_path)
|
| 231 |
#print(len(tokenizer.additional_special_tokens))
|
| 232 |
-
dir_tokenizer = tokenizer.name_or_path
|
| 233 |
|
| 234 |
# output in the same directory as the model
|
| 235 |
fname_out = dir_out + "/ggml-model.bin"
|
|
|
|
| 40 |
import torch
|
| 41 |
import numpy as np
|
| 42 |
|
| 43 |
+
#from transformers import GPTJForCausalLM
|
| 44 |
+
#from transformers import GPT2TokenizerFast
|
| 45 |
|
| 46 |
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
|
| 47 |
LANGUAGES = {
|
|
|
|
| 146 |
"su": "sundanese",
|
| 147 |
}
|
| 148 |
|
| 149 |
+
## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
|
| 150 |
+
#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
|
| 151 |
+
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 152 |
+
# path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
|
| 153 |
+
# tokenizer = GPT2TokenizerFast.from_pretrained(path)
|
| 154 |
+
#
|
| 155 |
+
# specials = [
|
| 156 |
+
# "<|startoftranscript|>",
|
| 157 |
+
# *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
|
| 158 |
+
# "<|translate|>",
|
| 159 |
+
# "<|transcribe|>",
|
| 160 |
+
# "<|startoflm|>",
|
| 161 |
+
# "<|startofprev|>",
|
| 162 |
+
# "<|nocaptions|>",
|
| 163 |
+
# "<|notimestamps|>",
|
| 164 |
+
# ]
|
| 165 |
+
#
|
| 166 |
+
# tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
|
| 167 |
+
# return tokenizer
|
| 168 |
|
| 169 |
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
| 170 |
def bytes_to_unicode():
|
|
|
|
| 224 |
#code.interact(local=locals())
|
| 225 |
|
| 226 |
multilingual = hparams["n_vocab"] == 51865
|
| 227 |
+
dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
|
| 228 |
|
| 229 |
+
#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
|
| 230 |
#print(tokenizer)
|
| 231 |
#print(tokenizer.name_or_path)
|
| 232 |
#print(len(tokenizer.additional_special_tokens))
|
|
|
|
| 233 |
|
| 234 |
# output in the same directory as the model
|
| 235 |
fname_out = dir_out + "/ggml-model.bin"
|