Fix tokenizer content
Browse files- special_tokens_map.json +25 -4
- tokenizer.json +90 -24
- tokenizer_config.json +20 -13
special_tokens_map.json
CHANGED
|
@@ -1,27 +1,48 @@
|
|
| 1 |
{
|
| 2 |
"bos_token": {
|
| 3 |
-
"content": "
|
| 4 |
"lstrip": false,
|
| 5 |
"normalized": false,
|
| 6 |
"rstrip": false,
|
| 7 |
"single_word": false
|
| 8 |
},
|
| 9 |
"eos_token": {
|
| 10 |
-
"content": "
|
| 11 |
"lstrip": false,
|
| 12 |
"normalized": false,
|
| 13 |
"rstrip": false,
|
| 14 |
"single_word": false
|
| 15 |
},
|
| 16 |
"pad_token": {
|
| 17 |
-
"content": "
|
| 18 |
"lstrip": false,
|
| 19 |
"normalized": false,
|
| 20 |
"rstrip": false,
|
| 21 |
"single_word": false
|
| 22 |
},
|
| 23 |
"unk_token": {
|
| 24 |
-
"content": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"lstrip": false,
|
| 26 |
"normalized": false,
|
| 27 |
"rstrip": false,
|
|
|
|
| 1 |
{
|
| 2 |
"bos_token": {
|
| 3 |
+
"content": "[BOS]",
|
| 4 |
"lstrip": false,
|
| 5 |
"normalized": false,
|
| 6 |
"rstrip": false,
|
| 7 |
"single_word": false
|
| 8 |
},
|
| 9 |
"eos_token": {
|
| 10 |
+
"content": "[EOS]",
|
| 11 |
"lstrip": false,
|
| 12 |
"normalized": false,
|
| 13 |
"rstrip": false,
|
| 14 |
"single_word": false
|
| 15 |
},
|
| 16 |
"pad_token": {
|
| 17 |
+
"content": "[PAD]",
|
| 18 |
"lstrip": false,
|
| 19 |
"normalized": false,
|
| 20 |
"rstrip": false,
|
| 21 |
"single_word": false
|
| 22 |
},
|
| 23 |
"unk_token": {
|
| 24 |
+
"content": "[UNK]",
|
| 25 |
+
"lstrip": false,
|
| 26 |
+
"normalized": false,
|
| 27 |
+
"rstrip": false,
|
| 28 |
+
"single_word": false
|
| 29 |
+
},
|
| 30 |
+
"cls_token": {
|
| 31 |
+
"content": "[CLS]",
|
| 32 |
+
"lstrip": false,
|
| 33 |
+
"normalized": false,
|
| 34 |
+
"rstrip": false,
|
| 35 |
+
"single_word": false
|
| 36 |
+
},
|
| 37 |
+
"mask_token": {
|
| 38 |
+
"content": "[MASK]",
|
| 39 |
+
"lstrip": true,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false
|
| 43 |
+
},
|
| 44 |
+
"sep_token": {
|
| 45 |
+
"content": "[SEP]",
|
| 46 |
"lstrip": false,
|
| 47 |
"normalized": false,
|
| 48 |
"rstrip": false,
|
tokenizer.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
"added_tokens": [
|
| 6 |
{
|
| 7 |
"id": 0,
|
| 8 |
-
"content": "
|
| 9 |
"single_word": false,
|
| 10 |
"lstrip": false,
|
| 11 |
"rstrip": false,
|
|
@@ -14,7 +14,7 @@
|
|
| 14 |
},
|
| 15 |
{
|
| 16 |
"id": 1,
|
| 17 |
-
"content": "
|
| 18 |
"single_word": false,
|
| 19 |
"lstrip": false,
|
| 20 |
"rstrip": false,
|
|
@@ -23,7 +23,7 @@
|
|
| 23 |
},
|
| 24 |
{
|
| 25 |
"id": 2,
|
| 26 |
-
"content": "
|
| 27 |
"single_word": false,
|
| 28 |
"lstrip": false,
|
| 29 |
"rstrip": false,
|
|
@@ -32,7 +32,7 @@
|
|
| 32 |
},
|
| 33 |
{
|
| 34 |
"id": 3,
|
| 35 |
-
"content": "
|
| 36 |
"single_word": false,
|
| 37 |
"lstrip": false,
|
| 38 |
"rstrip": false,
|
|
@@ -41,7 +41,7 @@
|
|
| 41 |
},
|
| 42 |
{
|
| 43 |
"id": 4,
|
| 44 |
-
"content": "
|
| 45 |
"single_word": false,
|
| 46 |
"lstrip": false,
|
| 47 |
"rstrip": false,
|
|
@@ -50,7 +50,7 @@
|
|
| 50 |
},
|
| 51 |
{
|
| 52 |
"id": 5,
|
| 53 |
-
"content": "
|
| 54 |
"single_word": false,
|
| 55 |
"lstrip": false,
|
| 56 |
"rstrip": false,
|
|
@@ -59,9 +59,9 @@
|
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"id": 6,
|
| 62 |
-
"content": "
|
| 63 |
"single_word": false,
|
| 64 |
-
"lstrip":
|
| 65 |
"rstrip": false,
|
| 66 |
"normalized": false,
|
| 67 |
"special": true
|
|
@@ -2396,7 +2396,7 @@
|
|
| 2396 |
"single": [
|
| 2397 |
{
|
| 2398 |
"SpecialToken": {
|
| 2399 |
-
"id": "
|
| 2400 |
"type_id": 0
|
| 2401 |
}
|
| 2402 |
},
|
|
@@ -2405,12 +2405,18 @@
|
|
| 2405 |
"id": "A",
|
| 2406 |
"type_id": 0
|
| 2407 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2408 |
}
|
| 2409 |
],
|
| 2410 |
"pair": [
|
| 2411 |
{
|
| 2412 |
"SpecialToken": {
|
| 2413 |
-
"id": "
|
| 2414 |
"type_id": 0
|
| 2415 |
}
|
| 2416 |
},
|
|
@@ -2422,25 +2428,85 @@
|
|
| 2422 |
},
|
| 2423 |
{
|
| 2424 |
"SpecialToken": {
|
| 2425 |
-
"id": "
|
| 2426 |
-
"type_id":
|
| 2427 |
}
|
| 2428 |
},
|
| 2429 |
{
|
| 2430 |
"Sequence": {
|
| 2431 |
"id": "B",
|
| 2432 |
-
"type_id":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2433 |
}
|
| 2434 |
}
|
| 2435 |
],
|
| 2436 |
"special_tokens": {
|
| 2437 |
-
"
|
| 2438 |
-
"id": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2439 |
"ids": [
|
| 2440 |
2
|
| 2441 |
],
|
| 2442 |
"tokens": [
|
| 2443 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2444 |
]
|
| 2445 |
}
|
| 2446 |
}
|
|
@@ -2456,20 +2522,20 @@
|
|
| 2456 |
"model": {
|
| 2457 |
"type": "BPE",
|
| 2458 |
"dropout": null,
|
| 2459 |
-
"unk_token": "
|
| 2460 |
"continuing_subword_prefix": null,
|
| 2461 |
"end_of_word_suffix": null,
|
| 2462 |
"fuse_unk": false,
|
| 2463 |
"byte_fallback": false,
|
| 2464 |
"ignore_merges": true,
|
| 2465 |
"vocab": {
|
| 2466 |
-
"
|
| 2467 |
-
"
|
| 2468 |
-
"
|
| 2469 |
-
"
|
| 2470 |
-
"
|
| 2471 |
-
"
|
| 2472 |
-
"
|
| 2473 |
"<RESERVED_TOKEN_1>": 7,
|
| 2474 |
"<RESERVED_TOKEN_2>": 8,
|
| 2475 |
"<RESERVED_TOKEN_3>": 9,
|
|
|
|
| 5 |
"added_tokens": [
|
| 6 |
{
|
| 7 |
"id": 0,
|
| 8 |
+
"content": "[PAD]",
|
| 9 |
"single_word": false,
|
| 10 |
"lstrip": false,
|
| 11 |
"rstrip": false,
|
|
|
|
| 14 |
},
|
| 15 |
{
|
| 16 |
"id": 1,
|
| 17 |
+
"content": "[UNK]",
|
| 18 |
"single_word": false,
|
| 19 |
"lstrip": false,
|
| 20 |
"rstrip": false,
|
|
|
|
| 23 |
},
|
| 24 |
{
|
| 25 |
"id": 2,
|
| 26 |
+
"content": "[BOS]",
|
| 27 |
"single_word": false,
|
| 28 |
"lstrip": false,
|
| 29 |
"rstrip": false,
|
|
|
|
| 32 |
},
|
| 33 |
{
|
| 34 |
"id": 3,
|
| 35 |
+
"content": "[EOS]",
|
| 36 |
"single_word": false,
|
| 37 |
"lstrip": false,
|
| 38 |
"rstrip": false,
|
|
|
|
| 41 |
},
|
| 42 |
{
|
| 43 |
"id": 4,
|
| 44 |
+
"content": "[CLS]",
|
| 45 |
"single_word": false,
|
| 46 |
"lstrip": false,
|
| 47 |
"rstrip": false,
|
|
|
|
| 50 |
},
|
| 51 |
{
|
| 52 |
"id": 5,
|
| 53 |
+
"content": "[SEP]",
|
| 54 |
"single_word": false,
|
| 55 |
"lstrip": false,
|
| 56 |
"rstrip": false,
|
|
|
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"id": 6,
|
| 62 |
+
"content": "[MASK]",
|
| 63 |
"single_word": false,
|
| 64 |
+
"lstrip": true,
|
| 65 |
"rstrip": false,
|
| 66 |
"normalized": false,
|
| 67 |
"special": true
|
|
|
|
| 2396 |
"single": [
|
| 2397 |
{
|
| 2398 |
"SpecialToken": {
|
| 2399 |
+
"id": "[CLS]",
|
| 2400 |
"type_id": 0
|
| 2401 |
}
|
| 2402 |
},
|
|
|
|
| 2405 |
"id": "A",
|
| 2406 |
"type_id": 0
|
| 2407 |
}
|
| 2408 |
+
},
|
| 2409 |
+
{
|
| 2410 |
+
"SpecialToken": {
|
| 2411 |
+
"id": "[SEP]",
|
| 2412 |
+
"type_id": 0
|
| 2413 |
+
}
|
| 2414 |
}
|
| 2415 |
],
|
| 2416 |
"pair": [
|
| 2417 |
{
|
| 2418 |
"SpecialToken": {
|
| 2419 |
+
"id": "[CLS]",
|
| 2420 |
"type_id": 0
|
| 2421 |
}
|
| 2422 |
},
|
|
|
|
| 2428 |
},
|
| 2429 |
{
|
| 2430 |
"SpecialToken": {
|
| 2431 |
+
"id": "[SEP]",
|
| 2432 |
+
"type_id": 0
|
| 2433 |
}
|
| 2434 |
},
|
| 2435 |
{
|
| 2436 |
"Sequence": {
|
| 2437 |
"id": "B",
|
| 2438 |
+
"type_id": 0
|
| 2439 |
+
}
|
| 2440 |
+
},
|
| 2441 |
+
{
|
| 2442 |
+
"SpecialToken": {
|
| 2443 |
+
"id": "[SEP]",
|
| 2444 |
+
"type_id": 0
|
| 2445 |
}
|
| 2446 |
}
|
| 2447 |
],
|
| 2448 |
"special_tokens": {
|
| 2449 |
+
"[PAD]": {
|
| 2450 |
+
"id": "[PAD]",
|
| 2451 |
+
"ids": [
|
| 2452 |
+
0
|
| 2453 |
+
],
|
| 2454 |
+
"tokens": [
|
| 2455 |
+
"[PAD]"
|
| 2456 |
+
]
|
| 2457 |
+
},
|
| 2458 |
+
"[UNK]": {
|
| 2459 |
+
"id": "[UNK]",
|
| 2460 |
+
"ids": [
|
| 2461 |
+
1
|
| 2462 |
+
],
|
| 2463 |
+
"tokens": [
|
| 2464 |
+
"[UNK]"
|
| 2465 |
+
]
|
| 2466 |
+
},
|
| 2467 |
+
"[BOS]": {
|
| 2468 |
+
"id": "[BOS]",
|
| 2469 |
"ids": [
|
| 2470 |
2
|
| 2471 |
],
|
| 2472 |
"tokens": [
|
| 2473 |
+
"[BOS]"
|
| 2474 |
+
]
|
| 2475 |
+
},
|
| 2476 |
+
"[EOS]": {
|
| 2477 |
+
"id": "[EOS]",
|
| 2478 |
+
"ids": [
|
| 2479 |
+
3
|
| 2480 |
+
],
|
| 2481 |
+
"tokens": [
|
| 2482 |
+
"[EOS]"
|
| 2483 |
+
]
|
| 2484 |
+
},
|
| 2485 |
+
"[CLS]": {
|
| 2486 |
+
"id": "[CLS]",
|
| 2487 |
+
"ids": [
|
| 2488 |
+
4
|
| 2489 |
+
],
|
| 2490 |
+
"tokens": [
|
| 2491 |
+
"[CLS]"
|
| 2492 |
+
]
|
| 2493 |
+
},
|
| 2494 |
+
"[SEP]": {
|
| 2495 |
+
"id": "[SEP]",
|
| 2496 |
+
"ids": [
|
| 2497 |
+
5
|
| 2498 |
+
],
|
| 2499 |
+
"tokens": [
|
| 2500 |
+
"[SEP]"
|
| 2501 |
+
]
|
| 2502 |
+
},
|
| 2503 |
+
"[MASK]": {
|
| 2504 |
+
"id": "[MASK]",
|
| 2505 |
+
"ids": [
|
| 2506 |
+
6
|
| 2507 |
+
],
|
| 2508 |
+
"tokens": [
|
| 2509 |
+
"[MASK]"
|
| 2510 |
]
|
| 2511 |
}
|
| 2512 |
}
|
|
|
|
| 2522 |
"model": {
|
| 2523 |
"type": "BPE",
|
| 2524 |
"dropout": null,
|
| 2525 |
+
"unk_token": "[UNK]",
|
| 2526 |
"continuing_subword_prefix": null,
|
| 2527 |
"end_of_word_suffix": null,
|
| 2528 |
"fuse_unk": false,
|
| 2529 |
"byte_fallback": false,
|
| 2530 |
"ignore_merges": true,
|
| 2531 |
"vocab": {
|
| 2532 |
+
"[PAD]": 0,
|
| 2533 |
+
"[UNK]": 1,
|
| 2534 |
+
"[BOS]": 2,
|
| 2535 |
+
"[EOS]": 3,
|
| 2536 |
+
"[CLS]": 4,
|
| 2537 |
+
"[SEP]": 5,
|
| 2538 |
+
"[MASK]": 6,
|
| 2539 |
"<RESERVED_TOKEN_1>": 7,
|
| 2540 |
"<RESERVED_TOKEN_2>": 8,
|
| 2541 |
"<RESERVED_TOKEN_3>": 9,
|
tokenizer_config.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"added_tokens_decoder": {
|
| 3 |
"0": {
|
| 4 |
-
"content": "
|
| 5 |
"lstrip": false,
|
| 6 |
"normalized": false,
|
| 7 |
"rstrip": false,
|
|
@@ -9,7 +9,7 @@
|
|
| 9 |
"special": true
|
| 10 |
},
|
| 11 |
"1": {
|
| 12 |
-
"content": "
|
| 13 |
"lstrip": false,
|
| 14 |
"normalized": false,
|
| 15 |
"rstrip": false,
|
|
@@ -17,7 +17,7 @@
|
|
| 17 |
"special": true
|
| 18 |
},
|
| 19 |
"2": {
|
| 20 |
-
"content": "
|
| 21 |
"lstrip": false,
|
| 22 |
"normalized": false,
|
| 23 |
"rstrip": false,
|
|
@@ -25,7 +25,7 @@
|
|
| 25 |
"special": true
|
| 26 |
},
|
| 27 |
"3": {
|
| 28 |
-
"content": "
|
| 29 |
"lstrip": false,
|
| 30 |
"normalized": false,
|
| 31 |
"rstrip": false,
|
|
@@ -33,7 +33,7 @@
|
|
| 33 |
"special": true
|
| 34 |
},
|
| 35 |
"4": {
|
| 36 |
-
"content": "
|
| 37 |
"lstrip": false,
|
| 38 |
"normalized": false,
|
| 39 |
"rstrip": false,
|
|
@@ -41,7 +41,7 @@
|
|
| 41 |
"special": true
|
| 42 |
},
|
| 43 |
"5": {
|
| 44 |
-
"content": "
|
| 45 |
"lstrip": false,
|
| 46 |
"normalized": false,
|
| 47 |
"rstrip": false,
|
|
@@ -49,8 +49,8 @@
|
|
| 49 |
"special": true
|
| 50 |
},
|
| 51 |
"6": {
|
| 52 |
-
"content": "
|
| 53 |
-
"lstrip":
|
| 54 |
"normalized": false,
|
| 55 |
"rstrip": false,
|
| 56 |
"single_word": false,
|
|
@@ -2097,11 +2097,18 @@
|
|
| 2097 |
"special": true
|
| 2098 |
}
|
| 2099 |
},
|
| 2100 |
-
"bos_token": "
|
| 2101 |
"clean_up_tokenization_spaces": true,
|
| 2102 |
-
"eos_token": "
|
| 2103 |
-
"model_max_length":
|
| 2104 |
-
"pad_token": "
|
| 2105 |
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 2106 |
-
"unk_token": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2107 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"added_tokens_decoder": {
|
| 3 |
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
"lstrip": false,
|
| 6 |
"normalized": false,
|
| 7 |
"rstrip": false,
|
|
|
|
| 9 |
"special": true
|
| 10 |
},
|
| 11 |
"1": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
"lstrip": false,
|
| 14 |
"normalized": false,
|
| 15 |
"rstrip": false,
|
|
|
|
| 17 |
"special": true
|
| 18 |
},
|
| 19 |
"2": {
|
| 20 |
+
"content": "[BOS]",
|
| 21 |
"lstrip": false,
|
| 22 |
"normalized": false,
|
| 23 |
"rstrip": false,
|
|
|
|
| 25 |
"special": true
|
| 26 |
},
|
| 27 |
"3": {
|
| 28 |
+
"content": "[EOS]",
|
| 29 |
"lstrip": false,
|
| 30 |
"normalized": false,
|
| 31 |
"rstrip": false,
|
|
|
|
| 33 |
"special": true
|
| 34 |
},
|
| 35 |
"4": {
|
| 36 |
+
"content": "[CLS]",
|
| 37 |
"lstrip": false,
|
| 38 |
"normalized": false,
|
| 39 |
"rstrip": false,
|
|
|
|
| 41 |
"special": true
|
| 42 |
},
|
| 43 |
"5": {
|
| 44 |
+
"content": "[SEP]",
|
| 45 |
"lstrip": false,
|
| 46 |
"normalized": false,
|
| 47 |
"rstrip": false,
|
|
|
|
| 49 |
"special": true
|
| 50 |
},
|
| 51 |
"6": {
|
| 52 |
+
"content": "[MASK]",
|
| 53 |
+
"lstrip": true,
|
| 54 |
"normalized": false,
|
| 55 |
"rstrip": false,
|
| 56 |
"single_word": false,
|
|
|
|
| 2097 |
"special": true
|
| 2098 |
}
|
| 2099 |
},
|
| 2100 |
+
"bos_token": "[BOS]",
|
| 2101 |
"clean_up_tokenization_spaces": true,
|
| 2102 |
+
"eos_token": "[EOS]",
|
| 2103 |
+
"model_max_length": 8192,
|
| 2104 |
+
"pad_token": "[PAD]",
|
| 2105 |
"tokenizer_class": "PreTrainedTokenizerFast",
|
| 2106 |
+
"unk_token": "[UNK]",
|
| 2107 |
+
"cls_token": "[CLS]",
|
| 2108 |
+
"mask_token": "[MASK]",
|
| 2109 |
+
"sep_token": "[SEP]",
|
| 2110 |
+
"model_input_names": [
|
| 2111 |
+
"input_ids",
|
| 2112 |
+
"attention_mask"
|
| 2113 |
+
]
|
| 2114 |
}
|