ebrarkiziloglu committed on
Commit
2d009e0
·
verified ·
1 Parent(s): 47b8dd5

Fix tokenizer content

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +25 -4
  2. tokenizer.json +90 -24
  3. tokenizer_config.json +20 -13
special_tokens_map.json CHANGED
@@ -1,27 +1,48 @@
1
  {
2
  "bos_token": {
3
- "content": "<BOS>",
4
  "lstrip": false,
5
  "normalized": false,
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
  "eos_token": {
10
- "content": "<EOS>",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "pad_token": {
17
- "content": "<PAD>",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
21
  "single_word": false
22
  },
23
  "unk_token": {
24
- "content": "<UNK>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  "lstrip": false,
26
  "normalized": false,
27
  "rstrip": false,
 
1
  {
2
  "bos_token": {
3
+ "content": "[BOS]",
4
  "lstrip": false,
5
  "normalized": false,
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
  "eos_token": {
10
+ "content": "[EOS]",
11
  "lstrip": false,
12
  "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
16
  "pad_token": {
17
+ "content": "[PAD]",
18
  "lstrip": false,
19
  "normalized": false,
20
  "rstrip": false,
21
  "single_word": false
22
  },
23
  "unk_token": {
24
+ "content": "[UNK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "cls_token": {
31
+ "content": "[CLS]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "mask_token": {
38
+ "content": "[MASK]",
39
+ "lstrip": true,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "sep_token": {
45
+ "content": "[SEP]",
46
  "lstrip": false,
47
  "normalized": false,
48
  "rstrip": false,
tokenizer.json CHANGED
@@ -5,7 +5,7 @@
5
  "added_tokens": [
6
  {
7
  "id": 0,
8
- "content": "<PAD>",
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
@@ -14,7 +14,7 @@
14
  },
15
  {
16
  "id": 1,
17
- "content": "<UNK>",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
@@ -23,7 +23,7 @@
23
  },
24
  {
25
  "id": 2,
26
- "content": "<BOS>",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
@@ -32,7 +32,7 @@
32
  },
33
  {
34
  "id": 3,
35
- "content": "<EOS>",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
@@ -41,7 +41,7 @@
41
  },
42
  {
43
  "id": 4,
44
- "content": "<INST>",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
@@ -50,7 +50,7 @@
50
  },
51
  {
52
  "id": 5,
53
- "content": "</INST>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
@@ -59,9 +59,9 @@
59
  },
60
  {
61
  "id": 6,
62
- "content": "<MASK>",
63
  "single_word": false,
64
- "lstrip": false,
65
  "rstrip": false,
66
  "normalized": false,
67
  "special": true
@@ -2396,7 +2396,7 @@
2396
  "single": [
2397
  {
2398
  "SpecialToken": {
2399
- "id": "<BOS>",
2400
  "type_id": 0
2401
  }
2402
  },
@@ -2405,12 +2405,18 @@
2405
  "id": "A",
2406
  "type_id": 0
2407
  }
 
 
 
 
 
 
2408
  }
2409
  ],
2410
  "pair": [
2411
  {
2412
  "SpecialToken": {
2413
- "id": "<BOS>",
2414
  "type_id": 0
2415
  }
2416
  },
@@ -2422,25 +2428,85 @@
2422
  },
2423
  {
2424
  "SpecialToken": {
2425
- "id": "<BOS>",
2426
- "type_id": 1
2427
  }
2428
  },
2429
  {
2430
  "Sequence": {
2431
  "id": "B",
2432
- "type_id": 1
 
 
 
 
 
 
2433
  }
2434
  }
2435
  ],
2436
  "special_tokens": {
2437
- "<BOS>": {
2438
- "id": "<BOS>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2439
  "ids": [
2440
  2
2441
  ],
2442
  "tokens": [
2443
- "<BOS>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2444
  ]
2445
  }
2446
  }
@@ -2456,20 +2522,20 @@
2456
  "model": {
2457
  "type": "BPE",
2458
  "dropout": null,
2459
- "unk_token": "<UNK>",
2460
  "continuing_subword_prefix": null,
2461
  "end_of_word_suffix": null,
2462
  "fuse_unk": false,
2463
  "byte_fallback": false,
2464
  "ignore_merges": true,
2465
  "vocab": {
2466
- "<PAD>": 0,
2467
- "<UNK>": 1,
2468
- "<BOS>": 2,
2469
- "<EOS>": 3,
2470
- "<INST>": 4,
2471
- "</INST>": 5,
2472
- "<MASK>": 6,
2473
  "<RESERVED_TOKEN_1>": 7,
2474
  "<RESERVED_TOKEN_2>": 8,
2475
  "<RESERVED_TOKEN_3>": 9,
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
8
+ "content": "[PAD]",
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
 
14
  },
15
  {
16
  "id": 1,
17
+ "content": "[UNK]",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
 
23
  },
24
  {
25
  "id": 2,
26
+ "content": "[BOS]",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
 
32
  },
33
  {
34
  "id": 3,
35
+ "content": "[EOS]",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
 
41
  },
42
  {
43
  "id": 4,
44
+ "content": "[CLS]",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
 
50
  },
51
  {
52
  "id": 5,
53
+ "content": "[SEP]",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
 
59
  },
60
  {
61
  "id": 6,
62
+ "content": "[MASK]",
63
  "single_word": false,
64
+ "lstrip": true,
65
  "rstrip": false,
66
  "normalized": false,
67
  "special": true
 
2396
  "single": [
2397
  {
2398
  "SpecialToken": {
2399
+ "id": "[CLS]",
2400
  "type_id": 0
2401
  }
2402
  },
 
2405
  "id": "A",
2406
  "type_id": 0
2407
  }
2408
+ },
2409
+ {
2410
+ "SpecialToken": {
2411
+ "id": "[SEP]",
2412
+ "type_id": 0
2413
+ }
2414
  }
2415
  ],
2416
  "pair": [
2417
  {
2418
  "SpecialToken": {
2419
+ "id": "[CLS]",
2420
  "type_id": 0
2421
  }
2422
  },
 
2428
  },
2429
  {
2430
  "SpecialToken": {
2431
+ "id": "[SEP]",
2432
+ "type_id": 0
2433
  }
2434
  },
2435
  {
2436
  "Sequence": {
2437
  "id": "B",
2438
+ "type_id": 0
2439
+ }
2440
+ },
2441
+ {
2442
+ "SpecialToken": {
2443
+ "id": "[SEP]",
2444
+ "type_id": 0
2445
  }
2446
  }
2447
  ],
2448
  "special_tokens": {
2449
+ "[PAD]": {
2450
+ "id": "[PAD]",
2451
+ "ids": [
2452
+ 0
2453
+ ],
2454
+ "tokens": [
2455
+ "[PAD]"
2456
+ ]
2457
+ },
2458
+ "[UNK]": {
2459
+ "id": "[UNK]",
2460
+ "ids": [
2461
+ 1
2462
+ ],
2463
+ "tokens": [
2464
+ "[UNK]"
2465
+ ]
2466
+ },
2467
+ "[BOS]": {
2468
+ "id": "[BOS]",
2469
  "ids": [
2470
  2
2471
  ],
2472
  "tokens": [
2473
+ "[BOS]"
2474
+ ]
2475
+ },
2476
+ "[EOS]": {
2477
+ "id": "[EOS]",
2478
+ "ids": [
2479
+ 3
2480
+ ],
2481
+ "tokens": [
2482
+ "[EOS]"
2483
+ ]
2484
+ },
2485
+ "[CLS]": {
2486
+ "id": "[CLS]",
2487
+ "ids": [
2488
+ 4
2489
+ ],
2490
+ "tokens": [
2491
+ "[CLS]"
2492
+ ]
2493
+ },
2494
+ "[SEP]": {
2495
+ "id": "[SEP]",
2496
+ "ids": [
2497
+ 5
2498
+ ],
2499
+ "tokens": [
2500
+ "[SEP]"
2501
+ ]
2502
+ },
2503
+ "[MASK]": {
2504
+ "id": "[MASK]",
2505
+ "ids": [
2506
+ 6
2507
+ ],
2508
+ "tokens": [
2509
+ "[MASK]"
2510
  ]
2511
  }
2512
  }
 
2522
  "model": {
2523
  "type": "BPE",
2524
  "dropout": null,
2525
+ "unk_token": "[UNK]",
2526
  "continuing_subword_prefix": null,
2527
  "end_of_word_suffix": null,
2528
  "fuse_unk": false,
2529
  "byte_fallback": false,
2530
  "ignore_merges": true,
2531
  "vocab": {
2532
+ "[PAD]": 0,
2533
+ "[UNK]": 1,
2534
+ "[BOS]": 2,
2535
+ "[EOS]": 3,
2536
+ "[CLS]": 4,
2537
+ "[SEP]": 5,
2538
+ "[MASK]": 6,
2539
  "<RESERVED_TOKEN_1>": 7,
2540
  "<RESERVED_TOKEN_2>": 8,
2541
  "<RESERVED_TOKEN_3>": 9,
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "added_tokens_decoder": {
3
  "0": {
4
- "content": "<PAD>",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
@@ -9,7 +9,7 @@
9
  "special": true
10
  },
11
  "1": {
12
- "content": "<UNK>",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
@@ -17,7 +17,7 @@
17
  "special": true
18
  },
19
  "2": {
20
- "content": "<BOS>",
21
  "lstrip": false,
22
  "normalized": false,
23
  "rstrip": false,
@@ -25,7 +25,7 @@
25
  "special": true
26
  },
27
  "3": {
28
- "content": "<EOS>",
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
@@ -33,7 +33,7 @@
33
  "special": true
34
  },
35
  "4": {
36
- "content": "<INST>",
37
  "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
@@ -41,7 +41,7 @@
41
  "special": true
42
  },
43
  "5": {
44
- "content": "</INST>",
45
  "lstrip": false,
46
  "normalized": false,
47
  "rstrip": false,
@@ -49,8 +49,8 @@
49
  "special": true
50
  },
51
  "6": {
52
- "content": "<MASK>",
53
- "lstrip": false,
54
  "normalized": false,
55
  "rstrip": false,
56
  "single_word": false,
@@ -2097,11 +2097,18 @@
2097
  "special": true
2098
  }
2099
  },
2100
- "bos_token": "<BOS>",
2101
  "clean_up_tokenization_spaces": true,
2102
- "eos_token": "<EOS>",
2103
- "model_max_length": 1000000000000000019884624838656,
2104
- "pad_token": "<PAD>",
2105
  "tokenizer_class": "PreTrainedTokenizerFast",
2106
- "unk_token": "<UNK>"
 
 
 
 
 
 
 
2107
  }
 
1
  {
2
  "added_tokens_decoder": {
3
  "0": {
4
+ "content": "[PAD]",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
 
9
  "special": true
10
  },
11
  "1": {
12
+ "content": "[UNK]",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
 
17
  "special": true
18
  },
19
  "2": {
20
+ "content": "[BOS]",
21
  "lstrip": false,
22
  "normalized": false,
23
  "rstrip": false,
 
25
  "special": true
26
  },
27
  "3": {
28
+ "content": "[EOS]",
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
 
33
  "special": true
34
  },
35
  "4": {
36
+ "content": "[CLS]",
37
  "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
 
41
  "special": true
42
  },
43
  "5": {
44
+ "content": "[SEP]",
45
  "lstrip": false,
46
  "normalized": false,
47
  "rstrip": false,
 
49
  "special": true
50
  },
51
  "6": {
52
+ "content": "[MASK]",
53
+ "lstrip": true,
54
  "normalized": false,
55
  "rstrip": false,
56
  "single_word": false,
 
2097
  "special": true
2098
  }
2099
  },
2100
+ "bos_token": "[BOS]",
2101
  "clean_up_tokenization_spaces": true,
2102
+ "eos_token": "[EOS]",
2103
+ "model_max_length": 8192,
2104
+ "pad_token": "[PAD]",
2105
  "tokenizer_class": "PreTrainedTokenizerFast",
2106
+ "unk_token": "[UNK]",
2107
+ "cls_token": "[CLS]",
2108
+ "mask_token": "[MASK]",
2109
+ "sep_token": "[SEP]",
2110
+ "model_input_names": [
2111
+ "input_ids",
2112
+ "attention_mask"
2113
+ ]
2114
  }