# prepare_cal_data.py (Final version)
import os

# Set HF_HOME before importing `datasets`: the Hugging Face cache path is
# resolved when the library is imported, so setting it afterwards has no effect.
os.environ['HF_HOME'] = os.path.join(os.path.expanduser('~'), '.cache', 'huggingface')

from datasets import load_dataset
import pandas as pd

print("Downloading and preparing a LARGER wikitext calibration dataset...")

# Stream the train split and take the first 10,000 samples (instead of 1,000)
# to ensure enough tokens, without downloading the full dataset.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train", streaming=True).take(10000)

output_parquet_file = "wikitext_cal_data.parquet"

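# Keep only non-empty rows; raw wikitext contains many blank lines.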
data_list = []
for item in dataset:
    text = item['text'].strip()
    if text:
        data_list.append(text)

df = pd.DataFrame(data_list, columns=['text'])
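# Write to Parquet; the 'fastparquet' engine requires the fastparquet package
# to be installed (pip install fastparquet).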
df.to_parquet(output_parquet_file, engine='fastparquet')

print(f"'{output_parquet_file}' created successfully with more data.")