# prepare_cal_data.py (Final version)
import os

# Set HF_HOME before importing `datasets`: the Hugging Face cache path is
# resolved when the library is imported, so setting it afterwards has no effect.
os.environ['HF_HOME'] = os.path.join(os.path.expanduser('~'), '.cache', 'huggingface')

from datasets import load_dataset
import pandas as pd

print("Downloading and preparing a LARGER wikitext calibration dataset...")

# Stream the train split and take the first 10,000 samples (instead of 1,000)
# to ensure enough tokens, without downloading the full dataset.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train", streaming=True).take(10000)

output_parquet_file = "wikitext_cal_data.parquet"

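# Keep only non-empty rows; raw wikitext contains many blank lines.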
data_list = []
for item in dataset:
    text = item['text'].strip()
    if text:
        data_list.append(text)

df = pd.DataFrame(data_list, columns=['text'])
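# Write to Parquet; the 'fastparquet' engine requires the fastparquet package
# to be installed (pip install fastparquet).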
df.to_parquet(output_parquet_file, engine='fastparquet')

print(f"'{output_parquet_file}' created successfully with more data.")