# Set HF_HOME before importing `datasets` so the Hugging Face cache location
# is picked up: huggingface_hub reads this variable when it is first imported.
import os

os.environ['HF_HOME'] = os.path.join(os.path.expanduser('~'), '.cache', 'huggingface')

import pandas as pd
from datasets import load_dataset

print("Downloading and preparing a LARGER wikitext calibration dataset...")

# Stream the train split and take only the first 10,000 records, avoiding a
# full download of the dataset.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train", streaming=True).take(10000)
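
# Illustrative peek (an addition, not in the original script): each streamed
# record is a dict with a 'text' field. A fresh iterator is created here, so
# this does not consume the loop below.
print("First record:", next(iter(dataset)))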

output_parquet_file = "wikitext_cal_data.parquet"

# Keep only rows with non-empty text; wikitext contains many blank lines.
data_list = []
for item in dataset:
    text = item['text'].strip()
    if text:
        data_list.append(text)

# Write the collected texts to a single-column parquet file. The fastparquet
# engine requires the `fastparquet` package; `engine='pyarrow'` also works.
df = pd.DataFrame(data_list, columns=['text'])
df.to_parquet(output_parquet_file, engine='fastparquet')

print(f"'{output_parquet_file}' created successfully with more data.")