Spaces:
Build error
Build error
data analysis notebooks
Browse files
llm_toolkit/translation_utils.py
CHANGED
|
@@ -163,6 +163,24 @@ def load_translation_dataset(data_path, tokenizer=None):
|
|
| 163 |
return datasets
|
| 164 |
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
def get_metrics(df, max_output_tokens=2048):
|
| 167 |
metrics_df = pd.DataFrame(df.columns.T)[2:]
|
| 168 |
metrics_df.rename(columns={0: "model"}, inplace=True)
|
|
@@ -203,15 +221,15 @@ def get_metrics(df, max_output_tokens=2048):
|
|
| 203 |
)
|
| 204 |
|
| 205 |
num_entries_with_max_output_tokens.append(
|
| 206 |
-
df["output_tokens"]
|
| 207 |
)
|
| 208 |
|
| 209 |
metrics_df["meteor"] = meteor
|
| 210 |
metrics_df["bleu_1"] = bleu_1
|
| 211 |
metrics_df["rouge_l"] = rouge_l
|
| 212 |
metrics_df["ews_score"] = ews_score
|
| 213 |
-
metrics_df["repetition_score"] =
|
| 214 |
-
metrics_df["total_repetitions"] =
|
| 215 |
metrics_df["num_entries_with_max_output_tokens"] = (
|
| 216 |
num_entries_with_max_output_tokens
|
| 217 |
)
|
|
|
|
| 163 |
return datasets
|
| 164 |
|
| 165 |
|
| 166 |
+
def count_entries_with_max_tokens(entries, max_tokens):
    """
    Count the number of entries with the max output tokens or more.

    Parameters:
    entries (iterable of int): Token counts, one per entry. Any iterable
        works (list, tuple, generator, or a column such as
        df["output_tokens"]).
    max_tokens (int): The maximum token threshold.

    Returns:
    int: The number of entries with token counts greater than or equal to
        max_tokens. An empty iterable yields 0.
    """
    # Summing a generator of ones replaces the manual counter loop; the
    # comparison is inclusive so entries exactly at the limit are counted.
    return sum(1 for tokens in entries if tokens >= max_tokens)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
def get_metrics(df, max_output_tokens=2048):
|
| 185 |
metrics_df = pd.DataFrame(df.columns.T)[2:]
|
| 186 |
metrics_df.rename(columns={0: "model"}, inplace=True)
|
|
|
|
| 221 |
)
|
| 222 |
|
| 223 |
num_entries_with_max_output_tokens.append(
|
| 224 |
+
count_entries_with_max_tokens(df["output_tokens"], max_output_tokens)
|
| 225 |
)
|
| 226 |
|
| 227 |
metrics_df["meteor"] = meteor
|
| 228 |
metrics_df["bleu_1"] = bleu_1
|
| 229 |
metrics_df["rouge_l"] = rouge_l
|
| 230 |
metrics_df["ews_score"] = ews_score
|
| 231 |
+
metrics_df["repetition_score"] = repetition_score
|
| 232 |
+
metrics_df["total_repetitions"] = total_repetitions
|
| 233 |
metrics_df["num_entries_with_max_output_tokens"] = (
|
| 234 |
num_entries_with_max_output_tokens
|
| 235 |
)
|
notebooks/00_Data Analysis.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/00a_Data Analysis_greedy_decoding.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|