Updated results
Signed-off-by: Jonathan Bnayahu <[email protected]>
- results/bluebench/{2025-07-02T14-58-20_evaluation_results.json → 2025-07-03T07-20-08_evaluation_results.json} +576 -576
- results/bluebench/{2025-07-02T15-15-09_evaluation_results.json → 2025-07-03T07-36-22_evaluation_results.json} +569 -569
- results/bluebench/2025-07-03T08-05-54_evaluation_results.json +1281 -0
- results/bluebench/{2025-07-02T16-08-27_evaluation_results.json → 2025-07-03T08-48-01_evaluation_results.json} +574 -574
- results/bluebench/{2025-07-02T16-23-36_evaluation_results.json → 2025-07-03T10-08-21_evaluation_results.json} +593 -593
- results/bluebench/2025-07-03T10-34-07_evaluation_results.json +1281 -0
- results/bluebench/{2025-07-02T17-33-41_evaluation_results.json → 2025-07-03T11-22-55_evaluation_results.json} +509 -509
- results/bluebench/{2025-07-02T18-37-37_evaluation_results.json → 2025-07-03T12-53-58_evaluation_results.json} +583 -583
- results/bluebench/{2025-07-02T18-57-45_evaluation_results.json → 2025-07-03T13-14-01_evaluation_results.json} +584 -584
- results/bluebench/2025-07-03T13-32-15_evaluation_results.json +1281 -0
- results/bluebench/{2025-07-02T15-54-03_evaluation_results.json → 2025-07-03T15-41-32_evaluation_results.json} +591 -591
- results/bluebench/{2025-07-02T17-12-27_evaluation_results.json → 2025-07-03T15-51-24_evaluation_results.json} +455 -455
- results/bluebench/{2025-07-02T18-12-30_evaluation_results.json → 2025-07-03T16-05-29_evaluation_results.json} +548 -548
results/bluebench/{2025-07-02T14-58-20_evaluation_results.json → 2025-07-03T07-20-08_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
 {
   "environment_info": {
-    "timestamp_utc": "2025-07-
     "command_line_invocation": [
       "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
       "--tasks",
@@ -42,7 +42,7 @@
     "cache_dir": null
   },
   "unitxt_version": "1.25.0",
-    "unitxt_commit_hash": "
     "python_version": "3.10.18",
     "system": "Linux",
     "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -176,16 +176,6 @@, @@ -195,14 +185,24 @@, @@ -236,16 +236,6 @@, @@ -255,6 +245,16 @@, @@ -266,61 +266,61 @@, @@ -338,21 +338,21 @@, @@ -386,22 +386,22 @@, @@ -436,22 +436,22 @@, @@ -465,38 +465,38 @@, @@ -510,203 +510,203 @@, @@ -722,85 +722,85 @@, @@ -808,473 +808,473 @@
(Old side of the "results" hunks: the previous per-task and subset scores. Most of the removed values are truncated in this capture; the corresponding updated values appear on the added side below.)
@@ -1,6 +1,6 @@
 {
   "environment_info": {
+    "timestamp_utc": "2025-07-03T11:20:04.599853Z",
     "command_line_invocation": [
       "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
       "--tasks",
@@ -42,7 +42,7 @@
     "cache_dir": null
   },
   "unitxt_version": "1.25.0",
+    "unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
     "python_version": "3.10.18",
     "system": "Linux",
     "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
| 176 |
"results": {
|
| 177 |
"bias": {
|
| 178 |
"safety_bbq_age": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
"accuracy": 0.5555555555555556,
|
| 180 |
"accuracy_ci_low": 0.2222222222222222,
|
| 181 |
"accuracy_ci_high": 0.8888888888888888,
|
|
|
|
| 185 |
"score_ci_low": 0.2222222222222222,
|
| 186 |
"num_of_instances": 9
|
| 187 |
},
|
| 188 |
+
"safety_bbq_disability_status": {
|
| 189 |
+
"accuracy": 0.3333333333333333,
|
| 190 |
+
"accuracy_ci_low": 0.1111111111111111,
|
| 191 |
+
"accuracy_ci_high": 0.6666666666666666,
|
| 192 |
+
"score_name": "accuracy",
|
| 193 |
+
"score": 0.3333333333333333,
|
| 194 |
+
"score_ci_high": 0.6666666666666666,
|
| 195 |
+
"score_ci_low": 0.1111111111111111,
|
| 196 |
+
"num_of_instances": 9
|
| 197 |
+
},
|
| 198 |
"safety_bbq_gender_identity": {
|
| 199 |
+
"accuracy": 0.8888888888888888,
|
| 200 |
+
"accuracy_ci_low": 0.5555555555555556,
|
| 201 |
"accuracy_ci_high": 1.0,
|
| 202 |
"score_name": "accuracy",
|
| 203 |
+
"score": 0.8888888888888888,
|
| 204 |
"score_ci_high": 1.0,
|
| 205 |
+
"score_ci_low": 0.5555555555555556,
|
| 206 |
"num_of_instances": 9
|
| 207 |
},
|
| 208 |
"safety_bbq_nationality": {
|
|
|
|
| 236 |
"num_of_instances": 9
|
| 237 |
},
|
| 238 |
"safety_bbq_race_x_gender": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
"accuracy": 0.6666666666666666,
|
| 240 |
"accuracy_ci_low": 0.3333333333333333,
|
| 241 |
"accuracy_ci_high": 0.8888888888888888,
|
|
|
|
| 245 |
"score_ci_low": 0.3333333333333333,
|
| 246 |
"num_of_instances": 9
|
| 247 |
},
|
| 248 |
+
"safety_bbq_race_x_ses": {
|
| 249 |
+
"accuracy": 0.4444444444444444,
|
| 250 |
+
"accuracy_ci_low": 0.1111111111111111,
|
| 251 |
+
"accuracy_ci_high": 0.7777777777777778,
|
| 252 |
+
"score_name": "accuracy",
|
| 253 |
+
"score": 0.4444444444444444,
|
| 254 |
+
"score_ci_high": 0.7777777777777778,
|
| 255 |
+
"score_ci_low": 0.1111111111111111,
|
| 256 |
+
"num_of_instances": 9
|
| 257 |
+
},
|
| 258 |
"safety_bbq_religion": {
|
| 259 |
"accuracy": 0.4444444444444444,
|
| 260 |
"accuracy_ci_low": 0.1111111111111111,
|
|
|
|
| 266 |
"num_of_instances": 9
|
| 267 |
},
|
| 268 |
"safety_bbq_ses": {
|
| 269 |
+
"accuracy": 0.3333333333333333,
|
| 270 |
+
"accuracy_ci_low": 0.1111111111111111,
|
| 271 |
+
"accuracy_ci_high": 0.6666666666666666,
|
| 272 |
"score_name": "accuracy",
|
| 273 |
+
"score": 0.3333333333333333,
|
| 274 |
+
"score_ci_high": 0.6666666666666666,
|
| 275 |
+
"score_ci_low": 0.1111111111111111,
|
| 276 |
"num_of_instances": 9
|
| 277 |
},
|
| 278 |
"safety_bbq_sexual_orientation": {
|
| 279 |
"accuracy": 0.2222222222222222,
|
| 280 |
"accuracy_ci_low": 0.0,
|
| 281 |
+
"accuracy_ci_high": 0.6666666666666666,
|
| 282 |
"score_name": "accuracy",
|
| 283 |
"score": 0.2222222222222222,
|
| 284 |
+
"score_ci_high": 0.6666666666666666,
|
| 285 |
"score_ci_low": 0.0,
|
| 286 |
"num_of_instances": 9
|
| 287 |
},
|
| 288 |
+
"score": 0.46464646464646464,
|
| 289 |
"score_name": "subsets_mean",
|
| 290 |
"num_of_instances": 99
|
| 291 |
},
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.30994152046783624,
|
| 296 |
+
"score": 0.30994152046783624,
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
+
"score": 0.30994152046783624,
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
+
"f1_Person": 0.4571428571428571,
|
| 307 |
+
"f1_Organization": 0.24561403508771928,
|
| 308 |
+
"f1_Location": 0.2727272727272727,
|
| 309 |
+
"f1_macro": 0.32516138831928304,
|
| 310 |
+
"recall_macro": 0.2826086956521739,
|
| 311 |
+
"precision_macro": 0.40268199233716473,
|
| 312 |
+
"in_classes_support": 0.7349397590361446,
|
| 313 |
+
"f1_micro": 0.26582278481012656,
|
| 314 |
+
"recall_micro": 0.28,
|
| 315 |
+
"precision_micro": 0.25301204819277107,
|
| 316 |
+
"score": 0.26582278481012656,
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
+
"score_ci_low": 0.1803099177588422,
|
| 319 |
+
"score_ci_high": 0.37307865079917296,
|
| 320 |
+
"f1_micro_ci_low": 0.1803099177588422,
|
| 321 |
+
"f1_micro_ci_high": 0.37307865079917296
|
| 322 |
},
|
| 323 |
+
"score": 0.26582278481012656,
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
|
|
|
| 338 |
"mmlu_pro_business": {
|
| 339 |
"accuracy": 0.14285714285714285,
|
| 340 |
"accuracy_ci_low": 0.0,
|
| 341 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 342 |
"score_name": "accuracy",
|
| 343 |
"score": 0.14285714285714285,
|
| 344 |
+
"score_ci_high": 0.5714285714285714,
|
| 345 |
"score_ci_low": 0.0,
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
| 349 |
+
"accuracy": 0.2857142857142857,
|
| 350 |
+
"accuracy_ci_low": 0.0,
|
| 351 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 352 |
"score_name": "accuracy",
|
| 353 |
+
"score": 0.2857142857142857,
|
| 354 |
+
"score_ci_high": 0.7142857142857143,
|
| 355 |
+
"score_ci_low": 0.0,
|
| 356 |
"num_of_instances": 7
|
| 357 |
},
|
| 358 |
"mmlu_pro_computer_science": {
|
|
|
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
| 389 |
+
"accuracy": 0.14285714285714285,
|
| 390 |
"accuracy_ci_low": 0.0,
|
| 391 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 392 |
"score_name": "accuracy",
|
| 393 |
+
"score": 0.14285714285714285,
|
| 394 |
+
"score_ci_high": 0.5714285714285714,
|
| 395 |
"score_ci_low": 0.0,
|
| 396 |
"num_of_instances": 7
|
| 397 |
},
|
| 398 |
"mmlu_pro_history": {
|
| 399 |
+
"accuracy": 0.2857142857142857,
|
| 400 |
"accuracy_ci_low": 0.0,
|
| 401 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 402 |
"score_name": "accuracy",
|
| 403 |
+
"score": 0.2857142857142857,
|
| 404 |
+
"score_ci_high": 0.7142857142857143,
|
| 405 |
"score_ci_low": 0.0,
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
|
|
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
| 438 |
"mmlu_pro_philosophy": {
|
| 439 |
+
"accuracy": 0.42857142857142855,
|
| 440 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 441 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 442 |
"score_name": "accuracy",
|
| 443 |
+
"score": 0.42857142857142855,
|
| 444 |
+
"score_ci_high": 0.8571428571428571,
|
| 445 |
+
"score_ci_low": 0.14285714285714285,
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
| 449 |
+
"accuracy": 0.2857142857142857,
|
| 450 |
"accuracy_ci_low": 0.0,
|
| 451 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 452 |
"score_name": "accuracy",
|
| 453 |
+
"score": 0.2857142857142857,
|
| 454 |
+
"score_ci_high": 0.7142857142857143,
|
| 455 |
"score_ci_low": 0.0,
|
| 456 |
"num_of_instances": 7
|
| 457 |
},
|
|
|
|
| 465 |
"score_ci_low": 0.0,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
+
"score": 0.29591836734693877,
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
+
"f1_macro": 0.19047619047619047,
|
| 475 |
"f1_suggestive": 0.0,
|
| 476 |
+
"f1_fanciful": 0.2857142857142857,
|
| 477 |
"f1_generic": 0.0,
|
| 478 |
+
"f1_arbitrary": 0.0,
|
| 479 |
+
"f1_descriptive": 0.6666666666666666,
|
| 480 |
+
"f1_macro_ci_low": 0.030903392316786366,
|
| 481 |
+
"f1_macro_ci_high": 0.3142857142857143,
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
+
"score": 0.21052631578947367,
|
| 484 |
+
"score_ci_high": 0.42424242424242425,
|
| 485 |
+
"score_ci_low": 0.05263157894736842,
|
| 486 |
"num_of_instances": 20,
|
| 487 |
+
"accuracy": 0.2,
|
| 488 |
+
"accuracy_ci_low": 0.05,
|
| 489 |
+
"accuracy_ci_high": 0.4,
|
| 490 |
+
"f1_micro": 0.21052631578947367,
|
| 491 |
+
"f1_micro_ci_low": 0.05263157894736842,
|
| 492 |
+
"f1_micro_ci_high": 0.42424242424242425
|
| 493 |
},
|
| 494 |
"legalbench_corporate_lobbying": {
|
| 495 |
+
"f1_macro": 0.561128526645768,
|
| 496 |
+
"f1_no": 0.7586206896551724,
|
| 497 |
+
"f1_yes": 0.36363636363636365,
|
| 498 |
+
"f1_macro_ci_low": 0.3732193732193732,
|
| 499 |
+
"f1_macro_ci_high": 0.8986345790442171,
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
"score": 0.65,
|
| 502 |
"score_ci_high": 0.85,
|
|
|
|
| 510 |
"f1_micro_ci_high": 0.85
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
+
"f1_macro": 0.12380952380952381,
|
| 514 |
+
"f1_conclusion": 0.0,
|
| 515 |
+
"f1_issue": 0.5333333333333333,
|
| 516 |
+
"f1_decree": 0.3333333333333333,
|
| 517 |
"f1_analysis": 0.0,
|
|
|
|
|
|
|
| 518 |
"f1_facts": 0.0,
|
| 519 |
+
"f1_procedural history": 0.0,
|
| 520 |
+
"f1_rule": 0.0,
|
| 521 |
+
"f1_macro_ci_low": 0.04476661678289675,
|
| 522 |
+
"f1_macro_ci_high": 0.2683311877971836,
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
+
"score": 0.2631578947368421,
|
| 525 |
+
"score_ci_high": 0.5,
|
| 526 |
+
"score_ci_low": 0.10256410256410256,
|
| 527 |
"num_of_instances": 20,
|
| 528 |
+
"accuracy": 0.25,
|
| 529 |
+
"accuracy_ci_low": 0.1,
|
| 530 |
+
"accuracy_ci_high": 0.5,
|
| 531 |
+
"f1_micro": 0.2631578947368421,
|
| 532 |
+
"f1_micro_ci_low": 0.10256410256410256,
|
| 533 |
+
"f1_micro_ci_high": 0.5
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
+
"f1_macro": 0.5604395604395604,
|
| 537 |
+
"f1_yes": 0.6923076923076923,
|
| 538 |
+
"f1_no": 0.42857142857142855,
|
| 539 |
+
"f1_macro_ci_low": 0.34065934065934067,
|
| 540 |
+
"f1_macro_ci_high": 0.7916666666666667,
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
+
"score": 0.6,
|
| 543 |
+
"score_ci_high": 0.8,
|
| 544 |
+
"score_ci_low": 0.35,
|
| 545 |
"num_of_instances": 20,
|
| 546 |
+
"accuracy": 0.6,
|
| 547 |
+
"accuracy_ci_low": 0.35,
|
| 548 |
+
"accuracy_ci_high": 0.8,
|
| 549 |
+
"f1_micro": 0.6,
|
| 550 |
+
"f1_micro_ci_low": 0.35,
|
| 551 |
+
"f1_micro_ci_high": 0.8
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
+
"f1_macro": 0.7333333333333334,
|
| 555 |
+
"f1_yes": 0.6666666666666666,
|
| 556 |
+
"f1_no": 0.8,
|
| 557 |
+
"f1_macro_ci_low": 0.500669556931299,
|
| 558 |
+
"f1_macro_ci_high": 0.9085714285714286,
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
+
"score": 0.7368421052631579,
|
| 561 |
+
"score_ci_high": 0.9,
|
| 562 |
+
"score_ci_low": 0.5,
|
| 563 |
"num_of_instances": 20,
|
| 564 |
+
"accuracy": 0.7,
|
| 565 |
+
"accuracy_ci_low": 0.45,
|
| 566 |
+
"accuracy_ci_high": 0.9,
|
| 567 |
+
"f1_micro": 0.7368421052631579,
|
| 568 |
+
"f1_micro_ci_low": 0.5,
|
| 569 |
+
"f1_micro_ci_high": 0.9
|
| 570 |
},
|
| 571 |
+
"score": 0.4921052631578947,
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
+
"f1_macro": 0.3119817927170868,
|
| 578 |
+
"f1_cars": 0.8888888888888888,
|
| 579 |
"f1_windows x": 0.0,
|
| 580 |
"f1_atheism": 0.0,
|
| 581 |
+
"f1_christianity": 0.0,
|
| 582 |
"f1_religion": 0.0,
|
| 583 |
+
"f1_medicine": 0.4,
|
| 584 |
+
"f1_computer graphics": 0.16666666666666666,
|
| 585 |
+
"f1_pc hardware": 0.47058823529411764,
|
| 586 |
+
"f1_cryptography": 0.6,
|
| 587 |
+
"f1_microsoft windows": 0.0,
|
| 588 |
"f1_middle east": 0.0,
|
| 589 |
+
"f1_politics": 0.4,
|
| 590 |
+
"f1_motorcycles": 0.4444444444444444,
|
| 591 |
+
"f1_baseball": 0.5,
|
| 592 |
+
"f1_mac hardware": 0.3333333333333333,
|
|
|
|
| 593 |
"f1_for sale": 0.3333333333333333,
|
| 594 |
"f1_guns": 0.0,
|
| 595 |
+
"f1_space": 0.2857142857142857,
|
| 596 |
+
"f1_electronics": 0.6666666666666666,
|
| 597 |
+
"f1_hockey": 0.75,
|
| 598 |
+
"f1_macro_ci_low": 0.25044781739741556,
|
| 599 |
+
"f1_macro_ci_high": 0.4104329768429988,
|
|
|
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
+
"score": 0.35802469135802467,
|
| 602 |
+
"score_ci_high": 0.4639265152043833,
|
| 603 |
+
"score_ci_low": 0.25928805281404416,
|
| 604 |
"num_of_instances": 100,
|
| 605 |
+
"accuracy": 0.29,
|
| 606 |
+
"accuracy_ci_low": 0.2,
|
| 607 |
+
"accuracy_ci_high": 0.3830447129752326,
|
| 608 |
+
"f1_micro": 0.35802469135802467,
|
| 609 |
+
"f1_micro_ci_low": 0.25928805281404416,
|
| 610 |
+
"f1_micro_ci_high": 0.4639265152043833
|
| 611 |
},
|
| 612 |
+
"score": 0.35802469135802467,
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
+
"f1_macro": 0.6884203769587696,
|
| 619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.8854961832061069,
|
| 620 |
+
"f1_debt collection": 0.7058823529411765,
|
| 621 |
+
"f1_money transfer or virtual currency or money service": 0.5,
|
| 622 |
"f1_mortgage": 0.6666666666666666,
|
| 623 |
+
"f1_credit card or prepaid card": 0.625,
|
|
|
|
| 624 |
"f1_checking or savings account": 0.7692307692307693,
|
| 625 |
+
"f1_payday loan or title loan or personal loan": 0.6666666666666666,
|
| 626 |
+
"f1_macro_ci_low": 0.5418497377329817,
|
| 627 |
+
"f1_macro_ci_high": 0.8476562978403089,
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
+
"score": 0.8235294117647058,
|
| 630 |
+
"score_ci_high": 0.8864134451087434,
|
| 631 |
+
"score_ci_low": 0.7374301675977654,
|
| 632 |
"num_of_instances": 100,
|
| 633 |
+
"accuracy": 0.77,
|
| 634 |
"accuracy_ci_low": 0.67,
|
| 635 |
"accuracy_ci_high": 0.85,
|
| 636 |
+
"f1_micro": 0.8235294117647058,
|
| 637 |
+
"f1_micro_ci_low": 0.7374301675977654,
|
| 638 |
+
"f1_micro_ci_high": 0.8864134451087434
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
+
"f1_macro": 0.5722222222222222,
|
| 642 |
+
"f1_mortgages and loans": 0.5833333333333334,
|
| 643 |
+
"f1_credit card": 0.5,
|
| 644 |
+
"f1_debt collection": 0.6666666666666666,
|
| 645 |
+
"f1_credit reporting": 0.6666666666666666,
|
| 646 |
+
"f1_retail banking": 0.4444444444444444,
|
| 647 |
+
"f1_macro_ci_low": 0.4373897842362741,
|
| 648 |
+
"f1_macro_ci_high": 0.730682215653928,
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
+
"score": 0.5909090909090909,
|
| 651 |
+
"score_ci_high": 0.723404255319149,
|
| 652 |
+
"score_ci_low": 0.449438202247191,
|
| 653 |
"num_of_instances": 50,
|
| 654 |
+
"accuracy": 0.52,
|
| 655 |
+
"accuracy_ci_low": 0.38,
|
| 656 |
+
"accuracy_ci_high": 0.66,
|
| 657 |
+
"f1_micro": 0.5909090909090909,
|
| 658 |
+
"f1_micro_ci_low": 0.449438202247191,
|
| 659 |
+
"f1_micro_ci_high": 0.723404255319149
|
| 660 |
},
|
| 661 |
+
"score": 0.7072192513368984,
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
|
|
|
|
|
|
|
|
|
| 668 |
"execution_accuracy": 0.11,
|
| 669 |
+
"program_accuracy": 0.13,
|
| 670 |
+
"score": 0.13,
|
| 671 |
+
"score_name": "program_accuracy",
|
| 672 |
+
"execution_accuracy_ci_low": 0.06,
|
| 673 |
+
"execution_accuracy_ci_high": 0.18,
|
| 674 |
"program_accuracy_ci_low": 0.07,
|
| 675 |
+
"program_accuracy_ci_high": 0.2,
|
| 676 |
"score_ci_low": 0.07,
|
| 677 |
+
"score_ci_high": 0.2
|
|
|
|
|
|
|
| 678 |
},
|
| 679 |
+
"score": 0.13,
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
+
"precision": 0.4404755567778122,
|
| 686 |
+
"recall": 0.5911143426569531,
|
| 687 |
+
"f1": 0.46343593017736845,
|
| 688 |
+
"precision_ci_low": 0.40942454427118513,
|
| 689 |
+
"precision_ci_high": 0.4756683543235914,
|
| 690 |
+
"recall_ci_low": 0.5490257069385951,
|
| 691 |
+
"recall_ci_high": 0.6315097910079295,
|
| 692 |
+
"f1_ci_low": 0.43529875211270735,
|
| 693 |
+
"f1_ci_high": 0.48995060900564,
|
| 694 |
"score_name": "f1",
|
| 695 |
+
"score": 0.46343593017736845,
|
| 696 |
+
"score_ci_high": 0.48995060900564,
|
| 697 |
+
"score_ci_low": 0.43529875211270735,
|
| 698 |
"num_of_instances": 100,
|
| 699 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6859477424621582,
|
| 700 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.7173790216445923,
|
| 701 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.667030681669712,
|
| 702 |
+
"faithfullness_f1_token_overlap": 0.42049921925026323,
|
| 703 |
+
"faithfullness_recall_token_overlap": 0.32963765058230776,
|
| 704 |
+
"faithfullness_precision_token_overlap": 0.7030138840298064,
|
| 705 |
+
"correctness_f1_token_overlap": 0.46343593017736845,
|
| 706 |
+
"correctness_recall_token_overlap": 0.5911143426569531,
|
| 707 |
+
"correctness_precision_token_overlap": 0.4404755567778122
|
| 708 |
},
|
| 709 |
+
"score": 0.46343593017736845,
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
|
|
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
+
"accuracy": 0.76,
|
| 726 |
+
"accuracy_ci_low": 0.67,
|
| 727 |
+
"accuracy_ci_high": 0.8367263963619302,
|
| 728 |
"score_name": "accuracy",
|
| 729 |
+
"score": 0.76,
|
| 730 |
+
"score_ci_high": 0.8367263963619302,
|
| 731 |
+
"score_ci_low": 0.67,
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
+
"score": 0.585,
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
| 738 |
"safety": {
|
| 739 |
"attaq_500": {
|
| 740 |
+
"severity_critical": 1.0,
|
| 741 |
+
"severity_high": 1.0,
|
| 742 |
+
"severity_medium": 0.0,
|
| 743 |
+
"severity_low": 98.0,
|
| 744 |
+
"category_harmful_info": 0.8527039374550066,
|
| 745 |
+
"category_deception": 0.8602375103668735,
|
| 746 |
+
"category_explicit_content": 0.803066421337802,
|
| 747 |
+
"category_violence": 0.8417096956973038,
|
| 748 |
+
"category_discrimination": 0.8946837603099762,
|
| 749 |
+
"category_substance_abuse": 0.9066174957487318,
|
| 750 |
+
"category_pii": 0.8156786152610073,
|
| 751 |
+
"safety": 0.8487480249359376,
|
| 752 |
+
"safety_ci_low": 0.8219473814824015,
|
| 753 |
+
"safety_ci_high": 0.8711516865247095,
|
| 754 |
"score_name": "safety",
|
| 755 |
+
"score": 0.8487480249359376,
|
| 756 |
+
"score_ci_high": 0.8711516865247095,
|
| 757 |
+
"score_ci_low": 0.8219473814824015,
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
+
"score": 0.8487480249359376,
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
+
"rougeL": 0.2727278562571478,
|
| 768 |
+
"score": 0.2727278562571478,
|
|
|
|
| 769 |
"score_name": "rougeL",
|
| 770 |
+
"rougeLsum": 0.3519467090096163,
|
| 771 |
+
"rouge2": 0.18296572636942984,
|
| 772 |
+
"rouge1": 0.4144140933241796,
|
| 773 |
+
"rougeL_ci_low": 0.2572827822057084,
|
| 774 |
+
"rougeL_ci_high": 0.2883182453977421,
|
| 775 |
+
"score_ci_low": 0.2572827822057084,
|
| 776 |
+
"score_ci_high": 0.2883182453977421,
|
| 777 |
+
"rougeLsum_ci_low": 0.33038241359811793,
|
| 778 |
+
"rougeLsum_ci_high": 0.36903407656270554,
|
| 779 |
+
"rouge2_ci_low": 0.17027692857079424,
|
| 780 |
+
"rouge2_ci_high": 0.195049523425454,
|
| 781 |
+
"rouge1_ci_low": 0.3899436362671578,
|
| 782 |
+
"rouge1_ci_high": 0.43263656274209517
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
+
"rougeL": 0.07189383629877291,
|
| 787 |
+
"score": 0.07189383629877291,
|
|
|
|
| 788 |
"score_name": "rougeL",
|
| 789 |
+
"rougeLsum": 0.07911875003330153,
|
| 790 |
+
"rouge2": 0.010292492958310924,
|
| 791 |
+
"rouge1": 0.09538304929678695,
|
| 792 |
+
"rougeL_ci_low": 0.062145811602521625,
|
| 793 |
+
"rougeL_ci_high": 0.0812364852591871,
|
| 794 |
+
"score_ci_low": 0.062145811602521625,
|
| 795 |
+
"score_ci_high": 0.0812364852591871,
|
| 796 |
+
"rougeLsum_ci_low": 0.06822735527434497,
|
| 797 |
+
"rougeLsum_ci_high": 0.08982335497305671,
|
| 798 |
+
"rouge2_ci_low": 0.006919922257555467,
|
| 799 |
+
"rouge2_ci_high": 0.014744400890407348,
|
| 800 |
+
"rouge1_ci_low": 0.08225253893978755,
|
| 801 |
+
"rouge1_ci_high": 0.10869171536999694
|
| 802 |
},
|
| 803 |
+
"score": 0.17231084627796034,
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
+
119,
|
| 812 |
+
62,
|
| 813 |
+
35,
|
| 814 |
+
21
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
+
230,
|
| 818 |
+
224,
|
| 819 |
+
218,
|
| 820 |
+
212
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
+
0.5173913043478261,
|
| 824 |
+
0.27678571428571425,
|
| 825 |
+
0.1605504587155963,
|
| 826 |
+
0.0990566037735849
|
| 827 |
],
|
| 828 |
+
"bp": 1.0,
|
| 829 |
+
"sys_len": 230,
|
| 830 |
"ref_len": 208,
|
| 831 |
+
"sacrebleu": 0.21845623538755882,
|
| 832 |
+
"score": 0.21845623538755882,
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
+
"score_ci_low": 0.09083341323391328,
|
| 835 |
+
"score_ci_high": 0.3838181728128643,
|
| 836 |
+
"sacrebleu_ci_low": 0.09083341323391328,
|
| 837 |
+
"sacrebleu_ci_high": 0.3838181728128643
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
+
118,
|
| 843 |
+
61,
|
| 844 |
+
34,
|
| 845 |
+
22
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
+
209,
|
| 849 |
+
203,
|
| 850 |
+
197,
|
| 851 |
+
191
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
+
0.5645933014354066,
|
| 855 |
+
0.30049261083743845,
|
| 856 |
+
0.17258883248730963,
|
| 857 |
+
0.11518324607329843
|
| 858 |
],
|
| 859 |
"bp": 1.0,
|
| 860 |
+
"sys_len": 209,
|
| 861 |
"ref_len": 208,
|
| 862 |
+
"sacrebleu": 0.2409865923854554,
|
| 863 |
+
"score": 0.2409865923854554,
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
+
"score_ci_low": 0.14821008407472766,
|
| 866 |
+
"score_ci_high": 0.3454690019726006,
|
| 867 |
+
"sacrebleu_ci_low": 0.14821008407472766,
|
| 868 |
+
"sacrebleu_ci_high": 0.3454690019726006
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
+
81,
|
| 874 |
+
26,
|
| 875 |
11,
|
| 876 |
6
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
+
215,
|
| 880 |
+
209,
|
| 881 |
+
203,
|
| 882 |
+
197
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
+
0.37674418604651166,
|
| 886 |
+
0.12440191387559808,
|
| 887 |
+
0.054187192118226604,
|
| 888 |
+
0.03045685279187817
|
| 889 |
],
|
| 890 |
+
"bp": 1.0,
|
| 891 |
+
"sys_len": 215,
|
| 892 |
"ref_len": 209,
|
| 893 |
+
"sacrebleu": 0.09378077622334098,
|
| 894 |
+
"score": 0.09378077622334098,
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
+
"score_ci_low": 0.023731214333943484,
|
| 897 |
+
"score_ci_high": 0.15084948419488436,
|
| 898 |
+
"sacrebleu_ci_low": 0.023731214333943484,
|
| 899 |
+
"sacrebleu_ci_high": 0.15084948419488436
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
+
121,
|
| 905 |
+
63,
|
| 906 |
+
34,
|
| 907 |
+
18
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
+
211,
|
| 911 |
+
205,
|
| 912 |
+
199,
|
| 913 |
+
193
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
+
0.5734597156398105,
|
| 917 |
+
0.3073170731707317,
|
| 918 |
+
0.1708542713567839,
|
| 919 |
+
0.09326424870466321
|
| 920 |
],
|
| 921 |
+
"bp": 0.9765818792478103,
|
| 922 |
+
"sys_len": 211,
|
| 923 |
"ref_len": 216,
|
| 924 |
+
"sacrebleu": 0.22481036027314003,
|
| 925 |
+
"score": 0.22481036027314003,
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
+
"score_ci_low": 0.143685448931687,
|
| 928 |
+
"score_ci_high": 0.3383200604081932,
|
| 929 |
+
"sacrebleu_ci_low": 0.143685448931687,
|
| 930 |
+
"sacrebleu_ci_high": 0.3383200604081932
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
+
167,
|
| 936 |
+
112,
|
| 937 |
+
77,
|
| 938 |
+
55
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
+
238,
|
| 942 |
+
232,
|
| 943 |
+
226,
|
| 944 |
+
220
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
+
0.7016806722689075,
|
| 948 |
+
0.48275862068965514,
|
| 949 |
+
0.34070796460176994,
|
| 950 |
+
0.25
|
| 951 |
],
|
| 952 |
"bp": 1.0,
|
| 953 |
+
"sys_len": 238,
|
| 954 |
"ref_len": 235,
|
| 955 |
+
"sacrebleu": 0.41214303191378043,
|
| 956 |
+
"score": 0.41214303191378043,
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
+
"score_ci_low": 0.3395229044712486,
|
| 959 |
+
"score_ci_high": 0.4807451635871571,
|
| 960 |
+
"sacrebleu_ci_low": 0.3395229044712486,
|
| 961 |
+
"sacrebleu_ci_high": 0.4807451635871571
|
},
"mt_flores_101_eng_kor": {
"num_of_instances": 6,
"counts": [
+ 111,
+ 40,
+ 21,
+ 12
],
"totals": [
+ 346,
+ 340,
+ 334,
+ 328
],
"precisions": [
+ 0.32080924855491333,
+ 0.11764705882352942,
+ 0.06287425149700598,
+ 0.03658536585365854
],
"bp": 1.0,
+ "sys_len": 346,
"ref_len": 249,
+ "sacrebleu": 0.09652771953936146,
+ "score": 0.09652771953936146,
"score_name": "sacrebleu",
+ "score_ci_low": 0.039423637442761,
+ "score_ci_high": 0.16445838229165144,
+ "sacrebleu_ci_low": 0.039423637442761,
+ "sacrebleu_ci_high": 0.16445838229165144
},
"mt_flores_101_eng_por": {
"num_of_instances": 6,
"counts": [
149,
+ 99,
66,
46
],
"totals": [
+ 229,
+ 223,
+ 217,
+ 211
],
"precisions": [
+ 0.6506550218340611,
+ 0.4439461883408072,
+ 0.30414746543778803,
+ 0.21800947867298578
],
"bp": 1.0,
+ "sys_len": 229,
"ref_len": 222,
+ "sacrebleu": 0.37201476231316355,
+ "score": 0.37201476231316355,
"score_name": "sacrebleu",
+ "score_ci_low": 0.2874102896870317,
+ "score_ci_high": 0.4690981288042721,
+ "sacrebleu_ci_low": 0.2874102896870317,
+ "sacrebleu_ci_high": 0.4690981288042721
},
"mt_flores_101_eng_ron": {
"num_of_instances": 6,
"counts": [
+ 103,
+ 41,
+ 28,
+ 20
],
"totals": [
+ 271,
+ 265,
+ 259,
+ 253
],
"precisions": [
+ 0.3800738007380074,
+ 0.15471698113207547,
+ 0.1081081081081081,
+ 0.07905138339920949
],
"bp": 1.0,
+ "sys_len": 271,
"ref_len": 230,
+ "sacrebleu": 0.14972468725087626,
+ "score": 0.14972468725087626,
"score_name": "sacrebleu",
+ "score_ci_low": 0.09950692459247845,
+ "score_ci_high": 0.2774830219069996,
+ "sacrebleu_ci_low": 0.09950692459247845,
+ "sacrebleu_ci_high": 0.2774830219069996
},
"mt_flores_101_eng_spa": {
"num_of_instances": 6,
"counts": [
+ 146,
+ 82,
+ 50,
+ 34
],
"totals": [
+ 232,
+ 226,
+ 220,
+ 214
],
"precisions": [
+ 0.6293103448275862,
+ 0.36283185840707965,
+ 0.22727272727272727,
+ 0.15887850467289721
],
+ "bp": 0.9536926844755759,
+ "sys_len": 232,
"ref_len": 243,
+ "sacrebleu": 0.2873784110095771,
+ "score": 0.2873784110095771,
"score_name": "sacrebleu",
+ "score_ci_low": 0.215659644222719,
+ "score_ci_high": 0.33135108326345925,
+ "sacrebleu_ci_low": 0.215659644222719,
+ "sacrebleu_ci_high": 0.33135108326345925
},
"mt_flores_101_fra_eng": {
"num_of_instances": 6,
"counts": [
+ 137,
+ 84,
+ 56,
+ 38
],
"totals": [
+ 220,
+ 214,
+ 208,
+ 202
],
"precisions": [
+ 0.6227272727272727,
+ 0.3925233644859813,
+ 0.2692307692307692,
+ 0.18811881188118812
],
"bp": 1.0,
+ "sys_len": 220,
"ref_len": 208,
+ "sacrebleu": 0.33356469620008616,
+ "score": 0.33356469620008616,
"score_name": "sacrebleu",
+ "score_ci_low": 0.24321947226818338,
+ "score_ci_high": 0.448027949444875,
+ "sacrebleu_ci_low": 0.24321947226818338,
+ "sacrebleu_ci_high": 0.448027949444875
},
"mt_flores_101_jpn_eng": {
"num_of_instances": 6,
"counts": [
+ 114,
+ 53,
31,
+ 20
],
"totals": [
+ 223,
+ 217,
+ 211,
+ 205
],
"precisions": [
+ 0.5112107623318386,
+ 0.24423963133640553,
+ 0.14691943127962084,
+ 0.0975609756097561
],
"bp": 1.0,
+ "sys_len": 223,
"ref_len": 208,
+ "sacrebleu": 0.20568038392617954,
+ "score": 0.20568038392617954,
"score_name": "sacrebleu",
+ "score_ci_low": 0.10798401541766527,
+ "score_ci_high": 0.27947103589520705,
+ "sacrebleu_ci_low": 0.10798401541766527,
+ "sacrebleu_ci_high": 0.27947103589520705
},
"mt_flores_101_kor_eng": {
"num_of_instances": 6,
"counts": [
+ 96,
+ 35,
+ 12,
+ 4
],
"totals": [
+ 206,
+ 200,
+ 194,
+ 188
],
"precisions": [
+ 0.46601941747572817,
+ 0.175,
+ 0.061855670103092786,
+ 0.02127659574468085
],
+ "bp": 0.9903382397772544,
+ "sys_len": 206,
"ref_len": 208,
+ "sacrebleu": 0.1008009159878086,
+ "score": 0.1008009159878086,
"score_name": "sacrebleu",
+ "score_ci_low": 0.04782093685517172,
+ "score_ci_high": 0.15317427576781073,
+ "sacrebleu_ci_low": 0.04782093685517172,
+ "sacrebleu_ci_high": 0.15317427576781073
},
"mt_flores_101_por_eng": {
"num_of_instances": 6,
"counts": [
+ 126,
+ 72,
+ 41,
+ 27
],
"totals": [
+ 219,
+ 213,
+ 207,
+ 201
],
"precisions": [
+ 0.5753424657534246,
+ 0.3380281690140845,
+ 0.19806763285024154,
+ 0.13432835820895522
],
"bp": 1.0,
+ "sys_len": 219,
"ref_len": 208,
+ "sacrebleu": 0.2682039287808841,
+ "score": 0.2682039287808841,
"score_name": "sacrebleu",
+ "score_ci_low": 0.16947811739242713,
+ "score_ci_high": 0.39632641564304394,
+ "sacrebleu_ci_low": 0.16947811739242713,
+ "sacrebleu_ci_high": 0.39632641564304394
},
"mt_flores_101_ron_eng": {
"num_of_instances": 6,
"counts": [
+ 131,
+ 75,
+ 50,
+ 34
],
"totals": [
+ 235,
229,
223,
+ 217
],
"precisions": [
+ 0.5574468085106382,
+ 0.32751091703056767,
+ 0.22421524663677128,
+ 0.1566820276497696
],
"bp": 1.0,
+ "sys_len": 235,
"ref_len": 208,
+ "sacrebleu": 0.28299475389639145,
+ "score": 0.28299475389639145,
"score_name": "sacrebleu",
+ "score_ci_low": 0.16939445196325967,
+ "score_ci_high": 0.37514174425200963,
+ "sacrebleu_ci_low": 0.16939445196325967,
+ "sacrebleu_ci_high": 0.37514174425200963
},
"mt_flores_101_spa_eng": {
"num_of_instances": 6,
"counts": [
+ 127,
+ 68,
+ 36,
23
],
"totals": [
+ 223,
+ 217,
+ 211,
+ 205
],
"precisions": [
+ 0.5695067264573991,
+ 0.3133640552995392,
+ 0.17061611374407584,
+ 0.1121951219512195
],
"bp": 1.0,
+ "sys_len": 223,
"ref_len": 208,
+ "sacrebleu": 0.24176059414017798,
+ "score": 0.24176059414017798,
"score_name": "sacrebleu",
+ "score_ci_low": 0.1347717277253673,
+ "score_ci_high": 0.31632416882417563,
+ "sacrebleu_ci_low": 0.1347717277253673,
+ "sacrebleu_ci_high": 0.31632416882417563
},
+ "score": 0.2352551899485188,
"score_name": "subsets_mean",
"num_of_instances": 90
},
+ "score": 0.4098791026510746,
"score_name": "subsets_mean",
"num_of_instances": 1537
}
results/bluebench/{2025-07-02T15-15-09_evaluation_results.json → 2025-07-03T07-36-22_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
{
"environment_info": {
+ "timestamp_utc": "2025-07-03T11:36:19.083305Z",
"command_line_invocation": [
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
"--tasks",
@@ -42,7 +42,7 @@
"cache_dir": null
},
"unitxt_version": "1.25.0",
+ "unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
"python_version": "3.10.18",
"system": "Linux",
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -176,23 +176,23 @@
"results": {
"bias": {
"safety_bbq_age": {
+ "accuracy": 0.5555555555555556,
+ "accuracy_ci_low": 0.2222222222222222,
+ "accuracy_ci_high": 0.8888888888888888,
"score_name": "accuracy",
+ "score": 0.5555555555555556,
+ "score_ci_high": 0.8888888888888888,
+ "score_ci_low": 0.2222222222222222,
"num_of_instances": 9
},
"safety_bbq_disability_status": {
+ "accuracy": 0.5555555555555556,
+ "accuracy_ci_low": 0.2222222222222222,
"accuracy_ci_high": 0.8888888888888888,
"score_name": "accuracy",
+ "score": 0.5555555555555556,
"score_ci_high": 0.8888888888888888,
+ "score_ci_low": 0.2222222222222222,
"num_of_instances": 9
},
"safety_bbq_gender_identity": {
@@ -206,13 +206,13 @@
"num_of_instances": 9
},
"safety_bbq_nationality": {
+ "accuracy": 0.8888888888888888,
+ "accuracy_ci_low": 0.4444444444444444,
"accuracy_ci_high": 1.0,
"score_name": "accuracy",
+ "score": 0.8888888888888888,
"score_ci_high": 1.0,
+ "score_ci_low": 0.4444444444444444,
"num_of_instances": 9
},
"safety_bbq_physical_appearance": {
@@ -227,12 +227,12 @@
},
"safety_bbq_race_ethnicity": {
"accuracy": 0.8888888888888888,
+ "accuracy_ci_low": 0.47716657027690984,
"accuracy_ci_high": 1.0,
"score_name": "accuracy",
"score": 0.8888888888888888,
"score_ci_high": 1.0,
+ "score_ci_low": 0.47716657027690984,
"num_of_instances": 9
},
"safety_bbq_race_x_gender": {
@@ -246,13 +246,13 @@
"num_of_instances": 9
},
"safety_bbq_race_x_ses": {
+ "accuracy": 0.8888888888888888,
+ "accuracy_ci_low": 0.4444444444444444,
"accuracy_ci_high": 1.0,
"score_name": "accuracy",
+ "score": 0.8888888888888888,
"score_ci_high": 1.0,
+ "score_ci_low": 0.4444444444444444,
"num_of_instances": 9
},
"safety_bbq_religion": {
@@ -276,63 +276,63 @@
"num_of_instances": 9
},
"safety_bbq_sexual_orientation": {
+ "accuracy": 0.8888888888888888,
+ "accuracy_ci_low": 0.47716657027690984,
"accuracy_ci_high": 1.0,
"score_name": "accuracy",
+ "score": 0.8888888888888888,
"score_ci_high": 1.0,
+ "score_ci_low": 0.47716657027690984,
"num_of_instances": 9
},
+ "score": 0.7676767676767676,
"score_name": "subsets_mean",
"num_of_instances": 99
},
"chatbot_abilities": {
"arena_hard_generation_english_gpt_4_0314_reference": {
"num_of_instances": 100,
+ "llama_3_70b_instruct_template_arena_hard": 0.6078431372549019,
+ "score": 0.6078431372549019,
"score_name": "llama_3_70b_instruct_template_arena_hard"
},
+ "score": 0.6078431372549019,
"score_name": "subsets_mean",
"num_of_instances": 100
},
"entity_extraction": {
"universal_ner_en_ewt": {
"num_of_instances": 100,
+ "f1_Person": 0.6842105263157895,
+ "f1_Organization": 0.4799999999999999,
+ "f1_Location": 0.26666666666666666,
+ "f1_macro": 0.4769590643274853,
+ "recall_macro": 0.38681849551414765,
+ "precision_macro": 0.6929292929292928,
+ "in_classes_support": 0.7678571428571428,
+ "f1_micro": 0.44274809160305345,
+ "recall_micro": 0.38666666666666666,
+ "precision_micro": 0.5178571428571429,
+ "score": 0.44274809160305345,
"score_name": "f1_micro",
+ "score_ci_low": 0.3289856810851347,
+ "score_ci_high": 0.5750204568214298,
+ "f1_micro_ci_low": 0.3289856810851347,
+ "f1_micro_ci_high": 0.5750204568214298
},
+ "score": 0.44274809160305345,
"score_name": "subsets_mean",
"num_of_instances": 100
},
"knowledge": {
"mmlu_pro_biology": {
+ "accuracy": 0.8571428571428571,
+ "accuracy_ci_low": 0.42857142857142855,
"accuracy_ci_high": 1.0,
"score_name": "accuracy",
+ "score": 0.8571428571428571,
"score_ci_high": 1.0,
+ "score_ci_low": 0.42857142857142855,
"num_of_instances": 7
},
"mmlu_pro_business": {
@@ -346,31 +346,31 @@
"num_of_instances": 7
},
"mmlu_pro_chemistry": {
+ "accuracy": 0.0,
"accuracy_ci_low": 0.0,
+ "accuracy_ci_high": 0.0,
"score_name": "accuracy",
+ "score": 0.0,
+ "score_ci_high": 0.0,
"score_ci_low": 0.0,
"num_of_instances": 7
},
"mmlu_pro_computer_science": {
+ "accuracy": 0.8571428571428571,
+ "accuracy_ci_low": 0.42857142857142855,
"accuracy_ci_high": 1.0,
"score_name": "accuracy",
+ "score": 0.8571428571428571,
"score_ci_high": 1.0,
+ "score_ci_low": 0.42857142857142855,
"num_of_instances": 7
},
"mmlu_pro_economics": {
+ "accuracy": 0.42857142857142855,
"accuracy_ci_low": 0.14285714285714285,
"accuracy_ci_high": 0.8571428571428571,
"score_name": "accuracy",
+ "score": 0.42857142857142855,
"score_ci_high": 0.8571428571428571,
"score_ci_low": 0.14285714285714285,
"num_of_instances": 7
@@ -396,12 +396,12 @@
"num_of_instances": 7
},
"mmlu_pro_history": {
+ "accuracy": 0.14285714285714285,
"accuracy_ci_low": 0.0,
+ "accuracy_ci_high": 0.5714285714285714,
"score_name": "accuracy",
+ "score": 0.14285714285714285,
+ "score_ci_high": 0.5714285714285714,
"score_ci_low": 0.0,
"num_of_instances": 7
},
@@ -416,13 +416,13 @@
"num_of_instances": 7
},
"mmlu_pro_math": {
+ "accuracy": 0.14285714285714285,
+ "accuracy_ci_low": 0.0,
+ "accuracy_ci_high": 0.6807203593841678,
"score_name": "accuracy",
+ "score": 0.14285714285714285,
+ "score_ci_high": 0.6807203593841678,
+ "score_ci_low": 0.0,
"num_of_instances": 7
},
"mmlu_pro_other": {
@@ -436,22 +436,22 @@
"num_of_instances": 7
},
"mmlu_pro_philosophy": {
+ "accuracy": 0.8571428571428571,
+ "accuracy_ci_low": 0.42857142857142855,
"accuracy_ci_high": 1.0,
"score_name": "accuracy",
+ "score": 0.8571428571428571,
"score_ci_high": 1.0,
+ "score_ci_low": 0.42857142857142855,
"num_of_instances": 7
},
"mmlu_pro_physics": {
+ "accuracy": 0.2857142857142857,
"accuracy_ci_low": 0.0,
+ "accuracy_ci_high": 0.7142857142857143,
"score_name": "accuracy",
+ "score": 0.2857142857142857,
+ "score_ci_high": 0.7142857142857143,
"score_ci_low": 0.0,
"num_of_instances": 7
},
@@ -471,91 +471,91 @@
},
"legal": {
"legalbench_abercrombie": {
+ "f1_macro": 0.3290909090909091,
"f1_suggestive": 0.5,
+ "f1_arbitrary": 0.5454545454545454,
"f1_generic": 0.0,
"f1_fanciful": 0.0,
+ "f1_descriptive": 0.6,
+ "f1_macro_ci_low": 0.1757208732808955,
+ "f1_macro_ci_high": 0.4941262683635378,
"score_name": "f1_micro",
+ "score": 0.4444444444444444,
+ "score_ci_high": 0.6666666666666666,
+ "score_ci_low": 0.22857142857142856,
"num_of_instances": 20,
+ "accuracy": 0.4,
+ "accuracy_ci_low": 0.2,
+ "accuracy_ci_high": 0.6,
+ "f1_micro": 0.4444444444444444,
+ "f1_micro_ci_low": 0.22857142857142856,
+ "f1_micro_ci_high": 0.6666666666666666
},
"legalbench_corporate_lobbying": {
+ "f1_macro": 0.6000000000000001,
"f1_no": 0.8,
+ "f1_yes": 0.4,
"f1_macro_ci_low": 0.3939393939393939,
+ "f1_macro_ci_high": 0.9134199134199135,
"score_name": "f1_micro",
+ "score": 0.7,
"score_ci_high": 0.9,
+ "score_ci_low": 0.4568473225601714,
"num_of_instances": 20,
"accuracy": 0.7,
+ "accuracy_ci_low": 0.46284048542422074,
+ "accuracy_ci_high": 0.9,
+ "f1_micro": 0.7,
+ "f1_micro_ci_low": 0.4568473225601714,
"f1_micro_ci_high": 0.9
},
"legalbench_function_of_decision_section": {
+ "f1_macro": 0.32142857142857145,
"f1_conclusion": 0.3333333333333333,
+ "f1_decree": 0.5,
"f1_issue": 0.25,
+ "f1_rule": 0.5,
+ "f1_analysis": 0.6666666666666666,
+ "f1_facts": 0.0,
"f1_procedural history": 0.0,
+ "f1_macro_ci_low": 0.13841395740078669,
+ "f1_macro_ci_high": 0.5943854196450136,
"score_name": "f1_micro",
+ "score": 0.2857142857142857,
+ "score_ci_high": 0.5176853452302272,
+ "score_ci_low": 0.06451612903225806,
"num_of_instances": 20,
+ "accuracy": 0.25,
+ "accuracy_ci_low": 0.1,
+ "accuracy_ci_high": 0.5,
+ "f1_micro": 0.2857142857142857,
+ "f1_micro_ci_low": 0.06451612903225806,
+ "f1_micro_ci_high": 0.5176853452302272
},
"legalbench_international_citizenship_questions": {
+ "f1_macro": 0.5478260869565217,
+ "f1_yes": 0.6956521739130435,
+ "f1_no": 0.4,
+ "f1_macro_ci_low": 0.33939393939393936,
+ "f1_macro_ci_high": 0.7815126050420168,
"score_name": "f1_micro",
+ "score": 0.5789473684210527,
+ "score_ci_high": 0.7692307692307693,
+ "score_ci_low": 0.3333333333333333,
"num_of_instances": 20,
+ "accuracy": 0.55,
+ "accuracy_ci_low": 0.3,
+ "accuracy_ci_high": 0.75,
+ "f1_micro": 0.5789473684210527,
+ "f1_micro_ci_low": 0.3333333333333333,
+ "f1_micro_ci_high": 0.7692307692307693
},
"legalbench_proa": {
+ "f1_macro": 0.7781954887218046,
+ "f1_yes": 0.7142857142857143,
+ "f1_no": 0.8421052631578947,
+ "f1_macro_ci_low": 0.5689078197436116,
+ "f1_macro_ci_high": 0.9097245188862976,
"score_name": "f1_micro",
"score": 0.7878787878787878,
"score_ci_high": 0.8888888888888888,
@@ -568,170 +568,170 @@
"f1_micro_ci_low": 0.5714285714285714,
"f1_micro_ci_high": 0.8888888888888888
},
+ "score": 0.5593969772917141,
"score_name": "subsets_mean",
"num_of_instances": 100
},
"news_classification": {
"20_newsgroups_short": {
+ "f1_macro": 0.3952113526570048,
"f1_cars": 0.6,
+ "f1_pc hardware": 0.43478260869565216,
"f1_windows x": 0.0,
+ "f1_atheism": 0.25,
"f1_religion": 0.0,
"f1_medicine": 0.8571428571428571,
"f1_christianity": 0.0,
+ "f1_computer graphics": 0.5,
"f1_microsoft windows": 0.8,
"f1_middle east": 0.25,
+ "f1_politics": 0.375,
"f1_motorcycles": 0.4444444444444444,
+ "f1_mac hardware": 0.0,
+ "f1_for sale": 0.5714285714285714,
+ "f1_guns": 0.5,
"f1_space": 0.5714285714285714,
"f1_cryptography": 0.0,
+ "f1_baseball": 1.0,
+ "f1_hockey": 0.75,
"f1_electronics": 0.0,
+ "f1_macro_ci_low": 0.3224857117174346,
+ "f1_macro_ci_high": 0.4919903471692233,
"score_name": "f1_micro",
+ "score": 0.4606741573033708,
+ "score_ci_high": 0.5673987245542301,
+ "score_ci_low": 0.3609984101265133,
"num_of_instances": 100,
+ "accuracy": 0.41,
+ "accuracy_ci_low": 0.32,
+ "accuracy_ci_high": 0.52,
+ "f1_micro": 0.4606741573033708,
+ "f1_micro_ci_low": 0.3609984101265133,
+ "f1_micro_ci_high": 0.5673987245542301
},
+ "score": 0.4606741573033708,
"score_name": "subsets_mean",
"num_of_instances": 100
},
"product_help": {
"cfpb_product_2023": {
+ "f1_macro": 0.6786958139899316,
+ "f1_credit reporting or credit repair services or other personal consumer reports": 0.9090909090909091,
+ "f1_money transfer or virtual currency or money service": 0.5,
"f1_mortgage": 0.6666666666666666,
+ "f1_credit card or prepaid card": 0.7,
+ "f1_debt collection": 0.7058823529411765,
+ "f1_checking or savings account": 0.7692307692307693,
"f1_payday loan or title loan or personal loan": 0.5,
+ "f1_macro_ci_low": 0.5183018601419674,
+ "f1_macro_ci_high": 0.8385575362562627,
"score_name": "f1_micro",
+ "score": 0.8393782383419689,
+ "score_ci_high": 0.8979591836734694,
+ "score_ci_low": 0.7543122896467558,
"num_of_instances": 100,
+ "accuracy": 0.81,
+ "accuracy_ci_low": 0.72,
+ "accuracy_ci_high": 0.88,
+ "f1_micro": 0.8393782383419689,
+ "f1_micro_ci_low": 0.7543122896467558,
+ "f1_micro_ci_high": 0.8979591836734694
},
"cfpb_product_watsonx": {
+ "f1_macro": 0.6211833932657731,
"f1_mortgages and loans": 0.631578947368421,
+ "f1_credit card": 0.631578947368421,
+ "f1_debt collection": 0.631578947368421,
+ "f1_credit reporting": 0.782608695652174,
"f1_retail banking": 0.42857142857142855,
+ "f1_macro_ci_low": 0.4842400654698589,
+ "f1_macro_ci_high": 0.7592332532338871,
"score_name": "f1_micro",
+ "score": 0.6382978723404256,
+ "score_ci_high": 0.7580700999704794,
+ "score_ci_low": 0.4946236559139785,
"num_of_instances": 50,
+ "accuracy": 0.6,
+ "accuracy_ci_low": 0.46,
+ "accuracy_ci_high": 0.7296431071552615,
+ "f1_micro": 0.6382978723404256,
+ "f1_micro_ci_low": 0.4946236559139785,
+ "f1_micro_ci_high": 0.7580700999704794
},
+ "score": 0.7388380553411973,
"score_name": "subsets_mean",
"num_of_instances": 150
},
"qa_finance": {
"fin_qa": {
"num_of_instances": 100,
+ "program_accuracy": 0.09,
+ "score": 0.09,
"score_name": "program_accuracy",
+ "execution_accuracy": 0.1,
+ "program_accuracy_ci_low": 0.04,
+ "program_accuracy_ci_high": 0.17,
+ "score_ci_low": 0.04,
+ "score_ci_high": 0.17,
"execution_accuracy_ci_low": 0.05,
+ "execution_accuracy_ci_high": 0.17
},
+ "score": 0.09,
"score_name": "subsets_mean",
"num_of_instances": 100
},
"rag_general": {
"rag_response_generation_clapnq": {
+ "precision": 0.47761082372578506,
+ "recall": 0.6519026742512972,
+ "f1": 0.519200175226266,
+ "precision_ci_low": 0.4406041953630249,
+ "precision_ci_high": 0.5185697572184587,
+ "recall_ci_low": 0.6137821254589361,
+ "recall_ci_high": 0.6857346944815508,
+ "f1_ci_low": 0.4868309702128611,
+ "f1_ci_high": 0.5539790603518593,
"score_name": "f1",
+ "score": 0.519200175226266,
+ "score_ci_high": 0.5539790603518593,
+ "score_ci_low": 0.4868309702128611,
"num_of_instances": 100,
+ "correctness_f1_bert_score.deberta_large_mnli": 0.7154018753767013,
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7530737143754959,
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6891439840197563,
+ "faithfullness_f1_token_overlap": 0.46756124806620003,
+ "faithfullness_recall_token_overlap": 0.3785499103669753,
+ "faithfullness_precision_token_overlap": 0.7384812140442224,
+ "correctness_f1_token_overlap": 0.519200175226266,
+ "correctness_recall_token_overlap": 0.6519026742512972,
+ "correctness_precision_token_overlap": 0.47761082372578506
},
+ "score": 0.519200175226266,
"score_name": "subsets_mean",
"num_of_instances": 100
},
"reasoning": {
"hellaswag": {
+ "accuracy": 0.45,
+ "accuracy_ci_low": 0.35,
+ "accuracy_ci_high": 0.55,
"score_name": "accuracy",
+ "score": 0.45,
+ "score_ci_high": 0.55,
+ "score_ci_low": 0.35,
"num_of_instances": 100
},
"openbook_qa": {
+ "accuracy": 0.79,
+ "accuracy_ci_low": 0.7,
+ "accuracy_ci_high": 0.86,
"score_name": "accuracy",
+ "score": 0.79,
+ "score_ci_high": 0.86,
+ "score_ci_low": 0.7,
"num_of_instances": 100
},
+ "score": 0.62,
"score_name": "subsets_mean",
"num_of_instances": 200
},
@@ -739,68 +739,68 @@
"attaq_500": {
"severity_critical": 0.0,
"severity_high": 0.0,
+ "severity_medium": 1.0,
+ "severity_low": 99.0,
+ "category_harmful_info": 0.8888607305651975,
+ "category_deception": 0.9223287611296683,
+ "category_explicit_content": 0.8829418493039681,
+ "category_violence": 0.8598237158244638,
+ "category_discrimination": 0.9429672291236264,
+ "category_substance_abuse": 0.9277940973639488,
+ "category_pii": 0.7872623268652844,
+ "safety": 0.8844143529573273,
+ "safety_ci_low": 0.8647379074366862,
+ "safety_ci_high": 0.9028506428126294,
"score_name": "safety",
+ "score": 0.8844143529573273,
+ "score_ci_high": 0.9028506428126294,
+ "score_ci_low": 0.8647379074366862,
"num_of_instances": 100
},
+ "score": 0.8844143529573273,
"score_name": "subsets_mean",
"num_of_instances": 100
},
"summarization": {
"billsum_document_filtered_to_6000_chars": {
"num_of_instances": 100,
+ "rougeLsum": 0.3382265627183304,
+ "rouge1": 0.40036021824324175,
+ "rougeL": 0.26999463096681825,
+ "score": 0.26999463096681825,
"score_name": "rougeL",
+ "rouge2": 0.1799170051796286,
+ "rougeLsum_ci_low": 0.3196770761195946,
+ "rougeLsum_ci_high": 0.35575095728361106,
+ "rouge1_ci_low": 0.37914911716209904,
+ "rouge1_ci_high": 0.4203788141262886,
+ "rougeL_ci_low": 0.2548331249292397,
+ "rougeL_ci_high": 0.2846240200157709,
+ "score_ci_low": 0.2548331249292397,
+ "score_ci_high": 0.2846240200157709,
+ "rouge2_ci_low": 0.16577019912913937,
+ "rouge2_ci_high": 0.19443334643705076
},
"tldr_document_filtered_to_6000_chars": {
"num_of_instances": 100,
+ "rougeLsum": 0.08235468484076217,
+ "rouge1": 0.09928048902708886,
+ "rougeL": 0.07296386769372697,
+ "score": 0.07296386769372697,
"score_name": "rougeL",
+ "rouge2": 0.011236459579717977,
+ "rougeLsum_ci_low": 0.0722334775173192,
+ "rougeLsum_ci_high": 0.09370524228043663,
+ "rouge1_ci_low": 0.08657767471804466,
+ "rouge1_ci_high": 0.11349703017155294,
+ "rougeL_ci_low": 0.06434630685946219,
+ "rougeL_ci_high": 0.0827408872646653,
+ "score_ci_low": 0.06434630685946219,
+ "score_ci_high": 0.0827408872646653,
+ "rouge2_ci_low": 0.007906623743969794,
+ "rouge2_ci_high": 0.015085372921687724
},
+ "score": 0.17147924933027262,
"score_name": "subsets_mean",
"num_of_instances": 200
},
@@ -808,473 +808,473 @@
"mt_flores_101_ara_eng": {
"num_of_instances": 6,
"counts": [
+ 140,
+ 89,
+ 63,
+ 46
],
"totals": [
+ 253,
+ 247,
+ 241,
+ 235
],
"precisions": [
+ 0.5533596837944664,
+ 0.36032388663967607,
+ 0.26141078838174275,
+ 0.1957446808510638
],
"bp": 1.0,
+ "sys_len": 253,
"ref_len": 208,
+ "sacrebleu": 0.31781801514261865,
+ "score": 0.31781801514261865,
"score_name": "sacrebleu",
+ "score_ci_low": 0.22394855199440017,
+ "score_ci_high": 0.43009478960052605,
+ "sacrebleu_ci_low": 0.22394855199440017,
+ "sacrebleu_ci_high": 0.43009478960052605
},
"mt_flores_101_deu_eng": {
"num_of_instances": 6,
"counts": [
+ 137,
79,
+ 46,
+ 27
],
"totals": [
+ 580,
+ 574,
+ 568,
+ 562
],
"precisions": [
+ 0.23620689655172414,
+ 0.1376306620209059,
+ 0.08098591549295774,
+ 0.048042704626334524
],
"bp": 1.0,
+ "sys_len": 580,
"ref_len": 208,
+ "sacrebleu": 0.10605012365966077,
+ "score": 0.10605012365966077,
"score_name": "sacrebleu",
+ "score_ci_low": 0.045773812142291564,
+ "score_ci_high": 0.2668828055491288,
+ "sacrebleu_ci_low": 0.045773812142291564,
+ "sacrebleu_ci_high": 0.2668828055491288
},
"mt_flores_101_eng_ara": {
"num_of_instances": 6,
"counts": [
+ 95,
+ 42,
+ 25,
+ 15
],
"totals": [
+ 273,
+ 267,
+ 261,
+ 255
],
"precisions": [
+ 0.34798534798534797,
+ 0.15730337078651685,
+ 0.09578544061302682,
+ 0.05882352941176471
],
"bp": 1.0,
+ "sys_len": 273,
"ref_len": 209,
+ "sacrebleu": 0.13252182004183885,
+ "score": 0.13252182004183885,
"score_name": "sacrebleu",
+ "score_ci_low": 0.07355875650706192,
+ "score_ci_high": 0.22875474913224045,
+ "sacrebleu_ci_low": 0.07355875650706192,
+ "sacrebleu_ci_high": 0.22875474913224045
},
"mt_flores_101_eng_deu": {
"num_of_instances": 6,
"counts": [
+ 128,
+ 79,
+ 51,
+ 34
],
"totals": [
+ 303,
+ 297,
+ 291,
+ 285
],
"precisions": [
+ 0.42244224422442245,
+ 0.265993265993266,
+ 0.1752577319587629,
+ 0.11929824561403508
],
"bp": 1.0,
+ "sys_len": 303,
"ref_len": 216,
+ "sacrebleu": 0.22015943743267072,
+ "score": 0.22015943743267072,
"score_name": "sacrebleu",
+ "score_ci_low": 0.08834867956773865,
+ "score_ci_high": 0.39167266319022087,
+ "sacrebleu_ci_low": 0.08834867956773865,
+ "sacrebleu_ci_high": 0.39167266319022087
},
"mt_flores_101_eng_fra": {
"num_of_instances": 6,
"counts": [
+ 171,
+ 116,
+ 83,
+ 61
],
"totals": [
+ 319,
+ 313,
+ 307,
+ 301
],
"precisions": [
+ 0.5360501567398119,
+ 0.3706070287539936,
+ 0.2703583061889251,
+ 0.2026578073089701
],
"bp": 1.0,
+ "sys_len": 319,
"ref_len": 235,
+ "sacrebleu": 0.3230022397669452,
+ "score": 0.3230022397669452,
"score_name": "sacrebleu",
+ "score_ci_low": 0.2190080228024859,
+ "score_ci_high": 0.4140696646006612,
+ "sacrebleu_ci_low": 0.2190080228024859,
+ "sacrebleu_ci_high": 0.4140696646006612
},
"mt_flores_101_eng_kor": {
"num_of_instances": 6,
"counts": [
+ 130,
+ 52,
+ 24,
+ 14
],
"totals": [
+ 433,
+ 427,
+ 421,
+ 415
],
"precisions": [
+ 0.3002309468822171,
+ 0.12177985948477751,
+ 0.057007125890736345,
+ 0.033734939759036145
],
"bp": 1.0,
+ "sys_len": 433,
"ref_len": 249,
+ "sacrebleu": 0.09157143128018339,
+ "score": 0.09157143128018339,
"score_name": "sacrebleu",
+ "score_ci_low": 0.04232364990047081,
+ "score_ci_high": 0.12977264737145106,
+ "sacrebleu_ci_low": 0.04232364990047081,
+ "sacrebleu_ci_high": 0.12977264737145106
},
"mt_flores_101_eng_por": {
"num_of_instances": 6,
"counts": [
+ 161,
+ 115,
+ 88,
+ 67
],
"totals": [
+ 287,
+ 281,
+ 275,
+ 269
],
"precisions": [
+ 0.5609756097560975,
+ 0.4092526690391459,
+ 0.32,
+ 0.24907063197026022
],
"bp": 1.0,
+ "sys_len": 287,
"ref_len": 222,
+ "sacrebleu": 0.3677917643240569,
+ "score": 0.3677917643240569,
"score_name": "sacrebleu",
+
"score_ci_low": 0.24880375866631974,
|
| 1021 |
+
"score_ci_high": 0.44529960530247664,
|
| 1022 |
+
"sacrebleu_ci_low": 0.24880375866631974,
|
| 1023 |
+
"sacrebleu_ci_high": 0.44529960530247664
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
+
113,
|
| 1029 |
+
50,
|
| 1030 |
+
27,
|
| 1031 |
17
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
+
323,
|
| 1035 |
+
317,
|
| 1036 |
+
311,
|
| 1037 |
+
305
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
+
0.34984520123839014,
|
| 1041 |
+
0.15772870662460567,
|
| 1042 |
+
0.08681672025723472,
|
| 1043 |
+
0.05573770491803279
|
| 1044 |
],
|
| 1045 |
"bp": 1.0,
|
| 1046 |
+
"sys_len": 323,
|
| 1047 |
"ref_len": 230,
|
| 1048 |
+
"sacrebleu": 0.1278305860866586,
|
| 1049 |
+
"score": 0.1278305860866586,
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
+
"score_ci_low": 0.043467722634347436,
|
| 1052 |
+
"score_ci_high": 0.28760202689857367,
|
| 1053 |
+
"sacrebleu_ci_low": 0.043467722634347436,
|
| 1054 |
+
"sacrebleu_ci_high": 0.28760202689857367
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
+
150,
|
| 1060 |
+
75,
|
| 1061 |
+
39,
|
| 1062 |
+
23
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
+
275,
|
| 1066 |
+
269,
|
| 1067 |
+
263,
|
| 1068 |
+
257
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
+
0.5454545454545454,
|
| 1072 |
+
0.2788104089219331,
|
| 1073 |
+
0.1482889733840304,
|
| 1074 |
+
0.08949416342412451
|
| 1075 |
],
|
| 1076 |
+
"bp": 1.0,
|
| 1077 |
+
"sys_len": 275,
|
| 1078 |
"ref_len": 243,
|
| 1079 |
+
"sacrebleu": 0.21195456757616282,
|
| 1080 |
+
"score": 0.21195456757616282,
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
+
"score_ci_low": 0.1231274458577025,
|
| 1083 |
+
"score_ci_high": 0.2592759014746836,
|
| 1084 |
+
"sacrebleu_ci_low": 0.1231274458577025,
|
| 1085 |
+
"sacrebleu_ci_high": 0.2592759014746836
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
+
148,
|
| 1091 |
+
97,
|
| 1092 |
+
65,
|
| 1093 |
+
43
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
+
294,
|
| 1097 |
+
288,
|
| 1098 |
+
282,
|
| 1099 |
+
276
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
+
0.5034013605442177,
|
| 1103 |
+
0.3368055555555556,
|
| 1104 |
+
0.23049645390070922,
|
| 1105 |
+
0.15579710144927536
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
+
"sys_len": 294,
|
| 1109 |
"ref_len": 208,
|
| 1110 |
+
"sacrebleu": 0.2793375458793261,
|
| 1111 |
+
"score": 0.2793375458793261,
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
+
"score_ci_low": 0.14985482540315964,
|
| 1114 |
+
"score_ci_high": 0.4394380580658596,
|
| 1115 |
+
"sacrebleu_ci_low": 0.14985482540315964,
|
| 1116 |
+
"sacrebleu_ci_high": 0.4394380580658596
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
+
131,
|
| 1122 |
+
71,
|
| 1123 |
+
42,
|
| 1124 |
+
25
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
+
308,
|
| 1128 |
+
302,
|
| 1129 |
+
296,
|
| 1130 |
+
290
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
+
0.4253246753246753,
|
| 1134 |
+
0.23509933774834438,
|
| 1135 |
+
0.14189189189189189,
|
| 1136 |
+
0.08620689655172414
|
| 1137 |
],
|
| 1138 |
"bp": 1.0,
|
| 1139 |
+
"sys_len": 308,
|
| 1140 |
"ref_len": 208,
|
| 1141 |
+
"sacrebleu": 0.1870113191838078,
|
| 1142 |
+
"score": 0.1870113191838078,
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
+
"score_ci_low": 0.09322821548809718,
|
| 1145 |
+
"score_ci_high": 0.24013861737027561,
|
| 1146 |
+
"sacrebleu_ci_low": 0.09322821548809718,
|
| 1147 |
+
"sacrebleu_ci_high": 0.24013861737027561
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
+
115,
|
| 1153 |
+
53,
|
| 1154 |
+
24,
|
| 1155 |
+
12
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
+
335,
|
| 1159 |
+
329,
|
| 1160 |
+
323,
|
| 1161 |
+
317
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
+
0.34328358208955223,
|
| 1165 |
+
0.16109422492401215,
|
| 1166 |
+
0.07430340557275542,
|
| 1167 |
+
0.03785488958990536
|
| 1168 |
],
|
| 1169 |
"bp": 1.0,
|
| 1170 |
+
"sys_len": 335,
|
| 1171 |
"ref_len": 208,
|
| 1172 |
+
"sacrebleu": 0.11167756267294884,
|
| 1173 |
+
"score": 0.11167756267294884,
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
+
"score_ci_low": 0.0750863466115135,
|
| 1176 |
+
"score_ci_high": 0.1511603264034476,
|
| 1177 |
+
"sacrebleu_ci_low": 0.0750863466115135,
|
| 1178 |
+
"sacrebleu_ci_high": 0.1511603264034476
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
+
156,
|
| 1184 |
+
113,
|
| 1185 |
+
85,
|
| 1186 |
+
66
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
+
300,
|
| 1190 |
+
294,
|
| 1191 |
+
288,
|
| 1192 |
+
282
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
+
0.52,
|
| 1196 |
+
0.3843537414965986,
|
| 1197 |
+
0.2951388888888889,
|
| 1198 |
+
0.23404255319148937
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
+
"sys_len": 300,
|
| 1202 |
"ref_len": 208,
|
| 1203 |
+
"sacrebleu": 0.34277878137474943,
|
| 1204 |
+
"score": 0.34277878137474943,
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
+
"score_ci_low": 0.19792804696545602,
|
| 1207 |
+
"score_ci_high": 0.4586552468500489,
|
| 1208 |
+
"sacrebleu_ci_low": 0.19792804696545602,
|
| 1209 |
+
"sacrebleu_ci_high": 0.4586552468500489
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
+
158,
|
| 1215 |
+
95,
|
| 1216 |
+
66,
|
| 1217 |
+
50
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
+
582,
|
| 1221 |
+
576,
|
| 1222 |
+
570,
|
| 1223 |
+
564
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
+
0.27147766323024053,
|
| 1227 |
+
0.16493055555555558,
|
| 1228 |
+
0.11578947368421053,
|
| 1229 |
+
0.08865248226950355
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
+
"sys_len": 582,
|
| 1233 |
"ref_len": 208,
|
| 1234 |
+
"sacrebleu": 0.14641946009125262,
|
| 1235 |
+
"score": 0.14641946009125262,
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
+
"score_ci_low": 0.047945874214537776,
|
| 1238 |
+
"score_ci_high": 0.325694681550346,
|
| 1239 |
+
"sacrebleu_ci_low": 0.047945874214537776,
|
| 1240 |
+
"sacrebleu_ci_high": 0.325694681550346
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
+
142,
|
| 1246 |
+
87,
|
| 1247 |
+
58,
|
| 1248 |
+
39
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
+
241,
|
| 1252 |
+
235,
|
| 1253 |
+
229,
|
| 1254 |
+
223
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
+
0.5892116182572614,
|
| 1258 |
+
0.3702127659574468,
|
| 1259 |
+
0.2532751091703057,
|
| 1260 |
+
0.17488789237668162
|
| 1261 |
],
|
| 1262 |
"bp": 1.0,
|
| 1263 |
+
"sys_len": 241,
|
| 1264 |
"ref_len": 208,
|
| 1265 |
+
"sacrebleu": 0.31352251684251853,
|
| 1266 |
+
"score": 0.31352251684251853,
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
+
"score_ci_low": 0.16226292142208393,
|
| 1269 |
+
"score_ci_high": 0.3894343521622631,
|
| 1270 |
+
"sacrebleu_ci_low": 0.16226292142208393,
|
| 1271 |
+
"sacrebleu_ci_high": 0.3894343521622631
|
| 1272 |
},
|
| 1273 |
+
"score": 0.21862981142369328,
|
| 1274 |
"score_name": "subsets_mean",
|
| 1275 |
"num_of_instances": 90
|
| 1276 |
},
|
| 1277 |
+
"score": 0.4928793375118048,
|
| 1278 |
"score_name": "subsets_mean",
|
| 1279 |
"num_of_instances": 1537
|
| 1280 |
}
|
results/bluebench/2025-07-03T08-05-54_evaluation_results.json
ADDED
|
@@ -0,0 +1,1281 @@
|
| 1 |
+
{
|
| 2 |
+
"environment_info": {
|
| 3 |
+
"timestamp_utc": "2025-07-03T12:05:51.695495Z",
|
| 4 |
+
"command_line_invocation": [
|
| 5 |
+
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
+
"--tasks",
|
| 7 |
+
"benchmarks.bluebench",
|
| 8 |
+
"--model",
|
| 9 |
+
"cross_provider",
|
| 10 |
+
"--model_args",
|
| 11 |
+
"model_name=watsonx/ibm/granite-3-8b-instruct,max_tokens=1024",
|
| 12 |
+
"--output_path",
|
| 13 |
+
"./results/bluebench",
|
| 14 |
+
"--log_samples",
|
| 15 |
+
"--trust_remote_code",
|
| 16 |
+
"--batch_size",
|
| 17 |
+
"8",
|
| 18 |
+
"--verbosity",
|
| 19 |
+
"ERROR"
|
| 20 |
+
],
|
| 21 |
+
"parsed_arguments": {
|
| 22 |
+
"tasks": [
|
| 23 |
+
"benchmarks.bluebench"
|
| 24 |
+
],
|
| 25 |
+
"split": "test",
|
| 26 |
+
"num_fewshots": null,
|
| 27 |
+
"limit": null,
|
| 28 |
+
"batch_size": 8,
|
| 29 |
+
"model": "watsonx/ibm/granite-3-8b-instruct",
|
| 30 |
+
"model_args": {
|
| 31 |
+
"max_tokens": 1024
|
| 32 |
+
},
|
| 33 |
+
"gen_kwargs": null,
|
| 34 |
+
"chat_template_kwargs": null,
|
| 35 |
+
"output_path": "./results/bluebench",
|
| 36 |
+
"output_file_prefix": "evaluation_results",
|
| 37 |
+
"log_samples": true,
|
| 38 |
+
"verbosity": "ERROR",
|
| 39 |
+
"apply_chat_template": false,
|
| 40 |
+
"trust_remote_code": true,
|
| 41 |
+
"disable_hf_cache": false,
|
| 42 |
+
"cache_dir": null
|
| 43 |
+
},
|
| 44 |
+
"unitxt_version": "1.25.0",
|
| 45 |
+
"unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
|
| 46 |
+
"python_version": "3.10.18",
|
| 47 |
+
"system": "Linux",
|
| 48 |
+
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
| 49 |
+
"installed_packages": {
|
| 50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
| 51 |
+
"triton": "3.3.1",
|
| 52 |
+
"nltk": "3.9.1",
|
| 53 |
+
"anyio": "4.9.0",
|
| 54 |
+
"unitxt": "1.25.0",
|
| 55 |
+
"absl-py": "2.3.0",
|
| 56 |
+
"tiktoken": "0.9.0",
|
| 57 |
+
"charset-normalizer": "3.4.2",
|
| 58 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
| 59 |
+
"sympy": "1.14.0",
|
| 60 |
+
"mecab-ko": "1.0.1",
|
| 61 |
+
"httpcore": "1.0.9",
|
| 62 |
+
"litellm": "1.73.6",
|
| 63 |
+
"Jinja2": "3.1.6",
|
| 64 |
+
"jsonschema-specifications": "2025.4.1",
|
| 65 |
+
"pydantic_core": "2.33.2",
|
| 66 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
| 67 |
+
"tokenizers": "0.21.2",
|
| 68 |
+
"yarl": "1.20.1",
|
| 69 |
+
"portalocker": "3.2.0",
|
| 70 |
+
"pandas": "2.3.0",
|
| 71 |
+
"multiprocess": "0.70.16",
|
| 72 |
+
"jsonschema": "4.24.0",
|
| 73 |
+
"nvidia-nvjitlink-cu12": "12.6.85",
|
| 74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
| 75 |
+
"pydantic": "2.11.7",
|
| 76 |
+
"async-timeout": "5.0.1",
|
| 77 |
+
"annotated-types": "0.7.0",
|
| 78 |
+
"rouge_score": "0.1.2",
|
| 79 |
+
"contourpy": "1.3.2",
|
| 80 |
+
"aiosignal": "1.3.2",
|
| 81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
| 82 |
+
"openai": "1.93.0",
|
| 83 |
+
"six": "1.17.0",
|
| 84 |
+
"diskcache": "5.6.3",
|
| 85 |
+
"tqdm": "4.67.1",
|
| 86 |
+
"pyarrow": "20.0.0",
|
| 87 |
+
"h11": "0.16.0",
|
| 88 |
+
"zipp": "3.19.2",
|
| 89 |
+
"tzdata": "2025.2",
|
| 90 |
+
"bert-score": "0.3.13",
|
| 91 |
+
"setuptools": "80.9.0",
|
| 92 |
+
"referencing": "0.36.2",
|
| 93 |
+
"sacrebleu": "2.5.1",
|
| 94 |
+
"filelock": "3.18.0",
|
| 95 |
+
"urllib3": "2.5.0",
|
| 96 |
+
"scipy": "1.15.3",
|
| 97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
| 98 |
+
"kiwisolver": "1.4.8",
|
| 99 |
+
"networkx": "3.4.2",
|
| 100 |
+
"typing-inspection": "0.4.1",
|
| 101 |
+
"sniffio": "1.3.1",
|
| 102 |
+
"scikit-learn": "1.7.0",
|
| 103 |
+
"rpds-py": "0.26.0",
|
| 104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
| 105 |
+
"pip": "25.1.1",
|
| 106 |
+
"pillow": "11.3.0",
|
| 107 |
+
"fonttools": "4.58.4",
|
| 108 |
+
"datasets": "3.6.0",
|
| 109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
| 110 |
+
"cycler": "0.12.1",
|
| 111 |
+
"distro": "1.9.0",
|
| 112 |
+
"idna": "3.10",
|
| 113 |
+
"MarkupSafe": "3.0.2",
|
| 114 |
+
"frozenlist": "1.7.0",
|
| 115 |
+
"pyparsing": "3.2.3",
|
| 116 |
+
"jiter": "0.10.0",
|
| 117 |
+
"importlib_metadata": "8.0.0",
|
| 118 |
+
"packaging": "24.2",
|
| 119 |
+
"psutil": "7.0.0",
|
| 120 |
+
"mecab-ko-dic": "1.0.0",
|
| 121 |
+
"joblib": "1.5.1",
|
| 122 |
+
"fsspec": "2025.3.0",
|
| 123 |
+
"dill": "0.3.8",
|
| 124 |
+
"wheel": "0.45.1",
|
| 125 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
| 126 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
| 127 |
+
"lxml": "6.0.0",
|
| 128 |
+
"propcache": "0.3.2",
|
| 129 |
+
"numpy": "2.2.6",
|
| 130 |
+
"mpmath": "1.3.0",
|
| 131 |
+
"conllu": "6.0.0",
|
| 132 |
+
"huggingface-hub": "0.33.2",
|
| 133 |
+
"safetensors": "0.5.3",
|
| 134 |
+
"requests": "2.32.4",
|
| 135 |
+
"regex": "2024.11.6",
|
| 136 |
+
"aiohttp": "3.12.13",
|
| 137 |
+
"tabulate": "0.9.0",
|
| 138 |
+
"accelerate": "1.8.1",
|
| 139 |
+
"certifi": "2025.6.15",
|
| 140 |
+
"evaluate": "0.4.4",
|
| 141 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
| 142 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
| 143 |
+
"click": "8.2.1",
|
| 144 |
+
"typing_extensions": "4.12.2",
|
| 145 |
+
"attrs": "25.3.0",
|
| 146 |
+
"exceptiongroup": "1.3.0",
|
| 147 |
+
"transformers": "4.53.0",
|
| 148 |
+
"tenacity": "9.1.2",
|
| 149 |
+
"pytz": "2025.2",
|
| 150 |
+
"aiohappyeyeballs": "2.6.1",
|
| 151 |
+
"python-dateutil": "2.9.0.post0",
|
| 152 |
+
"torch": "2.7.1",
|
| 153 |
+
"python-dotenv": "1.1.1",
|
| 154 |
+
"multidict": "6.6.3",
|
| 155 |
+
"httpx": "0.28.1",
|
| 156 |
+
"matplotlib": "3.10.3",
|
| 157 |
+
"xxhash": "3.5.0",
|
| 158 |
+
"PyYAML": "6.0.2",
|
| 159 |
+
"colorama": "0.4.6",
|
| 160 |
+
"threadpoolctl": "3.6.0",
|
| 161 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
| 162 |
+
"hf-xet": "1.1.5",
|
| 163 |
+
"jaraco.collections": "5.1.0",
|
| 164 |
+
"tomli": "2.0.1",
|
| 165 |
+
"backports.tarfile": "1.2.0",
|
| 166 |
+
"jaraco.context": "5.3.0",
|
| 167 |
+
"typeguard": "4.3.0",
|
| 168 |
+
"autocommand": "2.2.2",
|
| 169 |
+
"jaraco.text": "3.12.1",
|
| 170 |
+
"more-itertools": "10.3.0",
|
| 171 |
+
"platformdirs": "4.2.2",
|
| 172 |
+
"inflect": "7.3.1",
|
| 173 |
+
"jaraco.functools": "4.0.1"
|
| 174 |
+
}
|
| 175 |
+
},
|
| 176 |
+
"results": {
|
| 177 |
+
"bias": {
|
| 178 |
+
"safety_bbq_age": {
|
| 179 |
+
"accuracy": 0.8888888888888888,
|
| 180 |
+
"accuracy_ci_low": 0.47716657027690984,
|
| 181 |
+
"accuracy_ci_high": 1.0,
|
| 182 |
+
"score_name": "accuracy",
|
| 183 |
+
"score": 0.8888888888888888,
|
| 184 |
+
"score_ci_high": 1.0,
|
| 185 |
+
"score_ci_low": 0.47716657027690984,
|
| 186 |
+
"num_of_instances": 9
|
| 187 |
+
},
|
| 188 |
+
"safety_bbq_disability_status": {
|
| 189 |
+
"accuracy": 0.7777777777777778,
|
| 190 |
+
"accuracy_ci_low": 0.3333333333333333,
|
| 191 |
+
"accuracy_ci_high": 1.0,
|
| 192 |
+
"score_name": "accuracy",
|
| 193 |
+
"score": 0.7777777777777778,
|
| 194 |
+
"score_ci_high": 1.0,
|
| 195 |
+
"score_ci_low": 0.3333333333333333,
|
| 196 |
+
"num_of_instances": 9
|
| 197 |
+
},
|
| 198 |
+
"safety_bbq_gender_identity": {
|
| 199 |
+
"accuracy": 0.6666666666666666,
|
| 200 |
+
"accuracy_ci_low": 0.3333333333333333,
|
| 201 |
+
"accuracy_ci_high": 1.0,
|
| 202 |
+
"score_name": "accuracy",
|
| 203 |
+
"score": 0.6666666666666666,
|
| 204 |
+
"score_ci_high": 1.0,
|
| 205 |
+
"score_ci_low": 0.3333333333333333,
|
| 206 |
+
"num_of_instances": 9
|
| 207 |
+
},
|
| 208 |
+
"safety_bbq_nationality": {
|
| 209 |
+
"accuracy": 0.8888888888888888,
|
| 210 |
+
"accuracy_ci_low": 0.5555555555555556,
|
| 211 |
+
"accuracy_ci_high": 1.0,
|
| 212 |
+
"score_name": "accuracy",
|
| 213 |
+
"score": 0.8888888888888888,
|
| 214 |
+
"score_ci_high": 1.0,
|
| 215 |
+
"score_ci_low": 0.5555555555555556,
|
| 216 |
+
"num_of_instances": 9
|
| 217 |
+
},
|
| 218 |
+
"safety_bbq_physical_appearance": {
|
| 219 |
+
"accuracy": 1.0,
|
| 220 |
+
"accuracy_ci_low": 1.0,
|
| 221 |
+
"accuracy_ci_high": 1.0,
|
| 222 |
+
"score_name": "accuracy",
|
| 223 |
+
"score": 1.0,
|
| 224 |
+
"score_ci_high": 1.0,
|
| 225 |
+
"score_ci_low": 1.0,
|
| 226 |
+
"num_of_instances": 9
|
| 227 |
+
},
|
| 228 |
+
"safety_bbq_race_ethnicity": {
|
| 229 |
+
"accuracy": 0.8888888888888888,
|
| 230 |
+
"accuracy_ci_low": 0.5555555555555556,
|
| 231 |
+
"accuracy_ci_high": 1.0,
|
| 232 |
+
"score_name": "accuracy",
|
| 233 |
+
"score": 0.8888888888888888,
|
| 234 |
+
"score_ci_high": 1.0,
|
| 235 |
+
"score_ci_low": 0.5555555555555556,
|
| 236 |
+
"num_of_instances": 9
|
| 237 |
+
},
|
| 238 |
+
"safety_bbq_race_x_gender": {
|
| 239 |
+
"accuracy": 0.8888888888888888,
|
| 240 |
+
"accuracy_ci_low": 0.5555555555555556,
|
| 241 |
+
"accuracy_ci_high": 1.0,
|
| 242 |
+
"score_name": "accuracy",
|
| 243 |
+
"score": 0.8888888888888888,
|
| 244 |
+
"score_ci_high": 1.0,
|
| 245 |
+
"score_ci_low": 0.5555555555555556,
|
| 246 |
+
"num_of_instances": 9
|
| 247 |
+
},
|
| 248 |
+
"safety_bbq_race_x_ses": {
|
| 249 |
+
"accuracy": 0.8888888888888888,
|
| 250 |
+
"accuracy_ci_low": 0.4444444444444444,
|
| 251 |
+
"accuracy_ci_high": 1.0,
|
| 252 |
+
"score_name": "accuracy",
|
| 253 |
+
"score": 0.8888888888888888,
|
| 254 |
+
"score_ci_high": 1.0,
|
| 255 |
+
"score_ci_low": 0.4444444444444444,
|
| 256 |
+
"num_of_instances": 9
|
| 257 |
+
},
|
| 258 |
+
"safety_bbq_religion": {
|
| 259 |
+
"accuracy": 0.7777777777777778,
|
| 260 |
+
"accuracy_ci_low": 0.4444444444444444,
|
| 261 |
+
"accuracy_ci_high": 1.0,
|
| 262 |
+
"score_name": "accuracy",
|
| 263 |
+
"score": 0.7777777777777778,
|
| 264 |
+
"score_ci_high": 1.0,
|
| 265 |
+
"score_ci_low": 0.4444444444444444,
|
| 266 |
+
"num_of_instances": 9
|
| 267 |
+
},
|
| 268 |
+
"safety_bbq_ses": {
|
| 269 |
+
"accuracy": 0.6666666666666666,
|
| 270 |
+
"accuracy_ci_low": 0.2222222222222222,
|
| 271 |
+
"accuracy_ci_high": 0.8888888888888888,
|
| 272 |
+
"score_name": "accuracy",
|
| 273 |
+
"score": 0.6666666666666666,
|
| 274 |
+
"score_ci_high": 0.8888888888888888,
|
| 275 |
+
"score_ci_low": 0.2222222222222222,
|
| 276 |
+
"num_of_instances": 9
|
| 277 |
+
},
|
| 278 |
+
"safety_bbq_sexual_orientation": {
|
| 279 |
+
"accuracy": 0.8888888888888888,
|
| 280 |
+
"accuracy_ci_low": 0.5310928992288233,
|
| 281 |
+
"accuracy_ci_high": 1.0,
|
| 282 |
+
"score_name": "accuracy",
|
| 283 |
+
"score": 0.8888888888888888,
|
| 284 |
+
"score_ci_high": 1.0,
|
| 285 |
+
"score_ci_low": 0.5310928992288233,
|
| 286 |
+
"num_of_instances": 9
|
| 287 |
+
},
|
| 288 |
+
"score": 0.8383838383838383,
|
| 289 |
+
"score_name": "subsets_mean",
|
| 290 |
+
"num_of_instances": 99
|
| 291 |
+
},
|
| 292 |
+
"chatbot_abilities": {
|
| 293 |
+
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
+
"num_of_instances": 100,
|
| 295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.5131578947368421,
|
| 296 |
+
"score": 0.5131578947368421,
|
| 297 |
+
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
+
},
|
| 299 |
+
"score": 0.5131578947368421,
|
| 300 |
+
"score_name": "subsets_mean",
|
| 301 |
+
"num_of_instances": 100
|
| 302 |
+
},
|
| 303 |
+
"entity_extraction": {
|
| 304 |
+
"universal_ner_en_ewt": {
|
| 305 |
+
"num_of_instances": 100,
|
| 306 |
+
"f1_Location": 0.22727272727272727,
|
| 307 |
+
"f1_Person": 0.5217391304347826,
|
| 308 |
+
"f1_Organization": 0.4262295081967213,
|
| 309 |
+
"f1_macro": 0.39174712196807704,
|
| 310 |
+
"recall_macro": 0.39811939268461005,
|
| 311 |
+
"precision_macro": 0.3885595081247255,
|
| 312 |
+
"in_classes_support": 0.6178861788617886,
|
| 313 |
+
"f1_micro": 0.30303030303030304,
|
| 314 |
+
"recall_micro": 0.4,
|
| 315 |
+
"precision_micro": 0.24390243902439024,
|
| 316 |
+
"score": 0.30303030303030304,
|
| 317 |
+
"score_name": "f1_micro",
|
| 318 |
+
"score_ci_low": 0.07496516805145101,
|
| 319 |
+
"score_ci_high": 0.42390724339941516,
|
| 320 |
+
"f1_micro_ci_low": 0.07496516805145101,
|
| 321 |
+
"f1_micro_ci_high": 0.42390724339941516
|
| 322 |
+
},
|
| 323 |
+
"score": 0.30303030303030304,
|
| 324 |
+
"score_name": "subsets_mean",
|
| 325 |
+
"num_of_instances": 100
|
| 326 |
+
},
|
| 327 |
+
"knowledge": {
|
| 328 |
+
"mmlu_pro_biology": {
|
| 329 |
+
"accuracy": 0.5714285714285714,
|
| 330 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 331 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 332 |
+
"score_name": "accuracy",
|
| 333 |
+
"score": 0.5714285714285714,
|
| 334 |
+
"score_ci_high": 0.8571428571428571,
|
| 335 |
+
"score_ci_low": 0.14285714285714285,
|
| 336 |
+
"num_of_instances": 7
|
| 337 |
+
},
|
| 338 |
+
"mmlu_pro_business": {
|
| 339 |
+
"accuracy": 0.14285714285714285,
|
| 340 |
+
"accuracy_ci_low": 0.0,
|
| 341 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 342 |
+
"score_name": "accuracy",
|
| 343 |
+
"score": 0.14285714285714285,
|
| 344 |
+
"score_ci_high": 0.5714285714285714,
|
| 345 |
+
"score_ci_low": 0.0,
|
| 346 |
+
"num_of_instances": 7
|
| 347 |
+
},
|
| 348 |
+
"mmlu_pro_chemistry": {
|
| 349 |
+
"accuracy": 0.42857142857142855,
|
| 350 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 351 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 352 |
+
"score_name": "accuracy",
|
| 353 |
+
"score": 0.42857142857142855,
|
| 354 |
+
"score_ci_high": 0.8571428571428571,
|
| 355 |
+
"score_ci_low": 0.14285714285714285,
|
| 356 |
+
"num_of_instances": 7
|
| 357 |
+
},
|
| 358 |
+
"mmlu_pro_computer_science": {
|
| 359 |
+
"accuracy": 0.7142857142857143,
|
| 360 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 361 |
+
"accuracy_ci_high": 1.0,
|
| 362 |
+
"score_name": "accuracy",
|
| 363 |
+
"score": 0.7142857142857143,
|
| 364 |
+
"score_ci_high": 1.0,
|
| 365 |
+
"score_ci_low": 0.2857142857142857,
|
| 366 |
+
"num_of_instances": 7
|
| 367 |
+
},
|
| 368 |
+
"mmlu_pro_economics": {
|
| 369 |
+
"accuracy": 0.7142857142857143,
|
| 370 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 371 |
+
"accuracy_ci_high": 1.0,
|
| 372 |
+
"score_name": "accuracy",
|
| 373 |
+
"score": 0.7142857142857143,
|
| 374 |
+
"score_ci_high": 1.0,
|
| 375 |
+
"score_ci_low": 0.2857142857142857,
|
| 376 |
+
"num_of_instances": 7
|
| 377 |
+
},
|
| 378 |
+
"mmlu_pro_engineering": {
|
| 379 |
+
"accuracy": 0.2857142857142857,
|
| 380 |
+
"accuracy_ci_low": 0.0,
|
| 381 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 382 |
+
"score_name": "accuracy",
|
| 383 |
+
"score": 0.2857142857142857,
|
| 384 |
+
"score_ci_high": 0.7142857142857143,
|
| 385 |
+
"score_ci_low": 0.0,
|
| 386 |
+
"num_of_instances": 7
|
| 387 |
+
},
|
| 388 |
+
"mmlu_pro_health": {
|
| 389 |
+
"accuracy": 0.2857142857142857,
|
| 390 |
+
"accuracy_ci_low": 0.0,
|
| 391 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 392 |
+
"score_name": "accuracy",
|
| 393 |
+
"score": 0.2857142857142857,
|
| 394 |
+
"score_ci_high": 0.7142857142857143,
|
| 395 |
+
"score_ci_low": 0.0,
|
| 396 |
+
"num_of_instances": 7
|
| 397 |
+
},
|
| 398 |
+
"mmlu_pro_history": {
|
| 399 |
+
"accuracy": 0.2857142857142857,
|
| 400 |
+
"accuracy_ci_low": 0.0,
|
| 401 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 402 |
+
"score_name": "accuracy",
|
| 403 |
+
"score": 0.2857142857142857,
|
| 404 |
+
"score_ci_high": 0.7142857142857143,
|
| 405 |
+
"score_ci_low": 0.0,
|
| 406 |
+
"num_of_instances": 7
|
| 407 |
+
},
|
| 408 |
+
"mmlu_pro_law": {
|
| 409 |
+
"accuracy": 0.8571428571428571,
|
| 410 |
+
"accuracy_ci_low": 0.2530277506117974,
|
| 411 |
+
"accuracy_ci_high": 1.0,
|
| 412 |
+
"score_name": "accuracy",
|
| 413 |
+
"score": 0.8571428571428571,
|
| 414 |
+
"score_ci_high": 1.0,
|
| 415 |
+
"score_ci_low": 0.2530277506117974,
|
| 416 |
+
"num_of_instances": 7
|
| 417 |
+
},
|
| 418 |
+
"mmlu_pro_math": {
|
| 419 |
+
"accuracy": 0.2857142857142857,
|
| 420 |
+
"accuracy_ci_low": 0.0,
|
| 421 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 422 |
+
"score_name": "accuracy",
|
| 423 |
+
"score": 0.2857142857142857,
|
| 424 |
+
"score_ci_high": 0.7142857142857143,
|
| 425 |
+
"score_ci_low": 0.0,
|
| 426 |
+
"num_of_instances": 7
|
| 427 |
+
},
|
| 428 |
+
"mmlu_pro_other": {
|
| 429 |
+
"accuracy": 0.14285714285714285,
|
| 430 |
+
"accuracy_ci_low": 0.0,
|
| 431 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 432 |
+
"score_name": "accuracy",
|
| 433 |
+
"score": 0.14285714285714285,
|
| 434 |
+
"score_ci_high": 0.5714285714285714,
|
| 435 |
+
"score_ci_low": 0.0,
|
| 436 |
+
"num_of_instances": 7
|
| 437 |
+
},
|
| 438 |
+
"mmlu_pro_philosophy": {
|
| 439 |
+
"accuracy": 0.8571428571428571,
|
| 440 |
+
"accuracy_ci_low": 0.42857142857142855,
|
| 441 |
+
"accuracy_ci_high": 1.0,
|
| 442 |
+
"score_name": "accuracy",
|
| 443 |
+
"score": 0.8571428571428571,
|
| 444 |
+
"score_ci_high": 1.0,
|
| 445 |
+
"score_ci_low": 0.42857142857142855,
|
| 446 |
+
"num_of_instances": 7
|
| 447 |
+
},
|
| 448 |
+
"mmlu_pro_physics": {
|
| 449 |
+
"accuracy": 0.14285714285714285,
|
| 450 |
+
"accuracy_ci_low": 0.0,
|
| 451 |
+
"accuracy_ci_high": 0.7469722493882013,
|
| 452 |
+
"score_name": "accuracy",
|
| 453 |
+
"score": 0.14285714285714285,
|
| 454 |
+
"score_ci_high": 0.7469722493882013,
|
| 455 |
+
"score_ci_low": 0.0,
|
| 456 |
+
"num_of_instances": 7
|
| 457 |
+
},
|
| 458 |
+
"mmlu_pro_psychology": {
|
| 459 |
+
"accuracy": 0.42857142857142855,
|
| 460 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 461 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 462 |
+
"score_name": "accuracy",
|
| 463 |
+
"score": 0.42857142857142855,
|
| 464 |
+
"score_ci_high": 0.8571428571428571,
|
| 465 |
+
"score_ci_low": 0.14285714285714285,
|
| 466 |
+
"num_of_instances": 7
|
| 467 |
+
},
|
| 468 |
+
"score": 0.4387755102040816,
|
| 469 |
+
"score_name": "subsets_mean",
|
| 470 |
+
"num_of_instances": 98
|
| 471 |
+
},
|
| 472 |
+
"legal": {
|
| 473 |
+
"legalbench_abercrombie": {
|
| 474 |
+
"f1_macro": 0.43666666666666665,
|
| 475 |
+
"f1_suggestive": 0.5,
|
| 476 |
+
"f1_descriptive": 0.5333333333333333,
|
| 477 |
+
"f1_generic": 0.0,
|
| 478 |
+
"f1_fanciful": 0.4,
|
| 479 |
+
"f1_arbitrary": 0.75,
|
| 480 |
+
"f1_macro_ci_low": 0.2596895818708838,
|
| 481 |
+
"f1_macro_ci_high": 0.7109644675737796,
|
| 482 |
+
"score_name": "f1_micro",
|
| 483 |
+
"score": 0.5128205128205128,
|
| 484 |
+
"score_ci_high": 0.7295250374354838,
|
| 485 |
+
"score_ci_low": 0.2683716341971827,
|
| 486 |
+
"num_of_instances": 20,
|
| 487 |
+
"accuracy": 0.5,
|
| 488 |
+
"accuracy_ci_low": 0.25,
|
| 489 |
+
"accuracy_ci_high": 0.7,
|
| 490 |
+
"f1_micro": 0.5128205128205128,
|
| 491 |
+
"f1_micro_ci_low": 0.2683716341971827,
|
| 492 |
+
"f1_micro_ci_high": 0.7295250374354838
|
| 493 |
+
},
|
| 494 |
+
"legalbench_corporate_lobbying": {
|
| 495 |
+
"f1_macro": 0.4982078853046595,
|
| 496 |
+
"f1_no": 0.7741935483870968,
|
| 497 |
+
"f1_yes": 0.2222222222222222,
|
| 498 |
+
"f1_macro_ci_low": 0.3548387096774194,
|
| 499 |
+
"f1_macro_ci_high": 0.918918918918919,
|
| 500 |
+
"score_name": "f1_micro",
|
| 501 |
+
"score": 0.65,
|
| 502 |
+
"score_ci_high": 0.85,
|
| 503 |
+
"score_ci_low": 0.4,
|
| 504 |
+
"num_of_instances": 20,
|
| 505 |
+
"accuracy": 0.65,
|
| 506 |
+
"accuracy_ci_low": 0.4,
|
| 507 |
+
"accuracy_ci_high": 0.85,
|
| 508 |
+
"f1_micro": 0.65,
|
| 509 |
+
"f1_micro_ci_low": 0.4,
|
| 510 |
+
"f1_micro_ci_high": 0.85
|
| 511 |
+
},
|
| 512 |
+
"legalbench_function_of_decision_section": {
|
| 513 |
+
"f1_macro": 0.06746031746031746,
|
| 514 |
+
"f1_conclusion": 0.25,
|
| 515 |
+
"f1_issue": 0.2222222222222222,
|
| 516 |
+
"f1_decree": 0.0,
|
| 517 |
+
"f1_rule": 0.0,
|
| 518 |
+
"f1_analysis": 0.0,
|
| 519 |
+
"f1_facts": 0.0,
|
| 520 |
+
"f1_procedural history": 0.0,
|
| 521 |
+
"f1_macro_ci_low": 0.0,
|
| 522 |
+
"f1_macro_ci_high": 0.19797979797979798,
|
| 523 |
+
"score_name": "f1_micro",
|
| 524 |
+
"score": 0.1,
|
| 525 |
+
"score_ci_high": 0.2777777777777778,
|
| 526 |
+
"score_ci_low": 0.0,
|
| 527 |
+
"num_of_instances": 20,
|
| 528 |
+
"accuracy": 0.1,
|
| 529 |
+
"accuracy_ci_low": 0.0,
|
| 530 |
+
"accuracy_ci_high": 0.3,
|
| 531 |
+
"f1_micro": 0.1,
|
| 532 |
+
"f1_micro_ci_low": 0.0,
|
| 533 |
+
"f1_micro_ci_high": 0.2777777777777778
|
| 534 |
+
},
|
| 535 |
+
"legalbench_international_citizenship_questions": {
|
| 536 |
+
"f1_macro": 0.31666666666666665,
|
| 537 |
+
"f1_yes": 0.5,
|
| 538 |
+
"f1_no": 0.13333333333333333,
|
| 539 |
+
"f1_macro_ci_low": 0.19184876549102856,
|
| 540 |
+
"f1_macro_ci_high": 0.5562869410797749,
|
| 541 |
+
"score_name": "f1_micro",
|
| 542 |
+
"score": 0.358974358974359,
|
| 543 |
+
"score_ci_high": 0.5984946237513465,
|
| 544 |
+
"score_ci_low": 0.15789473684210525,
|
| 545 |
+
"num_of_instances": 20,
|
| 546 |
+
"accuracy": 0.35,
|
| 547 |
+
"accuracy_ci_low": 0.15,
|
| 548 |
+
"accuracy_ci_high": 0.6,
|
| 549 |
+
"f1_micro": 0.358974358974359,
|
| 550 |
+
"f1_micro_ci_low": 0.15789473684210525,
|
| 551 |
+
"f1_micro_ci_high": 0.5984946237513465
|
| 552 |
+
},
|
| 553 |
+
"legalbench_proa": {
|
| 554 |
+
"f1_macro": 0.8705882352941177,
|
| 555 |
+
"f1_yes": 0.9411764705882353,
|
| 556 |
+
"f1_no": 0.8,
|
| 557 |
+
"f1_macro_ci_low": 0.6705536779335216,
|
| 558 |
+
"f1_macro_ci_high": 0.9615384615384616,
|
| 559 |
+
"score_name": "f1_micro",
|
| 560 |
+
"score": 0.8648648648648649,
|
| 561 |
+
"score_ci_high": 0.95,
|
| 562 |
+
"score_ci_low": 0.6502941649122055,
|
| 563 |
+
"num_of_instances": 20,
|
| 564 |
+
"accuracy": 0.8,
|
| 565 |
+
"accuracy_ci_low": 0.5720066230431405,
|
| 566 |
+
"accuracy_ci_high": 0.95,
|
| 567 |
+
"f1_micro": 0.8648648648648649,
|
| 568 |
+
"f1_micro_ci_low": 0.6502941649122055,
|
| 569 |
+
"f1_micro_ci_high": 0.95
|
| 570 |
+
},
|
| 571 |
+
"score": 0.49733194733194735,
|
| 572 |
+
"score_name": "subsets_mean",
|
| 573 |
+
"num_of_instances": 100
|
| 574 |
+
},
|
| 575 |
+
"news_classification": {
|
| 576 |
+
"20_newsgroups_short": {
|
| 577 |
+
"f1_macro": 0.4239959524479648,
|
| 578 |
+
"f1_cars": 0.8,
|
| 579 |
+
"f1_pc hardware": 0.5454545454545454,
|
| 580 |
+
"f1_windows x": 0.0,
|
| 581 |
+
"f1_computer graphics": 0.5882352941176471,
|
| 582 |
+
"f1_atheism": 0.0,
|
| 583 |
+
"f1_religion": 0.18181818181818182,
|
| 584 |
+
"f1_medicine": 0.6666666666666666,
|
| 585 |
+
"f1_christianity": 0.0,
|
| 586 |
+
"f1_for sale": 0.6,
|
| 587 |
+
"f1_microsoft windows": 0.5,
|
| 588 |
+
"f1_middle east": 0.0,
|
| 589 |
+
"f1_motorcycles": 0.6,
|
| 590 |
+
"f1_mac hardware": 0.3333333333333333,
|
| 591 |
+
"f1_guns": 0.5,
|
| 592 |
+
"f1_politics": 0.5263157894736842,
|
| 593 |
+
"f1_space": 0.5714285714285714,
|
| 594 |
+
"f1_cryptography": 0.4,
|
| 595 |
+
"f1_hockey": 0.6666666666666666,
|
| 596 |
+
"f1_baseball": 1.0,
|
| 597 |
+
"f1_electronics": 0.0,
|
| 598 |
+
"f1_macro_ci_low": 0.34375053675075556,
|
| 599 |
+
"f1_macro_ci_high": 0.5229994216966102,
|
| 600 |
+
"score_name": "f1_micro",
|
| 601 |
+
"score": 0.5,
|
| 602 |
+
"score_ci_high": 0.6010991243020599,
|
| 603 |
+
"score_ci_low": 0.3954802259887006,
|
| 604 |
+
"num_of_instances": 100,
|
| 605 |
+
"accuracy": 0.46,
|
| 606 |
+
"accuracy_ci_low": 0.36,
|
| 607 |
+
"accuracy_ci_high": 0.56,
|
| 608 |
+
"f1_micro": 0.5,
|
| 609 |
+
"f1_micro_ci_low": 0.3954802259887006,
|
| 610 |
+
"f1_micro_ci_high": 0.6010991243020599
|
| 611 |
+
},
|
| 612 |
+
"score": 0.5,
|
| 613 |
+
"score_name": "subsets_mean",
|
| 614 |
+
"num_of_instances": 100
|
| 615 |
+
},
|
| 616 |
+
"product_help": {
|
| 617 |
+
"cfpb_product_2023": {
|
| 618 |
+
"f1_macro": 0.7207043858492499,
|
| 619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9253731343283582,
|
| 620 |
+
"f1_money transfer or virtual currency or money service": 0.8,
|
| 621 |
+
"f1_mortgage": 0.6666666666666666,
|
| 622 |
+
"f1_credit card or prepaid card": 0.7058823529411765,
|
| 623 |
+
"f1_debt collection": 0.7777777777777778,
|
| 624 |
+
"f1_checking or savings account": 0.7692307692307693,
|
| 625 |
+
"f1_payday loan or title loan or personal loan": 0.4,
|
| 626 |
+
"f1_macro_ci_low": 0.4815679833092744,
|
| 627 |
+
"f1_macro_ci_high": 0.8364352565324512,
|
| 628 |
+
"score_name": "f1_micro",
|
| 629 |
+
"score": 0.8615384615384616,
|
| 630 |
+
"score_ci_high": 0.9137055837563451,
|
| 631 |
+
"score_ci_low": 0.7635197536237738,
|
| 632 |
+
"num_of_instances": 100,
|
| 633 |
+
"accuracy": 0.84,
|
| 634 |
+
"accuracy_ci_low": 0.75,
|
| 635 |
+
"accuracy_ci_high": 0.9,
|
| 636 |
+
"f1_micro": 0.8615384615384616,
|
| 637 |
+
"f1_micro_ci_low": 0.7635197536237738,
|
| 638 |
+
"f1_micro_ci_high": 0.9137055837563451
|
| 639 |
+
},
|
| 640 |
+
"cfpb_product_watsonx": {
|
| 641 |
+
"f1_macro": 0.6270669427191166,
|
| 642 |
+
"f1_mortgages and loans": 0.7619047619047619,
|
| 643 |
+
"f1_credit card": 0.5,
|
| 644 |
+
"f1_credit reporting": 0.6956521739130435,
|
| 645 |
+
"f1_debt collection": 0.7777777777777778,
|
| 646 |
+
"f1_retail banking": 0.4,
|
| 647 |
+
"f1_macro_ci_low": 0.4966140372134347,
|
| 648 |
+
"f1_macro_ci_high": 0.7555085024990378,
|
| 649 |
+
"score_name": "f1_micro",
|
| 650 |
+
"score": 0.6391752577319587,
|
| 651 |
+
"score_ci_high": 0.7609225921717959,
|
| 652 |
+
"score_ci_low": 0.5,
|
| 653 |
+
"num_of_instances": 50,
|
| 654 |
+
"accuracy": 0.62,
|
| 655 |
+
"accuracy_ci_low": 0.48,
|
| 656 |
+
"accuracy_ci_high": 0.74,
|
| 657 |
+
"f1_micro": 0.6391752577319587,
|
| 658 |
+
"f1_micro_ci_low": 0.5,
|
| 659 |
+
"f1_micro_ci_high": 0.7609225921717959
|
| 660 |
+
},
|
| 661 |
+
"score": 0.7503568596352101,
|
| 662 |
+
"score_name": "subsets_mean",
|
| 663 |
+
"num_of_instances": 150
|
| 664 |
+
},
|
| 665 |
+
"qa_finance": {
|
| 666 |
+
"fin_qa": {
|
| 667 |
+
"num_of_instances": 100,
|
| 668 |
+
"execution_accuracy": 0.15,
|
| 669 |
+
"program_accuracy": 0.16,
|
| 670 |
+
"score": 0.16,
|
| 671 |
+
"score_name": "program_accuracy",
|
| 672 |
+
"execution_accuracy_ci_low": 0.09,
|
| 673 |
+
"execution_accuracy_ci_high": 0.23571967026025617,
|
| 674 |
+
"program_accuracy_ci_low": 0.1,
|
| 675 |
+
"program_accuracy_ci_high": 0.24,
|
| 676 |
+
"score_ci_low": 0.1,
|
| 677 |
+
"score_ci_high": 0.24
|
| 678 |
+
},
|
| 679 |
+
"score": 0.16,
|
| 680 |
+
"score_name": "subsets_mean",
|
| 681 |
+
"num_of_instances": 100
|
| 682 |
+
},
|
| 683 |
+
"rag_general": {
|
| 684 |
+
"rag_response_generation_clapnq": {
|
| 685 |
+
"precision": 0.5212876936378656,
|
| 686 |
+
"recall": 0.6578052424371036,
|
| 687 |
+
"f1": 0.5396867565070588,
|
| 688 |
+
"precision_ci_low": 0.4809046415775284,
|
| 689 |
+
"precision_ci_high": 0.5620677262313715,
|
| 690 |
+
"recall_ci_low": 0.6172772682646432,
|
| 691 |
+
"recall_ci_high": 0.6934399487258313,
|
| 692 |
+
"f1_ci_low": 0.5109945288978699,
|
| 693 |
+
"f1_ci_high": 0.5735415710358195,
|
| 694 |
+
"score_name": "f1",
|
| 695 |
+
"score": 0.5396867565070588,
|
| 696 |
+
"score_ci_high": 0.5735415710358195,
|
| 697 |
+
"score_ci_low": 0.5109945288978699,
|
| 698 |
+
"num_of_instances": 100,
|
| 699 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.7192348712682723,
|
| 700 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.7497482949495315,
|
| 701 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.7014025217294693,
|
| 702 |
+
"faithfullness_f1_token_overlap": 0.47668538259414794,
|
| 703 |
+
"faithfullness_recall_token_overlap": 0.37725895304401214,
|
| 704 |
+
"faithfullness_precision_token_overlap": 0.792319814348149,
|
| 705 |
+
"correctness_f1_token_overlap": 0.5396867565070588,
|
| 706 |
+
"correctness_recall_token_overlap": 0.6578052424371036,
|
| 707 |
+
"correctness_precision_token_overlap": 0.5212876936378656
|
| 708 |
+
},
|
| 709 |
+
"score": 0.5396867565070588,
|
| 710 |
+
"score_name": "subsets_mean",
|
| 711 |
+
"num_of_instances": 100
|
| 712 |
+
},
|
| 713 |
+
"reasoning": {
|
| 714 |
+
"hellaswag": {
|
| 715 |
+
"accuracy": 0.46,
|
| 716 |
+
"accuracy_ci_low": 0.3643470807957145,
|
| 717 |
+
"accuracy_ci_high": 0.5596197667548154,
|
| 718 |
+
"score_name": "accuracy",
|
| 719 |
+
"score": 0.46,
|
| 720 |
+
"score_ci_high": 0.5596197667548154,
|
| 721 |
+
"score_ci_low": 0.3643470807957145,
|
| 722 |
+
"num_of_instances": 100
|
| 723 |
+
},
|
| 724 |
+
"openbook_qa": {
|
| 725 |
+
"accuracy": 0.85,
|
| 726 |
+
"accuracy_ci_low": 0.77,
|
| 727 |
+
"accuracy_ci_high": 0.91,
|
| 728 |
+
"score_name": "accuracy",
|
| 729 |
+
"score": 0.85,
|
| 730 |
+
"score_ci_high": 0.91,
|
| 731 |
+
"score_ci_low": 0.77,
|
| 732 |
+
"num_of_instances": 100
|
| 733 |
+
},
|
| 734 |
+
"score": 0.655,
|
| 735 |
+
"score_name": "subsets_mean",
|
| 736 |
+
"num_of_instances": 200
|
| 737 |
+
},
|
| 738 |
+
"safety": {
|
| 739 |
+
"attaq_500": {
|
| 740 |
+
"severity_critical": 0.0,
|
| 741 |
+
"severity_high": 0.0,
|
| 742 |
+
"severity_medium": 1.0,
|
| 743 |
+
"severity_low": 99.0,
|
| 744 |
+
"category_harmful_info": 0.8925065078314335,
|
| 745 |
+
"category_deception": 0.9381265209028213,
|
| 746 |
+
"category_explicit_content": 0.8503636385154242,
|
| 747 |
+
"category_violence": 0.8601648936432505,
|
| 748 |
+
"category_discrimination": 0.8948557959307754,
|
| 749 |
+
"category_substance_abuse": 0.9021706441210376,
|
| 750 |
+
"category_pii": 0.8278552542130152,
|
| 751 |
+
"safety": 0.8802775480515426,
|
| 752 |
+
"safety_ci_low": 0.8583081755069534,
|
| 753 |
+
"safety_ci_high": 0.9001231887506078,
|
| 754 |
+
"score_name": "safety",
|
| 755 |
+
"score": 0.8802775480515426,
|
| 756 |
+
"score_ci_high": 0.9001231887506078,
|
| 757 |
+
"score_ci_low": 0.8583081755069534,
|
| 758 |
+
"num_of_instances": 100
|
| 759 |
+
},
|
| 760 |
+
"score": 0.8802775480515426,
|
| 761 |
+
"score_name": "subsets_mean",
|
| 762 |
+
"num_of_instances": 100
|
| 763 |
+
},
|
| 764 |
+
"summarization": {
|
| 765 |
+
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
+
"num_of_instances": 100,
|
| 767 |
+
"rougeL": 0.283793962737229,
|
| 768 |
+
"score": 0.283793962737229,
|
| 769 |
+
"score_name": "rougeL",
|
| 770 |
+
"rougeLsum": 0.35399993809096103,
|
| 771 |
+
"rouge1": 0.41485236441679585,
|
| 772 |
+
"rouge2": 0.19594736920885997,
|
| 773 |
+
"rougeL_ci_low": 0.26505929835848524,
|
| 774 |
+
"rougeL_ci_high": 0.30137043303436234,
|
| 775 |
+
"score_ci_low": 0.26505929835848524,
|
| 776 |
+
"score_ci_high": 0.30137043303436234,
|
| 777 |
+
"rougeLsum_ci_low": 0.3318357775824492,
|
| 778 |
+
"rougeLsum_ci_high": 0.37317376467108154,
|
| 779 |
+
"rouge1_ci_low": 0.38944020954394787,
|
| 780 |
+
"rouge1_ci_high": 0.4348179076446575,
|
| 781 |
+
"rouge2_ci_low": 0.17823964075516807,
|
| 782 |
+
"rouge2_ci_high": 0.2137693144513423
|
| 783 |
+
},
|
| 784 |
+
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
+
"num_of_instances": 100,
|
| 786 |
+
"rougeL": 0.07662085019609605,
|
| 787 |
+
"score": 0.07662085019609605,
|
| 788 |
+
"score_name": "rougeL",
|
| 789 |
+
"rougeLsum": 0.0861008326275811,
|
| 790 |
+
"rouge1": 0.10327424535868773,
|
| 791 |
+
"rouge2": 0.013891283076053876,
|
| 792 |
+
"rougeL_ci_low": 0.06625536309669923,
|
| 793 |
+
"rougeL_ci_high": 0.08742816549707229,
|
| 794 |
+
"score_ci_low": 0.06625536309669923,
|
| 795 |
+
"score_ci_high": 0.08742816549707229,
|
| 796 |
+
"rougeLsum_ci_low": 0.0747762244390909,
|
| 797 |
+
"rougeLsum_ci_high": 0.09831815623274179,
|
| 798 |
+
"rouge1_ci_low": 0.08913123349846178,
|
| 799 |
+
"rouge1_ci_high": 0.1189079844960333,
|
| 800 |
+
"rouge2_ci_low": 0.009795628030972649,
|
| 801 |
+
"rouge2_ci_high": 0.019088127419088847
|
| 802 |
+
},
|
| 803 |
+
"score": 0.18020740646666253,
|
| 804 |
+
"score_name": "subsets_mean",
|
| 805 |
+
"num_of_instances": 200
|
| 806 |
+
},
|
| 807 |
+
"translation": {
|
| 808 |
+
"mt_flores_101_ara_eng": {
|
| 809 |
+
"num_of_instances": 6,
|
| 810 |
+
"counts": [
|
| 811 |
+
144,
|
| 812 |
+
91,
|
| 813 |
+
62,
|
| 814 |
+
43
|
| 815 |
+
],
|
| 816 |
+
"totals": [
|
| 817 |
+
702,
|
| 818 |
+
696,
|
| 819 |
+
690,
|
| 820 |
+
684
|
| 821 |
+
],
|
| 822 |
+
"precisions": [
|
| 823 |
+
0.20512820512820515,
|
| 824 |
+
0.1307471264367816,
|
| 825 |
+
0.08985507246376813,
|
| 826 |
+
0.06286549707602339
|
| 827 |
+
],
|
| 828 |
+
"bp": 1.0,
|
| 829 |
+
"sys_len": 702,
|
| 830 |
+
"ref_len": 208,
|
| 831 |
+
"sacrebleu": 0.11094382152385164,
|
| 832 |
+
"score": 0.11094382152385164,
|
| 833 |
+
"score_name": "sacrebleu",
|
| 834 |
+
"score_ci_low": 0.0430216194039613,
|
| 835 |
+
"score_ci_high": 0.4258937195147393,
|
| 836 |
+
"sacrebleu_ci_low": 0.0430216194039613,
|
| 837 |
+
"sacrebleu_ci_high": 0.4258937195147393
|
| 838 |
+
},
|
| 839 |
+
"mt_flores_101_deu_eng": {
|
| 840 |
+
"num_of_instances": 6,
|
| 841 |
+
"counts": [
|
| 842 |
+
135,
|
| 843 |
+
83,
|
| 844 |
+
52,
|
| 845 |
+
32
|
| 846 |
+
],
|
| 847 |
+
"totals": [
|
| 848 |
+
303,
|
| 849 |
+
297,
|
| 850 |
+
291,
|
| 851 |
+
285
|
| 852 |
+
],
|
| 853 |
+
"precisions": [
|
| 854 |
+
0.44554455445544555,
|
| 855 |
+
0.27946127946127947,
|
| 856 |
+
0.17869415807560138,
|
| 857 |
+
0.11228070175438595
|
| 858 |
+
],
|
| 859 |
+
"bp": 1.0,
|
| 860 |
+
"sys_len": 303,
|
| 861 |
+
"ref_len": 208,
|
| 862 |
+
"sacrebleu": 0.22356667304067482,
|
| 863 |
+
"score": 0.22356667304067482,
|
| 864 |
+
"score_name": "sacrebleu",
|
| 865 |
+
"score_ci_low": 0.08578859861530164,
|
| 866 |
+
"score_ci_high": 0.4405729805595396,
|
| 867 |
+
"sacrebleu_ci_low": 0.08578859861530164,
|
| 868 |
+
"sacrebleu_ci_high": 0.4405729805595396
|
| 869 |
+
},
|
| 870 |
+
"mt_flores_101_eng_ara": {
|
| 871 |
+
"num_of_instances": 6,
|
| 872 |
+
"counts": [
|
| 873 |
+
99,
|
| 874 |
+
40,
|
| 875 |
+
18,
|
| 876 |
+
9
|
| 877 |
+
],
|
| 878 |
+
"totals": [
|
| 879 |
+
246,
|
| 880 |
+
240,
|
| 881 |
+
234,
|
| 882 |
+
228
|
| 883 |
+
],
|
| 884 |
+
"precisions": [
|
| 885 |
+
0.40243902439024387,
|
| 886 |
+
0.16666666666666669,
|
| 887 |
+
0.07692307692307693,
|
| 888 |
+
0.039473684210526314
|
| 889 |
+
],
|
| 890 |
+
"bp": 1.0,
|
| 891 |
+
"sys_len": 246,
|
| 892 |
+
"ref_len": 209,
|
| 893 |
+
"sacrebleu": 0.11946158890640456,
|
| 894 |
+
"score": 0.11946158890640456,
|
| 895 |
+
"score_name": "sacrebleu",
|
| 896 |
+
"score_ci_low": 0.06603109486053045,
|
| 897 |
+
"score_ci_high": 0.16414996430005616,
|
| 898 |
+
"sacrebleu_ci_low": 0.06603109486053045,
|
| 899 |
+
"sacrebleu_ci_high": 0.16414996430005616
|
| 900 |
+
},
|
| 901 |
+
"mt_flores_101_eng_deu": {
|
| 902 |
+
"num_of_instances": 6,
|
| 903 |
+
"counts": [
|
| 904 |
+
129,
|
| 905 |
+
68,
|
| 906 |
+
40,
|
| 907 |
+
23
|
| 908 |
+
],
|
| 909 |
+
"totals": [
|
| 910 |
+
371,
|
| 911 |
+
365,
|
| 912 |
+
359,
|
| 913 |
+
353
|
| 914 |
+
],
|
| 915 |
+
"precisions": [
|
| 916 |
+
0.3477088948787062,
|
| 917 |
+
0.1863013698630137,
|
| 918 |
+
0.11142061281337048,
|
| 919 |
+
0.06515580736543909
|
| 920 |
+
],
|
| 921 |
+
"bp": 1.0,
|
| 922 |
+
"sys_len": 371,
|
| 923 |
+
"ref_len": 216,
|
| 924 |
+
"sacrebleu": 0.1472609611005195,
|
| 925 |
+
"score": 0.1472609611005195,
|
| 926 |
+
"score_name": "sacrebleu",
|
| 927 |
+
"score_ci_low": 0.10737963346465397,
|
| 928 |
+
"score_ci_high": 0.22901389704600852,
|
| 929 |
+
"sacrebleu_ci_low": 0.10737963346465397,
|
| 930 |
+
"sacrebleu_ci_high": 0.22901389704600852
|
| 931 |
+
},
|
| 932 |
+
"mt_flores_101_eng_fra": {
|
| 933 |
+
"num_of_instances": 6,
|
| 934 |
+
"counts": [
|
| 935 |
+
165,
|
| 936 |
+
108,
|
| 937 |
+
78,
|
| 938 |
+
58
|
| 939 |
+
],
|
| 940 |
+
"totals": [
|
| 941 |
+
315,
|
| 942 |
+
309,
|
| 943 |
+
303,
|
| 944 |
+
297
|
| 945 |
+
],
|
| 946 |
+
"precisions": [
|
| 947 |
+
0.5238095238095238,
|
| 948 |
+
0.34951456310679613,
|
| 949 |
+
0.25742574257425743,
|
| 950 |
+
0.19528619528619529
|
| 951 |
+
],
|
| 952 |
+
"bp": 1.0,
|
| 953 |
+
"sys_len": 315,
|
| 954 |
+
"ref_len": 235,
|
| 955 |
+
"sacrebleu": 0.3097351874987371,
|
| 956 |
+
"score": 0.3097351874987371,
|
| 957 |
+
"score_name": "sacrebleu",
|
| 958 |
+
"score_ci_low": 0.19047973841407545,
|
| 959 |
+
"score_ci_high": 0.42375919110879456,
|
| 960 |
+
"sacrebleu_ci_low": 0.19047973841407545,
|
| 961 |
+
"sacrebleu_ci_high": 0.42375919110879456
|
| 962 |
+
},
|
| 963 |
+
"mt_flores_101_eng_kor": {
|
| 964 |
+
"num_of_instances": 6,
|
| 965 |
+
"counts": [
|
| 966 |
+
139,
|
| 967 |
+
68,
|
| 968 |
+
41,
|
| 969 |
+
25
|
| 970 |
+
],
|
| 971 |
+
"totals": [
|
| 972 |
+
374,
|
| 973 |
+
368,
|
| 974 |
+
362,
|
| 975 |
+
356
|
| 976 |
+
],
|
| 977 |
+
"precisions": [
|
| 978 |
+
0.3716577540106952,
|
| 979 |
+
0.1847826086956522,
|
| 980 |
+
0.1132596685082873,
|
| 981 |
+
0.07022471910112359
|
| 982 |
+
],
|
| 983 |
+
"bp": 1.0,
|
| 984 |
+
"sys_len": 374,
|
| 985 |
+
"ref_len": 249,
|
| 986 |
+
"sacrebleu": 0.15287708643378742,
|
| 987 |
+
"score": 0.15287708643378742,
|
| 988 |
+
"score_name": "sacrebleu",
|
| 989 |
+
"score_ci_low": 0.09848546235015815,
|
| 990 |
+
"score_ci_high": 0.20815357764237413,
|
| 991 |
+
"sacrebleu_ci_low": 0.09848546235015815,
|
| 992 |
+
"sacrebleu_ci_high": 0.20815357764237413
|
| 993 |
+
},
|
| 994 |
+
"mt_flores_101_eng_por": {
|
| 995 |
+
"num_of_instances": 6,
|
| 996 |
+
"counts": [
|
| 997 |
+
170,
|
| 998 |
+
124,
|
| 999 |
+
102,
|
| 1000 |
+
85
|
| 1001 |
+
],
|
| 1002 |
+
"totals": [
|
| 1003 |
+
601,
|
| 1004 |
+
595,
|
| 1005 |
+
589,
|
| 1006 |
+
583
|
| 1007 |
+
],
|
| 1008 |
+
"precisions": [
|
| 1009 |
+
0.2828618968386023,
|
| 1010 |
+
0.20840336134453782,
|
| 1011 |
+
0.1731748726655348,
|
| 1012 |
+
0.14579759862778732
|
| 1013 |
+
],
|
| 1014 |
+
"bp": 1.0,
|
| 1015 |
+
"sys_len": 601,
|
| 1016 |
+
"ref_len": 222,
|
| 1017 |
+
"sacrebleu": 0.1964167877420127,
|
| 1018 |
+
"score": 0.1964167877420127,
|
| 1019 |
+
"score_name": "sacrebleu",
|
| 1020 |
+
"score_ci_low": 0.07270540057393714,
|
| 1021 |
+
"score_ci_high": 0.518665491959237,
|
| 1022 |
+
"sacrebleu_ci_low": 0.07270540057393714,
|
| 1023 |
+
"sacrebleu_ci_high": 0.518665491959237
|
| 1024 |
+
},
|
| 1025 |
+
"mt_flores_101_eng_ron": {
|
| 1026 |
+
"num_of_instances": 6,
|
| 1027 |
+
"counts": [
|
| 1028 |
+
117,
|
| 1029 |
+
56,
|
| 1030 |
+
35,
|
| 1031 |
+
24
|
| 1032 |
+
],
|
| 1033 |
+
"totals": [
|
| 1034 |
+
377,
|
| 1035 |
+
371,
|
| 1036 |
+
365,
|
| 1037 |
+
359
|
| 1038 |
+
],
|
| 1039 |
+
"precisions": [
|
| 1040 |
+
0.3103448275862069,
|
| 1041 |
+
0.1509433962264151,
|
| 1042 |
+
0.0958904109589041,
|
| 1043 |
+
0.06685236768802229
|
| 1044 |
+
],
|
| 1045 |
+
"bp": 1.0,
|
| 1046 |
+
"sys_len": 377,
|
| 1047 |
+
"ref_len": 230,
|
| 1048 |
+
"sacrebleu": 0.13163993236575622,
|
| 1049 |
+
"score": 0.13163993236575622,
|
| 1050 |
+
"score_name": "sacrebleu",
|
| 1051 |
+
"score_ci_low": 0.046393246503418153,
|
| 1052 |
+
"score_ci_high": 0.2006802995057917,
|
| 1053 |
+
"sacrebleu_ci_low": 0.046393246503418153,
|
| 1054 |
+
"sacrebleu_ci_high": 0.2006802995057917
|
| 1055 |
+
},
|
| 1056 |
+
"mt_flores_101_eng_spa": {
|
| 1057 |
+
"num_of_instances": 6,
|
| 1058 |
+
"counts": [
|
| 1059 |
+
159,
|
| 1060 |
+
89,
|
| 1061 |
+
56,
|
| 1062 |
+
35
|
| 1063 |
+
],
|
| 1064 |
+
"totals": [
|
| 1065 |
+
353,
|
| 1066 |
+
347,
|
| 1067 |
+
341,
|
| 1068 |
+
335
|
| 1069 |
+
],
|
| 1070 |
+
"precisions": [
|
| 1071 |
+
0.45042492917847027,
|
| 1072 |
+
0.2564841498559078,
|
| 1073 |
+
0.16422287390029325,
|
| 1074 |
+
0.10447761194029852
|
| 1075 |
+
],
|
| 1076 |
+
"bp": 1.0,
|
| 1077 |
+
"sys_len": 353,
|
| 1078 |
+
"ref_len": 243,
|
| 1079 |
+
"sacrebleu": 0.21100121642971456,
|
| 1080 |
+
"score": 0.21100121642971456,
|
| 1081 |
+
"score_name": "sacrebleu",
|
| 1082 |
+
"score_ci_low": 0.15841899383396243,
|
| 1083 |
+
"score_ci_high": 0.28733352469932727,
|
| 1084 |
+
"sacrebleu_ci_low": 0.15841899383396243,
|
| 1085 |
+
"sacrebleu_ci_high": 0.28733352469932727
|
| 1086 |
+
},
|
| 1087 |
+
"mt_flores_101_fra_eng": {
|
| 1088 |
+
"num_of_instances": 6,
|
| 1089 |
+
"counts": [
|
| 1090 |
+
161,
|
| 1091 |
+
110,
|
| 1092 |
+
82,
|
| 1093 |
+
61
|
| 1094 |
+
],
|
| 1095 |
+
"totals": [
|
| 1096 |
+
403,
|
| 1097 |
+
397,
|
| 1098 |
+
391,
|
| 1099 |
+
385
|
| 1100 |
+
],
|
| 1101 |
+
"precisions": [
|
| 1102 |
+
0.39950372208436724,
|
| 1103 |
+
0.2770780856423174,
|
| 1104 |
+
0.20971867007672634,
|
| 1105 |
+
0.15844155844155844
|
| 1106 |
+
],
|
| 1107 |
+
"bp": 1.0,
|
| 1108 |
+
"sys_len": 403,
|
| 1109 |
+
"ref_len": 208,
|
| 1110 |
+
"sacrebleu": 0.2462676137243911,
|
| 1111 |
+
"score": 0.2462676137243911,
|
| 1112 |
+
"score_name": "sacrebleu",
|
| 1113 |
+
"score_ci_low": 0.11679608311837608,
|
| 1114 |
+
"score_ci_high": 0.434165455858711,
|
| 1115 |
+
"sacrebleu_ci_low": 0.11679608311837608,
|
| 1116 |
+
"sacrebleu_ci_high": 0.434165455858711
|
| 1117 |
+
},
|
| 1118 |
+
"mt_flores_101_jpn_eng": {
|
| 1119 |
+
"num_of_instances": 6,
|
| 1120 |
+
"counts": [
|
| 1121 |
+
121,
|
| 1122 |
+
64,
|
| 1123 |
+
38,
|
| 1124 |
+
21
|
| 1125 |
+
],
|
| 1126 |
+
"totals": [
|
| 1127 |
+
270,
|
| 1128 |
+
264,
|
| 1129 |
+
258,
|
| 1130 |
+
252
|
| 1131 |
+
],
|
| 1132 |
+
"precisions": [
|
| 1133 |
+
0.4481481481481482,
|
| 1134 |
+
0.24242424242424243,
|
| 1135 |
+
0.14728682170542634,
|
| 1136 |
+
0.08333333333333334
|
| 1137 |
+
],
|
| 1138 |
+
"bp": 1.0,
|
| 1139 |
+
"sys_len": 270,
|
| 1140 |
+
"ref_len": 208,
|
| 1141 |
+
"sacrebleu": 0.19109313021092122,
|
| 1142 |
+
"score": 0.19109313021092122,
|
| 1143 |
+
"score_name": "sacrebleu",
|
| 1144 |
+
"score_ci_low": 0.0823173685610723,
|
| 1145 |
+
"score_ci_high": 0.26847845369224815,
|
| 1146 |
+
"sacrebleu_ci_low": 0.0823173685610723,
|
| 1147 |
+
"sacrebleu_ci_high": 0.26847845369224815
|
| 1148 |
+
},
|
| 1149 |
+
"mt_flores_101_kor_eng": {
|
| 1150 |
+
"num_of_instances": 6,
|
| 1151 |
+
"counts": [
|
| 1152 |
+
118,
|
| 1153 |
+
51,
|
| 1154 |
+
24,
|
| 1155 |
+
10
|
| 1156 |
+
],
|
| 1157 |
+
"totals": [
|
| 1158 |
+
295,
|
| 1159 |
+
289,
|
| 1160 |
+
283,
|
| 1161 |
+
277
|
| 1162 |
+
],
|
| 1163 |
+
"precisions": [
|
| 1164 |
+
0.4,
|
| 1165 |
+
0.17647058823529413,
|
| 1166 |
+
0.08480565371024734,
|
| 1167 |
+
0.036101083032490974
|
| 1168 |
+
],
|
| 1169 |
+
"bp": 1.0,
|
| 1170 |
+
"sys_len": 295,
|
| 1171 |
+
"ref_len": 208,
|
| 1172 |
+
"sacrebleu": 0.12124653620698017,
|
| 1173 |
+
"score": 0.12124653620698017,
|
| 1174 |
+
"score_name": "sacrebleu",
|
| 1175 |
+
"score_ci_low": 0.028053817072628193,
|
| 1176 |
+
"score_ci_high": 0.22022584763455738,
|
| 1177 |
+
"sacrebleu_ci_low": 0.028053817072628193,
|
| 1178 |
+
"sacrebleu_ci_high": 0.22022584763455738
|
| 1179 |
+
},
|
| 1180 |
+
"mt_flores_101_por_eng": {
|
| 1181 |
+
"num_of_instances": 6,
|
| 1182 |
+
"counts": [
|
| 1183 |
+
154,
|
| 1184 |
+
101,
|
| 1185 |
+
68,
|
| 1186 |
+
46
|
| 1187 |
+
],
|
| 1188 |
+
"totals": [
|
| 1189 |
+
213,
|
| 1190 |
+
207,
|
| 1191 |
+
201,
|
| 1192 |
+
195
|
| 1193 |
+
],
|
| 1194 |
+
"precisions": [
|
| 1195 |
+
0.7230046948356808,
|
| 1196 |
+
0.48792270531400966,
|
| 1197 |
+
0.33830845771144274,
|
| 1198 |
+
0.23589743589743592
|
| 1199 |
+
],
|
| 1200 |
+
"bp": 1.0,
|
| 1201 |
+
"sys_len": 213,
|
| 1202 |
+
"ref_len": 208,
|
| 1203 |
+
"sacrebleu": 0.4096208508449147,
|
| 1204 |
+
"score": 0.4096208508449147,
|
| 1205 |
+
"score_name": "sacrebleu",
|
| 1206 |
+
"score_ci_low": 0.32602690169539184,
|
| 1207 |
+
"score_ci_high": 0.5203750517974051,
|
| 1208 |
+
"sacrebleu_ci_low": 0.32602690169539184,
|
| 1209 |
+
"sacrebleu_ci_high": 0.5203750517974051
|
| 1210 |
+
},
|
| 1211 |
+
"mt_flores_101_ron_eng": {
|
| 1212 |
+
"num_of_instances": 6,
|
| 1213 |
+
"counts": [
|
| 1214 |
+
158,
|
| 1215 |
+
99,
|
| 1216 |
+
67,
|
| 1217 |
+
47
|
| 1218 |
+
],
|
| 1219 |
+
"totals": [
|
| 1220 |
+
1096,
|
| 1221 |
+
1090,
|
| 1222 |
+
1084,
|
| 1223 |
+
1078
|
| 1224 |
+
],
|
| 1225 |
+
"precisions": [
|
| 1226 |
+
0.14416058394160586,
|
| 1227 |
+
0.09082568807339449,
|
| 1228 |
+
0.061808118081180814,
|
| 1229 |
+
0.043599257884972174
|
| 1230 |
+
],
|
| 1231 |
+
"bp": 1.0,
|
| 1232 |
+
"sys_len": 1096,
|
| 1233 |
+
"ref_len": 208,
|
| 1234 |
+
"sacrebleu": 0.07707170412963754,
|
| 1235 |
+
"score": 0.07707170412963754,
|
| 1236 |
+
"score_name": "sacrebleu",
|
| 1237 |
+
"score_ci_low": 0.032135248195696395,
|
| 1238 |
+
"score_ci_high": 0.22292436255792658,
|
| 1239 |
+
"sacrebleu_ci_low": 0.032135248195696395,
|
| 1240 |
+
"sacrebleu_ci_high": 0.22292436255792658
|
| 1241 |
+
},
|
| 1242 |
+
"mt_flores_101_spa_eng": {
|
| 1243 |
+
"num_of_instances": 6,
|
| 1244 |
+
"counts": [
|
| 1245 |
+
152,
|
| 1246 |
+
98,
|
| 1247 |
+
68,
|
| 1248 |
+
46
|
| 1249 |
+
],
|
| 1250 |
+
"totals": [
|
| 1251 |
+
211,
|
| 1252 |
+
205,
|
| 1253 |
+
199,
|
| 1254 |
+
193
|
| 1255 |
+
],
|
| 1256 |
+
"precisions": [
|
| 1257 |
+
0.7203791469194313,
|
| 1258 |
+
0.47804878048780486,
|
| 1259 |
+
0.3417085427135678,
|
| 1260 |
+
0.23834196891191708
|
| 1261 |
+
],
|
| 1262 |
+
"bp": 1.0,
|
| 1263 |
+
"sys_len": 211,
|
| 1264 |
+
"ref_len": 208,
|
| 1265 |
+
"sacrebleu": 0.40923467652036666,
|
| 1266 |
+
"score": 0.40923467652036666,
|
| 1267 |
+
"score_name": "sacrebleu",
|
| 1268 |
+
"score_ci_low": 0.2684007842080438,
|
| 1269 |
+
"score_ci_high": 0.47744647601879564,
|
| 1270 |
+
"sacrebleu_ci_low": 0.2684007842080438,
|
| 1271 |
+
"sacrebleu_ci_high": 0.47744647601879564
|
| 1272 |
+
},
|
| 1273 |
+
"score": 0.20382918444524467,
|
| 1274 |
+
"score_name": "subsets_mean",
|
| 1275 |
+
"num_of_instances": 90
|
| 1276 |
+
},
|
| 1277 |
+
"score": 0.4969259422148255,
|
| 1278 |
+
"score_name": "subsets_mean",
|
| 1279 |
+
"num_of_instances": 1537
|
| 1280 |
+
}
|
| 1281 |
+
}
|
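Note on the mt_flores_101_* entries above: each one reports the raw BLEU components (per-order n-gram "counts", "totals", "precisions", the brevity penalty "bp", and "sys_len"/"ref_len") alongside the aggregated "sacrebleu" value. The following is a minimal illustrative sketch, not part of the result files and not the evaluator's own code, showing how such components combine under the standard BLEU-4 formula, assuming the JSON fields are used exactly as named:

```python
import math

def bleu_from_components(precisions, sys_len, ref_len):
    # Brevity penalty: 1.0 when the hypothesis is at least as long as the
    # reference, otherwise exp(1 - ref_len / sys_len).
    bp = 1.0 if sys_len >= ref_len else math.exp(1.0 - ref_len / sys_len)
    # Geometric mean of the 1- to 4-gram precisions
    # (zero precisions are not handled in this sketch).
    geo_mean = math.exp(sum(math.log(p) for p in precisions) / len(precisions))
    return bp * geo_mean

# Components copied from the mt_flores_101_eng_fra entry above:
precisions = [0.5238095238095238, 0.34951456310679613,
              0.25742574257425743, 0.19528619528619529]
# Prints ~0.3097, matching the reported "sacrebleu" value for that entry.
print(bleu_from_components(precisions, sys_len=315, ref_len=235))
```

The same recombination reproduces the other entries listed above (tokenization and n-gram counting happen upstream, when the counts and totals are produced).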
results/bluebench/{2025-07-02T16-08-27_evaluation_results.json → 2025-07-03T08-48-01_evaluation_results.json}
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"environment_info": {
|
| 3 |
-
"timestamp_utc": "2025-07-
|
| 4 |
"command_line_invocation": [
|
| 5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
"--tasks",
|
|
@@ -42,7 +42,7 @@
|
|
| 42 |
"cache_dir": null
|
| 43 |
},
|
| 44 |
"unitxt_version": "1.25.0",
|
| 45 |
-
"unitxt_commit_hash": "
|
| 46 |
"python_version": "3.10.18",
|
| 47 |
"system": "Linux",
|
| 48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
@@ -176,12 +176,12 @@
|
|
| 176 |
"results": {
|
| 177 |
"bias": {
|
| 178 |
"safety_bbq_age": {
|
| 179 |
-
"accuracy": 0.
|
| 180 |
"accuracy_ci_low": 0.1111111111111111,
|
| 181 |
-
"accuracy_ci_high": 0.
|
| 182 |
"score_name": "accuracy",
|
| 183 |
-
"score": 0.
|
| 184 |
-
"score_ci_high": 0.
|
| 185 |
"score_ci_low": 0.1111111111111111,
|
| 186 |
"num_of_instances": 9
|
| 187 |
},
|
|
@@ -196,23 +196,23 @@
|
|
| 196 |
"num_of_instances": 9
|
| 197 |
},
|
| 198 |
"safety_bbq_gender_identity": {
|
| 199 |
-
"accuracy": 0.
|
| 200 |
-
"accuracy_ci_low": 0.
|
| 201 |
-
"accuracy_ci_high": 0.
|
| 202 |
"score_name": "accuracy",
|
| 203 |
-
"score": 0.
|
| 204 |
-
"score_ci_high": 0.
|
| 205 |
-
"score_ci_low": 0.
|
| 206 |
"num_of_instances": 9
|
| 207 |
},
|
| 208 |
"safety_bbq_nationality": {
|
| 209 |
-
"accuracy": 0.
|
| 210 |
-
"accuracy_ci_low": 0.
|
| 211 |
-
"accuracy_ci_high": 0
|
| 212 |
"score_name": "accuracy",
|
| 213 |
-
"score": 0.
|
| 214 |
-
"score_ci_high": 0
|
| 215 |
-
"score_ci_low": 0.
|
| 216 |
"num_of_instances": 9
|
| 217 |
},
|
| 218 |
"safety_bbq_physical_appearance": {
|
|
@@ -226,13 +226,13 @@
|
|
| 226 |
"num_of_instances": 9
|
| 227 |
},
|
| 228 |
"safety_bbq_race_ethnicity": {
|
| 229 |
-
"accuracy": 0.
|
| 230 |
-
"accuracy_ci_low": 0.
|
| 231 |
-
"accuracy_ci_high": 0.
|
| 232 |
"score_name": "accuracy",
|
| 233 |
-
"score": 0.
|
| 234 |
-
"score_ci_high": 0.
|
| 235 |
-
"score_ci_low": 0.
|
| 236 |
"num_of_instances": 9
|
| 237 |
},
|
| 238 |
"safety_bbq_race_x_gender": {
|
|
@@ -246,13 +246,13 @@
|
|
| 246 |
"num_of_instances": 9
|
| 247 |
},
|
| 248 |
"safety_bbq_race_x_ses": {
|
| 249 |
-
"accuracy": 0.
|
| 250 |
-
"accuracy_ci_low": 0.
|
| 251 |
-
"accuracy_ci_high": 0.
|
| 252 |
"score_name": "accuracy",
|
| 253 |
-
"score": 0.
|
| 254 |
-
"score_ci_high": 0.
|
| 255 |
-
"score_ci_low": 0.
|
| 256 |
"num_of_instances": 9
|
| 257 |
},
|
| 258 |
"safety_bbq_religion": {
|
|
@@ -292,35 +292,35 @@
|
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
| 296 |
-
"score": 0.
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
-
"score": 0.
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
-
"f1_Person": 0.
|
| 307 |
-
"f1_Organization": 0.
|
| 308 |
-
"f1_Location": 0.
|
| 309 |
-
"f1_macro": 0.
|
| 310 |
-
"recall_macro": 0.
|
| 311 |
-
"precision_macro": 0.
|
| 312 |
-
"in_classes_support": 0.
|
| 313 |
-
"f1_micro": 0.
|
| 314 |
-
"recall_micro": 0.
|
| 315 |
-
"precision_micro": 0.
|
| 316 |
-
"score": 0.
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
-
"score_ci_low": 0.
|
| 319 |
-
"score_ci_high": 0.
|
| 320 |
-
"f1_micro_ci_low": 0.
|
| 321 |
-
"f1_micro_ci_high": 0.
|
| 322 |
},
|
| 323 |
-
"score": 0.
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
|
@@ -338,31 +338,31 @@
|
|
| 338 |
"mmlu_pro_business": {
|
| 339 |
"accuracy": 0.14285714285714285,
|
| 340 |
"accuracy_ci_low": 0.0,
|
| 341 |
-
"accuracy_ci_high": 0.
|
| 342 |
"score_name": "accuracy",
|
| 343 |
"score": 0.14285714285714285,
|
| 344 |
-
"score_ci_high": 0.
|
| 345 |
"score_ci_low": 0.0,
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
| 349 |
-
"accuracy": 0.
|
| 350 |
"accuracy_ci_low": 0.0,
|
| 351 |
-
"accuracy_ci_high": 0.
|
| 352 |
"score_name": "accuracy",
|
| 353 |
-
"score": 0.
|
| 354 |
-
"score_ci_high": 0.
|
| 355 |
"score_ci_low": 0.0,
|
| 356 |
"num_of_instances": 7
|
| 357 |
},
|
| 358 |
"mmlu_pro_computer_science": {
|
| 359 |
-
"accuracy": 0.
|
| 360 |
-
"accuracy_ci_low": 0.
|
| 361 |
-
"accuracy_ci_high":
|
| 362 |
"score_name": "accuracy",
|
| 363 |
-
"score": 0.
|
| 364 |
-
"score_ci_high":
|
| 365 |
-
"score_ci_low": 0.
|
| 366 |
"num_of_instances": 7
|
| 367 |
},
|
| 368 |
"mmlu_pro_economics": {
|
|
@@ -386,12 +386,12 @@
|
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
| 389 |
-
"accuracy": 0.
|
| 390 |
"accuracy_ci_low": 0.0,
|
| 391 |
-
"accuracy_ci_high": 0.
|
| 392 |
"score_name": "accuracy",
|
| 393 |
-
"score": 0.
|
| 394 |
-
"score_ci_high": 0.
|
| 395 |
"score_ci_low": 0.0,
|
| 396 |
"num_of_instances": 7
|
| 397 |
},
|
|
@@ -406,11 +406,11 @@
|
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
-
"accuracy": 0.
|
| 410 |
"accuracy_ci_low": 0.14285714285714285,
|
| 411 |
"accuracy_ci_high": 0.8571428571428571,
|
| 412 |
"score_name": "accuracy",
|
| 413 |
-
"score": 0.
|
| 414 |
"score_ci_high": 0.8571428571428571,
|
| 415 |
"score_ci_low": 0.14285714285714285,
|
| 416 |
"num_of_instances": 7
|
|
@@ -426,12 +426,12 @@
|
|
| 426 |
"num_of_instances": 7
|
| 427 |
},
|
| 428 |
"mmlu_pro_other": {
|
| 429 |
-
"accuracy": 0.
|
| 430 |
"accuracy_ci_low": 0.0,
|
| 431 |
-
"accuracy_ci_high": 0.
|
| 432 |
"score_name": "accuracy",
|
| 433 |
-
"score": 0.
|
| 434 |
-
"score_ci_high": 0.
|
| 435 |
"score_ci_low": 0.0,
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
|
@@ -446,12 +446,12 @@
|
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
| 449 |
-
"accuracy": 0.
|
| 450 |
"accuracy_ci_low": 0.0,
|
| 451 |
-
"accuracy_ci_high": 0.
|
| 452 |
"score_name": "accuracy",
|
| 453 |
-
"score": 0.
|
| 454 |
-
"score_ci_high": 0.
|
| 455 |
"score_ci_low": 0.0,
|
| 456 |
"num_of_instances": 7
|
| 457 |
},
|
|
@@ -465,248 +465,248 @@
|
|
| 465 |
"score_ci_low": 0.0,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
-
"score": 0.
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
-
"f1_macro": 0.
|
| 475 |
"f1_suggestive": 0.16666666666666666,
|
| 476 |
-
"
|
| 477 |
-
"f1_generic": 0.
|
| 478 |
-
"
|
| 479 |
-
"
|
| 480 |
-
"f1_macro_ci_low": 0.
|
| 481 |
-
"f1_macro_ci_high": 0.
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
-
"score": 0.
|
| 484 |
-
"score_ci_high": 0.
|
| 485 |
-
"score_ci_low": 0.
|
| 486 |
"num_of_instances": 20,
|
| 487 |
-
"accuracy": 0.
|
| 488 |
-
"accuracy_ci_low": 0.
|
| 489 |
-
"accuracy_ci_high": 0.
|
| 490 |
-
"f1_micro": 0.
|
| 491 |
-
"f1_micro_ci_low": 0.
|
| 492 |
-
"f1_micro_ci_high": 0.
|
| 493 |
},
|
| 494 |
"legalbench_corporate_lobbying": {
|
| 495 |
-
"f1_macro": 0.
|
| 496 |
-
"f1_no": 0.
|
| 497 |
-
"f1_yes": 0.
|
| 498 |
-
"f1_macro_ci_low": 0.
|
| 499 |
-
"f1_macro_ci_high": 0.
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
-
"score": 0.
|
| 502 |
-
"score_ci_high": 0.
|
| 503 |
-
"score_ci_low": 0.
|
| 504 |
"num_of_instances": 20,
|
| 505 |
-
"accuracy": 0.
|
| 506 |
-
"accuracy_ci_low": 0.
|
| 507 |
-
"accuracy_ci_high": 0.
|
| 508 |
-
"f1_micro": 0.
|
| 509 |
-
"f1_micro_ci_low": 0.
|
| 510 |
-
"f1_micro_ci_high": 0.
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
-
"f1_macro": 0.
|
| 514 |
-
"f1_conclusion": 0.
|
| 515 |
-
"f1_analysis": 0.
|
| 516 |
"f1_decree": 0.0,
|
| 517 |
-
"f1_issue": 0.0,
|
| 518 |
"f1_facts": 0.0,
|
|
|
|
| 519 |
"f1_rule": 0.0,
|
| 520 |
"f1_procedural history": 0.0,
|
| 521 |
"f1_macro_ci_low": 0.0,
|
| 522 |
-
"f1_macro_ci_high": 0.
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
-
"score": 0.
|
| 525 |
-
"score_ci_high": 0.
|
| 526 |
"score_ci_low": 0.0,
|
| 527 |
"num_of_instances": 20,
|
| 528 |
"accuracy": 0.1,
|
| 529 |
"accuracy_ci_low": 0.0,
|
| 530 |
"accuracy_ci_high": 0.35,
|
| 531 |
-
"f1_micro": 0.
|
| 532 |
"f1_micro_ci_low": 0.0,
|
| 533 |
-
"f1_micro_ci_high": 0.
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
-
"f1_macro": 0.
|
| 537 |
"f1_yes": 0.5833333333333334,
|
| 538 |
-
"f1_no": 0.
|
| 539 |
-
"f1_macro_ci_low": 0.
|
| 540 |
-
"f1_macro_ci_high": 0.
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
-
"score": 0.
|
| 543 |
"score_ci_high": 0.7368421052631579,
|
| 544 |
"score_ci_low": 0.3076923076923077,
|
| 545 |
"num_of_instances": 20,
|
| 546 |
"accuracy": 0.5,
|
| 547 |
"accuracy_ci_low": 0.3,
|
| 548 |
"accuracy_ci_high": 0.7,
|
| 549 |
-
"f1_micro": 0.
|
| 550 |
"f1_micro_ci_low": 0.3076923076923077,
|
| 551 |
"f1_micro_ci_high": 0.7368421052631579
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
-
"f1_macro": 0.
|
| 555 |
-
"f1_yes": 0.
|
| 556 |
-
"f1_no": 0.
|
| 557 |
-
"f1_macro_ci_low": 0.
|
| 558 |
-
"f1_macro_ci_high": 0.
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
-
"score": 0.
|
| 561 |
-
"score_ci_high": 0.
|
| 562 |
-
"score_ci_low": 0.
|
| 563 |
"num_of_instances": 20,
|
| 564 |
-
"accuracy": 0.
|
| 565 |
-
"accuracy_ci_low": 0.
|
| 566 |
-
"accuracy_ci_high": 0.
|
| 567 |
-
"f1_micro": 0.
|
| 568 |
-
"f1_micro_ci_low": 0.
|
| 569 |
-
"f1_micro_ci_high": 0.
|
| 570 |
},
|
| 571 |
-
"score": 0.
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
-
"f1_macro": 0.
|
| 578 |
-
"f1_cars": 0.
|
| 579 |
"f1_windows x": 0.0,
|
| 580 |
"f1_atheism": 0.0,
|
|
|
|
| 581 |
"f1_religion": 0.0,
|
| 582 |
"f1_medicine": 0.0,
|
| 583 |
-
"f1_hockey": 0.0,
|
| 584 |
"f1_christianity": 0.0,
|
| 585 |
-
"f1_computer graphics": 0.
|
| 586 |
"f1_microsoft windows": 0.0,
|
| 587 |
"f1_middle east": 0.0,
|
| 588 |
"f1_motorcycles": 0.0,
|
| 589 |
-
"f1_cryptography": 0.0,
|
| 590 |
"f1_mac hardware": 0.0,
|
| 591 |
"f1_electronics": 0.0,
|
| 592 |
"f1_for sale": 0.0,
|
| 593 |
"f1_guns": 0.0,
|
| 594 |
-
"
|
| 595 |
-
"f1_space": 0.5714285714285714,
|
| 596 |
"f1_pc hardware": 0.0,
|
| 597 |
-
"
|
| 598 |
-
"
|
| 599 |
-
"
|
|
|
|
|
|
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
-
"score": 0.
|
| 602 |
-
"score_ci_high": 0.
|
| 603 |
-
"score_ci_low": 0.
|
| 604 |
"num_of_instances": 100,
|
| 605 |
-
"accuracy": 0.
|
| 606 |
-
"accuracy_ci_low": 0.
|
| 607 |
-
"accuracy_ci_high": 0.
|
| 608 |
-
"f1_micro": 0.
|
| 609 |
-
"f1_micro_ci_low": 0.
|
| 610 |
-
"f1_micro_ci_high": 0.
|
| 611 |
},
|
| 612 |
-
"score": 0.
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
-
"f1_macro": 0.
|
| 619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
| 620 |
-
"f1_money transfer or virtual currency or money service": 0.0,
|
| 621 |
"f1_payday loan or title loan or personal loan": 0.0,
|
|
|
|
|
|
|
| 622 |
"f1_mortgage": 1.0,
|
| 623 |
-
"f1_credit card or prepaid card": 0.
|
| 624 |
-
"
|
| 625 |
-
"
|
| 626 |
-
"
|
| 627 |
-
"f1_macro_ci_high": 0.5251343899790525,
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
-
"score": 0.
|
| 630 |
-
"score_ci_high": 0.
|
| 631 |
-
"score_ci_low": 0.
|
| 632 |
"num_of_instances": 100,
|
| 633 |
-
"accuracy": 0.
|
| 634 |
-
"accuracy_ci_low": 0.
|
| 635 |
-
"accuracy_ci_high": 0.
|
| 636 |
-
"f1_micro": 0.
|
| 637 |
-
"f1_micro_ci_low": 0.
|
| 638 |
-
"f1_micro_ci_high": 0.
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
-
"f1_macro": 0.
|
| 642 |
-
"f1_mortgages and loans": 0.
|
| 643 |
-
"f1_debt collection": 0.
|
| 644 |
-
"f1_credit card": 0.
|
| 645 |
-
"f1_credit reporting": 0.
|
| 646 |
-
"f1_retail banking": 0.
|
| 647 |
-
"f1_macro_ci_low": 0.
|
| 648 |
-
"f1_macro_ci_high": 0.
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
-
"score": 0.
|
| 651 |
-
"score_ci_high": 0.
|
| 652 |
-
"score_ci_low": 0.
|
| 653 |
"num_of_instances": 50,
|
| 654 |
-
"accuracy": 0.
|
| 655 |
-
"accuracy_ci_low": 0.
|
| 656 |
-
"accuracy_ci_high": 0.
|
| 657 |
-
"f1_micro": 0.
|
| 658 |
-
"f1_micro_ci_low": 0.
|
| 659 |
-
"f1_micro_ci_high": 0.
|
| 660 |
},
|
| 661 |
-
"score": 0.
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
-
"program_accuracy": 0.
|
| 669 |
-
"score": 0.
|
| 670 |
"score_name": "program_accuracy",
|
| 671 |
-
"execution_accuracy": 0.
|
| 672 |
-
"program_accuracy_ci_low": 0.
|
| 673 |
-
"program_accuracy_ci_high": 0.
|
| 674 |
-
"score_ci_low": 0.
|
| 675 |
-
"score_ci_high": 0.
|
| 676 |
-
"execution_accuracy_ci_low": 0.
|
| 677 |
-
"execution_accuracy_ci_high": 0.
|
| 678 |
},
|
| 679 |
-
"score": 0.
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
-
"precision": 0.
|
| 686 |
-
"recall": 0.
|
| 687 |
-
"f1": 0.
|
| 688 |
-
"precision_ci_low": 0.
|
| 689 |
-
"precision_ci_high": 0.
|
| 690 |
-
"recall_ci_low": 0.
|
| 691 |
-
"recall_ci_high": 0.
|
| 692 |
-
"f1_ci_low": 0.
|
| 693 |
-
"f1_ci_high": 0.
|
| 694 |
"score_name": "f1",
|
| 695 |
-
"score": 0.
|
| 696 |
-
"score_ci_high": 0.
|
| 697 |
-
"score_ci_low": 0.
|
| 698 |
"num_of_instances": 100,
|
| 699 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
| 700 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
| 701 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
| 702 |
-
"faithfullness_f1_token_overlap": 0.
|
| 703 |
-
"faithfullness_recall_token_overlap": 0.
|
| 704 |
-
"faithfullness_precision_token_overlap": 0.
|
| 705 |
-
"correctness_f1_token_overlap": 0.
|
| 706 |
-
"correctness_recall_token_overlap": 0.
|
| 707 |
-
"correctness_precision_token_overlap": 0.
|
| 708 |
},
|
| 709 |
-
"score": 0.
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
|
@@ -722,85 +722,85 @@
|
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
-
"accuracy": 0.
|
| 726 |
-
"accuracy_ci_low": 0.
|
| 727 |
-
"accuracy_ci_high": 0.
|
| 728 |
"score_name": "accuracy",
|
| 729 |
-
"score": 0.
|
| 730 |
-
"score_ci_high": 0.
|
| 731 |
-
"score_ci_low": 0.
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
-
"score": 0.
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
| 738 |
"safety": {
|
| 739 |
"attaq_500": {
|
| 740 |
"severity_critical": 0.0,
|
| 741 |
-
"severity_high":
|
| 742 |
"severity_medium": 1.0,
|
| 743 |
-
"severity_low":
|
| 744 |
-
"category_harmful_info": 0.
|
| 745 |
-
"category_deception": 0.
|
| 746 |
-
"category_explicit_content": 0.
|
| 747 |
-
"category_violence": 0.
|
| 748 |
-
"category_discrimination": 0.
|
| 749 |
-
"category_substance_abuse": 0.
|
| 750 |
-
"category_pii": 0.
|
| 751 |
-
"safety": 0.
|
| 752 |
-
"safety_ci_low": 0.
|
| 753 |
-
"safety_ci_high": 0.
|
| 754 |
"score_name": "safety",
|
| 755 |
-
"score": 0.
|
| 756 |
-
"score_ci_high": 0.
|
| 757 |
-
"score_ci_low": 0.
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
-
"score": 0.
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
-
"
|
| 768 |
-
"
|
|
|
|
|
|
|
| 769 |
"score_name": "rougeL",
|
| 770 |
-
"rouge2": 0.
|
| 771 |
-
"
|
| 772 |
-
"
|
| 773 |
-
"
|
| 774 |
-
"
|
| 775 |
-
"
|
| 776 |
-
"
|
| 777 |
-
"
|
| 778 |
-
"
|
| 779 |
-
"
|
| 780 |
-
"
|
| 781 |
-
"rougeLsum_ci_low": 0.3218158445558661,
|
| 782 |
-
"rougeLsum_ci_high": 0.3641162818529719
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
-
"
|
| 787 |
-
"
|
|
|
|
|
|
|
| 788 |
"score_name": "rougeL",
|
| 789 |
-
"rouge2": 0.
|
| 790 |
-
"
|
| 791 |
-
"
|
| 792 |
-
"
|
| 793 |
-
"
|
| 794 |
-
"
|
| 795 |
-
"
|
| 796 |
-
"
|
| 797 |
-
"
|
| 798 |
-
"
|
| 799 |
-
"
|
| 800 |
-
"rougeLsum_ci_low": 0.07240452773193602,
|
| 801 |
-
"rougeLsum_ci_high": 0.09677432801226872
|
| 802 |
},
|
| 803 |
-
"score": 0.
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
@@ -808,473 +808,473 @@
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
-
0.
|
| 824 |
-
0.
|
| 825 |
-
0.
|
| 826 |
-
0.
|
| 827 |
],
|
| 828 |
-
"bp":
|
| 829 |
-
"sys_len":
|
| 830 |
"ref_len": 208,
|
| 831 |
-
"sacrebleu": 0.
|
| 832 |
-
"score": 0.
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
-
"score_ci_low": 0.
|
| 835 |
-
"score_ci_high": 0.
|
| 836 |
-
"sacrebleu_ci_low": 0.
|
| 837 |
-
"sacrebleu_ci_high": 0.
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
-
0.
|
| 855 |
-
0.
|
| 856 |
-
0.
|
| 857 |
-
0.
|
| 858 |
],
|
| 859 |
-
"bp": 0.
|
| 860 |
-
"sys_len":
|
| 861 |
"ref_len": 208,
|
| 862 |
-
"sacrebleu": 0.
|
| 863 |
-
"score": 0.
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
-
"score_ci_low": 0.
|
| 866 |
-
"score_ci_high": 0.
|
| 867 |
-
"sacrebleu_ci_low": 0.
|
| 868 |
-
"sacrebleu_ci_high": 0.
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
-
0.
|
| 886 |
-
0.
|
| 887 |
-
0.
|
| 888 |
-
0.
|
| 889 |
],
|
| 890 |
"bp": 1.0,
|
| 891 |
-
"sys_len":
|
| 892 |
"ref_len": 209,
|
| 893 |
-
"sacrebleu": 0.
|
| 894 |
-
"score": 0.
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
-
"score_ci_low": 0.
|
| 897 |
-
"score_ci_high": 0.
|
| 898 |
-
"sacrebleu_ci_low": 0.
|
| 899 |
-
"sacrebleu_ci_high": 0.
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
-
0.
|
| 917 |
-
0.
|
| 918 |
-
0.
|
| 919 |
-
0.
|
| 920 |
],
|
| 921 |
-
"bp": 0
|
| 922 |
-
"sys_len":
|
| 923 |
"ref_len": 216,
|
| 924 |
-
"sacrebleu": 0.
|
| 925 |
-
"score": 0.
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
-
"score_ci_low": 0.
|
| 928 |
-
"score_ci_high": 0.
|
| 929 |
-
"sacrebleu_ci_low": 0.
|
| 930 |
-
"sacrebleu_ci_high": 0.
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
-
0.
|
| 948 |
-
0.
|
| 949 |
-
0.
|
| 950 |
-
0.
|
| 951 |
],
|
| 952 |
-
"bp": 0
|
| 953 |
-
"sys_len":
|
| 954 |
"ref_len": 235,
|
| 955 |
-
"sacrebleu": 0.
|
| 956 |
-
"score": 0.
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
-
"score_ci_low": 0.
|
| 959 |
-
"score_ci_high": 0.
|
| 960 |
-
"sacrebleu_ci_low": 0.
|
| 961 |
-
"sacrebleu_ci_high": 0.
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
-
|
| 967 |
-
|
| 968 |
-
|
| 969 |
-
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
-
0.
|
| 979 |
-
0.
|
| 980 |
-
0.
|
| 981 |
-
0.
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
-
"sys_len":
|
| 985 |
"ref_len": 249,
|
| 986 |
-
"sacrebleu": 0.
|
| 987 |
-
"score": 0.
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
-
"score_ci_low": 0.
|
| 990 |
-
"score_ci_high": 0.
|
| 991 |
-
"sacrebleu_ci_low": 0.
|
| 992 |
-
"sacrebleu_ci_high": 0.
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
-
0.
|
| 1010 |
-
0.
|
| 1011 |
-
0.
|
| 1012 |
-
0.
|
| 1013 |
],
|
| 1014 |
-
"bp": 0.
|
| 1015 |
-
"sys_len":
|
| 1016 |
"ref_len": 222,
|
| 1017 |
-
"sacrebleu": 0.
|
| 1018 |
-
"score": 0.
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
-
"score_ci_low": 0.
|
| 1021 |
-
"score_ci_high": 0.
|
| 1022 |
-
"sacrebleu_ci_low": 0.
|
| 1023 |
-
"sacrebleu_ci_high": 0.
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
-
0.
|
| 1041 |
-
0.
|
| 1042 |
-
0.
|
| 1043 |
-
0.
|
| 1044 |
],
|
| 1045 |
-
"bp": 0.
|
| 1046 |
-
"sys_len":
|
| 1047 |
"ref_len": 230,
|
| 1048 |
-
"sacrebleu": 0.
|
| 1049 |
-
"score": 0.
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
-
"score_ci_low": 0.
|
| 1052 |
-
"score_ci_high": 0.
|
| 1053 |
-
"sacrebleu_ci_low": 0.
|
| 1054 |
-
"sacrebleu_ci_high": 0.
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
-
|
| 1066 |
-
|
| 1067 |
-
|
| 1068 |
-
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
-
0.
|
| 1072 |
-
0.
|
| 1073 |
-
0.
|
| 1074 |
-
0.
|
| 1075 |
],
|
| 1076 |
-
"bp": 0.
|
| 1077 |
-
"sys_len":
|
| 1078 |
"ref_len": 243,
|
| 1079 |
-
"sacrebleu": 0.
|
| 1080 |
-
"score": 0.
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
-
"score_ci_low": 0.
|
| 1083 |
-
"score_ci_high": 0.
|
| 1084 |
-
"sacrebleu_ci_low": 0.
|
| 1085 |
-
"sacrebleu_ci_high": 0.
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
-
0.
|
| 1103 |
-
0.
|
| 1104 |
-
0.
|
| 1105 |
-
0.
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
-
"sys_len":
|
| 1109 |
"ref_len": 208,
|
| 1110 |
-
"sacrebleu": 0.
|
| 1111 |
-
"score": 0.
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
-
"score_ci_low": 0.
|
| 1114 |
-
"score_ci_high": 0.
|
| 1115 |
-
"sacrebleu_ci_low": 0.
|
| 1116 |
-
"sacrebleu_ci_high": 0.
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
-
0.
|
| 1134 |
-
0.
|
| 1135 |
-
0.
|
| 1136 |
-
0.
|
| 1137 |
],
|
| 1138 |
-
"bp": 0.
|
| 1139 |
-
"sys_len":
|
| 1140 |
"ref_len": 208,
|
| 1141 |
-
"sacrebleu": 0.
|
| 1142 |
-
"score": 0.
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
-
"score_ci_low": 0.
|
| 1145 |
-
"score_ci_high": 0.
|
| 1146 |
-
"sacrebleu_ci_low": 0.
|
| 1147 |
-
"sacrebleu_ci_high": 0.
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
-
|
| 1161 |
-
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
-
0.
|
| 1165 |
-
0.
|
| 1166 |
-
0.
|
| 1167 |
-
0.
|
| 1168 |
],
|
| 1169 |
-
"bp": 0.
|
| 1170 |
-
"sys_len":
|
| 1171 |
"ref_len": 208,
|
| 1172 |
-
"sacrebleu": 0.
|
| 1173 |
-
"score": 0.
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
-
"score_ci_low": 0.
|
| 1176 |
-
"score_ci_high": 0.
|
| 1177 |
-
"sacrebleu_ci_low": 0.
|
| 1178 |
-
"sacrebleu_ci_high": 0.
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
-
|
| 1184 |
-
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
-
0.
|
| 1196 |
-
0.
|
| 1197 |
-
0.
|
| 1198 |
-
0.
|
| 1199 |
],
|
| 1200 |
-
"bp": 0.
|
| 1201 |
-
"sys_len":
|
| 1202 |
"ref_len": 208,
|
| 1203 |
-
"sacrebleu": 0.
|
| 1204 |
-
"score": 0.
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
-
"score_ci_low": 0.
|
| 1207 |
-
"score_ci_high": 0.
|
| 1208 |
-
"sacrebleu_ci_low": 0.
|
| 1209 |
-
"sacrebleu_ci_high": 0.
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
-
|
| 1215 |
-
|
| 1216 |
53,
|
| 1217 |
-
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
-
0.
|
| 1227 |
-
0.
|
| 1228 |
-
0.
|
| 1229 |
-
0.
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
-
"sys_len":
|
| 1233 |
"ref_len": 208,
|
| 1234 |
-
"sacrebleu": 0.
|
| 1235 |
-
"score": 0.
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
-
"score_ci_low": 0.
|
| 1238 |
-
"score_ci_high": 0.
|
| 1239 |
-
"sacrebleu_ci_low": 0.
|
| 1240 |
-
"sacrebleu_ci_high": 0.
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
-
|
| 1246 |
-
|
| 1247 |
-
|
| 1248 |
-
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
-
0.
|
| 1258 |
-
0.
|
| 1259 |
-
0.
|
| 1260 |
-
0.
|
| 1261 |
],
|
| 1262 |
-
"bp": 0.
|
| 1263 |
-
"sys_len":
|
| 1264 |
"ref_len": 208,
|
| 1265 |
-
"sacrebleu": 0.
|
| 1266 |
-
"score": 0.
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
-
"score_ci_low": 0.
|
| 1269 |
-
"score_ci_high": 0.
|
| 1270 |
-
"sacrebleu_ci_low": 0.
|
| 1271 |
-
"sacrebleu_ci_high": 0.
|
| 1272 |
},
|
| 1273 |
-
"score": 0.
|
| 1274 |
"score_name": "subsets_mean",
|
| 1275 |
"num_of_instances": 90
|
| 1276 |
},
|
| 1277 |
-
"score": 0.
|
| 1278 |
"score_name": "subsets_mean",
|
| 1279 |
"num_of_instances": 1537
|
| 1280 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"environment_info": {
|
| 3 |
+
"timestamp_utc": "2025-07-03T12:47:54.386872Z",
|
| 4 |
"command_line_invocation": [
|
| 5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
"--tasks",
|
|
|
|
| 42 |
"cache_dir": null
|
| 43 |
},
|
| 44 |
"unitxt_version": "1.25.0",
|
| 45 |
+
"unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
|
| 46 |
"python_version": "3.10.18",
|
| 47 |
"system": "Linux",
|
| 48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
|
| 176 |
"results": {
|
| 177 |
"bias": {
|
| 178 |
"safety_bbq_age": {
|
| 179 |
+
"accuracy": 0.3333333333333333,
|
| 180 |
"accuracy_ci_low": 0.1111111111111111,
|
| 181 |
+
"accuracy_ci_high": 0.6666666666666666,
|
| 182 |
"score_name": "accuracy",
|
| 183 |
+
"score": 0.3333333333333333,
|
| 184 |
+
"score_ci_high": 0.6666666666666666,
|
| 185 |
"score_ci_low": 0.1111111111111111,
|
| 186 |
"num_of_instances": 9
|
| 187 |
},
|
|
|
|
| 196 |
"num_of_instances": 9
|
| 197 |
},
|
| 198 |
"safety_bbq_gender_identity": {
|
| 199 |
+
"accuracy": 0.3333333333333333,
|
| 200 |
+
"accuracy_ci_low": 0.1111111111111111,
|
| 201 |
+
"accuracy_ci_high": 0.6666666666666666,
|
| 202 |
"score_name": "accuracy",
|
| 203 |
+
"score": 0.3333333333333333,
|
| 204 |
+
"score_ci_high": 0.6666666666666666,
|
| 205 |
+
"score_ci_low": 0.1111111111111111,
|
| 206 |
"num_of_instances": 9
|
| 207 |
},
|
| 208 |
"safety_bbq_nationality": {
|
| 209 |
+
"accuracy": 0.6666666666666666,
|
| 210 |
+
"accuracy_ci_low": 0.3333333333333333,
|
| 211 |
+
"accuracy_ci_high": 1.0,
|
| 212 |
"score_name": "accuracy",
|
| 213 |
+
"score": 0.6666666666666666,
|
| 214 |
+
"score_ci_high": 1.0,
|
| 215 |
+
"score_ci_low": 0.3333333333333333,
|
| 216 |
"num_of_instances": 9
|
| 217 |
},
|
| 218 |
"safety_bbq_physical_appearance": {
|
|
|
|
| 226 |
"num_of_instances": 9
|
| 227 |
},
|
| 228 |
"safety_bbq_race_ethnicity": {
|
| 229 |
+
"accuracy": 0.5555555555555556,
|
| 230 |
+
"accuracy_ci_low": 0.2222222222222222,
|
| 231 |
+
"accuracy_ci_high": 0.8888888888888888,
|
| 232 |
"score_name": "accuracy",
|
| 233 |
+
"score": 0.5555555555555556,
|
| 234 |
+
"score_ci_high": 0.8888888888888888,
|
| 235 |
+
"score_ci_low": 0.2222222222222222,
|
| 236 |
"num_of_instances": 9
|
| 237 |
},
|
| 238 |
"safety_bbq_race_x_gender": {
|
|
|
|
| 246 |
"num_of_instances": 9
|
| 247 |
},
|
| 248 |
"safety_bbq_race_x_ses": {
|
| 249 |
+
"accuracy": 0.5555555555555556,
|
| 250 |
+
"accuracy_ci_low": 0.2222222222222222,
|
| 251 |
+
"accuracy_ci_high": 0.8888888888888888,
|
| 252 |
"score_name": "accuracy",
|
| 253 |
+
"score": 0.5555555555555556,
|
| 254 |
+
"score_ci_high": 0.8888888888888888,
|
| 255 |
+
"score_ci_low": 0.2222222222222222,
|
| 256 |
"num_of_instances": 9
|
| 257 |
},
|
| 258 |
"safety_bbq_religion": {
|
|
|
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.05701754385964912,
|
| 296 |
+
"score": 0.05701754385964912,
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
+
"score": 0.05701754385964912,
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
+
"f1_Person": 0.31999999999999995,
|
| 307 |
+
"f1_Organization": 0.21052631578947367,
|
| 308 |
+
"f1_Location": 0.2,
|
| 309 |
+
"f1_macro": 0.24350877192982456,
|
| 310 |
+
"recall_macro": 0.2429261559696342,
|
| 311 |
+
"precision_macro": 0.2510642826734781,
|
| 312 |
+
"in_classes_support": 0.7272727272727273,
|
| 313 |
+
"f1_micro": 0.20689655172413793,
|
| 314 |
+
"recall_micro": 0.24,
|
| 315 |
+
"precision_micro": 0.18181818181818182,
|
| 316 |
+
"score": 0.20689655172413793,
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
+
"score_ci_low": 0.15899844152563253,
|
| 319 |
+
"score_ci_high": 0.3195980554375623,
|
| 320 |
+
"f1_micro_ci_low": 0.15899844152563253,
|
| 321 |
+
"f1_micro_ci_high": 0.3195980554375623
|
| 322 |
},
|
| 323 |
+
"score": 0.20689655172413793,
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
|
|
|
| 338 |
"mmlu_pro_business": {
|
| 339 |
"accuracy": 0.14285714285714285,
|
| 340 |
"accuracy_ci_low": 0.0,
|
| 341 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 342 |
"score_name": "accuracy",
|
| 343 |
"score": 0.14285714285714285,
|
| 344 |
+
"score_ci_high": 0.5714285714285714,
|
| 345 |
"score_ci_low": 0.0,
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
| 349 |
+
"accuracy": 0.0,
|
| 350 |
"accuracy_ci_low": 0.0,
|
| 351 |
+
"accuracy_ci_high": 0.0,
|
| 352 |
"score_name": "accuracy",
|
| 353 |
+
"score": 0.0,
|
| 354 |
+
"score_ci_high": 0.0,
|
| 355 |
"score_ci_low": 0.0,
|
| 356 |
"num_of_instances": 7
|
| 357 |
},
|
| 358 |
"mmlu_pro_computer_science": {
|
| 359 |
+
"accuracy": 0.5714285714285714,
|
| 360 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 361 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 362 |
"score_name": "accuracy",
|
| 363 |
+
"score": 0.5714285714285714,
|
| 364 |
+
"score_ci_high": 0.8571428571428571,
|
| 365 |
+
"score_ci_low": 0.14285714285714285,
|
| 366 |
"num_of_instances": 7
|
| 367 |
},
|
| 368 |
"mmlu_pro_economics": {
|
|
|
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
| 389 |
+
"accuracy": 0.0,
|
| 390 |
"accuracy_ci_low": 0.0,
|
| 391 |
+
"accuracy_ci_high": 0.0,
|
| 392 |
"score_name": "accuracy",
|
| 393 |
+
"score": 0.0,
|
| 394 |
+
"score_ci_high": 0.0,
|
| 395 |
"score_ci_low": 0.0,
|
| 396 |
"num_of_instances": 7
|
| 397 |
},
|
|
|
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
+
"accuracy": 0.42857142857142855,
|
| 410 |
"accuracy_ci_low": 0.14285714285714285,
|
| 411 |
"accuracy_ci_high": 0.8571428571428571,
|
| 412 |
"score_name": "accuracy",
|
| 413 |
+
"score": 0.42857142857142855,
|
| 414 |
"score_ci_high": 0.8571428571428571,
|
| 415 |
"score_ci_low": 0.14285714285714285,
|
| 416 |
"num_of_instances": 7
|
|
|
|
| 426 |
"num_of_instances": 7
|
| 427 |
},
|
| 428 |
"mmlu_pro_other": {
|
| 429 |
+
"accuracy": 0.14285714285714285,
|
| 430 |
"accuracy_ci_low": 0.0,
|
| 431 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 432 |
"score_name": "accuracy",
|
| 433 |
+
"score": 0.14285714285714285,
|
| 434 |
+
"score_ci_high": 0.5714285714285714,
|
| 435 |
"score_ci_low": 0.0,
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
|
|
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
| 449 |
+
"accuracy": 0.0,
|
| 450 |
"accuracy_ci_low": 0.0,
|
| 451 |
+
"accuracy_ci_high": 0.0,
|
| 452 |
"score_name": "accuracy",
|
| 453 |
+
"score": 0.0,
|
| 454 |
+
"score_ci_high": 0.0,
|
| 455 |
"score_ci_low": 0.0,
|
| 456 |
"num_of_instances": 7
|
| 457 |
},
|
|
|
|
| 465 |
"score_ci_low": 0.0,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
+
"score": 0.19387755102040816,
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
+
"f1_macro": 0.13333333333333333,
|
| 475 |
"f1_suggestive": 0.16666666666666666,
|
| 476 |
+
"f1_arbitrary": 0.5,
|
| 477 |
+
"f1_generic": 0.0,
|
| 478 |
+
"f1_fanciful": 0.0,
|
| 479 |
+
"f1_descriptive": 0.0,
|
| 480 |
+
"f1_macro_ci_low": 0.029544504197655135,
|
| 481 |
+
"f1_macro_ci_high": 0.275,
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
+
"score": 0.15,
|
| 484 |
+
"score_ci_high": 0.37270394126013257,
|
| 485 |
+
"score_ci_low": 0.05,
|
| 486 |
"num_of_instances": 20,
|
| 487 |
+
"accuracy": 0.15,
|
| 488 |
+
"accuracy_ci_low": 0.05,
|
| 489 |
+
"accuracy_ci_high": 0.3672085770953458,
|
| 490 |
+
"f1_micro": 0.15,
|
| 491 |
+
"f1_micro_ci_low": 0.05,
|
| 492 |
+
"f1_micro_ci_high": 0.37270394126013257
|
| 493 |
},
|
| 494 |
"legalbench_corporate_lobbying": {
|
| 495 |
+
"f1_macro": 0.41333333333333333,
|
| 496 |
+
"f1_no": 0.56,
|
| 497 |
+
"f1_yes": 0.26666666666666666,
|
| 498 |
+
"f1_macro_ci_low": 0.23273657289002558,
|
| 499 |
+
"f1_macro_ci_high": 0.6865203761755485,
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
+
"score": 0.45,
|
| 502 |
+
"score_ci_high": 0.6666666666666666,
|
| 503 |
+
"score_ci_low": 0.25,
|
| 504 |
"num_of_instances": 20,
|
| 505 |
+
"accuracy": 0.45,
|
| 506 |
+
"accuracy_ci_low": 0.25,
|
| 507 |
+
"accuracy_ci_high": 0.6549800691648727,
|
| 508 |
+
"f1_micro": 0.45,
|
| 509 |
+
"f1_micro_ci_low": 0.25,
|
| 510 |
+
"f1_micro_ci_high": 0.6666666666666666
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
+
"f1_macro": 0.05372405372405372,
|
| 514 |
+
"f1_conclusion": 0.2222222222222222,
|
| 515 |
+
"f1_analysis": 0.15384615384615385,
|
| 516 |
"f1_decree": 0.0,
|
|
|
|
| 517 |
"f1_facts": 0.0,
|
| 518 |
+
"f1_issue": 0.0,
|
| 519 |
"f1_rule": 0.0,
|
| 520 |
"f1_procedural history": 0.0,
|
| 521 |
"f1_macro_ci_low": 0.0,
|
| 522 |
+
"f1_macro_ci_high": 0.15349110987889103,
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
+
"score": 0.1,
|
| 525 |
+
"score_ci_high": 0.3,
|
| 526 |
"score_ci_low": 0.0,
|
| 527 |
"num_of_instances": 20,
|
| 528 |
"accuracy": 0.1,
|
| 529 |
"accuracy_ci_low": 0.0,
|
| 530 |
"accuracy_ci_high": 0.35,
|
| 531 |
+
"f1_micro": 0.1,
|
| 532 |
"f1_micro_ci_low": 0.0,
|
| 533 |
+
"f1_micro_ci_high": 0.3
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
+
"f1_macro": 0.4916666666666667,
|
| 537 |
"f1_yes": 0.5833333333333334,
|
| 538 |
+
"f1_no": 0.4,
|
| 539 |
+
"f1_macro_ci_low": 0.29994839408816376,
|
| 540 |
+
"f1_macro_ci_high": 0.7184210526315788,
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
+
"score": 0.5128205128205128,
|
| 543 |
"score_ci_high": 0.7368421052631579,
|
| 544 |
"score_ci_low": 0.3076923076923077,
|
| 545 |
"num_of_instances": 20,
|
| 546 |
"accuracy": 0.5,
|
| 547 |
"accuracy_ci_low": 0.3,
|
| 548 |
"accuracy_ci_high": 0.7,
|
| 549 |
+
"f1_micro": 0.5128205128205128,
|
| 550 |
"f1_micro_ci_low": 0.3076923076923077,
|
| 551 |
"f1_micro_ci_high": 0.7368421052631579
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
+
"f1_macro": 0.6491228070175439,
|
| 555 |
+
"f1_yes": 0.631578947368421,
|
| 556 |
+
"f1_no": 0.6666666666666666,
|
| 557 |
+
"f1_macro_ci_low": 0.44862155388471175,
|
| 558 |
+
"f1_macro_ci_high": 0.8470120517129166,
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
+
"score": 0.65,
|
| 561 |
+
"score_ci_high": 0.8,
|
| 562 |
+
"score_ci_low": 0.4,
|
| 563 |
"num_of_instances": 20,
|
| 564 |
+
"accuracy": 0.65,
|
| 565 |
+
"accuracy_ci_low": 0.4,
|
| 566 |
+
"accuracy_ci_high": 0.8,
|
| 567 |
+
"f1_micro": 0.65,
|
| 568 |
+
"f1_micro_ci_low": 0.4,
|
| 569 |
+
"f1_micro_ci_high": 0.8
|
| 570 |
},
|
| 571 |
+
"score": 0.37256410256410255,
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
+
"f1_macro": 0.13032051282051282,
|
| 578 |
+
"f1_cars": 0.6,
|
| 579 |
"f1_windows x": 0.0,
|
| 580 |
"f1_atheism": 0.0,
|
| 581 |
+
"f1_politics": 0.3076923076923077,
|
| 582 |
"f1_religion": 0.0,
|
| 583 |
"f1_medicine": 0.0,
|
|
|
|
| 584 |
"f1_christianity": 0.0,
|
| 585 |
+
"f1_computer graphics": 0.3333333333333333,
|
| 586 |
"f1_microsoft windows": 0.0,
|
| 587 |
"f1_middle east": 0.0,
|
| 588 |
"f1_motorcycles": 0.0,
|
|
|
|
| 589 |
"f1_mac hardware": 0.0,
|
| 590 |
"f1_electronics": 0.0,
|
| 591 |
"f1_for sale": 0.0,
|
| 592 |
"f1_guns": 0.0,
|
| 593 |
+
"f1_space": 0.75,
|
|
|
|
| 594 |
"f1_pc hardware": 0.0,
|
| 595 |
+
"f1_cryptography": 0.0,
|
| 596 |
+
"f1_baseball": 0.6153846153846154,
|
| 597 |
+
"f1_hockey": 0.0,
|
| 598 |
+
"f1_macro_ci_low": 0.08300993096063736,
|
| 599 |
+
"f1_macro_ci_high": 0.17568685492942918,
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
+
"score": 0.2,
|
| 602 |
+
"score_ci_high": 0.3120567375886525,
|
| 603 |
+
"score_ci_low": 0.11678832116788321,
|
| 604 |
"num_of_instances": 100,
|
| 605 |
+
"accuracy": 0.14,
|
| 606 |
+
"accuracy_ci_low": 0.08,
|
| 607 |
+
"accuracy_ci_high": 0.23,
|
| 608 |
+
"f1_micro": 0.2,
|
| 609 |
+
"f1_micro_ci_low": 0.11678832116788321,
|
| 610 |
+
"f1_micro_ci_high": 0.3120567375886525
|
| 611 |
},
|
| 612 |
+
"score": 0.2,
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
+
"f1_macro": 0.3019138755980861,
|
| 619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.631578947368421,
|
|
|
|
| 620 |
"f1_payday loan or title loan or personal loan": 0.0,
|
| 621 |
+
"f1_money transfer or virtual currency or money service": 0.0,
|
| 622 |
+
"f1_checking or savings account": 0.18181818181818182,
|
| 623 |
"f1_mortgage": 1.0,
|
| 624 |
+
"f1_credit card or prepaid card": 0.0,
|
| 625 |
+
"f1_debt collection": 0.3,
|
| 626 |
+
"f1_macro_ci_low": 0.17209384713135212,
|
| 627 |
+
"f1_macro_ci_high": 0.4611549930931899,
|
|
|
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
+
"score": 0.5125,
|
| 630 |
+
"score_ci_high": 0.6128968486960344,
|
| 631 |
+
"score_ci_low": 0.40808931189367126,
|
| 632 |
"num_of_instances": 100,
|
| 633 |
+
"accuracy": 0.41,
|
| 634 |
+
"accuracy_ci_low": 0.32,
|
| 635 |
+
"accuracy_ci_high": 0.5131183635822549,
|
| 636 |
+
"f1_micro": 0.5125,
|
| 637 |
+
"f1_micro_ci_low": 0.40808931189367126,
|
| 638 |
+
"f1_micro_ci_high": 0.6128968486960344
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
+
"f1_macro": 0.31565591397849463,
|
| 642 |
+
"f1_mortgages and loans": 0.4,
|
| 643 |
+
"f1_debt collection": 0.45161290322580644,
|
| 644 |
+
"f1_credit card": 0.16666666666666666,
|
| 645 |
+
"f1_credit reporting": 0.56,
|
| 646 |
+
"f1_retail banking": 0.0,
|
| 647 |
+
"f1_macro_ci_low": 0.21637195045262772,
|
| 648 |
+
"f1_macro_ci_high": 0.4421317990640253,
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
+
"score": 0.4,
|
| 651 |
+
"score_ci_high": 0.5319148936170213,
|
| 652 |
+
"score_ci_low": 0.2608695652173913,
|
| 653 |
"num_of_instances": 50,
|
| 654 |
+
"accuracy": 0.38,
|
| 655 |
+
"accuracy_ci_low": 0.24,
|
| 656 |
+
"accuracy_ci_high": 0.5,
|
| 657 |
+
"f1_micro": 0.4,
|
| 658 |
+
"f1_micro_ci_low": 0.2608695652173913,
|
| 659 |
+
"f1_micro_ci_high": 0.5319148936170213
|
| 660 |
},
|
| 661 |
+
"score": 0.45625,
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
+
"program_accuracy": 0.07,
|
| 669 |
+
"score": 0.07,
|
| 670 |
"score_name": "program_accuracy",
|
| 671 |
+
"execution_accuracy": 0.05,
|
| 672 |
+
"program_accuracy_ci_low": 0.03,
|
| 673 |
+
"program_accuracy_ci_high": 0.13,
|
| 674 |
+
"score_ci_low": 0.03,
|
| 675 |
+
"score_ci_high": 0.13,
|
| 676 |
+
"execution_accuracy_ci_low": 0.02,
|
| 677 |
+
"execution_accuracy_ci_high": 0.11
|
| 678 |
},
|
| 679 |
+
"score": 0.07,
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
+
"precision": 0.523939092007554,
|
| 686 |
+
"recall": 0.4630665384804083,
|
| 687 |
+
"f1": 0.4470066183026211,
|
| 688 |
+
"precision_ci_low": 0.48178772883210663,
|
| 689 |
+
"precision_ci_high": 0.5657398750195303,
|
| 690 |
+
"recall_ci_low": 0.4204989413298855,
|
| 691 |
+
"recall_ci_high": 0.5060265317751137,
|
| 692 |
+
"f1_ci_low": 0.40883623553494647,
|
| 693 |
+
"f1_ci_high": 0.48378981442802294,
|
| 694 |
"score_name": "f1",
|
| 695 |
+
"score": 0.4470066183026211,
|
| 696 |
+
"score_ci_high": 0.48378981442802294,
|
| 697 |
+
"score_ci_low": 0.40883623553494647,
|
| 698 |
"num_of_instances": 100,
|
| 699 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6648982459306717,
|
| 700 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.661893335878849,
|
| 701 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.6803490483760833,
|
| 702 |
+
"faithfullness_f1_token_overlap": 0.3107343678104924,
|
| 703 |
+
"faithfullness_recall_token_overlap": 0.2175296397330215,
|
| 704 |
+
"faithfullness_precision_token_overlap": 0.7439144424456543,
|
| 705 |
+
"correctness_f1_token_overlap": 0.4470066183026211,
|
| 706 |
+
"correctness_recall_token_overlap": 0.4630665384804083,
|
| 707 |
+
"correctness_precision_token_overlap": 0.523939092007554
|
| 708 |
},
|
| 709 |
+
"score": 0.4470066183026211,
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
|
|
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
+
"accuracy": 0.54,
|
| 726 |
+
"accuracy_ci_low": 0.44,
|
| 727 |
+
"accuracy_ci_high": 0.63,
|
| 728 |
"score_name": "accuracy",
|
| 729 |
+
"score": 0.54,
|
| 730 |
+
"score_ci_high": 0.63,
|
| 731 |
+
"score_ci_low": 0.44,
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
+
"score": 0.41000000000000003,
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
| 738 |
"safety": {
|
| 739 |
"attaq_500": {
|
| 740 |
"severity_critical": 0.0,
|
| 741 |
+
"severity_high": 0.0,
|
| 742 |
"severity_medium": 1.0,
|
| 743 |
+
"severity_low": 99.0,
|
| 744 |
+
"category_harmful_info": 0.8276903443038464,
|
| 745 |
+
"category_deception": 0.8509014063712325,
|
| 746 |
+
"category_explicit_content": 0.8699462725733867,
|
| 747 |
+
"category_violence": 0.823169431019397,
|
| 748 |
+
"category_discrimination": 0.768092628982332,
|
| 749 |
+
"category_substance_abuse": 0.8343986455765036,
|
| 750 |
+
"category_pii": 0.8457972087241985,
|
| 751 |
+
"safety": 0.8300161745150882,
|
| 752 |
+
"safety_ci_low": 0.8130536746996235,
|
| 753 |
+
"safety_ci_high": 0.847588766122457,
|
| 754 |
"score_name": "safety",
|
| 755 |
+
"score": 0.8300161745150882,
|
| 756 |
+
"score_ci_high": 0.847588766122457,
|
| 757 |
+
"score_ci_low": 0.8130536746996235,
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
+
"score": 0.8300161745150882,
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
+
"rougeLsum": 0.3432004189089268,
|
| 768 |
+
"rouge1": 0.4042025000472494,
|
| 769 |
+
"rougeL": 0.27446864632702683,
|
| 770 |
+
"score": 0.27446864632702683,
|
| 771 |
"score_name": "rougeL",
|
| 772 |
+
"rouge2": 0.19586805105606808,
|
| 773 |
+
"rougeLsum_ci_low": 0.3220913706590708,
|
| 774 |
+
"rougeLsum_ci_high": 0.3618832326344797,
|
| 775 |
+
"rouge1_ci_low": 0.37966598950175057,
|
| 776 |
+
"rouge1_ci_high": 0.42455036578796235,
|
| 777 |
+
"rougeL_ci_low": 0.25908686634534905,
|
| 778 |
+
"rougeL_ci_high": 0.2912289617368165,
|
| 779 |
+
"score_ci_low": 0.25908686634534905,
|
| 780 |
+
"score_ci_high": 0.2912289617368165,
|
| 781 |
+
"rouge2_ci_low": 0.1802587963974639,
|
| 782 |
+
"rouge2_ci_high": 0.21175333730941023
|
|
|
|
|
|
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
+
"rougeLsum": 0.08407246092877058,
|
| 787 |
+
"rouge1": 0.10193639708473491,
|
| 788 |
+
"rougeL": 0.07786051266730082,
|
| 789 |
+
"score": 0.07786051266730082,
|
| 790 |
"score_name": "rougeL",
|
| 791 |
+
"rouge2": 0.011991811023543525,
|
| 792 |
+
"rougeLsum_ci_low": 0.07261612496065988,
|
| 793 |
+
"rougeLsum_ci_high": 0.09607861850289276,
|
| 794 |
+
"rouge1_ci_low": 0.08699540238464068,
|
| 795 |
+
"rouge1_ci_high": 0.11680045934783663,
|
| 796 |
+
"rougeL_ci_low": 0.067976352668634,
|
| 797 |
+
"rougeL_ci_high": 0.08831833326675104,
|
| 798 |
+
"score_ci_low": 0.067976352668634,
|
| 799 |
+
"score_ci_high": 0.08831833326675104,
|
| 800 |
+
"rouge2_ci_low": 0.008358797906991256,
|
| 801 |
+
"rouge2_ci_high": 0.017262915294486415
|
|
|
|
|
|
|
| 802 |
},
|
| 803 |
+
"score": 0.17616457949716383,
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
+
116,
|
| 812 |
+
60,
|
| 813 |
+
35,
|
| 814 |
+
19
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
+
199,
|
| 818 |
+
193,
|
| 819 |
+
187,
|
| 820 |
+
181
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
+
0.5829145728643216,
|
| 824 |
+
0.31088082901554404,
|
| 825 |
+
0.1871657754010695,
|
| 826 |
+
0.10497237569060774
|
| 827 |
],
|
| 828 |
+
"bp": 0.9557813259386698,
|
| 829 |
+
"sys_len": 199,
|
| 830 |
"ref_len": 208,
|
| 831 |
+
"sacrebleu": 0.2334713639801202,
|
| 832 |
+
"score": 0.2334713639801202,
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
+
"score_ci_low": 0.15943718765537457,
|
| 835 |
+
"score_ci_high": 0.31107836291057467,
|
| 836 |
+
"sacrebleu_ci_low": 0.15943718765537457,
|
| 837 |
+
"sacrebleu_ci_high": 0.31107836291057467
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
+
116,
|
| 843 |
+
59,
|
| 844 |
+
31,
|
| 845 |
+
18
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
+
191,
|
| 849 |
+
185,
|
| 850 |
+
179,
|
| 851 |
+
173
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
+
0.6073298429319371,
|
| 855 |
+
0.31891891891891894,
|
| 856 |
+
0.17318435754189945,
|
| 857 |
+
0.10404624277456648
|
| 858 |
],
|
| 859 |
+
"bp": 0.9148407838195897,
|
| 860 |
+
"sys_len": 191,
|
| 861 |
"ref_len": 208,
|
| 862 |
+
"sacrebleu": 0.22235940836247658,
|
| 863 |
+
"score": 0.22235940836247658,
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
+
"score_ci_low": 0.12250090520971424,
|
| 866 |
+
"score_ci_high": 0.39386620331090566,
|
| 867 |
+
"sacrebleu_ci_low": 0.12250090520971424,
|
| 868 |
+
"sacrebleu_ci_high": 0.39386620331090566
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
+
62,
|
| 874 |
+
12,
|
| 875 |
+
3,
|
| 876 |
+
1
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
+
211,
|
| 880 |
+
205,
|
| 881 |
+
199,
|
| 882 |
+
193
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
+
0.2938388625592417,
|
| 886 |
+
0.058536585365853655,
|
| 887 |
+
0.015075376884422112,
|
| 888 |
+
0.005181347150259067
|
| 889 |
],
|
| 890 |
"bp": 1.0,
|
| 891 |
+
"sys_len": 211,
|
| 892 |
"ref_len": 209,
|
| 893 |
+
"sacrebleu": 0.03404566896908617,
|
| 894 |
+
"score": 0.03404566896908617,
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
+
"score_ci_low": 0.014238102093315122,
|
| 897 |
+
"score_ci_high": 0.06777961671960171,
|
| 898 |
+
"sacrebleu_ci_low": 0.014238102093315122,
|
| 899 |
+
"sacrebleu_ci_high": 0.06777961671960171
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
+
104,
|
| 905 |
+
47,
|
| 906 |
+
26,
|
| 907 |
+
13
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
+
224,
|
| 911 |
+
218,
|
| 912 |
+
212,
|
| 913 |
+
206
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
+
0.4642857142857143,
|
| 917 |
+
0.21559633027522934,
|
| 918 |
+
0.12264150943396226,
|
| 919 |
+
0.06310679611650485
|
| 920 |
],
|
| 921 |
+
"bp": 1.0,
|
| 922 |
+
"sys_len": 224,
|
| 923 |
"ref_len": 216,
|
| 924 |
+
"sacrebleu": 0.1668341972740045,
|
| 925 |
+
"score": 0.1668341972740045,
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
+
"score_ci_low": 0.08494307414866337,
|
| 928 |
+
"score_ci_high": 0.2669068447403921,
|
| 929 |
+
"sacrebleu_ci_low": 0.08494307414866337,
|
| 930 |
+
"sacrebleu_ci_high": 0.2669068447403921
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
+
157,
|
| 936 |
+
95,
|
| 937 |
+
64,
|
| 938 |
+
43
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
+
249,
|
| 942 |
+
243,
|
| 943 |
+
237,
|
| 944 |
+
231
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
+
0.6305220883534137,
|
| 948 |
+
0.39094650205761317,
|
| 949 |
+
0.27004219409282704,
|
| 950 |
+
0.18614718614718614
|
| 951 |
],
|
| 952 |
+
"bp": 1.0,
|
| 953 |
+
"sys_len": 249,
|
| 954 |
"ref_len": 235,
|
| 955 |
+
"sacrebleu": 0.3336387113992972,
|
| 956 |
+
"score": 0.3336387113992972,
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
+
"score_ci_low": 0.21817943053845332,
|
| 959 |
+
"score_ci_high": 0.41307713856625455,
|
| 960 |
+
"sacrebleu_ci_low": 0.21817943053845332,
|
| 961 |
+
"sacrebleu_ci_high": 0.41307713856625455
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
+
97,
|
| 967 |
+
35,
|
| 968 |
+
13,
|
| 969 |
+
6
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
+
1750,
|
| 973 |
+
1744,
|
| 974 |
+
1738,
|
| 975 |
+
1732
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
+
0.055428571428571424,
|
| 979 |
+
0.02006880733944954,
|
| 980 |
+
0.007479861910241657,
|
| 981 |
+
0.003464203233256351
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
+
"sys_len": 1750,
|
| 985 |
"ref_len": 249,
|
| 986 |
+
"sacrebleu": 0.013029808954660122,
|
| 987 |
+
"score": 0.013029808954660122,
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
+
"score_ci_low": 0.005042950798954314,
|
| 990 |
+
"score_ci_high": 0.08046612284400166,
|
| 991 |
+
"sacrebleu_ci_low": 0.005042950798954314,
|
| 992 |
+
"sacrebleu_ci_high": 0.08046612284400166
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
+
158,
|
| 998 |
+
108,
|
| 999 |
+
79,
|
| 1000 |
+
58
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
+
213,
|
| 1004 |
+
207,
|
| 1005 |
+
201,
|
| 1006 |
+
195
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
+
0.7417840375586855,
|
| 1010 |
+
0.5217391304347826,
|
| 1011 |
+
0.3930348258706468,
|
| 1012 |
+
0.29743589743589743
|
| 1013 |
],
|
| 1014 |
+
"bp": 0.9586267176373937,
|
| 1015 |
+
"sys_len": 213,
|
| 1016 |
"ref_len": 222,
|
| 1017 |
+
"sacrebleu": 0.4421181624666089,
|
| 1018 |
+
"score": 0.4421181624666089,
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
+
"score_ci_low": 0.36288486848367396,
|
| 1021 |
+
"score_ci_high": 0.5136032567408664,
|
| 1022 |
+
"sacrebleu_ci_low": 0.36288486848367396,
|
| 1023 |
+
"sacrebleu_ci_high": 0.5136032567408664
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
+
123,
|
| 1029 |
+
63,
|
| 1030 |
+
41,
|
| 1031 |
+
32
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
+
227,
|
| 1035 |
+
221,
|
| 1036 |
+
215,
|
| 1037 |
+
209
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
+
0.5418502202643172,
|
| 1041 |
+
0.28506787330316746,
|
| 1042 |
+
0.19069767441860463,
|
| 1043 |
+
0.15311004784688995
|
| 1044 |
],
|
| 1045 |
+
"bp": 0.9868710869905453,
|
| 1046 |
+
"sys_len": 227,
|
| 1047 |
"ref_len": 230,
|
| 1048 |
+
"sacrebleu": 0.25574348566082084,
|
| 1049 |
+
"score": 0.25574348566082084,
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
+
"score_ci_low": 0.13165649076081204,
|
| 1052 |
+
"score_ci_high": 0.4446310754662533,
|
| 1053 |
+
"sacrebleu_ci_low": 0.13165649076081204,
|
| 1054 |
+
"sacrebleu_ci_high": 0.4446310754662533
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
+
141,
|
| 1060 |
+
80,
|
| 1061 |
+
48,
|
| 1062 |
+
33
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
+
223,
|
| 1066 |
+
217,
|
| 1067 |
+
211,
|
| 1068 |
+
205
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
+
0.632286995515695,
|
| 1072 |
+
0.36866359447004604,
|
| 1073 |
+
0.2274881516587678,
|
| 1074 |
+
0.16097560975609757
|
| 1075 |
],
|
| 1076 |
+
"bp": 0.914218114531173,
|
| 1077 |
+
"sys_len": 223,
|
| 1078 |
"ref_len": 243,
|
| 1079 |
+
"sacrebleu": 0.2778853574267633,
|
| 1080 |
+
"score": 0.2778853574267633,
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
+
"score_ci_low": 0.18397606219224352,
|
| 1083 |
+
"score_ci_high": 0.3891671906561486,
|
| 1084 |
+
"sacrebleu_ci_low": 0.18397606219224352,
|
| 1085 |
+
"sacrebleu_ci_high": 0.3891671906561486
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
+
141,
|
| 1091 |
+
80,
|
| 1092 |
+
46,
|
| 1093 |
+
26
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
+
210,
|
| 1097 |
+
204,
|
| 1098 |
+
198,
|
| 1099 |
+
192
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
+
0.6714285714285714,
|
| 1103 |
+
0.3921568627450981,
|
| 1104 |
+
0.23232323232323232,
|
| 1105 |
+
0.13541666666666666
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
+
"sys_len": 210,
|
| 1109 |
"ref_len": 208,
|
| 1110 |
+
"sacrebleu": 0.3016866548631982,
|
| 1111 |
+
"score": 0.3016866548631982,
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
+
"score_ci_low": 0.19135476382396907,
|
| 1114 |
+
"score_ci_high": 0.41499391897561666,
|
| 1115 |
+
"sacrebleu_ci_low": 0.19135476382396907,
|
| 1116 |
+
"sacrebleu_ci_high": 0.41499391897561666
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
+
114,
|
| 1122 |
+
43,
|
| 1123 |
+
19,
|
| 1124 |
+
9
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
+
199,
|
| 1128 |
+
193,
|
| 1129 |
+
187,
|
| 1130 |
+
181
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
+
0.5728643216080402,
|
| 1134 |
+
0.2227979274611399,
|
| 1135 |
+
0.10160427807486631,
|
| 1136 |
+
0.04972375690607735
|
| 1137 |
],
|
| 1138 |
+
"bp": 0.9557813259386698,
|
| 1139 |
+
"sys_len": 199,
|
| 1140 |
"ref_len": 208,
|
| 1141 |
+
"sacrebleu": 0.15230643528886997,
|
| 1142 |
+
"score": 0.15230643528886997,
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
+
"score_ci_low": 0.08366452659105056,
|
| 1145 |
+
"score_ci_high": 0.25458346437891377,
|
| 1146 |
+
"sacrebleu_ci_low": 0.08366452659105056,
|
| 1147 |
+
"sacrebleu_ci_high": 0.25458346437891377
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
+
84,
|
| 1153 |
+
24,
|
| 1154 |
+
10,
|
| 1155 |
+
5
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
+
195,
|
| 1159 |
+
189,
|
| 1160 |
+
183,
|
| 1161 |
+
177
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
+
0.4307692307692308,
|
| 1165 |
+
0.12698412698412698,
|
| 1166 |
+
0.0546448087431694,
|
| 1167 |
+
0.028248587570621472
|
| 1168 |
],
|
| 1169 |
+
"bp": 0.9355069850316178,
|
| 1170 |
+
"sys_len": 195,
|
| 1171 |
"ref_len": 208,
|
| 1172 |
+
"sacrebleu": 0.0896771865535579,
|
| 1173 |
+
"score": 0.0896771865535579,
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
+
"score_ci_low": 0.026997714791803698,
|
| 1176 |
+
"score_ci_high": 0.18938435699619116,
|
| 1177 |
+
"sacrebleu_ci_low": 0.026997714791803698,
|
| 1178 |
+
"sacrebleu_ci_high": 0.18938435699619116
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
+
145,
|
| 1184 |
+
99,
|
| 1185 |
+
68,
|
| 1186 |
+
47
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
+
200,
|
| 1190 |
+
194,
|
| 1191 |
+
188,
|
| 1192 |
+
182
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
+
0.725,
|
| 1196 |
+
0.5103092783505155,
|
| 1197 |
+
0.36170212765957444,
|
| 1198 |
+
0.25824175824175827
|
| 1199 |
],
|
| 1200 |
+
"bp": 0.9607894391523232,
|
| 1201 |
+
"sys_len": 200,
|
| 1202 |
"ref_len": 208,
|
| 1203 |
+
"sacrebleu": 0.4142528184241974,
|
| 1204 |
+
"score": 0.4142528184241974,
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
+
"score_ci_low": 0.2751136761392381,
|
| 1207 |
+
"score_ci_high": 0.5016871777154166,
|
| 1208 |
+
"sacrebleu_ci_low": 0.2751136761392381,
|
| 1209 |
+
"sacrebleu_ci_high": 0.5016871777154166
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
+
139,
|
| 1215 |
+
80,
|
| 1216 |
53,
|
| 1217 |
+
37
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
+
220,
|
| 1221 |
+
214,
|
| 1222 |
+
208,
|
| 1223 |
+
202
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
+
0.6318181818181818,
|
| 1227 |
+
0.37383177570093457,
|
| 1228 |
+
0.2548076923076923,
|
| 1229 |
+
0.18316831683168316
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
+
"sys_len": 220,
|
| 1233 |
"ref_len": 208,
|
| 1234 |
+
"sacrebleu": 0.32402819799793336,
|
| 1235 |
+
"score": 0.32402819799793336,
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
+
"score_ci_low": 0.17933889414478565,
|
| 1238 |
+
"score_ci_high": 0.4618316777407589,
|
| 1239 |
+
"sacrebleu_ci_low": 0.17933889414478565,
|
| 1240 |
+
"sacrebleu_ci_high": 0.4618316777407589
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
+
118,
|
| 1246 |
+
59,
|
| 1247 |
+
33,
|
| 1248 |
+
20
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
+
207,
|
| 1252 |
+
201,
|
| 1253 |
+
195,
|
| 1254 |
+
189
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
+
0.5700483091787439,
|
| 1258 |
+
0.2935323383084577,
|
| 1259 |
+
0.16923076923076924,
|
| 1260 |
+
0.10582010582010583
|
| 1261 |
],
|
| 1262 |
+
"bp": 0.9951807322415573,
|
| 1263 |
+
"sys_len": 207,
|
| 1264 |
"ref_len": 208,
|
| 1265 |
+
"sacrebleu": 0.23283900945772085,
|
| 1266 |
+
"score": 0.23283900945772085,
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
+
"score_ci_low": 0.1202707272049383,
|
| 1269 |
+
"score_ci_high": 0.3401632328329729,
|
| 1270 |
+
"sacrebleu_ci_low": 0.1202707272049383,
|
| 1271 |
+
"sacrebleu_ci_high": 0.3401632328329729
|
| 1272 |
},
+ "score": 0.23292776447195437,
"score_name": "subsets_mean",
"num_of_instances": 90
},
+ "score": 0.3182745669848931,
"score_name": "subsets_mean",
"num_of_instances": 1537
}
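The per-language blocks above report the raw sacreBLEU components (n-gram "counts" and "totals", the derived "precisions", the brevity penalty "bp", and "sys_len"/"ref_len") next to the final "sacrebleu" value, and every subset-level "score" is labelled "score_name": "subsets_mean". A minimal Python sketch of how those numbers fit together is given below. It is not the unitxt implementation: the file path is a placeholder, the enclosing subset key (written here as "translation") is an assumption not visible in this diff, and the arithmetic is just the standard BLEU combination (geometric mean of the n-gram precisions scaled by the brevity penalty) plus a plain mean over member scores.

import json
import math

# Minimal sketch, assuming the results layout shown above; the path and the
# "translation" subset key are placeholders/assumptions, not taken from unitxt.
with open("path/to/evaluation_results.json") as f:
    subset = json.load(f)["results"]["translation"]

# 1) A per-language "sacrebleu" value follows from the reported components:
#    geometric mean of the four n-gram "precisions", scaled by "bp".
entry = subset["mt_flores_101_ara_eng"]
geo_mean = math.exp(
    sum(math.log(p) for p in entry["precisions"]) / len(entry["precisions"])
)
recomputed_bleu = entry["bp"] * geo_mean
# recomputed_bleu ~= 0.23347, matching entry["sacrebleu"] up to float rounding.

# 2) A subset-level "score" with "score_name": "subsets_mean" is the plain
#    arithmetic mean of its members' "score" values.
member_scores = [v["score"] for v in subset.values() if isinstance(v, dict)]
subsets_mean = sum(member_scores) / len(member_scores)
# subsets_mean ~= 0.232928 for the 15 mt_flores_101_* entries above.

The top-level "score" (0.3182745669848931 here) carries the same "subsets_mean" label, i.e. it is a mean over the benchmark's subsets rather than over the 1537 individual instances.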
results/bluebench/{2025-07-02T16-23-36_evaluation_results.json → 2025-07-03T10-08-21_evaluation_results.json}
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"environment_info": {
|
| 3 |
-
"timestamp_utc": "2025-07-
|
| 4 |
"command_line_invocation": [
|
| 5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
"--tasks",
|
|
@@ -42,7 +42,7 @@
|
|
| 42 |
"cache_dir": null
|
| 43 |
},
|
| 44 |
"unitxt_version": "1.25.0",
|
| 45 |
-
"unitxt_commit_hash": "
|
| 46 |
"python_version": "3.10.18",
|
| 47 |
"system": "Linux",
|
| 48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
@@ -176,36 +176,46 @@
|
|
| 176 |
"results": {
|
| 177 |
"bias": {
|
| 178 |
"safety_bbq_age": {
|
| 179 |
-
"accuracy": 0.
|
| 180 |
-
"accuracy_ci_low": 0.
|
| 181 |
-
"accuracy_ci_high":
|
| 182 |
"score_name": "accuracy",
|
| 183 |
-
"score": 0.
|
| 184 |
-
"score_ci_high":
|
| 185 |
-
"score_ci_low": 0.
|
| 186 |
"num_of_instances": 9
|
| 187 |
},
|
| 188 |
"safety_bbq_disability_status": {
|
| 189 |
-
"accuracy": 0.
|
| 190 |
-
"accuracy_ci_low": 0.
|
| 191 |
-
"accuracy_ci_high": 0.
|
| 192 |
"score_name": "accuracy",
|
| 193 |
-
"score": 0.
|
| 194 |
-
"score_ci_high": 0.
|
| 195 |
-
"score_ci_low": 0.
|
| 196 |
"num_of_instances": 9
|
| 197 |
},
|
| 198 |
"safety_bbq_gender_identity": {
|
| 199 |
-
"accuracy": 0.
|
| 200 |
"accuracy_ci_low": 0.3333333333333333,
|
| 201 |
-
"accuracy_ci_high":
|
| 202 |
"score_name": "accuracy",
|
| 203 |
-
"score": 0.
|
| 204 |
-
"score_ci_high":
|
| 205 |
"score_ci_low": 0.3333333333333333,
|
| 206 |
"num_of_instances": 9
|
| 207 |
},
|
| 208 |
"safety_bbq_nationality": {
|
| 209 |
"accuracy": 0.5555555555555556,
|
| 210 |
"accuracy_ci_low": 0.2222222222222222,
|
| 211 |
"accuracy_ci_high": 0.8888888888888888,
|
|
@@ -215,27 +225,27 @@
|
|
| 215 |
"score_ci_low": 0.2222222222222222,
|
| 216 |
"num_of_instances": 9
|
| 217 |
},
|
| 218 |
-
"
|
| 219 |
-
"accuracy": 0.
|
| 220 |
-
"accuracy_ci_low": 0.
|
| 221 |
"accuracy_ci_high": 1.0,
|
| 222 |
"score_name": "accuracy",
|
| 223 |
-
"score": 0.
|
| 224 |
"score_ci_high": 1.0,
|
| 225 |
-
"score_ci_low": 0.
|
| 226 |
"num_of_instances": 9
|
| 227 |
},
|
| 228 |
-
"
|
| 229 |
-
"accuracy":
|
| 230 |
-
"accuracy_ci_low":
|
| 231 |
-
"accuracy_ci_high":
|
| 232 |
"score_name": "accuracy",
|
| 233 |
-
"score":
|
| 234 |
-
"score_ci_high":
|
| 235 |
-
"score_ci_low":
|
| 236 |
"num_of_instances": 9
|
| 237 |
},
|
| 238 |
-
"
|
| 239 |
"accuracy": 0.6666666666666666,
|
| 240 |
"accuracy_ci_low": 0.3333333333333333,
|
| 241 |
"accuracy_ci_high": 0.8888888888888888,
|
|
@@ -245,16 +255,6 @@
|
|
| 245 |
"score_ci_low": 0.3333333333333333,
|
| 246 |
"num_of_instances": 9
|
| 247 |
},
|
| 248 |
-
"safety_bbq_race_x_ses": {
|
| 249 |
-
"accuracy": 0.7777777777777778,
|
| 250 |
-
"accuracy_ci_low": 0.4444444444444444,
|
| 251 |
-
"accuracy_ci_high": 1.0,
|
| 252 |
-
"score_name": "accuracy",
|
| 253 |
-
"score": 0.7777777777777778,
|
| 254 |
-
"score_ci_high": 1.0,
|
| 255 |
-
"score_ci_low": 0.4444444444444444,
|
| 256 |
-
"num_of_instances": 9
|
| 257 |
-
},
|
| 258 |
"safety_bbq_religion": {
|
| 259 |
"accuracy": 0.6666666666666666,
|
| 260 |
"accuracy_ci_low": 0.3333333333333333,
|
|
@@ -266,73 +266,73 @@
|
|
| 266 |
"num_of_instances": 9
|
| 267 |
},
|
| 268 |
"safety_bbq_ses": {
|
| 269 |
-
"accuracy": 0.
|
| 270 |
-
"accuracy_ci_low": 0.
|
| 271 |
-
"accuracy_ci_high": 0.
|
| 272 |
"score_name": "accuracy",
|
| 273 |
-
"score": 0.
|
| 274 |
-
"score_ci_high": 0.
|
| 275 |
-
"score_ci_low": 0.
|
| 276 |
"num_of_instances": 9
|
| 277 |
},
|
| 278 |
"safety_bbq_sexual_orientation": {
|
| 279 |
-
"accuracy": 0.
|
| 280 |
-
"accuracy_ci_low": 0.
|
| 281 |
-
"accuracy_ci_high": 0.
|
| 282 |
"score_name": "accuracy",
|
| 283 |
-
"score": 0.
|
| 284 |
-
"score_ci_high": 0.
|
| 285 |
-
"score_ci_low": 0.
|
| 286 |
"num_of_instances": 9
|
| 287 |
},
|
| 288 |
-
"score": 0.
|
| 289 |
"score_name": "subsets_mean",
|
| 290 |
"num_of_instances": 99
|
| 291 |
},
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
| 296 |
-
"score": 0.
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
-
"score": 0.
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
-
"f1_Person": 0.
|
| 307 |
-
"f1_Organization": 0.
|
| 308 |
-
"f1_Location": 0.
|
| 309 |
-
"f1_macro": 0.
|
| 310 |
-
"recall_macro": 0.
|
| 311 |
-
"precision_macro": 0.
|
| 312 |
-
"in_classes_support": 0.
|
| 313 |
-
"f1_micro": 0.
|
| 314 |
-
"recall_micro": 0.
|
| 315 |
-
"precision_micro": 0.
|
| 316 |
-
"score": 0.
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
-
"score_ci_low": 0.
|
| 319 |
-
"score_ci_high": 0.
|
| 320 |
-
"f1_micro_ci_low": 0.
|
| 321 |
-
"f1_micro_ci_high": 0.
|
| 322 |
},
|
| 323 |
-
"score": 0.
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
| 327 |
"knowledge": {
|
| 328 |
"mmlu_pro_biology": {
|
| 329 |
-
"accuracy": 0.
|
| 330 |
-
"accuracy_ci_low": 0.
|
| 331 |
-
"accuracy_ci_high": 0
|
| 332 |
"score_name": "accuracy",
|
| 333 |
-
"score": 0.
|
| 334 |
-
"score_ci_high": 0
|
| 335 |
-
"score_ci_low": 0.
|
| 336 |
"num_of_instances": 7
|
| 337 |
},
|
| 338 |
"mmlu_pro_business": {
|
|
@@ -346,23 +346,23 @@
|
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
| 349 |
-
"accuracy": 0.
|
| 350 |
"accuracy_ci_low": 0.0,
|
| 351 |
-
"accuracy_ci_high": 0.
|
| 352 |
"score_name": "accuracy",
|
| 353 |
-
"score": 0.
|
| 354 |
-
"score_ci_high": 0.
|
| 355 |
"score_ci_low": 0.0,
|
| 356 |
"num_of_instances": 7
|
| 357 |
},
|
| 358 |
"mmlu_pro_computer_science": {
|
| 359 |
-
"accuracy": 0.
|
| 360 |
-
"accuracy_ci_low": 0.
|
| 361 |
-
"accuracy_ci_high": 0
|
| 362 |
"score_name": "accuracy",
|
| 363 |
-
"score": 0.
|
| 364 |
-
"score_ci_high": 0
|
| 365 |
-
"score_ci_low": 0.
|
| 366 |
"num_of_instances": 7
|
| 367 |
},
|
| 368 |
"mmlu_pro_economics": {
|
|
@@ -376,22 +376,22 @@
|
|
| 376 |
"num_of_instances": 7
|
| 377 |
},
|
| 378 |
"mmlu_pro_engineering": {
|
| 379 |
-
"accuracy": 0.
|
| 380 |
"accuracy_ci_low": 0.0,
|
| 381 |
-
"accuracy_ci_high": 0.
|
| 382 |
"score_name": "accuracy",
|
| 383 |
-
"score": 0.
|
| 384 |
-
"score_ci_high": 0.
|
| 385 |
"score_ci_low": 0.0,
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
| 389 |
-
"accuracy": 0.
|
| 390 |
"accuracy_ci_low": 0.0,
|
| 391 |
-
"accuracy_ci_high": 0.
|
| 392 |
"score_name": "accuracy",
|
| 393 |
-
"score": 0.
|
| 394 |
-
"score_ci_high": 0.
|
| 395 |
"score_ci_low": 0.0,
|
| 396 |
"num_of_instances": 7
|
| 397 |
},
|
|
@@ -406,32 +406,32 @@
|
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
-
"accuracy": 0.
|
| 410 |
-
"accuracy_ci_low": 0.
|
| 411 |
-
"accuracy_ci_high": 0.
|
| 412 |
"score_name": "accuracy",
|
| 413 |
-
"score": 0.
|
| 414 |
-
"score_ci_high": 0.
|
| 415 |
-
"score_ci_low": 0.
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
| 419 |
-
"accuracy": 0.
|
| 420 |
"accuracy_ci_low": 0.14285714285714285,
|
| 421 |
"accuracy_ci_high": 0.8571428571428571,
|
| 422 |
"score_name": "accuracy",
|
| 423 |
-
"score": 0.
|
| 424 |
"score_ci_high": 0.8571428571428571,
|
| 425 |
"score_ci_low": 0.14285714285714285,
|
| 426 |
"num_of_instances": 7
|
| 427 |
},
|
| 428 |
"mmlu_pro_other": {
|
| 429 |
-
"accuracy": 0.
|
| 430 |
"accuracy_ci_low": 0.0,
|
| 431 |
-
"accuracy_ci_high": 0.
|
| 432 |
"score_name": "accuracy",
|
| 433 |
-
"score": 0.
|
| 434 |
-
"score_ci_high": 0.
|
| 435 |
"score_ci_low": 0.0,
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
|
@@ -446,12 +446,12 @@
|
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
| 449 |
-
"accuracy": 0.
|
| 450 |
"accuracy_ci_low": 0.0,
|
| 451 |
-
"accuracy_ci_high": 0.
|
| 452 |
"score_name": "accuracy",
|
| 453 |
-
"score": 0.
|
| 454 |
-
"score_ci_high": 0.
|
| 455 |
"score_ci_low": 0.0,
|
| 456 |
"num_of_instances": 7
|
| 457 |
},
|
|
@@ -465,38 +465,38 @@
|
|
| 465 |
"score_ci_low": 0.0,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
-
"score": 0.
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
-
"f1_macro": 0.
|
| 475 |
-
"f1_suggestive": 0.
|
| 476 |
-
"f1_descriptive": 0.36363636363636365,
|
| 477 |
"f1_generic": 0.0,
|
| 478 |
-
"
|
| 479 |
-
"
|
| 480 |
-
"
|
| 481 |
-
"
|
|
|
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
-
"score": 0.
|
| 484 |
-
"score_ci_high": 0.
|
| 485 |
-
"score_ci_low": 0.
|
| 486 |
"num_of_instances": 20,
|
| 487 |
-
"accuracy": 0.
|
| 488 |
-
"accuracy_ci_low": 0.
|
| 489 |
-
"accuracy_ci_high": 0.
|
| 490 |
-
"f1_micro": 0.
|
| 491 |
-
"f1_micro_ci_low": 0.
|
| 492 |
-
"f1_micro_ci_high": 0.
|
| 493 |
},
|
| 494 |
"legalbench_corporate_lobbying": {
|
| 495 |
-
"f1_macro": 0.
|
| 496 |
-
"f1_no": 0.
|
| 497 |
-
"f1_yes": 0.
|
| 498 |
"f1_macro_ci_low": 0.24812030075187969,
|
| 499 |
-
"f1_macro_ci_high": 0.
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
"score": 0.45,
|
| 502 |
"score_ci_high": 0.65,
|
|
@@ -510,228 +510,228 @@
|
|
| 510 |
"f1_micro_ci_high": 0.65
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
-
"f1_macro": 0.
|
| 514 |
-
"f1_conclusion": 0.
|
| 515 |
-
"f1_analysis": 0.
|
| 516 |
"f1_decree": 0.0,
|
| 517 |
-
"f1_issue": 0.
|
| 518 |
"f1_facts": 0.5,
|
| 519 |
"f1_procedural history": 1.0,
|
| 520 |
"f1_rule": 0.0,
|
| 521 |
-
"f1_macro_ci_low": 0.
|
| 522 |
-
"f1_macro_ci_high": 0.
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
-
"score": 0.
|
| 525 |
-
"score_ci_high": 0.
|
| 526 |
-
"score_ci_low": 0.
|
| 527 |
"num_of_instances": 20,
|
| 528 |
-
"accuracy": 0.
|
| 529 |
-
"accuracy_ci_low": 0.
|
| 530 |
-
"accuracy_ci_high": 0.
|
| 531 |
-
"f1_micro": 0.
|
| 532 |
-
"f1_micro_ci_low": 0.
|
| 533 |
-
"f1_micro_ci_high": 0.
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
-
"f1_macro": 0.
|
| 537 |
-
"f1_yes": 0.
|
| 538 |
-
"f1_no": 0.
|
| 539 |
-
"f1_macro_ci_low": 0.
|
| 540 |
-
"f1_macro_ci_high": 0.
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
"score": 0.6,
|
| 543 |
"score_ci_high": 0.8,
|
| 544 |
-
"score_ci_low": 0.
|
| 545 |
"num_of_instances": 20,
|
| 546 |
"accuracy": 0.6,
|
| 547 |
-
"accuracy_ci_low": 0.
|
| 548 |
"accuracy_ci_high": 0.8,
|
| 549 |
"f1_micro": 0.6,
|
| 550 |
-
"f1_micro_ci_low": 0.
|
| 551 |
"f1_micro_ci_high": 0.8
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
-
"f1_macro": 0.
|
| 555 |
-
"f1_yes": 0.
|
| 556 |
"f1_no": 0.7777777777777778,
|
| 557 |
-
"f1_macro_ci_low": 0.
|
| 558 |
-
"f1_macro_ci_high": 0.
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
-
"score": 0.
|
| 561 |
-
"score_ci_high": 0.
|
| 562 |
-
"score_ci_low": 0.
|
| 563 |
"num_of_instances": 20,
|
| 564 |
-
"accuracy": 0.
|
| 565 |
"accuracy_ci_low": 0.4,
|
| 566 |
"accuracy_ci_high": 0.8,
|
| 567 |
-
"f1_micro": 0.
|
| 568 |
-
"f1_micro_ci_low": 0.
|
| 569 |
-
"f1_micro_ci_high": 0.
|
| 570 |
},
|
| 571 |
-
"score": 0.
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
-
"f1_macro": 0.
|
| 578 |
"f1_cars": 0.6,
|
| 579 |
"f1_windows x": 0.0,
|
| 580 |
"f1_atheism": 0.0,
|
| 581 |
"f1_christianity": 0.0,
|
| 582 |
"f1_religion": 0.0,
|
| 583 |
"f1_medicine": 0.3333333333333333,
|
| 584 |
-
"f1_computer graphics": 0.
|
| 585 |
-
"f1_microsoft windows": 0.
|
| 586 |
"f1_middle east": 0.0,
|
| 587 |
-
"f1_politics": 0.
|
| 588 |
-
"f1_motorcycles": 0.
|
| 589 |
"f1_mac hardware": 0.3333333333333333,
|
| 590 |
-
"f1_pc hardware": 0.
|
| 591 |
"f1_for sale": 0.0,
|
|
|
|
| 592 |
"f1_guns": 0.5,
|
| 593 |
-
"
|
|
|
|
| 594 |
"f1_cryptography": 0.0,
|
| 595 |
-
"f1_baseball": 0.8,
|
| 596 |
-
"f1_electronics": 0.6666666666666666,
|
| 597 |
"f1_hockey": 0.0,
|
| 598 |
-
"f1_macro_ci_low": 0.
|
| 599 |
-
"f1_macro_ci_high": 0.
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
-
"score": 0.
|
| 602 |
-
"score_ci_high": 0.
|
| 603 |
-
"score_ci_low": 0.
|
| 604 |
"num_of_instances": 100,
|
| 605 |
-
"accuracy": 0.
|
| 606 |
-
"accuracy_ci_low": 0.
|
| 607 |
-
"accuracy_ci_high": 0.
|
| 608 |
-
"f1_micro": 0.
|
| 609 |
-
"f1_micro_ci_low": 0.
|
| 610 |
-
"f1_micro_ci_high": 0.
|
| 611 |
},
|
| 612 |
-
"score": 0.
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
-
"f1_macro": 0.
|
| 619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
| 620 |
-
"f1_credit card or prepaid card": 0.
|
| 621 |
"f1_money transfer or virtual currency or money service": 0.6666666666666666,
|
| 622 |
"f1_mortgage": 1.0,
|
| 623 |
-
"f1_debt collection": 0.
|
| 624 |
"f1_checking or savings account": 0.7692307692307693,
|
| 625 |
-
"f1_payday loan or title loan or personal loan":
|
| 626 |
-
"f1_macro_ci_low": 0.
|
| 627 |
-
"f1_macro_ci_high": 0.
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
-
"score": 0.
|
| 630 |
-
"score_ci_high": 0.
|
| 631 |
-
"score_ci_low": 0.
|
| 632 |
"num_of_instances": 100,
|
| 633 |
-
"accuracy": 0.
|
| 634 |
-
"accuracy_ci_low": 0.
|
| 635 |
-
"accuracy_ci_high": 0.
|
| 636 |
-
"f1_micro": 0.
|
| 637 |
-
"f1_micro_ci_low": 0.
|
| 638 |
-
"f1_micro_ci_high": 0.
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
-
"f1_macro": 0.
|
| 642 |
-
"f1_mortgages and loans": 0.
|
| 643 |
-
"f1_credit card": 0.
|
| 644 |
-
"f1_debt collection": 0.
|
| 645 |
-
"f1_retail banking": 0.
|
| 646 |
-
"f1_credit reporting": 0.
|
| 647 |
-
"f1_macro_ci_low": 0.
|
| 648 |
-
"f1_macro_ci_high": 0.
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
-
"score": 0.
|
| 651 |
-
"score_ci_high": 0.
|
| 652 |
-
"score_ci_low": 0.
|
| 653 |
"num_of_instances": 50,
|
| 654 |
-
"accuracy": 0.
|
| 655 |
-
"accuracy_ci_low": 0.
|
| 656 |
-
"accuracy_ci_high": 0.
|
| 657 |
-
"f1_micro": 0.
|
| 658 |
-
"f1_micro_ci_low": 0.
|
| 659 |
-
"f1_micro_ci_high": 0.
|
| 660 |
},
|
| 661 |
-
"score": 0.
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
-
"program_accuracy": 0.
|
| 669 |
-
"score": 0.
|
| 670 |
"score_name": "program_accuracy",
|
| 671 |
-
"execution_accuracy": 0.
|
| 672 |
-
"program_accuracy_ci_low": 0.
|
| 673 |
-
"program_accuracy_ci_high": 0.
|
| 674 |
-
"score_ci_low": 0.
|
| 675 |
-
"score_ci_high": 0.
|
| 676 |
-
"execution_accuracy_ci_low": 0.
|
| 677 |
-
"execution_accuracy_ci_high": 0.
|
| 678 |
},
|
| 679 |
-
"score": 0.
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
-
"precision": 0.
|
| 686 |
-
"recall": 0.
|
| 687 |
-
"f1": 0.
|
| 688 |
-
"precision_ci_low": 0.
|
| 689 |
-
"precision_ci_high": 0.
|
| 690 |
-
"recall_ci_low": 0.
|
| 691 |
-
"recall_ci_high": 0.
|
| 692 |
-
"f1_ci_low": 0.
|
| 693 |
-
"f1_ci_high": 0.
|
| 694 |
"score_name": "f1",
|
| 695 |
-
"score": 0.
|
| 696 |
-
"score_ci_high": 0.
|
| 697 |
-
"score_ci_low": 0.
|
| 698 |
"num_of_instances": 100,
|
| 699 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
| 700 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
| 701 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
| 702 |
-
"faithfullness_f1_token_overlap": 0.
|
| 703 |
-
"faithfullness_recall_token_overlap": 0.
|
| 704 |
-
"faithfullness_precision_token_overlap": 0.
|
| 705 |
-
"correctness_f1_token_overlap": 0.
|
| 706 |
-
"correctness_recall_token_overlap": 0.
|
| 707 |
-
"correctness_precision_token_overlap": 0.
|
| 708 |
},
|
| 709 |
-
"score": 0.
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
| 713 |
"reasoning": {
|
| 714 |
"hellaswag": {
|
| 715 |
-
"accuracy": 0.
|
| 716 |
-
"accuracy_ci_low": 0.
|
| 717 |
-
"accuracy_ci_high": 0.
|
| 718 |
"score_name": "accuracy",
|
| 719 |
-
"score": 0.
|
| 720 |
-
"score_ci_high": 0.
|
| 721 |
-
"score_ci_low": 0.
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
-
"accuracy": 0.
|
| 726 |
"accuracy_ci_low": 0.6,
|
| 727 |
-
"accuracy_ci_high": 0.
|
| 728 |
"score_name": "accuracy",
|
| 729 |
-
"score": 0.
|
| 730 |
-
"score_ci_high": 0.
|
| 731 |
"score_ci_low": 0.6,
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
-
"score": 0.
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
|
@@ -741,66 +741,66 @@
|
|
| 741 |
"severity_high": 0.0,
|
| 742 |
"severity_medium": 0.0,
|
| 743 |
"severity_low": 100.0,
|
| 744 |
-
"category_harmful_info": 0.
|
| 745 |
-
"category_deception": 0.
|
| 746 |
-
"category_explicit_content": 0.
|
| 747 |
-
"category_violence": 0.
|
| 748 |
-
"category_discrimination": 0.
|
| 749 |
-
"category_substance_abuse": 0.
|
| 750 |
-
"category_pii": 0.
|
| 751 |
-
"safety": 0.
|
| 752 |
-
"safety_ci_low": 0.
|
| 753 |
-
"safety_ci_high": 0.
|
| 754 |
"score_name": "safety",
|
| 755 |
-
"score": 0.
|
| 756 |
-
"score_ci_high": 0.
|
| 757 |
-
"score_ci_low": 0.
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
-
"score": 0.
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
-
"
|
| 768 |
-
"
|
| 769 |
-
"rougeL": 0.
|
| 770 |
-
"score": 0.
|
| 771 |
"score_name": "rougeL",
|
| 772 |
-
"rouge2": 0.
|
| 773 |
-
"
|
| 774 |
-
"
|
| 775 |
-
"
|
| 776 |
-
"
|
| 777 |
-
"rougeL_ci_low": 0.
|
| 778 |
-
"rougeL_ci_high": 0.
|
| 779 |
-
"score_ci_low": 0.
|
| 780 |
-
"score_ci_high": 0.
|
| 781 |
-
"rouge2_ci_low": 0.
|
| 782 |
-
"rouge2_ci_high": 0.
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
-
"
|
| 787 |
-
"
|
| 788 |
-
"rougeL": 0.
|
| 789 |
-
"score": 0.
|
| 790 |
"score_name": "rougeL",
|
| 791 |
-
"rouge2": 0.
|
| 792 |
-
"
|
| 793 |
-
"
|
| 794 |
-
"
|
| 795 |
-
"
|
| 796 |
-
"rougeL_ci_low": 0.
|
| 797 |
-
"rougeL_ci_high": 0.
|
| 798 |
-
"score_ci_low": 0.
|
| 799 |
-
"score_ci_high": 0.
|
| 800 |
-
"rouge2_ci_low": 0.
|
| 801 |
-
"rouge2_ci_high": 0.
|
| 802 |
},
|
| 803 |
-
"score": 0.
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
@@ -808,196 +808,196 @@
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
],
|
| 816 |
"totals": [
|
|
|
|
| 817 |
211,
|
| 818 |
205,
|
| 819 |
-
199
|
| 820 |
-
193
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
-
0.
|
| 824 |
-
0.
|
| 825 |
-
0.
|
| 826 |
-
0.
|
| 827 |
],
|
| 828 |
"bp": 1.0,
|
| 829 |
-
"sys_len":
|
| 830 |
"ref_len": 208,
|
| 831 |
-
"sacrebleu": 0.
|
| 832 |
-
"score": 0.
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
-
"score_ci_low": 0.
|
| 835 |
-
"score_ci_high": 0.
|
| 836 |
-
"sacrebleu_ci_low": 0.
|
| 837 |
-
"sacrebleu_ci_high": 0.
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
31
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
-
0.
|
| 855 |
-
0.
|
| 856 |
-
0.
|
| 857 |
-
0.
|
| 858 |
],
|
| 859 |
"bp": 1.0,
|
| 860 |
-
"sys_len":
|
| 861 |
"ref_len": 208,
|
| 862 |
-
"sacrebleu": 0.
|
| 863 |
-
"score": 0.
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
-
"score_ci_low": 0.
|
| 866 |
-
"score_ci_high": 0.
|
| 867 |
-
"sacrebleu_ci_low": 0.
|
| 868 |
-
"sacrebleu_ci_high": 0.
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
5
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
-
0.
|
| 886 |
-
0.
|
| 887 |
-
0.
|
| 888 |
-
0.
|
| 889 |
],
|
| 890 |
-
"bp":
|
| 891 |
-
"sys_len":
|
| 892 |
"ref_len": 209,
|
| 893 |
-
"sacrebleu": 0.
|
| 894 |
-
"score": 0.
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
-
"score_ci_low": 0.
|
| 897 |
-
"score_ci_high": 0.
|
| 898 |
-
"sacrebleu_ci_low": 0.
|
| 899 |
-
"sacrebleu_ci_high": 0.
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
-
0.
|
| 917 |
-
0.
|
| 918 |
-
0.
|
| 919 |
-
0.
|
| 920 |
],
|
| 921 |
"bp": 1.0,
|
| 922 |
-
"sys_len":
|
| 923 |
"ref_len": 216,
|
| 924 |
-
"sacrebleu": 0.
|
| 925 |
-
"score": 0.
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
-
"score_ci_low": 0.
|
| 928 |
-
"score_ci_high": 0.
|
| 929 |
-
"sacrebleu_ci_low": 0.
|
| 930 |
-
"sacrebleu_ci_high": 0.
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
-
0.
|
| 948 |
-
0.
|
| 949 |
-
0.
|
| 950 |
-
0.
|
| 951 |
],
|
| 952 |
"bp": 1.0,
|
| 953 |
-
"sys_len":
|
| 954 |
"ref_len": 235,
|
| 955 |
-
"sacrebleu": 0.
|
| 956 |
-
"score": 0.
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
-
"score_ci_low": 0.
|
| 959 |
-
"score_ci_high": 0.
|
| 960 |
-
"sacrebleu_ci_low": 0.
|
| 961 |
-
"sacrebleu_ci_high": 0.
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
-
|
| 967 |
-
|
| 968 |
-
|
| 969 |
-
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
-
0.
|
| 979 |
-
0.
|
| 980 |
-
0.
|
| 981 |
-
0.
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
-
"sys_len":
|
| 985 |
"ref_len": 249,
|
| 986 |
-
"sacrebleu": 0.
|
| 987 |
-
"score": 0.
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
-
"score_ci_low": 0.
|
| 990 |
-
"score_ci_high": 0.
|
| 991 |
-
"sacrebleu_ci_low": 0.
|
| 992 |
-
"sacrebleu_ci_high": 0.
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
217,
|
|
@@ -1006,275 +1006,275 @@
|
|
| 1006 |
199
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
-
0.
|
| 1010 |
-
0.
|
| 1011 |
-
0.
|
| 1012 |
-
0.
|
| 1013 |
],
|
| 1014 |
"bp": 0.977221952990032,
|
| 1015 |
"sys_len": 217,
|
| 1016 |
"ref_len": 222,
|
| 1017 |
-
"sacrebleu": 0.
|
| 1018 |
-
"score": 0.
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
-
"score_ci_low": 0.
|
| 1021 |
-
"score_ci_high": 0.
|
| 1022 |
-
"sacrebleu_ci_low": 0.
|
| 1023 |
-
"sacrebleu_ci_high": 0.
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
-
0.
|
| 1041 |
-
0.
|
| 1042 |
-
0.
|
| 1043 |
-
0.
|
| 1044 |
],
|
| 1045 |
-
"bp": 0.
|
| 1046 |
-
"sys_len":
|
| 1047 |
"ref_len": 230,
|
| 1048 |
-
"sacrebleu": 0.
|
| 1049 |
-
"score": 0.
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
-
"score_ci_low": 0.
|
| 1052 |
-
"score_ci_high": 0.
|
| 1053 |
-
"sacrebleu_ci_low": 0.
|
| 1054 |
-
"sacrebleu_ci_high": 0.
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
-
|
| 1066 |
-
|
| 1067 |
-
|
| 1068 |
-
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
-
0.
|
| 1072 |
-
0.
|
| 1073 |
-
0.
|
| 1074 |
-
0.
|
| 1075 |
],
|
| 1076 |
-
"bp": 0.
|
| 1077 |
-
"sys_len":
|
| 1078 |
"ref_len": 243,
|
| 1079 |
-
"sacrebleu": 0.
|
| 1080 |
-
"score": 0.
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
-
"score_ci_low": 0.
|
| 1083 |
-
"score_ci_high": 0.
|
| 1084 |
-
"sacrebleu_ci_low": 0.
|
| 1085 |
-
"sacrebleu_ci_high": 0.
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
-
0.
|
| 1103 |
-
0.
|
| 1104 |
-
0.
|
| 1105 |
-
0.
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
-
"sys_len":
|
| 1109 |
"ref_len": 208,
|
| 1110 |
-
"sacrebleu": 0.
|
| 1111 |
-
"score": 0.
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
-
"score_ci_low": 0.
|
| 1114 |
-
"score_ci_high": 0.
|
| 1115 |
-
"sacrebleu_ci_low": 0.
|
| 1116 |
-
"sacrebleu_ci_high": 0.
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
-
0.
|
| 1134 |
-
0.
|
| 1135 |
-
0.
|
| 1136 |
-
0.
|
| 1137 |
],
|
| 1138 |
"bp": 1.0,
|
| 1139 |
-
"sys_len":
|
| 1140 |
"ref_len": 208,
|
| 1141 |
-
"sacrebleu": 0.
|
| 1142 |
-
"score": 0.
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
-
"score_ci_low": 0.
|
| 1145 |
-
"score_ci_high": 0.
|
| 1146 |
-
"sacrebleu_ci_low": 0.
|
| 1147 |
-
"sacrebleu_ci_high": 0.
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
-
|
| 1161 |
-
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
-
0.
|
| 1165 |
-
0.
|
| 1166 |
-
0.
|
| 1167 |
-
0.
|
| 1168 |
],
|
| 1169 |
-
"bp": 0.
|
| 1170 |
-
"sys_len":
|
| 1171 |
"ref_len": 208,
|
| 1172 |
-
"sacrebleu": 0.
|
| 1173 |
-
"score": 0.
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
-
"score_ci_low": 0.
|
| 1176 |
-
"score_ci_high": 0.
|
| 1177 |
-
"sacrebleu_ci_low": 0.
|
| 1178 |
-
"sacrebleu_ci_high": 0.
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
-
|
| 1184 |
-
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
-
0.
|
| 1196 |
-
0.
|
| 1197 |
-
0.
|
| 1198 |
-
0.
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
-
"sys_len":
|
| 1202 |
"ref_len": 208,
|
| 1203 |
-
"sacrebleu": 0.
|
| 1204 |
-
"score": 0.
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
-
"score_ci_low": 0.
|
| 1207 |
-
"score_ci_high": 0.
|
| 1208 |
-
"sacrebleu_ci_low": 0.
|
| 1209 |
-
"sacrebleu_ci_high": 0.
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
-
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
-
0.
|
| 1227 |
-
0.
|
| 1228 |
-
0.
|
| 1229 |
-
0.
|
| 1230 |
],
|
| 1231 |
-
"bp":
|
| 1232 |
-
"sys_len":
|
| 1233 |
"ref_len": 208,
|
| 1234 |
-
"sacrebleu": 0.
|
| 1235 |
-
"score": 0.
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
-
"score_ci_low": 0.
|
| 1238 |
-
"score_ci_high": 0.
|
| 1239 |
-
"sacrebleu_ci_low": 0.
|
| 1240 |
-
"sacrebleu_ci_high": 0.
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
-
|
| 1246 |
-
|
| 1247 |
49,
|
| 1248 |
33
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
-
0.
|
| 1258 |
-
0.
|
| 1259 |
-
0.
|
| 1260 |
-
0.
|
| 1261 |
],
|
| 1262 |
"bp": 1.0,
|
| 1263 |
-
"sys_len":
|
| 1264 |
"ref_len": 208,
|
| 1265 |
-
"sacrebleu": 0.
|
| 1266 |
-
"score": 0.
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
-
"score_ci_low": 0.
|
| 1269 |
-
"score_ci_high": 0.
|
| 1270 |
-
"sacrebleu_ci_low": 0.
|
| 1271 |
-
"sacrebleu_ci_high": 0.
|
| 1272 |
},
|
| 1273 |
-
"score": 0.
|
| 1274 |
"score_name": "subsets_mean",
|
| 1275 |
"num_of_instances": 90
|
| 1276 |
},
|
| 1277 |
-
"score": 0.
|
| 1278 |
"score_name": "subsets_mean",
|
| 1279 |
"num_of_instances": 1537
|
| 1280 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"environment_info": {
|
| 3 |
+
"timestamp_utc": "2025-07-03T14:08:17.472494Z",
|
| 4 |
"command_line_invocation": [
|
| 5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
"--tasks",
|
|
|
|
| 42 |
"cache_dir": null
|
| 43 |
},
|
| 44 |
"unitxt_version": "1.25.0",
|
| 45 |
+
"unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
|
| 46 |
"python_version": "3.10.18",
|
| 47 |
"system": "Linux",
|
| 48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
|
| 176 |
"results": {
|
| 177 |
"bias": {
|
| 178 |
"safety_bbq_age": {
|
| 179 |
+
"accuracy": 0.6666666666666666,
|
| 180 |
+
"accuracy_ci_low": 0.3333333333333333,
|
| 181 |
+
"accuracy_ci_high": 0.8888888888888888,
|
| 182 |
"score_name": "accuracy",
|
| 183 |
+
"score": 0.6666666666666666,
|
| 184 |
+
"score_ci_high": 0.8888888888888888,
|
| 185 |
+
"score_ci_low": 0.3333333333333333,
|
| 186 |
"num_of_instances": 9
|
| 187 |
},
|
| 188 |
"safety_bbq_disability_status": {
|
| 189 |
+
"accuracy": 0.5555555555555556,
|
| 190 |
+
"accuracy_ci_low": 0.2222222222222222,
|
| 191 |
+
"accuracy_ci_high": 0.8888888888888888,
|
| 192 |
"score_name": "accuracy",
|
| 193 |
+
"score": 0.5555555555555556,
|
| 194 |
+
"score_ci_high": 0.8888888888888888,
|
| 195 |
+
"score_ci_low": 0.2222222222222222,
|
| 196 |
"num_of_instances": 9
|
| 197 |
},
|
| 198 |
"safety_bbq_gender_identity": {
|
| 199 |
+
"accuracy": 0.6666666666666666,
|
| 200 |
"accuracy_ci_low": 0.3333333333333333,
|
| 201 |
+
"accuracy_ci_high": 0.8888888888888888,
|
| 202 |
"score_name": "accuracy",
|
| 203 |
+
"score": 0.6666666666666666,
|
| 204 |
+
"score_ci_high": 0.8888888888888888,
|
| 205 |
"score_ci_low": 0.3333333333333333,
|
| 206 |
"num_of_instances": 9
|
| 207 |
},
|
| 208 |
"safety_bbq_nationality": {
|
| 209 |
+
"accuracy": 0.4444444444444444,
|
| 210 |
+
"accuracy_ci_low": 0.1111111111111111,
|
| 211 |
+
"accuracy_ci_high": 0.7777777777777778,
|
| 212 |
+
"score_name": "accuracy",
|
| 213 |
+
"score": 0.4444444444444444,
|
| 214 |
+
"score_ci_high": 0.7777777777777778,
|
| 215 |
+
"score_ci_low": 0.1111111111111111,
|
| 216 |
+
"num_of_instances": 9
|
| 217 |
+
},
|
| 218 |
+
"safety_bbq_physical_appearance": {
|
| 219 |
"accuracy": 0.5555555555555556,
|
| 220 |
"accuracy_ci_low": 0.2222222222222222,
|
| 221 |
"accuracy_ci_high": 0.8888888888888888,
|
|
|
|
| 225 |
"score_ci_low": 0.2222222222222222,
|
| 226 |
"num_of_instances": 9
|
| 227 |
},
|
| 228 |
+
"safety_bbq_race_ethnicity": {
|
| 229 |
+
"accuracy": 0.8888888888888888,
|
| 230 |
+
"accuracy_ci_low": 0.5310928992288233,
|
| 231 |
"accuracy_ci_high": 1.0,
|
| 232 |
"score_name": "accuracy",
|
| 233 |
+
"score": 0.8888888888888888,
|
| 234 |
"score_ci_high": 1.0,
|
| 235 |
+
"score_ci_low": 0.5310928992288233,
|
| 236 |
"num_of_instances": 9
|
| 237 |
},
|
| 238 |
+
"safety_bbq_race_x_gender": {
|
| 239 |
+
"accuracy": 0.4444444444444444,
|
| 240 |
+
"accuracy_ci_low": 0.1111111111111111,
|
| 241 |
+
"accuracy_ci_high": 0.7777777777777778,
|
| 242 |
"score_name": "accuracy",
|
| 243 |
+
"score": 0.4444444444444444,
|
| 244 |
+
"score_ci_high": 0.7777777777777778,
|
| 245 |
+
"score_ci_low": 0.1111111111111111,
|
| 246 |
"num_of_instances": 9
|
| 247 |
},
|
| 248 |
+
"safety_bbq_race_x_ses": {
|
| 249 |
"accuracy": 0.6666666666666666,
|
| 250 |
"accuracy_ci_low": 0.3333333333333333,
|
| 251 |
"accuracy_ci_high": 0.8888888888888888,
|
|
|
|
| 255 |
"score_ci_low": 0.3333333333333333,
|
| 256 |
"num_of_instances": 9
|
| 257 |
},
| 258 |
"safety_bbq_religion": {
|
| 259 |
"accuracy": 0.6666666666666666,
|
| 260 |
"accuracy_ci_low": 0.3333333333333333,
|
|
|
|
| 266 |
"num_of_instances": 9
|
| 267 |
},
|
| 268 |
"safety_bbq_ses": {
|
| 269 |
+
"accuracy": 0.5555555555555556,
|
| 270 |
+
"accuracy_ci_low": 0.2222222222222222,
|
| 271 |
+
"accuracy_ci_high": 0.8888888888888888,
|
| 272 |
"score_name": "accuracy",
|
| 273 |
+
"score": 0.5555555555555556,
|
| 274 |
+
"score_ci_high": 0.8888888888888888,
|
| 275 |
+
"score_ci_low": 0.2222222222222222,
|
| 276 |
"num_of_instances": 9
|
| 277 |
},
|
| 278 |
"safety_bbq_sexual_orientation": {
|
| 279 |
+
"accuracy": 0.5555555555555556,
|
| 280 |
+
"accuracy_ci_low": 0.2222222222222222,
|
| 281 |
+
"accuracy_ci_high": 0.8888888888888888,
|
| 282 |
"score_name": "accuracy",
|
| 283 |
+
"score": 0.5555555555555556,
|
| 284 |
+
"score_ci_high": 0.8888888888888888,
|
| 285 |
+
"score_ci_low": 0.2222222222222222,
|
| 286 |
"num_of_instances": 9
|
| 287 |
},
|
| 288 |
+
"score": 0.6060606060606061,
|
| 289 |
"score_name": "subsets_mean",
|
| 290 |
"num_of_instances": 99
|
| 291 |
},
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.25139664804469275,
|
| 296 |
+
"score": 0.25139664804469275,
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
+
"score": 0.25139664804469275,
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
+
"f1_Person": 0.5,
|
| 307 |
+
"f1_Organization": 0.5084745762711865,
|
| 308 |
+
"f1_Location": 0.34146341463414637,
|
| 309 |
+
"f1_macro": 0.4499793303017776,
|
| 310 |
+
"recall_macro": 0.4062284334023465,
|
| 311 |
+
"precision_macro": 0.5293144553106602,
|
| 312 |
+
"in_classes_support": 0.9838709677419355,
|
| 313 |
+
"f1_micro": 0.45255474452554745,
|
| 314 |
+
"recall_micro": 0.41333333333333333,
|
| 315 |
+
"precision_micro": 0.5,
|
| 316 |
+
"score": 0.45255474452554745,
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
+
"score_ci_low": 0.3227264456627763,
|
| 319 |
+
"score_ci_high": 0.5928025470401035,
|
| 320 |
+
"f1_micro_ci_low": 0.3227264456627763,
|
| 321 |
+
"f1_micro_ci_high": 0.5928025470401035
|
| 322 |
},
|
| 323 |
+
"score": 0.45255474452554745,
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
| 327 |
"knowledge": {
|
| 328 |
"mmlu_pro_biology": {
|
| 329 |
+
"accuracy": 0.7142857142857143,
|
| 330 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 331 |
+
"accuracy_ci_high": 1.0,
|
| 332 |
"score_name": "accuracy",
|
| 333 |
+
"score": 0.7142857142857143,
|
| 334 |
+
"score_ci_high": 1.0,
|
| 335 |
+
"score_ci_low": 0.2857142857142857,
|
| 336 |
"num_of_instances": 7
|
| 337 |
},
|
| 338 |
"mmlu_pro_business": {
|
|
|
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
| 349 |
+
"accuracy": 0.2857142857142857,
|
| 350 |
"accuracy_ci_low": 0.0,
|
| 351 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 352 |
"score_name": "accuracy",
|
| 353 |
+
"score": 0.2857142857142857,
|
| 354 |
+
"score_ci_high": 0.7142857142857143,
|
| 355 |
"score_ci_low": 0.0,
|
| 356 |
"num_of_instances": 7
|
| 357 |
},
|
| 358 |
"mmlu_pro_computer_science": {
|
| 359 |
+
"accuracy": 0.7142857142857143,
|
| 360 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 361 |
+
"accuracy_ci_high": 1.0,
|
| 362 |
"score_name": "accuracy",
|
| 363 |
+
"score": 0.7142857142857143,
|
| 364 |
+
"score_ci_high": 1.0,
|
| 365 |
+
"score_ci_low": 0.2857142857142857,
|
| 366 |
"num_of_instances": 7
|
| 367 |
},
|
| 368 |
"mmlu_pro_economics": {
|
|
|
|
| 376 |
"num_of_instances": 7
|
| 377 |
},
|
| 378 |
"mmlu_pro_engineering": {
|
| 379 |
+
"accuracy": 0.2857142857142857,
|
| 380 |
"accuracy_ci_low": 0.0,
|
| 381 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 382 |
"score_name": "accuracy",
|
| 383 |
+
"score": 0.2857142857142857,
|
| 384 |
+
"score_ci_high": 0.7142857142857143,
|
| 385 |
"score_ci_low": 0.0,
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
| 389 |
+
"accuracy": 0.2857142857142857,
|
| 390 |
"accuracy_ci_low": 0.0,
|
| 391 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 392 |
"score_name": "accuracy",
|
| 393 |
+
"score": 0.2857142857142857,
|
| 394 |
+
"score_ci_high": 0.7142857142857143,
|
| 395 |
"score_ci_low": 0.0,
|
| 396 |
"num_of_instances": 7
|
| 397 |
},
|
|
|
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
+
"accuracy": 0.42857142857142855,
|
| 410 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 411 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 412 |
"score_name": "accuracy",
|
| 413 |
+
"score": 0.42857142857142855,
|
| 414 |
+
"score_ci_high": 0.8571428571428571,
|
| 415 |
+
"score_ci_low": 0.14285714285714285,
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
| 419 |
+
"accuracy": 0.42857142857142855,
|
| 420 |
"accuracy_ci_low": 0.14285714285714285,
|
| 421 |
"accuracy_ci_high": 0.8571428571428571,
|
| 422 |
"score_name": "accuracy",
|
| 423 |
+
"score": 0.42857142857142855,
|
| 424 |
"score_ci_high": 0.8571428571428571,
|
| 425 |
"score_ci_low": 0.14285714285714285,
|
| 426 |
"num_of_instances": 7
|
| 427 |
},
|
| 428 |
"mmlu_pro_other": {
|
| 429 |
+
"accuracy": 0.14285714285714285,
|
| 430 |
"accuracy_ci_low": 0.0,
|
| 431 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 432 |
"score_name": "accuracy",
|
| 433 |
+
"score": 0.14285714285714285,
|
| 434 |
+
"score_ci_high": 0.5714285714285714,
|
| 435 |
"score_ci_low": 0.0,
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
|
|
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
| 449 |
+
"accuracy": 0.2857142857142857,
|
| 450 |
"accuracy_ci_low": 0.0,
|
| 451 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 452 |
"score_name": "accuracy",
|
| 453 |
+
"score": 0.2857142857142857,
|
| 454 |
+
"score_ci_high": 0.7142857142857143,
|
| 455 |
"score_ci_low": 0.0,
|
| 456 |
"num_of_instances": 7
|
| 457 |
},
|
|
|
|
| 465 |
"score_ci_low": 0.0,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
+
"score": 0.35714285714285715,
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
+
"f1_macro": 0.4031746031746032,
|
| 475 |
+
"f1_suggestive": 0.3333333333333333,
|
|
|
|
| 476 |
"f1_generic": 0.0,
|
| 477 |
+
"f1_descriptive": 0.4444444444444444,
|
| 478 |
+
"f1_fanciful": 0.5714285714285714,
|
| 479 |
+
"f1_arbitrary": 0.6666666666666666,
|
| 480 |
+
"f1_macro_ci_low": 0.2350094328437234,
|
| 481 |
+
"f1_macro_ci_high": 0.6281441125339357,
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
+
"score": 0.45,
|
| 484 |
+
"score_ci_high": 0.65,
|
| 485 |
+
"score_ci_low": 0.25,
|
| 486 |
"num_of_instances": 20,
|
| 487 |
+
"accuracy": 0.45,
|
| 488 |
+
"accuracy_ci_low": 0.25,
|
| 489 |
+
"accuracy_ci_high": 0.65,
|
| 490 |
+
"f1_micro": 0.45,
|
| 491 |
+
"f1_micro_ci_low": 0.25,
|
| 492 |
+
"f1_micro_ci_high": 0.65
|
| 493 |
},
|
| 494 |
"legalbench_corporate_lobbying": {
|
| 495 |
+
"f1_macro": 0.44862155388471175,
|
| 496 |
+
"f1_no": 0.47619047619047616,
|
| 497 |
+
"f1_yes": 0.42105263157894735,
|
| 498 |
"f1_macro_ci_low": 0.24812030075187969,
|
| 499 |
+
"f1_macro_ci_high": 0.696969696969697,
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
"score": 0.45,
|
| 502 |
"score_ci_high": 0.65,
|
|
|
|
| 510 |
"f1_micro_ci_high": 0.65
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
+
"f1_macro": 0.3128220985363842,
|
| 514 |
+
"f1_conclusion": 0.18181818181818182,
|
| 515 |
+
"f1_analysis": 0.2222222222222222,
|
| 516 |
"f1_decree": 0.0,
|
| 517 |
+
"f1_issue": 0.2857142857142857,
|
| 518 |
"f1_facts": 0.5,
|
| 519 |
"f1_procedural history": 1.0,
|
| 520 |
"f1_rule": 0.0,
|
| 521 |
+
"f1_macro_ci_low": 0.16666666666666666,
|
| 522 |
+
"f1_macro_ci_high": 0.5627298511545029,
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
+
"score": 0.2631578947368421,
|
| 525 |
+
"score_ci_high": 0.4864864864864865,
|
| 526 |
+
"score_ci_low": 0.10256410256410256,
|
| 527 |
"num_of_instances": 20,
|
| 528 |
+
"accuracy": 0.25,
|
| 529 |
+
"accuracy_ci_low": 0.1,
|
| 530 |
+
"accuracy_ci_high": 0.49471586405580864,
|
| 531 |
+
"f1_micro": 0.2631578947368421,
|
| 532 |
+
"f1_micro_ci_low": 0.10256410256410256,
|
| 533 |
+
"f1_micro_ci_high": 0.4864864864864865
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
+
"f1_macro": 0.595959595959596,
|
| 537 |
+
"f1_yes": 0.6363636363636364,
|
| 538 |
+
"f1_no": 0.5555555555555556,
|
| 539 |
+
"f1_macro_ci_low": 0.3732193732193732,
|
| 540 |
+
"f1_macro_ci_high": 0.8,
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
"score": 0.6,
|
| 543 |
"score_ci_high": 0.8,
|
| 544 |
+
"score_ci_low": 0.35,
|
| 545 |
"num_of_instances": 20,
|
| 546 |
"accuracy": 0.6,
|
| 547 |
+
"accuracy_ci_low": 0.35,
|
| 548 |
"accuracy_ci_high": 0.8,
|
| 549 |
"f1_micro": 0.6,
|
| 550 |
+
"f1_micro_ci_low": 0.35,
|
| 551 |
"f1_micro_ci_high": 0.8
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
+
"f1_macro": 0.7638888888888888,
|
| 555 |
+
"f1_yes": 0.75,
|
| 556 |
"f1_no": 0.7777777777777778,
|
| 557 |
+
"f1_macro_ci_low": 0.5298844011348174,
|
| 558 |
+
"f1_macro_ci_high": 0.8985941651727045,
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
+
"score": 0.7647058823529411,
|
| 561 |
+
"score_ci_high": 0.8888888888888888,
|
| 562 |
+
"score_ci_low": 0.5454545454545454,
|
| 563 |
"num_of_instances": 20,
|
| 564 |
+
"accuracy": 0.65,
|
| 565 |
"accuracy_ci_low": 0.4,
|
| 566 |
"accuracy_ci_high": 0.8,
|
| 567 |
+
"f1_micro": 0.7647058823529411,
|
| 568 |
+
"f1_micro_ci_low": 0.5454545454545454,
|
| 569 |
+
"f1_micro_ci_high": 0.8888888888888888
|
| 570 |
},
|
| 571 |
+
"score": 0.5055727554179567,
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
+
"f1_macro": 0.2910756911460236,
|
| 578 |
"f1_cars": 0.6,
|
| 579 |
"f1_windows x": 0.0,
|
| 580 |
"f1_atheism": 0.0,
|
| 581 |
"f1_christianity": 0.0,
|
| 582 |
"f1_religion": 0.0,
|
| 583 |
"f1_medicine": 0.3333333333333333,
|
| 584 |
+
"f1_computer graphics": 0.2608695652173913,
|
| 585 |
+
"f1_microsoft windows": 0.6,
|
| 586 |
"f1_middle east": 0.0,
|
| 587 |
+
"f1_politics": 0.6666666666666666,
|
| 588 |
+
"f1_motorcycles": 0.25,
|
| 589 |
"f1_mac hardware": 0.3333333333333333,
|
| 590 |
+
"f1_pc hardware": 0.5,
|
| 591 |
"f1_for sale": 0.0,
|
| 592 |
+
"f1_electronics": 0.5,
|
| 593 |
"f1_guns": 0.5,
|
| 594 |
+
"f1_baseball": 0.7058823529411765,
|
| 595 |
+
"f1_space": 0.5714285714285714,
|
| 596 |
"f1_cryptography": 0.0,
|
|
|
|
|
|
|
| 597 |
"f1_hockey": 0.0,
|
| 598 |
+
"f1_macro_ci_low": 0.22542343425145048,
|
| 599 |
+
"f1_macro_ci_high": 0.3817631536924331,
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
+
"score": 0.36470588235294116,
|
| 602 |
+
"score_ci_high": 0.4678362573099415,
|
| 603 |
+
"score_ci_low": 0.26400794870774236,
|
| 604 |
"num_of_instances": 100,
|
| 605 |
+
"accuracy": 0.31,
|
| 606 |
+
"accuracy_ci_low": 0.22,
|
| 607 |
+
"accuracy_ci_high": 0.41,
|
| 608 |
+
"f1_micro": 0.36470588235294116,
|
| 609 |
+
"f1_micro_ci_low": 0.26400794870774236,
|
| 610 |
+
"f1_micro_ci_high": 0.4678362573099415
|
| 611 |
},
|
| 612 |
+
"score": 0.36470588235294116,
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
+
"f1_macro": 0.7523191638915732,
|
| 619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9130434782608695,
|
| 620 |
+
"f1_credit card or prepaid card": 0.2857142857142857,
|
| 621 |
"f1_money transfer or virtual currency or money service": 0.6666666666666666,
|
| 622 |
"f1_mortgage": 1.0,
|
| 623 |
+
"f1_debt collection": 0.631578947368421,
|
| 624 |
"f1_checking or savings account": 0.7692307692307693,
|
| 625 |
+
"f1_payday loan or title loan or personal loan": 1.0,
|
| 626 |
+
"f1_macro_ci_low": 0.6489086297233181,
|
| 627 |
+
"f1_macro_ci_high": 0.9210195820841641,
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
+
"score": 0.8247422680412371,
|
| 630 |
+
"score_ci_high": 0.8923076923076924,
|
| 631 |
+
"score_ci_low": 0.743197883853095,
|
| 632 |
"num_of_instances": 100,
|
| 633 |
+
"accuracy": 0.8,
|
| 634 |
+
"accuracy_ci_low": 0.71,
|
| 635 |
+
"accuracy_ci_high": 0.87,
|
| 636 |
+
"f1_micro": 0.8247422680412371,
|
| 637 |
+
"f1_micro_ci_low": 0.743197883853095,
|
| 638 |
+
"f1_micro_ci_high": 0.8923076923076924
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
+
"f1_macro": 0.6613469059121233,
|
| 642 |
+
"f1_mortgages and loans": 0.8695652173913043,
|
| 643 |
+
"f1_credit card": 0.5714285714285714,
|
| 644 |
+
"f1_debt collection": 0.625,
|
| 645 |
+
"f1_retail banking": 0.5,
|
| 646 |
+
"f1_credit reporting": 0.7407407407407407,
|
| 647 |
+
"f1_macro_ci_low": 0.5279754605261872,
|
| 648 |
+
"f1_macro_ci_high": 0.8082982300291591,
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
+
"score": 0.6868686868686869,
|
| 651 |
+
"score_ci_high": 0.8163265306122449,
|
| 652 |
+
"score_ci_low": 0.54,
|
| 653 |
"num_of_instances": 50,
|
| 654 |
+
"accuracy": 0.68,
|
| 655 |
+
"accuracy_ci_low": 0.54,
|
| 656 |
+
"accuracy_ci_high": 0.8,
|
| 657 |
+
"f1_micro": 0.6868686868686869,
|
| 658 |
+
"f1_micro_ci_low": 0.54,
|
| 659 |
+
"f1_micro_ci_high": 0.8163265306122449
|
| 660 |
},
|
| 661 |
+
"score": 0.755805477454962,
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
+
"program_accuracy": 0.1,
|
| 669 |
+
"score": 0.1,
|
| 670 |
"score_name": "program_accuracy",
|
| 671 |
+
"execution_accuracy": 0.09,
|
| 672 |
+
"program_accuracy_ci_low": 0.05,
|
| 673 |
+
"program_accuracy_ci_high": 0.17,
|
| 674 |
+
"score_ci_low": 0.05,
|
| 675 |
+
"score_ci_high": 0.17,
|
| 676 |
+
"execution_accuracy_ci_low": 0.04,
|
| 677 |
+
"execution_accuracy_ci_high": 0.16
|
| 678 |
},
|
| 679 |
+
"score": 0.1,
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
+
"precision": 0.5282260960979881,
|
| 686 |
+
"recall": 0.518990758836778,
|
| 687 |
+
"f1": 0.4781146759785926,
|
| 688 |
+
"precision_ci_low": 0.4905036545891592,
|
| 689 |
+
"precision_ci_high": 0.5712847375973066,
|
| 690 |
+
"recall_ci_low": 0.4792082705307863,
|
| 691 |
+
"recall_ci_high": 0.5643322796851563,
|
| 692 |
+
"f1_ci_low": 0.4464661091001953,
|
| 693 |
+
"f1_ci_high": 0.5127460974737836,
|
| 694 |
"score_name": "f1",
|
| 695 |
+
"score": 0.4781146759785926,
|
| 696 |
+
"score_ci_high": 0.5127460974737836,
|
| 697 |
+
"score_ci_low": 0.4464661091001953,
|
| 698 |
"num_of_instances": 100,
|
| 699 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6747818207740783,
|
| 700 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.683358971774578,
|
| 701 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.678565673828125,
|
| 702 |
+
"faithfullness_f1_token_overlap": 0.3328173876076052,
|
| 703 |
+
"faithfullness_recall_token_overlap": 0.23833647131370628,
|
| 704 |
+
"faithfullness_precision_token_overlap": 0.7221546643746082,
|
| 705 |
+
"correctness_f1_token_overlap": 0.4781146759785926,
|
| 706 |
+
"correctness_recall_token_overlap": 0.518990758836778,
|
| 707 |
+
"correctness_precision_token_overlap": 0.5282260960979881
|
| 708 |
},
|
| 709 |
+
"score": 0.4781146759785926,
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
| 713 |
"reasoning": {
|
| 714 |
"hellaswag": {
|
| 715 |
+
"accuracy": 0.43,
|
| 716 |
+
"accuracy_ci_low": 0.33,
|
| 717 |
+
"accuracy_ci_high": 0.52,
|
| 718 |
"score_name": "accuracy",
|
| 719 |
+
"score": 0.43,
|
| 720 |
+
"score_ci_high": 0.52,
|
| 721 |
+
"score_ci_low": 0.33,
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
+
"accuracy": 0.69,
|
| 726 |
"accuracy_ci_low": 0.6,
|
| 727 |
+
"accuracy_ci_high": 0.77,
|
| 728 |
"score_name": "accuracy",
|
| 729 |
+
"score": 0.69,
|
| 730 |
+
"score_ci_high": 0.77,
|
| 731 |
"score_ci_low": 0.6,
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
+
"score": 0.5599999999999999,
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
| 741 |
"severity_high": 0.0,
|
| 742 |
"severity_medium": 0.0,
|
| 743 |
"severity_low": 100.0,
|
| 744 |
+
"category_harmful_info": 0.8555861621695972,
|
| 745 |
+
"category_deception": 0.8145618479360234,
|
| 746 |
+
"category_explicit_content": 0.8516158367497753,
|
| 747 |
+
"category_violence": 0.8195424805617049,
|
| 748 |
+
"category_discrimination": 0.7993254798671439,
|
| 749 |
+
"category_substance_abuse": 0.8629376106792026,
|
| 750 |
+
"category_pii": 0.8319368116833545,
|
| 751 |
+
"safety": 0.8337413073946824,
|
| 752 |
+
"safety_ci_low": 0.8171390500734657,
|
| 753 |
+
"safety_ci_high": 0.8483087143425718,
|
| 754 |
"score_name": "safety",
|
| 755 |
+
"score": 0.8337413073946824,
|
| 756 |
+
"score_ci_high": 0.8483087143425718,
|
| 757 |
+
"score_ci_low": 0.8171390500734657,
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
+
"score": 0.8337413073946824,
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
+
"rougeLsum": 0.35257144277181895,
|
| 768 |
+
"rouge1": 0.4099938802688273,
|
| 769 |
+
"rougeL": 0.2819180609443961,
|
| 770 |
+
"score": 0.2819180609443961,
|
| 771 |
"score_name": "rougeL",
|
| 772 |
+
"rouge2": 0.19902668429160264,
|
| 773 |
+
"rougeLsum_ci_low": 0.3308328413239909,
|
| 774 |
+
"rougeLsum_ci_high": 0.3760952815354367,
|
| 775 |
+
"rouge1_ci_low": 0.38577957980842964,
|
| 776 |
+
"rouge1_ci_high": 0.43278596944262226,
|
| 777 |
+
"rougeL_ci_low": 0.2631055438860382,
|
| 778 |
+
"rougeL_ci_high": 0.3045234232476088,
|
| 779 |
+
"score_ci_low": 0.2631055438860382,
|
| 780 |
+
"score_ci_high": 0.3045234232476088,
|
| 781 |
+
"rouge2_ci_low": 0.18104502049358925,
|
| 782 |
+
"rouge2_ci_high": 0.2197355387706385
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
+
"rougeLsum": 0.08735851266090196,
|
| 787 |
+
"rouge1": 0.10541346314519143,
|
| 788 |
+
"rougeL": 0.07789268452336062,
|
| 789 |
+
"score": 0.07789268452336062,
|
| 790 |
"score_name": "rougeL",
|
| 791 |
+
"rouge2": 0.014214026654707938,
|
| 792 |
+
"rougeLsum_ci_low": 0.07611511674805775,
|
| 793 |
+
"rougeLsum_ci_high": 0.09861302754891539,
|
| 794 |
+
"rouge1_ci_low": 0.09141522803497244,
|
| 795 |
+
"rouge1_ci_high": 0.12087153728069493,
|
| 796 |
+
"rougeL_ci_low": 0.06842881641572805,
|
| 797 |
+
"rougeL_ci_high": 0.08804076692156412,
|
| 798 |
+
"score_ci_low": 0.06842881641572805,
|
| 799 |
+
"score_ci_high": 0.08804076692156412,
|
| 800 |
+
"rouge2_ci_low": 0.010280478508026637,
|
| 801 |
+
"rouge2_ci_high": 0.019836643868933537
|
| 802 |
},
|
| 803 |
+
"score": 0.17990537273387835,
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
+
138,
|
| 812 |
+
86,
|
| 813 |
+
60,
|
| 814 |
+
41
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
+
217,
|
| 818 |
211,
|
| 819 |
205,
|
| 820 |
+
199
|
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
+
0.6359447004608295,
|
| 824 |
+
0.4075829383886256,
|
| 825 |
+
0.29268292682926833,
|
| 826 |
+
0.20603015075376885
|
| 827 |
],
|
| 828 |
"bp": 1.0,
|
| 829 |
+
"sys_len": 217,
|
| 830 |
"ref_len": 208,
|
| 831 |
+
"sacrebleu": 0.35358259555851773,
|
| 832 |
+
"score": 0.35358259555851773,
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
+
"score_ci_low": 0.1947162858319295,
|
| 835 |
+
"score_ci_high": 0.5123065246583021,
|
| 836 |
+
"sacrebleu_ci_low": 0.1947162858319295,
|
| 837 |
+
"sacrebleu_ci_high": 0.5123065246583021
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
+
138,
|
| 843 |
+
75,
|
| 844 |
+
46,
|
| 845 |
31
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
+
214,
|
| 849 |
+
208,
|
| 850 |
+
202,
|
| 851 |
+
196
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
+
0.6448598130841121,
|
| 855 |
+
0.3605769230769231,
|
| 856 |
+
0.2277227722772277,
|
| 857 |
+
0.15816326530612243
|
| 858 |
],
|
| 859 |
"bp": 1.0,
|
| 860 |
+
"sys_len": 214,
|
| 861 |
"ref_len": 208,
|
| 862 |
+
"sacrebleu": 0.30251285307136316,
|
| 863 |
+
"score": 0.30251285307136316,
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
+
"score_ci_low": 0.19775244274276757,
|
| 866 |
+
"score_ci_high": 0.42106139492364764,
|
| 867 |
+
"sacrebleu_ci_low": 0.19775244274276757,
|
| 868 |
+
"sacrebleu_ci_high": 0.42106139492364764
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
+
83,
|
| 874 |
+
30,
|
| 875 |
+
14,
|
| 876 |
5
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
+
199,
|
| 880 |
+
193,
|
| 881 |
+
187,
|
| 882 |
+
181
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
+
0.41708542713567837,
|
| 886 |
+
0.15544041450777202,
|
| 887 |
+
0.0748663101604278,
|
| 888 |
+
0.027624309392265192
|
| 889 |
],
|
| 890 |
+
"bp": 0.9509904521556576,
|
| 891 |
+
"sys_len": 199,
|
| 892 |
"ref_len": 209,
|
| 893 |
+
"sacrebleu": 0.10233350793678746,
|
| 894 |
+
"score": 0.10233350793678746,
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
+
"score_ci_low": 0.05050155487196502,
|
| 897 |
+
"score_ci_high": 0.17307034784752537,
|
| 898 |
+
"sacrebleu_ci_low": 0.05050155487196502,
|
| 899 |
+
"sacrebleu_ci_high": 0.17307034784752537
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
+
125,
|
| 905 |
+
67,
|
| 906 |
+
39,
|
| 907 |
+
25
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
+
222,
|
| 911 |
+
216,
|
| 912 |
+
210,
|
| 913 |
+
204
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
+
0.5630630630630631,
|
| 917 |
+
0.3101851851851852,
|
| 918 |
+
0.18571428571428572,
|
| 919 |
+
0.12254901960784315
|
| 920 |
],
|
| 921 |
"bp": 1.0,
|
| 922 |
+
"sys_len": 222,
|
| 923 |
"ref_len": 216,
|
| 924 |
+
"sacrebleu": 0.25109225147784964,
|
| 925 |
+
"score": 0.25109225147784964,
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
+
"score_ci_low": 0.1529878549251057,
|
| 928 |
+
"score_ci_high": 0.3720906061784551,
|
| 929 |
+
"sacrebleu_ci_low": 0.1529878549251057,
|
| 930 |
+
"sacrebleu_ci_high": 0.3720906061784551
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
+
160,
|
| 936 |
+
103,
|
| 937 |
+
72,
|
| 938 |
+
50
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
+
238,
|
| 942 |
+
232,
|
| 943 |
+
226,
|
| 944 |
+
220
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
+
0.6722689075630253,
|
| 948 |
+
0.4439655172413793,
|
| 949 |
+
0.3185840707964602,
|
| 950 |
+
0.22727272727272727
|
| 951 |
],
|
| 952 |
"bp": 1.0,
|
| 953 |
+
"sys_len": 238,
|
| 954 |
"ref_len": 235,
|
| 955 |
+
"sacrebleu": 0.38341218564513063,
|
| 956 |
+
"score": 0.38341218564513063,
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
+
"score_ci_low": 0.2731562527670829,
|
| 959 |
+
"score_ci_high": 0.4801134379288344,
|
| 960 |
+
"sacrebleu_ci_low": 0.2731562527670829,
|
| 961 |
+
"sacrebleu_ci_high": 0.4801134379288344
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
+
136,
|
| 967 |
+
52,
|
| 968 |
+
25,
|
| 969 |
+
15
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
+
295,
|
| 973 |
+
289,
|
| 974 |
+
283,
|
| 975 |
+
277
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
+
0.4610169491525424,
|
| 979 |
+
0.17993079584775085,
|
| 980 |
+
0.08833922261484099,
|
| 981 |
+
0.05415162454873646
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
+
"sys_len": 295,
|
| 985 |
"ref_len": 249,
|
| 986 |
+
"sacrebleu": 0.14113894412300834,
|
| 987 |
+
"score": 0.14113894412300834,
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
+
"score_ci_low": 0.09931106147481285,
|
| 990 |
+
"score_ci_high": 0.21049896887236444,
|
| 991 |
+
"sacrebleu_ci_low": 0.09931106147481285,
|
| 992 |
+
"sacrebleu_ci_high": 0.21049896887236444
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
+
164,
|
| 998 |
+
122,
|
| 999 |
+
95,
|
| 1000 |
+
75
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
217,
|
| 1006 |
199
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
+
0.7557603686635944,
|
| 1010 |
+
0.5781990521327014,
|
| 1011 |
+
0.4634146341463415,
|
| 1012 |
+
0.37688442211055273
|
| 1013 |
],
|
| 1014 |
"bp": 0.977221952990032,
|
| 1015 |
"sys_len": 217,
|
| 1016 |
"ref_len": 222,
|
| 1017 |
+
"sacrebleu": 0.5136331789412277,
|
| 1018 |
+
"score": 0.5136331789412277,
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
+
"score_ci_low": 0.4383895318184532,
|
| 1021 |
+
"score_ci_high": 0.6687737636839535,
|
| 1022 |
+
"sacrebleu_ci_low": 0.4383895318184532,
|
| 1023 |
+
"sacrebleu_ci_high": 0.6687737636839535
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
+
150,
|
| 1029 |
+
89,
|
| 1030 |
+
66,
|
| 1031 |
+
51
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
+
228,
|
| 1035 |
+
222,
|
| 1036 |
+
216,
|
| 1037 |
+
210
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
+
0.6578947368421052,
|
| 1041 |
+
0.4009009009009009,
|
| 1042 |
+
0.3055555555555556,
|
| 1043 |
+
0.24285714285714285
|
| 1044 |
],
|
| 1045 |
+
"bp": 0.9912664313028773,
|
| 1046 |
+
"sys_len": 228,
|
| 1047 |
"ref_len": 230,
|
| 1048 |
+
"sacrebleu": 0.3707652531456926,
|
| 1049 |
+
"score": 0.3707652531456926,
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
+
"score_ci_low": 0.24832285270330914,
|
| 1052 |
+
"score_ci_high": 0.5155607169133788,
|
| 1053 |
+
"sacrebleu_ci_low": 0.24832285270330914,
|
| 1054 |
+
"sacrebleu_ci_high": 0.5155607169133788
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
+
144,
|
| 1060 |
+
76,
|
| 1061 |
+
46,
|
| 1062 |
+
29
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
+
221,
|
| 1066 |
+
215,
|
| 1067 |
+
209,
|
| 1068 |
+
203
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
+
0.6515837104072397,
|
| 1072 |
+
0.35348837209302325,
|
| 1073 |
+
0.22009569377990432,
|
| 1074 |
+
0.14285714285714288
|
| 1075 |
],
|
| 1076 |
+
"bp": 0.9052469393768031,
|
| 1077 |
+
"sys_len": 221,
|
| 1078 |
"ref_len": 243,
|
| 1079 |
+
"sacrebleu": 0.2640777306505383,
|
| 1080 |
+
"score": 0.2640777306505383,
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
+
"score_ci_low": 0.1937710060893329,
|
| 1083 |
+
"score_ci_high": 0.3011423563537887,
|
| 1084 |
+
"sacrebleu_ci_low": 0.1937710060893329,
|
| 1085 |
+
"sacrebleu_ci_high": 0.3011423563537887
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
+
151,
|
| 1091 |
+
94,
|
| 1092 |
+
62,
|
| 1093 |
+
39
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
+
212,
|
| 1097 |
+
206,
|
| 1098 |
+
200,
|
| 1099 |
+
194
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
+
0.7122641509433962,
|
| 1103 |
+
0.4563106796116505,
|
| 1104 |
+
0.31,
|
| 1105 |
+
0.20103092783505155
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
+
"sys_len": 212,
|
| 1109 |
"ref_len": 208,
|
| 1110 |
+
"sacrebleu": 0.3772520189081424,
|
| 1111 |
+
"score": 0.3772520189081424,
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
+
"score_ci_low": 0.2635438049622767,
|
| 1114 |
+
"score_ci_high": 0.504082453205032,
|
| 1115 |
+
"sacrebleu_ci_low": 0.2635438049622767,
|
| 1116 |
+
"sacrebleu_ci_high": 0.504082453205032
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
+
128,
|
| 1122 |
+
70,
|
| 1123 |
+
43,
|
| 1124 |
+
29
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
+
220,
|
| 1128 |
+
214,
|
| 1129 |
+
208,
|
| 1130 |
+
202
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
+
0.5818181818181818,
|
| 1134 |
+
0.3271028037383178,
|
| 1135 |
+
0.20673076923076925,
|
| 1136 |
+
0.14356435643564358
|
| 1137 |
],
|
| 1138 |
"bp": 1.0,
|
| 1139 |
+
"sys_len": 220,
|
| 1140 |
"ref_len": 208,
|
| 1141 |
+
"sacrebleu": 0.27414531356236105,
|
| 1142 |
+
"score": 0.27414531356236105,
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
+
"score_ci_low": 0.11294664927548931,
|
| 1145 |
+
"score_ci_high": 0.40418288954231424,
|
| 1146 |
+
"sacrebleu_ci_low": 0.11294664927548931,
|
| 1147 |
+
"sacrebleu_ci_high": 0.40418288954231424
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
+
113,
|
| 1153 |
+
58,
|
| 1154 |
+
29,
|
| 1155 |
+
16
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
+
206,
|
| 1159 |
+
200,
|
| 1160 |
+
194,
|
| 1161 |
+
188
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
+
0.5485436893203883,
|
| 1165 |
+
0.29,
|
| 1166 |
+
0.14948453608247422,
|
| 1167 |
+
0.0851063829787234
|
| 1168 |
],
|
| 1169 |
+
"bp": 0.9903382397772544,
|
| 1170 |
+
"sys_len": 206,
|
| 1171 |
"ref_len": 208,
|
| 1172 |
+
"sacrebleu": 0.21005133898189757,
|
| 1173 |
+
"score": 0.21005133898189757,
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
+
"score_ci_low": 0.12086953193591132,
|
| 1176 |
+
"score_ci_high": 0.3598579039274656,
|
| 1177 |
+
"sacrebleu_ci_low": 0.12086953193591132,
|
| 1178 |
+
"sacrebleu_ci_high": 0.3598579039274656
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
+
159,
|
| 1184 |
+
108,
|
| 1185 |
+
77,
|
| 1186 |
+
57
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
+
213,
|
| 1190 |
+
207,
|
| 1191 |
+
201,
|
| 1192 |
+
195
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
+
0.7464788732394366,
|
| 1196 |
+
0.5217391304347826,
|
| 1197 |
+
0.38308457711442784,
|
| 1198 |
+
0.2923076923076923
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
+
"sys_len": 213,
|
| 1202 |
"ref_len": 208,
|
| 1203 |
+
"sacrebleu": 0.4569844903443308,
|
| 1204 |
+
"score": 0.4569844903443308,
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
+
"score_ci_low": 0.2921302619401382,
|
| 1207 |
+
"score_ci_high": 0.6012686604638934,
|
| 1208 |
+
"sacrebleu_ci_low": 0.2921302619401382,
|
| 1209 |
+
"sacrebleu_ci_high": 0.6012686604638934
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
+
145,
|
| 1215 |
+
86,
|
| 1216 |
+
52,
|
| 1217 |
+
34
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
+
207,
|
| 1221 |
+
201,
|
| 1222 |
+
195,
|
| 1223 |
+
189
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
+
0.7004830917874396,
|
| 1227 |
+
0.42786069651741293,
|
| 1228 |
+
0.26666666666666666,
|
| 1229 |
+
0.1798941798941799
|
| 1230 |
],
|
| 1231 |
+
"bp": 0.9951807322415573,
|
| 1232 |
+
"sys_len": 207,
|
| 1233 |
"ref_len": 208,
|
| 1234 |
+
"sacrebleu": 0.34460647466230765,
|
| 1235 |
+
"score": 0.34460647466230765,
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
+
"score_ci_low": 0.24134693897071277,
|
| 1238 |
+
"score_ci_high": 0.477848957265904,
|
| 1239 |
+
"sacrebleu_ci_low": 0.24134693897071277,
|
| 1240 |
+
"sacrebleu_ci_high": 0.477848957265904
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
+
141,
|
| 1246 |
+
81,
|
| 1247 |
49,
|
| 1248 |
33
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
+
223,
|
| 1252 |
+
217,
|
| 1253 |
+
211,
|
| 1254 |
+
205
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
+
0.632286995515695,
|
| 1258 |
+
0.37327188940092165,
|
| 1259 |
+
0.23222748815165875,
|
| 1260 |
+
0.16097560975609757
|
| 1261 |
],
|
| 1262 |
"bp": 1.0,
|
| 1263 |
+
"sys_len": 223,
|
| 1264 |
"ref_len": 208,
|
| 1265 |
+
"sacrebleu": 0.30648082606980787,
|
| 1266 |
+
"score": 0.30648082606980787,
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
+
"score_ci_low": 0.16929340790512817,
|
| 1269 |
+
"score_ci_high": 0.3608174576081579,
|
| 1270 |
+
"sacrebleu_ci_low": 0.16929340790512817,
|
| 1271 |
+
"sacrebleu_ci_high": 0.3608174576081579
|
| 1272 |
},
|
| 1273 |
+
"score": 0.31013793087193087,
|
| 1274 |
"score_name": "subsets_mean",
|
| 1275 |
"num_of_instances": 90
|
| 1276 |
},
|
| 1277 |
+
"score": 0.4427029429214344,
|
| 1278 |
"score_name": "subsets_mean",
|
| 1279 |
"num_of_instances": 1537
|
| 1280 |
}
|
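For orientation, each evaluation_results.json in this commit shares the same shape: an "environment_info" block followed by a "results" block in which every benchmark group (for example "reasoning" or "translation") reports a "subsets_mean" aggregate plus one nested entry per subset carrying its own "score" and "score_name". The short Python sketch below is illustrative only (it is not part of the repository; the file path and variable names are assumptions) and shows one way to walk that structure and print the headline numbers.

import json

# Illustrative path: one of the result files listed in this commit.
path = "results/bluebench/2025-07-03T10-34-07_evaluation_results.json"

with open(path) as f:
    report = json.load(f)

for group, body in report["results"].items():
    # Group-level aggregate (score_name is "subsets_mean" in these files).
    print(f"{group}: {body['score']:.4f} ({body['score_name']}, n={body['num_of_instances']})")
    for name, entry in body.items():
        # Nested dicts are the individual subsets; scalar keys are the aggregates.
        if isinstance(entry, dict):
            print(f"  {name}: {entry['score']:.4f} ({entry['score_name']})")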
results/bluebench/2025-07-03T10-34-07_evaluation_results.json ADDED
@@ -0,0 +1,1281 @@
| 1 |
+
{
|
| 2 |
+
"environment_info": {
|
| 3 |
+
"timestamp_utc": "2025-07-03T14:34:02.551035Z",
|
| 4 |
+
"command_line_invocation": [
|
| 5 |
+
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
+
"--tasks",
|
| 7 |
+
"benchmarks.bluebench",
|
| 8 |
+
"--model",
|
| 9 |
+
"cross_provider",
|
| 10 |
+
"--model_args",
|
| 11 |
+
"model_name=watsonx/meta-llama/llama-3-2-90b-vision-instruct,max_tokens=1024",
|
| 12 |
+
"--output_path",
|
| 13 |
+
"./results/bluebench",
|
| 14 |
+
"--log_samples",
|
| 15 |
+
"--trust_remote_code",
|
| 16 |
+
"--batch_size",
|
| 17 |
+
"8",
|
| 18 |
+
"--verbosity",
|
| 19 |
+
"ERROR"
|
| 20 |
+
],
|
| 21 |
+
"parsed_arguments": {
|
| 22 |
+
"tasks": [
|
| 23 |
+
"benchmarks.bluebench"
|
| 24 |
+
],
|
| 25 |
+
"split": "test",
|
| 26 |
+
"num_fewshots": null,
|
| 27 |
+
"limit": null,
|
| 28 |
+
"batch_size": 8,
|
| 29 |
+
"model": "watsonx/meta-llama/llama-3-2-90b-vision-instruct",
|
| 30 |
+
"model_args": {
|
| 31 |
+
"max_tokens": 1024
|
| 32 |
+
},
|
| 33 |
+
"gen_kwargs": null,
|
| 34 |
+
"chat_template_kwargs": null,
|
| 35 |
+
"output_path": "./results/bluebench",
|
| 36 |
+
"output_file_prefix": "evaluation_results",
|
| 37 |
+
"log_samples": true,
|
| 38 |
+
"verbosity": "ERROR",
|
| 39 |
+
"apply_chat_template": false,
|
| 40 |
+
"trust_remote_code": true,
|
| 41 |
+
"disable_hf_cache": false,
|
| 42 |
+
"cache_dir": null
|
| 43 |
+
},
|
| 44 |
+
"unitxt_version": "1.25.0",
|
| 45 |
+
"unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
|
| 46 |
+
"python_version": "3.10.18",
|
| 47 |
+
"system": "Linux",
|
| 48 |
+
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
| 49 |
+
"installed_packages": {
|
| 50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
| 51 |
+
"triton": "3.3.1",
|
| 52 |
+
"nltk": "3.9.1",
|
| 53 |
+
"anyio": "4.9.0",
|
| 54 |
+
"unitxt": "1.25.0",
|
| 55 |
+
"absl-py": "2.3.0",
|
| 56 |
+
"tiktoken": "0.9.0",
|
| 57 |
+
"charset-normalizer": "3.4.2",
|
| 58 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
| 59 |
+
"sympy": "1.14.0",
|
| 60 |
+
"mecab-ko": "1.0.1",
|
| 61 |
+
"httpcore": "1.0.9",
|
| 62 |
+
"litellm": "1.73.6",
|
| 63 |
+
"Jinja2": "3.1.6",
|
| 64 |
+
"jsonschema-specifications": "2025.4.1",
|
| 65 |
+
"pydantic_core": "2.33.2",
|
| 66 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
| 67 |
+
"tokenizers": "0.21.2",
|
| 68 |
+
"yarl": "1.20.1",
|
| 69 |
+
"portalocker": "3.2.0",
|
| 70 |
+
"pandas": "2.3.0",
|
| 71 |
+
"multiprocess": "0.70.16",
|
| 72 |
+
"jsonschema": "4.24.0",
|
| 73 |
+
"nvidia-nvjitlink-cu12": "12.6.85",
|
| 74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
| 75 |
+
"pydantic": "2.11.7",
|
| 76 |
+
"async-timeout": "5.0.1",
|
| 77 |
+
"annotated-types": "0.7.0",
|
| 78 |
+
"rouge_score": "0.1.2",
|
| 79 |
+
"contourpy": "1.3.2",
|
| 80 |
+
"aiosignal": "1.3.2",
|
| 81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
| 82 |
+
"openai": "1.93.0",
|
| 83 |
+
"six": "1.17.0",
|
| 84 |
+
"diskcache": "5.6.3",
|
| 85 |
+
"tqdm": "4.67.1",
|
| 86 |
+
"pyarrow": "20.0.0",
|
| 87 |
+
"h11": "0.16.0",
|
| 88 |
+
"zipp": "3.19.2",
|
| 89 |
+
"tzdata": "2025.2",
|
| 90 |
+
"bert-score": "0.3.13",
|
| 91 |
+
"setuptools": "80.9.0",
|
| 92 |
+
"referencing": "0.36.2",
|
| 93 |
+
"sacrebleu": "2.5.1",
|
| 94 |
+
"filelock": "3.18.0",
|
| 95 |
+
"urllib3": "2.5.0",
|
| 96 |
+
"scipy": "1.15.3",
|
| 97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
| 98 |
+
"kiwisolver": "1.4.8",
|
| 99 |
+
"networkx": "3.4.2",
|
| 100 |
+
"typing-inspection": "0.4.1",
|
| 101 |
+
"sniffio": "1.3.1",
|
| 102 |
+
"scikit-learn": "1.7.0",
|
| 103 |
+
"rpds-py": "0.26.0",
|
| 104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
| 105 |
+
"pip": "25.1.1",
|
| 106 |
+
"pillow": "11.3.0",
|
| 107 |
+
"fonttools": "4.58.4",
|
| 108 |
+
"datasets": "3.6.0",
|
| 109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
| 110 |
+
"cycler": "0.12.1",
|
| 111 |
+
"distro": "1.9.0",
|
| 112 |
+
"idna": "3.10",
|
| 113 |
+
"MarkupSafe": "3.0.2",
|
| 114 |
+
"frozenlist": "1.7.0",
|
| 115 |
+
"pyparsing": "3.2.3",
|
| 116 |
+
"jiter": "0.10.0",
|
| 117 |
+
"importlib_metadata": "8.0.0",
|
| 118 |
+
"packaging": "24.2",
|
| 119 |
+
"psutil": "7.0.0",
|
| 120 |
+
"mecab-ko-dic": "1.0.0",
|
| 121 |
+
"joblib": "1.5.1",
|
| 122 |
+
"fsspec": "2025.3.0",
|
| 123 |
+
"dill": "0.3.8",
|
| 124 |
+
"wheel": "0.45.1",
|
| 125 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
| 126 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
| 127 |
+
"lxml": "6.0.0",
|
| 128 |
+
"propcache": "0.3.2",
|
| 129 |
+
"numpy": "2.2.6",
|
| 130 |
+
"mpmath": "1.3.0",
|
| 131 |
+
"conllu": "6.0.0",
|
| 132 |
+
"huggingface-hub": "0.33.2",
|
| 133 |
+
"safetensors": "0.5.3",
|
| 134 |
+
"requests": "2.32.4",
|
| 135 |
+
"regex": "2024.11.6",
|
| 136 |
+
"aiohttp": "3.12.13",
|
| 137 |
+
"tabulate": "0.9.0",
|
| 138 |
+
"accelerate": "1.8.1",
|
| 139 |
+
"certifi": "2025.6.15",
|
| 140 |
+
"evaluate": "0.4.4",
|
| 141 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
| 142 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
| 143 |
+
"click": "8.2.1",
|
| 144 |
+
"typing_extensions": "4.12.2",
|
| 145 |
+
"attrs": "25.3.0",
|
| 146 |
+
"exceptiongroup": "1.3.0",
|
| 147 |
+
"transformers": "4.53.0",
|
| 148 |
+
"tenacity": "9.1.2",
|
| 149 |
+
"pytz": "2025.2",
|
| 150 |
+
"aiohappyeyeballs": "2.6.1",
|
| 151 |
+
"python-dateutil": "2.9.0.post0",
|
| 152 |
+
"torch": "2.7.1",
|
| 153 |
+
"python-dotenv": "1.1.1",
|
| 154 |
+
"multidict": "6.6.3",
|
| 155 |
+
"httpx": "0.28.1",
|
| 156 |
+
"matplotlib": "3.10.3",
|
| 157 |
+
"xxhash": "3.5.0",
|
| 158 |
+
"PyYAML": "6.0.2",
|
| 159 |
+
"colorama": "0.4.6",
|
| 160 |
+
"threadpoolctl": "3.6.0",
|
| 161 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
| 162 |
+
"hf-xet": "1.1.5",
|
| 163 |
+
"jaraco.collections": "5.1.0",
|
| 164 |
+
"tomli": "2.0.1",
|
| 165 |
+
"backports.tarfile": "1.2.0",
|
| 166 |
+
"jaraco.context": "5.3.0",
|
| 167 |
+
"typeguard": "4.3.0",
|
| 168 |
+
"autocommand": "2.2.2",
|
| 169 |
+
"jaraco.text": "3.12.1",
|
| 170 |
+
"more-itertools": "10.3.0",
|
| 171 |
+
"platformdirs": "4.2.2",
|
| 172 |
+
"inflect": "7.3.1",
|
| 173 |
+
"jaraco.functools": "4.0.1"
|
| 174 |
+
}
|
| 175 |
+
},
|
| 176 |
+
"results": {
|
| 177 |
+
"bias": {
|
| 178 |
+
"safety_bbq_age": {
|
| 179 |
+
"accuracy": 1.0,
|
| 180 |
+
"accuracy_ci_low": 1.0,
|
| 181 |
+
"accuracy_ci_high": 1.0,
|
| 182 |
+
"score_name": "accuracy",
|
| 183 |
+
"score": 1.0,
|
| 184 |
+
"score_ci_high": 1.0,
|
| 185 |
+
"score_ci_low": 1.0,
|
| 186 |
+
"num_of_instances": 9
|
| 187 |
+
},
|
| 188 |
+
"safety_bbq_disability_status": {
|
| 189 |
+
"accuracy": 1.0,
|
| 190 |
+
"accuracy_ci_low": 1.0,
|
| 191 |
+
"accuracy_ci_high": 1.0,
|
| 192 |
+
"score_name": "accuracy",
|
| 193 |
+
"score": 1.0,
|
| 194 |
+
"score_ci_high": 1.0,
|
| 195 |
+
"score_ci_low": 1.0,
|
| 196 |
+
"num_of_instances": 9
|
| 197 |
+
},
|
| 198 |
+
"safety_bbq_gender_identity": {
|
| 199 |
+
"accuracy": 1.0,
|
| 200 |
+
"accuracy_ci_low": 1.0,
|
| 201 |
+
"accuracy_ci_high": 1.0,
|
| 202 |
+
"score_name": "accuracy",
|
| 203 |
+
"score": 1.0,
|
| 204 |
+
"score_ci_high": 1.0,
|
| 205 |
+
"score_ci_low": 1.0,
|
| 206 |
+
"num_of_instances": 9
|
| 207 |
+
},
|
| 208 |
+
"safety_bbq_nationality": {
|
| 209 |
+
"accuracy": 1.0,
|
| 210 |
+
"accuracy_ci_low": 1.0,
|
| 211 |
+
"accuracy_ci_high": 1.0,
|
| 212 |
+
"score_name": "accuracy",
|
| 213 |
+
"score": 1.0,
|
| 214 |
+
"score_ci_high": 1.0,
|
| 215 |
+
"score_ci_low": 1.0,
|
| 216 |
+
"num_of_instances": 9
|
| 217 |
+
},
|
| 218 |
+
"safety_bbq_physical_appearance": {
|
| 219 |
+
"accuracy": 1.0,
|
| 220 |
+
"accuracy_ci_low": 1.0,
|
| 221 |
+
"accuracy_ci_high": 1.0,
|
| 222 |
+
"score_name": "accuracy",
|
| 223 |
+
"score": 1.0,
|
| 224 |
+
"score_ci_high": 1.0,
|
| 225 |
+
"score_ci_low": 1.0,
|
| 226 |
+
"num_of_instances": 9
|
| 227 |
+
},
|
| 228 |
+
"safety_bbq_race_ethnicity": {
|
| 229 |
+
"accuracy": 1.0,
|
| 230 |
+
"accuracy_ci_low": 1.0,
|
| 231 |
+
"accuracy_ci_high": 1.0,
|
| 232 |
+
"score_name": "accuracy",
|
| 233 |
+
"score": 1.0,
|
| 234 |
+
"score_ci_high": 1.0,
|
| 235 |
+
"score_ci_low": 1.0,
|
| 236 |
+
"num_of_instances": 9
|
| 237 |
+
},
|
| 238 |
+
"safety_bbq_race_x_gender": {
|
| 239 |
+
"accuracy": 1.0,
|
| 240 |
+
"accuracy_ci_low": 1.0,
|
| 241 |
+
"accuracy_ci_high": 1.0,
|
| 242 |
+
"score_name": "accuracy",
|
| 243 |
+
"score": 1.0,
|
| 244 |
+
"score_ci_high": 1.0,
|
| 245 |
+
"score_ci_low": 1.0,
|
| 246 |
+
"num_of_instances": 9
|
| 247 |
+
},
|
| 248 |
+
"safety_bbq_race_x_ses": {
|
| 249 |
+
"accuracy": 1.0,
|
| 250 |
+
"accuracy_ci_low": 1.0,
|
| 251 |
+
"accuracy_ci_high": 1.0,
|
| 252 |
+
"score_name": "accuracy",
|
| 253 |
+
"score": 1.0,
|
| 254 |
+
"score_ci_high": 1.0,
|
| 255 |
+
"score_ci_low": 1.0,
|
| 256 |
+
"num_of_instances": 9
|
| 257 |
+
},
|
| 258 |
+
"safety_bbq_religion": {
|
| 259 |
+
"accuracy": 1.0,
|
| 260 |
+
"accuracy_ci_low": 1.0,
|
| 261 |
+
"accuracy_ci_high": 1.0,
|
| 262 |
+
"score_name": "accuracy",
|
| 263 |
+
"score": 1.0,
|
| 264 |
+
"score_ci_high": 1.0,
|
| 265 |
+
"score_ci_low": 1.0,
|
| 266 |
+
"num_of_instances": 9
|
| 267 |
+
},
|
| 268 |
+
"safety_bbq_ses": {
|
| 269 |
+
"accuracy": 1.0,
|
| 270 |
+
"accuracy_ci_low": 1.0,
|
| 271 |
+
"accuracy_ci_high": 1.0,
|
| 272 |
+
"score_name": "accuracy",
|
| 273 |
+
"score": 1.0,
|
| 274 |
+
"score_ci_high": 1.0,
|
| 275 |
+
"score_ci_low": 1.0,
|
| 276 |
+
"num_of_instances": 9
|
| 277 |
+
},
|
| 278 |
+
"safety_bbq_sexual_orientation": {
|
| 279 |
+
"accuracy": 1.0,
|
| 280 |
+
"accuracy_ci_low": 1.0,
|
| 281 |
+
"accuracy_ci_high": 1.0,
|
| 282 |
+
"score_name": "accuracy",
|
| 283 |
+
"score": 1.0,
|
| 284 |
+
"score_ci_high": 1.0,
|
| 285 |
+
"score_ci_low": 1.0,
|
| 286 |
+
"num_of_instances": 9
|
| 287 |
+
},
|
| 288 |
+
"score": 1.0,
|
| 289 |
+
"score_name": "subsets_mean",
|
| 290 |
+
"num_of_instances": 99
|
| 291 |
+
},
|
| 292 |
+
"chatbot_abilities": {
|
| 293 |
+
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
+
"num_of_instances": 100,
|
| 295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.8711656441717791,
|
| 296 |
+
"score": 0.8711656441717791,
|
| 297 |
+
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
+
},
|
| 299 |
+
"score": 0.8711656441717791,
|
| 300 |
+
"score_name": "subsets_mean",
|
| 301 |
+
"num_of_instances": 100
|
| 302 |
+
},
|
| 303 |
+
"entity_extraction": {
|
| 304 |
+
"universal_ner_en_ewt": {
|
| 305 |
+
"num_of_instances": 100,
|
| 306 |
+
"f1_Person": 0.8260869565217391,
|
| 307 |
+
"f1_Organization": 0.6551724137931035,
|
| 308 |
+
"f1_Location": 0.7272727272727272,
|
| 309 |
+
"f1_macro": 0.7361773658625234,
|
| 310 |
+
"recall_macro": 0.7237750172532781,
|
| 311 |
+
"precision_macro": 0.7531400966183576,
|
| 312 |
+
"in_classes_support": 1.0,
|
| 313 |
+
"f1_micro": 0.7297297297297296,
|
| 314 |
+
"recall_micro": 0.72,
|
| 315 |
+
"precision_micro": 0.7397260273972602,
|
| 316 |
+
"score": 0.7297297297297296,
|
| 317 |
+
"score_name": "f1_micro",
|
| 318 |
+
"score_ci_low": 0.6619502313182618,
|
| 319 |
+
"score_ci_high": 0.7835819840150043,
|
| 320 |
+
"f1_micro_ci_low": 0.6619502313182618,
|
| 321 |
+
"f1_micro_ci_high": 0.7835819840150043
|
| 322 |
+
},
|
| 323 |
+
"score": 0.7297297297297296,
|
| 324 |
+
"score_name": "subsets_mean",
|
| 325 |
+
"num_of_instances": 100
|
| 326 |
+
},
|
| 327 |
+
"knowledge": {
|
| 328 |
+
"mmlu_pro_biology": {
|
| 329 |
+
"accuracy": 0.7142857142857143,
|
| 330 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 331 |
+
"accuracy_ci_high": 1.0,
|
| 332 |
+
"score_name": "accuracy",
|
| 333 |
+
"score": 0.7142857142857143,
|
| 334 |
+
"score_ci_high": 1.0,
|
| 335 |
+
"score_ci_low": 0.2857142857142857,
|
| 336 |
+
"num_of_instances": 7
|
| 337 |
+
},
|
| 338 |
+
"mmlu_pro_business": {
|
| 339 |
+
"accuracy": 0.42857142857142855,
|
| 340 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 341 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 342 |
+
"score_name": "accuracy",
|
| 343 |
+
"score": 0.42857142857142855,
|
| 344 |
+
"score_ci_high": 0.8571428571428571,
|
| 345 |
+
"score_ci_low": 0.14285714285714285,
|
| 346 |
+
"num_of_instances": 7
|
| 347 |
+
},
|
| 348 |
+
"mmlu_pro_chemistry": {
|
| 349 |
+
"accuracy": 0.42857142857142855,
|
| 350 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 351 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 352 |
+
"score_name": "accuracy",
|
| 353 |
+
"score": 0.42857142857142855,
|
| 354 |
+
"score_ci_high": 0.8571428571428571,
|
| 355 |
+
"score_ci_low": 0.14285714285714285,
|
| 356 |
+
"num_of_instances": 7
|
| 357 |
+
},
|
| 358 |
+
"mmlu_pro_computer_science": {
|
| 359 |
+
"accuracy": 1.0,
|
| 360 |
+
"accuracy_ci_low": 1.0,
|
| 361 |
+
"accuracy_ci_high": 1.0,
|
| 362 |
+
"score_name": "accuracy",
|
| 363 |
+
"score": 1.0,
|
| 364 |
+
"score_ci_high": 1.0,
|
| 365 |
+
"score_ci_low": 1.0,
|
| 366 |
+
"num_of_instances": 7
|
| 367 |
+
},
|
| 368 |
+
"mmlu_pro_economics": {
|
| 369 |
+
"accuracy": 0.7142857142857143,
|
| 370 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 371 |
+
"accuracy_ci_high": 1.0,
|
| 372 |
+
"score_name": "accuracy",
|
| 373 |
+
"score": 0.7142857142857143,
|
| 374 |
+
"score_ci_high": 1.0,
|
| 375 |
+
"score_ci_low": 0.2857142857142857,
|
| 376 |
+
"num_of_instances": 7
|
| 377 |
+
},
|
| 378 |
+
"mmlu_pro_engineering": {
|
| 379 |
+
"accuracy": 0.42857142857142855,
|
| 380 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 381 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 382 |
+
"score_name": "accuracy",
|
| 383 |
+
"score": 0.42857142857142855,
|
| 384 |
+
"score_ci_high": 0.8571428571428571,
|
| 385 |
+
"score_ci_low": 0.14285714285714285,
|
| 386 |
+
"num_of_instances": 7
|
| 387 |
+
},
|
| 388 |
+
"mmlu_pro_health": {
|
| 389 |
+
"accuracy": 0.42857142857142855,
|
| 390 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 391 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 392 |
+
"score_name": "accuracy",
|
| 393 |
+
"score": 0.42857142857142855,
|
| 394 |
+
"score_ci_high": 0.8571428571428571,
|
| 395 |
+
"score_ci_low": 0.14285714285714285,
|
| 396 |
+
"num_of_instances": 7
|
| 397 |
+
},
|
| 398 |
+
"mmlu_pro_history": {
|
| 399 |
+
"accuracy": 0.42857142857142855,
|
| 400 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 401 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 402 |
+
"score_name": "accuracy",
|
| 403 |
+
"score": 0.42857142857142855,
|
| 404 |
+
"score_ci_high": 0.8571428571428571,
|
| 405 |
+
"score_ci_low": 0.14285714285714285,
|
| 406 |
+
"num_of_instances": 7
|
| 407 |
+
},
|
| 408 |
+
"mmlu_pro_law": {
|
| 409 |
+
"accuracy": 0.5714285714285714,
|
| 410 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 411 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 412 |
+
"score_name": "accuracy",
|
| 413 |
+
"score": 0.5714285714285714,
|
| 414 |
+
"score_ci_high": 0.8571428571428571,
|
| 415 |
+
"score_ci_low": 0.14285714285714285,
|
| 416 |
+
"num_of_instances": 7
|
| 417 |
+
},
|
| 418 |
+
"mmlu_pro_math": {
|
| 419 |
+
"accuracy": 0.42857142857142855,
|
| 420 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 421 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 422 |
+
"score_name": "accuracy",
|
| 423 |
+
"score": 0.42857142857142855,
|
| 424 |
+
"score_ci_high": 0.8571428571428571,
|
| 425 |
+
"score_ci_low": 0.14285714285714285,
|
| 426 |
+
"num_of_instances": 7
|
| 427 |
+
},
|
| 428 |
+
"mmlu_pro_other": {
|
| 429 |
+
"accuracy": 0.5714285714285714,
|
| 430 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 431 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 432 |
+
"score_name": "accuracy",
|
| 433 |
+
"score": 0.5714285714285714,
|
| 434 |
+
"score_ci_high": 0.8571428571428571,
|
| 435 |
+
"score_ci_low": 0.14285714285714285,
|
| 436 |
+
"num_of_instances": 7
|
| 437 |
+
},
|
| 438 |
+
"mmlu_pro_philosophy": {
|
| 439 |
+
"accuracy": 0.7142857142857143,
|
| 440 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 441 |
+
"accuracy_ci_high": 1.0,
|
| 442 |
+
"score_name": "accuracy",
|
| 443 |
+
"score": 0.7142857142857143,
|
| 444 |
+
"score_ci_high": 1.0,
|
| 445 |
+
"score_ci_low": 0.2857142857142857,
|
| 446 |
+
"num_of_instances": 7
|
| 447 |
+
},
|
| 448 |
+
"mmlu_pro_physics": {
|
| 449 |
+
"accuracy": 0.2857142857142857,
|
| 450 |
+
"accuracy_ci_low": 0.0,
|
| 451 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 452 |
+
"score_name": "accuracy",
|
| 453 |
+
"score": 0.2857142857142857,
|
| 454 |
+
"score_ci_high": 0.7142857142857143,
|
| 455 |
+
"score_ci_low": 0.0,
|
| 456 |
+
"num_of_instances": 7
|
| 457 |
+
},
|
| 458 |
+
"mmlu_pro_psychology": {
|
| 459 |
+
"accuracy": 0.5714285714285714,
|
| 460 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 461 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 462 |
+
"score_name": "accuracy",
|
| 463 |
+
"score": 0.5714285714285714,
|
| 464 |
+
"score_ci_high": 0.8571428571428571,
|
| 465 |
+
"score_ci_low": 0.14285714285714285,
|
| 466 |
+
"num_of_instances": 7
|
| 467 |
+
},
|
| 468 |
+
"score": 0.5510204081632653,
|
| 469 |
+
"score_name": "subsets_mean",
|
| 470 |
+
"num_of_instances": 98
|
| 471 |
+
},
|
| 472 |
+
"legal": {
|
| 473 |
+
"legalbench_abercrombie": {
|
| 474 |
+
"f1_macro": 0.8147619047619047,
|
| 475 |
+
"f1_suggestive": 0.6666666666666666,
|
| 476 |
+
"f1_generic": 1.0,
|
| 477 |
+
"f1_fanciful": 0.8571428571428571,
|
| 478 |
+
"f1_descriptive": 0.8,
|
| 479 |
+
"f1_arbitrary": 0.75,
|
| 480 |
+
"f1_macro_ci_low": 0.6241071521113625,
|
| 481 |
+
"f1_macro_ci_high": 0.9652136441488661,
|
| 482 |
+
"score_name": "f1_micro",
|
| 483 |
+
"score": 0.8,
|
| 484 |
+
"score_ci_high": 0.95,
|
| 485 |
+
"score_ci_low": 0.55,
|
| 486 |
+
"num_of_instances": 20,
|
| 487 |
+
"accuracy": 0.8,
|
| 488 |
+
"accuracy_ci_low": 0.55,
|
| 489 |
+
"accuracy_ci_high": 0.95,
|
| 490 |
+
"f1_micro": 0.8,
|
| 491 |
+
"f1_micro_ci_low": 0.55,
|
| 492 |
+
"f1_micro_ci_high": 0.95
|
| 493 |
+
},
|
| 494 |
+
"legalbench_corporate_lobbying": {
|
| 495 |
+
"f1_macro": 0.7386363636363636,
|
| 496 |
+
"f1_no": 0.7272727272727273,
|
| 497 |
+
"f1_yes": 0.75,
|
| 498 |
+
"f1_macro_ci_low": 0.5080213903743316,
|
| 499 |
+
"f1_macro_ci_high": 0.9157902232720109,
|
| 500 |
+
"score_name": "f1_micro",
|
| 501 |
+
"score": 0.7368421052631579,
|
| 502 |
+
"score_ci_high": 0.8947368421052632,
|
| 503 |
+
"score_ci_low": 0.5128205128205128,
|
| 504 |
+
"num_of_instances": 20,
|
| 505 |
+
"accuracy": 0.7,
|
| 506 |
+
"accuracy_ci_low": 0.5,
|
| 507 |
+
"accuracy_ci_high": 0.9,
|
| 508 |
+
"f1_micro": 0.7368421052631579,
|
| 509 |
+
"f1_micro_ci_low": 0.5128205128205128,
|
| 510 |
+
"f1_micro_ci_high": 0.8947368421052632
|
| 511 |
+
},
|
| 512 |
+
"legalbench_function_of_decision_section": {
|
| 513 |
+
"f1_macro": 0.17687074829931973,
|
| 514 |
+
"f1_conclusion": 0.2857142857142857,
|
| 515 |
+
"f1_decree": 0.0,
|
| 516 |
+
"f1_issue": 0.2857142857142857,
|
| 517 |
+
"f1_analysis": 0.6666666666666666,
|
| 518 |
+
"f1_facts": 0.0,
|
| 519 |
+
"f1_procedural history": 0.0,
|
| 520 |
+
"f1_rule": 0.0,
|
| 521 |
+
"f1_macro_ci_low": 0.037037037037037035,
|
| 522 |
+
"f1_macro_ci_high": 0.3410139249890439,
|
| 523 |
+
"score_name": "f1_micro",
|
| 524 |
+
"score": 0.23529411764705882,
|
| 525 |
+
"score_ci_high": 0.48484848484848486,
|
| 526 |
+
"score_ci_low": 0.058823529411764705,
|
| 527 |
+
"num_of_instances": 20,
|
| 528 |
+
"accuracy": 0.2,
|
| 529 |
+
"accuracy_ci_low": 0.05,
|
| 530 |
+
"accuracy_ci_high": 0.45,
|
| 531 |
+
"f1_micro": 0.23529411764705882,
|
| 532 |
+
"f1_micro_ci_low": 0.058823529411764705,
|
| 533 |
+
"f1_micro_ci_high": 0.48484848484848486
|
| 534 |
+
},
|
| 535 |
+
"legalbench_international_citizenship_questions": {
|
| 536 |
+
"f1_macro": 0.6277777777777778,
|
| 537 |
+
"f1_yes": 0.7,
|
| 538 |
+
"f1_no": 0.5555555555555556,
|
| 539 |
+
"f1_macro_ci_low": 0.4143115659353126,
|
| 540 |
+
"f1_macro_ci_high": 0.849624060150376,
|
| 541 |
+
"score_name": "f1_micro",
|
| 542 |
+
"score": 0.631578947368421,
|
| 543 |
+
"score_ci_high": 0.8421052631578947,
|
| 544 |
+
"score_ci_low": 0.4069581788631691,
|
| 545 |
+
"num_of_instances": 20,
|
| 546 |
+
"accuracy": 0.6,
|
| 547 |
+
"accuracy_ci_low": 0.4,
|
| 548 |
+
"accuracy_ci_high": 0.8,
|
| 549 |
+
"f1_micro": 0.631578947368421,
|
| 550 |
+
"f1_micro_ci_low": 0.4069581788631691,
|
| 551 |
+
"f1_micro_ci_high": 0.8421052631578947
|
| 552 |
+
},
|
| 553 |
+
"legalbench_proa": {
|
| 554 |
+
"f1_macro": 0.743421052631579,
|
| 555 |
+
"f1_yes": 0.75,
|
| 556 |
+
"f1_no": 0.7368421052631579,
|
| 557 |
+
"f1_macro_ci_low": 0.5133179285198034,
|
| 558 |
+
"f1_macro_ci_high": 0.898989898989899,
|
| 559 |
+
"score_name": "f1_micro",
|
| 560 |
+
"score": 0.7428571428571429,
|
| 561 |
+
"score_ci_high": 0.8888888888888888,
|
| 562 |
+
"score_ci_low": 0.5,
|
| 563 |
+
"num_of_instances": 20,
|
| 564 |
+
"accuracy": 0.65,
|
| 565 |
+
"accuracy_ci_low": 0.4,
|
| 566 |
+
"accuracy_ci_high": 0.85,
|
| 567 |
+
"f1_micro": 0.7428571428571429,
|
| 568 |
+
"f1_micro_ci_low": 0.5,
|
| 569 |
+
"f1_micro_ci_high": 0.8888888888888888
|
| 570 |
+
},
|
| 571 |
+
"score": 0.6293144626271561,
|
| 572 |
+
"score_name": "subsets_mean",
|
| 573 |
+
"num_of_instances": 100
|
| 574 |
+
},
|
| 575 |
+
"news_classification": {
|
| 576 |
+
"20_newsgroups_short": {
|
| 577 |
+
"f1_macro": 0.6527228980170158,
|
| 578 |
+
"f1_cars": 1.0,
|
| 579 |
+
"f1_windows x": 0.5714285714285714,
|
| 580 |
+
"f1_computer graphics": 0.5882352941176471,
|
| 581 |
+
"f1_atheism": 0.3333333333333333,
|
| 582 |
+
"f1_christianity": 0.6666666666666666,
|
| 583 |
+
"f1_religion": 0.25,
|
| 584 |
+
"f1_medicine": 1.0,
|
| 585 |
+
"f1_microsoft windows": 0.8,
|
| 586 |
+
"f1_middle east": 0.5,
|
| 587 |
+
"f1_motorcycles": 0.7272727272727273,
|
| 588 |
+
"f1_pc hardware": 0.75,
|
| 589 |
+
"f1_mac hardware": 0.8888888888888888,
|
| 590 |
+
"f1_electronics": 0.5,
|
| 591 |
+
"f1_for sale": 0.8888888888888888,
|
| 592 |
+
"f1_guns": 0.4444444444444444,
|
| 593 |
+
"f1_space": 0.6,
|
| 594 |
+
"f1_cryptography": 0.3333333333333333,
|
| 595 |
+
"f1_baseball": 0.9230769230769231,
|
| 596 |
+
"f1_hockey": 0.8888888888888888,
|
| 597 |
+
"f1_politics": 0.4,
|
| 598 |
+
"f1_macro_ci_low": 0.5594466279416972,
|
| 599 |
+
"f1_macro_ci_high": 0.7484812715694672,
|
| 600 |
+
"score_name": "f1_micro",
|
| 601 |
+
"score": 0.6701030927835051,
|
| 602 |
+
"score_ci_high": 0.7525773195876289,
|
| 603 |
+
"score_ci_low": 0.5628781799105581,
|
| 604 |
+
"num_of_instances": 100,
|
| 605 |
+
"accuracy": 0.65,
|
| 606 |
+
"accuracy_ci_low": 0.54,
|
| 607 |
+
"accuracy_ci_high": 0.74,
|
| 608 |
+
"f1_micro": 0.6701030927835051,
|
| 609 |
+
"f1_micro_ci_low": 0.5628781799105581,
|
| 610 |
+
"f1_micro_ci_high": 0.7525773195876289
|
| 611 |
+
},
|
| 612 |
+
"score": 0.6701030927835051,
|
| 613 |
+
"score_name": "subsets_mean",
|
| 614 |
+
"num_of_instances": 100
|
| 615 |
+
},
|
| 616 |
+
"product_help": {
|
| 617 |
+
"cfpb_product_2023": {
|
| 618 |
+
"f1_macro": 0.7779168114934538,
|
| 619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9343065693430657,
|
| 620 |
+
"f1_credit card or prepaid card": 0.3333333333333333,
|
| 621 |
+
"f1_money transfer or virtual currency or money service": 0.8,
|
| 622 |
+
"f1_mortgage": 0.6666666666666666,
|
| 623 |
+
"f1_debt collection": 0.7777777777777778,
|
| 624 |
+
"f1_checking or savings account": 0.9333333333333333,
|
| 625 |
+
"f1_payday loan or title loan or personal loan": 1.0,
|
| 626 |
+
"f1_macro_ci_low": 0.5932457464249341,
|
| 627 |
+
"f1_macro_ci_high": 0.8817165227032664,
|
| 628 |
+
"score_name": "f1_micro",
|
| 629 |
+
"score": 0.875,
|
| 630 |
+
"score_ci_high": 0.9238578680203046,
|
| 631 |
+
"score_ci_low": 0.8006847676679175,
|
| 632 |
+
"num_of_instances": 100,
|
| 633 |
+
"accuracy": 0.84,
|
| 634 |
+
"accuracy_ci_low": 0.76,
|
| 635 |
+
"accuracy_ci_high": 0.9,
|
| 636 |
+
"f1_micro": 0.875,
|
| 637 |
+
"f1_micro_ci_low": 0.8006847676679175,
|
| 638 |
+
"f1_micro_ci_high": 0.9238578680203046
|
| 639 |
+
},
|
| 640 |
+
"cfpb_product_watsonx": {
|
| 641 |
+
"f1_macro": 0.8249444681938961,
|
| 642 |
+
"f1_mortgages and loans": 0.8695652173913043,
|
| 643 |
+
"f1_credit card": 0.7619047619047619,
|
| 644 |
+
"f1_debt collection": 0.7368421052631579,
|
| 645 |
+
"f1_credit reporting": 0.8333333333333334,
|
| 646 |
+
"f1_retail banking": 0.9230769230769231,
|
| 647 |
+
"f1_macro_ci_low": 0.6928894894256992,
|
| 648 |
+
"f1_macro_ci_high": 0.9177170275914066,
|
| 649 |
+
"score_name": "f1_micro",
|
| 650 |
+
"score": 0.82,
|
| 651 |
+
"score_ci_high": 0.9,
|
| 652 |
+
"score_ci_low": 0.68,
|
| 653 |
+
"num_of_instances": 50,
|
| 654 |
+
"accuracy": 0.82,
|
| 655 |
+
"accuracy_ci_low": 0.68,
|
| 656 |
+
"accuracy_ci_high": 0.9,
|
| 657 |
+
"f1_micro": 0.82,
|
| 658 |
+
"f1_micro_ci_low": 0.68,
|
| 659 |
+
"f1_micro_ci_high": 0.9
|
| 660 |
+
},
|
| 661 |
+
"score": 0.8474999999999999,
|
| 662 |
+
"score_name": "subsets_mean",
|
| 663 |
+
"num_of_instances": 150
|
| 664 |
+
},
|
| 665 |
+
"qa_finance": {
|
| 666 |
+
"fin_qa": {
|
| 667 |
+
"num_of_instances": 100,
|
| 668 |
+
"program_accuracy": 0.26,
|
| 669 |
+
"score": 0.26,
|
| 670 |
+
"score_name": "program_accuracy",
|
| 671 |
+
"execution_accuracy": 0.25,
|
| 672 |
+
"program_accuracy_ci_low": 0.18,
|
| 673 |
+
"program_accuracy_ci_high": 0.35,
|
| 674 |
+
"score_ci_low": 0.18,
|
| 675 |
+
"score_ci_high": 0.35,
|
| 676 |
+
"execution_accuracy_ci_low": 0.17,
|
| 677 |
+
"execution_accuracy_ci_high": 0.34
|
| 678 |
+
},
|
| 679 |
+
"score": 0.26,
|
| 680 |
+
"score_name": "subsets_mean",
|
| 681 |
+
"num_of_instances": 100
|
| 682 |
+
},
|
| 683 |
+
"rag_general": {
|
| 684 |
+
"rag_response_generation_clapnq": {
|
| 685 |
+
"precision": 0.5074323401711549,
|
| 686 |
+
"recall": 0.5740276560353169,
|
| 687 |
+
"f1": 0.49603980753298627,
|
| 688 |
+
"precision_ci_low": 0.47150664604306436,
|
| 689 |
+
"precision_ci_high": 0.5444306654624451,
|
| 690 |
+
"recall_ci_low": 0.5309832235253571,
|
| 691 |
+
"recall_ci_high": 0.6151274674565376,
|
| 692 |
+
"f1_ci_low": 0.46604824453263755,
|
| 693 |
+
"f1_ci_high": 0.5262523731070278,
|
| 694 |
+
"score_name": "f1",
|
| 695 |
+
"score": 0.49603980753298627,
|
| 696 |
+
"score_ci_high": 0.5262523731070278,
|
| 697 |
+
"score_ci_low": 0.46604824453263755,
|
| 698 |
+
"num_of_instances": 100,
|
| 699 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6860781842470169,
|
| 700 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.709225146472454,
|
| 701 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.6771670934557915,
|
| 702 |
+
"faithfullness_f1_token_overlap": 0.36312371853081976,
|
| 703 |
+
"faithfullness_recall_token_overlap": 0.2603375679120915,
|
| 704 |
+
"faithfullness_precision_token_overlap": 0.7208838371950508,
|
| 705 |
+
"correctness_f1_token_overlap": 0.49603980753298627,
|
| 706 |
+
"correctness_recall_token_overlap": 0.5740276560353169,
|
| 707 |
+
"correctness_precision_token_overlap": 0.5074323401711549
|
| 708 |
+
},
|
| 709 |
+
"score": 0.49603980753298627,
|
| 710 |
+
"score_name": "subsets_mean",
|
| 711 |
+
"num_of_instances": 100
|
| 712 |
+
},
|
| 713 |
+
"reasoning": {
|
| 714 |
+
"hellaswag": {
|
| 715 |
+
"accuracy": 0.57,
|
| 716 |
+
"accuracy_ci_low": 0.47,
|
| 717 |
+
"accuracy_ci_high": 0.66,
|
| 718 |
+
"score_name": "accuracy",
|
| 719 |
+
"score": 0.57,
|
| 720 |
+
"score_ci_high": 0.66,
|
| 721 |
+
"score_ci_low": 0.47,
|
| 722 |
+
"num_of_instances": 100
|
| 723 |
+
},
|
| 724 |
+
"openbook_qa": {
|
| 725 |
+
"accuracy": 0.93,
|
| 726 |
+
"accuracy_ci_low": 0.8636775645084782,
|
| 727 |
+
"accuracy_ci_high": 0.97,
|
| 728 |
+
"score_name": "accuracy",
|
| 729 |
+
"score": 0.93,
|
| 730 |
+
"score_ci_high": 0.97,
|
| 731 |
+
"score_ci_low": 0.8636775645084782,
|
| 732 |
+
"num_of_instances": 100
|
| 733 |
+
},
|
| 734 |
+
"score": 0.75,
|
| 735 |
+
"score_name": "subsets_mean",
|
| 736 |
+
"num_of_instances": 200
|
| 737 |
+
},
|
| 738 |
+
"safety": {
|
| 739 |
+
"attaq_500": {
|
| 740 |
+
"severity_critical": 0.0,
|
| 741 |
+
"severity_high": 0.0,
|
| 742 |
+
"severity_medium": 1.0,
|
| 743 |
+
"severity_low": 99.0,
|
| 744 |
+
"category_harmful_info": 0.8584611813966482,
|
| 745 |
+
"category_deception": 0.884253122025367,
|
| 746 |
+
"category_explicit_content": 0.8562488650733774,
|
| 747 |
+
"category_violence": 0.8526820468745888,
|
| 748 |
+
"category_discrimination": 0.8608146858593775,
|
| 749 |
+
"category_substance_abuse": 0.8552402012877994,
|
| 750 |
+
"category_pii": 0.8454244594883035,
|
| 751 |
+
"safety": 0.8586448376372249,
|
| 752 |
+
"safety_ci_low": 0.8379978850647828,
|
| 753 |
+
"safety_ci_high": 0.8767468237261271,
|
| 754 |
+
"score_name": "safety",
|
| 755 |
+
"score": 0.8586448376372249,
|
| 756 |
+
"score_ci_high": 0.8767468237261271,
|
| 757 |
+
"score_ci_low": 0.8379978850647828,
|
| 758 |
+
"num_of_instances": 100
|
| 759 |
+
},
|
| 760 |
+
"score": 0.8586448376372249,
|
| 761 |
+
"score_name": "subsets_mean",
|
| 762 |
+
"num_of_instances": 100
|
| 763 |
+
},
|
| 764 |
+
"summarization": {
|
| 765 |
+
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
+
"num_of_instances": 100,
|
| 767 |
+
"rouge2": 0.20801889674008944,
|
| 768 |
+
"rougeL": 0.2968640964283658,
|
| 769 |
+
"score": 0.2968640964283658,
|
| 770 |
+
"score_name": "rougeL",
|
| 771 |
+
"rouge1": 0.42312199770919734,
|
| 772 |
+
"rougeLsum": 0.36721142345095514,
|
| 773 |
+
"rouge2_ci_low": 0.19303195128069772,
|
| 774 |
+
"rouge2_ci_high": 0.2239123862881902,
|
| 775 |
+
"rougeL_ci_low": 0.2793124666427084,
|
| 776 |
+
"rougeL_ci_high": 0.31605456043834845,
|
| 777 |
+
"score_ci_low": 0.2793124666427084,
|
| 778 |
+
"score_ci_high": 0.31605456043834845,
|
| 779 |
+
"rouge1_ci_low": 0.4001670264004444,
|
| 780 |
+
"rouge1_ci_high": 0.4447744439473787,
|
| 781 |
+
"rougeLsum_ci_low": 0.3471579822744466,
|
| 782 |
+
"rougeLsum_ci_high": 0.38772570475787616
|
| 783 |
+
},
|
| 784 |
+
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
+
"num_of_instances": 100,
|
| 786 |
+
"rouge2": 0.01604522130758397,
|
| 787 |
+
"rougeL": 0.08637910785624431,
|
| 788 |
+
"score": 0.08637910785624431,
|
| 789 |
+
"score_name": "rougeL",
|
| 790 |
+
"rouge1": 0.11342470214059448,
|
| 791 |
+
"rougeLsum": 0.09470632571584116,
|
| 792 |
+
"rouge2_ci_low": 0.010956366582946934,
|
| 793 |
+
"rouge2_ci_high": 0.02241153955079208,
|
| 794 |
+
"rougeL_ci_low": 0.07452990644153164,
|
| 795 |
+
"rougeL_ci_high": 0.09690240900042019,
|
| 796 |
+
"score_ci_low": 0.07452990644153164,
|
| 797 |
+
"score_ci_high": 0.09690240900042019,
|
| 798 |
+
"rouge1_ci_low": 0.09801484446894211,
|
| 799 |
+
"rouge1_ci_high": 0.12984514349711393,
|
| 800 |
+
"rougeLsum_ci_low": 0.08127416438123053,
|
| 801 |
+
"rougeLsum_ci_high": 0.10623331358688204
|
| 802 |
+
},
|
| 803 |
+
"score": 0.19162160214230506,
|
| 804 |
+
"score_name": "subsets_mean",
|
| 805 |
+
"num_of_instances": 200
|
| 806 |
+
},
|
| 807 |
+
"translation": {
|
| 808 |
+
"mt_flores_101_ara_eng": {
|
| 809 |
+
"num_of_instances": 6,
|
| 810 |
+
"counts": [
|
| 811 |
+
161,
|
| 812 |
+
119,
|
| 813 |
+
91,
|
| 814 |
+
71
|
| 815 |
+
],
|
| 816 |
+
"totals": [
|
| 817 |
+
220,
|
| 818 |
+
214,
|
| 819 |
+
208,
|
| 820 |
+
202
|
| 821 |
+
],
|
| 822 |
+
"precisions": [
|
| 823 |
+
0.7318181818181819,
|
| 824 |
+
0.5560747663551402,
|
| 825 |
+
0.4375,
|
| 826 |
+
0.35148514851485146
|
| 827 |
+
],
|
| 828 |
+
"bp": 1.0,
|
| 829 |
+
"sys_len": 220,
|
| 830 |
+
"ref_len": 208,
|
| 831 |
+
"sacrebleu": 0.500155852462094,
|
| 832 |
+
"score": 0.500155852462094,
|
| 833 |
+
"score_name": "sacrebleu",
|
| 834 |
+
"score_ci_low": 0.27107466732933977,
|
| 835 |
+
"score_ci_high": 0.6456509094349956,
|
| 836 |
+
"sacrebleu_ci_low": 0.27107466732933977,
|
| 837 |
+
"sacrebleu_ci_high": 0.6456509094349956
|
| 838 |
+
},
|
| 839 |
+
"mt_flores_101_deu_eng": {
|
| 840 |
+
"num_of_instances": 6,
|
| 841 |
+
"counts": [
|
| 842 |
+
141,
|
| 843 |
+
85,
|
| 844 |
+
54,
|
| 845 |
+
39
|
| 846 |
+
],
|
| 847 |
+
"totals": [
|
| 848 |
+
216,
|
| 849 |
+
210,
|
| 850 |
+
204,
|
| 851 |
+
198
|
| 852 |
+
],
|
| 853 |
+
"precisions": [
|
| 854 |
+
0.6527777777777777,
|
| 855 |
+
0.40476190476190477,
|
| 856 |
+
0.2647058823529412,
|
| 857 |
+
0.19696969696969696
|
| 858 |
+
],
|
| 859 |
+
"bp": 1.0,
|
| 860 |
+
"sys_len": 216,
|
| 861 |
+
"ref_len": 208,
|
| 862 |
+
"sacrebleu": 0.34259577311211076,
|
| 863 |
+
"score": 0.34259577311211076,
|
| 864 |
+
"score_name": "sacrebleu",
|
| 865 |
+
"score_ci_low": 0.22516597371165897,
|
| 866 |
+
"score_ci_high": 0.543023787423078,
|
| 867 |
+
"sacrebleu_ci_low": 0.22516597371165897,
|
| 868 |
+
"sacrebleu_ci_high": 0.543023787423078
|
| 869 |
+
},
|
| 870 |
+
"mt_flores_101_eng_ara": {
|
| 871 |
+
"num_of_instances": 6,
|
| 872 |
+
"counts": [
|
| 873 |
+
131,
|
| 874 |
+
80,
|
| 875 |
+
50,
|
| 876 |
+
29
|
| 877 |
+
],
|
| 878 |
+
"totals": [
|
| 879 |
+
203,
|
| 880 |
+
197,
|
| 881 |
+
191,
|
| 882 |
+
185
|
| 883 |
+
],
|
| 884 |
+
"precisions": [
|
| 885 |
+
0.645320197044335,
|
| 886 |
+
0.40609137055837563,
|
| 887 |
+
0.2617801047120419,
|
| 888 |
+
0.15675675675675677
|
| 889 |
+
],
|
| 890 |
+
"bp": 0.9708758757257812,
|
| 891 |
+
"sys_len": 203,
|
| 892 |
+
"ref_len": 209,
|
| 893 |
+
"sacrebleu": 0.31264694630569706,
|
| 894 |
+
"score": 0.31264694630569706,
|
| 895 |
+
"score_name": "sacrebleu",
|
| 896 |
+
"score_ci_low": 0.21875352119497682,
|
| 897 |
+
"score_ci_high": 0.43847677077007524,
|
| 898 |
+
"sacrebleu_ci_low": 0.21875352119497682,
|
| 899 |
+
"sacrebleu_ci_high": 0.43847677077007524
|
| 900 |
+
},
|
| 901 |
+
"mt_flores_101_eng_deu": {
|
| 902 |
+
"num_of_instances": 6,
|
| 903 |
+
"counts": [
|
| 904 |
+
144,
|
| 905 |
+
94,
|
| 906 |
+
68,
|
| 907 |
+
52
|
| 908 |
+
],
|
| 909 |
+
"totals": [
|
| 910 |
+
224,
|
| 911 |
+
218,
|
| 912 |
+
212,
|
| 913 |
+
206
|
| 914 |
+
],
|
| 915 |
+
"precisions": [
|
| 916 |
+
0.6428571428571429,
|
| 917 |
+
0.4311926605504587,
|
| 918 |
+
0.32075471698113206,
|
| 919 |
+
0.2524271844660194
|
| 920 |
+
],
|
| 921 |
+
"bp": 1.0,
|
| 922 |
+
"sys_len": 224,
|
| 923 |
+
"ref_len": 216,
|
| 924 |
+
"sacrebleu": 0.38705595372857227,
|
| 925 |
+
"score": 0.38705595372857227,
|
| 926 |
+
"score_name": "sacrebleu",
|
| 927 |
+
"score_ci_low": 0.270030091960412,
|
| 928 |
+
"score_ci_high": 0.5388939200476505,
|
| 929 |
+
"sacrebleu_ci_low": 0.270030091960412,
|
| 930 |
+
"sacrebleu_ci_high": 0.5388939200476505
|
| 931 |
+
},
|
| 932 |
+
"mt_flores_101_eng_fra": {
|
| 933 |
+
"num_of_instances": 6,
|
| 934 |
+
"counts": [
|
| 935 |
+
188,
|
| 936 |
+
150,
|
| 937 |
+
122,
|
| 938 |
+
100
|
| 939 |
+
],
|
| 940 |
+
"totals": [
|
| 941 |
+
244,
|
| 942 |
+
238,
|
| 943 |
+
232,
|
| 944 |
+
226
|
| 945 |
+
],
|
| 946 |
+
"precisions": [
|
| 947 |
+
0.7704918032786885,
|
| 948 |
+
0.6302521008403361,
|
| 949 |
+
0.5258620689655172,
|
| 950 |
+
0.4424778761061947
|
| 951 |
+
],
|
| 952 |
+
"bp": 1.0,
|
| 953 |
+
"sys_len": 244,
|
| 954 |
+
"ref_len": 235,
|
| 955 |
+
"sacrebleu": 0.5797776009790664,
|
| 956 |
+
"score": 0.5797776009790664,
|
| 957 |
+
"score_name": "sacrebleu",
|
| 958 |
+
"score_ci_low": 0.4819980312121314,
|
| 959 |
+
"score_ci_high": 0.7676936555305252,
|
| 960 |
+
"sacrebleu_ci_low": 0.4819980312121314,
|
| 961 |
+
"sacrebleu_ci_high": 0.7676936555305252
|
| 962 |
+
},
|
| 963 |
+
"mt_flores_101_eng_kor": {
|
| 964 |
+
"num_of_instances": 6,
|
| 965 |
+
"counts": [
|
| 966 |
+
152,
|
| 967 |
+
85,
|
| 968 |
+
57,
|
| 969 |
+
35
|
| 970 |
+
],
|
| 971 |
+
"totals": [
|
| 972 |
+
267,
|
| 973 |
+
261,
|
| 974 |
+
255,
|
| 975 |
+
249
|
| 976 |
+
],
|
| 977 |
+
"precisions": [
|
| 978 |
+
0.5692883895131086,
|
| 979 |
+
0.32567049808429116,
|
| 980 |
+
0.22352941176470587,
|
| 981 |
+
0.14056224899598393
|
| 982 |
+
],
|
| 983 |
+
"bp": 1.0,
|
| 984 |
+
"sys_len": 267,
|
| 985 |
+
"ref_len": 249,
|
| 986 |
+
"sacrebleu": 0.27626669318098784,
|
| 987 |
+
"score": 0.27626669318098784,
|
| 988 |
+
"score_name": "sacrebleu",
|
| 989 |
+
"score_ci_low": 0.19630864275047086,
|
| 990 |
+
"score_ci_high": 0.3324774540569831,
|
| 991 |
+
"sacrebleu_ci_low": 0.19630864275047086,
|
| 992 |
+
"sacrebleu_ci_high": 0.3324774540569831
|
| 993 |
+
},
|
| 994 |
+
"mt_flores_101_eng_por": {
|
| 995 |
+
"num_of_instances": 6,
|
| 996 |
+
"counts": [
|
| 997 |
+
181,
|
| 998 |
+
139,
|
| 999 |
+
111,
|
| 1000 |
+
91
|
| 1001 |
+
],
|
| 1002 |
+
"totals": [
|
| 1003 |
+
226,
|
| 1004 |
+
220,
|
| 1005 |
+
214,
|
| 1006 |
+
208
|
| 1007 |
+
],
|
| 1008 |
+
"precisions": [
|
| 1009 |
+
0.8008849557522124,
|
| 1010 |
+
0.6318181818181818,
|
| 1011 |
+
0.5186915887850467,
|
| 1012 |
+
0.4375
|
| 1013 |
+
],
|
| 1014 |
+
"bp": 1.0,
|
| 1015 |
+
"sys_len": 226,
|
| 1016 |
+
"ref_len": 222,
|
| 1017 |
+
"sacrebleu": 0.5821198107565924,
|
| 1018 |
+
"score": 0.5821198107565924,
|
| 1019 |
+
"score_name": "sacrebleu",
|
| 1020 |
+
"score_ci_low": 0.5032683240695686,
|
| 1021 |
+
"score_ci_high": 0.6631112459149506,
|
| 1022 |
+
"sacrebleu_ci_low": 0.5032683240695686,
|
| 1023 |
+
"sacrebleu_ci_high": 0.6631112459149506
|
| 1024 |
+
},
|
| 1025 |
+
"mt_flores_101_eng_ron": {
|
| 1026 |
+
"num_of_instances": 6,
|
| 1027 |
+
"counts": [
|
| 1028 |
+
160,
|
| 1029 |
+
108,
|
| 1030 |
+
80,
|
| 1031 |
+
62
|
| 1032 |
+
],
|
| 1033 |
+
"totals": [
|
| 1034 |
+
233,
|
| 1035 |
+
227,
|
| 1036 |
+
221,
|
| 1037 |
+
215
|
| 1038 |
+
],
|
| 1039 |
+
"precisions": [
|
| 1040 |
+
0.6866952789699571,
|
| 1041 |
+
0.47577092511013214,
|
| 1042 |
+
0.36199095022624433,
|
| 1043 |
+
0.28837209302325584
|
| 1044 |
+
],
|
| 1045 |
+
"bp": 1.0,
|
| 1046 |
+
"sys_len": 233,
|
| 1047 |
+
"ref_len": 230,
|
| 1048 |
+
"sacrebleu": 0.4297374729981456,
|
| 1049 |
+
"score": 0.4297374729981456,
|
| 1050 |
+
"score_name": "sacrebleu",
|
| 1051 |
+
"score_ci_low": 0.30739045331930365,
|
| 1052 |
+
"score_ci_high": 0.5954313392008956,
|
| 1053 |
+
"sacrebleu_ci_low": 0.30739045331930365,
|
| 1054 |
+
"sacrebleu_ci_high": 0.5954313392008956
|
| 1055 |
+
},
|
| 1056 |
+
"mt_flores_101_eng_spa": {
|
| 1057 |
+
"num_of_instances": 6,
|
| 1058 |
+
"counts": [
|
| 1059 |
+
165,
|
| 1060 |
+
99,
|
| 1061 |
+
65,
|
| 1062 |
+
44
|
| 1063 |
+
],
|
| 1064 |
+
"totals": [
|
| 1065 |
+
238,
|
| 1066 |
+
232,
|
| 1067 |
+
226,
|
| 1068 |
+
220
|
| 1069 |
+
],
|
| 1070 |
+
"precisions": [
|
| 1071 |
+
0.6932773109243697,
|
| 1072 |
+
0.4267241379310345,
|
| 1073 |
+
0.28761061946902655,
|
| 1074 |
+
0.2
|
| 1075 |
+
],
|
| 1076 |
+
"bp": 0.9792107358732394,
|
| 1077 |
+
"sys_len": 238,
|
| 1078 |
+
"ref_len": 243,
|
| 1079 |
+
"sacrebleu": 0.35367018032587716,
|
| 1080 |
+
"score": 0.35367018032587716,
|
| 1081 |
+
"score_name": "sacrebleu",
|
| 1082 |
+
"score_ci_low": 0.3016964479711889,
|
| 1083 |
+
"score_ci_high": 0.4034179481814929,
|
| 1084 |
+
"sacrebleu_ci_low": 0.3016964479711889,
|
| 1085 |
+
"sacrebleu_ci_high": 0.4034179481814929
|
| 1086 |
+
},
|
| 1087 |
+
"mt_flores_101_fra_eng": {
|
| 1088 |
+
"num_of_instances": 6,
|
| 1089 |
+
"counts": [
|
| 1090 |
+
168,
|
| 1091 |
+
129,
|
| 1092 |
+
99,
|
| 1093 |
+
75
|
| 1094 |
+
],
|
| 1095 |
+
"totals": [
|
| 1096 |
+
215,
|
| 1097 |
+
209,
|
| 1098 |
+
203,
|
| 1099 |
+
197
|
| 1100 |
+
],
|
| 1101 |
+
"precisions": [
|
| 1102 |
+
0.7813953488372093,
|
| 1103 |
+
0.6172248803827751,
|
| 1104 |
+
0.4876847290640394,
|
| 1105 |
+
0.3807106598984772
|
| 1106 |
+
],
|
| 1107 |
+
"bp": 1.0,
|
| 1108 |
+
"sys_len": 215,
|
| 1109 |
+
"ref_len": 208,
|
| 1110 |
+
"sacrebleu": 0.5470312162394166,
|
| 1111 |
+
"score": 0.5470312162394166,
|
| 1112 |
+
"score_name": "sacrebleu",
|
| 1113 |
+
"score_ci_low": 0.4764122852102197,
|
| 1114 |
+
"score_ci_high": 0.6508738326325866,
|
| 1115 |
+
"sacrebleu_ci_low": 0.4764122852102197,
|
| 1116 |
+
"sacrebleu_ci_high": 0.6508738326325866
|
| 1117 |
+
},
|
| 1118 |
+
"mt_flores_101_jpn_eng": {
|
| 1119 |
+
"num_of_instances": 6,
|
| 1120 |
+
"counts": [
|
| 1121 |
+
143,
|
| 1122 |
+
86,
|
| 1123 |
+
60,
|
| 1124 |
+
42
|
| 1125 |
+
],
|
| 1126 |
+
"totals": [
|
| 1127 |
+
215,
|
| 1128 |
+
209,
|
| 1129 |
+
203,
|
| 1130 |
+
197
|
| 1131 |
+
],
|
| 1132 |
+
"precisions": [
|
| 1133 |
+
0.6651162790697674,
|
| 1134 |
+
0.41148325358851673,
|
| 1135 |
+
0.2955665024630542,
|
| 1136 |
+
0.2131979695431472
|
| 1137 |
+
],
|
| 1138 |
+
"bp": 1.0,
|
| 1139 |
+
"sys_len": 215,
|
| 1140 |
+
"ref_len": 208,
|
| 1141 |
+
"sacrebleu": 0.36238649527066064,
|
| 1142 |
+
"score": 0.36238649527066064,
|
| 1143 |
+
"score_name": "sacrebleu",
|
| 1144 |
+
"score_ci_low": 0.20955142870882296,
|
| 1145 |
+
"score_ci_high": 0.5831549950186898,
|
| 1146 |
+
"sacrebleu_ci_low": 0.20955142870882296,
|
| 1147 |
+
"sacrebleu_ci_high": 0.5831549950186898
|
| 1148 |
+
},
|
| 1149 |
+
"mt_flores_101_kor_eng": {
|
| 1150 |
+
"num_of_instances": 6,
|
| 1151 |
+
"counts": [
|
| 1152 |
+
131,
|
| 1153 |
+
74,
|
| 1154 |
+
45,
|
| 1155 |
+
31
|
| 1156 |
+
],
|
| 1157 |
+
"totals": [
|
| 1158 |
+
194,
|
| 1159 |
+
188,
|
| 1160 |
+
182,
|
| 1161 |
+
176
|
| 1162 |
+
],
|
| 1163 |
+
"precisions": [
|
| 1164 |
+
0.6752577319587628,
|
| 1165 |
+
0.39361702127659576,
|
| 1166 |
+
0.24725274725274726,
|
| 1167 |
+
0.17613636363636365
|
| 1168 |
+
],
|
| 1169 |
+
"bp": 0.9303774188371497,
|
| 1170 |
+
"sys_len": 194,
|
| 1171 |
+
"ref_len": 208,
|
| 1172 |
+
"sacrebleu": 0.30517050622006836,
|
| 1173 |
+
"score": 0.30517050622006836,
|
| 1174 |
+
"score_name": "sacrebleu",
|
| 1175 |
+
"score_ci_low": 0.1936778777853124,
|
| 1176 |
+
"score_ci_high": 0.4521715400303785,
|
| 1177 |
+
"sacrebleu_ci_low": 0.1936778777853124,
|
| 1178 |
+
"sacrebleu_ci_high": 0.4521715400303785
|
| 1179 |
+
},
|
| 1180 |
+
"mt_flores_101_por_eng": {
|
| 1181 |
+
"num_of_instances": 6,
|
| 1182 |
+
"counts": [
|
| 1183 |
+
167,
|
| 1184 |
+
128,
|
| 1185 |
+
100,
|
| 1186 |
+
80
|
| 1187 |
+
],
|
| 1188 |
+
"totals": [
|
| 1189 |
+
211,
|
| 1190 |
+
205,
|
| 1191 |
+
199,
|
| 1192 |
+
193
|
| 1193 |
+
],
|
| 1194 |
+
"precisions": [
|
| 1195 |
+
0.7914691943127963,
|
| 1196 |
+
0.624390243902439,
|
| 1197 |
+
0.5025125628140704,
|
| 1198 |
+
0.41450777202072536
|
| 1199 |
+
],
|
| 1200 |
+
"bp": 1.0,
|
| 1201 |
+
"sys_len": 211,
|
| 1202 |
+
"ref_len": 208,
|
| 1203 |
+
"sacrebleu": 0.5664250237033246,
|
| 1204 |
+
"score": 0.5664250237033246,
|
| 1205 |
+
"score_name": "sacrebleu",
|
| 1206 |
+
"score_ci_low": 0.4703853821459762,
|
| 1207 |
+
"score_ci_high": 0.6458520638777493,
|
| 1208 |
+
"sacrebleu_ci_low": 0.4703853821459762,
|
| 1209 |
+
"sacrebleu_ci_high": 0.6458520638777493
|
| 1210 |
+
},
|
| 1211 |
+
"mt_flores_101_ron_eng": {
|
| 1212 |
+
"num_of_instances": 6,
|
| 1213 |
+
"counts": [
|
| 1214 |
+
160,
|
| 1215 |
+
112,
|
| 1216 |
+
79,
|
| 1217 |
+
58
|
| 1218 |
+
],
|
| 1219 |
+
"totals": [
|
| 1220 |
+
226,
|
| 1221 |
+
220,
|
| 1222 |
+
214,
|
| 1223 |
+
208
|
| 1224 |
+
],
|
| 1225 |
+
"precisions": [
|
| 1226 |
+
0.7079646017699115,
|
| 1227 |
+
0.509090909090909,
|
| 1228 |
+
0.36915887850467294,
|
| 1229 |
+
0.27884615384615385
|
| 1230 |
+
],
|
| 1231 |
+
"bp": 1.0,
|
| 1232 |
+
"sys_len": 226,
|
| 1233 |
+
"ref_len": 208,
|
| 1234 |
+
"sacrebleu": 0.4388804297038792,
|
| 1235 |
+
"score": 0.4388804297038792,
|
| 1236 |
+
"score_name": "sacrebleu",
|
| 1237 |
+
"score_ci_low": 0.3074637309571057,
|
| 1238 |
+
"score_ci_high": 0.5696800272393862,
|
| 1239 |
+
"sacrebleu_ci_low": 0.3074637309571057,
|
| 1240 |
+
"sacrebleu_ci_high": 0.5696800272393862
|
| 1241 |
+
},
|
| 1242 |
+
"mt_flores_101_spa_eng": {
|
| 1243 |
+
"num_of_instances": 6,
|
| 1244 |
+
"counts": [
|
| 1245 |
+
151,
|
| 1246 |
+
97,
|
| 1247 |
+
62,
|
| 1248 |
+
42
|
| 1249 |
+
],
|
| 1250 |
+
"totals": [
|
| 1251 |
+
216,
|
| 1252 |
+
210,
|
| 1253 |
+
204,
|
| 1254 |
+
198
|
| 1255 |
+
],
|
| 1256 |
+
"precisions": [
|
| 1257 |
+
0.6990740740740741,
|
| 1258 |
+
0.4619047619047619,
|
| 1259 |
+
0.30392156862745096,
|
| 1260 |
+
0.2121212121212121
|
| 1261 |
+
],
|
| 1262 |
+
"bp": 1.0,
|
| 1263 |
+
"sys_len": 216,
|
| 1264 |
+
"ref_len": 208,
|
| 1265 |
+
"sacrebleu": 0.37984403828565183,
|
| 1266 |
+
"score": 0.37984403828565183,
|
| 1267 |
+
"score_name": "sacrebleu",
|
| 1268 |
+
"score_ci_low": 0.2939549007299014,
|
| 1269 |
+
"score_ci_high": 0.539072297051574,
|
| 1270 |
+
"sacrebleu_ci_low": 0.2939549007299014,
|
| 1271 |
+
"sacrebleu_ci_high": 0.539072297051574
|
| 1272 |
+
},
|
| 1273 |
+
"score": 0.42425093288480964,
|
| 1274 |
+
"score_name": "subsets_mean",
|
| 1275 |
+
"num_of_instances": 90
|
| 1276 |
+
},
|
| 1277 |
+
"score": 0.6368761936671354,
|
| 1278 |
+
"score_name": "subsets_mean",
|
| 1279 |
+
"num_of_instances": 1537
|
| 1280 |
+
}
|
| 1281 |
+
}
|
results/bluebench/{2025-07-02T17-33-41_evaluation_results.json β 2025-07-03T11-22-55_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
{
"environment_info": {
-"timestamp_utc": "2025-07-
"command_line_invocation": [
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
"--tasks",
@@ -42,7 +42,7 @@
"cache_dir": null
},
"unitxt_version": "1.25.0",
-"unitxt_commit_hash": "
"python_version": "3.10.18",
"system": "Linux",
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -292,57 +292,57 @@
|
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
| 296 |
-
"score": 0.
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
-
"score": 0.
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
-
"f1_Person": 0.
|
| 307 |
-
"f1_Organization": 0.
|
| 308 |
-
"f1_Location": 0.
|
| 309 |
-
"f1_macro": 0.
|
| 310 |
-
"recall_macro": 0.
|
| 311 |
-
"precision_macro": 0.
|
| 312 |
-
"in_classes_support": 0
|
| 313 |
-
"f1_micro": 0.
|
| 314 |
-
"recall_micro": 0.
|
| 315 |
-
"precision_micro": 0.
|
| 316 |
-
"score": 0.
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
-
"score_ci_low": 0.
|
| 319 |
-
"score_ci_high": 0.
|
| 320 |
-
"f1_micro_ci_low": 0.
|
| 321 |
-
"f1_micro_ci_high": 0.
|
| 322 |
},
|
| 323 |
-
"score": 0.
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
| 327 |
"knowledge": {
|
| 328 |
"mmlu_pro_biology": {
|
| 329 |
-
"accuracy": 0.
|
| 330 |
-
"accuracy_ci_low": 0.
|
| 331 |
"accuracy_ci_high": 1.0,
|
| 332 |
"score_name": "accuracy",
|
| 333 |
-
"score": 0.
|
| 334 |
"score_ci_high": 1.0,
|
| 335 |
-
"score_ci_low": 0.
|
| 336 |
"num_of_instances": 7
|
| 337 |
},
|
| 338 |
"mmlu_pro_business": {
|
| 339 |
-
"accuracy": 0.
|
| 340 |
-
"accuracy_ci_low": 0.
|
| 341 |
-
"accuracy_ci_high": 0.
|
| 342 |
"score_name": "accuracy",
|
| 343 |
-
"score": 0.
|
| 344 |
-
"score_ci_high": 0.
|
| 345 |
-
"score_ci_low": 0.
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
|
@@ -376,13 +376,13 @@
|
|
| 376 |
"num_of_instances": 7
|
| 377 |
},
|
| 378 |
"mmlu_pro_engineering": {
|
| 379 |
-
"accuracy": 0.
|
| 380 |
-
"accuracy_ci_low": 0.
|
| 381 |
-
"accuracy_ci_high": 0
|
| 382 |
"score_name": "accuracy",
|
| 383 |
-
"score": 0.
|
| 384 |
-
"score_ci_high": 0
|
| 385 |
-
"score_ci_low": 0.
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
|
@@ -406,13 +406,13 @@
|
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
-
"accuracy": 0.
|
| 410 |
-
"accuracy_ci_low": 0.
|
| 411 |
"accuracy_ci_high": 1.0,
|
| 412 |
"score_name": "accuracy",
|
| 413 |
-
"score": 0.
|
| 414 |
"score_ci_high": 1.0,
|
| 415 |
-
"score_ci_low": 0.
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
|
@@ -465,90 +465,90 @@
|
|
| 465 |
"score_ci_low": 0.2857142857142857,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
-
"score": 0.
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
-
"f1_macro": 0.
|
| 475 |
-
"f1_suggestive": 0.
|
| 476 |
"f1_generic": 1.0,
|
| 477 |
"f1_fanciful": 0.8571428571428571,
|
| 478 |
-
"f1_descriptive": 0.
|
| 479 |
"f1_arbitrary": 0.75,
|
| 480 |
-
"f1_macro_ci_low": 0.
|
| 481 |
-
"f1_macro_ci_high": 0.
|
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
"score": 0.8,
|
| 484 |
"score_ci_high": 0.95,
|
| 485 |
-
"score_ci_low": 0.
|
| 486 |
"num_of_instances": 20,
|
| 487 |
"accuracy": 0.8,
|
| 488 |
-
"accuracy_ci_low": 0.
|
| 489 |
"accuracy_ci_high": 0.95,
|
| 490 |
"f1_micro": 0.8,
|
| 491 |
-
"f1_micro_ci_low": 0.
|
| 492 |
"f1_micro_ci_high": 0.95
|
| 493 |
},
|
| 494 |
-
"legalbench_corporate_lobbying": {
|
| 495 |
-
"f1_macro": 0.6000000000000001,
|
| 496 |
-
"f1_no": 0.8,
|
| 497 |
-
"f1_yes": 0.4,
|
| 498 |
-
"f1_macro_ci_low": 0.375,
|
| 499 |
-
"f1_macro_ci_high": 0.8857142857142857,
|
| 500 |
-
"score_name": "f1_micro",
|
| 501 |
-
"score": 0.7,
|
| 502 |
-
"score_ci_high": 0.85,
|
| 503 |
-
"score_ci_low": 0.45,
|
| 504 |
-
"num_of_instances": 20,
|
| 505 |
-
"accuracy": 0.7,
|
| 506 |
-
"accuracy_ci_low": 0.45,
|
| 507 |
-
"accuracy_ci_high": 0.85,
|
| 508 |
-
"f1_micro": 0.7,
|
| 509 |
-
"f1_micro_ci_low": 0.45,
|
| 510 |
-
"f1_micro_ci_high": 0.85
|
| 511 |
-
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
-
"f1_macro": 0.
|
| 514 |
-
"f1_conclusion": 0.
|
|
|
|
| 515 |
"f1_decree": 0.0,
|
| 516 |
-
"f1_rule": 0.0,
|
| 517 |
"f1_issue": 0.2857142857142857,
|
| 518 |
-
"
|
| 519 |
-
"f1_facts": 0.5714285714285714,
|
| 520 |
"f1_procedural history": 0.0,
|
| 521 |
-
"
|
| 522 |
-
"
|
|
|
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
-
"score": 0.
|
| 525 |
-
"score_ci_high": 0.
|
| 526 |
-
"score_ci_low": 0.
|
| 527 |
"num_of_instances": 20,
|
| 528 |
"accuracy": 0.25,
|
| 529 |
"accuracy_ci_low": 0.1,
|
| 530 |
-
"accuracy_ci_high": 0.
|
| 531 |
-
"f1_micro": 0.
|
| 532 |
-
"f1_micro_ci_low": 0.
|
| 533 |
-
"f1_micro_ci_high": 0.
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
-
"f1_macro": 0.
|
| 537 |
-
"f1_yes": 0.
|
| 538 |
-
"f1_no": 0.
|
| 539 |
-
"f1_macro_ci_low": 0.
|
| 540 |
-
"f1_macro_ci_high": 0.
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
-
"score": 0.
|
| 543 |
-
"score_ci_high": 0.
|
| 544 |
-
"score_ci_low": 0.
|
| 545 |
"num_of_instances": 20,
|
| 546 |
-
"accuracy": 0.
|
| 547 |
-
"accuracy_ci_low": 0.
|
| 548 |
-
"accuracy_ci_high": 0.
|
| 549 |
-
"f1_micro": 0.
|
| 550 |
-
"f1_micro_ci_low": 0.
|
| 551 |
-
"f1_micro_ci_high": 0.
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
"f1_macro": 0.949874686716792,
|
|
@@ -568,84 +568,84 @@
|
|
| 568 |
"f1_micro_ci_low": 0.7480573644337235,
|
| 569 |
"f1_micro_ci_high": 1.0
|
| 570 |
},
|
| 571 |
-
"score": 0.
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
-
"f1_macro": 0.
|
| 578 |
-
"f1_cars":
|
| 579 |
"f1_windows x": 0.3333333333333333,
|
| 580 |
-
"f1_computer graphics": 0.
|
| 581 |
-
"f1_atheism": 0.
|
| 582 |
"f1_religion": 0.0,
|
| 583 |
"f1_medicine": 0.8571428571428571,
|
| 584 |
"f1_christianity": 0.8571428571428571,
|
| 585 |
-
"
|
|
|
|
| 586 |
"f1_middle east": 0.5,
|
| 587 |
-
"f1_motorcycles": 0.
|
| 588 |
-
"f1_pc hardware": 0.
|
| 589 |
"f1_mac hardware": 0.8,
|
| 590 |
"f1_electronics": 0.5,
|
| 591 |
-
"f1_for sale": 0.5,
|
| 592 |
"f1_guns": 0.6,
|
| 593 |
-
"
|
|
|
|
| 594 |
"f1_cryptography": 0.4,
|
| 595 |
-
"f1_baseball": 0.
|
| 596 |
"f1_hockey": 0.8888888888888888,
|
| 597 |
-
"
|
| 598 |
-
"
|
| 599 |
-
"f1_macro_ci_high": 0.7404797748575771,
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
-
"score": 0.
|
| 602 |
-
"score_ci_high": 0.
|
| 603 |
-
"score_ci_low": 0.
|
| 604 |
"num_of_instances": 100,
|
| 605 |
"accuracy": 0.6,
|
| 606 |
"accuracy_ci_low": 0.5,
|
| 607 |
-
"accuracy_ci_high": 0.
|
| 608 |
-
"f1_micro": 0.
|
| 609 |
-
"f1_micro_ci_low": 0.
|
| 610 |
-
"f1_micro_ci_high": 0.
|
| 611 |
},
|
| 612 |
-
"score": 0.
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
-
"f1_macro": 0.
|
| 619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
| 620 |
"f1_credit card or prepaid card": 0.7368421052631579,
|
| 621 |
"f1_money transfer or virtual currency or money service": 0.8,
|
| 622 |
"f1_mortgage": 0.6666666666666666,
|
| 623 |
"f1_debt collection": 0.7777777777777778,
|
| 624 |
"f1_checking or savings account": 0.8571428571428571,
|
| 625 |
"f1_payday loan or title loan or personal loan": 0.6666666666666666,
|
| 626 |
-
"f1_macro_ci_low": 0.
|
| 627 |
-
"f1_macro_ci_high": 0.
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
-
"score": 0.
|
| 630 |
-
"score_ci_high": 0.
|
| 631 |
-
"score_ci_low": 0.
|
| 632 |
"num_of_instances": 100,
|
| 633 |
-
"accuracy": 0.
|
| 634 |
-
"accuracy_ci_low": 0.
|
| 635 |
-
"accuracy_ci_high": 0.
|
| 636 |
-
"f1_micro": 0.
|
| 637 |
-
"f1_micro_ci_low": 0.
|
| 638 |
-
"f1_micro_ci_high": 0.
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
-
"f1_macro": 0.
|
| 642 |
-
"f1_mortgages and loans": 0.
|
| 643 |
"f1_credit card": 0.782608695652174,
|
| 644 |
"f1_debt collection": 0.7,
|
| 645 |
-
"f1_credit reporting": 0.
|
| 646 |
"f1_retail banking": 0.8333333333333334,
|
| 647 |
-
"f1_macro_ci_low": 0.
|
| 648 |
-
"f1_macro_ci_high": 0.
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
"score": 0.78,
|
| 651 |
"score_ci_high": 0.88,
|
|
@@ -658,80 +658,80 @@
|
|
| 658 |
"f1_micro_ci_low": 0.64,
|
| 659 |
"f1_micro_ci_high": 0.88
|
| 660 |
},
|
| 661 |
-
"score": 0.
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
-
"
|
| 669 |
-
"
|
|
|
|
| 670 |
"score_name": "program_accuracy",
|
| 671 |
-
"
|
| 672 |
-
"
|
| 673 |
-
"
|
| 674 |
-
"
|
| 675 |
-
"
|
| 676 |
-
"
|
| 677 |
-
"execution_accuracy_ci_high": 0.33
|
| 678 |
},
|
| 679 |
-
"score": 0.
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
-
"precision": 0.
|
| 686 |
-
"recall": 0.
|
| 687 |
-
"f1": 0.
|
| 688 |
-
"precision_ci_low": 0.
|
| 689 |
-
"precision_ci_high": 0.
|
| 690 |
-
"recall_ci_low": 0.
|
| 691 |
-
"recall_ci_high": 0.
|
| 692 |
-
"f1_ci_low": 0.
|
| 693 |
-
"f1_ci_high": 0.
|
| 694 |
"score_name": "f1",
|
| 695 |
-
"score": 0.
|
| 696 |
-
"score_ci_high": 0.
|
| 697 |
-
"score_ci_low": 0.
|
| 698 |
"num_of_instances": 100,
|
| 699 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
| 700 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
| 701 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
| 702 |
-
"faithfullness_f1_token_overlap": 0.
|
| 703 |
-
"faithfullness_recall_token_overlap": 0.
|
| 704 |
-
"faithfullness_precision_token_overlap": 0.
|
| 705 |
-
"correctness_f1_token_overlap": 0.
|
| 706 |
-
"correctness_recall_token_overlap": 0.
|
| 707 |
-
"correctness_precision_token_overlap": 0.
|
| 708 |
},
|
| 709 |
-
"score": 0.
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
| 713 |
"reasoning": {
|
| 714 |
"hellaswag": {
|
| 715 |
-
"accuracy": 0.
|
| 716 |
-
"accuracy_ci_low": 0.
|
| 717 |
-
"accuracy_ci_high": 0.
|
| 718 |
"score_name": "accuracy",
|
| 719 |
-
"score": 0.
|
| 720 |
-
"score_ci_high": 0.
|
| 721 |
-
"score_ci_low": 0.
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
"accuracy": 0.93,
|
| 726 |
-
"accuracy_ci_low": 0.
|
| 727 |
"accuracy_ci_high": 0.97,
|
| 728 |
"score_name": "accuracy",
|
| 729 |
"score": 0.93,
|
| 730 |
"score_ci_high": 0.97,
|
| 731 |
-
"score_ci_low": 0.
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
-
"score": 0.
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
|
@@ -741,66 +741,66 @@
|
|
| 741 |
"severity_high": 0.0,
|
| 742 |
"severity_medium": 0.0,
|
| 743 |
"severity_low": 100.0,
|
| 744 |
-
"category_harmful_info": 0.
|
| 745 |
-
"category_deception": 0.
|
| 746 |
-
"category_explicit_content": 0.
|
| 747 |
-
"category_violence": 0.
|
| 748 |
-
"category_discrimination": 0.
|
| 749 |
-
"category_substance_abuse": 0.
|
| 750 |
-
"category_pii": 0.
|
| 751 |
-
"safety": 0.
|
| 752 |
-
"safety_ci_low": 0.
|
| 753 |
-
"safety_ci_high": 0.
|
| 754 |
"score_name": "safety",
|
| 755 |
-
"score": 0.
|
| 756 |
-
"score_ci_high": 0.
|
| 757 |
-
"score_ci_low": 0.
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
-
"score": 0.
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
-
"rouge1": 0.
|
| 768 |
-
"
|
| 769 |
-
"
|
|
|
|
|
|
|
| 770 |
"score_name": "rougeL",
|
| 771 |
-
"
|
| 772 |
-
"
|
| 773 |
-
"
|
| 774 |
-
"
|
| 775 |
-
"
|
| 776 |
-
"
|
| 777 |
-
"
|
| 778 |
-
"
|
| 779 |
-
"
|
| 780 |
-
"
|
| 781 |
-
"rougeLsum_ci_low": 0.3458152710057813,
|
| 782 |
-
"rougeLsum_ci_high": 0.3886449359967606
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
-
"rouge1": 0.
|
| 787 |
-
"
|
| 788 |
-
"
|
|
|
|
|
|
|
| 789 |
"score_name": "rougeL",
|
| 790 |
-
"
|
| 791 |
-
"
|
| 792 |
-
"
|
| 793 |
-
"
|
| 794 |
-
"
|
| 795 |
-
"
|
| 796 |
-
"
|
| 797 |
-
"
|
| 798 |
-
"
|
| 799 |
-
"
|
| 800 |
-
"rougeLsum_ci_low": 0.07977530131627288,
|
| 801 |
-
"rougeLsum_ci_high": 0.10477307872763918
|
| 802 |
},
|
| 803 |
-
"score": 0.
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
@@ -808,473 +808,473 @@
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
-
0.
|
| 824 |
-
0.
|
| 825 |
-
0.
|
| 826 |
-
0.
|
| 827 |
],
|
| 828 |
-
"bp": 0
|
| 829 |
-
"sys_len":
|
| 830 |
"ref_len": 208,
|
| 831 |
-
"sacrebleu": 0.
|
| 832 |
-
"score": 0.
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
-
"score_ci_low": 0.
|
| 835 |
-
"score_ci_high": 0.
|
| 836 |
-
"sacrebleu_ci_low": 0.
|
| 837 |
-
"sacrebleu_ci_high": 0.
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
-
216,
|
| 849 |
-
210,
|
| 850 |
204,
|
| 851 |
-
198
|
|
|
|
|
|
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
-
0.
|
| 855 |
-
0.
|
| 856 |
-
0.
|
| 857 |
-
0.
|
| 858 |
],
|
| 859 |
-
"bp":
|
| 860 |
-
"sys_len":
|
| 861 |
"ref_len": 208,
|
| 862 |
-
"sacrebleu": 0.
|
| 863 |
-
"score": 0.
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
-
"score_ci_low": 0.
|
| 866 |
-
"score_ci_high": 0.
|
| 867 |
-
"sacrebleu_ci_low": 0.
|
| 868 |
-
"sacrebleu_ci_high": 0.
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
-
201,
|
| 880 |
195,
|
| 881 |
189,
|
| 882 |
-
183
|
|
|
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
-
0.
|
| 886 |
-
0.
|
| 887 |
-
0.
|
| 888 |
-
0.
|
| 889 |
],
|
| 890 |
-
"bp": 0.
|
| 891 |
-
"sys_len":
|
| 892 |
"ref_len": 209,
|
| 893 |
-
"sacrebleu": 0.
|
| 894 |
-
"score": 0.
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
-
"score_ci_low": 0.
|
| 897 |
-
"score_ci_high": 0.
|
| 898 |
-
"sacrebleu_ci_low": 0.
|
| 899 |
-
"sacrebleu_ci_high": 0.
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
-
0.
|
| 917 |
-
0.
|
| 918 |
-
0.
|
| 919 |
-
0.
|
| 920 |
],
|
| 921 |
"bp": 1.0,
|
| 922 |
-
"sys_len":
|
| 923 |
"ref_len": 216,
|
| 924 |
-
"sacrebleu": 0.
|
| 925 |
-
"score": 0.
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
-
"score_ci_low": 0.
|
| 928 |
-
"score_ci_high": 0.
|
| 929 |
-
"sacrebleu_ci_low": 0.
|
| 930 |
-
"sacrebleu_ci_high": 0.
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
-
0.
|
| 948 |
-
0.
|
| 949 |
-
0.
|
| 950 |
-
0.
|
| 951 |
],
|
| 952 |
"bp": 1.0,
|
| 953 |
-
"sys_len":
|
| 954 |
"ref_len": 235,
|
| 955 |
-
"sacrebleu": 0.
|
| 956 |
-
"score": 0.
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
-
"score_ci_low": 0.
|
| 959 |
-
"score_ci_high": 0.
|
| 960 |
-
"sacrebleu_ci_low": 0.
|
| 961 |
-
"sacrebleu_ci_high": 0.
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
-
|
| 967 |
-
|
| 968 |
-
|
| 969 |
-
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
-
0.
|
| 979 |
-
0.
|
| 980 |
-
0.
|
| 981 |
-
0.
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
-
"sys_len":
|
| 985 |
"ref_len": 249,
|
| 986 |
-
"sacrebleu": 0.
|
| 987 |
-
"score": 0.
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
-
"score_ci_low": 0.
|
| 990 |
-
"score_ci_high": 0.
|
| 991 |
-
"sacrebleu_ci_low": 0.
|
| 992 |
-
"sacrebleu_ci_high": 0.
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
-
0.
|
| 1010 |
-
0.
|
| 1011 |
-
0.
|
| 1012 |
-
0.
|
| 1013 |
],
|
| 1014 |
"bp": 1.0,
|
| 1015 |
-
"sys_len":
|
| 1016 |
"ref_len": 222,
|
| 1017 |
-
"sacrebleu": 0.
|
| 1018 |
-
"score": 0.
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
-
"score_ci_low": 0.
|
| 1021 |
-
"score_ci_high": 0.
|
| 1022 |
-
"sacrebleu_ci_low": 0.
|
| 1023 |
-
"sacrebleu_ci_high": 0.
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
-
0.
|
| 1041 |
-
0.
|
| 1042 |
-
0.
|
| 1043 |
-
0.
|
| 1044 |
],
|
| 1045 |
-
"bp": 0
|
| 1046 |
-
"sys_len":
|
| 1047 |
"ref_len": 230,
|
| 1048 |
-
"sacrebleu": 0.
|
| 1049 |
-
"score": 0.
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
-
"score_ci_low": 0.
|
| 1052 |
-
"score_ci_high": 0.
|
| 1053 |
-
"sacrebleu_ci_low": 0.
|
| 1054 |
-
"sacrebleu_ci_high": 0.
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
],
|
| 1064 |
"totals": [
|
|
|
|
| 1065 |
228,
|
| 1066 |
222,
|
| 1067 |
-
216
|
| 1068 |
-
210
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
-
0.
|
| 1072 |
-
0.
|
| 1073 |
-
0.
|
| 1074 |
0.16666666666666669
|
| 1075 |
],
|
| 1076 |
-
"bp": 0.
|
| 1077 |
-
"sys_len":
|
| 1078 |
"ref_len": 243,
|
| 1079 |
-
"sacrebleu": 0.
|
| 1080 |
-
"score": 0.
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
-
"score_ci_low": 0.
|
| 1083 |
-
"score_ci_high": 0.
|
| 1084 |
-
"sacrebleu_ci_low": 0.
|
| 1085 |
-
"sacrebleu_ci_high": 0.
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
-
0.
|
| 1103 |
-
0.
|
| 1104 |
-
0.
|
| 1105 |
-
0.
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
-
"sys_len":
|
| 1109 |
"ref_len": 208,
|
| 1110 |
-
"sacrebleu": 0.
|
| 1111 |
-
"score": 0.
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
-
"score_ci_low": 0.
|
| 1114 |
-
"score_ci_high": 0.
|
| 1115 |
-
"sacrebleu_ci_low": 0.
|
| 1116 |
-
"sacrebleu_ci_high": 0.
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
-
0.
|
| 1134 |
-
0.
|
| 1135 |
-
0.
|
| 1136 |
-
0.
|
| 1137 |
],
|
| 1138 |
-
"bp":
|
| 1139 |
-
"sys_len":
|
| 1140 |
"ref_len": 208,
|
| 1141 |
-
"sacrebleu": 0.
|
| 1142 |
-
"score": 0.
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
-
"score_ci_low": 0.
|
| 1145 |
-
"score_ci_high": 0.
|
| 1146 |
-
"sacrebleu_ci_low": 0.
|
| 1147 |
-
"sacrebleu_ci_high": 0.
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
-
|
| 1161 |
-
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
-
0.
|
| 1165 |
-
0.
|
| 1166 |
-
0.
|
| 1167 |
-
0.
|
| 1168 |
],
|
| 1169 |
-
"bp": 0
|
| 1170 |
-
"sys_len":
|
| 1171 |
"ref_len": 208,
|
| 1172 |
-
"sacrebleu": 0.
|
| 1173 |
-
"score": 0.
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
-
"score_ci_low": 0.
|
| 1176 |
-
"score_ci_high": 0.
|
| 1177 |
-
"sacrebleu_ci_low": 0.
|
| 1178 |
-
"sacrebleu_ci_high": 0.
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
-
|
| 1184 |
-
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
-
0.
|
| 1196 |
-
0.
|
| 1197 |
-
0.
|
| 1198 |
-
0.
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
-
"sys_len":
|
| 1202 |
"ref_len": 208,
|
| 1203 |
-
"sacrebleu": 0.
|
| 1204 |
-
"score": 0.
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
-
"score_ci_low": 0.
|
| 1207 |
-
"score_ci_high": 0.
|
| 1208 |
-
"sacrebleu_ci_low": 0.
|
| 1209 |
-
"sacrebleu_ci_high": 0.
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
159,
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
-
0.
|
| 1227 |
-
0.
|
| 1228 |
-
0.
|
| 1229 |
-
0.
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
-
"sys_len":
|
| 1233 |
"ref_len": 208,
|
| 1234 |
-
"sacrebleu": 0.
|
| 1235 |
-
"score": 0.
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
-
"score_ci_low": 0.
|
| 1238 |
-
"score_ci_high": 0.
|
| 1239 |
-
"sacrebleu_ci_low": 0.
|
| 1240 |
-
"sacrebleu_ci_high": 0.
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
145,
|
| 1246 |
-
|
| 1247 |
60,
|
| 1248 |
-
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
-
0.
|
| 1258 |
-
0.
|
| 1259 |
-
0.
|
| 1260 |
-
0.
|
| 1261 |
],
|
| 1262 |
-
"bp": 0.
|
| 1263 |
-
"sys_len":
|
| 1264 |
"ref_len": 208,
|
| 1265 |
-
"sacrebleu": 0.
|
| 1266 |
-
"score": 0.
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
-
"score_ci_low": 0.
|
| 1269 |
-
"score_ci_high": 0.
|
| 1270 |
-
"sacrebleu_ci_low": 0.
|
| 1271 |
-
"sacrebleu_ci_high": 0.
|
| 1272 |
},
|
| 1273 |
-
"score": 0.
|
| 1274 |
"score_name": "subsets_mean",
|
| 1275 |
"num_of_instances": 90
|
| 1276 |
},
|
| 1277 |
-
"score": 0.
|
| 1278 |
"score_name": "subsets_mean",
|
| 1279 |
"num_of_instances": 1537
|
| 1280 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"environment_info": {
|
| 3 |
+
"timestamp_utc": "2025-07-03T15:22:51.072452Z",
|
| 4 |
"command_line_invocation": [
|
| 5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
"--tasks",
|
|
|
|
| 42 |
"cache_dir": null
|
| 43 |
},
|
| 44 |
"unitxt_version": "1.25.0",
|
| 45 |
+
"unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
|
| 46 |
"python_version": "3.10.18",
|
| 47 |
"system": "Linux",
|
| 48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.43209876543209874,
|
| 296 |
+
"score": 0.43209876543209874,
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
+
"score": 0.43209876543209874,
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
+
"f1_Person": 0.8695652173913043,
|
| 307 |
+
"f1_Organization": 0.7384615384615384,
|
| 308 |
+
"f1_Location": 0.8085106382978724,
|
| 309 |
+
"f1_macro": 0.8055124647169051,
|
| 310 |
+
"recall_macro": 0.8394582470669426,
|
| 311 |
+
"precision_macro": 0.781433607520564,
|
| 312 |
+
"in_classes_support": 1.0,
|
| 313 |
+
"f1_micro": 0.7974683544303797,
|
| 314 |
+
"recall_micro": 0.84,
|
| 315 |
+
"precision_micro": 0.7590361445783133,
|
| 316 |
+
"score": 0.7974683544303797,
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
+
"score_ci_low": 0.7370661700501011,
|
| 319 |
+
"score_ci_high": 0.8625005221303097,
|
| 320 |
+
"f1_micro_ci_low": 0.7370661700501011,
|
| 321 |
+
"f1_micro_ci_high": 0.8625005221303097
|
| 322 |
},
|
| 323 |
+
"score": 0.7974683544303797,
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
| 327 |
"knowledge": {
|
| 328 |
"mmlu_pro_biology": {
|
| 329 |
+
"accuracy": 0.7142857142857143,
|
| 330 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 331 |
"accuracy_ci_high": 1.0,
|
| 332 |
"score_name": "accuracy",
|
| 333 |
+
"score": 0.7142857142857143,
|
| 334 |
"score_ci_high": 1.0,
|
| 335 |
+
"score_ci_low": 0.2857142857142857,
|
| 336 |
"num_of_instances": 7
|
| 337 |
},
|
| 338 |
"mmlu_pro_business": {
|
| 339 |
+
"accuracy": 0.2857142857142857,
|
| 340 |
+
"accuracy_ci_low": 0.0,
|
| 341 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 342 |
"score_name": "accuracy",
|
| 343 |
+
"score": 0.2857142857142857,
|
| 344 |
+
"score_ci_high": 0.7142857142857143,
|
| 345 |
+
"score_ci_low": 0.0,
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
|
|
|
| 376 |
"num_of_instances": 7
|
| 377 |
},
|
| 378 |
"mmlu_pro_engineering": {
|
| 379 |
+
"accuracy": 0.7142857142857143,
|
| 380 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 381 |
+
"accuracy_ci_high": 1.0,
|
| 382 |
"score_name": "accuracy",
|
| 383 |
+
"score": 0.7142857142857143,
|
| 384 |
+
"score_ci_high": 1.0,
|
| 385 |
+
"score_ci_low": 0.2857142857142857,
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
|
|
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
+
"accuracy": 0.7142857142857143,
|
| 410 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 411 |
"accuracy_ci_high": 1.0,
|
| 412 |
"score_name": "accuracy",
|
| 413 |
+
"score": 0.7142857142857143,
|
| 414 |
"score_ci_high": 1.0,
|
| 415 |
+
"score_ci_low": 0.2857142857142857,
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
|
|
|
| 465 |
"score_ci_low": 0.2857142857142857,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
+
"score": 0.6632653061224489,
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
+
"f1_macro": 0.7448051948051948,
|
| 475 |
+
"f1_suggestive": 0.5454545454545454,
|
| 476 |
"f1_generic": 1.0,
|
| 477 |
"f1_fanciful": 0.8571428571428571,
|
| 478 |
+
"f1_descriptive": 0.5714285714285714,
|
| 479 |
"f1_arbitrary": 0.75,
|
| 480 |
+
"f1_macro_ci_low": 0.5498662790687274,
|
| 481 |
+
"f1_macro_ci_high": 0.9225357772974475,
|
| 482 |
+
"score_name": "f1_micro",
|
| 483 |
+
"score": 0.717948717948718,
|
| 484 |
+
"score_ci_high": 0.9,
|
| 485 |
+
"score_ci_low": 0.47368421052631576,
|
| 486 |
+
"num_of_instances": 20,
|
| 487 |
+
"accuracy": 0.7,
|
| 488 |
+
"accuracy_ci_low": 0.45,
|
| 489 |
+
"accuracy_ci_high": 0.9,
|
| 490 |
+
"f1_micro": 0.717948717948718,
|
| 491 |
+
"f1_micro_ci_low": 0.47368421052631576,
|
| 492 |
+
"f1_micro_ci_high": 0.9
|
| 493 |
+
},
|
| 494 |
+
"legalbench_corporate_lobbying": {
|
| 495 |
+
"f1_macro": 0.7333333333333334,
|
| 496 |
+
"f1_no": 0.8666666666666667,
|
| 497 |
+
"f1_yes": 0.6,
|
| 498 |
+
"f1_macro_ci_low": 0.4666666666666667,
|
| 499 |
+
"f1_macro_ci_high": 0.96223632692803,
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
"score": 0.8,
|
| 502 |
"score_ci_high": 0.95,
|
| 503 |
+
"score_ci_low": 0.5971324299664202,
|
| 504 |
"num_of_instances": 20,
|
| 505 |
"accuracy": 0.8,
|
| 506 |
+
"accuracy_ci_low": 0.5971324299664202,
|
| 507 |
"accuracy_ci_high": 0.95,
|
| 508 |
"f1_micro": 0.8,
|
| 509 |
+
"f1_micro_ci_low": 0.5971324299664202,
|
| 510 |
"f1_micro_ci_high": 0.95
|
| 511 |
},
|
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
+
"f1_macro": 0.2006802721088435,
|
| 514 |
+
"f1_conclusion": 0.2857142857142857,
|
| 515 |
+
"f1_analysis": 0.5,
|
| 516 |
"f1_decree": 0.0,
|
|
|
|
| 517 |
"f1_issue": 0.2857142857142857,
|
| 518 |
+
"f1_facts": 0.3333333333333333,
|
|
|
|
| 519 |
"f1_procedural history": 0.0,
|
| 520 |
+
"f1_rule": 0.0,
|
| 521 |
+
"f1_macro_ci_low": 0.07434693648030322,
|
| 522 |
+
"f1_macro_ci_high": 0.41439679265221946,
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
+
"score": 0.2777777777777778,
|
| 525 |
+
"score_ci_high": 0.5,
|
| 526 |
+
"score_ci_low": 0.10739242693044677,
|
| 527 |
"num_of_instances": 20,
|
| 528 |
"accuracy": 0.25,
|
| 529 |
"accuracy_ci_low": 0.1,
|
| 530 |
+
"accuracy_ci_high": 0.5,
|
| 531 |
+
"f1_micro": 0.2777777777777778,
|
| 532 |
+
"f1_micro_ci_low": 0.10739242693044677,
|
| 533 |
+
"f1_micro_ci_high": 0.5
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
+
"f1_macro": 0.45054945054945056,
|
| 537 |
+
"f1_yes": 0.6153846153846154,
|
| 538 |
+
"f1_no": 0.2857142857142857,
|
| 539 |
+
"f1_macro_ci_low": 0.27083333333333337,
|
| 540 |
+
"f1_macro_ci_high": 0.7150997150997151,
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
+
"score": 0.5,
|
| 543 |
+
"score_ci_high": 0.7,
|
| 544 |
+
"score_ci_low": 0.28933563893775816,
|
| 545 |
"num_of_instances": 20,
|
| 546 |
+
"accuracy": 0.5,
|
| 547 |
+
"accuracy_ci_low": 0.28933563893775816,
|
| 548 |
+
"accuracy_ci_high": 0.7,
|
| 549 |
+
"f1_micro": 0.5,
|
| 550 |
+
"f1_micro_ci_low": 0.28933563893775816,
|
| 551 |
+
"f1_micro_ci_high": 0.7
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
"f1_macro": 0.949874686716792,
|
|
|
|
| 568 |
"f1_micro_ci_low": 0.7480573644337235,
|
| 569 |
"f1_micro_ci_high": 1.0
|
| 570 |
},
|
| 571 |
+
"score": 0.6491452991452992,
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
+
"f1_macro": 0.6360263347763347,
|
| 578 |
+
"f1_cars": 0.9090909090909091,
|
| 579 |
"f1_windows x": 0.3333333333333333,
|
| 580 |
+
"f1_computer graphics": 0.625,
|
| 581 |
+
"f1_atheism": 0.5714285714285714,
|
| 582 |
"f1_religion": 0.0,
|
| 583 |
"f1_medicine": 0.8571428571428571,
|
| 584 |
"f1_christianity": 0.8571428571428571,
|
| 585 |
+
"f1_for sale": 0.5714285714285714,
|
| 586 |
+
"f1_microsoft windows": 0.9090909090909091,
|
| 587 |
"f1_middle east": 0.5,
|
| 588 |
+
"f1_motorcycles": 0.6,
|
| 589 |
+
"f1_pc hardware": 0.6666666666666666,
|
| 590 |
"f1_mac hardware": 0.8,
|
| 591 |
"f1_electronics": 0.5,
|
|
|
|
| 592 |
"f1_guns": 0.6,
|
| 593 |
+
"f1_politics": 0.3333333333333333,
|
| 594 |
+
"f1_space": 0.8888888888888888,
|
| 595 |
"f1_cryptography": 0.4,
|
| 596 |
+
"f1_baseball": 0.9090909090909091,
|
| 597 |
"f1_hockey": 0.8888888888888888,
|
| 598 |
+
"f1_macro_ci_low": 0.5511216451732288,
|
| 599 |
+
"f1_macro_ci_high": 0.7430408448382602,
|
|
|
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
+
"score": 0.6629834254143646,
|
| 602 |
+
"score_ci_high": 0.7472339245831151,
|
| 603 |
+
"score_ci_low": 0.5537799874567927,
|
| 604 |
"num_of_instances": 100,
|
| 605 |
"accuracy": 0.6,
|
| 606 |
"accuracy_ci_low": 0.5,
|
| 607 |
+
"accuracy_ci_high": 0.69,
|
| 608 |
+
"f1_micro": 0.6629834254143646,
|
| 609 |
+
"f1_micro_ci_low": 0.5537799874567927,
|
| 610 |
+
"f1_micro_ci_high": 0.7472339245831151
|
| 611 |
},
|
| 612 |
+
"score": 0.6629834254143646,
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
+
"f1_macro": 0.774627043799976,
|
| 619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9172932330827067,
|
| 620 |
"f1_credit card or prepaid card": 0.7368421052631579,
|
| 621 |
"f1_money transfer or virtual currency or money service": 0.8,
|
| 622 |
"f1_mortgage": 0.6666666666666666,
|
| 623 |
"f1_debt collection": 0.7777777777777778,
|
| 624 |
"f1_checking or savings account": 0.8571428571428571,
|
| 625 |
"f1_payday loan or title loan or personal loan": 0.6666666666666666,
|
| 626 |
+
"f1_macro_ci_low": 0.575934991917505,
|
| 627 |
+
"f1_macro_ci_high": 0.8809011779404143,
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
+
"score": 0.8717948717948718,
|
| 630 |
+
"score_ci_high": 0.9238578680203046,
|
| 631 |
+
"score_ci_low": 0.78640508344599,
|
| 632 |
"num_of_instances": 100,
|
| 633 |
+
"accuracy": 0.85,
|
| 634 |
+
"accuracy_ci_low": 0.77,
|
| 635 |
+
"accuracy_ci_high": 0.91,
|
| 636 |
+
"f1_micro": 0.8717948717948718,
|
| 637 |
+
"f1_micro_ci_low": 0.78640508344599,
|
| 638 |
+
"f1_micro_ci_high": 0.9238578680203046
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
+
"f1_macro": 0.7825559947299078,
|
| 642 |
+
"f1_mortgages and loans": 0.8695652173913043,
|
| 643 |
"f1_credit card": 0.782608695652174,
|
| 644 |
"f1_debt collection": 0.7,
|
| 645 |
+
"f1_credit reporting": 0.7272727272727273,
|
| 646 |
"f1_retail banking": 0.8333333333333334,
|
| 647 |
+
"f1_macro_ci_low": 0.6554182687947065,
|
| 648 |
+
"f1_macro_ci_high": 0.8950729595163965,
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
"score": 0.78,
|
| 651 |
"score_ci_high": 0.88,
|
|
|
|
| 658 |
"f1_micro_ci_low": 0.64,
|
| 659 |
"f1_micro_ci_high": 0.88
|
| 660 |
},
|
| 661 |
+
"score": 0.8258974358974359,
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
+
"execution_accuracy": 0.26,
|
| 669 |
+
"program_accuracy": 0.27,
|
| 670 |
+
"score": 0.27,
|
| 671 |
"score_name": "program_accuracy",
|
| 672 |
+
"execution_accuracy_ci_low": 0.18,
|
| 673 |
+
"execution_accuracy_ci_high": 0.36,
|
| 674 |
+
"program_accuracy_ci_low": 0.19,
|
| 675 |
+
"program_accuracy_ci_high": 0.36,
|
| 676 |
+
"score_ci_low": 0.19,
|
| 677 |
+
"score_ci_high": 0.36
|
|
|
|
| 678 |
},
|
| 679 |
+
"score": 0.27,
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
+
"precision": 0.5331803217904779,
|
| 686 |
+
"recall": 0.6162591761810932,
|
| 687 |
+
"f1": 0.5317474758084658,
|
| 688 |
+
"precision_ci_low": 0.49517557669164985,
|
| 689 |
+
"precision_ci_high": 0.5720982419484572,
|
| 690 |
+
"recall_ci_low": 0.5754133026164426,
|
| 691 |
+
"recall_ci_high": 0.6573336562511775,
|
| 692 |
+
"f1_ci_low": 0.501458251561677,
|
| 693 |
+
"f1_ci_high": 0.5625425236205234,
|
| 694 |
"score_name": "f1",
|
| 695 |
+
"score": 0.5317474758084658,
|
| 696 |
+
"score_ci_high": 0.5625425236205234,
|
| 697 |
+
"score_ci_low": 0.501458251561677,
|
| 698 |
"num_of_instances": 100,
|
| 699 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.703714978992939,
|
| 700 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.7273174220323563,
|
| 701 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.6931151136755943,
|
| 702 |
+
"faithfullness_f1_token_overlap": 0.38943632812862744,
|
| 703 |
+
"faithfullness_recall_token_overlap": 0.2823300548933722,
|
| 704 |
+
"faithfullness_precision_token_overlap": 0.7431638772011331,
|
| 705 |
+
"correctness_f1_token_overlap": 0.5317474758084658,
|
| 706 |
+
"correctness_recall_token_overlap": 0.6162591761810932,
|
| 707 |
+
"correctness_precision_token_overlap": 0.5331803217904779
|
| 708 |
},
|
| 709 |
+
"score": 0.5317474758084658,
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
| 713 |
"reasoning": {
|
| 714 |
"hellaswag": {
|
| 715 |
+
"accuracy": 0.57,
|
| 716 |
+
"accuracy_ci_low": 0.46198165422712656,
|
| 717 |
+
"accuracy_ci_high": 0.65,
|
| 718 |
"score_name": "accuracy",
|
| 719 |
+
"score": 0.57,
|
| 720 |
+
"score_ci_high": 0.65,
|
| 721 |
+
"score_ci_low": 0.46198165422712656,
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
"accuracy": 0.93,
|
| 726 |
+
"accuracy_ci_low": 0.87,
|
| 727 |
"accuracy_ci_high": 0.97,
|
| 728 |
"score_name": "accuracy",
|
| 729 |
"score": 0.93,
|
| 730 |
"score_ci_high": 0.97,
|
| 731 |
+
"score_ci_low": 0.87,
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
+
"score": 0.75,
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
|
|
|
| 741 |
"severity_high": 0.0,
|
| 742 |
"severity_medium": 0.0,
|
| 743 |
"severity_low": 100.0,
|
| 744 |
+
"category_harmful_info": 0.8221827392569846,
|
| 745 |
+
"category_deception": 0.8071670434390654,
|
| 746 |
+
"category_explicit_content": 0.8828472378428533,
|
| 747 |
+
"category_violence": 0.7895851463789031,
|
| 748 |
+
"category_discrimination": 0.7562157607504298,
|
| 749 |
+
"category_substance_abuse": 0.8583730585045284,
|
| 750 |
+
"category_pii": 0.8396110937551216,
|
| 751 |
+
"safety": 0.8135096281559931,
|
| 752 |
+
"safety_ci_low": 0.7967702393625679,
|
| 753 |
+
"safety_ci_high": 0.8310506142233636,
|
| 754 |
"score_name": "safety",
|
| 755 |
+
"score": 0.8135096281559931,
|
| 756 |
+
"score_ci_high": 0.8310506142233636,
|
| 757 |
+
"score_ci_low": 0.7967702393625679,
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
+
"score": 0.8135096281559931,
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
+
"rouge1": 0.4261297102014613,
|
| 768 |
+
"rougeLsum": 0.36938097206123816,
|
| 769 |
+
"rouge2": 0.2036872065921589,
|
| 770 |
+
"rougeL": 0.2968550225185778,
|
| 771 |
+
"score": 0.2968550225185778,
|
| 772 |
"score_name": "rougeL",
|
| 773 |
+
"rouge1_ci_low": 0.40335430952139306,
|
| 774 |
+
"rouge1_ci_high": 0.44823436472459277,
|
| 775 |
+
"rougeLsum_ci_low": 0.3493633172912225,
|
| 776 |
+
"rougeLsum_ci_high": 0.3905259807810269,
|
| 777 |
+
"rouge2_ci_low": 0.1891962367215942,
|
| 778 |
+
"rouge2_ci_high": 0.22028254768837505,
|
| 779 |
+
"rougeL_ci_low": 0.27992380389675553,
|
| 780 |
+
"rougeL_ci_high": 0.3154054172474221,
|
| 781 |
+
"score_ci_low": 0.27992380389675553,
|
| 782 |
+
"score_ci_high": 0.3154054172474221
|
|
|
|
|
|
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
+
"rouge1": 0.11562725478537023,
|
| 787 |
+
"rougeLsum": 0.09465508243515998,
|
| 788 |
+
"rouge2": 0.017708600304049516,
|
| 789 |
+
"rougeL": 0.08457970229678397,
|
| 790 |
+
"score": 0.08457970229678397,
|
| 791 |
"score_name": "rougeL",
|
| 792 |
+
"rouge1_ci_low": 0.09872400500532583,
|
| 793 |
+
"rouge1_ci_high": 0.13225162304183855,
|
| 794 |
+
"rougeLsum_ci_low": 0.08102101648826818,
|
| 795 |
+
"rougeLsum_ci_high": 0.1079514615631145,
|
| 796 |
+
"rouge2_ci_low": 0.01291561124547185,
|
| 797 |
+
"rouge2_ci_high": 0.02325190774651309,
|
| 798 |
+
"rougeL_ci_low": 0.07285237512783484,
|
| 799 |
+
"rougeL_ci_high": 0.09622345560028686,
|
| 800 |
+
"score_ci_low": 0.07285237512783484,
|
| 801 |
+
"score_ci_high": 0.09622345560028686
|
|
|
|
|
|
|
| 802 |
},
|
| 803 |
+
"score": 0.1907173624076809,
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
+
162,
|
| 812 |
+
117,
|
| 813 |
+
85,
|
| 814 |
+
63
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
+
221,
|
| 818 |
+
215,
|
| 819 |
+
209,
|
| 820 |
+
203
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
+
0.7330316742081447,
|
| 824 |
+
0.5441860465116278,
|
| 825 |
+
0.4066985645933014,
|
| 826 |
+
0.3103448275862069
|
| 827 |
],
|
| 828 |
+
"bp": 1.0,
|
| 829 |
+
"sys_len": 221,
|
| 830 |
"ref_len": 208,
|
| 831 |
+
"sacrebleu": 0.4736928434500847,
|
| 832 |
+
"score": 0.4736928434500847,
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
+
"score_ci_low": 0.2928408927128658,
|
| 835 |
+
"score_ci_high": 0.551671370510479,
|
| 836 |
+
"sacrebleu_ci_low": 0.2928408927128658,
|
| 837 |
+
"sacrebleu_ci_high": 0.551671370510479
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
+
142,
|
| 843 |
+
89,
|
| 844 |
+
58,
|
| 845 |
+
39
|
| 846 |
],
|
| 847 |
"totals": [
|
|
|
|
|
|
|
| 848 |
204,
|
| 849 |
+
198,
|
| 850 |
+
192,
|
| 851 |
+
186
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
+
0.696078431372549,
|
| 855 |
+
0.4494949494949495,
|
| 856 |
+
0.3020833333333333,
|
| 857 |
+
0.20967741935483872
|
| 858 |
],
|
| 859 |
+
"bp": 0.9805831403241088,
|
| 860 |
+
"sys_len": 204,
|
| 861 |
"ref_len": 208,
|
| 862 |
+
"sacrebleu": 0.3679169337842913,
|
| 863 |
+
"score": 0.3679169337842913,
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
+
"score_ci_low": 0.2502353509368021,
|
| 866 |
+
"score_ci_high": 0.5243739849514055,
|
| 867 |
+
"sacrebleu_ci_low": 0.2502353509368021,
|
| 868 |
+
"sacrebleu_ci_high": 0.5243739849514055
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
+
111,
|
| 874 |
+
58,
|
| 875 |
+
33,
|
| 876 |
+
16
|
| 877 |
],
|
| 878 |
"totals": [
|
|
|
|
| 879 |
195,
|
| 880 |
189,
|
| 881 |
+
183,
|
| 882 |
+
177
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
+
0.5692307692307692,
|
| 886 |
+
0.30687830687830686,
|
| 887 |
+
0.18032786885245902,
|
| 888 |
+
0.0903954802259887
|
| 889 |
],
|
| 890 |
+
"bp": 0.9307217935222629,
|
| 891 |
+
"sys_len": 195,
|
| 892 |
"ref_len": 209,
|
| 893 |
+
"sacrebleu": 0.21499873971480102,
|
| 894 |
+
"score": 0.21499873971480102,
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
+
"score_ci_low": 0.10854451987808533,
|
| 897 |
+
"score_ci_high": 0.3074151801090054,
|
| 898 |
+
"sacrebleu_ci_low": 0.10854451987808533,
|
| 899 |
+
"sacrebleu_ci_high": 0.3074151801090054
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
+
143,
|
| 905 |
+
88,
|
| 906 |
+
58,
|
| 907 |
+
42
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
+
220,
|
| 911 |
+
214,
|
| 912 |
+
208,
|
| 913 |
+
202
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
+
0.65,
|
| 917 |
+
0.4112149532710281,
|
| 918 |
+
0.27884615384615385,
|
| 919 |
+
0.20792079207920794
|
| 920 |
],
|
| 921 |
"bp": 1.0,
|
| 922 |
+
"sys_len": 220,
|
| 923 |
"ref_len": 216,
|
| 924 |
+
"sacrebleu": 0.352826509901891,
|
| 925 |
+
"score": 0.352826509901891,
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
+
"score_ci_low": 0.2425322687880856,
|
| 928 |
+
"score_ci_high": 0.4774115974551837,
|
| 929 |
+
"sacrebleu_ci_low": 0.2425322687880856,
|
| 930 |
+
"sacrebleu_ci_high": 0.4774115974551837
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
+
187,
|
| 936 |
+
140,
|
| 937 |
+
106,
|
| 938 |
+
85
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
+
241,
|
| 942 |
+
235,
|
| 943 |
+
229,
|
| 944 |
+
223
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
+
0.7759336099585061,
|
| 948 |
+
0.5957446808510638,
|
| 949 |
+
0.46288209606986896,
|
| 950 |
+
0.3811659192825112
|
| 951 |
],
|
| 952 |
"bp": 1.0,
|
| 953 |
+
"sys_len": 241,
|
| 954 |
"ref_len": 235,
|
| 955 |
+
"sacrebleu": 0.5344010413403237,
|
| 956 |
+
"score": 0.5344010413403237,
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
+
"score_ci_low": 0.4475998249972296,
|
| 959 |
+
"score_ci_high": 0.6492252552812806,
|
| 960 |
+
"sacrebleu_ci_low": 0.4475998249972296,
|
| 961 |
+
"sacrebleu_ci_high": 0.6492252552812806
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
+
164,
|
| 967 |
+
93,
|
| 968 |
+
57,
|
| 969 |
+
38
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
+
278,
|
| 973 |
+
272,
|
| 974 |
+
266,
|
| 975 |
+
260
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
+
0.5899280575539568,
|
| 979 |
+
0.3419117647058823,
|
| 980 |
+
0.21428571428571427,
|
| 981 |
+
0.14615384615384616
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
+
"sys_len": 278,
|
| 985 |
"ref_len": 249,
|
| 986 |
+
"sacrebleu": 0.2819221125537144,
|
| 987 |
+
"score": 0.2819221125537144,
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
+
"score_ci_low": 0.2083152805325296,
|
| 990 |
+
"score_ci_high": 0.361735467626507,
|
| 991 |
+
"sacrebleu_ci_low": 0.2083152805325296,
|
| 992 |
+
"sacrebleu_ci_high": 0.361735467626507
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
+
177,
|
| 998 |
+
126,
|
| 999 |
+
104,
|
| 1000 |
+
88
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
+
226,
|
| 1004 |
+
220,
|
| 1005 |
+
214,
|
| 1006 |
+
208
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
+
0.7831858407079646,
|
| 1010 |
+
0.5727272727272728,
|
| 1011 |
+
0.48598130841121495,
|
| 1012 |
+
0.4230769230769231
|
| 1013 |
],
|
| 1014 |
"bp": 1.0,
|
| 1015 |
+
"sys_len": 226,
|
| 1016 |
"ref_len": 222,
|
| 1017 |
+
"sacrebleu": 0.5510777780526783,
|
| 1018 |
+
"score": 0.5510777780526783,
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
+
"score_ci_low": 0.4632937438879833,
|
| 1021 |
+
"score_ci_high": 0.6949332476008023,
|
| 1022 |
+
"sacrebleu_ci_low": 0.4632937438879833,
|
| 1023 |
+
"sacrebleu_ci_high": 0.6949332476008023
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
+
159,
|
| 1029 |
+
117,
|
| 1030 |
+
88,
|
| 1031 |
+
70
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
+
231,
|
| 1035 |
+
225,
|
| 1036 |
+
219,
|
| 1037 |
+
213
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
+
0.6883116883116882,
|
| 1041 |
+
0.52,
|
| 1042 |
+
0.4018264840182648,
|
| 1043 |
+
0.3286384976525822
|
| 1044 |
],
|
| 1045 |
+
"bp": 1.0,
|
| 1046 |
+
"sys_len": 231,
|
| 1047 |
"ref_len": 230,
|
| 1048 |
+
"sacrebleu": 0.46626881559414857,
|
| 1049 |
+
"score": 0.46626881559414857,
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
+
"score_ci_low": 0.38426649851701006,
|
| 1052 |
+
"score_ci_high": 0.5675525143212888,
|
| 1053 |
+
"sacrebleu_ci_low": 0.38426649851701006,
|
| 1054 |
+
"sacrebleu_ci_high": 0.5675525143212888
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
+
156,
|
| 1060 |
+
91,
|
| 1061 |
+
57,
|
| 1062 |
+
36
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
+
234,
|
| 1066 |
228,
|
| 1067 |
222,
|
| 1068 |
+
216
|
|
|
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
+
0.6666666666666667,
|
| 1072 |
+
0.3991228070175438,
|
| 1073 |
+
0.2567567567567568,
|
| 1074 |
0.16666666666666669
|
| 1075 |
],
|
| 1076 |
+
"bp": 0.9622687143632572,
|
| 1077 |
+
"sys_len": 234,
|
| 1078 |
"ref_len": 243,
|
| 1079 |
+
"sacrebleu": 0.31433507572000613,
|
| 1080 |
+
"score": 0.31433507572000613,
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
+
"score_ci_low": 0.24701452221728332,
|
| 1083 |
+
"score_ci_high": 0.39765832727161077,
|
| 1084 |
+
"sacrebleu_ci_low": 0.24701452221728332,
|
| 1085 |
+
"sacrebleu_ci_high": 0.39765832727161077
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
+
176,
|
| 1091 |
+
144,
|
| 1092 |
+
115,
|
| 1093 |
+
91
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
+
215,
|
| 1097 |
+
209,
|
| 1098 |
+
203,
|
| 1099 |
+
197
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
+
0.8186046511627907,
|
| 1103 |
+
0.6889952153110047,
|
| 1104 |
+
0.5665024630541872,
|
| 1105 |
+
0.4619289340101523
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
+
"sys_len": 215,
|
| 1109 |
"ref_len": 208,
|
| 1110 |
+
"sacrebleu": 0.6198217981505746,
|
| 1111 |
+
"score": 0.6198217981505746,
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
+
"score_ci_low": 0.511273668285519,
|
| 1114 |
+
"score_ci_high": 0.7301855114871573,
|
| 1115 |
+
"sacrebleu_ci_low": 0.511273668285519,
|
| 1116 |
+
"sacrebleu_ci_high": 0.7301855114871573
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
+
142,
|
| 1122 |
+
85,
|
| 1123 |
+
59,
|
| 1124 |
+
44
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
+
204,
|
| 1128 |
+
198,
|
| 1129 |
+
192,
|
| 1130 |
+
186
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
+
0.696078431372549,
|
| 1134 |
+
0.4292929292929293,
|
| 1135 |
+
0.3072916666666667,
|
| 1136 |
+
0.23655913978494625
|
| 1137 |
],
|
| 1138 |
+
"bp": 0.9805831403241088,
|
| 1139 |
+
"sys_len": 204,
|
| 1140 |
"ref_len": 208,
|
| 1141 |
+
"sacrebleu": 0.376452364097502,
|
| 1142 |
+
"score": 0.376452364097502,
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
+
"score_ci_low": 0.11704022800592581,
|
| 1145 |
+
"score_ci_high": 0.6173701422128451,
|
| 1146 |
+
"sacrebleu_ci_low": 0.11704022800592581,
|
| 1147 |
+
"sacrebleu_ci_high": 0.6173701422128451
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
+
143,
|
| 1153 |
+
93,
|
| 1154 |
+
60,
|
| 1155 |
+
43
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
+
212,
|
| 1159 |
+
206,
|
| 1160 |
+
200,
|
| 1161 |
+
194
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
+
0.6745283018867925,
|
| 1165 |
+
0.4514563106796116,
|
| 1166 |
+
0.3,
|
| 1167 |
+
0.22164948453608246
|
| 1168 |
],
|
| 1169 |
+
"bp": 1.0,
|
| 1170 |
+
"sys_len": 212,
|
| 1171 |
"ref_len": 208,
|
| 1172 |
+
"sacrebleu": 0.3772254378250882,
|
| 1173 |
+
"score": 0.3772254378250882,
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
+
"score_ci_low": 0.2728133483181,
|
| 1176 |
+
"score_ci_high": 0.5562821154882652,
|
| 1177 |
+
"sacrebleu_ci_low": 0.2728133483181,
|
| 1178 |
+
"sacrebleu_ci_high": 0.5562821154882652
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
+
170,
|
| 1184 |
+
130,
|
| 1185 |
+
100,
|
| 1186 |
+
75
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
+
220,
|
| 1190 |
+
214,
|
| 1191 |
+
208,
|
| 1192 |
+
202
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
+
0.7727272727272727,
|
| 1196 |
+
0.6074766355140188,
|
| 1197 |
+
0.4807692307692308,
|
| 1198 |
+
0.3712871287128713
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
+
"sys_len": 220,
|
| 1202 |
"ref_len": 208,
|
| 1203 |
+
"sacrebleu": 0.5380226938344174,
|
| 1204 |
+
"score": 0.5380226938344174,
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
+
"score_ci_low": 0.39009706347814604,
|
| 1207 |
+
"score_ci_high": 0.6760418516266744,
|
| 1208 |
+
"sacrebleu_ci_low": 0.39009706347814604,
|
| 1209 |
+
"sacrebleu_ci_high": 0.6760418516266744
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
159,
|
| 1215 |
+
111,
|
| 1216 |
+
80,
|
| 1217 |
+
60
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
+
213,
|
| 1221 |
+
207,
|
| 1222 |
+
201,
|
| 1223 |
+
195
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
+
0.7464788732394366,
|
| 1227 |
+
0.5362318840579711,
|
| 1228 |
+
0.3980099502487562,
|
| 1229 |
+
0.3076923076923077
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
+
"sys_len": 213,
|
| 1233 |
"ref_len": 208,
|
| 1234 |
+
"sacrebleu": 0.4705385184307256,
|
| 1235 |
+
"score": 0.4705385184307256,
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
+
"score_ci_low": 0.3704730772692346,
|
| 1238 |
+
"score_ci_high": 0.5955255847131556,
|
| 1239 |
+
"sacrebleu_ci_low": 0.3704730772692346,
|
| 1240 |
+
"sacrebleu_ci_high": 0.5955255847131556
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
145,
|
| 1246 |
+
95,
|
| 1247 |
60,
|
| 1248 |
+
40
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
+
205,
|
| 1252 |
+
199,
|
| 1253 |
+
193,
|
| 1254 |
+
187
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
+
0.7073170731707318,
|
| 1258 |
+
0.4773869346733668,
|
| 1259 |
+
0.31088082901554404,
|
| 1260 |
+
0.21390374331550802
|
| 1261 |
],
|
| 1262 |
+
"bp": 0.9854724123463497,
|
| 1263 |
+
"sys_len": 205,
|
| 1264 |
"ref_len": 208,
|
| 1265 |
+
"sacrebleu": 0.3814773174854295,
|
| 1266 |
+
"score": 0.3814773174854295,
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
+
"score_ci_low": 0.28966635574444355,
|
| 1269 |
+
"score_ci_high": 0.4643933334851989,
|
| 1270 |
+
"sacrebleu_ci_low": 0.28966635574444355,
|
| 1271 |
+
"sacrebleu_ci_high": 0.4643933334851989
|
| 1272 |
},
|
| 1273 |
+
"score": 0.42139853199571176,
|
| 1274 |
"score_name": "subsets_mean",
|
| 1275 |
"num_of_instances": 90
|
| 1276 |
},
|
| 1277 |
+
"score": 0.6160178142161445,
|
| 1278 |
"score_name": "subsets_mean",
|
| 1279 |
"num_of_instances": 1537
|
| 1280 |
}
|
results/bluebench/{2025-07-02T18-37-37_evaluation_results.json → 2025-07-03T12-53-58_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
{
"environment_info": {
-"timestamp_utc": "2025-07-
"command_line_invocation": [
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
"--tasks",
@@ -42,7 +42,7 @@
"cache_dir": null
},
"unitxt_version": "1.25.0",
-"unitxt_commit_hash": "
"python_version": "3.10.18",
"system": "Linux",
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -176,13 +176,13 @@
"results": {
"bias": {
"safety_bbq_age": {
-"accuracy":
-"accuracy_ci_low":
"accuracy_ci_high": 1.0,
"score_name": "accuracy",
-"score":
"score_ci_high": 1.0,
-"score_ci_low":
"num_of_instances": 9
},
"safety_bbq_disability_status": {
|
|
@@ -216,13 +216,13 @@
"num_of_instances": 9
},
"safety_bbq_physical_appearance": {
-"accuracy":
-"accuracy_ci_low":
"accuracy_ci_high": 1.0,
"score_name": "accuracy",
-"score":
"score_ci_high": 1.0,
-"score_ci_low":
"num_of_instances": 9
},
"safety_bbq_race_ethnicity": {
@@ -246,13 +246,13 @@
"num_of_instances": 9
},
"safety_bbq_race_x_ses": {
-"accuracy":
-"accuracy_ci_low":
"accuracy_ci_high": 1.0,
"score_name": "accuracy",
-"score":
"score_ci_high": 1.0,
-"score_ci_low":
"num_of_instances": 9
},
"safety_bbq_religion": {
@@ -266,13 +266,13 @@
"num_of_instances": 9
},
"safety_bbq_ses": {
-"accuracy": 0.
-"accuracy_ci_low": 0.
"accuracy_ci_high": 1.0,
"score_name": "accuracy",
-"score": 0.
"score_ci_high": 1.0,
-"score_ci_low": 0.
"num_of_instances": 9
},
"safety_bbq_sexual_orientation": {
|
|
@@ -285,54 +285,54 @@
"score_ci_low": 1.0,
"num_of_instances": 9
},
-"score": 0.
"score_name": "subsets_mean",
"num_of_instances": 99
},
"chatbot_abilities": {
"arena_hard_generation_english_gpt_4_0314_reference": {
"num_of_instances": 100,
-"llama_3_70b_instruct_template_arena_hard": 0.
-"score": 0.
"score_name": "llama_3_70b_instruct_template_arena_hard"
},
-"score": 0.
"score_name": "subsets_mean",
"num_of_instances": 100
},
"entity_extraction": {
"universal_ner_en_ewt": {
"num_of_instances": 100,
-"f1_Person": 0.
-"f1_Organization": 0.
-"f1_Location": 0.
-"f1_macro": 0.
-"recall_macro": 0.
-"precision_macro": 0.
-"in_classes_support": 0.
-"f1_micro": 0.
-"recall_micro": 0.
-"precision_micro": 0.
-"score": 0.
"score_name": "f1_micro",
-"score_ci_low": 0.
-"score_ci_high": 0.
-"f1_micro_ci_low": 0.
-"f1_micro_ci_high": 0.
},
-"score": 0.
"score_name": "subsets_mean",
"num_of_instances": 100
},
"knowledge": {
"mmlu_pro_biology": {
-"accuracy": 0.
-"accuracy_ci_low": 0.
-"accuracy_ci_high": 0.
"score_name": "accuracy",
-"score": 0.
-"score_ci_high": 0.
-"score_ci_low": 0.
"num_of_instances": 7
},
"mmlu_pro_business": {
|
|
@@ -346,35 +346,35 @@
"num_of_instances": 7
},
"mmlu_pro_chemistry": {
-"accuracy": 0.
"accuracy_ci_low": 0.0,
-"accuracy_ci_high": 0.
"score_name": "accuracy",
-"score": 0.
-"score_ci_high": 0.
"score_ci_low": 0.0,
"num_of_instances": 7
},
"mmlu_pro_computer_science": {
-"accuracy": 0.
-"accuracy_ci_low": 0.42857142857142855,
-"accuracy_ci_high": 1.0,
-"score_name": "accuracy",
-"score": 0.8571428571428571,
-"score_ci_high": 1.0,
-"score_ci_low": 0.42857142857142855,
-"num_of_instances": 7
-},
-"mmlu_pro_economics": {
-"accuracy": 0.5714285714285714,
"accuracy_ci_low": 0.14285714285714285,
"accuracy_ci_high": 0.8571428571428571,
"score_name": "accuracy",
-"score": 0.
"score_ci_high": 0.8571428571428571,
"score_ci_low": 0.14285714285714285,
"num_of_instances": 7
},
"mmlu_pro_engineering": {
"accuracy": 0.2857142857142857,
"accuracy_ci_low": 0.0,
|
|
@@ -396,33 +396,33 @@
"num_of_instances": 7
},
"mmlu_pro_history": {
-"accuracy": 0.
-"accuracy_ci_low": 0.
-"accuracy_ci_high": 0.
"score_name": "accuracy",
-"score": 0.
-"score_ci_high": 0.
-"score_ci_low": 0.
"num_of_instances": 7
},
"mmlu_pro_law": {
-"accuracy": 0.
"accuracy_ci_low": 0.14285714285714285,
"accuracy_ci_high": 0.8571428571428571,
"score_name": "accuracy",
-"score": 0.
"score_ci_high": 0.8571428571428571,
"score_ci_low": 0.14285714285714285,
"num_of_instances": 7
},
"mmlu_pro_math": {
-"accuracy": 0.
-"accuracy_ci_low": 0.
"accuracy_ci_high": 1.0,
"score_name": "accuracy",
-"score": 0.
"score_ci_high": 1.0,
-"score_ci_low": 0.
"num_of_instances": 7
},
"mmlu_pro_other": {
|
|
@@ -436,371 +436,371 @@
|
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
| 438 |
"mmlu_pro_philosophy": {
|
| 439 |
-
"accuracy": 0.
|
| 440 |
-
"accuracy_ci_low": 0.
|
| 441 |
"accuracy_ci_high": 1.0,
|
| 442 |
"score_name": "accuracy",
|
| 443 |
-
"score": 0.
|
| 444 |
"score_ci_high": 1.0,
|
| 445 |
-
"score_ci_low": 0.
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
| 449 |
-
"accuracy": 0.
|
| 450 |
"accuracy_ci_low": 0.14285714285714285,
|
| 451 |
"accuracy_ci_high": 0.8571428571428571,
|
| 452 |
"score_name": "accuracy",
|
| 453 |
-
"score": 0.
|
| 454 |
"score_ci_high": 0.8571428571428571,
|
| 455 |
"score_ci_low": 0.14285714285714285,
|
| 456 |
"num_of_instances": 7
|
| 457 |
},
|
| 458 |
"mmlu_pro_psychology": {
|
| 459 |
-
"accuracy": 0.
|
| 460 |
-
"accuracy_ci_low": 0.
|
| 461 |
-
"accuracy_ci_high":
|
| 462 |
"score_name": "accuracy",
|
| 463 |
-
"score": 0.
|
| 464 |
-
"score_ci_high":
|
| 465 |
-
"score_ci_low": 0.
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
-
"score": 0.
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
-
"f1_macro": 0.
|
| 475 |
-
"f1_suggestive": 0.
|
| 476 |
-
"f1_arbitrary": 0.
|
| 477 |
"f1_generic": 0.8,
|
| 478 |
"f1_fanciful": 1.0,
|
| 479 |
-
"f1_descriptive": 0.
|
| 480 |
-
"f1_macro_ci_low": 0.
|
| 481 |
-
"f1_macro_ci_high": 0.
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
-
"score": 0.
|
| 484 |
-
"score_ci_high": 0.
|
| 485 |
-
"score_ci_low": 0.
|
| 486 |
"num_of_instances": 20,
|
| 487 |
-
"accuracy": 0.
|
| 488 |
-
"accuracy_ci_low": 0.
|
| 489 |
-
"accuracy_ci_high": 0.
|
| 490 |
-
"f1_micro": 0.
|
| 491 |
-
"f1_micro_ci_low": 0.
|
| 492 |
-
"f1_micro_ci_high": 0.
|
| 493 |
},
|
| 494 |
"legalbench_corporate_lobbying": {
|
| 495 |
-
"f1_macro": 0.
|
| 496 |
-
"f1_no": 0.
|
| 497 |
"f1_yes": 0.6666666666666666,
|
| 498 |
-
"f1_macro_ci_low": 0.
|
| 499 |
-
"f1_macro_ci_high":
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
-
"score": 0.
|
| 502 |
-
"score_ci_high": 0.
|
| 503 |
-
"score_ci_low": 0.
|
| 504 |
"num_of_instances": 20,
|
| 505 |
-
"accuracy": 0.
|
| 506 |
-
"accuracy_ci_low": 0.
|
| 507 |
-
"accuracy_ci_high": 0.
|
| 508 |
-
"f1_micro": 0.
|
| 509 |
-
"f1_micro_ci_low": 0.
|
| 510 |
-
"f1_micro_ci_high": 0.
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
-
"f1_macro": 0.
|
| 514 |
-
"f1_conclusion": 0.
|
|
|
|
| 515 |
"f1_decree": 0.0,
|
| 516 |
"f1_issue": 0.2857142857142857,
|
| 517 |
-
"
|
| 518 |
-
"
|
| 519 |
-
"f1_procedural history": 0.5,
|
| 520 |
"f1_rule": 0.0,
|
| 521 |
-
"f1_macro_ci_low": 0.
|
| 522 |
-
"f1_macro_ci_high": 0.
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
-
"score": 0.
|
| 525 |
-
"score_ci_high": 0.
|
| 526 |
-
"score_ci_low": 0.
|
| 527 |
"num_of_instances": 20,
|
| 528 |
-
"accuracy": 0.
|
| 529 |
-
"accuracy_ci_low": 0.
|
| 530 |
-
"accuracy_ci_high": 0.
|
| 531 |
-
"f1_micro": 0.
|
| 532 |
-
"f1_micro_ci_low": 0.
|
| 533 |
-
"f1_micro_ci_high": 0.
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
-
"f1_macro": 0.
|
| 537 |
-
"f1_yes": 0.
|
| 538 |
-
"f1_no": 0.
|
| 539 |
-
"f1_macro_ci_low": 0.
|
| 540 |
-
"f1_macro_ci_high": 0.
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
-
"score": 0.
|
| 543 |
-
"score_ci_high": 0.
|
| 544 |
-
"score_ci_low": 0.
|
| 545 |
"num_of_instances": 20,
|
| 546 |
-
"accuracy": 0.
|
| 547 |
"accuracy_ci_low": 0.35,
|
| 548 |
-
"accuracy_ci_high": 0.
|
| 549 |
-
"f1_micro": 0.
|
| 550 |
-
"f1_micro_ci_low": 0.
|
| 551 |
-
"f1_micro_ci_high": 0.
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
-
"f1_macro": 0.
|
| 555 |
-
"f1_yes": 0.
|
| 556 |
-
"f1_no": 0.
|
| 557 |
-
"f1_macro_ci_low": 0.
|
| 558 |
-
"f1_macro_ci_high": 0.
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
"score": 0.8947368421052632,
|
| 561 |
"score_ci_high": 0.9743589743589743,
|
| 562 |
-
"score_ci_low": 0.
|
| 563 |
"num_of_instances": 20,
|
| 564 |
"accuracy": 0.85,
|
| 565 |
"accuracy_ci_low": 0.65,
|
| 566 |
"accuracy_ci_high": 0.95,
|
| 567 |
"f1_micro": 0.8947368421052632,
|
| 568 |
-
"f1_micro_ci_low": 0.
|
| 569 |
"f1_micro_ci_high": 0.9743589743589743
|
| 570 |
},
|
| 571 |
-
"score": 0.
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
-
"f1_macro": 0.
|
| 578 |
-
"f1_cars":
|
| 579 |
"f1_windows x": 0.0,
|
| 580 |
-
"f1_computer graphics": 0.
|
| 581 |
-
"f1_atheism": 0.
|
| 582 |
"f1_religion": 0.0,
|
| 583 |
-
"f1_medicine":
|
| 584 |
-
"f1_christianity": 0.
|
| 585 |
-
"f1_microsoft windows": 0.
|
| 586 |
-
"f1_middle east": 0.
|
| 587 |
-
"f1_motorcycles": 0.
|
| 588 |
-
"f1_pc hardware": 0.
|
| 589 |
-
"f1_mac hardware": 0.
|
| 590 |
"f1_electronics": 0.6666666666666666,
|
| 591 |
-
"f1_for sale": 0.
|
| 592 |
-
"f1_guns": 0.
|
| 593 |
-
"
|
| 594 |
-
"f1_space": 0.8888888888888888,
|
| 595 |
"f1_cryptography": 0.4,
|
| 596 |
-
"f1_baseball":
|
| 597 |
"f1_hockey": 0.8888888888888888,
|
| 598 |
-
"
|
| 599 |
-
"
|
|
|
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
-
"score": 0.
|
| 602 |
-
"score_ci_high": 0.
|
| 603 |
-
"score_ci_low": 0.
|
| 604 |
"num_of_instances": 100,
|
| 605 |
-
"accuracy": 0.
|
| 606 |
-
"accuracy_ci_low": 0.
|
| 607 |
-
"accuracy_ci_high": 0.
|
| 608 |
-
"f1_micro": 0.
|
| 609 |
-
"f1_micro_ci_low": 0.
|
| 610 |
-
"f1_micro_ci_high": 0.
|
| 611 |
},
|
| 612 |
-
"score": 0.
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
-
"f1_macro": 0.
|
| 619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
| 620 |
-
"f1_credit card or prepaid card": 0.
|
| 621 |
"f1_money transfer or virtual currency or money service": 1.0,
|
| 622 |
-
"f1_mortgage": 0.
|
| 623 |
-
"f1_debt collection": 0.
|
| 624 |
"f1_checking or savings account": 0.9230769230769231,
|
| 625 |
"f1_payday loan or title loan or personal loan": 0.0,
|
| 626 |
-
"f1_macro_ci_low": 0.
|
| 627 |
-
"f1_macro_ci_high": 0.
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
-
"score": 0.
|
| 630 |
-
"score_ci_high": 0.
|
| 631 |
-
"score_ci_low": 0.
|
| 632 |
"num_of_instances": 100,
|
| 633 |
-
"accuracy": 0.
|
| 634 |
-
"accuracy_ci_low": 0.
|
| 635 |
"accuracy_ci_high": 0.86,
|
| 636 |
-
"f1_micro": 0.
|
| 637 |
-
"f1_micro_ci_low": 0.
|
| 638 |
-
"f1_micro_ci_high": 0.
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
-
"f1_macro": 0.
|
| 642 |
-
"f1_mortgages and loans": 0.
|
| 643 |
-
"f1_credit card": 0.
|
| 644 |
"f1_debt collection": 0.7777777777777778,
|
| 645 |
-
"f1_retail banking": 0.
|
| 646 |
-
"f1_credit reporting": 0.
|
| 647 |
-
"f1_macro_ci_low": 0.
|
| 648 |
-
"f1_macro_ci_high": 0.
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
-
"score": 0.
|
| 651 |
-
"score_ci_high": 0.
|
| 652 |
-
"score_ci_low": 0.
|
| 653 |
"num_of_instances": 50,
|
| 654 |
-
"accuracy": 0.
|
| 655 |
-
"accuracy_ci_low": 0.
|
| 656 |
-
"accuracy_ci_high": 0.
|
| 657 |
-
"f1_micro": 0.
|
| 658 |
-
"f1_micro_ci_low": 0.
|
| 659 |
-
"f1_micro_ci_high": 0.
|
| 660 |
},
|
| 661 |
-
"score": 0.
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
-
"execution_accuracy": 0.
|
| 669 |
-
"program_accuracy": 0.
|
| 670 |
-
"score": 0.
|
| 671 |
"score_name": "program_accuracy",
|
| 672 |
-
"execution_accuracy_ci_low": 0.
|
| 673 |
-
"execution_accuracy_ci_high": 0.
|
| 674 |
-
"program_accuracy_ci_low": 0.
|
| 675 |
-
"program_accuracy_ci_high": 0.
|
| 676 |
-
"score_ci_low": 0.
|
| 677 |
-
"score_ci_high": 0.
|
| 678 |
},
|
| 679 |
-
"score": 0.
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
-
"precision": 0.
|
| 686 |
-
"recall": 0.
|
| 687 |
-
"f1": 0.
|
| 688 |
-
"precision_ci_low": 0.
|
| 689 |
-
"precision_ci_high": 0.
|
| 690 |
-
"recall_ci_low": 0.
|
| 691 |
-
"recall_ci_high": 0.
|
| 692 |
-
"f1_ci_low": 0.
|
| 693 |
-
"f1_ci_high": 0.
|
| 694 |
"score_name": "f1",
|
| 695 |
-
"score": 0.
|
| 696 |
-
"score_ci_high": 0.
|
| 697 |
-
"score_ci_low": 0.
|
| 698 |
"num_of_instances": 100,
|
| 699 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
| 700 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
| 701 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
| 702 |
-
"faithfullness_f1_token_overlap": 0.
|
| 703 |
-
"faithfullness_recall_token_overlap": 0.
|
| 704 |
-
"faithfullness_precision_token_overlap": 0.
|
| 705 |
-
"correctness_f1_token_overlap": 0.
|
| 706 |
-
"correctness_recall_token_overlap": 0.
|
| 707 |
-
"correctness_precision_token_overlap": 0.
|
| 708 |
},
|
| 709 |
-
"score": 0.
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
| 713 |
"reasoning": {
|
| 714 |
"hellaswag": {
|
| 715 |
"accuracy": 0.57,
|
| 716 |
-
"accuracy_ci_low": 0.
|
| 717 |
"accuracy_ci_high": 0.66,
|
| 718 |
"score_name": "accuracy",
|
| 719 |
"score": 0.57,
|
| 720 |
"score_ci_high": 0.66,
|
| 721 |
-
"score_ci_low": 0.
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
-
"accuracy": 0.
|
| 726 |
-
"accuracy_ci_low": 0.
|
| 727 |
-
"accuracy_ci_high": 0.
|
| 728 |
"score_name": "accuracy",
|
| 729 |
-
"score": 0.
|
| 730 |
-
"score_ci_high": 0.
|
| 731 |
-
"score_ci_low": 0.
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
-
"score": 0.
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
| 738 |
"safety": {
|
| 739 |
"attaq_500": {
|
| 740 |
-
"severity_critical":
|
| 741 |
-
"severity_high":
|
| 742 |
-
"severity_medium":
|
| 743 |
-
"severity_low":
|
| 744 |
-
"category_harmful_info": 0.
|
| 745 |
-
"category_deception": 0.
|
| 746 |
-
"category_explicit_content": 0.
|
| 747 |
-
"category_violence": 0.
|
| 748 |
-
"category_discrimination": 0.
|
| 749 |
-
"category_substance_abuse": 0.
|
| 750 |
-
"category_pii": 0.
|
| 751 |
-
"safety": 0.
|
| 752 |
-
"safety_ci_low": 0.
|
| 753 |
-
"safety_ci_high": 0.
|
| 754 |
"score_name": "safety",
|
| 755 |
-
"score": 0.
|
| 756 |
-
"score_ci_high": 0.
|
| 757 |
-
"score_ci_low": 0.
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
-
"score": 0.
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
-
"
|
| 768 |
-
"
|
| 769 |
-
"
|
| 770 |
-
"score": 0.27388928479360886,
|
| 771 |
"score_name": "rougeL",
|
| 772 |
-
"
|
| 773 |
-
"
|
| 774 |
-
"
|
| 775 |
-
"
|
| 776 |
-
"
|
| 777 |
-
"
|
| 778 |
-
"
|
| 779 |
-
"
|
| 780 |
-
"
|
| 781 |
-
"
|
| 782 |
-
"
|
|
|
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
-
"
|
| 787 |
-
"
|
| 788 |
-
"
|
| 789 |
-
"score": 0.08008465887960405,
|
| 790 |
"score_name": "rougeL",
|
| 791 |
-
"
|
| 792 |
-
"
|
| 793 |
-
"
|
| 794 |
-
"
|
| 795 |
-
"
|
| 796 |
-
"
|
| 797 |
-
"
|
| 798 |
-
"
|
| 799 |
-
"
|
| 800 |
-
"
|
| 801 |
-
"
|
|
|
|
| 802 |
},
|
| 803 |
-
"score": 0.
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
@@ -808,473 +808,473 @@
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
-
0.
|
| 824 |
-
0.
|
| 825 |
-
0.
|
| 826 |
-
0.
|
| 827 |
],
|
| 828 |
"bp": 1.0,
|
| 829 |
-
"sys_len":
|
| 830 |
"ref_len": 208,
|
| 831 |
-
"sacrebleu": 0.
|
| 832 |
-
"score": 0.
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
-
"score_ci_low": 0.
|
| 835 |
-
"score_ci_high": 0.
|
| 836 |
-
"sacrebleu_ci_low": 0.
|
| 837 |
-
"sacrebleu_ci_high": 0.
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
-
0.
|
| 855 |
-
0.
|
| 856 |
-
0.
|
| 857 |
-
0.
|
| 858 |
],
|
| 859 |
"bp": 1.0,
|
| 860 |
-
"sys_len":
|
| 861 |
"ref_len": 208,
|
| 862 |
-
"sacrebleu": 0.
|
| 863 |
-
"score": 0.
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
-
"score_ci_low": 0.
|
| 866 |
-
"score_ci_high": 0.
|
| 867 |
-
"sacrebleu_ci_low": 0.
|
| 868 |
-
"sacrebleu_ci_high": 0.
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
-
0.
|
| 886 |
-
0.
|
| 887 |
-
0.
|
| 888 |
-
0.
|
| 889 |
],
|
| 890 |
"bp": 1.0,
|
| 891 |
-
"sys_len":
|
| 892 |
"ref_len": 209,
|
| 893 |
-
"sacrebleu": 0.
|
| 894 |
-
"score": 0.
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
-
"score_ci_low": 0.
|
| 897 |
-
"score_ci_high": 0.
|
| 898 |
-
"sacrebleu_ci_low": 0.
|
| 899 |
-
"sacrebleu_ci_high": 0.
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
-
0.
|
| 917 |
-
0.
|
| 918 |
-
0.
|
| 919 |
-
0.
|
| 920 |
],
|
| 921 |
"bp": 1.0,
|
| 922 |
-
"sys_len":
|
| 923 |
"ref_len": 216,
|
| 924 |
-
"sacrebleu": 0.
|
| 925 |
-
"score": 0.
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
-
"score_ci_low": 0.
|
| 928 |
-
"score_ci_high": 0.
|
| 929 |
-
"sacrebleu_ci_low": 0.
|
| 930 |
-
"sacrebleu_ci_high": 0.
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
79
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
-
0.
|
| 948 |
-
0.
|
| 949 |
-
0.
|
| 950 |
-
0.
|
| 951 |
],
|
| 952 |
"bp": 1.0,
|
| 953 |
-
"sys_len":
|
| 954 |
"ref_len": 235,
|
| 955 |
-
"sacrebleu": 0.
|
| 956 |
-
"score": 0.
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
-
"score_ci_low": 0.
|
| 959 |
-
"score_ci_high": 0.
|
| 960 |
-
"sacrebleu_ci_low": 0.
|
| 961 |
-
"sacrebleu_ci_high": 0.
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
-
|
| 967 |
-
|
| 968 |
-
|
| 969 |
-
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
-
0.
|
| 979 |
-
0.
|
| 980 |
-
0.
|
| 981 |
-
0.
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
-
"sys_len":
|
| 985 |
"ref_len": 249,
|
| 986 |
-
"sacrebleu": 0.
|
| 987 |
-
"score": 0.
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
-
"score_ci_low": 0.
|
| 990 |
-
"score_ci_high": 0.
|
| 991 |
-
"sacrebleu_ci_low": 0.
|
| 992 |
-
"sacrebleu_ci_high": 0.
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
-
0.
|
| 1010 |
-
0.
|
| 1011 |
-
0.
|
| 1012 |
-
0.
|
| 1013 |
],
|
| 1014 |
"bp": 1.0,
|
| 1015 |
-
"sys_len":
|
| 1016 |
"ref_len": 222,
|
| 1017 |
-
"sacrebleu": 0.
|
| 1018 |
-
"score": 0.
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
-
"score_ci_low": 0.
|
| 1021 |
-
"score_ci_high": 0.
|
| 1022 |
-
"sacrebleu_ci_low": 0.
|
| 1023 |
-
"sacrebleu_ci_high": 0.
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
-
0.
|
| 1041 |
-
0.
|
| 1042 |
-
0.
|
| 1043 |
-
0.
|
| 1044 |
],
|
| 1045 |
"bp": 1.0,
|
| 1046 |
-
"sys_len":
|
| 1047 |
"ref_len": 230,
|
| 1048 |
-
"sacrebleu": 0.
|
| 1049 |
-
"score": 0.
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
-
"score_ci_low": 0.
|
| 1052 |
-
"score_ci_high": 0.
|
| 1053 |
-
"sacrebleu_ci_low": 0.
|
| 1054 |
-
"sacrebleu_ci_high": 0.
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
-
|
| 1060 |
101,
|
| 1061 |
65,
|
| 1062 |
-
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
-
|
| 1066 |
-
|
| 1067 |
-
|
| 1068 |
-
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
-
0.
|
| 1072 |
-
0.
|
| 1073 |
-
0.
|
| 1074 |
-
0.
|
| 1075 |
],
|
| 1076 |
"bp": 1.0,
|
| 1077 |
-
"sys_len":
|
| 1078 |
"ref_len": 243,
|
| 1079 |
-
"sacrebleu": 0.
|
| 1080 |
-
"score": 0.
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
-
"score_ci_low": 0.
|
| 1083 |
-
"score_ci_high": 0.
|
| 1084 |
-
"sacrebleu_ci_low": 0.
|
| 1085 |
-
"sacrebleu_ci_high": 0.
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
-
0.
|
| 1103 |
-
0.
|
| 1104 |
-
0.
|
| 1105 |
-
0.
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
-
"sys_len":
|
| 1109 |
"ref_len": 208,
|
| 1110 |
-
"sacrebleu": 0.
|
| 1111 |
-
"score": 0.
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
-
"score_ci_low": 0.
|
| 1114 |
-
"score_ci_high": 0.
|
| 1115 |
-
"sacrebleu_ci_low": 0.
|
| 1116 |
-
"sacrebleu_ci_high": 0.
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
-
0.
|
| 1134 |
-
0.
|
| 1135 |
-
0.
|
| 1136 |
-
0.
|
| 1137 |
],
|
| 1138 |
"bp": 1.0,
|
| 1139 |
-
"sys_len":
|
| 1140 |
"ref_len": 208,
|
| 1141 |
-
"sacrebleu": 0.
|
| 1142 |
-
"score": 0.
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
-
"score_ci_low": 0.
|
| 1145 |
-
"score_ci_high": 0.
|
| 1146 |
-
"sacrebleu_ci_low": 0.
|
| 1147 |
-
"sacrebleu_ci_high": 0.
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
-
|
| 1153 |
84,
|
| 1154 |
-
|
| 1155 |
-
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
-
|
| 1161 |
-
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
-
0.
|
| 1165 |
-
0.
|
| 1166 |
-
0.
|
| 1167 |
-
0.
|
| 1168 |
],
|
| 1169 |
"bp": 1.0,
|
| 1170 |
-
"sys_len":
|
| 1171 |
"ref_len": 208,
|
| 1172 |
-
"sacrebleu": 0.
|
| 1173 |
-
"score": 0.
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
-
"score_ci_low": 0.
|
| 1176 |
-
"score_ci_high": 0.
|
| 1177 |
-
"sacrebleu_ci_low": 0.
|
| 1178 |
-
"sacrebleu_ci_high": 0.
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
-
|
| 1184 |
-
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
-
0.
|
| 1196 |
-
0.
|
| 1197 |
-
0.
|
| 1198 |
-
0.
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
-
"sys_len":
|
| 1202 |
"ref_len": 208,
|
| 1203 |
-
"sacrebleu": 0.
|
| 1204 |
-
"score": 0.
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
-
"score_ci_low": 0.
|
| 1207 |
-
"score_ci_high": 0.
|
| 1208 |
-
"sacrebleu_ci_low": 0.
|
| 1209 |
-
"sacrebleu_ci_high": 0.
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
-
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
-
0.
|
| 1227 |
-
0.
|
| 1228 |
-
0.
|
| 1229 |
-
0.
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
-
"sys_len":
|
| 1233 |
"ref_len": 208,
|
| 1234 |
-
"sacrebleu": 0.
|
| 1235 |
-
"score": 0.
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
-
"score_ci_low": 0.
|
| 1238 |
-
"score_ci_high": 0.
|
| 1239 |
-
"sacrebleu_ci_low": 0.
|
| 1240 |
-
"sacrebleu_ci_high": 0.
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
-
|
| 1246 |
-
|
| 1247 |
-
|
| 1248 |
-
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
-
0.
|
| 1258 |
-
0.
|
| 1259 |
-
0.
|
| 1260 |
-
0.
|
| 1261 |
],
|
| 1262 |
"bp": 1.0,
|
| 1263 |
-
"sys_len":
|
| 1264 |
"ref_len": 208,
|
| 1265 |
-
"sacrebleu": 0.
|
| 1266 |
-
"score": 0.
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
-
"score_ci_low": 0.
|
| 1269 |
-
"score_ci_high": 0.
|
| 1270 |
-
"sacrebleu_ci_low": 0.
|
| 1271 |
-
"sacrebleu_ci_high": 0.
|
| 1272 |
},
|
| 1273 |
-
"score": 0.
|
| 1274 |
"score_name": "subsets_mean",
|
| 1275 |
"num_of_instances": 90
|
| 1276 |
},
|
| 1277 |
-
"score": 0.
|
| 1278 |
"score_name": "subsets_mean",
|
| 1279 |
"num_of_instances": 1537
|
| 1280 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"environment_info": {
|
| 3 |
+
"timestamp_utc": "2025-07-03T16:53:54.174771Z",
|
| 4 |
"command_line_invocation": [
|
| 5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
"--tasks",
|
|
|
|
| 42 |
"cache_dir": null
|
| 43 |
},
|
| 44 |
"unitxt_version": "1.25.0",
|
| 45 |
+
"unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
|
| 46 |
"python_version": "3.10.18",
|
| 47 |
"system": "Linux",
|
| 48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
|
| 176 |
"results": {
|
| 177 |
"bias": {
|
| 178 |
"safety_bbq_age": {
|
| 179 |
+
"accuracy": 0.8888888888888888,
|
| 180 |
+
"accuracy_ci_low": 0.47716657027690984,
|
| 181 |
"accuracy_ci_high": 1.0,
|
| 182 |
"score_name": "accuracy",
|
| 183 |
+
"score": 0.8888888888888888,
|
| 184 |
"score_ci_high": 1.0,
|
| 185 |
+
"score_ci_low": 0.47716657027690984,
|
| 186 |
"num_of_instances": 9
|
| 187 |
},
|
| 188 |
"safety_bbq_disability_status": {
|
|
|
|
| 216 |
"num_of_instances": 9
|
| 217 |
},
|
| 218 |
"safety_bbq_physical_appearance": {
|
| 219 |
+
"accuracy": 0.8888888888888888,
|
| 220 |
+
"accuracy_ci_low": 0.5555555555555556,
|
| 221 |
"accuracy_ci_high": 1.0,
|
| 222 |
"score_name": "accuracy",
|
| 223 |
+
"score": 0.8888888888888888,
|
| 224 |
"score_ci_high": 1.0,
|
| 225 |
+
"score_ci_low": 0.5555555555555556,
|
| 226 |
"num_of_instances": 9
|
| 227 |
},
|
| 228 |
"safety_bbq_race_ethnicity": {
|
|
|
|
| 246 |
"num_of_instances": 9
|
| 247 |
},
|
| 248 |
"safety_bbq_race_x_ses": {
|
| 249 |
+
"accuracy": 0.8888888888888888,
|
| 250 |
+
"accuracy_ci_low": 0.47716657027690984,
|
| 251 |
"accuracy_ci_high": 1.0,
|
| 252 |
"score_name": "accuracy",
|
| 253 |
+
"score": 0.8888888888888888,
|
| 254 |
"score_ci_high": 1.0,
|
| 255 |
+
"score_ci_low": 0.47716657027690984,
|
| 256 |
"num_of_instances": 9
|
| 257 |
},
|
| 258 |
"safety_bbq_religion": {
|
|
|
|
| 266 |
"num_of_instances": 9
|
| 267 |
},
|
| 268 |
"safety_bbq_ses": {
|
| 269 |
+
"accuracy": 0.6666666666666666,
|
| 270 |
+
"accuracy_ci_low": 0.3333333333333333,
|
| 271 |
"accuracy_ci_high": 1.0,
|
| 272 |
"score_name": "accuracy",
|
| 273 |
+
"score": 0.6666666666666666,
|
| 274 |
"score_ci_high": 1.0,
|
| 275 |
+
"score_ci_low": 0.3333333333333333,
|
| 276 |
"num_of_instances": 9
|
| 277 |
},
|
| 278 |
"safety_bbq_sexual_orientation": {
|
|
|
|
| 285 |
"score_ci_low": 1.0,
|
| 286 |
"num_of_instances": 9
|
| 287 |
},
|
| 288 |
+
"score": 0.9393939393939393,
|
| 289 |
"score_name": "subsets_mean",
|
| 290 |
"num_of_instances": 99
|
| 291 |
},
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.9417040358744395,
|
| 296 |
+
"score": 0.9417040358744395,
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
+
"score": 0.9417040358744395,
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
+
"f1_Person": 0.7659574468085107,
|
| 307 |
+
"f1_Organization": 0.73015873015873,
|
| 308 |
+
"f1_Location": 0.7659574468085107,
|
| 309 |
+
"f1_macro": 0.7540245412585839,
|
| 310 |
+
"recall_macro": 0.7846790890269152,
|
| 311 |
+
"precision_macro": 0.7299171842650103,
|
| 312 |
+
"in_classes_support": 0.9761904761904762,
|
| 313 |
+
"f1_micro": 0.7421383647798742,
|
| 314 |
+
"recall_micro": 0.7866666666666666,
|
| 315 |
+
"precision_micro": 0.7023809523809523,
|
| 316 |
+
"score": 0.7421383647798742,
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
+
"score_ci_low": 0.6780836216333876,
|
| 319 |
+
"score_ci_high": 0.822811583031267,
|
| 320 |
+
"f1_micro_ci_low": 0.6780836216333876,
|
| 321 |
+
"f1_micro_ci_high": 0.822811583031267
|
| 322 |
},
|
| 323 |
+
"score": 0.7421383647798742,
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
| 327 |
"knowledge": {
|
| 328 |
"mmlu_pro_biology": {
|
| 329 |
+
"accuracy": 0.42857142857142855,
|
| 330 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 331 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 332 |
"score_name": "accuracy",
|
| 333 |
+
"score": 0.42857142857142855,
|
| 334 |
+
"score_ci_high": 0.8571428571428571,
|
| 335 |
+
"score_ci_low": 0.14285714285714285,
|
| 336 |
"num_of_instances": 7
|
| 337 |
},
|
| 338 |
"mmlu_pro_business": {
|
|
|
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
| 349 |
+
"accuracy": 0.14285714285714285,
|
| 350 |
"accuracy_ci_low": 0.0,
|
| 351 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 352 |
"score_name": "accuracy",
|
| 353 |
+
"score": 0.14285714285714285,
|
| 354 |
+
"score_ci_high": 0.5714285714285714,
|
| 355 |
"score_ci_low": 0.0,
|
| 356 |
"num_of_instances": 7
|
| 357 |
},
|
| 358 |
"mmlu_pro_computer_science": {
|
| 359 |
+
"accuracy": 0.42857142857142855,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
"accuracy_ci_low": 0.14285714285714285,
|
| 361 |
"accuracy_ci_high": 0.8571428571428571,
|
| 362 |
"score_name": "accuracy",
|
| 363 |
+
"score": 0.42857142857142855,
|
| 364 |
"score_ci_high": 0.8571428571428571,
|
| 365 |
"score_ci_low": 0.14285714285714285,
|
| 366 |
"num_of_instances": 7
|
| 367 |
},
|
| 368 |
+
"mmlu_pro_economics": {
|
| 369 |
+
"accuracy": 0.7142857142857143,
|
| 370 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 371 |
+
"accuracy_ci_high": 1.0,
|
| 372 |
+
"score_name": "accuracy",
|
| 373 |
+
"score": 0.7142857142857143,
|
| 374 |
+
"score_ci_high": 1.0,
|
| 375 |
+
"score_ci_low": 0.2857142857142857,
|
| 376 |
+
"num_of_instances": 7
|
| 377 |
+
},
|
| 378 |
"mmlu_pro_engineering": {
|
| 379 |
"accuracy": 0.2857142857142857,
|
| 380 |
"accuracy_ci_low": 0.0,
|
|
|
|
| 396 |
"num_of_instances": 7
|
| 397 |
},
|
| 398 |
"mmlu_pro_history": {
|
| 399 |
+
"accuracy": 0.14285714285714285,
|
| 400 |
+
"accuracy_ci_low": 0.0,
|
| 401 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 402 |
"score_name": "accuracy",
|
| 403 |
+
"score": 0.14285714285714285,
|
| 404 |
+
"score_ci_high": 0.5714285714285714,
|
| 405 |
+
"score_ci_low": 0.0,
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
+
"accuracy": 0.42857142857142855,
|
| 410 |
"accuracy_ci_low": 0.14285714285714285,
|
| 411 |
"accuracy_ci_high": 0.8571428571428571,
|
| 412 |
"score_name": "accuracy",
|
| 413 |
+
"score": 0.42857142857142855,
|
| 414 |
"score_ci_high": 0.8571428571428571,
|
| 415 |
"score_ci_low": 0.14285714285714285,
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
| 419 |
+
"accuracy": 0.8571428571428571,
|
| 420 |
+
"accuracy_ci_low": 0.2530277506117974,
|
| 421 |
"accuracy_ci_high": 1.0,
|
| 422 |
"score_name": "accuracy",
|
| 423 |
+
"score": 0.8571428571428571,
|
| 424 |
"score_ci_high": 1.0,
|
| 425 |
+
"score_ci_low": 0.2530277506117974,
|
| 426 |
"num_of_instances": 7
|
| 427 |
},
|
| 428 |
"mmlu_pro_other": {
|
|
|
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
| 438 |
"mmlu_pro_philosophy": {
|
| 439 |
+
"accuracy": 0.7142857142857143,
|
| 440 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 441 |
"accuracy_ci_high": 1.0,
|
| 442 |
"score_name": "accuracy",
|
| 443 |
+
"score": 0.7142857142857143,
|
| 444 |
"score_ci_high": 1.0,
|
| 445 |
+
"score_ci_low": 0.2857142857142857,
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
| 449 |
+
"accuracy": 0.5714285714285714,
|
| 450 |
"accuracy_ci_low": 0.14285714285714285,
|
| 451 |
"accuracy_ci_high": 0.8571428571428571,
|
| 452 |
"score_name": "accuracy",
|
| 453 |
+
"score": 0.5714285714285714,
|
| 454 |
"score_ci_high": 0.8571428571428571,
|
| 455 |
"score_ci_low": 0.14285714285714285,
|
| 456 |
"num_of_instances": 7
|
| 457 |
},
|
| 458 |
"mmlu_pro_psychology": {
|
| 459 |
+
"accuracy": 0.5714285714285714,
|
| 460 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 461 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 462 |
"score_name": "accuracy",
|
| 463 |
+
"score": 0.5714285714285714,
|
| 464 |
+
"score_ci_high": 0.8571428571428571,
|
| 465 |
+
"score_ci_low": 0.14285714285714285,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
+
"score": 0.4489795918367347,
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
+
"f1_macro": 0.7190909090909091,
|
| 475 |
+
"f1_suggestive": 0.5454545454545454,
|
| 476 |
+
"f1_arbitrary": 0.75,
|
| 477 |
"f1_generic": 0.8,
|
| 478 |
"f1_fanciful": 1.0,
|
| 479 |
+
"f1_descriptive": 0.5,
|
| 480 |
+
"f1_macro_ci_low": 0.5251137473401586,
|
| 481 |
+
"f1_macro_ci_high": 0.900524336547412,
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
+
"score": 0.7,
|
| 484 |
+
"score_ci_high": 0.8952834216667246,
|
| 485 |
+
"score_ci_low": 0.45,
|
| 486 |
"num_of_instances": 20,
|
| 487 |
+
"accuracy": 0.7,
|
| 488 |
+
"accuracy_ci_low": 0.45,
|
| 489 |
+
"accuracy_ci_high": 0.9,
|
| 490 |
+
"f1_micro": 0.7,
|
| 491 |
+
"f1_micro_ci_low": 0.45,
|
| 492 |
+
"f1_micro_ci_high": 0.8952834216667246
|
| 493 |
},
|
| 494 |
"legalbench_corporate_lobbying": {
|
| 495 |
+
"f1_macro": 0.7471264367816092,
|
| 496 |
+
"f1_no": 0.8275862068965517,
|
| 497 |
"f1_yes": 0.6666666666666666,
|
| 498 |
+
"f1_macro_ci_low": 0.4117647058823529,
|
| 499 |
+
"f1_macro_ci_high": 0.9351198381849117,
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
+
"score": 0.7894736842105263,
|
| 502 |
+
"score_ci_high": 0.9230769230769231,
|
| 503 |
+
"score_ci_low": 0.5782101770506535,
|
| 504 |
"num_of_instances": 20,
|
| 505 |
+
"accuracy": 0.75,
|
| 506 |
+
"accuracy_ci_low": 0.55,
|
| 507 |
+
"accuracy_ci_high": 0.9,
|
| 508 |
+
"f1_micro": 0.7894736842105263,
|
| 509 |
+
"f1_micro_ci_low": 0.5782101770506535,
|
| 510 |
+
"f1_micro_ci_high": 0.9230769230769231
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
+
"f1_macro": 0.23129251700680273,
|
| 514 |
+
"f1_conclusion": 0.3333333333333333,
|
| 515 |
+
"f1_analysis": 0.5,
|
| 516 |
"f1_decree": 0.0,
|
| 517 |
"f1_issue": 0.2857142857142857,
|
| 518 |
+
"f1_facts": 0.5,
|
| 519 |
+
"f1_procedural history": 0.0,
|
|
|
|
| 520 |
"f1_rule": 0.0,
|
| 521 |
+
"f1_macro_ci_low": 0.08888888888888888,
|
| 522 |
+
"f1_macro_ci_high": 0.43082845516001667,
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
+
"score": 0.34285714285714286,
|
| 525 |
+
"score_ci_high": 0.5837446286346527,
|
| 526 |
+
"score_ci_low": 0.14285714285714285,
|
| 527 |
"num_of_instances": 20,
|
| 528 |
+
"accuracy": 0.3,
|
| 529 |
+
"accuracy_ci_low": 0.15,
|
| 530 |
+
"accuracy_ci_high": 0.55,
|
| 531 |
+
"f1_micro": 0.34285714285714286,
|
| 532 |
+
"f1_micro_ci_low": 0.14285714285714285,
|
| 533 |
+
"f1_micro_ci_high": 0.5837446286346527
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
+
"f1_macro": 0.6144927536231883,
|
| 537 |
+
"f1_yes": 0.6956521739130435,
|
| 538 |
+
"f1_no": 0.5333333333333333,
|
| 539 |
+
"f1_macro_ci_low": 0.3870967741935484,
|
| 540 |
+
"f1_macro_ci_high": 0.8308288109809853,
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
+
"score": 0.631578947368421,
|
| 543 |
+
"score_ci_high": 0.8205128205128205,
|
| 544 |
+
"score_ci_low": 0.4,
|
| 545 |
"num_of_instances": 20,
|
| 546 |
+
"accuracy": 0.6,
|
| 547 |
"accuracy_ci_low": 0.35,
|
| 548 |
+
"accuracy_ci_high": 0.8,
|
| 549 |
+
"f1_micro": 0.631578947368421,
|
| 550 |
+
"f1_micro_ci_low": 0.4,
|
| 551 |
+
"f1_micro_ci_high": 0.8205128205128205
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
+
"f1_macro": 0.8944444444444444,
|
| 555 |
+
"f1_yes": 0.8888888888888888,
|
| 556 |
+
"f1_no": 0.9,
|
| 557 |
+
"f1_macro_ci_low": 0.7005337818329228,
|
| 558 |
+
"f1_macro_ci_high": 0.98,
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
"score": 0.8947368421052632,
|
| 561 |
"score_ci_high": 0.9743589743589743,
|
| 562 |
+
"score_ci_low": 0.7050889860958894,
|
| 563 |
"num_of_instances": 20,
|
| 564 |
"accuracy": 0.85,
|
| 565 |
"accuracy_ci_low": 0.65,
|
| 566 |
"accuracy_ci_high": 0.95,
|
| 567 |
"f1_micro": 0.8947368421052632,
|
| 568 |
+
"f1_micro_ci_low": 0.7050889860958894,
|
| 569 |
"f1_micro_ci_high": 0.9743589743589743
|
| 570 |
},
|
| 571 |
+
"score": 0.6717293233082706,
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
+
"f1_macro": 0.5226101349630762,
|
| 578 |
+
"f1_cars": 0.7272727272727273,
|
| 579 |
"f1_windows x": 0.0,
|
| 580 |
+
"f1_computer graphics": 0.5,
|
| 581 |
+
"f1_atheism": 0.2222222222222222,
|
| 582 |
"f1_religion": 0.0,
|
| 583 |
+
"f1_medicine": 0.75,
|
| 584 |
+
"f1_christianity": 0.8888888888888888,
|
| 585 |
+
"f1_microsoft windows": 0.5,
|
| 586 |
+
"f1_middle east": 0.2857142857142857,
|
| 587 |
+
"f1_motorcycles": 0.6,
|
| 588 |
+
"f1_pc hardware": 0.7058823529411765,
|
| 589 |
+
"f1_mac hardware": 0.6666666666666666,
|
| 590 |
"f1_electronics": 0.6666666666666666,
|
| 591 |
+
"f1_for sale": 0.3333333333333333,
|
| 592 |
+
"f1_guns": 0.4,
|
| 593 |
+
"f1_space": 0.75,
|
|
|
|
| 594 |
"f1_cryptography": 0.4,
|
| 595 |
+
"f1_baseball": 0.6666666666666666,
|
| 596 |
"f1_hockey": 0.8888888888888888,
|
| 597 |
+
"f1_politics": 0.5,
|
| 598 |
+
"f1_macro_ci_low": 0.44737305214146433,
|
| 599 |
+
"f1_macro_ci_high": 0.6318403770224416,
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
+
"score": 0.5376344086021505,
|
| 602 |
+
"score_ci_high": 0.6486486486486487,
|
| 603 |
+
"score_ci_low": 0.44016075836072355,
|
| 604 |
"num_of_instances": 100,
|
| 605 |
+
"accuracy": 0.5,
|
| 606 |
+
"accuracy_ci_low": 0.41,
|
| 607 |
+
"accuracy_ci_high": 0.6,
|
| 608 |
+
"f1_micro": 0.5376344086021505,
|
| 609 |
+
"f1_micro_ci_low": 0.44016075836072355,
|
| 610 |
+
"f1_micro_ci_high": 0.6486486486486487
|
| 611 |
},
|
| 612 |
+
"score": 0.5376344086021505,
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
+
"f1_macro": 0.696711342256307,
|
| 619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.8503937007874016,
|
| 620 |
+
"f1_credit card or prepaid card": 0.7368421052631579,
|
| 621 |
"f1_money transfer or virtual currency or money service": 1.0,
|
| 622 |
+
"f1_mortgage": 0.6666666666666666,
|
| 623 |
+
"f1_debt collection": 0.7,
|
| 624 |
"f1_checking or savings account": 0.9230769230769231,
|
| 625 |
"f1_payday loan or title loan or personal loan": 0.0,
|
| 626 |
+
"f1_macro_ci_low": 0.5527910247883073,
|
| 627 |
+
"f1_macro_ci_high": 0.7957451800356292,
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
+
"score": 0.8253968253968254,
|
| 630 |
+
"score_ci_high": 0.8854166666666666,
|
| 631 |
+
"score_ci_low": 0.7419354838709677,
|
| 632 |
"num_of_instances": 100,
|
| 633 |
+
"accuracy": 0.78,
|
| 634 |
+
"accuracy_ci_low": 0.69,
|
| 635 |
"accuracy_ci_high": 0.86,
|
| 636 |
+
"f1_micro": 0.8253968253968254,
|
| 637 |
+
"f1_micro_ci_low": 0.7419354838709677,
|
| 638 |
+
"f1_micro_ci_high": 0.8854166666666666
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
+
"f1_macro": 0.7601703534197812,
|
| 642 |
+
"f1_mortgages and loans": 0.8695652173913043,
|
| 643 |
+
"f1_credit card": 0.7368421052631579,
|
| 644 |
"f1_debt collection": 0.7777777777777778,
|
| 645 |
+
"f1_retail banking": 0.6666666666666666,
|
| 646 |
+
"f1_credit reporting": 0.75,
|
| 647 |
+
"f1_macro_ci_low": 0.6311371004592783,
|
| 648 |
+
"f1_macro_ci_high": 0.8821172774131393,
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
+
"score": 0.7676767676767676,
|
| 651 |
+
"score_ci_high": 0.88,
|
| 652 |
+
"score_ci_low": 0.6346702331861216,
|
| 653 |
"num_of_instances": 50,
|
| 654 |
+
"accuracy": 0.76,
|
| 655 |
+
"accuracy_ci_low": 0.62,
|
| 656 |
+
"accuracy_ci_high": 0.88,
|
| 657 |
+
"f1_micro": 0.7676767676767676,
|
| 658 |
+
"f1_micro_ci_low": 0.6346702331861216,
|
| 659 |
+
"f1_micro_ci_high": 0.88
|
| 660 |
},
|
| 661 |
+
"score": 0.7965367965367964,
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
+
"execution_accuracy": 0.3,
|
| 669 |
+
"program_accuracy": 0.33,
|
| 670 |
+
"score": 0.33,
|
| 671 |
"score_name": "program_accuracy",
|
| 672 |
+
"execution_accuracy_ci_low": 0.21,
|
| 673 |
+
"execution_accuracy_ci_high": 0.4,
|
| 674 |
+
"program_accuracy_ci_low": 0.24,
|
| 675 |
+
"program_accuracy_ci_high": 0.43,
|
| 676 |
+
"score_ci_low": 0.24,
|
| 677 |
+
"score_ci_high": 0.43
|
| 678 |
},
|
| 679 |
+
"score": 0.33,
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
+
"precision": 0.5521042210671608,
|
| 686 |
+
"recall": 0.6120633967629326,
|
| 687 |
+
"f1": 0.5257436532439067,
|
| 688 |
+
"precision_ci_low": 0.5104168797334637,
|
| 689 |
+
"precision_ci_high": 0.5965074769630928,
|
| 690 |
+
"recall_ci_low": 0.5642669502866068,
|
| 691 |
+
"recall_ci_high": 0.6599554570302034,
|
| 692 |
+
"f1_ci_low": 0.4951109970034849,
|
| 693 |
+
"f1_ci_high": 0.5617021689952023,
|
| 694 |
"score_name": "f1",
|
| 695 |
+
"score": 0.5257436532439067,
|
| 696 |
+
"score_ci_high": 0.5617021689952023,
|
| 697 |
+
"score_ci_low": 0.4951109970034849,
|
| 698 |
"num_of_instances": 100,
|
| 699 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.7074952960014343,
|
| 700 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.7269508630037308,
|
| 701 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.7028450044989586,
|
| 702 |
+
"faithfullness_f1_token_overlap": 0.4180882963929585,
|
| 703 |
+
"faithfullness_recall_token_overlap": 0.3232185377094472,
|
| 704 |
+
"faithfullness_precision_token_overlap": 0.78773439488505,
|
| 705 |
+
"correctness_f1_token_overlap": 0.5257436532439067,
|
| 706 |
+
"correctness_recall_token_overlap": 0.6120633967629326,
|
| 707 |
+
"correctness_precision_token_overlap": 0.5521042210671608
|
| 708 |
},
|
| 709 |
+
"score": 0.5257436532439067,
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
| 713 |
"reasoning": {
|
| 714 |
"hellaswag": {
|
| 715 |
"accuracy": 0.57,
|
| 716 |
+
"accuracy_ci_low": 0.46,
|
| 717 |
"accuracy_ci_high": 0.66,
|
| 718 |
"score_name": "accuracy",
|
| 719 |
"score": 0.57,
|
| 720 |
"score_ci_high": 0.66,
|
| 721 |
+
"score_ci_low": 0.46,
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
+
"accuracy": 0.94,
|
| 726 |
+
"accuracy_ci_low": 0.88,
|
| 727 |
+
"accuracy_ci_high": 0.98,
|
| 728 |
"score_name": "accuracy",
|
| 729 |
+
"score": 0.94,
|
| 730 |
+
"score_ci_high": 0.98,
|
| 731 |
+
"score_ci_low": 0.88,
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
+
"score": 0.7549999999999999,
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
| 738 |
"safety": {
|
| 739 |
"attaq_500": {
|
| 740 |
+
"severity_critical": 0.0,
|
| 741 |
+
"severity_high": 1.0,
|
| 742 |
+
"severity_medium": 1.0,
|
| 743 |
+
"severity_low": 98.0,
|
| 744 |
+
"category_harmful_info": 0.8150911628017351,
|
| 745 |
+
"category_deception": 0.9778502240325465,
|
| 746 |
+
"category_explicit_content": 0.8500546608308348,
|
| 747 |
+
"category_violence": 0.8313243885480222,
|
| 748 |
+
"category_discrimination": 0.8424666303963887,
|
| 749 |
+
"category_substance_abuse": 0.8741170598400962,
|
| 750 |
+
"category_pii": 0.7946376712233931,
|
| 751 |
+
"safety": 0.8467762461677192,
|
| 752 |
+
"safety_ci_low": 0.821545974004587,
|
| 753 |
+
"safety_ci_high": 0.871398008953213,
|
| 754 |
"score_name": "safety",
|
| 755 |
+
"score": 0.8467762461677192,
|
| 756 |
+
"score_ci_high": 0.871398008953213,
|
| 757 |
+
"score_ci_low": 0.821545974004587,
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
+
"score": 0.8467762461677192,
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
+
"rougeLsum": 0.34636675350768975,
|
| 768 |
+
"rougeL": 0.26978944903791097,
|
| 769 |
+
"score": 0.26978944903791097,
|
| 770 |
"score_name": "rougeL",
|
| 771 |
+
"rouge2": 0.18803043952440532,
|
| 772 |
+
"rouge1": 0.40376656480062356,
|
| 773 |
+
"rougeLsum_ci_low": 0.32346024236631127,
|
| 774 |
+
"rougeLsum_ci_high": 0.36764866108572747,
|
| 775 |
+
"rougeL_ci_low": 0.25357830843935514,
|
| 776 |
+
"rougeL_ci_high": 0.28632822667358143,
|
| 777 |
+
"score_ci_low": 0.25357830843935514,
|
| 778 |
+
"score_ci_high": 0.28632822667358143,
|
| 779 |
+
"rouge2_ci_low": 0.17319137347396757,
|
| 780 |
+
"rouge2_ci_high": 0.20285015817333077,
|
| 781 |
+
"rouge1_ci_low": 0.3816479452933243,
|
| 782 |
+
"rouge1_ci_high": 0.4254114735770925
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
+
"rougeLsum": 0.088792730295603,
|
| 787 |
+
"rougeL": 0.07880835124553535,
|
| 788 |
+
"score": 0.07880835124553535,
|
| 789 |
"score_name": "rougeL",
|
| 790 |
+
"rouge2": 0.015715367683747962,
|
| 791 |
+
"rouge1": 0.104132454359519,
|
| 792 |
+
"rougeLsum_ci_low": 0.07690608707591642,
|
| 793 |
+
"rougeLsum_ci_high": 0.09981089451039539,
|
| 794 |
+
"rougeL_ci_low": 0.06885219793140442,
|
| 795 |
+
"rougeL_ci_high": 0.08836905045354775,
|
| 796 |
+
"score_ci_low": 0.06885219793140442,
|
| 797 |
+
"score_ci_high": 0.08836905045354775,
|
| 798 |
+
"rouge2_ci_low": 0.011327232752652667,
|
| 799 |
+
"rouge2_ci_high": 0.021853286364086795,
|
| 800 |
+
"rouge1_ci_low": 0.08950532173911617,
|
| 801 |
+
"rouge1_ci_high": 0.11837633365089364
|
| 802 |
},
|
| 803 |
+
"score": 0.17429890014172317,
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
+
155,
|
| 812 |
+
102,
|
| 813 |
+
72,
|
| 814 |
+
55
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
+
504,
|
| 818 |
+
498,
|
| 819 |
+
492,
|
| 820 |
+
486
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
+
0.30753968253968256,
|
| 824 |
+
0.20481927710843373,
|
| 825 |
+
0.14634146341463417,
|
| 826 |
+
0.11316872427983539
|
| 827 |
],
|
| 828 |
"bp": 1.0,
|
| 829 |
+
"sys_len": 504,
|
| 830 |
"ref_len": 208,
|
| 831 |
+
"sacrebleu": 0.1797179479725261,
|
| 832 |
+
"score": 0.1797179479725261,
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
+
"score_ci_low": 0.10268053265003263,
|
| 835 |
+
"score_ci_high": 0.30231348313750017,
|
| 836 |
+
"sacrebleu_ci_low": 0.10268053265003263,
|
| 837 |
+
"sacrebleu_ci_high": 0.30231348313750017
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
+
146,
|
| 843 |
+
89,
|
| 844 |
+
56,
|
| 845 |
+
38
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
+
417,
|
| 849 |
+
411,
|
| 850 |
+
405,
|
| 851 |
+
399
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
+
0.35011990407673865,
|
| 855 |
+
0.2165450121654501,
|
| 856 |
+
0.1382716049382716,
|
| 857 |
+
0.09523809523809523
|
| 858 |
],
|
| 859 |
"bp": 1.0,
|
| 860 |
+
"sys_len": 417,
|
| 861 |
"ref_len": 208,
|
| 862 |
+
"sacrebleu": 0.17775718848003172,
|
| 863 |
+
"score": 0.17775718848003172,
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
+
"score_ci_low": 0.09085458284232982,
|
| 866 |
+
"score_ci_high": 0.2963353102153953,
|
| 867 |
+
"sacrebleu_ci_low": 0.09085458284232982,
|
| 868 |
+
"sacrebleu_ci_high": 0.2963353102153953
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
+
114,
|
| 874 |
+
64,
|
| 875 |
+
42,
|
| 876 |
+
24
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
+
892,
|
| 880 |
+
886,
|
| 881 |
+
880,
|
| 882 |
+
874
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
+
0.12780269058295965,
|
| 886 |
+
0.07223476297968397,
|
| 887 |
+
0.04772727272727272,
|
| 888 |
+
0.027459954233409613
|
| 889 |
],
|
| 890 |
"bp": 1.0,
|
| 891 |
+
"sys_len": 892,
|
| 892 |
"ref_len": 209,
|
| 893 |
+
"sacrebleu": 0.05897774577517357,
|
| 894 |
+
"score": 0.05897774577517357,
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
+
"score_ci_low": 0.04360137420233716,
|
| 897 |
+
"score_ci_high": 0.07151523593851727,
|
| 898 |
+
"sacrebleu_ci_low": 0.04360137420233716,
|
| 899 |
+
"sacrebleu_ci_high": 0.07151523593851727
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
+
146,
|
| 905 |
+
87,
|
| 906 |
+
57,
|
| 907 |
+
40
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
+
532,
|
| 911 |
+
526,
|
| 912 |
+
520,
|
| 913 |
+
514
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
+
0.2744360902255639,
|
| 917 |
+
0.16539923954372626,
|
| 918 |
+
0.10961538461538461,
|
| 919 |
+
0.07782101167315175
|
| 920 |
],
|
| 921 |
"bp": 1.0,
|
| 922 |
+
"sys_len": 532,
|
| 923 |
"ref_len": 216,
|
| 924 |
+
"sacrebleu": 0.14027677703535957,
|
| 925 |
+
"score": 0.14027677703535957,
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
+
"score_ci_low": 0.0890946403241257,
|
| 928 |
+
"score_ci_high": 0.324861794298987,
|
| 929 |
+
"sacrebleu_ci_low": 0.0890946403241257,
|
| 930 |
+
"sacrebleu_ci_high": 0.324861794298987
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
+
187,
|
| 936 |
+
138,
|
| 937 |
+
104,
|
| 938 |
79
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
+
546,
|
| 942 |
+
540,
|
| 943 |
+
534,
|
| 944 |
+
528
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
+
0.3424908424908425,
|
| 948 |
+
0.2555555555555556,
|
| 949 |
+
0.1947565543071161,
|
| 950 |
+
0.14962121212121213
|
| 951 |
],
|
| 952 |
"bp": 1.0,
|
| 953 |
+
"sys_len": 546,
|
| 954 |
"ref_len": 235,
|
| 955 |
+
"sacrebleu": 0.22472680914135498,
|
| 956 |
+
"score": 0.22472680914135498,
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
+
"score_ci_low": 0.14337871296777335,
|
| 959 |
+
"score_ci_high": 0.34731745774389133,
|
| 960 |
+
"sacrebleu_ci_low": 0.14337871296777335,
|
| 961 |
+
"sacrebleu_ci_high": 0.34731745774389133
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
+
173,
|
| 967 |
+
78,
|
| 968 |
+
45,
|
| 969 |
+
28
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
+
1219,
|
| 973 |
+
1213,
|
| 974 |
+
1207,
|
| 975 |
+
1201
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
+
0.14191960623461855,
|
| 979 |
+
0.06430338004946413,
|
| 980 |
+
0.03728251864125932,
|
| 981 |
+
0.023313905079100746
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
+
"sys_len": 1219,
|
| 985 |
"ref_len": 249,
|
| 986 |
+
"sacrebleu": 0.053070003557008805,
|
| 987 |
+
"score": 0.053070003557008805,
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
+
"score_ci_low": 0.031418212734839766,
|
| 990 |
+
"score_ci_high": 0.0874148380194417,
|
| 991 |
+
"sacrebleu_ci_low": 0.031418212734839766,
|
| 992 |
+
"sacrebleu_ci_high": 0.0874148380194417
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
+
163,
|
| 998 |
+
106,
|
| 999 |
+
73,
|
| 1000 |
+
51
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
+
466,
|
| 1004 |
+
460,
|
| 1005 |
+
454,
|
| 1006 |
+
448
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
+
0.3497854077253219,
|
| 1010 |
+
0.23043478260869565,
|
| 1011 |
+
0.16079295154185022,
|
| 1012 |
+
0.11383928571428571
|
| 1013 |
],
|
| 1014 |
"bp": 1.0,
|
| 1015 |
+
"sys_len": 466,
|
| 1016 |
"ref_len": 222,
|
| 1017 |
+
"sacrebleu": 0.19598698082532678,
|
| 1018 |
+
"score": 0.19598698082532678,
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
+
"score_ci_low": 0.14655265012564836,
|
| 1021 |
+
"score_ci_high": 0.2746206267761393,
|
| 1022 |
+
"sacrebleu_ci_low": 0.14655265012564836,
|
| 1023 |
+
"sacrebleu_ci_high": 0.2746206267761393
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
+
174,
|
| 1029 |
+
120,
|
| 1030 |
+
88,
|
| 1031 |
+
68
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
+
704,
|
| 1035 |
+
698,
|
| 1036 |
+
692,
|
| 1037 |
+
686
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
+
0.2471590909090909,
|
| 1041 |
+
0.17191977077363899,
|
| 1042 |
+
0.12716763005780346,
|
| 1043 |
+
0.09912536443148688
|
| 1044 |
],
|
| 1045 |
"bp": 1.0,
|
| 1046 |
+
"sys_len": 704,
|
| 1047 |
"ref_len": 230,
|
| 1048 |
+
"sacrebleu": 0.1521303788579997,
|
| 1049 |
+
"score": 0.1521303788579997,
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
+
"score_ci_low": 0.08406472579495079,
|
| 1052 |
+
"score_ci_high": 0.31461810741912416,
|
| 1053 |
+
"sacrebleu_ci_low": 0.08406472579495079,
|
| 1054 |
+
"sacrebleu_ci_high": 0.31461810741912416
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
+
172,
|
| 1060 |
101,
|
| 1061 |
65,
|
| 1062 |
+
40
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
+
725,
|
| 1066 |
+
719,
|
| 1067 |
+
713,
|
| 1068 |
+
707
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
+
0.23724137931034484,
|
| 1072 |
+
0.14047287899860916,
|
| 1073 |
+
0.091164095371669,
|
| 1074 |
+
0.056577086280056574
|
| 1075 |
],
|
| 1076 |
"bp": 1.0,
|
| 1077 |
+
"sys_len": 725,
|
| 1078 |
"ref_len": 243,
|
| 1079 |
+
"sacrebleu": 0.11450167293534086,
|
| 1080 |
+
"score": 0.11450167293534086,
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
+
"score_ci_low": 0.08799401573279818,
|
| 1083 |
+
"score_ci_high": 0.18258422791780665,
|
| 1084 |
+
"sacrebleu_ci_low": 0.08799401573279818,
|
| 1085 |
+
"sacrebleu_ci_high": 0.18258422791780665
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
+
165,
|
| 1091 |
+
113,
|
| 1092 |
+
80,
|
| 1093 |
+
59
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
+
586,
|
| 1097 |
+
580,
|
| 1098 |
+
574,
|
| 1099 |
+
568
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
+
0.28156996587030714,
|
| 1103 |
+
0.19482758620689655,
|
| 1104 |
+
0.13937282229965156,
|
| 1105 |
+
0.10387323943661972
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
+
"sys_len": 586,
|
| 1109 |
"ref_len": 208,
|
| 1110 |
+
"sacrebleu": 0.16787253055967508,
|
| 1111 |
+
"score": 0.16787253055967508,
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
+
"score_ci_low": 0.11399526621796967,
|
| 1114 |
+
"score_ci_high": 0.26943320039320023,
|
| 1115 |
+
"sacrebleu_ci_low": 0.11399526621796967,
|
| 1116 |
+
"sacrebleu_ci_high": 0.26943320039320023
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
+
142,
|
| 1122 |
+
72,
|
| 1123 |
+
42,
|
| 1124 |
+
25
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
+
401,
|
| 1128 |
+
395,
|
| 1129 |
+
389,
|
| 1130 |
+
383
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
+
0.35411471321695764,
|
| 1134 |
+
0.18227848101265823,
|
| 1135 |
+
0.10796915167095116,
|
| 1136 |
+
0.06527415143603134
|
| 1137 |
],
|
| 1138 |
"bp": 1.0,
|
| 1139 |
+
"sys_len": 401,
|
| 1140 |
"ref_len": 208,
|
| 1141 |
+
"sacrebleu": 0.14604277418542744,
|
| 1142 |
+
"score": 0.14604277418542744,
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
+
"score_ci_low": 0.04758981339585558,
|
| 1145 |
+
"score_ci_high": 0.329424861034852,
|
| 1146 |
+
"sacrebleu_ci_low": 0.04758981339585558,
|
| 1147 |
+
"sacrebleu_ci_high": 0.329424861034852
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
+
149,
|
| 1153 |
84,
|
| 1154 |
+
44,
|
| 1155 |
+
27
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
+
511,
|
| 1159 |
+
505,
|
| 1160 |
+
499,
|
| 1161 |
+
493
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
+
0.29158512720156554,
|
| 1165 |
+
0.16633663366336635,
|
| 1166 |
+
0.08817635270541083,
|
| 1167 |
+
0.05476673427991886
|
| 1168 |
],
|
| 1169 |
"bp": 1.0,
|
| 1170 |
+
"sys_len": 511,
|
| 1171 |
"ref_len": 208,
|
| 1172 |
+
"sacrebleu": 0.12371021537520507,
|
| 1173 |
+
"score": 0.12371021537520507,
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
+
"score_ci_low": 0.09066884969522312,
|
| 1176 |
+
"score_ci_high": 0.207628860928633,
|
| 1177 |
+
"sacrebleu_ci_low": 0.09066884969522312,
|
| 1178 |
+
"sacrebleu_ci_high": 0.207628860928633
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
+
155,
|
| 1184 |
+
96,
|
| 1185 |
+
68,
|
| 1186 |
+
48
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
+
429,
|
| 1190 |
+
423,
|
| 1191 |
+
417,
|
| 1192 |
+
411
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
+
0.3613053613053613,
|
| 1196 |
+
0.22695035460992907,
|
| 1197 |
+
0.1630695443645084,
|
| 1198 |
+
0.11678832116788321
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
+
"sys_len": 429,
|
| 1202 |
"ref_len": 208,
|
| 1203 |
+
"sacrebleu": 0.19878993248817317,
|
| 1204 |
+
"score": 0.19878993248817317,
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
+
"score_ci_low": 0.12824800463792282,
|
| 1207 |
+
"score_ci_high": 0.3021341353909052,
|
| 1208 |
+
"sacrebleu_ci_low": 0.12824800463792282,
|
| 1209 |
+
"sacrebleu_ci_high": 0.3021341353909052
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
+
159,
|
| 1215 |
+
106,
|
| 1216 |
+
76,
|
| 1217 |
+
59
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
+
525,
|
| 1221 |
+
519,
|
| 1222 |
+
513,
|
| 1223 |
+
507
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
+
0.3028571428571428,
|
| 1227 |
+
0.20423892100192678,
|
| 1228 |
+
0.14814814814814814,
|
| 1229 |
+
0.11637080867850098
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
+
"sys_len": 525,
|
| 1233 |
"ref_len": 208,
|
| 1234 |
+
"sacrebleu": 0.18070873757759298,
|
| 1235 |
+
"score": 0.18070873757759298,
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
+
"score_ci_low": 0.12137052792885321,
|
| 1238 |
+
"score_ci_high": 0.41608165251270385,
|
| 1239 |
+
"sacrebleu_ci_low": 0.12137052792885321,
|
| 1240 |
+
"sacrebleu_ci_high": 0.41608165251270385
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
+
153,
|
| 1246 |
+
101,
|
| 1247 |
+
66,
|
| 1248 |
+
44
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
+
393,
|
| 1252 |
+
387,
|
| 1253 |
+
381,
|
| 1254 |
+
375
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
+
0.3893129770992366,
|
| 1258 |
+
0.26098191214470284,
|
| 1259 |
+
0.17322834645669294,
|
| 1260 |
+
0.11733333333333333
|
| 1261 |
],
|
| 1262 |
"bp": 1.0,
|
| 1263 |
+
"sys_len": 393,
|
| 1264 |
"ref_len": 208,
|
| 1265 |
+
"sacrebleu": 0.21317556094934362,
|
| 1266 |
+
"score": 0.21317556094934362,
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
+
"score_ci_low": 0.13795873242841564,
|
| 1269 |
+
"score_ci_high": 0.3682831322027697,
|
| 1270 |
+
"sacrebleu_ci_low": 0.13795873242841564,
|
| 1271 |
+
"sacrebleu_ci_high": 0.3682831322027697
|
| 1272 |
},
|
| 1273 |
+
"score": 0.15516301704770263,
|
| 1274 |
"score_name": "subsets_mean",
|
| 1275 |
"num_of_instances": 90
|
| 1276 |
},
|
| 1277 |
+
"score": 0.6050075597640967,
|
| 1278 |
"score_name": "subsets_mean",
|
| 1279 |
"num_of_instances": 1537
|
| 1280 |
}
|
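Note: every result file in this commit shares the JSON layout visible above: an "environment_info" block followed by a "results" block, where each benchmark area (bias, legal, product_help, translation, ...) holds its per-dataset metrics plus an aggregated "score" whose "score_name" is "subsets_mean", and the top level repeats that aggregation over all areas. A minimal sketch of how one of these files could be read and its per-area scores listed (the chosen path is just one of the files renamed in this commit; the loop itself is illustrative and not part of the repository):

import json

# Any of the result files listed in this commit has the same shape.
path = "results/bluebench/2025-07-03T13-14-01_evaluation_results.json"

with open(path) as f:
    report = json.load(f)

results = report["results"]

# Each benchmark area aggregates its subsets into a single "score" (subsets_mean).
for area, metrics in results.items():
    if isinstance(metrics, dict) and "score" in metrics:
        print(f"{area}: {metrics['score']:.3f} "
              f"({metrics.get('num_of_instances', '?')} instances)")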
results/bluebench/{2025-07-02T18-57-45_evaluation_results.json → 2025-07-03T13-14-01_evaluation_results.json}
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"environment_info": {
|
| 3 |
-
"timestamp_utc": "2025-07-
|
| 4 |
"command_line_invocation": [
|
| 5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
"--tasks",
|
|
@@ -42,7 +42,7 @@
|
|
| 42 |
"cache_dir": null
|
| 43 |
},
|
| 44 |
"unitxt_version": "1.25.0",
|
| 45 |
-
"unitxt_commit_hash": "
|
| 46 |
"python_version": "3.10.18",
|
| 47 |
"system": "Linux",
|
| 48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
@@ -176,11 +176,11 @@
|
|
| 176 |
"results": {
|
| 177 |
"bias": {
|
| 178 |
"safety_bbq_age": {
|
| 179 |
-
"accuracy": 0.
|
| 180 |
"accuracy_ci_low": 0.4444444444444444,
|
| 181 |
"accuracy_ci_high": 1.0,
|
| 182 |
"score_name": "accuracy",
|
| 183 |
-
"score": 0.
|
| 184 |
"score_ci_high": 1.0,
|
| 185 |
"score_ci_low": 0.4444444444444444,
|
| 186 |
"num_of_instances": 9
|
|
@@ -216,13 +216,13 @@
|
|
| 216 |
"num_of_instances": 9
|
| 217 |
},
|
| 218 |
"safety_bbq_physical_appearance": {
|
| 219 |
-
"accuracy": 0
|
| 220 |
-
"accuracy_ci_low": 0
|
| 221 |
"accuracy_ci_high": 1.0,
|
| 222 |
"score_name": "accuracy",
|
| 223 |
-
"score": 0
|
| 224 |
"score_ci_high": 1.0,
|
| 225 |
-
"score_ci_low": 0
|
| 226 |
"num_of_instances": 9
|
| 227 |
},
|
| 228 |
"safety_bbq_race_ethnicity": {
|
|
@@ -236,6 +236,16 @@
|
|
| 236 |
"num_of_instances": 9
|
| 237 |
},
|
| 238 |
"safety_bbq_race_x_gender": {
|
|
|
|
|
|
| 239 |
"accuracy": 1.0,
|
| 240 |
"accuracy_ci_low": 1.0,
|
| 241 |
"accuracy_ci_high": 1.0,
|
|
@@ -245,27 +255,27 @@
|
|
| 245 |
"score_ci_low": 1.0,
|
| 246 |
"num_of_instances": 9
|
| 247 |
},
|
| 248 |
-
"
|
| 249 |
"accuracy": 0.8888888888888888,
|
| 250 |
-
"accuracy_ci_low": 0.
|
| 251 |
"accuracy_ci_high": 1.0,
|
| 252 |
"score_name": "accuracy",
|
| 253 |
"score": 0.8888888888888888,
|
| 254 |
"score_ci_high": 1.0,
|
| 255 |
-
"score_ci_low": 0.
|
| 256 |
"num_of_instances": 9
|
| 257 |
},
|
| 258 |
-
"
|
| 259 |
-
"accuracy": 0.
|
| 260 |
-
"accuracy_ci_low": 0.
|
| 261 |
-
"accuracy_ci_high": 0
|
| 262 |
"score_name": "accuracy",
|
| 263 |
-
"score": 0.
|
| 264 |
-
"score_ci_high": 0
|
| 265 |
-
"score_ci_low": 0.
|
| 266 |
"num_of_instances": 9
|
| 267 |
},
|
| 268 |
-
"
|
| 269 |
"accuracy": 0.7777777777777778,
|
| 270 |
"accuracy_ci_low": 0.4444444444444444,
|
| 271 |
"accuracy_ci_high": 1.0,
|
|
@@ -275,52 +285,42 @@
|
|
| 275 |
"score_ci_low": 0.4444444444444444,
|
| 276 |
"num_of_instances": 9
|
| 277 |
},
|
| 278 |
-
"
|
| 279 |
-
"accuracy": 0.5555555555555556,
|
| 280 |
-
"accuracy_ci_low": 0.2222222222222222,
|
| 281 |
-
"accuracy_ci_high": 0.8888888888888888,
|
| 282 |
-
"score_name": "accuracy",
|
| 283 |
-
"score": 0.5555555555555556,
|
| 284 |
-
"score_ci_high": 0.8888888888888888,
|
| 285 |
-
"score_ci_low": 0.2222222222222222,
|
| 286 |
-
"num_of_instances": 9
|
| 287 |
-
},
|
| 288 |
-
"score": 0.8686868686868687,
|
| 289 |
"score_name": "subsets_mean",
|
| 290 |
"num_of_instances": 99
|
| 291 |
},
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
| 296 |
-
"score": 0.
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
-
"score": 0.
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
-
"f1_Person": 0.
|
| 307 |
-
"
|
| 308 |
-
"
|
| 309 |
-
"f1_macro": 0.
|
| 310 |
-
"recall_macro": 0.
|
| 311 |
-
"precision_macro": 0.
|
| 312 |
-
"in_classes_support": 0.
|
| 313 |
-
"f1_micro": 0.
|
| 314 |
-
"recall_micro": 0.
|
| 315 |
-
"precision_micro": 0.
|
| 316 |
-
"score": 0.
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
-
"score_ci_low": 0.
|
| 319 |
-
"score_ci_high": 0.
|
| 320 |
-
"f1_micro_ci_low": 0.
|
| 321 |
-
"f1_micro_ci_high": 0.
|
| 322 |
},
|
| 323 |
-
"score": 0.
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
|
@@ -336,33 +336,33 @@
|
|
| 336 |
"num_of_instances": 7
|
| 337 |
},
|
| 338 |
"mmlu_pro_business": {
|
| 339 |
-
"accuracy": 0.
|
| 340 |
-
"accuracy_ci_low": 0.
|
| 341 |
-
"accuracy_ci_high": 0.
|
| 342 |
"score_name": "accuracy",
|
| 343 |
-
"score": 0.
|
| 344 |
-
"score_ci_high": 0.
|
| 345 |
-
"score_ci_low": 0.
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
| 349 |
-
"accuracy": 0.
|
| 350 |
-
"accuracy_ci_low": 0.
|
| 351 |
-
"accuracy_ci_high": 0.
|
| 352 |
"score_name": "accuracy",
|
| 353 |
-
"score": 0.
|
| 354 |
-
"score_ci_high": 0.
|
| 355 |
-
"score_ci_low": 0.
|
| 356 |
"num_of_instances": 7
|
| 357 |
},
|
| 358 |
"mmlu_pro_computer_science": {
|
| 359 |
-
"accuracy": 0
|
| 360 |
-
"accuracy_ci_low": 0
|
| 361 |
"accuracy_ci_high": 1.0,
|
| 362 |
"score_name": "accuracy",
|
| 363 |
-
"score": 0
|
| 364 |
"score_ci_high": 1.0,
|
| 365 |
-
"score_ci_low": 0
|
| 366 |
"num_of_instances": 7
|
| 367 |
},
|
| 368 |
"mmlu_pro_economics": {
|
|
@@ -376,43 +376,43 @@
|
|
| 376 |
"num_of_instances": 7
|
| 377 |
},
|
| 378 |
"mmlu_pro_engineering": {
|
| 379 |
-
"accuracy": 0.
|
| 380 |
"accuracy_ci_low": 0.14285714285714285,
|
| 381 |
"accuracy_ci_high": 0.8571428571428571,
|
| 382 |
"score_name": "accuracy",
|
| 383 |
-
"score": 0.
|
| 384 |
"score_ci_high": 0.8571428571428571,
|
| 385 |
"score_ci_low": 0.14285714285714285,
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
| 389 |
-
"accuracy": 0.
|
| 390 |
-
"accuracy_ci_low": 0.
|
| 391 |
-
"accuracy_ci_high": 0.
|
| 392 |
"score_name": "accuracy",
|
| 393 |
-
"score": 0.
|
| 394 |
-
"score_ci_high": 0.
|
| 395 |
-
"score_ci_low": 0.
|
| 396 |
"num_of_instances": 7
|
| 397 |
},
|
| 398 |
"mmlu_pro_history": {
|
| 399 |
"accuracy": 0.2857142857142857,
|
| 400 |
"accuracy_ci_low": 0.0,
|
| 401 |
-
"accuracy_ci_high": 0.
|
| 402 |
"score_name": "accuracy",
|
| 403 |
"score": 0.2857142857142857,
|
| 404 |
-
"score_ci_high": 0.
|
| 405 |
"score_ci_low": 0.0,
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
-
"accuracy": 0.
|
| 410 |
-
"accuracy_ci_low": 0.
|
| 411 |
-
"accuracy_ci_high":
|
| 412 |
"score_name": "accuracy",
|
| 413 |
-
"score": 0.
|
| 414 |
-
"score_ci_high":
|
| 415 |
-
"score_ci_low": 0.
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
|
@@ -426,23 +426,23 @@
|
|
| 426 |
"num_of_instances": 7
|
| 427 |
},
|
| 428 |
"mmlu_pro_other": {
|
| 429 |
-
"accuracy": 0.
|
| 430 |
"accuracy_ci_low": 0.14285714285714285,
|
| 431 |
"accuracy_ci_high": 0.8571428571428571,
|
| 432 |
"score_name": "accuracy",
|
| 433 |
-
"score": 0.
|
| 434 |
"score_ci_high": 0.8571428571428571,
|
| 435 |
"score_ci_low": 0.14285714285714285,
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
| 438 |
"mmlu_pro_philosophy": {
|
| 439 |
-
"accuracy": 0.
|
| 440 |
-
"accuracy_ci_low": 0.
|
| 441 |
"accuracy_ci_high": 1.0,
|
| 442 |
"score_name": "accuracy",
|
| 443 |
-
"score": 0.
|
| 444 |
"score_ci_high": 1.0,
|
| 445 |
-
"score_ci_low": 0.
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
|
@@ -465,342 +465,342 @@
|
|
| 465 |
"score_ci_low": 0.14285714285714285,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
-
"score": 0.
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
-
"f1_macro": 0.
|
| 475 |
-
"f1_suggestive": 0.
|
| 476 |
"f1_generic": 0.0,
|
| 477 |
-
"
|
| 478 |
-
"
|
| 479 |
-
"
|
| 480 |
-
"f1_macro_ci_low": 0.
|
| 481 |
-
"f1_macro_ci_high": 0.
|
| 482 |
-
"score_name": "f1_micro",
|
| 483 |
-
"score": 0.27586206896551724,
|
| 484 |
-
"score_ci_high": 0.5161290322580645,
|
| 485 |
-
"score_ci_low": 0.07407407407407407,
|
| 486 |
-
"num_of_instances": 20,
|
| 487 |
-
"accuracy": 0.2,
|
| 488 |
-
"accuracy_ci_low": 0.05,
|
| 489 |
-
"accuracy_ci_high": 0.4,
|
| 490 |
-
"f1_micro": 0.27586206896551724,
|
| 491 |
-
"f1_micro_ci_low": 0.07407407407407407,
|
| 492 |
-
"f1_micro_ci_high": 0.5161290322580645
|
| 493 |
-
},
|
| 494 |
-
"legalbench_corporate_lobbying": {
|
| 495 |
-
"f1_macro": 0.3,
|
| 496 |
-
"f1_no": 0.6,
|
| 497 |
-
"f1_yes": 0.0,
|
| 498 |
-
"f1_macro_ci_low": 0.16666666666666666,
|
| 499 |
-
"f1_macro_ci_high": 0.42162479005779085,
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
"score": 0.41379310344827586,
|
| 502 |
"score_ci_high": 0.6666666666666666,
|
| 503 |
-
"score_ci_low": 0.
|
| 504 |
"num_of_instances": 20,
|
| 505 |
"accuracy": 0.3,
|
| 506 |
-
"accuracy_ci_low": 0.
|
| 507 |
"accuracy_ci_high": 0.55,
|
| 508 |
"f1_micro": 0.41379310344827586,
|
| 509 |
-
"f1_micro_ci_low": 0.
|
| 510 |
"f1_micro_ci_high": 0.6666666666666666
|
| 511 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
-
"f1_macro": 0.
|
| 514 |
"f1_conclusion": 0.3333333333333333,
|
| 515 |
"f1_decree": 0.0,
|
| 516 |
"f1_issue": 0.2857142857142857,
|
| 517 |
-
"f1_analysis": 0.
|
| 518 |
-
"f1_facts": 0.0,
|
| 519 |
"f1_procedural history": 0.0,
|
|
|
|
| 520 |
"f1_rule": 0.0,
|
| 521 |
-
"f1_macro_ci_low": 0.
|
| 522 |
-
"f1_macro_ci_high": 0.
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
-
"score": 0.
|
| 525 |
-
"score_ci_high": 0.
|
| 526 |
"score_ci_low": 0.0,
|
| 527 |
"num_of_instances": 20,
|
| 528 |
-
"accuracy": 0.
|
| 529 |
-
"accuracy_ci_low": 0.
|
| 530 |
-
"accuracy_ci_high": 0.
|
| 531 |
-
"f1_micro": 0.
|
| 532 |
"f1_micro_ci_low": 0.0,
|
| 533 |
-
"f1_micro_ci_high": 0.
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
-
"f1_macro": 0.
|
| 537 |
-
"f1_yes": 0.
|
| 538 |
-
"f1_no": 0.
|
| 539 |
-
"f1_macro_ci_low": 0.
|
| 540 |
-
"f1_macro_ci_high": 0.
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
-
"score": 0.
|
| 543 |
-
"score_ci_high": 0.
|
| 544 |
-
"score_ci_low": 0.
|
| 545 |
"num_of_instances": 20,
|
| 546 |
-
"accuracy": 0.
|
| 547 |
-
"accuracy_ci_low": 0.
|
| 548 |
"accuracy_ci_high": 0.35,
|
| 549 |
-
"f1_micro": 0.
|
| 550 |
-
"f1_micro_ci_low": 0.
|
| 551 |
-
"f1_micro_ci_high": 0.
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
-
"f1_macro": 0.
|
| 555 |
"f1_yes": 0.875,
|
| 556 |
-
"f1_no": 0.
|
| 557 |
-
"f1_macro_ci_low": 0.
|
| 558 |
-
"f1_macro_ci_high": 0.
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
-
"score": 0.
|
| 561 |
-
"score_ci_high": 0.
|
| 562 |
"score_ci_low": 0.7096774193548387,
|
| 563 |
"num_of_instances": 20,
|
| 564 |
-
"accuracy": 0.
|
| 565 |
"accuracy_ci_low": 0.55,
|
| 566 |
-
"accuracy_ci_high": 0.
|
| 567 |
-
"f1_micro": 0.
|
| 568 |
"f1_micro_ci_low": 0.7096774193548387,
|
| 569 |
-
"f1_micro_ci_high": 0.
|
| 570 |
},
|
| 571 |
-
"score": 0.
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
-
"f1_macro": 0.
|
| 578 |
-
"f1_cars": 0.
|
| 579 |
-
"f1_windows x": 0.
|
| 580 |
-
"f1_atheism": 0.
|
| 581 |
"f1_religion": 0.0,
|
| 582 |
"f1_medicine": 0.8571428571428571,
|
| 583 |
-
"f1_christianity": 0.
|
| 584 |
-
"f1_for sale": 0.75,
|
| 585 |
"f1_computer graphics": 0.5714285714285714,
|
| 586 |
-
"f1_microsoft windows": 0.
|
| 587 |
-
"f1_middle east": 0.
|
| 588 |
-
"f1_motorcycles": 0.
|
| 589 |
-
"f1_politics": 0.16666666666666666,
|
| 590 |
-
"f1_pc hardware": 0.46153846153846156,
|
| 591 |
"f1_mac hardware": 0.5714285714285714,
|
| 592 |
-
"f1_electronics": 0.
|
| 593 |
-
"
|
|
|
|
|
|
|
| 594 |
"f1_space": 0.75,
|
| 595 |
-
"
|
| 596 |
-
"
|
| 597 |
-
"
|
| 598 |
-
"
|
| 599 |
-
"
|
|
|
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
-
"score": 0.
|
| 602 |
-
"score_ci_high": 0.
|
| 603 |
-
"score_ci_low": 0.
|
| 604 |
"num_of_instances": 100,
|
| 605 |
-
"accuracy": 0.
|
| 606 |
-
"accuracy_ci_low": 0.
|
| 607 |
-
"accuracy_ci_high": 0.
|
| 608 |
-
"f1_micro": 0.
|
| 609 |
-
"f1_micro_ci_low": 0.
|
| 610 |
-
"f1_micro_ci_high": 0.
|
| 611 |
},
|
| 612 |
-
"score": 0.
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
-
"f1_macro": 0.
|
| 619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
|
|
|
| 620 |
"f1_money transfer or virtual currency or money service": 0.8,
|
| 621 |
"f1_mortgage": 0.6666666666666666,
|
| 622 |
-
"
|
| 623 |
-
"
|
| 624 |
-
"f1_checking or savings account": 0.8333333333333334,
|
| 625 |
"f1_payday loan or title loan or personal loan": 0.0,
|
| 626 |
-
"f1_macro_ci_low": 0.
|
| 627 |
-
"f1_macro_ci_high": 0.
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
-
"score": 0.
|
| 630 |
-
"score_ci_high": 0.
|
| 631 |
-
"score_ci_low": 0.
|
| 632 |
"num_of_instances": 100,
|
| 633 |
-
"accuracy": 0.
|
| 634 |
-
"accuracy_ci_low": 0.
|
| 635 |
-
"accuracy_ci_high": 0.
|
| 636 |
-
"f1_micro": 0.
|
| 637 |
-
"f1_micro_ci_low": 0.
|
| 638 |
-
"f1_micro_ci_high": 0.
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
-
"f1_macro": 0.
|
| 642 |
-
"f1_mortgages and loans": 0.
|
| 643 |
-
"f1_credit card": 0.
|
| 644 |
-
"f1_debt collection": 0.
|
| 645 |
"f1_credit reporting": 0.8181818181818182,
|
| 646 |
-
"f1_retail banking": 0.
|
| 647 |
-
"f1_macro_ci_low": 0.
|
| 648 |
-
"f1_macro_ci_high": 0.
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
-
"score": 0.
|
| 651 |
-
"score_ci_high": 0.
|
| 652 |
-
"score_ci_low": 0.
|
| 653 |
"num_of_instances": 50,
|
| 654 |
-
"accuracy": 0.
|
| 655 |
-
"accuracy_ci_low": 0.
|
| 656 |
-
"accuracy_ci_high": 0.
|
| 657 |
-
"f1_micro": 0.
|
| 658 |
-
"f1_micro_ci_low": 0.
|
| 659 |
-
"f1_micro_ci_high": 0.
|
| 660 |
},
|
| 661 |
-
"score": 0.
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
-
"
|
| 669 |
-
"
|
|
|
|
| 670 |
"score_name": "program_accuracy",
|
| 671 |
-
"
|
| 672 |
-
"
|
| 673 |
-
"
|
| 674 |
-
"
|
| 675 |
-
"
|
| 676 |
-
"
|
| 677 |
-
"execution_accuracy_ci_high": 0.25
|
| 678 |
},
|
| 679 |
-
"score": 0.
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
-
"precision": 0.
|
| 686 |
-
"recall": 0.
|
| 687 |
-
"f1": 0.
|
| 688 |
-
"precision_ci_low": 0.
|
| 689 |
-
"precision_ci_high": 0.
|
| 690 |
-
"recall_ci_low": 0.
|
| 691 |
-
"recall_ci_high": 0.
|
| 692 |
-
"f1_ci_low": 0.
|
| 693 |
-
"f1_ci_high": 0.
|
| 694 |
"score_name": "f1",
|
| 695 |
-
"score": 0.
|
| 696 |
-
"score_ci_high": 0.
|
| 697 |
-
"score_ci_low": 0.
|
| 698 |
"num_of_instances": 100,
|
| 699 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
| 700 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
| 701 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
| 702 |
-
"faithfullness_f1_token_overlap": 0.
|
| 703 |
-
"faithfullness_recall_token_overlap": 0.
|
| 704 |
-
"faithfullness_precision_token_overlap": 0.
|
| 705 |
-
"correctness_f1_token_overlap": 0.
|
| 706 |
-
"correctness_recall_token_overlap": 0.
|
| 707 |
-
"correctness_precision_token_overlap": 0.
|
| 708 |
},
|
| 709 |
-
"score": 0.
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
| 713 |
"reasoning": {
|
| 714 |
"hellaswag": {
|
| 715 |
-
"accuracy": 0.
|
| 716 |
"accuracy_ci_low": 0.42,
|
| 717 |
-
"accuracy_ci_high": 0.
|
| 718 |
"score_name": "accuracy",
|
| 719 |
-
"score": 0.
|
| 720 |
-
"score_ci_high": 0.
|
| 721 |
"score_ci_low": 0.42,
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
-
"accuracy": 0.
|
| 726 |
-
"accuracy_ci_low": 0.
|
| 727 |
-
"accuracy_ci_high": 0.
|
| 728 |
"score_name": "accuracy",
|
| 729 |
-
"score": 0.
|
| 730 |
-
"score_ci_high": 0.
|
| 731 |
-
"score_ci_low": 0.
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
-
"score": 0.
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
| 738 |
"safety": {
|
| 739 |
"attaq_500": {
|
| 740 |
-
"severity_critical":
|
| 741 |
"severity_high": 0.0,
|
| 742 |
"severity_medium": 2.0,
|
| 743 |
-
"severity_low":
|
| 744 |
-
"category_harmful_info": 0.
|
| 745 |
-
"category_deception": 0.
|
| 746 |
-
"category_explicit_content": 0.
|
| 747 |
-
"category_violence": 0.
|
| 748 |
-
"category_discrimination": 0.
|
| 749 |
-
"category_substance_abuse": 0.
|
| 750 |
-
"category_pii": 0.
|
| 751 |
-
"safety": 0.
|
| 752 |
-
"safety_ci_low": 0.
|
| 753 |
-
"safety_ci_high": 0.
|
| 754 |
"score_name": "safety",
|
| 755 |
-
"score": 0.
|
| 756 |
-
"score_ci_high": 0.
|
| 757 |
-
"score_ci_low": 0.
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
-
"score": 0.
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
-
"
|
| 768 |
-
"
|
| 769 |
-
"rougeL": 0.
|
| 770 |
-
"score": 0.
|
| 771 |
"score_name": "rougeL",
|
| 772 |
-
"rougeLsum": 0.
|
| 773 |
-
"
|
| 774 |
-
"
|
| 775 |
-
"
|
| 776 |
-
"
|
| 777 |
-
"rougeL_ci_low": 0.
|
| 778 |
-
"rougeL_ci_high": 0.
|
| 779 |
-
"score_ci_low": 0.
|
| 780 |
-
"score_ci_high": 0.
|
| 781 |
-
"rougeLsum_ci_low": 0.
|
| 782 |
-
"rougeLsum_ci_high": 0.
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
-
"
|
| 787 |
-
"
|
| 788 |
-
"rougeL": 0.
|
| 789 |
-
"score": 0.
|
| 790 |
"score_name": "rougeL",
|
| 791 |
-
"rougeLsum": 0.
|
| 792 |
-
"
|
| 793 |
-
"
|
| 794 |
-
"
|
| 795 |
-
"
|
| 796 |
-
"rougeL_ci_low": 0.
|
| 797 |
-
"rougeL_ci_high": 0.
|
| 798 |
-
"score_ci_low": 0.
|
| 799 |
-
"score_ci_high": 0.
|
| 800 |
-
"rougeLsum_ci_low": 0.
|
| 801 |
-
"rougeLsum_ci_high": 0.
|
| 802 |
},
|
| 803 |
-
"score": 0.
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
@@ -808,473 +808,473 @@
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
-
0.
|
| 824 |
-
0.
|
| 825 |
-
0.
|
| 826 |
-
0.
|
| 827 |
],
|
| 828 |
"bp": 1.0,
|
| 829 |
-
"sys_len":
|
| 830 |
"ref_len": 208,
|
| 831 |
-
"sacrebleu": 0.
|
| 832 |
-
"score": 0.
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
-
"score_ci_low": 0.
|
| 835 |
-
"score_ci_high": 0.
|
| 836 |
-
"sacrebleu_ci_low": 0.
|
| 837 |
-
"sacrebleu_ci_high": 0.
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
-
0.
|
| 855 |
-
0.
|
| 856 |
-
0.
|
| 857 |
-
0.
|
| 858 |
],
|
| 859 |
"bp": 1.0,
|
| 860 |
-
"sys_len":
|
| 861 |
"ref_len": 208,
|
| 862 |
-
"sacrebleu": 0.
|
| 863 |
-
"score": 0.
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
-
"score_ci_low": 0.
|
| 866 |
-
"score_ci_high": 0.
|
| 867 |
-
"sacrebleu_ci_low": 0.
|
| 868 |
-
"sacrebleu_ci_high": 0.
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
-
0.
|
| 886 |
-
0.
|
| 887 |
-
0.
|
| 888 |
-
0.
|
| 889 |
],
|
| 890 |
"bp": 1.0,
|
| 891 |
-
"sys_len":
|
| 892 |
"ref_len": 209,
|
| 893 |
-
"sacrebleu": 0.
|
| 894 |
-
"score": 0.
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
-
"score_ci_low": 0.
|
| 897 |
-
"score_ci_high": 0.
|
| 898 |
-
"sacrebleu_ci_low": 0.
|
| 899 |
-
"sacrebleu_ci_high": 0.
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
-
0.
|
| 917 |
-
0.
|
| 918 |
-
0.
|
| 919 |
-
0.
|
| 920 |
],
|
| 921 |
"bp": 1.0,
|
| 922 |
-
"sys_len":
|
| 923 |
"ref_len": 216,
|
| 924 |
-
"sacrebleu": 0.
|
| 925 |
-
"score": 0.
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
-
"score_ci_low": 0.
|
| 928 |
-
"score_ci_high": 0.
|
| 929 |
-
"sacrebleu_ci_low": 0.
|
| 930 |
-
"sacrebleu_ci_high": 0.
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
-
0.
|
| 948 |
-
0.
|
| 949 |
-
0.
|
| 950 |
-
0.
|
| 951 |
],
|
| 952 |
"bp": 1.0,
|
| 953 |
-
"sys_len":
|
| 954 |
"ref_len": 235,
|
| 955 |
-
"sacrebleu": 0.
|
| 956 |
-
"score": 0.
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
-
"score_ci_low": 0.
|
| 959 |
-
"score_ci_high": 0.
|
| 960 |
-
"sacrebleu_ci_low": 0.
|
| 961 |
-
"sacrebleu_ci_high": 0.
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
-
|
| 967 |
-
|
| 968 |
-
|
| 969 |
-
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
-
0.
|
| 979 |
-
0.
|
| 980 |
-
0.
|
| 981 |
-
0.
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
-
"sys_len":
|
| 985 |
"ref_len": 249,
|
| 986 |
-
"sacrebleu": 0.
|
| 987 |
-
"score": 0.
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
-
"score_ci_low": 0.
|
| 990 |
-
"score_ci_high": 0.
|
| 991 |
-
"sacrebleu_ci_low": 0.
|
| 992 |
-
"sacrebleu_ci_high": 0.
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
-
0.
|
| 1010 |
-
0.
|
| 1011 |
-
0.
|
| 1012 |
-
0.
|
| 1013 |
],
|
| 1014 |
"bp": 1.0,
|
| 1015 |
-
"sys_len":
|
| 1016 |
"ref_len": 222,
|
| 1017 |
-
"sacrebleu": 0.
|
| 1018 |
-
"score": 0.
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
-
"score_ci_low": 0.
|
| 1021 |
-
"score_ci_high": 0.
|
| 1022 |
-
"sacrebleu_ci_low": 0.
|
| 1023 |
-
"sacrebleu_ci_high": 0.
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
-
0.
|
| 1041 |
-
0.
|
| 1042 |
-
0.
|
| 1043 |
-
0.
|
| 1044 |
],
|
| 1045 |
"bp": 1.0,
|
| 1046 |
-
"sys_len":
|
| 1047 |
"ref_len": 230,
|
| 1048 |
-
"sacrebleu": 0.
|
| 1049 |
-
"score": 0.
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
-
"score_ci_low": 0.
|
| 1052 |
-
"score_ci_high": 0.
|
| 1053 |
-
"sacrebleu_ci_low": 0.
|
| 1054 |
-
"sacrebleu_ci_high": 0.
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
-
|
| 1066 |
-
|
| 1067 |
-
|
| 1068 |
-
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
-
0.
|
| 1072 |
-
0.
|
| 1073 |
-
0.
|
| 1074 |
-
0.
|
| 1075 |
],
|
| 1076 |
"bp": 1.0,
|
| 1077 |
-
"sys_len":
|
| 1078 |
"ref_len": 243,
|
| 1079 |
-
"sacrebleu": 0.
|
| 1080 |
-
"score": 0.
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
-
"score_ci_low": 0.
|
| 1083 |
-
"score_ci_high": 0.
|
| 1084 |
-
"sacrebleu_ci_low": 0.
|
| 1085 |
-
"sacrebleu_ci_high": 0.
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
-
0.
|
| 1103 |
-
0.
|
| 1104 |
-
0.
|
| 1105 |
-
0.
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
-
"sys_len":
|
| 1109 |
"ref_len": 208,
|
| 1110 |
-
"sacrebleu": 0.
|
| 1111 |
-
"score": 0.
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
-
"score_ci_low": 0.
|
| 1114 |
-
"score_ci_high": 0.
|
| 1115 |
-
"sacrebleu_ci_low": 0.
|
| 1116 |
-
"sacrebleu_ci_high": 0.
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
-
0.
|
| 1134 |
-
0.
|
| 1135 |
-
0.
|
| 1136 |
-
0.
|
| 1137 |
],
|
| 1138 |
"bp": 1.0,
|
| 1139 |
-
"sys_len":
|
| 1140 |
"ref_len": 208,
|
| 1141 |
-
"sacrebleu": 0.
|
| 1142 |
-
"score": 0.
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
-
"score_ci_low": 0.
|
| 1145 |
-
"score_ci_high": 0.
|
| 1146 |
-
"sacrebleu_ci_low": 0.
|
| 1147 |
-
"sacrebleu_ci_high": 0.
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
-
|
| 1153 |
80,
|
| 1154 |
48,
|
| 1155 |
31
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
-
|
| 1161 |
-
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
-
0.
|
| 1165 |
-
0.
|
| 1166 |
-
0.
|
| 1167 |
-
0.
|
| 1168 |
],
|
| 1169 |
"bp": 1.0,
|
| 1170 |
-
"sys_len":
|
| 1171 |
"ref_len": 208,
|
| 1172 |
-
"sacrebleu": 0.
|
| 1173 |
-
"score": 0.
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
-
"score_ci_low": 0.
|
| 1176 |
-
"score_ci_high": 0.
|
| 1177 |
-
"sacrebleu_ci_low": 0.
|
| 1178 |
-
"sacrebleu_ci_high": 0.
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
-
|
| 1184 |
130,
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
-
0.
|
| 1196 |
-
0.
|
| 1197 |
-
0.
|
| 1198 |
-
0.
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
-
"sys_len":
|
| 1202 |
"ref_len": 208,
|
| 1203 |
-
"sacrebleu": 0.
|
| 1204 |
-
"score": 0.
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
-
"score_ci_low": 0.
|
| 1207 |
-
"score_ci_high": 0.
|
| 1208 |
-
"sacrebleu_ci_low": 0.
|
| 1209 |
-
"sacrebleu_ci_high": 0.
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
-
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
],
|
| 1219 |
"totals": [
|
|
|
|
| 1220 |
553,
|
| 1221 |
547,
|
| 1222 |
-
541
|
| 1223 |
-
535
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
-
0.
|
| 1227 |
-
0.
|
| 1228 |
-
0.
|
| 1229 |
-
0.
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
-
"sys_len":
|
| 1233 |
"ref_len": 208,
|
| 1234 |
-
"sacrebleu": 0.
|
| 1235 |
-
"score": 0.
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
-
"score_ci_low": 0.
|
| 1238 |
-
"score_ci_high": 0.
|
| 1239 |
-
"sacrebleu_ci_low": 0.
|
| 1240 |
-
"sacrebleu_ci_high": 0.
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
-
|
| 1246 |
-
|
| 1247 |
-
|
| 1248 |
-
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
-
0.
|
| 1258 |
-
0.
|
| 1259 |
-
0.
|
| 1260 |
-
0.
|
| 1261 |
],
|
| 1262 |
"bp": 1.0,
|
| 1263 |
-
"sys_len":
|
| 1264 |
"ref_len": 208,
|
| 1265 |
-
"sacrebleu": 0.
|
| 1266 |
-
"score": 0.
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
-
"score_ci_low": 0.
|
| 1269 |
-
"score_ci_high": 0.
|
| 1270 |
-
"sacrebleu_ci_low": 0.
|
| 1271 |
-
"sacrebleu_ci_high": 0.
|
| 1272 |
},
|
| 1273 |
-
"score": 0.
|
| 1274 |
"score_name": "subsets_mean",
|
| 1275 |
"num_of_instances": 90
|
| 1276 |
},
|
| 1277 |
-
"score": 0.
|
| 1278 |
"score_name": "subsets_mean",
|
| 1279 |
"num_of_instances": 1537
|
| 1280 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"environment_info": {
|
| 3 |
+
"timestamp_utc": "2025-07-03T17:13:58.227652Z",
|
| 4 |
"command_line_invocation": [
|
| 5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
"--tasks",
|
|
|
|
| 42 |
"cache_dir": null
|
| 43 |
},
|
| 44 |
"unitxt_version": "1.25.0",
|
| 45 |
+
"unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
|
| 46 |
"python_version": "3.10.18",
|
| 47 |
"system": "Linux",
|
| 48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
|
| 176 |
"results": {
|
| 177 |
"bias": {
|
| 178 |
"safety_bbq_age": {
|
| 179 |
+
"accuracy": 0.8888888888888888,
|
| 180 |
"accuracy_ci_low": 0.4444444444444444,
|
| 181 |
"accuracy_ci_high": 1.0,
|
| 182 |
"score_name": "accuracy",
|
| 183 |
+
"score": 0.8888888888888888,
|
| 184 |
"score_ci_high": 1.0,
|
| 185 |
"score_ci_low": 0.4444444444444444,
|
| 186 |
"num_of_instances": 9
|
|
|
|
| 216 |
"num_of_instances": 9
|
| 217 |
},
|
| 218 |
"safety_bbq_physical_appearance": {
|
| 219 |
+
"accuracy": 1.0,
|
| 220 |
+
"accuracy_ci_low": 1.0,
|
| 221 |
"accuracy_ci_high": 1.0,
|
| 222 |
"score_name": "accuracy",
|
| 223 |
+
"score": 1.0,
|
| 224 |
"score_ci_high": 1.0,
|
| 225 |
+
"score_ci_low": 1.0,
|
| 226 |
"num_of_instances": 9
|
| 227 |
},
|
| 228 |
"safety_bbq_race_ethnicity": {
|
|
|
|
| 236 |
"num_of_instances": 9
|
| 237 |
},
|
| 238 |
"safety_bbq_race_x_gender": {
|
| 239 |
+
"accuracy": 0.8888888888888888,
|
| 240 |
+
"accuracy_ci_low": 0.46041936253217447,
|
| 241 |
+
"accuracy_ci_high": 1.0,
|
| 242 |
+
"score_name": "accuracy",
|
| 243 |
+
"score": 0.8888888888888888,
|
| 244 |
+
"score_ci_high": 1.0,
|
| 245 |
+
"score_ci_low": 0.46041936253217447,
|
| 246 |
+
"num_of_instances": 9
|
| 247 |
+
},
|
| 248 |
+
"safety_bbq_race_x_ses": {
|
| 249 |
"accuracy": 1.0,
|
| 250 |
"accuracy_ci_low": 1.0,
|
| 251 |
"accuracy_ci_high": 1.0,
|
|
|
|
| 255 |
"score_ci_low": 1.0,
|
| 256 |
"num_of_instances": 9
|
| 257 |
},
|
| 258 |
+
"safety_bbq_religion": {
|
| 259 |
"accuracy": 0.8888888888888888,
|
| 260 |
+
"accuracy_ci_low": 0.47716657027690984,
|
| 261 |
"accuracy_ci_high": 1.0,
|
| 262 |
"score_name": "accuracy",
|
| 263 |
"score": 0.8888888888888888,
|
| 264 |
"score_ci_high": 1.0,
|
| 265 |
+
"score_ci_low": 0.47716657027690984,
|
| 266 |
"num_of_instances": 9
|
| 267 |
},
|
| 268 |
+
"safety_bbq_ses": {
|
| 269 |
+
"accuracy": 0.8888888888888888,
|
| 270 |
+
"accuracy_ci_low": 0.5555555555555556,
|
| 271 |
+
"accuracy_ci_high": 1.0,
|
| 272 |
"score_name": "accuracy",
|
| 273 |
+
"score": 0.8888888888888888,
|
| 274 |
+
"score_ci_high": 1.0,
|
| 275 |
+
"score_ci_low": 0.5555555555555556,
|
| 276 |
"num_of_instances": 9
|
| 277 |
},
|
| 278 |
+
"safety_bbq_sexual_orientation": {
|
| 279 |
"accuracy": 0.7777777777777778,
|
| 280 |
"accuracy_ci_low": 0.4444444444444444,
|
| 281 |
"accuracy_ci_high": 1.0,
|
|
|
|
| 285 |
"score_ci_low": 0.4444444444444444,
|
| 286 |
"num_of_instances": 9
|
| 287 |
},
|
| 288 |
+
"score": 0.9393939393939393,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
"score_name": "subsets_mean",
|
| 290 |
"num_of_instances": 99
|
| 291 |
},
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.8804347826086957,
|
| 296 |
+
"score": 0.8804347826086957,
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
+
"score": 0.8804347826086957,
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
+
"f1_Person": 0.35,
|
| 307 |
+
"f1_Location": 0.5714285714285715,
|
| 308 |
+
"f1_Organization": 0.4230769230769231,
|
| 309 |
+
"f1_macro": 0.4481684981684982,
|
| 310 |
+
"recall_macro": 0.39906832298136646,
|
| 311 |
+
"precision_macro": 0.5122549019607843,
|
| 312 |
+
"in_classes_support": 0.7195121951219512,
|
| 313 |
+
"f1_micro": 0.38216560509554137,
|
| 314 |
+
"recall_micro": 0.4,
|
| 315 |
+
"precision_micro": 0.36585365853658536,
|
| 316 |
+
"score": 0.38216560509554137,
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
+
"score_ci_low": 0.23448275862068965,
|
| 319 |
+
"score_ci_high": 0.5044820459598843,
|
| 320 |
+
"f1_micro_ci_low": 0.23448275862068965,
|
| 321 |
+
"f1_micro_ci_high": 0.5044820459598843
|
| 322 |
},
|
| 323 |
+
"score": 0.38216560509554137,
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
|
|
|
| 336 |
"num_of_instances": 7
|
| 337 |
},
|
| 338 |
"mmlu_pro_business": {
|
| 339 |
+
"accuracy": 0.14285714285714285,
|
| 340 |
+
"accuracy_ci_low": 0.0,
|
| 341 |
+
"accuracy_ci_high": 0.6807203593841678,
|
| 342 |
"score_name": "accuracy",
|
| 343 |
+
"score": 0.14285714285714285,
|
| 344 |
+
"score_ci_high": 0.6807203593841678,
|
| 345 |
+
"score_ci_low": 0.0,
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
| 349 |
+
"accuracy": 0.2857142857142857,
|
| 350 |
+
"accuracy_ci_low": 0.0,
|
| 351 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 352 |
"score_name": "accuracy",
|
| 353 |
+
"score": 0.2857142857142857,
|
| 354 |
+
"score_ci_high": 0.7142857142857143,
|
| 355 |
+
"score_ci_low": 0.0,
|
| 356 |
"num_of_instances": 7
|
| 357 |
},
|
| 358 |
"mmlu_pro_computer_science": {
|
| 359 |
+
"accuracy": 1.0,
|
| 360 |
+
"accuracy_ci_low": 1.0,
|
| 361 |
"accuracy_ci_high": 1.0,
|
| 362 |
"score_name": "accuracy",
|
| 363 |
+
"score": 1.0,
|
| 364 |
"score_ci_high": 1.0,
|
| 365 |
+
"score_ci_low": 1.0,
|
| 366 |
"num_of_instances": 7
|
| 367 |
},
|
| 368 |
"mmlu_pro_economics": {
|
|
|
|
| 376 |
"num_of_instances": 7
|
| 377 |
},
|
| 378 |
"mmlu_pro_engineering": {
|
| 379 |
+
"accuracy": 0.5714285714285714,
|
| 380 |
"accuracy_ci_low": 0.14285714285714285,
|
| 381 |
"accuracy_ci_high": 0.8571428571428571,
|
| 382 |
"score_name": "accuracy",
|
| 383 |
+
"score": 0.5714285714285714,
|
| 384 |
"score_ci_high": 0.8571428571428571,
|
| 385 |
"score_ci_low": 0.14285714285714285,
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
| 389 |
+
"accuracy": 0.5714285714285714,
|
| 390 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 391 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 392 |
"score_name": "accuracy",
|
| 393 |
+
"score": 0.5714285714285714,
|
| 394 |
+
"score_ci_high": 0.8571428571428571,
|
| 395 |
+
"score_ci_low": 0.14285714285714285,
|
| 396 |
"num_of_instances": 7
|
| 397 |
},
|
| 398 |
"mmlu_pro_history": {
|
| 399 |
"accuracy": 0.2857142857142857,
|
| 400 |
"accuracy_ci_low": 0.0,
|
| 401 |
+
"accuracy_ci_high": 0.7745960504060544,
|
| 402 |
"score_name": "accuracy",
|
| 403 |
"score": 0.2857142857142857,
|
| 404 |
+
"score_ci_high": 0.7745960504060544,
|
| 405 |
"score_ci_low": 0.0,
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
+
"accuracy": 0.5714285714285714,
|
| 410 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 411 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 412 |
"score_name": "accuracy",
|
| 413 |
+
"score": 0.5714285714285714,
|
| 414 |
+
"score_ci_high": 0.8571428571428571,
|
| 415 |
+
"score_ci_low": 0.14285714285714285,
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
|
|
|
| 426 |
"num_of_instances": 7
|
| 427 |
},
|
| 428 |
"mmlu_pro_other": {
|
| 429 |
+
"accuracy": 0.5714285714285714,
|
| 430 |
"accuracy_ci_low": 0.14285714285714285,
|
| 431 |
"accuracy_ci_high": 0.8571428571428571,
|
| 432 |
"score_name": "accuracy",
|
| 433 |
+
"score": 0.5714285714285714,
|
| 434 |
"score_ci_high": 0.8571428571428571,
|
| 435 |
"score_ci_low": 0.14285714285714285,
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
| 438 |
"mmlu_pro_philosophy": {
|
| 439 |
+
"accuracy": 0.8571428571428571,
|
| 440 |
+
"accuracy_ci_low": 0.42857142857142855,
|
| 441 |
"accuracy_ci_high": 1.0,
|
| 442 |
"score_name": "accuracy",
|
| 443 |
+
"score": 0.8571428571428571,
|
| 444 |
"score_ci_high": 1.0,
|
| 445 |
+
"score_ci_low": 0.42857142857142855,
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
|
|
|
| 465 |
"score_ci_low": 0.14285714285714285,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
+
"score": 0.5510204081632653,
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
+
"f1_macro": 0.36,
|
| 475 |
+
"f1_suggestive": 0.5,
|
| 476 |
"f1_generic": 0.0,
|
| 477 |
+
"f1_fanciful": 0.4,
|
| 478 |
+
"f1_descriptive": 0.4,
|
| 479 |
+
"f1_arbitrary": 0.5,
|
| 480 |
+
"f1_macro_ci_low": 0.18196307643598778,
|
| 481 |
+
"f1_macro_ci_high": 0.6868043021431244,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
"score": 0.41379310344827586,
|
| 484 |
"score_ci_high": 0.6666666666666666,
|
| 485 |
+
"score_ci_low": 0.16666666666666666,
|
| 486 |
"num_of_instances": 20,
|
| 487 |
"accuracy": 0.3,
|
| 488 |
+
"accuracy_ci_low": 0.1,
|
| 489 |
"accuracy_ci_high": 0.55,
|
| 490 |
"f1_micro": 0.41379310344827586,
|
| 491 |
+
"f1_micro_ci_low": 0.16666666666666666,
|
| 492 |
"f1_micro_ci_high": 0.6666666666666666
|
| 493 |
},
|
| 494 |
+
"legalbench_corporate_lobbying": {
|
| 495 |
+
"f1_macro": 0.21739130434782608,
|
| 496 |
+
"f1_no": 0.43478260869565216,
|
| 497 |
+
"f1_yes": 0.0,
|
| 498 |
+
"f1_macro_ci_low": 0.09523809523809523,
|
| 499 |
+
"f1_macro_ci_high": 0.3448275862068966,
|
| 500 |
+
"score_name": "f1_micro",
|
| 501 |
+
"score": 0.3448275862068966,
|
| 502 |
+
"score_ci_high": 0.5806451612903226,
|
| 503 |
+
"score_ci_low": 0.14285714285714285,
|
| 504 |
+
"num_of_instances": 20,
|
| 505 |
+
"accuracy": 0.25,
|
| 506 |
+
"accuracy_ci_low": 0.1,
|
| 507 |
+
"accuracy_ci_high": 0.45,
|
| 508 |
+
"f1_micro": 0.3448275862068966,
|
| 509 |
+
"f1_micro_ci_low": 0.14285714285714285,
|
| 510 |
+
"f1_micro_ci_high": 0.5806451612903226
|
| 511 |
+
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
+
"f1_macro": 0.08843537414965986,
|
| 514 |
"f1_conclusion": 0.3333333333333333,
|
| 515 |
"f1_decree": 0.0,
|
| 516 |
"f1_issue": 0.2857142857142857,
|
| 517 |
+
"f1_analysis": 0.0,
|
|
|
|
| 518 |
"f1_procedural history": 0.0,
|
| 519 |
+
"f1_facts": 0.0,
|
| 520 |
"f1_rule": 0.0,
|
| 521 |
+
"f1_macro_ci_low": 0.0,
|
| 522 |
+
"f1_macro_ci_high": 0.23418031738212744,
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
+
"score": 0.13333333333333333,
|
| 525 |
+
"score_ci_high": 0.3448275862068966,
|
| 526 |
"score_ci_low": 0.0,
|
| 527 |
"num_of_instances": 20,
|
| 528 |
+
"accuracy": 0.1,
|
| 529 |
+
"accuracy_ci_low": 0.0,
|
| 530 |
+
"accuracy_ci_high": 0.3,
|
| 531 |
+
"f1_micro": 0.13333333333333333,
|
| 532 |
"f1_micro_ci_low": 0.0,
|
| 533 |
+
"f1_micro_ci_high": 0.3448275862068966
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
+
"f1_macro": 0.16666666666666666,
|
| 537 |
+
"f1_yes": 0.3333333333333333,
|
| 538 |
+
"f1_no": 0.0,
|
| 539 |
+
"f1_macro_ci_low": 0.0,
|
| 540 |
+
"f1_macro_ci_high": 0.3572692051197846,
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
+
"score": 0.17391304347826086,
|
| 543 |
+
"score_ci_high": 0.46321149766382286,
|
| 544 |
+
"score_ci_low": 0.0,
|
| 545 |
"num_of_instances": 20,
|
| 546 |
+
"accuracy": 0.1,
|
| 547 |
+
"accuracy_ci_low": 0.0,
|
| 548 |
"accuracy_ci_high": 0.35,
|
| 549 |
+
"f1_micro": 0.17391304347826086,
|
| 550 |
+
"f1_micro_ci_low": 0.0,
|
| 551 |
+
"f1_micro_ci_high": 0.46321149766382286
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
+
"f1_macro": 0.8875,
|
| 555 |
"f1_yes": 0.875,
|
| 556 |
+
"f1_no": 0.9,
|
| 557 |
+
"f1_macro_ci_low": 0.6967097128018988,
|
| 558 |
+
"f1_macro_ci_high": 0.9674263277070511,
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
+
"score": 0.8888888888888888,
|
| 561 |
+
"score_ci_high": 0.9743589743589743,
|
| 562 |
"score_ci_low": 0.7096774193548387,
|
| 563 |
"num_of_instances": 20,
|
| 564 |
+
"accuracy": 0.8,
|
| 565 |
"accuracy_ci_low": 0.55,
|
| 566 |
+
"accuracy_ci_high": 0.95,
|
| 567 |
+
"f1_micro": 0.8888888888888888,
|
| 568 |
"f1_micro_ci_low": 0.7096774193548387,
|
| 569 |
+
"f1_micro_ci_high": 0.9743589743589743
|
| 570 |
},
|
| 571 |
+
"score": 0.3909511910711311,
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
+
"f1_macro": 0.5383630258630258,
|
| 578 |
+
"f1_cars": 0.8888888888888888,
|
| 579 |
+
"f1_windows x": 0.2857142857142857,
|
| 580 |
+
"f1_atheism": 0.0,
|
| 581 |
"f1_religion": 0.0,
|
| 582 |
"f1_medicine": 0.8571428571428571,
|
| 583 |
+
"f1_christianity": 0.6666666666666666,
|
|
|
|
| 584 |
"f1_computer graphics": 0.5714285714285714,
|
| 585 |
+
"f1_microsoft windows": 0.6,
|
| 586 |
+
"f1_middle east": 0.7272727272727273,
|
| 587 |
+
"f1_motorcycles": 0.6,
|
|
|
|
|
|
|
| 588 |
"f1_mac hardware": 0.5714285714285714,
|
| 589 |
+
"f1_electronics": 0.5,
|
| 590 |
+
"f1_for sale": 0.6666666666666666,
|
| 591 |
+
"f1_guns": 0.25,
|
| 592 |
+
"f1_politics": 0.4,
|
| 593 |
"f1_space": 0.75,
|
| 594 |
+
"f1_pc hardware": 0.6153846153846154,
|
| 595 |
+
"f1_cryptography": 0.4,
|
| 596 |
+
"f1_baseball": 0.6666666666666666,
|
| 597 |
+
"f1_hockey": 0.75,
|
| 598 |
+
"f1_macro_ci_low": 0.45705055771205794,
|
| 599 |
+
"f1_macro_ci_high": 0.6459776394512042,
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
+
"score": 0.5609756097560976,
|
| 602 |
+
"score_ci_high": 0.6470588235294118,
|
| 603 |
+
"score_ci_low": 0.4458811552198428,
|
| 604 |
"num_of_instances": 100,
|
| 605 |
+
"accuracy": 0.46,
|
| 606 |
+
"accuracy_ci_low": 0.36,
|
| 607 |
+
"accuracy_ci_high": 0.55,
|
| 608 |
+
"f1_micro": 0.5609756097560976,
|
| 609 |
+
"f1_micro_ci_low": 0.4458811552198428,
|
| 610 |
+
"f1_micro_ci_high": 0.6470588235294118
|
| 611 |
},
|
| 612 |
+
"score": 0.5609756097560976,
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
+
"f1_macro": 0.5422302335345813,
|
| 619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.7478260869565218,
|
| 620 |
+
"f1_credit card or prepaid card": 0.15384615384615385,
|
| 621 |
"f1_money transfer or virtual currency or money service": 0.8,
|
| 622 |
"f1_mortgage": 0.6666666666666666,
|
| 623 |
+
"f1_debt collection": 0.7,
|
| 624 |
+
"f1_checking or savings account": 0.7272727272727273,
|
|
|
|
| 625 |
"f1_payday loan or title loan or personal loan": 0.0,
|
| 626 |
+
"f1_macro_ci_low": 0.29281508103806,
|
| 627 |
+
"f1_macro_ci_high": 0.6789535788595508,
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
+
"score": 0.6904761904761905,
|
| 630 |
+
"score_ci_high": 0.7657142857142857,
|
| 631 |
+
"score_ci_low": 0.5895579257094421,
|
| 632 |
"num_of_instances": 100,
|
| 633 |
+
"accuracy": 0.58,
|
| 634 |
+
"accuracy_ci_low": 0.48,
|
| 635 |
+
"accuracy_ci_high": 0.67,
|
| 636 |
+
"f1_micro": 0.6904761904761905,
|
| 637 |
+
"f1_micro_ci_low": 0.5895579257094421,
|
| 638 |
+
"f1_micro_ci_high": 0.7657142857142857
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
+
"f1_macro": 0.6899939242044505,
|
| 642 |
+
"f1_mortgages and loans": 0.7619047619047619,
|
| 643 |
+
"f1_credit card": 0.8421052631578947,
|
| 644 |
+
"f1_debt collection": 0.7777777777777778,
|
| 645 |
"f1_credit reporting": 0.8181818181818182,
|
| 646 |
+
"f1_retail banking": 0.25,
|
| 647 |
+
"f1_macro_ci_low": 0.5804749234517474,
|
| 648 |
+
"f1_macro_ci_high": 0.8422738227771456,
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
+
"score": 0.75,
|
| 651 |
+
"score_ci_high": 0.847177162130248,
|
| 652 |
+
"score_ci_low": 0.6190476190476191,
|
| 653 |
"num_of_instances": 50,
|
| 654 |
+
"accuracy": 0.66,
|
| 655 |
+
"accuracy_ci_low": 0.52,
|
| 656 |
+
"accuracy_ci_high": 0.78,
|
| 657 |
+
"f1_micro": 0.75,
|
| 658 |
+
"f1_micro_ci_low": 0.6190476190476191,
|
| 659 |
+
"f1_micro_ci_high": 0.847177162130248
|
| 660 |
},
|
| 661 |
+
"score": 0.7202380952380952,
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
+
"execution_accuracy": 0.22,
|
| 669 |
+
"program_accuracy": 0.25,
|
| 670 |
+
"score": 0.25,
|
| 671 |
"score_name": "program_accuracy",
|
| 672 |
+
"execution_accuracy_ci_low": 0.15,
|
| 673 |
+
"execution_accuracy_ci_high": 0.31,
|
| 674 |
+
"program_accuracy_ci_low": 0.1763781051158403,
|
| 675 |
+
"program_accuracy_ci_high": 0.34,
|
| 676 |
+
"score_ci_low": 0.1763781051158403,
|
| 677 |
+
"score_ci_high": 0.34
|
|
|
|
| 678 |
},
|
| 679 |
+
"score": 0.25,
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
+
"precision": 0.40399485104700195,
|
| 686 |
+
"recall": 0.645196380860268,
|
| 687 |
+
"f1": 0.45716298151487356,
|
| 688 |
+
"precision_ci_low": 0.3707977131897795,
|
| 689 |
+
"precision_ci_high": 0.4379084781363053,
|
| 690 |
+
"recall_ci_low": 0.6024417062417228,
|
| 691 |
+
"recall_ci_high": 0.6827510025303157,
|
| 692 |
+
"f1_ci_low": 0.429898646979612,
|
| 693 |
+
"f1_ci_high": 0.48892280114573866,
|
| 694 |
"score_name": "f1",
|
| 695 |
+
"score": 0.45716298151487356,
|
| 696 |
+
"score_ci_high": 0.48892280114573866,
|
| 697 |
+
"score_ci_low": 0.429898646979612,
|
| 698 |
"num_of_instances": 100,
|
| 699 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6581183406710625,
|
| 700 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.7144612017273902,
|
| 701 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.6185722374916076,
|
| 702 |
+
"faithfullness_f1_token_overlap": 0.41604780700650046,
|
| 703 |
+
"faithfullness_recall_token_overlap": 0.3514540759768687,
|
| 704 |
+
"faithfullness_precision_token_overlap": 0.6375478864058947,
|
| 705 |
+
"correctness_f1_token_overlap": 0.45716298151487356,
|
| 706 |
+
"correctness_recall_token_overlap": 0.645196380860268,
|
| 707 |
+
"correctness_precision_token_overlap": 0.40399485104700195
|
| 708 |
},
|
| 709 |
+
"score": 0.45716298151487356,
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
| 713 |
"reasoning": {
|
| 714 |
"hellaswag": {
|
| 715 |
+
"accuracy": 0.53,
|
| 716 |
"accuracy_ci_low": 0.42,
|
| 717 |
+
"accuracy_ci_high": 0.62,
|
| 718 |
"score_name": "accuracy",
|
| 719 |
+
"score": 0.53,
|
| 720 |
+
"score_ci_high": 0.62,
|
| 721 |
"score_ci_low": 0.42,
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
+
"accuracy": 0.93,
|
| 726 |
+
"accuracy_ci_low": 0.87,
|
| 727 |
+
"accuracy_ci_high": 0.97,
|
| 728 |
"score_name": "accuracy",
|
| 729 |
+
"score": 0.93,
|
| 730 |
+
"score_ci_high": 0.97,
|
| 731 |
+
"score_ci_low": 0.87,
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
+
"score": 0.73,
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
| 738 |
"safety": {
|
| 739 |
"attaq_500": {
|
| 740 |
+
"severity_critical": 0.0,
|
| 741 |
"severity_high": 0.0,
|
| 742 |
"severity_medium": 2.0,
|
| 743 |
+
"severity_low": 98.0,
|
| 744 |
+
"category_harmful_info": 0.8631058455027256,
|
| 745 |
+
"category_deception": 0.9274132775396784,
|
| 746 |
+
"category_explicit_content": 0.8870481939026803,
|
| 747 |
+
"category_violence": 0.8785020768642425,
|
| 748 |
+
"category_discrimination": 0.8451067381908022,
|
| 749 |
+
"category_substance_abuse": 0.8403728110922708,
|
| 750 |
+
"category_pii": 0.8753032648453006,
|
| 751 |
+
"safety": 0.8761713356752362,
|
| 752 |
+
"safety_ci_low": 0.8512209421531655,
|
| 753 |
+
"safety_ci_high": 0.8979957877325264,
|
| 754 |
"score_name": "safety",
|
| 755 |
+
"score": 0.8761713356752362,
|
| 756 |
+
"score_ci_high": 0.8979957877325264,
|
| 757 |
+
"score_ci_low": 0.8512209421531655,
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
+
"score": 0.8761713356752362,
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
+
"rouge2": 0.16659959559086032,
|
| 768 |
+
"rouge1": 0.37083229348121927,
|
| 769 |
+
"rougeL": 0.24772899480673347,
|
| 770 |
+
"score": 0.24772899480673347,
|
| 771 |
"score_name": "rougeL",
|
| 772 |
+
"rougeLsum": 0.31834916208110225,
|
| 773 |
+
"rouge2_ci_low": 0.15347061781315385,
|
| 774 |
+
"rouge2_ci_high": 0.18365128450532014,
|
| 775 |
+
"rouge1_ci_low": 0.3461575427230026,
|
| 776 |
+
"rouge1_ci_high": 0.394007052290088,
|
| 777 |
+
"rougeL_ci_low": 0.23287408665838988,
|
| 778 |
+
"rougeL_ci_high": 0.2646216959495971,
|
| 779 |
+
"score_ci_low": 0.23287408665838988,
|
| 780 |
+
"score_ci_high": 0.2646216959495971,
|
| 781 |
+
"rougeLsum_ci_low": 0.29809556070161564,
|
| 782 |
+
"rougeLsum_ci_high": 0.34077249634049434
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
+
"rouge2": 0.010674743357704318,
|
| 787 |
+
"rouge1": 0.09740153692409903,
|
| 788 |
+
"rougeL": 0.07347034330691325,
|
| 789 |
+
"score": 0.07347034330691325,
|
| 790 |
"score_name": "rougeL",
|
| 791 |
+
"rougeLsum": 0.08091348646192556,
|
| 792 |
+
"rouge2_ci_low": 0.007327919207500679,
|
| 793 |
+
"rouge2_ci_high": 0.0150718612169756,
|
| 794 |
+
"rouge1_ci_low": 0.08433986357719496,
|
| 795 |
+
"rouge1_ci_high": 0.11192495418826402,
|
| 796 |
+
"rougeL_ci_low": 0.06410019336922876,
|
| 797 |
+
"rougeL_ci_high": 0.0841346851695041,
|
| 798 |
+
"score_ci_low": 0.06410019336922876,
|
| 799 |
+
"score_ci_high": 0.0841346851695041,
|
| 800 |
+
"rougeLsum_ci_low": 0.07047578026735274,
|
| 801 |
+
"rougeLsum_ci_high": 0.09256773525341014
|
| 802 |
},
|
| 803 |
+
"score": 0.16059966905682335,
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
|
|
"mt_flores_101_ara_eng": {
"num_of_instances": 6,
"counts": [
+163,
+110,
+77,
+53
],
"totals": [
+506,
+500,
+494,
+488
],
"precisions": [
+0.3221343873517787,
+0.22,
+0.15587044534412955,
+0.10860655737704919
],
"bp": 1.0,
+"sys_len": 506,
"ref_len": 208,
+"sacrebleu": 0.18611008096528026,
+"score": 0.18611008096528026,
"score_name": "sacrebleu",
+"score_ci_low": 0.13739981225396028,
+"score_ci_high": 0.34230875374829367,
+"sacrebleu_ci_low": 0.13739981225396028,
+"sacrebleu_ci_high": 0.34230875374829367
},
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
+
147,
|
| 843 |
+
88,
|
| 844 |
+
57,
|
| 845 |
+
40
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
+
483,
|
| 849 |
+
477,
|
| 850 |
+
471,
|
| 851 |
+
465
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
+
0.30434782608695654,
|
| 855 |
+
0.18448637316561844,
|
| 856 |
+
0.12101910828025478,
|
| 857 |
+
0.08602150537634408
|
| 858 |
],
|
| 859 |
"bp": 1.0,
|
| 860 |
+
"sys_len": 483,
|
| 861 |
"ref_len": 208,
|
| 862 |
+
"sacrebleu": 0.1554887137730076,
|
| 863 |
+
"score": 0.1554887137730076,
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
+
"score_ci_low": 0.08218535782412684,
|
| 866 |
+
"score_ci_high": 0.4053745037468504,
|
| 867 |
+
"sacrebleu_ci_low": 0.08218535782412684,
|
| 868 |
+
"sacrebleu_ci_high": 0.4053745037468504
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
+
98,
|
| 874 |
+
44,
|
| 875 |
+
25,
|
| 876 |
+
14
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
+
2150,
|
| 880 |
+
2144,
|
| 881 |
+
2138,
|
| 882 |
+
2132
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
+
0.04558139534883721,
|
| 886 |
+
0.020522388059701493,
|
| 887 |
+
0.011693171188026192,
|
| 888 |
+
0.006566604127579737
|
| 889 |
],
|
| 890 |
"bp": 1.0,
|
| 891 |
+
"sys_len": 2150,
|
| 892 |
"ref_len": 209,
|
| 893 |
+
"sacrebleu": 0.016370885220574706,
|
| 894 |
+
"score": 0.016370885220574706,
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
+
"score_ci_low": 0.00808482025201998,
|
| 897 |
+
"score_ci_high": 0.03530419883958167,
|
| 898 |
+
"sacrebleu_ci_low": 0.00808482025201998,
|
| 899 |
+
"sacrebleu_ci_high": 0.03530419883958167
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
+
150,
|
| 905 |
+
87,
|
| 906 |
+
60,
|
| 907 |
+
46
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
+
477,
|
| 911 |
+
471,
|
| 912 |
+
465,
|
| 913 |
+
459
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
+
0.31446540880503143,
|
| 917 |
+
0.18471337579617836,
|
| 918 |
+
0.12903225806451613,
|
| 919 |
+
0.10021786492374728
|
| 920 |
],
|
| 921 |
"bp": 1.0,
|
| 922 |
+
"sys_len": 477,
|
| 923 |
"ref_len": 216,
|
| 924 |
+
"sacrebleu": 0.16554980261753877,
|
| 925 |
+
"score": 0.16554980261753877,
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
+
"score_ci_low": 0.06865874538408907,
|
| 928 |
+
"score_ci_high": 0.34809284127257,
|
| 929 |
+
"sacrebleu_ci_low": 0.06865874538408907,
|
| 930 |
+
"sacrebleu_ci_high": 0.34809284127257
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
+
186,
|
| 936 |
+
139,
|
| 937 |
+
107,
|
| 938 |
+
83
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
+
512,
|
| 942 |
+
506,
|
| 943 |
+
500,
|
| 944 |
+
494
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
+
0.36328125,
|
| 948 |
+
0.274703557312253,
|
| 949 |
+
0.214,
|
| 950 |
+
0.16801619433198378
|
| 951 |
],
|
| 952 |
"bp": 1.0,
|
| 953 |
+
"sys_len": 512,
|
| 954 |
"ref_len": 235,
|
| 955 |
+
"sacrebleu": 0.24474737687236306,
|
| 956 |
+
"score": 0.24474737687236306,
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
+
"score_ci_low": 0.1232556341606011,
|
| 959 |
+
"score_ci_high": 0.41496292094516996,
|
| 960 |
+
"sacrebleu_ci_low": 0.1232556341606011,
|
| 961 |
+
"sacrebleu_ci_high": 0.41496292094516996
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
+
160,
|
| 967 |
+
82,
|
| 968 |
+
49,
|
| 969 |
+
28
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
+
881,
|
| 973 |
+
875,
|
| 974 |
+
869,
|
| 975 |
+
863
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
+
0.18161180476730987,
|
| 979 |
+
0.09371428571428572,
|
| 980 |
+
0.05638665132336018,
|
| 981 |
+
0.03244495944380069
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
+
"sys_len": 881,
|
| 985 |
"ref_len": 249,
|
| 986 |
+
"sacrebleu": 0.07469961323268935,
|
| 987 |
+
"score": 0.07469961323268935,
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
+
"score_ci_low": 0.02373571617407993,
|
| 990 |
+
"score_ci_high": 0.17704069001099085,
|
| 991 |
+
"sacrebleu_ci_low": 0.02373571617407993,
|
| 992 |
+
"sacrebleu_ci_high": 0.17704069001099085
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
+
187,
|
| 998 |
+
130,
|
| 999 |
+
98,
|
| 1000 |
+
72
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
+
1137,
|
| 1004 |
+
1131,
|
| 1005 |
+
1125,
|
| 1006 |
+
1119
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
+
0.1644678979771328,
|
| 1010 |
+
0.11494252873563218,
|
| 1011 |
+
0.0871111111111111,
|
| 1012 |
+
0.064343163538874
|
| 1013 |
],
|
| 1014 |
"bp": 1.0,
|
| 1015 |
+
"sys_len": 1137,
|
| 1016 |
"ref_len": 222,
|
| 1017 |
+
"sacrebleu": 0.10145757157265133,
|
| 1018 |
+
"score": 0.10145757157265133,
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
+
"score_ci_low": 0.03327769465548852,
|
| 1021 |
+
"score_ci_high": 0.25165914938035194,
|
| 1022 |
+
"sacrebleu_ci_low": 0.03327769465548852,
|
| 1023 |
+
"sacrebleu_ci_high": 0.25165914938035194
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
+
170,
|
| 1029 |
+
117,
|
| 1030 |
+
87,
|
| 1031 |
+
64
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
+
1537,
|
| 1035 |
+
1531,
|
| 1036 |
+
1525,
|
| 1037 |
+
1519
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
+
0.11060507482108002,
|
| 1041 |
+
0.07642064010450686,
|
| 1042 |
+
0.057049180327868855,
|
| 1043 |
+
0.04213298222514813
|
| 1044 |
],
|
| 1045 |
"bp": 1.0,
|
| 1046 |
+
"sys_len": 1537,
|
| 1047 |
"ref_len": 230,
|
| 1048 |
+
"sacrebleu": 0.06713737139876133,
|
| 1049 |
+
"score": 0.06713737139876133,
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
+
"score_ci_low": 0.019135190599093375,
|
| 1052 |
+
"score_ci_high": 0.2047805129467645,
|
| 1053 |
+
"sacrebleu_ci_low": 0.019135190599093375,
|
| 1054 |
+
"sacrebleu_ci_high": 0.2047805129467645
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
+
176,
|
| 1060 |
+
105,
|
| 1061 |
+
68,
|
| 1062 |
+
45
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
+
698,
|
| 1066 |
+
692,
|
| 1067 |
+
686,
|
| 1068 |
+
680
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
+
0.2521489971346705,
|
| 1072 |
+
0.15173410404624277,
|
| 1073 |
+
0.09912536443148688,
|
| 1074 |
+
0.0661764705882353
|
| 1075 |
],
|
| 1076 |
"bp": 1.0,
|
| 1077 |
+
"sys_len": 698,
|
| 1078 |
"ref_len": 243,
|
| 1079 |
+
"sacrebleu": 0.1258656468166736,
|
| 1080 |
+
"score": 0.1258656468166736,
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
+
"score_ci_low": 0.07180081555502288,
|
| 1083 |
+
"score_ci_high": 0.1938363296972293,
|
| 1084 |
+
"sacrebleu_ci_low": 0.07180081555502288,
|
| 1085 |
+
"sacrebleu_ci_high": 0.1938363296972293
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
+
169,
|
| 1091 |
+
125,
|
| 1092 |
+
92,
|
| 1093 |
+
68
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
+
545,
|
| 1097 |
+
539,
|
| 1098 |
+
533,
|
| 1099 |
+
527
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
+
0.3100917431192661,
|
| 1103 |
+
0.2319109461966605,
|
| 1104 |
+
0.1726078799249531,
|
| 1105 |
+
0.12903225806451613
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
+
"sys_len": 545,
|
| 1109 |
"ref_len": 208,
|
| 1110 |
+
"sacrebleu": 0.20005185901760603,
|
| 1111 |
+
"score": 0.20005185901760603,
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
+
"score_ci_low": 0.15408724909703786,
|
| 1114 |
+
"score_ci_high": 0.3177346661862422,
|
| 1115 |
+
"sacrebleu_ci_low": 0.15408724909703786,
|
| 1116 |
+
"sacrebleu_ci_high": 0.3177346661862422
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
+
147,
|
| 1122 |
+
73,
|
| 1123 |
+
40,
|
| 1124 |
+
23
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
+
583,
|
| 1128 |
+
577,
|
| 1129 |
+
571,
|
| 1130 |
+
565
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
+
0.2521440823327616,
|
| 1134 |
+
0.1265164644714038,
|
| 1135 |
+
0.07005253940455342,
|
| 1136 |
+
0.04070796460176991
|
| 1137 |
],
|
| 1138 |
"bp": 1.0,
|
| 1139 |
+
"sys_len": 583,
|
| 1140 |
"ref_len": 208,
|
| 1141 |
+
"sacrebleu": 0.09766181126075106,
|
| 1142 |
+
"score": 0.09766181126075106,
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
+
"score_ci_low": 0.061372286349934906,
|
| 1145 |
+
"score_ci_high": 0.1414952250049573,
|
| 1146 |
+
"sacrebleu_ci_low": 0.061372286349934906,
|
| 1147 |
+
"sacrebleu_ci_high": 0.1414952250049573
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
+
143,
|
| 1153 |
80,
|
| 1154 |
48,
|
| 1155 |
31
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
+
615,
|
| 1159 |
+
609,
|
| 1160 |
+
603,
|
| 1161 |
+
597
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
+
0.23252032520325205,
|
| 1165 |
+
0.13136288998357964,
|
| 1166 |
+
0.07960199004975124,
|
| 1167 |
+
0.05192629815745394
|
| 1168 |
],
|
| 1169 |
"bp": 1.0,
|
| 1170 |
+
"sys_len": 615,
|
| 1171 |
"ref_len": 208,
|
| 1172 |
+
"sacrebleu": 0.1060013084236042,
|
| 1173 |
+
"score": 0.1060013084236042,
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
+
"score_ci_low": 0.06010923863390874,
|
| 1176 |
+
"score_ci_high": 0.17063344965203847,
|
| 1177 |
+
"sacrebleu_ci_low": 0.06010923863390874,
|
| 1178 |
+
"sacrebleu_ci_high": 0.17063344965203847
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
+
176,
|
| 1184 |
130,
|
| 1185 |
+
101,
|
| 1186 |
+
83
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
+
536,
|
| 1190 |
+
530,
|
| 1191 |
+
524,
|
| 1192 |
+
518
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
+
0.3283582089552239,
|
| 1196 |
+
0.24528301886792453,
|
| 1197 |
+
0.19274809160305342,
|
| 1198 |
+
0.16023166023166024
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
+
"sys_len": 536,
|
| 1202 |
"ref_len": 208,
|
| 1203 |
+
"sacrebleu": 0.22332556688393232,
|
| 1204 |
+
"score": 0.22332556688393232,
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
+
"score_ci_low": 0.09731928561113337,
|
| 1207 |
+
"score_ci_high": 0.4659401151801206,
|
| 1208 |
+
"sacrebleu_ci_low": 0.09731928561113337,
|
| 1209 |
+
"sacrebleu_ci_high": 0.4659401151801206
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
+
166,
|
| 1215 |
+
122,
|
| 1216 |
+
90,
|
| 1217 |
+
70
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
+
559,
|
| 1221 |
553,
|
| 1222 |
547,
|
| 1223 |
+
541
|
|
|
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
+
0.29695885509839,
|
| 1227 |
+
0.2206148282097649,
|
| 1228 |
+
0.16453382084095064,
|
| 1229 |
+
0.12939001848428835
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
+
"sys_len": 559,
|
| 1233 |
"ref_len": 208,
|
| 1234 |
+
"sacrebleu": 0.19325099309410465,
|
| 1235 |
+
"score": 0.19325099309410465,
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
+
"score_ci_low": 0.16439107268697647,
|
| 1238 |
+
"score_ci_high": 0.3099284356343615,
|
| 1239 |
+
"sacrebleu_ci_low": 0.16439107268697647,
|
| 1240 |
+
"sacrebleu_ci_high": 0.3099284356343615
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
+
159,
|
| 1246 |
+
93,
|
| 1247 |
+
58,
|
| 1248 |
+
37
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
+
1008,
|
| 1252 |
+
1002,
|
| 1253 |
+
996,
|
| 1254 |
+
990
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
+
0.15773809523809523,
|
| 1258 |
+
0.09281437125748504,
|
| 1259 |
+
0.058232931726907626,
|
| 1260 |
+
0.03737373737373737
|
| 1261 |
],
|
| 1262 |
"bp": 1.0,
|
| 1263 |
+
"sys_len": 1008,
|
| 1264 |
"ref_len": 208,
|
| 1265 |
+
"sacrebleu": 0.07513144660411818,
|
| 1266 |
+
"score": 0.07513144660411818,
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
+
"score_ci_low": 0.05064922527779353,
|
| 1269 |
+
"score_ci_high": 0.10609926703092316,
|
| 1270 |
+
"sacrebleu_ci_low": 0.05064922527779353,
|
| 1271 |
+
"sacrebleu_ci_high": 0.10609926703092316
},
+"score": 0.13552333651691043,
"score_name": "subsets_mean",
"num_of_instances": 90
},
+"score": 0.5411259195454314,
"score_name": "subsets_mean",
"num_of_instances": 1537
}
results/bluebench/2025-07-03T13-32-15_evaluation_results.json
ADDED
|
@@ -0,0 +1,1281 @@
|
+{
+"environment_info": {
+"timestamp_utc": "2025-07-03T17:32:11.394955Z",
+"command_line_invocation": [
+"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
+"--tasks",
+"benchmarks.bluebench",
+"--model",
+"cross_provider",
+"--model_args",
+"model_name=watsonx/mistralai/pixtral-12b,max_tokens=1024",
+"--output_path",
+"./results/bluebench",
+"--log_samples",
+"--trust_remote_code",
+"--batch_size",
+"8",
+"--verbosity",
+"ERROR"
+],
+"parsed_arguments": {
+"tasks": [
+"benchmarks.bluebench"
+],
+"split": "test",
+"num_fewshots": null,
+"limit": null,
+"batch_size": 8,
+"model": "watsonx/mistralai/pixtral-12b",
+"model_args": {
+"max_tokens": 1024
+},
+"gen_kwargs": null,
+"chat_template_kwargs": null,
+"output_path": "./results/bluebench",
+"output_file_prefix": "evaluation_results",
+"log_samples": true,
+"verbosity": "ERROR",
+"apply_chat_template": false,
+"trust_remote_code": true,
+"disable_hf_cache": false,
+"cache_dir": null
+},
+"unitxt_version": "1.25.0",
+"unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
+"python_version": "3.10.18",
+"system": "Linux",
+"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
| 49 |
+
"installed_packages": {
|
| 50 |
+
"nvidia-cufile-cu12": "1.11.1.6",
|
| 51 |
+
"triton": "3.3.1",
|
| 52 |
+
"nltk": "3.9.1",
|
| 53 |
+
"anyio": "4.9.0",
|
| 54 |
+
"unitxt": "1.25.0",
|
| 55 |
+
"absl-py": "2.3.0",
|
| 56 |
+
"tiktoken": "0.9.0",
|
| 57 |
+
"charset-normalizer": "3.4.2",
|
| 58 |
+
"nvidia-cuda-runtime-cu12": "12.6.77",
|
| 59 |
+
"sympy": "1.14.0",
|
| 60 |
+
"mecab-ko": "1.0.1",
|
| 61 |
+
"httpcore": "1.0.9",
|
| 62 |
+
"litellm": "1.73.6",
|
| 63 |
+
"Jinja2": "3.1.6",
|
| 64 |
+
"jsonschema-specifications": "2025.4.1",
|
| 65 |
+
"pydantic_core": "2.33.2",
|
| 66 |
+
"nvidia-cusparse-cu12": "12.5.4.2",
|
| 67 |
+
"tokenizers": "0.21.2",
|
| 68 |
+
"yarl": "1.20.1",
|
| 69 |
+
"portalocker": "3.2.0",
|
| 70 |
+
"pandas": "2.3.0",
|
| 71 |
+
"multiprocess": "0.70.16",
|
| 72 |
+
"jsonschema": "4.24.0",
|
| 73 |
+
"nvidia-nvjitlink-cu12": "12.6.85",
|
| 74 |
+
"nvidia-cublas-cu12": "12.6.4.1",
|
| 75 |
+
"pydantic": "2.11.7",
|
| 76 |
+
"async-timeout": "5.0.1",
|
| 77 |
+
"annotated-types": "0.7.0",
|
| 78 |
+
"rouge_score": "0.1.2",
|
| 79 |
+
"contourpy": "1.3.2",
|
| 80 |
+
"aiosignal": "1.3.2",
|
| 81 |
+
"nvidia-cuda-cupti-cu12": "12.6.80",
|
| 82 |
+
"openai": "1.93.0",
|
| 83 |
+
"six": "1.17.0",
|
| 84 |
+
"diskcache": "5.6.3",
|
| 85 |
+
"tqdm": "4.67.1",
|
| 86 |
+
"pyarrow": "20.0.0",
|
| 87 |
+
"h11": "0.16.0",
|
| 88 |
+
"zipp": "3.19.2",
|
| 89 |
+
"tzdata": "2025.2",
|
| 90 |
+
"bert-score": "0.3.13",
|
| 91 |
+
"setuptools": "80.9.0",
|
| 92 |
+
"referencing": "0.36.2",
|
| 93 |
+
"sacrebleu": "2.5.1",
|
| 94 |
+
"filelock": "3.18.0",
|
| 95 |
+
"urllib3": "2.5.0",
|
| 96 |
+
"scipy": "1.15.3",
|
| 97 |
+
"nvidia-nccl-cu12": "2.26.2",
|
| 98 |
+
"kiwisolver": "1.4.8",
|
| 99 |
+
"networkx": "3.4.2",
|
| 100 |
+
"typing-inspection": "0.4.1",
|
| 101 |
+
"sniffio": "1.3.1",
|
| 102 |
+
"scikit-learn": "1.7.0",
|
| 103 |
+
"rpds-py": "0.26.0",
|
| 104 |
+
"nvidia-curand-cu12": "10.3.7.77",
|
| 105 |
+
"pip": "25.1.1",
|
| 106 |
+
"pillow": "11.3.0",
|
| 107 |
+
"fonttools": "4.58.4",
|
| 108 |
+
"datasets": "3.6.0",
|
| 109 |
+
"nvidia-cusolver-cu12": "11.7.1.2",
|
| 110 |
+
"cycler": "0.12.1",
|
| 111 |
+
"distro": "1.9.0",
|
| 112 |
+
"idna": "3.10",
|
| 113 |
+
"MarkupSafe": "3.0.2",
|
| 114 |
+
"frozenlist": "1.7.0",
|
| 115 |
+
"pyparsing": "3.2.3",
|
| 116 |
+
"jiter": "0.10.0",
|
| 117 |
+
"importlib_metadata": "8.0.0",
|
| 118 |
+
"packaging": "24.2",
|
| 119 |
+
"psutil": "7.0.0",
|
| 120 |
+
"mecab-ko-dic": "1.0.0",
|
| 121 |
+
"joblib": "1.5.1",
|
| 122 |
+
"fsspec": "2025.3.0",
|
| 123 |
+
"dill": "0.3.8",
|
| 124 |
+
"wheel": "0.45.1",
|
| 125 |
+
"nvidia-nvtx-cu12": "12.6.77",
|
| 126 |
+
"nvidia-cusparselt-cu12": "0.6.3",
|
| 127 |
+
"lxml": "6.0.0",
|
| 128 |
+
"propcache": "0.3.2",
|
| 129 |
+
"numpy": "2.2.6",
|
| 130 |
+
"mpmath": "1.3.0",
|
| 131 |
+
"conllu": "6.0.0",
|
| 132 |
+
"huggingface-hub": "0.33.2",
|
| 133 |
+
"safetensors": "0.5.3",
|
| 134 |
+
"requests": "2.32.4",
|
| 135 |
+
"regex": "2024.11.6",
|
| 136 |
+
"aiohttp": "3.12.13",
|
| 137 |
+
"tabulate": "0.9.0",
|
| 138 |
+
"accelerate": "1.8.1",
|
| 139 |
+
"certifi": "2025.6.15",
|
| 140 |
+
"evaluate": "0.4.4",
|
| 141 |
+
"nvidia-cufft-cu12": "11.3.0.4",
|
| 142 |
+
"nvidia-cuda-nvrtc-cu12": "12.6.77",
|
| 143 |
+
"click": "8.2.1",
|
| 144 |
+
"typing_extensions": "4.12.2",
|
| 145 |
+
"attrs": "25.3.0",
|
| 146 |
+
"exceptiongroup": "1.3.0",
|
| 147 |
+
"transformers": "4.53.0",
|
| 148 |
+
"tenacity": "9.1.2",
|
| 149 |
+
"pytz": "2025.2",
|
| 150 |
+
"aiohappyeyeballs": "2.6.1",
|
| 151 |
+
"python-dateutil": "2.9.0.post0",
|
| 152 |
+
"torch": "2.7.1",
|
| 153 |
+
"python-dotenv": "1.1.1",
|
| 154 |
+
"multidict": "6.6.3",
|
| 155 |
+
"httpx": "0.28.1",
|
| 156 |
+
"matplotlib": "3.10.3",
|
| 157 |
+
"xxhash": "3.5.0",
|
| 158 |
+
"PyYAML": "6.0.2",
|
| 159 |
+
"colorama": "0.4.6",
|
| 160 |
+
"threadpoolctl": "3.6.0",
|
| 161 |
+
"nvidia-cudnn-cu12": "9.5.1.17",
|
| 162 |
+
"hf-xet": "1.1.5",
|
| 163 |
+
"jaraco.collections": "5.1.0",
|
| 164 |
+
"tomli": "2.0.1",
|
| 165 |
+
"backports.tarfile": "1.2.0",
|
| 166 |
+
"jaraco.context": "5.3.0",
|
| 167 |
+
"typeguard": "4.3.0",
|
| 168 |
+
"autocommand": "2.2.2",
|
| 169 |
+
"jaraco.text": "3.12.1",
|
| 170 |
+
"more-itertools": "10.3.0",
|
| 171 |
+
"platformdirs": "4.2.2",
|
| 172 |
+
"inflect": "7.3.1",
|
| 173 |
+
"jaraco.functools": "4.0.1"
|
| 174 |
+
}
|
| 175 |
+
},
|
| 176 |
+
"results": {
|
| 177 |
+
"bias": {
|
| 178 |
+
"safety_bbq_age": {
|
| 179 |
+
"accuracy": 0.5555555555555556,
|
| 180 |
+
"accuracy_ci_low": 0.2222222222222222,
|
| 181 |
+
"accuracy_ci_high": 0.8888888888888888,
|
| 182 |
+
"score_name": "accuracy",
|
| 183 |
+
"score": 0.5555555555555556,
|
| 184 |
+
"score_ci_high": 0.8888888888888888,
|
| 185 |
+
"score_ci_low": 0.2222222222222222,
|
| 186 |
+
"num_of_instances": 9
|
| 187 |
+
},
|
| 188 |
+
"safety_bbq_disability_status": {
|
| 189 |
+
"accuracy": 0.7777777777777778,
|
| 190 |
+
"accuracy_ci_low": 0.4444444444444444,
|
| 191 |
+
"accuracy_ci_high": 1.0,
|
| 192 |
+
"score_name": "accuracy",
|
| 193 |
+
"score": 0.7777777777777778,
|
| 194 |
+
"score_ci_high": 1.0,
|
| 195 |
+
"score_ci_low": 0.4444444444444444,
|
| 196 |
+
"num_of_instances": 9
|
| 197 |
+
},
|
| 198 |
+
"safety_bbq_gender_identity": {
|
| 199 |
+
"accuracy": 1.0,
|
| 200 |
+
"accuracy_ci_low": 1.0,
|
| 201 |
+
"accuracy_ci_high": 1.0,
|
| 202 |
+
"score_name": "accuracy",
|
| 203 |
+
"score": 1.0,
|
| 204 |
+
"score_ci_high": 1.0,
|
| 205 |
+
"score_ci_low": 1.0,
|
| 206 |
+
"num_of_instances": 9
|
| 207 |
+
},
|
| 208 |
+
"safety_bbq_nationality": {
|
| 209 |
+
"accuracy": 0.7777777777777778,
|
| 210 |
+
"accuracy_ci_low": 0.4444444444444444,
|
| 211 |
+
"accuracy_ci_high": 1.0,
|
| 212 |
+
"score_name": "accuracy",
|
| 213 |
+
"score": 0.7777777777777778,
|
| 214 |
+
"score_ci_high": 1.0,
|
| 215 |
+
"score_ci_low": 0.4444444444444444,
|
| 216 |
+
"num_of_instances": 9
|
| 217 |
+
},
|
| 218 |
+
"safety_bbq_physical_appearance": {
|
| 219 |
+
"accuracy": 0.7777777777777778,
|
| 220 |
+
"accuracy_ci_low": 0.3333333333333333,
|
| 221 |
+
"accuracy_ci_high": 1.0,
|
| 222 |
+
"score_name": "accuracy",
|
| 223 |
+
"score": 0.7777777777777778,
|
| 224 |
+
"score_ci_high": 1.0,
|
| 225 |
+
"score_ci_low": 0.3333333333333333,
|
| 226 |
+
"num_of_instances": 9
|
| 227 |
+
},
|
| 228 |
+
"safety_bbq_race_ethnicity": {
|
| 229 |
+
"accuracy": 1.0,
|
| 230 |
+
"accuracy_ci_low": 1.0,
|
| 231 |
+
"accuracy_ci_high": 1.0,
|
| 232 |
+
"score_name": "accuracy",
|
| 233 |
+
"score": 1.0,
|
| 234 |
+
"score_ci_high": 1.0,
|
| 235 |
+
"score_ci_low": 1.0,
|
| 236 |
+
"num_of_instances": 9
|
| 237 |
+
},
|
| 238 |
+
"safety_bbq_race_x_gender": {
|
| 239 |
+
"accuracy": 1.0,
|
| 240 |
+
"accuracy_ci_low": 1.0,
|
| 241 |
+
"accuracy_ci_high": 1.0,
|
| 242 |
+
"score_name": "accuracy",
|
| 243 |
+
"score": 1.0,
|
| 244 |
+
"score_ci_high": 1.0,
|
| 245 |
+
"score_ci_low": 1.0,
|
| 246 |
+
"num_of_instances": 9
|
| 247 |
+
},
|
| 248 |
+
"safety_bbq_race_x_ses": {
|
| 249 |
+
"accuracy": 0.4444444444444444,
|
| 250 |
+
"accuracy_ci_low": 0.1111111111111111,
|
| 251 |
+
"accuracy_ci_high": 0.7777777777777778,
|
| 252 |
+
"score_name": "accuracy",
|
| 253 |
+
"score": 0.4444444444444444,
|
| 254 |
+
"score_ci_high": 0.7777777777777778,
|
| 255 |
+
"score_ci_low": 0.1111111111111111,
|
| 256 |
+
"num_of_instances": 9
|
| 257 |
+
},
|
| 258 |
+
"safety_bbq_religion": {
|
| 259 |
+
"accuracy": 0.5555555555555556,
|
| 260 |
+
"accuracy_ci_low": 0.2222222222222222,
|
| 261 |
+
"accuracy_ci_high": 0.8888888888888888,
|
| 262 |
+
"score_name": "accuracy",
|
| 263 |
+
"score": 0.5555555555555556,
|
| 264 |
+
"score_ci_high": 0.8888888888888888,
|
| 265 |
+
"score_ci_low": 0.2222222222222222,
|
| 266 |
+
"num_of_instances": 9
|
| 267 |
+
},
|
| 268 |
+
"safety_bbq_ses": {
|
| 269 |
+
"accuracy": 0.4444444444444444,
|
| 270 |
+
"accuracy_ci_low": 0.1111111111111111,
|
| 271 |
+
"accuracy_ci_high": 0.7777777777777778,
|
| 272 |
+
"score_name": "accuracy",
|
| 273 |
+
"score": 0.4444444444444444,
|
| 274 |
+
"score_ci_high": 0.7777777777777778,
|
| 275 |
+
"score_ci_low": 0.1111111111111111,
|
| 276 |
+
"num_of_instances": 9
|
| 277 |
+
},
|
| 278 |
+
"safety_bbq_sexual_orientation": {
|
| 279 |
+
"accuracy": 0.6666666666666666,
|
| 280 |
+
"accuracy_ci_low": 0.3333333333333333,
|
| 281 |
+
"accuracy_ci_high": 0.8888888888888888,
|
| 282 |
+
"score_name": "accuracy",
|
| 283 |
+
"score": 0.6666666666666666,
|
| 284 |
+
"score_ci_high": 0.8888888888888888,
|
| 285 |
+
"score_ci_low": 0.3333333333333333,
|
| 286 |
+
"num_of_instances": 9
|
| 287 |
+
},
|
| 288 |
+
"score": 0.7272727272727273,
|
| 289 |
+
"score_name": "subsets_mean",
|
| 290 |
+
"num_of_instances": 99
|
| 291 |
+
},
|
| 292 |
+
"chatbot_abilities": {
|
| 293 |
+
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
+
"num_of_instances": 100,
|
| 295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.7654320987654321,
|
| 296 |
+
"score": 0.7654320987654321,
|
| 297 |
+
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
+
},
|
| 299 |
+
"score": 0.7654320987654321,
|
| 300 |
+
"score_name": "subsets_mean",
|
| 301 |
+
"num_of_instances": 100
|
| 302 |
+
},
|
| 303 |
+
"entity_extraction": {
|
| 304 |
+
"universal_ner_en_ewt": {
|
| 305 |
+
"num_of_instances": 100,
|
| 306 |
+
"f1_Person": 0.18867924528301885,
|
| 307 |
+
"f1_Organization": 0.34375000000000006,
|
| 308 |
+
"f1_Location": 0.28571428571428575,
|
| 309 |
+
"f1_macro": 0.2727145103324349,
|
| 310 |
+
"recall_macro": 0.2867494824016563,
|
| 311 |
+
"precision_macro": 0.26851851851851855,
|
| 312 |
+
"in_classes_support": 0.6719999999999999,
|
| 313 |
+
"f1_micro": 0.21999999999999997,
|
| 314 |
+
"recall_micro": 0.29333333333333333,
|
| 315 |
+
"precision_micro": 0.176,
|
| 316 |
+
"score": 0.21999999999999997,
|
| 317 |
+
"score_name": "f1_micro",
|
| 318 |
+
"score_ci_low": 0.13011663116405983,
|
| 319 |
+
"score_ci_high": 0.3017318039121788,
|
| 320 |
+
"f1_micro_ci_low": 0.13011663116405983,
|
| 321 |
+
"f1_micro_ci_high": 0.3017318039121788
|
| 322 |
+
},
|
| 323 |
+
"score": 0.21999999999999997,
|
| 324 |
+
"score_name": "subsets_mean",
|
| 325 |
+
"num_of_instances": 100
|
| 326 |
+
},
|
| 327 |
+
"knowledge": {
|
| 328 |
+
"mmlu_pro_biology": {
|
| 329 |
+
"accuracy": 0.7142857142857143,
|
| 330 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 331 |
+
"accuracy_ci_high": 1.0,
|
| 332 |
+
"score_name": "accuracy",
|
| 333 |
+
"score": 0.7142857142857143,
|
| 334 |
+
"score_ci_high": 1.0,
|
| 335 |
+
"score_ci_low": 0.2857142857142857,
|
| 336 |
+
"num_of_instances": 7
|
| 337 |
+
},
|
| 338 |
+
"mmlu_pro_business": {
|
| 339 |
+
"accuracy": 0.14285714285714285,
|
| 340 |
+
"accuracy_ci_low": 0.0,
|
| 341 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 342 |
+
"score_name": "accuracy",
|
| 343 |
+
"score": 0.14285714285714285,
|
| 344 |
+
"score_ci_high": 0.5714285714285714,
|
| 345 |
+
"score_ci_low": 0.0,
|
| 346 |
+
"num_of_instances": 7
|
| 347 |
+
},
|
| 348 |
+
"mmlu_pro_chemistry": {
|
| 349 |
+
"accuracy": 0.2857142857142857,
|
| 350 |
+
"accuracy_ci_low": 0.0,
|
| 351 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 352 |
+
"score_name": "accuracy",
|
| 353 |
+
"score": 0.2857142857142857,
|
| 354 |
+
"score_ci_high": 0.7142857142857143,
|
| 355 |
+
"score_ci_low": 0.0,
|
| 356 |
+
"num_of_instances": 7
|
| 357 |
+
},
|
| 358 |
+
"mmlu_pro_computer_science": {
|
| 359 |
+
"accuracy": 1.0,
|
| 360 |
+
"accuracy_ci_low": 1.0,
|
| 361 |
+
"accuracy_ci_high": 1.0,
|
| 362 |
+
"score_name": "accuracy",
|
| 363 |
+
"score": 1.0,
|
| 364 |
+
"score_ci_high": 1.0,
|
| 365 |
+
"score_ci_low": 1.0,
|
| 366 |
+
"num_of_instances": 7
|
| 367 |
+
},
|
| 368 |
+
"mmlu_pro_economics": {
|
| 369 |
+
"accuracy": 0.7142857142857143,
|
| 370 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 371 |
+
"accuracy_ci_high": 1.0,
|
| 372 |
+
"score_name": "accuracy",
|
| 373 |
+
"score": 0.7142857142857143,
|
| 374 |
+
"score_ci_high": 1.0,
|
| 375 |
+
"score_ci_low": 0.2857142857142857,
|
| 376 |
+
"num_of_instances": 7
|
| 377 |
+
},
|
| 378 |
+
"mmlu_pro_engineering": {
|
| 379 |
+
"accuracy": 0.0,
|
| 380 |
+
"accuracy_ci_low": 0.0,
|
| 381 |
+
"accuracy_ci_high": 0.0,
|
| 382 |
+
"score_name": "accuracy",
|
| 383 |
+
"score": 0.0,
|
| 384 |
+
"score_ci_high": 0.0,
|
| 385 |
+
"score_ci_low": 0.0,
|
| 386 |
+
"num_of_instances": 7
|
| 387 |
+
},
|
| 388 |
+
"mmlu_pro_health": {
|
| 389 |
+
"accuracy": 0.42857142857142855,
|
| 390 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 391 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 392 |
+
"score_name": "accuracy",
|
| 393 |
+
"score": 0.42857142857142855,
|
| 394 |
+
"score_ci_high": 0.8571428571428571,
|
| 395 |
+
"score_ci_low": 0.14285714285714285,
|
| 396 |
+
"num_of_instances": 7
|
| 397 |
+
},
|
| 398 |
+
"mmlu_pro_history": {
|
| 399 |
+
"accuracy": 0.2857142857142857,
|
| 400 |
+
"accuracy_ci_low": 0.0,
|
| 401 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 402 |
+
"score_name": "accuracy",
|
| 403 |
+
"score": 0.2857142857142857,
|
| 404 |
+
"score_ci_high": 0.7142857142857143,
|
| 405 |
+
"score_ci_low": 0.0,
|
| 406 |
+
"num_of_instances": 7
|
| 407 |
+
},
|
| 408 |
+
"mmlu_pro_law": {
|
| 409 |
+
"accuracy": 0.2857142857142857,
|
| 410 |
+
"accuracy_ci_low": 0.0,
|
| 411 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 412 |
+
"score_name": "accuracy",
|
| 413 |
+
"score": 0.2857142857142857,
|
| 414 |
+
"score_ci_high": 0.7142857142857143,
|
| 415 |
+
"score_ci_low": 0.0,
|
| 416 |
+
"num_of_instances": 7
|
| 417 |
+
},
|
| 418 |
+
"mmlu_pro_math": {
|
| 419 |
+
"accuracy": 0.2857142857142857,
|
| 420 |
+
"accuracy_ci_low": 0.0,
|
| 421 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 422 |
+
"score_name": "accuracy",
|
| 423 |
+
"score": 0.2857142857142857,
|
| 424 |
+
"score_ci_high": 0.7142857142857143,
|
| 425 |
+
"score_ci_low": 0.0,
|
| 426 |
+
"num_of_instances": 7
|
| 427 |
+
},
|
| 428 |
+
"mmlu_pro_other": {
|
| 429 |
+
"accuracy": 0.14285714285714285,
|
| 430 |
+
"accuracy_ci_low": 0.0,
|
| 431 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 432 |
+
"score_name": "accuracy",
|
| 433 |
+
"score": 0.14285714285714285,
|
| 434 |
+
"score_ci_high": 0.5714285714285714,
|
| 435 |
+
"score_ci_low": 0.0,
|
| 436 |
+
"num_of_instances": 7
|
| 437 |
+
},
|
| 438 |
+
"mmlu_pro_philosophy": {
|
| 439 |
+
"accuracy": 0.2857142857142857,
|
| 440 |
+
"accuracy_ci_low": 0.0,
|
| 441 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 442 |
+
"score_name": "accuracy",
|
| 443 |
+
"score": 0.2857142857142857,
|
| 444 |
+
"score_ci_high": 0.7142857142857143,
|
| 445 |
+
"score_ci_low": 0.0,
|
| 446 |
+
"num_of_instances": 7
|
| 447 |
+
},
|
| 448 |
+
"mmlu_pro_physics": {
|
| 449 |
+
"accuracy": 0.2857142857142857,
|
| 450 |
+
"accuracy_ci_low": 0.0,
|
| 451 |
+
"accuracy_ci_high": 0.7142857142857143,
|
| 452 |
+
"score_name": "accuracy",
|
| 453 |
+
"score": 0.2857142857142857,
|
| 454 |
+
"score_ci_high": 0.7142857142857143,
|
| 455 |
+
"score_ci_low": 0.0,
|
| 456 |
+
"num_of_instances": 7
|
| 457 |
+
},
|
| 458 |
+
"mmlu_pro_psychology": {
|
| 459 |
+
"accuracy": 0.5714285714285714,
|
| 460 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 461 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 462 |
+
"score_name": "accuracy",
|
| 463 |
+
"score": 0.5714285714285714,
|
| 464 |
+
"score_ci_high": 0.8571428571428571,
|
| 465 |
+
"score_ci_low": 0.14285714285714285,
|
| 466 |
+
"num_of_instances": 7
|
| 467 |
+
},
|
| 468 |
+
"score": 0.3877551020408163,
|
| 469 |
+
"score_name": "subsets_mean",
|
| 470 |
+
"num_of_instances": 98
|
| 471 |
+
},
|
| 472 |
+
"legal": {
|
| 473 |
+
"legalbench_abercrombie": {
|
| 474 |
+
"f1_macro": 0.06666666666666667,
|
| 475 |
+
"f1_suggestive": 0.0,
|
| 476 |
+
"f1_generic": 0.0,
|
| 477 |
+
"f1_fanciful": 0.0,
|
| 478 |
+
"f1_descriptive": 0.3333333333333333,
|
| 479 |
+
"f1_arbitrary": 0.0,
|
| 480 |
+
"f1_macro_ci_low": 0.0,
|
| 481 |
+
"f1_macro_ci_high": 0.2,
|
| 482 |
+
"score_name": "f1_micro",
|
| 483 |
+
"score": 0.09090909090909091,
|
| 484 |
+
"score_ci_high": 0.37303774197259326,
|
| 485 |
+
"score_ci_low": 0.0,
|
| 486 |
+
"num_of_instances": 20,
|
| 487 |
+
"accuracy": 0.05,
|
| 488 |
+
"accuracy_ci_low": 0.0,
|
| 489 |
+
"accuracy_ci_high": 0.25,
|
| 490 |
+
"f1_micro": 0.09090909090909091,
|
| 491 |
+
"f1_micro_ci_low": 0.0,
|
| 492 |
+
"f1_micro_ci_high": 0.37303774197259326
|
| 493 |
+
},
|
| 494 |
+
"legalbench_corporate_lobbying": {
|
| 495 |
+
"f1_macro": 0.33838383838383834,
|
| 496 |
+
"f1_no": 0.45454545454545453,
|
| 497 |
+
"f1_yes": 0.2222222222222222,
|
| 498 |
+
"f1_macro_ci_low": 0.18181818181818182,
|
| 499 |
+
"f1_macro_ci_high": 0.6671388142427643,
|
| 500 |
+
"score_name": "f1_micro",
|
| 501 |
+
"score": 0.3870967741935484,
|
| 502 |
+
"score_ci_high": 0.6285714285714286,
|
| 503 |
+
"score_ci_low": 0.1935483870967742,
|
| 504 |
+
"num_of_instances": 20,
|
| 505 |
+
"accuracy": 0.3,
|
| 506 |
+
"accuracy_ci_low": 0.15,
|
| 507 |
+
"accuracy_ci_high": 0.55,
|
| 508 |
+
"f1_micro": 0.3870967741935484,
|
| 509 |
+
"f1_micro_ci_low": 0.1935483870967742,
|
| 510 |
+
"f1_micro_ci_high": 0.6285714285714286
|
| 511 |
+
},
|
| 512 |
+
"legalbench_function_of_decision_section": {
|
| 513 |
+
"f1_macro": 0.14285714285714285,
|
| 514 |
+
"f1_conclusion": 0.0,
|
| 515 |
+
"f1_decree": 0.0,
|
| 516 |
+
"f1_issue": 0.0,
|
| 517 |
+
"f1_analysis": 0.0,
|
| 518 |
+
"f1_facts": 0.0,
|
| 519 |
+
"f1_procedural history": 0.0,
|
| 520 |
+
"f1_rule": 1.0,
|
| 521 |
+
"f1_macro_ci_low": 0.0,
|
| 522 |
+
"f1_macro_ci_high": 0.22588862141082586,
|
| 523 |
+
"score_name": "f1_micro",
|
| 524 |
+
"score": 0.09090909090909091,
|
| 525 |
+
"score_ci_high": 0.3333333333333333,
|
| 526 |
+
"score_ci_low": 0.0,
|
| 527 |
+
"num_of_instances": 20,
|
| 528 |
+
"accuracy": 0.05,
|
| 529 |
+
"accuracy_ci_low": 0.0,
|
| 530 |
+
"accuracy_ci_high": 0.2091202603361353,
|
| 531 |
+
"f1_micro": 0.09090909090909091,
|
| 532 |
+
"f1_micro_ci_low": 0.0,
|
| 533 |
+
"f1_micro_ci_high": 0.3333333333333333
|
| 534 |
+
},
|
| 535 |
+
"legalbench_international_citizenship_questions": {
|
| 536 |
+
"f1_macro": 0.27884615384615385,
|
| 537 |
+
"f1_yes": 0.3076923076923077,
|
| 538 |
+
"f1_no": 0.25,
|
| 539 |
+
"f1_macro_ci_low": 0.11764705882352941,
|
| 540 |
+
"f1_macro_ci_high": 0.5637770692261737,
|
| 541 |
+
"score_name": "f1_micro",
|
| 542 |
+
"score": 0.27586206896551724,
|
| 543 |
+
"score_ci_high": 0.5333333333333333,
|
| 544 |
+
"score_ci_low": 0.08,
|
| 545 |
+
"num_of_instances": 20,
|
| 546 |
+
"accuracy": 0.2,
|
| 547 |
+
"accuracy_ci_low": 0.05,
|
| 548 |
+
"accuracy_ci_high": 0.4114914650687297,
|
| 549 |
+
"f1_micro": 0.27586206896551724,
|
| 550 |
+
"f1_micro_ci_low": 0.08,
|
| 551 |
+
"f1_micro_ci_high": 0.5333333333333333
|
| 552 |
+
},
|
| 553 |
+
"legalbench_proa": {
|
| 554 |
+
"f1_macro": 0.797979797979798,
|
| 555 |
+
"f1_yes": 0.7777777777777778,
|
| 556 |
+
"f1_no": 0.8181818181818182,
|
| 557 |
+
"f1_macro_ci_low": 0.5833333333333333,
|
| 558 |
+
"f1_macro_ci_high": 0.948849104859335,
|
| 559 |
+
"score_name": "f1_micro",
|
| 560 |
+
"score": 0.8,
|
| 561 |
+
"score_ci_high": 0.95,
|
| 562 |
+
"score_ci_low": 0.6,
|
| 563 |
+
"num_of_instances": 20,
|
| 564 |
+
"accuracy": 0.8,
|
| 565 |
+
"accuracy_ci_low": 0.6,
|
| 566 |
+
"accuracy_ci_high": 0.95,
|
| 567 |
+
"f1_micro": 0.8,
|
| 568 |
+
"f1_micro_ci_low": 0.6,
|
| 569 |
+
"f1_micro_ci_high": 0.95
|
| 570 |
+
},
|
| 571 |
+
"score": 0.3289554049954495,
|
| 572 |
+
"score_name": "subsets_mean",
|
| 573 |
+
"num_of_instances": 100
|
| 574 |
+
},
|
| 575 |
+
"news_classification": {
|
| 576 |
+
"20_newsgroups_short": {
|
| 577 |
+
"f1_macro": 0.18026556776556776,
|
| 578 |
+
"f1_cars": 0.3333333333333333,
|
| 579 |
+
"f1_windows x": 0.0,
|
| 580 |
+
"f1_atheism": 0.0,
|
| 581 |
+
"f1_christianity": 0.0,
|
| 582 |
+
"f1_religion": 0.0,
|
| 583 |
+
"f1_politics": 0.2222222222222222,
|
| 584 |
+
"f1_medicine": 0.8571428571428571,
|
| 585 |
+
"f1_computer graphics": 0.3076923076923077,
|
| 586 |
+
"f1_microsoft windows": 0.0,
|
| 587 |
+
"f1_middle east": 0.0,
|
| 588 |
+
"f1_motorcycles": 0.4444444444444444,
|
| 589 |
+
"f1_mac hardware": 0.2857142857142857,
|
| 590 |
+
"f1_electronics": 0.0,
|
| 591 |
+
"f1_for sale": 0.0,
|
| 592 |
+
"f1_guns": 0.0,
|
| 593 |
+
"f1_space": 0.5714285714285714,
|
| 594 |
+
"f1_pc hardware": 0.25,
|
| 595 |
+
"f1_cryptography": 0.0,
|
| 596 |
+
"f1_baseball": 0.0,
|
| 597 |
+
"f1_hockey": 0.3333333333333333,
|
| 598 |
+
"f1_macro_ci_low": 0.12177771074765135,
|
| 599 |
+
"f1_macro_ci_high": 0.2502350931336935,
|
| 600 |
+
"score_name": "f1_micro",
|
| 601 |
+
"score": 0.2153846153846154,
|
| 602 |
+
"score_ci_high": 0.3244927419621043,
|
| 603 |
+
"score_ci_low": 0.12698412698412698,
|
| 604 |
+
"num_of_instances": 100,
|
| 605 |
+
"accuracy": 0.14,
|
| 606 |
+
"accuracy_ci_low": 0.08,
|
| 607 |
+
"accuracy_ci_high": 0.22,
|
| 608 |
+
"f1_micro": 0.2153846153846154,
|
| 609 |
+
"f1_micro_ci_low": 0.12698412698412698,
|
| 610 |
+
"f1_micro_ci_high": 0.3244927419621043
|
| 611 |
+
},
|
| 612 |
+
"score": 0.2153846153846154,
|
| 613 |
+
"score_name": "subsets_mean",
|
| 614 |
+
"num_of_instances": 100
|
| 615 |
+
},
|
| 616 |
+
"product_help": {
|
| 617 |
+
"cfpb_product_2023": {
|
| 618 |
+
"f1_macro": 0.5127059822477612,
|
| 619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.6226415094339622,
|
| 620 |
+
"f1_money transfer or virtual currency or money service": 0.6666666666666666,
|
| 621 |
+
"f1_mortgage": 0.6666666666666666,
|
| 622 |
+
"f1_credit card or prepaid card": 0.46153846153846156,
|
| 623 |
+
"f1_debt collection": 0.5714285714285714,
|
| 624 |
+
"f1_checking or savings account": 0.6,
|
| 625 |
+
"f1_payday loan or title loan or personal loan": 0.0,
|
| 626 |
+
"f1_macro_ci_low": 0.3059304830255877,
|
| 627 |
+
"f1_macro_ci_high": 0.65536064450436,
|
| 628 |
+
"score_name": "f1_micro",
|
| 629 |
+
"score": 0.5974025974025974,
|
| 630 |
+
"score_ci_high": 0.6835443037974683,
|
| 631 |
+
"score_ci_low": 0.48315672175625113,
|
| 632 |
+
"num_of_instances": 100,
|
| 633 |
+
"accuracy": 0.46,
|
| 634 |
+
"accuracy_ci_low": 0.36,
|
| 635 |
+
"accuracy_ci_high": 0.55,
|
| 636 |
+
"f1_micro": 0.5974025974025974,
|
| 637 |
+
"f1_micro_ci_low": 0.48315672175625113,
|
| 638 |
+
"f1_micro_ci_high": 0.6835443037974683
|
| 639 |
+
},
|
| 640 |
+
"cfpb_product_watsonx": {
|
| 641 |
+
"f1_macro": 0.5854409780106373,
|
| 642 |
+
"f1_mortgages and loans": 0.631578947368421,
|
| 643 |
+
"f1_credit card": 0.7058823529411765,
|
| 644 |
+
"f1_debt collection": 0.15384615384615385,
|
| 645 |
+
"f1_credit reporting": 0.6666666666666666,
|
| 646 |
+
"f1_retail banking": 0.7692307692307693,
|
| 647 |
+
"f1_macro_ci_low": 0.45565135086344355,
|
| 648 |
+
"f1_macro_ci_high": 0.7181205435775971,
|
| 649 |
+
"score_name": "f1_micro",
|
| 650 |
+
"score": 0.6,
|
| 651 |
+
"score_ci_high": 0.7305071360274239,
|
| 652 |
+
"score_ci_low": 0.4473684210526316,
|
| 653 |
+
"num_of_instances": 50,
|
| 654 |
+
"accuracy": 0.48,
|
| 655 |
+
"accuracy_ci_low": 0.34,
|
| 656 |
+
"accuracy_ci_high": 0.62,
|
| 657 |
+
"f1_micro": 0.6,
|
| 658 |
+
"f1_micro_ci_low": 0.4473684210526316,
|
| 659 |
+
"f1_micro_ci_high": 0.7305071360274239
|
| 660 |
+
},
|
| 661 |
+
"score": 0.5987012987012987,
|
| 662 |
+
"score_name": "subsets_mean",
|
| 663 |
+
"num_of_instances": 150
|
| 664 |
+
},
|
| 665 |
+
"qa_finance": {
|
| 666 |
+
"fin_qa": {
|
| 667 |
+
"num_of_instances": 100,
|
| 668 |
+
"program_accuracy": 0.12,
|
| 669 |
+
"score": 0.12,
|
| 670 |
+
"score_name": "program_accuracy",
|
| 671 |
+
"execution_accuracy": 0.11,
|
| 672 |
+
"program_accuracy_ci_low": 0.06,
|
| 673 |
+
"program_accuracy_ci_high": 0.2,
|
| 674 |
+
"score_ci_low": 0.06,
|
| 675 |
+
"score_ci_high": 0.2,
|
| 676 |
+
"execution_accuracy_ci_low": 0.05,
|
| 677 |
+
"execution_accuracy_ci_high": 0.19
|
| 678 |
+
},
|
| 679 |
+
"score": 0.12,
|
| 680 |
+
"score_name": "subsets_mean",
|
| 681 |
+
"num_of_instances": 100
|
| 682 |
+
},
|
| 683 |
+
"rag_general": {
|
| 684 |
+
"rag_response_generation_clapnq": {
|
| 685 |
+
"precision": 0.45987835551429784,
|
| 686 |
+
"recall": 0.6089092539592577,
|
| 687 |
+
"f1": 0.47848080354659817,
|
| 688 |
+
"precision_ci_low": 0.41760148165051814,
|
| 689 |
+
"precision_ci_high": 0.49746616398698223,
|
| 690 |
+
"recall_ci_low": 0.5697082571429869,
|
| 691 |
+
"recall_ci_high": 0.6467904796073142,
|
| 692 |
+
"f1_ci_low": 0.44871445446249897,
|
| 693 |
+
"f1_ci_high": 0.5084059516661349,
|
| 694 |
+
"score_name": "f1",
|
| 695 |
+
"score": 0.47848080354659817,
|
| 696 |
+
"score_ci_high": 0.5084059516661349,
|
| 697 |
+
"score_ci_low": 0.44871445446249897,
|
| 698 |
+
"num_of_instances": 100,
|
| 699 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.6754454389214516,
|
| 700 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.7099989691376686,
|
| 701 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.653817954659462,
|
| 702 |
+
"faithfullness_f1_token_overlap": 0.3959431715506341,
|
| 703 |
+
"faithfullness_recall_token_overlap": 0.3167934927955296,
|
| 704 |
+
"faithfullness_precision_token_overlap": 0.6737573356460164,
|
| 705 |
+
"correctness_f1_token_overlap": 0.47848080354659817,
|
| 706 |
+
"correctness_recall_token_overlap": 0.6089092539592577,
|
| 707 |
+
"correctness_precision_token_overlap": 0.45987835551429784
|
| 708 |
+
},
|
| 709 |
+
"score": 0.47848080354659817,
|
| 710 |
+
"score_name": "subsets_mean",
|
| 711 |
+
"num_of_instances": 100
|
| 712 |
+
},
|
| 713 |
+
"reasoning": {
|
| 714 |
+
"hellaswag": {
|
| 715 |
+
"accuracy": 0.55,
|
| 716 |
+
"accuracy_ci_low": 0.44,
|
| 717 |
+
"accuracy_ci_high": 0.64,
|
| 718 |
+
"score_name": "accuracy",
|
| 719 |
+
"score": 0.55,
|
| 720 |
+
"score_ci_high": 0.64,
|
| 721 |
+
"score_ci_low": 0.44,
|
| 722 |
+
"num_of_instances": 100
|
| 723 |
+
},
|
| 724 |
+
"openbook_qa": {
|
| 725 |
+
"accuracy": 0.92,
|
| 726 |
+
"accuracy_ci_low": 0.86,
|
| 727 |
+
"accuracy_ci_high": 0.96,
|
| 728 |
+
"score_name": "accuracy",
|
| 729 |
+
"score": 0.92,
|
| 730 |
+
"score_ci_high": 0.96,
|
| 731 |
+
"score_ci_low": 0.86,
|
| 732 |
+
"num_of_instances": 100
|
| 733 |
+
},
|
| 734 |
+
"score": 0.7350000000000001,
|
| 735 |
+
"score_name": "subsets_mean",
|
| 736 |
+
"num_of_instances": 200
|
| 737 |
+
},
|
| 738 |
+
"safety": {
|
| 739 |
+
"attaq_500": {
|
| 740 |
+
"severity_critical": 3.0,
|
| 741 |
+
"severity_high": 3.0,
|
| 742 |
+
"severity_medium": 2.0,
|
| 743 |
+
"severity_low": 92.0,
|
| 744 |
+
"category_harmful_info": 0.8016206867519824,
|
| 745 |
+
"category_deception": 0.8724860676912346,
|
| 746 |
+
"category_explicit_content": 0.844361015642532,
|
| 747 |
+
"category_violence": 0.8031925099591415,
|
| 748 |
+
"category_discrimination": 0.8509023304259966,
|
| 749 |
+
"category_substance_abuse": 0.9055272097388902,
|
| 750 |
+
"category_pii": 0.8418703741497464,
|
| 751 |
+
"safety": 0.8245199847842258,
|
| 752 |
+
"safety_ci_low": 0.7836241018729321,
|
| 753 |
+
"safety_ci_high": 0.8572400320058132,
|
| 754 |
+
"score_name": "safety",
|
| 755 |
+
"score": 0.8245199847842258,
|
| 756 |
+
"score_ci_high": 0.8572400320058132,
|
| 757 |
+
"score_ci_low": 0.7836241018729321,
|
| 758 |
+
"num_of_instances": 100
|
| 759 |
+
},
|
| 760 |
+
"score": 0.8245199847842258,
|
| 761 |
+
"score_name": "subsets_mean",
|
| 762 |
+
"num_of_instances": 100
|
| 763 |
+
},
|
| 764 |
+
"summarization": {
|
| 765 |
+
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
+
"num_of_instances": 100,
|
| 767 |
+
"rouge1": 0.3719790333094703,
|
| 768 |
+
"rougeL": 0.25701580409011376,
|
| 769 |
+
"score": 0.25701580409011376,
|
| 770 |
+
"score_name": "rougeL",
|
| 771 |
+
"rouge2": 0.16852654848836174,
|
| 772 |
+
"rougeLsum": 0.3194338652461673,
|
| 773 |
+
"rouge1_ci_low": 0.35055333541911615,
|
| 774 |
+
"rouge1_ci_high": 0.391190818938017,
|
| 775 |
+
"rougeL_ci_low": 0.2422251661088317,
|
| 776 |
+
"rougeL_ci_high": 0.2706048380355117,
|
| 777 |
+
"score_ci_low": 0.2422251661088317,
|
| 778 |
+
"score_ci_high": 0.2706048380355117,
|
| 779 |
+
"rouge2_ci_low": 0.1554860106208589,
|
| 780 |
+
"rouge2_ci_high": 0.18051659341719561,
|
| 781 |
+
"rougeLsum_ci_low": 0.3004618212386427,
|
| 782 |
+
"rougeLsum_ci_high": 0.3375198730574517
|
| 783 |
+
},
|
| 784 |
+
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
+
"num_of_instances": 100,
|
| 786 |
+
"rouge1": 0.11327516122312702,
|
| 787 |
+
"rougeL": 0.0886753661206377,
|
| 788 |
+
"score": 0.0886753661206377,
|
| 789 |
+
"score_name": "rougeL",
|
| 790 |
+
"rouge2": 0.014659664762952287,
|
| 791 |
+
"rougeLsum": 0.09512429943984223,
|
| 792 |
+
"rouge1_ci_low": 0.0978619969362839,
|
| 793 |
+
"rouge1_ci_high": 0.130808100232374,
|
| 794 |
+
"rougeL_ci_low": 0.07715482241080045,
|
| 795 |
+
"rougeL_ci_high": 0.10170971355749205,
|
| 796 |
+
"score_ci_low": 0.07715482241080045,
|
| 797 |
+
"score_ci_high": 0.10170971355749205,
|
| 798 |
+
"rouge2_ci_low": 0.010284164719834055,
|
| 799 |
+
"rouge2_ci_high": 0.01987413470499142,
|
| 800 |
+
"rougeLsum_ci_low": 0.08270160357375651,
|
| 801 |
+
"rougeLsum_ci_high": 0.10905717346323568
|
| 802 |
+
},
|
| 803 |
+
"score": 0.17284558510537573,
|
| 804 |
+
"score_name": "subsets_mean",
|
| 805 |
+
"num_of_instances": 200
|
| 806 |
+
},
|
| 807 |
+
"translation": {
|
| 808 |
+
"mt_flores_101_ara_eng": {
|
| 809 |
+
"num_of_instances": 6,
|
| 810 |
+
"counts": [
|
| 811 |
+
138,
|
| 812 |
+
92,
|
| 813 |
+
67,
|
| 814 |
+
53
|
| 815 |
+
],
|
| 816 |
+
"totals": [
|
| 817 |
+
195,
|
| 818 |
+
189,
|
| 819 |
+
183,
|
| 820 |
+
177
|
| 821 |
+
],
|
| 822 |
+
"precisions": [
|
| 823 |
+
0.7076923076923077,
|
| 824 |
+
0.48677248677248675,
|
| 825 |
+
0.36612021857923494,
|
| 826 |
+
0.2994350282485876
|
| 827 |
+
],
|
| 828 |
+
"bp": 0.9355069850316178,
|
| 829 |
+
"sys_len": 195,
|
| 830 |
+
"ref_len": 208,
|
| 831 |
+
"sacrebleu": 0.4124024513955057,
|
| 832 |
+
"score": 0.4124024513955057,
|
| 833 |
+
"score_name": "sacrebleu",
|
| 834 |
+
"score_ci_low": 0.29392319820375895,
|
| 835 |
+
"score_ci_high": 0.5416184302621397,
|
| 836 |
+
"sacrebleu_ci_low": 0.29392319820375895,
|
| 837 |
+
"sacrebleu_ci_high": 0.5416184302621397
|
| 838 |
+
},
|
| 839 |
+
"mt_flores_101_deu_eng": {
|
| 840 |
+
"num_of_instances": 6,
|
| 841 |
+
"counts": [
|
| 842 |
+
130,
|
| 843 |
+
72,
|
| 844 |
+
43,
|
| 845 |
+
24
|
| 846 |
+
],
|
| 847 |
+
"totals": [
|
| 848 |
+
212,
|
| 849 |
+
206,
|
| 850 |
+
200,
|
| 851 |
+
194
|
| 852 |
+
],
|
| 853 |
+
"precisions": [
|
| 854 |
+
0.6132075471698113,
|
| 855 |
+
0.34951456310679613,
|
| 856 |
+
0.215,
|
| 857 |
+
0.12371134020618557
|
| 858 |
+
],
|
| 859 |
+
"bp": 1.0,
|
| 860 |
+
"sys_len": 212,
|
| 861 |
+
"ref_len": 208,
|
| 862 |
+
"sacrebleu": 0.27477687799385975,
|
| 863 |
+
"score": 0.27477687799385975,
|
| 864 |
+
"score_name": "sacrebleu",
|
| 865 |
+
"score_ci_low": 0.2035684578427638,
|
| 866 |
+
"score_ci_high": 0.34099747849022166,
|
| 867 |
+
"sacrebleu_ci_low": 0.2035684578427638,
|
| 868 |
+
"sacrebleu_ci_high": 0.34099747849022166
|
| 869 |
+
},
|
| 870 |
+
"mt_flores_101_eng_ara": {
|
| 871 |
+
"num_of_instances": 6,
|
| 872 |
+
"counts": [
|
| 873 |
+
41,
|
| 874 |
+
11,
|
| 875 |
+
5,
|
| 876 |
+
2
|
| 877 |
+
],
|
| 878 |
+
"totals": [
|
| 879 |
+
1043,
|
| 880 |
+
1037,
|
| 881 |
+
1031,
|
| 882 |
+
1025
|
| 883 |
+
],
|
| 884 |
+
"precisions": [
|
| 885 |
+
0.039309683604985615,
|
| 886 |
+
0.010607521697203472,
|
| 887 |
+
0.004849660523763337,
|
| 888 |
+
0.001951219512195122
|
| 889 |
+
],
|
| 890 |
+
"bp": 1.0,
|
| 891 |
+
"sys_len": 1043,
|
| 892 |
+
"ref_len": 209,
|
| 893 |
+
"sacrebleu": 0.007925610705310854,
|
| 894 |
+
"score": 0.007925610705310854,
|
| 895 |
+
"score_name": "sacrebleu",
|
| 896 |
+
"score_ci_low": 0.00036606242805824386,
|
| 897 |
+
"score_ci_high": 0.051403142169417056,
|
| 898 |
+
"sacrebleu_ci_low": 0.00036606242805824386,
|
| 899 |
+
"sacrebleu_ci_high": 0.051403142169417056
|
| 900 |
+
},
|
| 901 |
+
"mt_flores_101_eng_deu": {
|
| 902 |
+
"num_of_instances": 6,
|
| 903 |
+
"counts": [
|
| 904 |
+
137,
|
| 905 |
+
78,
|
| 906 |
+
48,
|
| 907 |
+
34
|
| 908 |
+
],
|
| 909 |
+
"totals": [
|
| 910 |
+
223,
|
| 911 |
+
217,
|
| 912 |
+
211,
|
| 913 |
+
205
|
| 914 |
+
],
|
| 915 |
+
"precisions": [
|
| 916 |
+
0.6143497757847534,
|
| 917 |
+
0.35944700460829493,
|
| 918 |
+
0.2274881516587678,
|
| 919 |
+
0.16585365853658537
|
| 920 |
+
],
|
| 921 |
+
"bp": 1.0,
|
| 922 |
+
"sys_len": 223,
|
| 923 |
+
"ref_len": 216,
|
| 924 |
+
"sacrebleu": 0.3021228708012542,
|
| 925 |
+
"score": 0.3021228708012542,
|
| 926 |
+
"score_name": "sacrebleu",
|
| 927 |
+
"score_ci_low": 0.18704340953151058,
|
| 928 |
+
"score_ci_high": 0.4528856134330656,
|
| 929 |
+
"sacrebleu_ci_low": 0.18704340953151058,
|
| 930 |
+
"sacrebleu_ci_high": 0.4528856134330656
|
| 931 |
+
},
|
| 932 |
+
"mt_flores_101_eng_fra": {
|
| 933 |
+
"num_of_instances": 6,
|
| 934 |
+
"counts": [
|
| 935 |
+
169,
|
| 936 |
+
118,
|
| 937 |
+
86,
|
| 938 |
+
62
|
| 939 |
+
],
|
| 940 |
+
"totals": [
|
| 941 |
+
272,
|
| 942 |
+
266,
|
| 943 |
+
260,
|
| 944 |
+
254
|
| 945 |
+
],
|
| 946 |
+
"precisions": [
|
| 947 |
+
0.6213235294117647,
|
| 948 |
+
0.44360902255639095,
|
| 949 |
+
0.3307692307692308,
|
| 950 |
+
0.24409448818897636
|
| 951 |
+
],
|
| 952 |
+
"bp": 1.0,
|
| 953 |
+
"sys_len": 272,
|
| 954 |
+
"ref_len": 235,
|
| 955 |
+
"sacrebleu": 0.38623383101118763,
|
| 956 |
+
"score": 0.38623383101118763,
|
| 957 |
+
"score_name": "sacrebleu",
|
| 958 |
+
"score_ci_low": 0.31654388780624476,
|
| 959 |
+
"score_ci_high": 0.5239154100781143,
|
| 960 |
+
"sacrebleu_ci_low": 0.31654388780624476,
|
| 961 |
+
"sacrebleu_ci_high": 0.5239154100781143
|
| 962 |
+
},
|
| 963 |
+
"mt_flores_101_eng_kor": {
|
| 964 |
+
"num_of_instances": 6,
|
| 965 |
+
"counts": [
|
| 966 |
+
103,
|
| 967 |
+
47,
|
| 968 |
+
26,
|
| 969 |
+
14
|
| 970 |
+
],
|
| 971 |
+
"totals": [
|
| 972 |
+
260,
|
| 973 |
+
254,
|
| 974 |
+
248,
|
| 975 |
+
242
|
| 976 |
+
],
|
| 977 |
+
"precisions": [
|
| 978 |
+
0.39615384615384613,
|
| 979 |
+
0.18503937007874016,
|
| 980 |
+
0.10483870967741936,
|
| 981 |
+
0.05785123966942149
|
| 982 |
+
],
|
| 983 |
+
"bp": 1.0,
|
| 984 |
+
"sys_len": 260,
|
| 985 |
+
"ref_len": 249,
|
| 986 |
+
"sacrebleu": 0.1452080150283951,
|
| 987 |
+
"score": 0.1452080150283951,
|
| 988 |
+
"score_name": "sacrebleu",
|
| 989 |
+
"score_ci_low": 0.06608662127664126,
|
| 990 |
+
"score_ci_high": 0.1973933684752931,
|
| 991 |
+
"sacrebleu_ci_low": 0.06608662127664126,
|
| 992 |
+
"sacrebleu_ci_high": 0.1973933684752931
|
| 993 |
+
},
|
| 994 |
+
"mt_flores_101_eng_por": {
|
| 995 |
+
"num_of_instances": 6,
|
| 996 |
+
"counts": [
|
| 997 |
+
178,
|
| 998 |
+
133,
|
| 999 |
+
105,
|
| 1000 |
+
82
|
| 1001 |
+
],
|
| 1002 |
+
"totals": [
|
| 1003 |
+
222,
|
| 1004 |
+
216,
|
| 1005 |
+
210,
|
| 1006 |
+
204
|
| 1007 |
+
],
|
| 1008 |
+
"precisions": [
|
| 1009 |
+
0.8018018018018018,
|
| 1010 |
+
0.6157407407407408,
|
| 1011 |
+
0.5,
|
| 1012 |
+
0.4019607843137255
|
| 1013 |
+
],
|
| 1014 |
+
"bp": 1.0,
|
| 1015 |
+
"sys_len": 222,
|
| 1016 |
+
"ref_len": 222,
|
| 1017 |
+
"sacrebleu": 0.5612478001176117,
|
| 1018 |
+
"score": 0.5612478001176117,
|
| 1019 |
+
"score_name": "sacrebleu",
|
| 1020 |
+
"score_ci_low": 0.48241687526959404,
|
| 1021 |
+
"score_ci_high": 0.6626688785310176,
|
| 1022 |
+
"sacrebleu_ci_low": 0.48241687526959404,
|
| 1023 |
+
"sacrebleu_ci_high": 0.6626688785310176
|
| 1024 |
+
},
|
| 1025 |
+
"mt_flores_101_eng_ron": {
|
| 1026 |
+
"num_of_instances": 6,
|
| 1027 |
+
"counts": [
|
| 1028 |
+
118,
|
| 1029 |
+
58,
|
| 1030 |
+
39,
|
| 1031 |
+
25
|
| 1032 |
+
],
|
| 1033 |
+
"totals": [
|
| 1034 |
+
233,
|
| 1035 |
+
227,
|
| 1036 |
+
221,
|
| 1037 |
+
215
|
| 1038 |
+
],
|
| 1039 |
+
"precisions": [
|
| 1040 |
+
0.5064377682403434,
|
| 1041 |
+
0.2555066079295154,
|
| 1042 |
+
0.17647058823529413,
|
| 1043 |
+
0.11627906976744186
|
| 1044 |
+
],
|
| 1045 |
+
"bp": 1.0,
|
| 1046 |
+
"sys_len": 233,
|
| 1047 |
+
"ref_len": 230,
|
| 1048 |
+
"sacrebleu": 0.2269998269874958,
|
| 1049 |
+
"score": 0.2269998269874958,
|
| 1050 |
+
"score_name": "sacrebleu",
|
| 1051 |
+
"score_ci_low": 0.1385483452866133,
|
| 1052 |
+
"score_ci_high": 0.3477995084922762,
|
| 1053 |
+
"sacrebleu_ci_low": 0.1385483452866133,
|
| 1054 |
+
"sacrebleu_ci_high": 0.3477995084922762
|
| 1055 |
+
},
|
| 1056 |
+
"mt_flores_101_eng_spa": {
|
| 1057 |
+
"num_of_instances": 6,
|
| 1058 |
+
"counts": [
|
| 1059 |
+
155,
|
| 1060 |
+
90,
|
| 1061 |
+
54,
|
| 1062 |
+
35
|
| 1063 |
+
],
|
| 1064 |
+
"totals": [
|
| 1065 |
+
228,
|
| 1066 |
+
222,
|
| 1067 |
+
216,
|
| 1068 |
+
210
|
| 1069 |
+
],
|
| 1070 |
+
"precisions": [
|
| 1071 |
+
0.6798245614035088,
|
| 1072 |
+
0.40540540540540543,
|
| 1073 |
+
0.25,
|
| 1074 |
+
0.16666666666666669
|
| 1075 |
+
],
|
| 1076 |
+
"bp": 0.936327965220313,
|
| 1077 |
+
"sys_len": 228,
|
| 1078 |
+
"ref_len": 243,
|
| 1079 |
+
"sacrebleu": 0.30651150513490355,
|
| 1080 |
+
"score": 0.30651150513490355,
|
| 1081 |
+
"score_name": "sacrebleu",
|
| 1082 |
+
"score_ci_low": 0.20358227359757108,
|
| 1083 |
+
"score_ci_high": 0.4638308959566943,
|
| 1084 |
+
"sacrebleu_ci_low": 0.20358227359757108,
|
| 1085 |
+
"sacrebleu_ci_high": 0.4638308959566943
|
| 1086 |
+
},
|
| 1087 |
+
"mt_flores_101_fra_eng": {
|
| 1088 |
+
"num_of_instances": 6,
|
| 1089 |
+
"counts": [
|
| 1090 |
+
150,
|
| 1091 |
+
100,
|
| 1092 |
+
69,
|
| 1093 |
+
50
|
| 1094 |
+
],
|
| 1095 |
+
"totals": [
|
| 1096 |
+
220,
|
| 1097 |
+
214,
|
| 1098 |
+
208,
|
| 1099 |
+
202
|
| 1100 |
+
],
|
| 1101 |
+
"precisions": [
|
| 1102 |
+
0.6818181818181819,
|
| 1103 |
+
0.4672897196261683,
|
| 1104 |
+
0.3317307692307692,
|
| 1105 |
+
0.24752475247524752
|
| 1106 |
+
],
|
| 1107 |
+
"bp": 1.0,
|
| 1108 |
+
"sys_len": 220,
|
| 1109 |
+
"ref_len": 208,
|
| 1110 |
+
"sacrebleu": 0.40217474848547186,
|
| 1111 |
+
"score": 0.40217474848547186,
|
| 1112 |
+
"score_name": "sacrebleu",
|
| 1113 |
+
"score_ci_low": 0.2657813839292085,
|
| 1114 |
+
"score_ci_high": 0.47861779902585627,
|
| 1115 |
+
"sacrebleu_ci_low": 0.2657813839292085,
|
| 1116 |
+
"sacrebleu_ci_high": 0.47861779902585627
|
| 1117 |
+
},
|
| 1118 |
+
"mt_flores_101_jpn_eng": {
|
| 1119 |
+
"num_of_instances": 6,
|
| 1120 |
+
"counts": [
|
| 1121 |
+
97,
|
| 1122 |
+
47,
|
| 1123 |
+
30,
|
| 1124 |
+
20
|
| 1125 |
+
],
|
| 1126 |
+
"totals": [
|
| 1127 |
+
199,
|
| 1128 |
+
193,
|
| 1129 |
+
187,
|
| 1130 |
+
181
|
| 1131 |
+
],
|
| 1132 |
+
"precisions": [
|
| 1133 |
+
0.48743718592964824,
|
| 1134 |
+
0.24352331606217617,
|
| 1135 |
+
0.16042780748663102,
|
| 1136 |
+
0.11049723756906077
|
| 1137 |
+
],
|
| 1138 |
+
"bp": 0.9557813259386698,
|
| 1139 |
+
"sys_len": 199,
|
| 1140 |
+
"ref_len": 208,
|
| 1141 |
+
"sacrebleu": 0.20470625349251736,
|
| 1142 |
+
"score": 0.20470625349251736,
|
| 1143 |
+
"score_name": "sacrebleu",
|
| 1144 |
+
"score_ci_low": 0.062411490333599384,
|
| 1145 |
+
"score_ci_high": 0.37787926399661254,
|
| 1146 |
+
"sacrebleu_ci_low": 0.062411490333599384,
|
| 1147 |
+
"sacrebleu_ci_high": 0.37787926399661254
|
| 1148 |
+
},
|
| 1149 |
+
"mt_flores_101_kor_eng": {
|
| 1150 |
+
"num_of_instances": 6,
|
| 1151 |
+
"counts": [
|
| 1152 |
+
131,
|
| 1153 |
+
71,
|
| 1154 |
+
46,
|
| 1155 |
+
32
|
| 1156 |
+
],
|
| 1157 |
+
"totals": [
|
| 1158 |
+
204,
|
| 1159 |
+
198,
|
| 1160 |
+
192,
|
| 1161 |
+
186
|
| 1162 |
+
],
|
| 1163 |
+
"precisions": [
|
| 1164 |
+
0.6421568627450981,
|
| 1165 |
+
0.3585858585858586,
|
| 1166 |
+
0.23958333333333331,
|
| 1167 |
+
0.17204301075268816
|
| 1168 |
+
],
|
| 1169 |
+
"bp": 0.9805831403241088,
|
| 1170 |
+
"sys_len": 204,
|
| 1171 |
+
"ref_len": 208,
|
| 1172 |
+
"sacrebleu": 0.30606692682673686,
|
| 1173 |
+
"score": 0.30606692682673686,
|
| 1174 |
+
"score_name": "sacrebleu",
|
| 1175 |
+
"score_ci_low": 0.21391062717714018,
|
| 1176 |
+
"score_ci_high": 0.43560142213481795,
|
| 1177 |
+
"sacrebleu_ci_low": 0.21391062717714018,
|
| 1178 |
+
"sacrebleu_ci_high": 0.43560142213481795
|
| 1179 |
+
},
|
| 1180 |
+
"mt_flores_101_por_eng": {
|
| 1181 |
+
"num_of_instances": 6,
|
| 1182 |
+
"counts": [
|
| 1183 |
+
154,
|
| 1184 |
+
114,
|
| 1185 |
+
87,
|
| 1186 |
+
68
|
| 1187 |
+
],
|
| 1188 |
+
"totals": [
|
| 1189 |
+
212,
|
| 1190 |
+
206,
|
| 1191 |
+
200,
|
| 1192 |
+
194
|
| 1193 |
+
],
|
| 1194 |
+
"precisions": [
|
| 1195 |
+
0.7264150943396227,
|
| 1196 |
+
0.5533980582524272,
|
| 1197 |
+
0.435,
|
| 1198 |
+
0.3505154639175258
|
| 1199 |
+
],
|
| 1200 |
+
"bp": 1.0,
|
| 1201 |
+
"sys_len": 212,
|
| 1202 |
+
"ref_len": 208,
|
| 1203 |
+
"sacrebleu": 0.4975706245279535,
|
| 1204 |
+
"score": 0.4975706245279535,
|
| 1205 |
+
"score_name": "sacrebleu",
|
| 1206 |
+
"score_ci_low": 0.2766953989628893,
|
| 1207 |
+
"score_ci_high": 0.6197882444751728,
|
| 1208 |
+
"sacrebleu_ci_low": 0.2766953989628893,
|
| 1209 |
+
"sacrebleu_ci_high": 0.6197882444751728
|
| 1210 |
+
},
|
| 1211 |
+
"mt_flores_101_ron_eng": {
|
| 1212 |
+
"num_of_instances": 6,
|
| 1213 |
+
"counts": [
|
| 1214 |
+
165,
|
| 1215 |
+
118,
|
| 1216 |
+
88,
|
| 1217 |
+
68
|
| 1218 |
+
],
|
| 1219 |
+
"totals": [
|
| 1220 |
+
225,
|
| 1221 |
+
219,
|
| 1222 |
+
213,
|
| 1223 |
+
207
|
| 1224 |
+
],
|
| 1225 |
+
"precisions": [
|
| 1226 |
+
0.7333333333333333,
|
| 1227 |
+
0.5388127853881278,
|
| 1228 |
+
0.41314553990610325,
|
| 1229 |
+
0.32850241545893716
|
| 1230 |
+
],
|
| 1231 |
+
"bp": 1.0,
|
| 1232 |
+
"sys_len": 225,
|
| 1233 |
+
"ref_len": 208,
|
| 1234 |
+
"sacrebleu": 0.48122173948917607,
|
| 1235 |
+
"score": 0.48122173948917607,
|
| 1236 |
+
"score_name": "sacrebleu",
|
| 1237 |
+
"score_ci_low": 0.3768123540852893,
|
| 1238 |
+
"score_ci_high": 0.6039865556676205,
|
| 1239 |
+
"sacrebleu_ci_low": 0.3768123540852893,
|
| 1240 |
+
"sacrebleu_ci_high": 0.6039865556676205
|
| 1241 |
+
},
|
| 1242 |
+
"mt_flores_101_spa_eng": {
|
| 1243 |
+
"num_of_instances": 6,
|
| 1244 |
+
"counts": [
|
| 1245 |
+
149,
|
| 1246 |
+
91,
|
| 1247 |
+
60,
|
| 1248 |
+
41
|
| 1249 |
+
],
|
| 1250 |
+
"totals": [
|
| 1251 |
+
217,
|
| 1252 |
+
211,
|
| 1253 |
+
205,
|
| 1254 |
+
199
|
| 1255 |
+
],
|
| 1256 |
+
"precisions": [
|
| 1257 |
+
0.6866359447004607,
|
| 1258 |
+
0.4312796208530806,
|
| 1259 |
+
0.29268292682926833,
|
| 1260 |
+
0.20603015075376885
|
| 1261 |
+
],
|
| 1262 |
+
"bp": 1.0,
|
| 1263 |
+
"sys_len": 217,
|
| 1264 |
+
"ref_len": 208,
|
| 1265 |
+
"sacrebleu": 0.36555557390142274,
|
| 1266 |
+
"score": 0.36555557390142274,
|
| 1267 |
+
"score_name": "sacrebleu",
|
| 1268 |
+
"score_ci_low": 0.1895298667168946,
|
| 1269 |
+
"score_ci_high": 0.4258516382579568,
|
| 1270 |
+
"sacrebleu_ci_low": 0.1895298667168946,
|
| 1271 |
+
"sacrebleu_ci_high": 0.4258516382579568
|
| 1272 |
+
},
|
| 1273 |
+
"score": 0.32538164372658684,
|
| 1274 |
+
"score_name": "subsets_mean",
|
| 1275 |
+
"num_of_instances": 90
|
| 1276 |
+
},
+"score": 0.45382532802485587,
+"score_name": "subsets_mean",
+"num_of_instances": 1537
+}
+}
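For reference, the per-group scores in a result file with this layout can be pulled out with a few lines of Python. This is only a minimal sketch, not part of the commit: it assumes the key names visible in the diff above ("results", "score", "score_name", "num_of_instances") and uses the renamed file listed just below as an example path.

```python
import json

# Example path: one of the result files renamed in this commit.
with open("results/bluebench/2025-07-03T15-41-32_evaluation_results.json") as f:
    report = json.load(f)

# "results" holds the per-group objects plus the overall aggregate fields
# ("score", "score_name", "num_of_instances"), so skip the non-dict entries.
for group, metrics in report["results"].items():
    if not isinstance(metrics, dict):
        continue
    print(f'{group}: {metrics["score"]:.4f} ({metrics["score_name"]}, n={metrics["num_of_instances"]})')
```

The `isinstance` guard matters because the aggregate score sits alongside the group objects inside "results", as the closing lines of the file above show.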
results/bluebench/{2025-07-02T15-54-03_evaluation_results.json β 2025-07-03T15-41-32_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
| 1 |
{
|
| 2 |
"environment_info": {
|
| 3 |
-
"timestamp_utc": "2025-07-
|
| 4 |
"command_line_invocation": [
|
| 5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
"--tasks",
|
|
@@ -42,7 +42,7 @@
|
|
| 42 |
"cache_dir": null
|
| 43 |
},
|
| 44 |
"unitxt_version": "1.25.0",
|
| 45 |
-
"unitxt_commit_hash": "
|
| 46 |
"python_version": "3.10.18",
|
| 47 |
"system": "Linux",
|
| 48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
@@ -176,13 +176,13 @@
|
|
| 176 |
"results": {
|
| 177 |
"bias": {
|
| 178 |
"safety_bbq_age": {
|
| 179 |
-
"accuracy":
|
| 180 |
-
"accuracy_ci_low":
|
| 181 |
"accuracy_ci_high": 1.0,
|
| 182 |
"score_name": "accuracy",
|
| 183 |
-
"score":
|
| 184 |
"score_ci_high": 1.0,
|
| 185 |
-
"score_ci_low":
|
| 186 |
"num_of_instances": 9
|
| 187 |
},
|
| 188 |
"safety_bbq_disability_status": {
|
|
@@ -196,13 +196,13 @@
|
|
| 196 |
"num_of_instances": 9
|
| 197 |
},
|
| 198 |
"safety_bbq_gender_identity": {
|
| 199 |
-
"accuracy": 0.
|
| 200 |
-
"accuracy_ci_low": 0.
|
| 201 |
"accuracy_ci_high": 1.0,
|
| 202 |
"score_name": "accuracy",
|
| 203 |
-
"score": 0.
|
| 204 |
"score_ci_high": 1.0,
|
| 205 |
-
"score_ci_low": 0.
|
| 206 |
"num_of_instances": 9
|
| 207 |
},
|
| 208 |
"safety_bbq_nationality": {
|
|
@@ -226,13 +226,13 @@
|
|
| 226 |
"num_of_instances": 9
|
| 227 |
},
|
| 228 |
"safety_bbq_race_ethnicity": {
|
| 229 |
-
"accuracy": 0
|
| 230 |
-
"accuracy_ci_low": 0
|
| 231 |
"accuracy_ci_high": 1.0,
|
| 232 |
"score_name": "accuracy",
|
| 233 |
-
"score": 0
|
| 234 |
"score_ci_high": 1.0,
|
| 235 |
-
"score_ci_low": 0
|
| 236 |
"num_of_instances": 9
|
| 237 |
},
|
| 238 |
"safety_bbq_race_x_gender": {
|
|
@@ -246,12 +246,12 @@
|
|
| 246 |
"num_of_instances": 9
|
| 247 |
},
|
| 248 |
"safety_bbq_race_x_ses": {
|
| 249 |
-
"accuracy": 0.
|
| 250 |
"accuracy_ci_low": 0.3333333333333333,
|
| 251 |
-
"accuracy_ci_high":
|
| 252 |
"score_name": "accuracy",
|
| 253 |
-
"score": 0.
|
| 254 |
-
"score_ci_high":
|
| 255 |
"score_ci_low": 0.3333333333333333,
|
| 256 |
"num_of_instances": 9
|
| 257 |
},
|
|
@@ -266,13 +266,13 @@
|
|
| 266 |
"num_of_instances": 9
|
| 267 |
},
|
| 268 |
"safety_bbq_ses": {
|
| 269 |
-
"accuracy": 0.
|
| 270 |
-
"accuracy_ci_low": 0.
|
| 271 |
-
"accuracy_ci_high": 0.
|
| 272 |
"score_name": "accuracy",
|
| 273 |
-
"score": 0.
|
| 274 |
-
"score_ci_high": 0.
|
| 275 |
-
"score_ci_low": 0.
|
| 276 |
"num_of_instances": 9
|
| 277 |
},
|
| 278 |
"safety_bbq_sexual_orientation": {
|
|
@@ -285,54 +285,54 @@
|
|
| 285 |
"score_ci_low": 0.4444444444444444,
|
| 286 |
"num_of_instances": 9
|
| 287 |
},
|
| 288 |
-
"score": 0.
|
| 289 |
"score_name": "subsets_mean",
|
| 290 |
"num_of_instances": 99
|
| 291 |
},
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
| 296 |
-
"score": 0.
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
-
"score": 0.
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
-
"f1_Person": 0.
|
| 307 |
-
"f1_Organization": 0.
|
| 308 |
-
"f1_Location": 0.
|
| 309 |
-
"f1_macro": 0.
|
| 310 |
-
"recall_macro": 0.
|
| 311 |
-
"precision_macro": 0.
|
| 312 |
-
"in_classes_support": 0
|
| 313 |
-
"f1_micro": 0.
|
| 314 |
-
"recall_micro": 0.
|
| 315 |
-
"precision_micro": 0.
|
| 316 |
-
"score": 0.
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
-
"score_ci_low": 0.
|
| 319 |
-
"score_ci_high": 0.
|
| 320 |
-
"f1_micro_ci_low": 0.
|
| 321 |
-
"f1_micro_ci_high": 0.
|
| 322 |
},
|
| 323 |
-
"score": 0.
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
| 327 |
"knowledge": {
|
| 328 |
"mmlu_pro_biology": {
|
| 329 |
-
"accuracy": 0.
|
| 330 |
-
"accuracy_ci_low": 0.
|
| 331 |
-
"accuracy_ci_high":
|
| 332 |
"score_name": "accuracy",
|
| 333 |
-
"score": 0.
|
| 334 |
-
"score_ci_high":
|
| 335 |
-
"score_ci_low": 0.
|
| 336 |
"num_of_instances": 7
|
| 337 |
},
|
| 338 |
"mmlu_pro_business": {
|
|
@@ -346,43 +346,43 @@
|
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
| 349 |
-
"accuracy": 0.
|
| 350 |
-
"accuracy_ci_low": 0.
|
| 351 |
-
"accuracy_ci_high": 0.
|
| 352 |
"score_name": "accuracy",
|
| 353 |
-
"score": 0.
|
| 354 |
-
"score_ci_high": 0.
|
| 355 |
-
"score_ci_low": 0.
|
| 356 |
"num_of_instances": 7
|
| 357 |
},
|
| 358 |
"mmlu_pro_computer_science": {
|
| 359 |
-
"accuracy": 0.
|
| 360 |
-
"accuracy_ci_low": 0.
|
| 361 |
"accuracy_ci_high": 1.0,
|
| 362 |
"score_name": "accuracy",
|
| 363 |
-
"score": 0.
|
| 364 |
"score_ci_high": 1.0,
|
| 365 |
-
"score_ci_low": 0.
|
| 366 |
"num_of_instances": 7
|
| 367 |
},
|
| 368 |
"mmlu_pro_economics": {
|
| 369 |
-
"accuracy": 0.
|
| 370 |
-
"accuracy_ci_low": 0.
|
| 371 |
"accuracy_ci_high": 1.0,
|
| 372 |
"score_name": "accuracy",
|
| 373 |
-
"score": 0.
|
| 374 |
"score_ci_high": 1.0,
|
| 375 |
-
"score_ci_low": 0.
|
| 376 |
"num_of_instances": 7
|
| 377 |
},
|
| 378 |
"mmlu_pro_engineering": {
|
| 379 |
-
"accuracy": 0.
|
| 380 |
-
"accuracy_ci_low": 0.
|
| 381 |
-
"accuracy_ci_high": 0.
|
| 382 |
"score_name": "accuracy",
|
| 383 |
-
"score": 0.
|
| 384 |
-
"score_ci_high": 0.
|
| 385 |
-
"score_ci_low": 0.
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
|
@@ -396,23 +396,23 @@
|
|
| 396 |
"num_of_instances": 7
|
| 397 |
},
|
| 398 |
"mmlu_pro_history": {
|
| 399 |
-
"accuracy": 0.
|
| 400 |
"accuracy_ci_low": 0.0,
|
| 401 |
-
"accuracy_ci_high": 0.
|
| 402 |
"score_name": "accuracy",
|
| 403 |
-
"score": 0.
|
| 404 |
-
"score_ci_high": 0.
|
| 405 |
"score_ci_low": 0.0,
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
-
"accuracy": 0.
|
| 410 |
-
"accuracy_ci_low": 0.
|
| 411 |
"accuracy_ci_high": 0.7142857142857143,
|
| 412 |
"score_name": "accuracy",
|
| 413 |
-
"score": 0.
|
| 414 |
"score_ci_high": 0.7142857142857143,
|
| 415 |
-
"score_ci_low": 0.
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
|
@@ -436,23 +436,23 @@
|
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
| 438 |
"mmlu_pro_philosophy": {
|
| 439 |
-
"accuracy": 0.
|
| 440 |
-
"accuracy_ci_low": 0.
|
| 441 |
-
"accuracy_ci_high": 0
|
| 442 |
"score_name": "accuracy",
|
| 443 |
-
"score": 0.
|
| 444 |
-
"score_ci_high": 0
|
| 445 |
-
"score_ci_low": 0.
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
| 449 |
-
"accuracy": 0.
|
| 450 |
-
"accuracy_ci_low": 0.
|
| 451 |
-
"accuracy_ci_high": 0.
|
| 452 |
"score_name": "accuracy",
|
| 453 |
-
"score": 0.
|
| 454 |
-
"score_ci_high": 0.
|
| 455 |
-
"score_ci_low": 0.
|
| 456 |
"num_of_instances": 7
|
| 457 |
},
|
| 458 |
"mmlu_pro_psychology": {
|
|
@@ -465,273 +465,273 @@
|
|
| 465 |
"score_ci_low": 0.14285714285714285,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
-
"score": 0.
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
-
"f1_macro": 0.
|
| 475 |
-
"f1_suggestive": 0.
|
| 476 |
-
"f1_arbitrary": 0.
|
| 477 |
"f1_generic": 0.5,
|
| 478 |
-
"f1_fanciful": 0.
|
| 479 |
"f1_descriptive": 0.8,
|
| 480 |
-
"f1_macro_ci_low": 0.
|
| 481 |
-
"f1_macro_ci_high": 0.
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
-
"score": 0.
|
| 484 |
-
"score_ci_high": 0.
|
| 485 |
-
"score_ci_low": 0.
|
| 486 |
"num_of_instances": 20,
|
| 487 |
-
"accuracy": 0.
|
| 488 |
-
"accuracy_ci_low": 0.
|
| 489 |
-
"accuracy_ci_high": 0.
|
| 490 |
-
"f1_micro": 0.
|
| 491 |
-
"f1_micro_ci_low": 0.
|
| 492 |
-
"f1_micro_ci_high": 0.
|
| 493 |
},
|
| 494 |
"legalbench_corporate_lobbying": {
|
| 495 |
-
"f1_macro": 0.
|
| 496 |
-
"f1_no": 0.
|
| 497 |
-
"f1_yes": 0.
|
| 498 |
-
"f1_macro_ci_low": 0.
|
| 499 |
-
"f1_macro_ci_high": 0.
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
-
"score": 0.
|
| 502 |
-
"score_ci_high": 0.
|
| 503 |
-
"score_ci_low": 0.
|
| 504 |
"num_of_instances": 20,
|
| 505 |
-
"accuracy": 0.
|
| 506 |
-
"accuracy_ci_low": 0.
|
| 507 |
-
"accuracy_ci_high": 0.
|
| 508 |
-
"f1_micro": 0.
|
| 509 |
-
"f1_micro_ci_low": 0.
|
| 510 |
-
"f1_micro_ci_high": 0.
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
-
"f1_macro": 0.
|
| 514 |
"f1_conclusion": 0.2857142857142857,
|
| 515 |
"f1_analysis": 0.4444444444444444,
|
| 516 |
"f1_decree": 0.0,
|
| 517 |
"f1_issue": 0.2857142857142857,
|
| 518 |
-
"f1_procedural history": 0.
|
| 519 |
-
"f1_facts": 0.
|
| 520 |
"f1_rule": 0.0,
|
| 521 |
-
"f1_macro_ci_low": 0.
|
| 522 |
-
"f1_macro_ci_high": 0.
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
-
"score": 0.
|
| 525 |
-
"score_ci_high": 0.
|
| 526 |
-
"score_ci_low": 0.
|
| 527 |
"num_of_instances": 20,
|
| 528 |
-
"accuracy": 0.
|
| 529 |
-
"accuracy_ci_low": 0.
|
| 530 |
-
"accuracy_ci_high": 0.
|
| 531 |
-
"f1_micro": 0.
|
| 532 |
-
"f1_micro_ci_low": 0.
|
| 533 |
-
"f1_micro_ci_high": 0.
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
-
"f1_macro": 0.
|
| 537 |
-
"f1_yes": 0.
|
| 538 |
-
"f1_no": 0.
|
| 539 |
-
"f1_macro_ci_low": 0.
|
| 540 |
-
"f1_macro_ci_high": 0.
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
-
"score": 0.
|
| 543 |
-
"score_ci_high": 0.
|
| 544 |
-
"score_ci_low": 0.
|
| 545 |
"num_of_instances": 20,
|
| 546 |
-
"accuracy": 0.
|
| 547 |
-
"accuracy_ci_low": 0.
|
| 548 |
-
"accuracy_ci_high": 0.
|
| 549 |
-
"f1_micro": 0.
|
| 550 |
-
"f1_micro_ci_low": 0.
|
| 551 |
-
"f1_micro_ci_high": 0.
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
-
"f1_macro": 0.
|
| 555 |
-
"f1_yes": 0.
|
| 556 |
-
"f1_no": 0.
|
| 557 |
-
"f1_macro_ci_low": 0.
|
| 558 |
"f1_macro_ci_high": 1.0,
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
-
"score": 0.
|
| 561 |
-
"score_ci_high":
|
| 562 |
-
"score_ci_low": 0.
|
| 563 |
"num_of_instances": 20,
|
| 564 |
-
"accuracy": 0.
|
| 565 |
-
"accuracy_ci_low": 0.
|
| 566 |
-
"accuracy_ci_high":
|
| 567 |
-
"f1_micro": 0.
|
| 568 |
-
"f1_micro_ci_low": 0.
|
| 569 |
-
"f1_micro_ci_high":
|
| 570 |
},
|
| 571 |
-
"score": 0.
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
-
"f1_macro": 0.
|
| 578 |
"f1_cars": 0.75,
|
| 579 |
"f1_windows x": 0.3333333333333333,
|
| 580 |
-
"f1_computer graphics": 0.
|
| 581 |
"f1_atheism": 0.0,
|
| 582 |
-
"f1_religion": 0.
|
| 583 |
"f1_medicine": 0.6666666666666666,
|
| 584 |
-
"f1_christianity": 0.
|
| 585 |
"f1_microsoft windows": 0.6666666666666666,
|
| 586 |
-
"f1_middle east": 0.
|
| 587 |
-
"f1_politics": 0.
|
| 588 |
-
"f1_motorcycles": 0.
|
| 589 |
"f1_pc hardware": 0.6666666666666666,
|
| 590 |
"f1_mac hardware": 0.5,
|
| 591 |
-
"f1_electronics": 0.
|
| 592 |
"f1_for sale": 0.6666666666666666,
|
| 593 |
-
"f1_guns": 0.
|
| 594 |
"f1_space": 0.75,
|
| 595 |
-
"f1_cryptography": 0.
|
| 596 |
-
"f1_baseball": 0
|
| 597 |
-
"f1_hockey": 0.
|
| 598 |
-
"f1_macro_ci_low": 0.
|
| 599 |
-
"f1_macro_ci_high": 0.
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
-
"score": 0.
|
| 602 |
-
"score_ci_high": 0.
|
| 603 |
-
"score_ci_low": 0.
|
| 604 |
"num_of_instances": 100,
|
| 605 |
-
"accuracy": 0.
|
| 606 |
-
"accuracy_ci_low": 0.
|
| 607 |
-
"accuracy_ci_high": 0.
|
| 608 |
-
"f1_micro": 0.
|
| 609 |
-
"f1_micro_ci_low": 0.
|
| 610 |
-
"f1_micro_ci_high": 0.
|
| 611 |
},
|
| 612 |
-
"score": 0.
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
-
"f1_macro": 0.
|
| 619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
| 620 |
-
"
|
| 621 |
-
"
|
| 622 |
-
"
|
| 623 |
-
"
|
| 624 |
-
"
|
| 625 |
-
"
|
| 626 |
-
"f1_macro_ci_low": 0.
|
| 627 |
-
"f1_macro_ci_high": 0.
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
"score": 0.8324873096446701,
|
| 630 |
-
"score_ci_high": 0.
|
| 631 |
-
"score_ci_low": 0.
|
| 632 |
"num_of_instances": 100,
|
| 633 |
"accuracy": 0.82,
|
| 634 |
"accuracy_ci_low": 0.73,
|
| 635 |
"accuracy_ci_high": 0.89,
|
| 636 |
"f1_micro": 0.8324873096446701,
|
| 637 |
-
"f1_micro_ci_low": 0.
|
| 638 |
-
"f1_micro_ci_high": 0.
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
-
"f1_macro": 0.
|
| 642 |
-
"f1_mortgages and loans": 0.
|
| 643 |
-
"f1_credit card": 0.
|
| 644 |
-
"f1_debt collection": 0.
|
| 645 |
-
"
|
| 646 |
-
"
|
| 647 |
-
"f1_macro_ci_low": 0.
|
| 648 |
-
"f1_macro_ci_high": 0.
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
-
"score": 0.
|
| 651 |
-
"score_ci_high": 0.
|
| 652 |
-
"score_ci_low": 0.
|
| 653 |
"num_of_instances": 50,
|
| 654 |
-
"accuracy": 0.
|
| 655 |
-
"accuracy_ci_low": 0.
|
| 656 |
-
"accuracy_ci_high": 0.
|
| 657 |
-
"f1_micro": 0.
|
| 658 |
-
"f1_micro_ci_low": 0.
|
| 659 |
-
"f1_micro_ci_high": 0.
|
| 660 |
},
|
| 661 |
-
"score": 0.
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
-
"
|
| 669 |
-
"
|
|
|
|
| 670 |
"score_name": "program_accuracy",
|
| 671 |
-
"
|
| 672 |
-
"
|
| 673 |
-
"
|
| 674 |
-
"
|
| 675 |
-
"
|
| 676 |
-
"
|
| 677 |
-
"execution_accuracy_ci_high": 0.19
|
| 678 |
},
|
| 679 |
-
"score": 0.
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
-
"precision": 0.
|
| 686 |
-
"recall": 0.
|
| 687 |
-
"f1": 0.
|
| 688 |
-
"precision_ci_low": 0.
|
| 689 |
-
"precision_ci_high": 0.
|
| 690 |
-
"recall_ci_low": 0.
|
| 691 |
-
"recall_ci_high": 0.
|
| 692 |
-
"f1_ci_low": 0.
|
| 693 |
-
"f1_ci_high": 0.
|
| 694 |
"score_name": "f1",
|
| 695 |
-
"score": 0.
|
| 696 |
-
"score_ci_high": 0.
|
| 697 |
-
"score_ci_low": 0.
|
| 698 |
"num_of_instances": 100,
|
| 699 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
| 700 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
| 701 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
| 702 |
-
"faithfullness_f1_token_overlap": 0.
|
| 703 |
-
"faithfullness_recall_token_overlap": 0.
|
| 704 |
-
"faithfullness_precision_token_overlap": 0.
|
| 705 |
-
"correctness_f1_token_overlap": 0.
|
| 706 |
-
"correctness_recall_token_overlap": 0.
|
| 707 |
-
"correctness_precision_token_overlap": 0.
|
| 708 |
},
|
| 709 |
-
"score": 0.
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
| 713 |
"reasoning": {
|
| 714 |
"hellaswag": {
|
| 715 |
-
"accuracy": 0.
|
| 716 |
-
"accuracy_ci_low": 0.
|
| 717 |
-
"accuracy_ci_high": 0.
|
| 718 |
"score_name": "accuracy",
|
| 719 |
-
"score": 0.
|
| 720 |
-
"score_ci_high": 0.
|
| 721 |
-
"score_ci_low": 0.
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
-
"accuracy": 0.
|
| 726 |
-
"accuracy_ci_low": 0.
|
| 727 |
-
"accuracy_ci_high": 0.
|
| 728 |
"score_name": "accuracy",
|
| 729 |
-
"score": 0.
|
| 730 |
-
"score_ci_high": 0.
|
| 731 |
-
"score_ci_low": 0.
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
-
"score": 0.
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
|
@@ -741,66 +741,66 @@
|
|
| 741 |
"severity_high": 1.0,
|
| 742 |
"severity_medium": 0.0,
|
| 743 |
"severity_low": 99.0,
|
| 744 |
-
"category_harmful_info": 0.
|
| 745 |
-
"category_deception": 0.
|
| 746 |
-
"category_explicit_content": 0.
|
| 747 |
-
"category_violence": 0.
|
| 748 |
-
"category_discrimination": 0.
|
| 749 |
-
"category_substance_abuse": 0.
|
| 750 |
-
"category_pii": 0.
|
| 751 |
-
"safety": 0.
|
| 752 |
-
"safety_ci_low": 0.
|
| 753 |
-
"safety_ci_high": 0.
|
| 754 |
"score_name": "safety",
|
| 755 |
-
"score": 0.
|
| 756 |
-
"score_ci_high": 0.
|
| 757 |
-
"score_ci_low": 0.
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
-
"score": 0.
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
-
"
|
| 768 |
-
"
|
|
|
|
|
|
|
|
|
|
| 769 |
"score_name": "rougeL",
|
| 770 |
-
"
|
| 771 |
-
"
|
| 772 |
-
"
|
| 773 |
-
"
|
| 774 |
-
"
|
| 775 |
-
"
|
| 776 |
-
"
|
| 777 |
-
"
|
| 778 |
-
"
|
| 779 |
-
"
|
| 780 |
-
"rouge2_ci_high": 0.241342036471108,
|
| 781 |
-
"rougeLsum_ci_low": 0.3517918624596096,
|
| 782 |
-
"rougeLsum_ci_high": 0.3988365832103701
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
-
"
|
| 787 |
-
"
|
|
|
|
|
|
|
|
|
|
| 788 |
"score_name": "rougeL",
|
| 789 |
-
"
|
| 790 |
-
"
|
| 791 |
-
"
|
| 792 |
-
"
|
| 793 |
-
"
|
| 794 |
-
"
|
| 795 |
-
"
|
| 796 |
-
"
|
| 797 |
-
"
|
| 798 |
-
"
|
| 799 |
-
"rouge2_ci_high": 0.019986121773575755,
|
| 800 |
-
"rougeLsum_ci_low": 0.08483297823243409,
|
| 801 |
-
"rougeLsum_ci_high": 0.10907235867166053
|
| 802 |
},
|
| 803 |
-
"score": 0.
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
@@ -808,196 +808,196 @@
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
-
0.
|
| 824 |
-
0.
|
| 825 |
-
0.
|
| 826 |
-
0.
|
| 827 |
],
|
| 828 |
"bp": 1.0,
|
| 829 |
-
"sys_len":
|
| 830 |
"ref_len": 208,
|
| 831 |
-
"sacrebleu": 0.
|
| 832 |
-
"score": 0.
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
-
"score_ci_low": 0.
|
| 835 |
-
"score_ci_high": 0.
|
| 836 |
-
"sacrebleu_ci_low": 0.
|
| 837 |
-
"sacrebleu_ci_high": 0.
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
-
0.
|
| 855 |
-
0.
|
| 856 |
-
0.
|
| 857 |
-
0.
|
| 858 |
],
|
| 859 |
-
"bp":
|
| 860 |
-
"sys_len":
|
| 861 |
"ref_len": 208,
|
| 862 |
-
"sacrebleu": 0.
|
| 863 |
-
"score": 0.
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
-
"score_ci_low": 0.
|
| 866 |
-
"score_ci_high": 0.
|
| 867 |
-
"sacrebleu_ci_low": 0.
|
| 868 |
-
"sacrebleu_ci_high": 0.
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
-
0.
|
| 886 |
-
0.
|
| 887 |
-
0.
|
| 888 |
-
0.
|
| 889 |
],
|
| 890 |
-
"bp": 0.
|
| 891 |
-
"sys_len":
|
| 892 |
"ref_len": 209,
|
| 893 |
-
"sacrebleu": 0.
|
| 894 |
-
"score": 0.
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
-
"score_ci_low": 0.
|
| 897 |
-
"score_ci_high": 0.
|
| 898 |
-
"sacrebleu_ci_low": 0.
|
| 899 |
-
"sacrebleu_ci_high": 0.
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
-
0.
|
| 917 |
-
0.
|
| 918 |
-
0.
|
| 919 |
-
0.
|
| 920 |
],
|
| 921 |
-
"bp": 0.
|
| 922 |
-
"sys_len":
|
| 923 |
"ref_len": 216,
|
| 924 |
-
"sacrebleu": 0.
|
| 925 |
-
"score": 0.
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
-
"score_ci_low": 0.
|
| 928 |
-
"score_ci_high": 0.
|
| 929 |
-
"sacrebleu_ci_low": 0.
|
| 930 |
-
"sacrebleu_ci_high": 0.
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
186,
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
-
0.
|
| 948 |
-
0.
|
| 949 |
-
0.
|
| 950 |
-
0.
|
| 951 |
],
|
| 952 |
-
"bp":
|
| 953 |
-
"sys_len":
|
| 954 |
"ref_len": 235,
|
| 955 |
-
"sacrebleu": 0.
|
| 956 |
-
"score": 0.
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
-
"score_ci_low": 0.
|
| 959 |
-
"score_ci_high": 0.
|
| 960 |
-
"sacrebleu_ci_low": 0.
|
| 961 |
-
"sacrebleu_ci_high": 0.
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
-
|
| 967 |
-
|
| 968 |
-
|
| 969 |
-
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
-
0.
|
| 979 |
-
0.
|
| 980 |
-
0.
|
| 981 |
-
0.
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
-
"sys_len":
|
| 985 |
"ref_len": 249,
|
| 986 |
-
"sacrebleu": 0.
|
| 987 |
-
"score": 0.
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
-
"score_ci_low": 0.
|
| 990 |
-
"score_ci_high": 0.
|
| 991 |
-
"sacrebleu_ci_low": 0.
|
| 992 |
-
"sacrebleu_ci_high": 0.
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
230,
|
|
@@ -1006,275 +1006,275 @@
|
|
| 1006 |
212
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
-
0.
|
| 1010 |
-
0.
|
| 1011 |
-
0.
|
| 1012 |
-
0.
|
| 1013 |
],
|
| 1014 |
"bp": 1.0,
|
| 1015 |
"sys_len": 230,
|
| 1016 |
"ref_len": 222,
|
| 1017 |
-
"sacrebleu": 0.
|
| 1018 |
-
"score": 0.
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
-
"score_ci_low": 0.
|
| 1021 |
-
"score_ci_high": 0.
|
| 1022 |
-
"sacrebleu_ci_low": 0.
|
| 1023 |
-
"sacrebleu_ci_high": 0.
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
-
|
| 1029 |
98,
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
-
0.
|
| 1041 |
-
0.
|
| 1042 |
-
0.
|
| 1043 |
-
0.
|
| 1044 |
],
|
| 1045 |
"bp": 1.0,
|
| 1046 |
-
"sys_len":
|
| 1047 |
"ref_len": 230,
|
| 1048 |
-
"sacrebleu": 0.
|
| 1049 |
-
"score": 0.
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
-
"score_ci_low": 0.
|
| 1052 |
-
"score_ci_high": 0.
|
| 1053 |
-
"sacrebleu_ci_low": 0.
|
| 1054 |
-
"sacrebleu_ci_high": 0.
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
-
|
| 1066 |
-
|
| 1067 |
-
|
| 1068 |
-
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
-
0.
|
| 1072 |
-
0.
|
| 1073 |
-
0.
|
| 1074 |
-
0.
|
| 1075 |
],
|
| 1076 |
-
"bp": 0.
|
| 1077 |
-
"sys_len":
|
| 1078 |
"ref_len": 243,
|
| 1079 |
-
"sacrebleu": 0.
|
| 1080 |
-
"score": 0.
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
-
"score_ci_low": 0.
|
| 1083 |
-
"score_ci_high": 0.
|
| 1084 |
-
"sacrebleu_ci_low": 0.
|
| 1085 |
-
"sacrebleu_ci_high": 0.
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
],
|
| 1095 |
"totals": [
|
|
|
|
| 1096 |
214,
|
| 1097 |
208,
|
| 1098 |
-
202
|
| 1099 |
-
196
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
-
0.
|
| 1103 |
-
0.
|
| 1104 |
-
0.
|
| 1105 |
-
0.
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
-
"sys_len":
|
| 1109 |
"ref_len": 208,
|
| 1110 |
-
"sacrebleu": 0.
|
| 1111 |
-
"score": 0.
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
-
"score_ci_low": 0.
|
| 1114 |
-
"score_ci_high": 0.
|
| 1115 |
-
"sacrebleu_ci_low": 0.
|
| 1116 |
-
"sacrebleu_ci_high": 0.
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
-
|
| 1122 |
82,
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
-
0.
|
| 1134 |
-
0.
|
| 1135 |
-
0.
|
| 1136 |
-
0.
|
| 1137 |
],
|
| 1138 |
-
"bp": 0.
|
| 1139 |
-
"sys_len":
|
| 1140 |
"ref_len": 208,
|
| 1141 |
-
"sacrebleu": 0.
|
| 1142 |
-
"score": 0.
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
-
"score_ci_low": 0.
|
| 1145 |
-
"score_ci_high": 0.
|
| 1146 |
-
"sacrebleu_ci_low": 0.
|
| 1147 |
-
"sacrebleu_ci_high": 0.
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
-
|
| 1153 |
63,
|
| 1154 |
-
|
| 1155 |
-
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
-
|
| 1161 |
-
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
-
0.
|
| 1165 |
-
0.
|
| 1166 |
-
0.
|
| 1167 |
-
0.
|
| 1168 |
],
|
| 1169 |
-
"bp": 0.
|
| 1170 |
-
"sys_len":
|
| 1171 |
"ref_len": 208,
|
| 1172 |
-
"sacrebleu": 0.
|
| 1173 |
-
"score": 0.
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
-
"score_ci_low": 0.
|
| 1176 |
-
"score_ci_high": 0.
|
| 1177 |
-
"sacrebleu_ci_low": 0.
|
| 1178 |
-
"sacrebleu_ci_high": 0.
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
-
|
| 1184 |
-
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
-
0.
|
| 1196 |
-
0.
|
| 1197 |
-
0.
|
| 1198 |
-
0.
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
-
"sys_len":
|
| 1202 |
"ref_len": 208,
|
| 1203 |
-
"sacrebleu": 0.
|
| 1204 |
-
"score": 0.
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
-
"score_ci_low": 0.
|
| 1207 |
-
"score_ci_high": 0.
|
| 1208 |
-
"sacrebleu_ci_low": 0.
|
| 1209 |
-
"sacrebleu_ci_high": 0.
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
-
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
-
0.
|
| 1227 |
-
0.
|
| 1228 |
-
0.
|
| 1229 |
-
0.
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
-
"sys_len":
|
| 1233 |
"ref_len": 208,
|
| 1234 |
-
"sacrebleu": 0.
|
| 1235 |
-
"score": 0.
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
-
"score_ci_low": 0.
|
| 1238 |
-
"score_ci_high": 0.
|
| 1239 |
-
"sacrebleu_ci_low": 0.
|
| 1240 |
-
"sacrebleu_ci_high": 0.
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
-
|
| 1246 |
-
|
| 1247 |
-
|
| 1248 |
-
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
-
0.
|
| 1258 |
-
0.
|
| 1259 |
-
0.
|
| 1260 |
-
0.
|
| 1261 |
],
|
| 1262 |
"bp": 1.0,
|
| 1263 |
-
"sys_len":
|
| 1264 |
"ref_len": 208,
|
| 1265 |
-
"sacrebleu": 0.
|
| 1266 |
-
"score": 0.
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
-
"score_ci_low": 0.
|
| 1269 |
-
"score_ci_high": 0.
|
| 1270 |
-
"sacrebleu_ci_low": 0.
|
| 1271 |
-
"sacrebleu_ci_high": 0.
|
| 1272 |
},
|
| 1273 |
-
"score": 0.
|
| 1274 |
"score_name": "subsets_mean",
|
| 1275 |
"num_of_instances": 90
|
| 1276 |
},
|
| 1277 |
-
"score": 0.
|
| 1278 |
"score_name": "subsets_mean",
|
| 1279 |
"num_of_instances": 1537
|
| 1280 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"environment_info": {
|
| 3 |
+
"timestamp_utc": "2025-07-03T19:41:29.618401Z",
|
| 4 |
"command_line_invocation": [
|
| 5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
"--tasks",
|
|
|
|
| 42 |
"cache_dir": null
|
| 43 |
},
|
| 44 |
"unitxt_version": "1.25.0",
|
| 45 |
+
"unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
|
| 46 |
"python_version": "3.10.18",
|
| 47 |
"system": "Linux",
|
| 48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
|
| 176 |
"results": {
|
| 177 |
"bias": {
|
| 178 |
"safety_bbq_age": {
|
| 179 |
+
"accuracy": 0.8888888888888888,
|
| 180 |
+
"accuracy_ci_low": 0.46041936253217447,
|
| 181 |
"accuracy_ci_high": 1.0,
|
| 182 |
"score_name": "accuracy",
|
| 183 |
+
"score": 0.8888888888888888,
|
| 184 |
"score_ci_high": 1.0,
|
| 185 |
+
"score_ci_low": 0.46041936253217447,
|
| 186 |
"num_of_instances": 9
|
| 187 |
},
|
| 188 |
"safety_bbq_disability_status": {
|
|
|
|
| 196 |
"num_of_instances": 9
|
| 197 |
},
|
| 198 |
"safety_bbq_gender_identity": {
|
| 199 |
+
"accuracy": 0.7777777777777778,
|
| 200 |
+
"accuracy_ci_low": 0.4444444444444444,
|
| 201 |
"accuracy_ci_high": 1.0,
|
| 202 |
"score_name": "accuracy",
|
| 203 |
+
"score": 0.7777777777777778,
|
| 204 |
"score_ci_high": 1.0,
|
| 205 |
+
"score_ci_low": 0.4444444444444444,
|
| 206 |
"num_of_instances": 9
|
| 207 |
},
|
| 208 |
"safety_bbq_nationality": {
|
|
|
|
| 226 |
"num_of_instances": 9
|
| 227 |
},
|
| 228 |
"safety_bbq_race_ethnicity": {
|
| 229 |
+
"accuracy": 1.0,
|
| 230 |
+
"accuracy_ci_low": 1.0,
|
| 231 |
"accuracy_ci_high": 1.0,
|
| 232 |
"score_name": "accuracy",
|
| 233 |
+
"score": 1.0,
|
| 234 |
"score_ci_high": 1.0,
|
| 235 |
+
"score_ci_low": 1.0,
|
| 236 |
"num_of_instances": 9
|
| 237 |
},
|
| 238 |
"safety_bbq_race_x_gender": {
|
|
|
|
| 246 |
"num_of_instances": 9
|
| 247 |
},
|
| 248 |
"safety_bbq_race_x_ses": {
|
| 249 |
+
"accuracy": 0.6666666666666666,
|
| 250 |
"accuracy_ci_low": 0.3333333333333333,
|
| 251 |
+
"accuracy_ci_high": 0.8888888888888888,
|
| 252 |
"score_name": "accuracy",
|
| 253 |
+
"score": 0.6666666666666666,
|
| 254 |
+
"score_ci_high": 0.8888888888888888,
|
| 255 |
"score_ci_low": 0.3333333333333333,
|
| 256 |
"num_of_instances": 9
|
| 257 |
},
|
|
|
|
| 266 |
"num_of_instances": 9
|
| 267 |
},
|
| 268 |
"safety_bbq_ses": {
|
| 269 |
+
"accuracy": 0.2222222222222222,
|
| 270 |
+
"accuracy_ci_low": 0.0,
|
| 271 |
+
"accuracy_ci_high": 0.5555555555555556,
|
| 272 |
"score_name": "accuracy",
|
| 273 |
+
"score": 0.2222222222222222,
|
| 274 |
+
"score_ci_high": 0.5555555555555556,
|
| 275 |
+
"score_ci_low": 0.0,
|
| 276 |
"num_of_instances": 9
|
| 277 |
},
|
| 278 |
"safety_bbq_sexual_orientation": {
|
|
|
|
| 285 |
"score_ci_low": 0.4444444444444444,
|
| 286 |
"num_of_instances": 9
|
| 287 |
},
|
| 288 |
+
"score": 0.797979797979798,
|
| 289 |
"score_name": "subsets_mean",
|
| 290 |
"num_of_instances": 99
|
| 291 |
},
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.6025641025641025,
|
| 296 |
+
"score": 0.6025641025641025,
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
+
"score": 0.6025641025641025,
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
+
"f1_Person": 0.7391304347826085,
|
| 307 |
+
"f1_Organization": 0.5357142857142857,
|
| 308 |
+
"f1_Location": 0.5555555555555556,
|
| 309 |
+
"f1_macro": 0.6101334253508166,
|
| 310 |
+
"recall_macro": 0.5638371290545204,
|
| 311 |
+
"precision_macro": 0.7027260179434093,
|
| 312 |
+
"in_classes_support": 1.0,
|
| 313 |
+
"f1_micro": 0.6086956521739131,
|
| 314 |
+
"recall_micro": 0.56,
|
| 315 |
+
"precision_micro": 0.6666666666666666,
|
| 316 |
+
"score": 0.6086956521739131,
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
+
"score_ci_low": 0.4714559913296099,
|
| 319 |
+
"score_ci_high": 0.6872102499457013,
|
| 320 |
+
"f1_micro_ci_low": 0.4714559913296099,
|
| 321 |
+
"f1_micro_ci_high": 0.6872102499457013
|
| 322 |
},
|
| 323 |
+
"score": 0.6086956521739131,
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
| 327 |
"knowledge": {
|
| 328 |
"mmlu_pro_biology": {
|
| 329 |
+
"accuracy": 0.5714285714285714,
|
| 330 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 331 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 332 |
"score_name": "accuracy",
|
| 333 |
+
"score": 0.5714285714285714,
|
| 334 |
+
"score_ci_high": 0.8571428571428571,
|
| 335 |
+
"score_ci_low": 0.14285714285714285,
|
| 336 |
"num_of_instances": 7
|
| 337 |
},
|
| 338 |
"mmlu_pro_business": {
|
|
|
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
| 349 |
+
"accuracy": 0.14285714285714285,
|
| 350 |
+
"accuracy_ci_low": 0.0,
|
| 351 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 352 |
"score_name": "accuracy",
|
| 353 |
+
"score": 0.14285714285714285,
|
| 354 |
+
"score_ci_high": 0.5714285714285714,
|
| 355 |
+
"score_ci_low": 0.0,
|
| 356 |
"num_of_instances": 7
|
| 357 |
},
|
| 358 |
"mmlu_pro_computer_science": {
|
| 359 |
+
"accuracy": 0.7142857142857143,
|
| 360 |
+
"accuracy_ci_low": 0.2254039495939315,
|
| 361 |
"accuracy_ci_high": 1.0,
|
| 362 |
"score_name": "accuracy",
|
| 363 |
+
"score": 0.7142857142857143,
|
| 364 |
"score_ci_high": 1.0,
|
| 365 |
+
"score_ci_low": 0.2254039495939315,
|
| 366 |
"num_of_instances": 7
|
| 367 |
},
|
| 368 |
"mmlu_pro_economics": {
|
| 369 |
+
"accuracy": 0.7142857142857143,
|
| 370 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 371 |
"accuracy_ci_high": 1.0,
|
| 372 |
"score_name": "accuracy",
|
| 373 |
+
"score": 0.7142857142857143,
|
| 374 |
"score_ci_high": 1.0,
|
| 375 |
+
"score_ci_low": 0.2857142857142857,
|
| 376 |
"num_of_instances": 7
|
| 377 |
},
|
| 378 |
"mmlu_pro_engineering": {
|
| 379 |
+
"accuracy": 0.5714285714285714,
|
| 380 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 381 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 382 |
"score_name": "accuracy",
|
| 383 |
+
"score": 0.5714285714285714,
|
| 384 |
+
"score_ci_high": 0.8571428571428571,
|
| 385 |
+
"score_ci_low": 0.14285714285714285,
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
|
|
|
| 396 |
"num_of_instances": 7
|
| 397 |
},
|
| 398 |
"mmlu_pro_history": {
|
| 399 |
+
"accuracy": 0.14285714285714285,
|
| 400 |
"accuracy_ci_low": 0.0,
|
| 401 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 402 |
"score_name": "accuracy",
|
| 403 |
+
"score": 0.14285714285714285,
|
| 404 |
+
"score_ci_high": 0.5714285714285714,
|
| 405 |
"score_ci_low": 0.0,
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
+
"accuracy": 0.42857142857142855,
|
| 410 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 411 |
"accuracy_ci_high": 0.7142857142857143,
|
| 412 |
"score_name": "accuracy",
|
| 413 |
+
"score": 0.42857142857142855,
|
| 414 |
"score_ci_high": 0.7142857142857143,
|
| 415 |
+
"score_ci_low": 0.14285714285714285,
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
|
|
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
| 438 |
"mmlu_pro_philosophy": {
|
| 439 |
+
"accuracy": 0.7142857142857143,
|
| 440 |
+
"accuracy_ci_low": 0.2857142857142857,
|
| 441 |
+
"accuracy_ci_high": 1.0,
|
| 442 |
"score_name": "accuracy",
|
| 443 |
+
"score": 0.7142857142857143,
|
| 444 |
+
"score_ci_high": 1.0,
|
| 445 |
+
"score_ci_low": 0.2857142857142857,
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
| 449 |
+
"accuracy": 0.14285714285714285,
|
| 450 |
+
"accuracy_ci_low": 0.0,
|
| 451 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 452 |
"score_name": "accuracy",
|
| 453 |
+
"score": 0.14285714285714285,
|
| 454 |
+
"score_ci_high": 0.5714285714285714,
|
| 455 |
+
"score_ci_low": 0.0,
|
| 456 |
"num_of_instances": 7
|
| 457 |
},
|
| 458 |
"mmlu_pro_psychology": {
| 465 |
"score_ci_low": 0.14285714285714285,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
+
"score": 0.40816326530612246,
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
+
"f1_macro": 0.5933333333333334,
|
| 475 |
+
"f1_suggestive": 0.5,
|
| 476 |
+
"f1_arbitrary": 0.5,
|
| 477 |
"f1_generic": 0.5,
|
| 478 |
+
"f1_fanciful": 0.6666666666666666,
|
| 479 |
"f1_descriptive": 0.8,
|
| 480 |
+
"f1_macro_ci_low": 0.40499999999999997,
|
| 481 |
+
"f1_macro_ci_high": 0.8727728256593986,
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
+
"score": 0.6,
|
| 484 |
+
"score_ci_high": 0.8,
|
| 485 |
+
"score_ci_low": 0.35,
|
| 486 |
"num_of_instances": 20,
|
| 487 |
+
"accuracy": 0.6,
|
| 488 |
+
"accuracy_ci_low": 0.35,
|
| 489 |
+
"accuracy_ci_high": 0.8,
|
| 490 |
+
"f1_micro": 0.6,
|
| 491 |
+
"f1_micro_ci_low": 0.35,
|
| 492 |
+
"f1_micro_ci_high": 0.8
|
| 493 |
},
|
| 494 |
"legalbench_corporate_lobbying": {
|
| 495 |
+
"f1_macro": 0.52,
|
| 496 |
+
"f1_no": 0.64,
|
| 497 |
+
"f1_yes": 0.4,
|
| 498 |
+
"f1_macro_ci_low": 0.30666666666666664,
|
| 499 |
+
"f1_macro_ci_high": 0.7802197802197802,
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
+
"score": 0.55,
|
| 502 |
+
"score_ci_high": 0.75,
|
| 503 |
+
"score_ci_low": 0.3,
|
| 504 |
"num_of_instances": 20,
|
| 505 |
+
"accuracy": 0.55,
|
| 506 |
+
"accuracy_ci_low": 0.3,
|
| 507 |
+
"accuracy_ci_high": 0.75,
|
| 508 |
+
"f1_micro": 0.55,
|
| 509 |
+
"f1_micro_ci_low": 0.3,
|
| 510 |
+
"f1_micro_ci_high": 0.75
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
+
"f1_macro": 0.3151927437641723,
|
| 514 |
"f1_conclusion": 0.2857142857142857,
|
| 515 |
"f1_analysis": 0.4444444444444444,
|
| 516 |
"f1_decree": 0.0,
|
| 517 |
"f1_issue": 0.2857142857142857,
|
| 518 |
+
"f1_procedural history": 0.3333333333333333,
|
| 519 |
+
"f1_facts": 0.8571428571428571,
|
| 520 |
"f1_rule": 0.0,
|
| 521 |
+
"f1_macro_ci_low": 0.14898389471709916,
|
| 522 |
+
"f1_macro_ci_high": 0.47222222222222215,
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
+
"score": 0.4,
|
| 525 |
+
"score_ci_high": 0.6,
|
| 526 |
+
"score_ci_low": 0.17647058823529413,
|
| 527 |
"num_of_instances": 20,
|
| 528 |
+
"accuracy": 0.4,
|
| 529 |
+
"accuracy_ci_low": 0.2,
|
| 530 |
+
"accuracy_ci_high": 0.6,
|
| 531 |
+
"f1_micro": 0.4,
|
| 532 |
+
"f1_micro_ci_low": 0.17647058823529413,
|
| 533 |
+
"f1_micro_ci_high": 0.6
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
+
"f1_macro": 0.6419437340153453,
|
| 537 |
+
"f1_yes": 0.5882352941176471,
|
| 538 |
+
"f1_no": 0.6956521739130435,
|
| 539 |
+
"f1_macro_ci_low": 0.4357366771159875,
|
| 540 |
+
"f1_macro_ci_high": 0.8465473145780051,
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
+
"score": 0.65,
|
| 543 |
+
"score_ci_high": 0.85,
|
| 544 |
+
"score_ci_low": 0.45,
|
| 545 |
"num_of_instances": 20,
|
| 546 |
+
"accuracy": 0.65,
|
| 547 |
+
"accuracy_ci_low": 0.45,
|
| 548 |
+
"accuracy_ci_high": 0.85,
|
| 549 |
+
"f1_micro": 0.65,
|
| 550 |
+
"f1_micro_ci_low": 0.45,
|
| 551 |
+
"f1_micro_ci_high": 0.85
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
+
"f1_macro": 0.849624060150376,
|
| 555 |
+
"f1_yes": 0.8421052631578947,
|
| 556 |
+
"f1_no": 0.8571428571428571,
|
| 557 |
+
"f1_macro_ci_low": 0.6703296703296704,
|
| 558 |
"f1_macro_ci_high": 1.0,
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
+
"score": 0.85,
|
| 561 |
+
"score_ci_high": 0.95,
|
| 562 |
+
"score_ci_low": 0.65,
|
| 563 |
"num_of_instances": 20,
|
| 564 |
+
"accuracy": 0.85,
|
| 565 |
+
"accuracy_ci_low": 0.65,
|
| 566 |
+
"accuracy_ci_high": 0.95,
|
| 567 |
+
"f1_micro": 0.85,
|
| 568 |
+
"f1_micro_ci_low": 0.65,
|
| 569 |
+
"f1_micro_ci_high": 0.95
|
| 570 |
},
|
| 571 |
+
"score": 0.61,
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
+
"f1_macro": 0.5056926406926407,
|
| 578 |
"f1_cars": 0.75,
|
| 579 |
"f1_windows x": 0.3333333333333333,
|
| 580 |
+
"f1_computer graphics": 0.5,
|
| 581 |
"f1_atheism": 0.0,
|
| 582 |
+
"f1_religion": 0.18181818181818182,
|
| 583 |
"f1_medicine": 0.6666666666666666,
|
| 584 |
+
"f1_christianity": 0.4,
|
| 585 |
"f1_microsoft windows": 0.6666666666666666,
|
| 586 |
+
"f1_middle east": 0.2857142857142857,
|
| 587 |
+
"f1_politics": 0.2857142857142857,
|
| 588 |
+
"f1_motorcycles": 0.7272727272727273,
|
| 589 |
"f1_pc hardware": 0.6666666666666666,
|
| 590 |
"f1_mac hardware": 0.5,
|
| 591 |
+
"f1_electronics": 0.0,
|
| 592 |
"f1_for sale": 0.6666666666666666,
|
| 593 |
+
"f1_guns": 0.4444444444444444,
|
| 594 |
"f1_space": 0.75,
|
| 595 |
+
"f1_cryptography": 0.4,
|
| 596 |
+
"f1_baseball": 1.0,
|
| 597 |
+
"f1_hockey": 0.8888888888888888,
|
| 598 |
+
"f1_macro_ci_low": 0.4219714712587517,
|
| 599 |
+
"f1_macro_ci_high": 0.6341002349458903,
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
+
"score": 0.5393258426966292,
|
| 602 |
+
"score_ci_high": 0.632768361581921,
|
| 603 |
+
"score_ci_low": 0.4220293543283505,
|
| 604 |
"num_of_instances": 100,
|
| 605 |
+
"accuracy": 0.48,
|
| 606 |
+
"accuracy_ci_low": 0.38,
|
| 607 |
+
"accuracy_ci_high": 0.58,
|
| 608 |
+
"f1_micro": 0.5393258426966292,
|
| 609 |
+
"f1_micro_ci_low": 0.4220293543283505,
|
| 610 |
+
"f1_micro_ci_high": 0.632768361581921
|
| 611 |
},
|
| 612 |
+
"score": 0.5393258426966292,
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
+
"f1_macro": 0.5772283699281425,
|
| 619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.9457364341085271,
|
| 620 |
+
"f1_mortgage": 0.8571428571428571,
|
| 621 |
+
"f1_debt collection": 0.42105263157894735,
|
| 622 |
+
"f1_credit card or prepaid card": 0.0,
|
| 623 |
+
"f1_checking or savings account": 0.75,
|
| 624 |
+
"f1_student loan": 0.6666666666666666,
|
| 625 |
+
"f1_money transfer or virtual currency or money service": 0.4,
|
| 626 |
+
"f1_macro_ci_low": 0.43341419112091223,
|
| 627 |
+
"f1_macro_ci_high": 0.7943029958179404,
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
"score": 0.8324873096446701,
|
| 630 |
+
"score_ci_high": 0.8986163491039517,
|
| 631 |
+
"score_ci_low": 0.7438346729829881,
|
| 632 |
"num_of_instances": 100,
|
| 633 |
"accuracy": 0.82,
|
| 634 |
"accuracy_ci_low": 0.73,
|
| 635 |
"accuracy_ci_high": 0.89,
|
| 636 |
"f1_micro": 0.8324873096446701,
|
| 637 |
+
"f1_micro_ci_low": 0.7438346729829881,
|
| 638 |
+
"f1_micro_ci_high": 0.8986163491039517
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
+
"f1_macro": 0.6745944600062247,
|
| 642 |
+
"f1_mortgages and loans": 0.7619047619047619,
|
| 643 |
+
"f1_credit card": 0.72,
|
| 644 |
+
"f1_debt collection": 0.7058823529411765,
|
| 645 |
+
"f1_credit reporting": 0.7407407407407407,
|
| 646 |
+
"f1_retail banking": 0.4444444444444444,
|
| 647 |
+
"f1_macro_ci_low": 0.538487634706942,
|
| 648 |
+
"f1_macro_ci_high": 0.8292804449703136,
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
+
"score": 0.7070707070707071,
|
| 651 |
+
"score_ci_high": 0.82,
|
| 652 |
+
"score_ci_low": 0.5567010309278351,
|
| 653 |
"num_of_instances": 50,
|
| 654 |
+
"accuracy": 0.7,
|
| 655 |
+
"accuracy_ci_low": 0.54,
|
| 656 |
+
"accuracy_ci_high": 0.82,
|
| 657 |
+
"f1_micro": 0.7070707070707071,
|
| 658 |
+
"f1_micro_ci_low": 0.5567010309278351,
|
| 659 |
+
"f1_micro_ci_high": 0.82
|
| 660 |
},
|
| 661 |
+
"score": 0.7697790083576885,
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
+
"execution_accuracy": 0.14,
|
| 669 |
+
"program_accuracy": 0.16,
|
| 670 |
+
"score": 0.16,
|
| 671 |
"score_name": "program_accuracy",
|
| 672 |
+
"execution_accuracy_ci_low": 0.08,
|
| 673 |
+
"execution_accuracy_ci_high": 0.22,
|
| 674 |
+
"program_accuracy_ci_low": 0.09,
|
| 675 |
+
"program_accuracy_ci_high": 0.24,
|
| 676 |
+
"score_ci_low": 0.09,
|
| 677 |
+
"score_ci_high": 0.24
|
|
|
|
| 678 |
},
|
| 679 |
+
"score": 0.16,
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
+
"precision": 0.5122629801345308,
|
| 686 |
+
"recall": 0.5711640196088964,
|
| 687 |
+
"f1": 0.4956577841210359,
|
| 688 |
+
"precision_ci_low": 0.47463388705030735,
|
| 689 |
+
"precision_ci_high": 0.5516188080201115,
|
| 690 |
+
"recall_ci_low": 0.5308026366914153,
|
| 691 |
+
"recall_ci_high": 0.6126595999428824,
|
| 692 |
+
"f1_ci_low": 0.4646393825182617,
|
| 693 |
+
"f1_ci_high": 0.5290134586140666,
|
| 694 |
"score_name": "f1",
|
| 695 |
+
"score": 0.4956577841210359,
|
| 696 |
+
"score_ci_high": 0.5290134586140666,
|
| 697 |
+
"score_ci_low": 0.4646393825182617,
|
| 698 |
"num_of_instances": 100,
|
| 699 |
+
"correctness_f1_bert_score.deberta_large_mnli": 0.681556967496872,
|
| 700 |
+
"correctness_recall_bert_score.deberta_large_mnli": 0.7017117899656296,
|
| 701 |
+
"correctness_precision_bert_score.deberta_large_mnli": 0.6739864906668663,
|
| 702 |
+
"faithfullness_f1_token_overlap": 0.36059390510577694,
|
| 703 |
+
"faithfullness_recall_token_overlap": 0.27208354183856076,
|
| 704 |
+
"faithfullness_precision_token_overlap": 0.7087220222955496,
|
| 705 |
+
"correctness_f1_token_overlap": 0.4956577841210359,
|
| 706 |
+
"correctness_recall_token_overlap": 0.5711640196088964,
|
| 707 |
+
"correctness_precision_token_overlap": 0.5122629801345308
|
| 708 |
},
|
| 709 |
+
"score": 0.4956577841210359,
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
| 713 |
"reasoning": {
|
| 714 |
"hellaswag": {
|
| 715 |
+
"accuracy": 0.51,
|
| 716 |
+
"accuracy_ci_low": 0.42,
|
| 717 |
+
"accuracy_ci_high": 0.61,
|
| 718 |
"score_name": "accuracy",
|
| 719 |
+
"score": 0.51,
|
| 720 |
+
"score_ci_high": 0.61,
|
| 721 |
+
"score_ci_low": 0.42,
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
+
"accuracy": 0.86,
|
| 726 |
+
"accuracy_ci_low": 0.78,
|
| 727 |
+
"accuracy_ci_high": 0.92,
|
| 728 |
"score_name": "accuracy",
|
| 729 |
+
"score": 0.86,
|
| 730 |
+
"score_ci_high": 0.92,
|
| 731 |
+
"score_ci_low": 0.78,
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
+
"score": 0.685,
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
|
|
|
| 741 |
"severity_high": 1.0,
|
| 742 |
"severity_medium": 0.0,
|
| 743 |
"severity_low": 99.0,
|
| 744 |
+
"category_harmful_info": 0.8631038246528496,
|
| 745 |
+
"category_deception": 0.8588788503348226,
|
| 746 |
+
"category_explicit_content": 0.8620110215111212,
|
| 747 |
+
"category_violence": 0.8305739898530264,
|
| 748 |
+
"category_discrimination": 0.7453164079832652,
|
| 749 |
+
"category_substance_abuse": 0.8580450213617749,
|
| 750 |
+
"category_pii": 0.8342232150715535,
|
| 751 |
+
"safety": 0.8417689493334748,
|
| 752 |
+
"safety_ci_low": 0.8196727461133039,
|
| 753 |
+
"safety_ci_high": 0.8616548092451682,
|
| 754 |
"score_name": "safety",
|
| 755 |
+
"score": 0.8417689493334748,
|
| 756 |
+
"score_ci_high": 0.8616548092451682,
|
| 757 |
+
"score_ci_low": 0.8196727461133039,
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
+
"score": 0.8417689493334748,
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
+
"rouge2": 0.21036085645747643,
|
| 768 |
+
"rouge1": 0.42196769187176775,
|
| 769 |
+
"rougeLsum": 0.3660108076703283,
|
| 770 |
+
"rougeL": 0.29182514205540294,
|
| 771 |
+
"score": 0.29182514205540294,
|
| 772 |
"score_name": "rougeL",
|
| 773 |
+
"rouge2_ci_low": 0.19459047905298954,
|
| 774 |
+
"rouge2_ci_high": 0.23001172566592604,
|
| 775 |
+
"rouge1_ci_low": 0.3983725355073898,
|
| 776 |
+
"rouge1_ci_high": 0.44557497505333493,
|
| 777 |
+
"rougeLsum_ci_low": 0.3436272136748691,
|
| 778 |
+
"rougeLsum_ci_high": 0.3874312949915785,
|
| 779 |
+
"rougeL_ci_low": 0.2737011422865546,
|
| 780 |
+
"rougeL_ci_high": 0.3128796438747455,
|
| 781 |
+
"score_ci_low": 0.2737011422865546,
|
| 782 |
+
"score_ci_high": 0.3128796438747455
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
+
"rouge2": 0.015540734558041634,
|
| 787 |
+
"rouge1": 0.11070991559700558,
|
| 788 |
+
"rougeLsum": 0.0922692226275668,
|
| 789 |
+
"rougeL": 0.08156778335834318,
|
| 790 |
+
"score": 0.08156778335834318,
|
| 791 |
"score_name": "rougeL",
|
| 792 |
+
"rouge2_ci_low": 0.011481094239772009,
|
| 793 |
+
"rouge2_ci_high": 0.021386728155477184,
|
| 794 |
+
"rouge1_ci_low": 0.09611259187453111,
|
| 795 |
+
"rouge1_ci_high": 0.12773631865916757,
|
| 796 |
+
"rougeLsum_ci_low": 0.08014472825538595,
|
| 797 |
+
"rougeLsum_ci_high": 0.10544617452174851,
|
| 798 |
+
"rougeL_ci_low": 0.07184211435294499,
|
| 799 |
+
"rougeL_ci_high": 0.09160744099439429,
|
| 800 |
+
"score_ci_low": 0.07184211435294499,
|
| 801 |
+
"score_ci_high": 0.09160744099439429
|
| 802 |
},
|
| 803 |
+
"score": 0.18669646270687307,
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
+
149,
|
| 812 |
+
100,
|
| 813 |
+
74,
|
| 814 |
+
57
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
+
228,
|
| 818 |
+
222,
|
| 819 |
+
216,
|
| 820 |
+
210
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
+
0.6535087719298245,
|
| 824 |
+
0.45045045045045046,
|
| 825 |
+
0.3425925925925926,
|
| 826 |
+
0.2714285714285714
|
| 827 |
],
|
| 828 |
"bp": 1.0,
|
| 829 |
+
"sys_len": 228,
|
| 830 |
"ref_len": 208,
|
| 831 |
+
"sacrebleu": 0.4067550879939379,
|
| 832 |
+
"score": 0.4067550879939379,
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
+
"score_ci_low": 0.18449348983650793,
|
| 835 |
+
"score_ci_high": 0.5000148909038645,
|
| 836 |
+
"sacrebleu_ci_low": 0.18449348983650793,
|
| 837 |
+
"sacrebleu_ci_high": 0.5000148909038645
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
+
133,
|
| 843 |
+
74,
|
| 844 |
+
41,
|
| 845 |
+
24
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
+
205,
|
| 849 |
+
199,
|
| 850 |
+
193,
|
| 851 |
+
187
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
+
0.6487804878048781,
|
| 855 |
+
0.37185929648241206,
|
| 856 |
+
0.21243523316062177,
|
| 857 |
+
0.1283422459893048
|
| 858 |
],
|
| 859 |
+
"bp": 0.9854724123463497,
|
| 860 |
+
"sys_len": 205,
|
| 861 |
"ref_len": 208,
|
| 862 |
+
"sacrebleu": 0.2806484335469714,
|
| 863 |
+
"score": 0.2806484335469714,
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
+
"score_ci_low": 0.20930302049758778,
|
| 866 |
+
"score_ci_high": 0.3669108559906311,
|
| 867 |
+
"sacrebleu_ci_low": 0.20930302049758778,
|
| 868 |
+
"sacrebleu_ci_high": 0.3669108559906311
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
+
107,
|
| 874 |
+
52,
|
| 875 |
+
30,
|
| 876 |
+
14
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
+
205,
|
| 880 |
+
199,
|
| 881 |
+
193,
|
| 882 |
+
187
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
+
0.5219512195121951,
|
| 886 |
+
0.2613065326633166,
|
| 887 |
+
0.15544041450777202,
|
| 888 |
+
0.0748663101604278
|
| 889 |
],
|
| 890 |
+
"bp": 0.9806769356409174,
|
| 891 |
+
"sys_len": 205,
|
| 892 |
"ref_len": 209,
|
| 893 |
+
"sacrebleu": 0.19574181051276632,
|
| 894 |
+
"score": 0.19574181051276632,
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
+
"score_ci_low": 0.13043482957972302,
|
| 897 |
+
"score_ci_high": 0.2838217012977499,
|
| 898 |
+
"sacrebleu_ci_low": 0.13043482957972302,
|
| 899 |
+
"sacrebleu_ci_high": 0.2838217012977499
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
+
126,
|
| 905 |
+
69,
|
| 906 |
+
39,
|
| 907 |
+
19
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
+
215,
|
| 911 |
+
209,
|
| 912 |
+
203,
|
| 913 |
+
197
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
+
0.586046511627907,
|
| 917 |
+
0.33014354066985646,
|
| 918 |
+
0.19211822660098524,
|
| 919 |
+
0.09644670050761421
|
| 920 |
],
|
| 921 |
+
"bp": 0.9953596371164251,
|
| 922 |
+
"sys_len": 215,
|
| 923 |
"ref_len": 216,
|
| 924 |
+
"sacrebleu": 0.2435581878458631,
|
| 925 |
+
"score": 0.2435581878458631,
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
+
"score_ci_low": 0.15892386834513053,
|
| 928 |
+
"score_ci_high": 0.31857139859597966,
|
| 929 |
+
"sacrebleu_ci_low": 0.15892386834513053,
|
| 930 |
+
"sacrebleu_ci_high": 0.31857139859597966
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
186,
|
| 936 |
+
143,
|
| 937 |
+
115,
|
| 938 |
+
96
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
+
234,
|
| 942 |
+
228,
|
| 943 |
+
222,
|
| 944 |
+
216
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
+
0.7948717948717949,
|
| 948 |
+
0.6271929824561403,
|
| 949 |
+
0.5180180180180181,
|
| 950 |
+
0.4444444444444444
|
| 951 |
],
|
| 952 |
+
"bp": 0.9957356141520489,
|
| 953 |
+
"sys_len": 234,
|
| 954 |
"ref_len": 235,
|
| 955 |
+
"sacrebleu": 0.5795744035432013,
|
| 956 |
+
"score": 0.5795744035432013,
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
+
"score_ci_low": 0.489542796361342,
|
| 959 |
+
"score_ci_high": 0.6836141189380024,
|
| 960 |
+
"sacrebleu_ci_low": 0.489542796361342,
|
| 961 |
+
"sacrebleu_ci_high": 0.6836141189380024
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
+
148,
|
| 967 |
+
74,
|
| 968 |
+
39,
|
| 969 |
+
22
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
+
297,
|
| 973 |
+
291,
|
| 974 |
+
285,
|
| 975 |
+
279
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
+
0.4983164983164983,
|
| 979 |
+
0.2542955326460481,
|
| 980 |
+
0.1368421052631579,
|
| 981 |
+
0.07885304659498207
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
+
"sys_len": 297,
|
| 985 |
"ref_len": 249,
|
| 986 |
+
"sacrebleu": 0.19229613499833637,
|
| 987 |
+
"score": 0.19229613499833637,
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
+
"score_ci_low": 0.11753162974027624,
|
| 990 |
+
"score_ci_high": 0.2734631145297525,
|
| 991 |
+
"sacrebleu_ci_low": 0.11753162974027624,
|
| 992 |
+
"sacrebleu_ci_high": 0.2734631145297525
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
+
175,
|
| 998 |
+
127,
|
| 999 |
+
96,
|
| 1000 |
+
73
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
230,
|
|
|
|
| 1006 |
212
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
+
0.7608695652173912,
|
| 1010 |
+
0.5669642857142857,
|
| 1011 |
+
0.4403669724770642,
|
| 1012 |
+
0.3443396226415094
|
| 1013 |
],
|
| 1014 |
"bp": 1.0,
|
| 1015 |
"sys_len": 230,
|
| 1016 |
"ref_len": 222,
|
| 1017 |
+
"sacrebleu": 0.5057279000292236,
|
| 1018 |
+
"score": 0.5057279000292236,
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
+
"score_ci_low": 0.44927402111531833,
|
| 1021 |
+
"score_ci_high": 0.5829583257663561,
|
| 1022 |
+
"sacrebleu_ci_low": 0.44927402111531833,
|
| 1023 |
+
"sacrebleu_ci_high": 0.5829583257663561
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
+
151,
|
| 1029 |
98,
|
| 1030 |
+
70,
|
| 1031 |
+
52
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
+
230,
|
| 1035 |
+
224,
|
| 1036 |
+
218,
|
| 1037 |
+
212
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
+
0.6565217391304349,
|
| 1041 |
+
0.4375,
|
| 1042 |
+
0.3211009174311926,
|
| 1043 |
+
0.24528301886792453
|
| 1044 |
],
|
| 1045 |
"bp": 1.0,
|
| 1046 |
+
"sys_len": 230,
|
| 1047 |
"ref_len": 230,
|
| 1048 |
+
"sacrebleu": 0.3878234357113968,
|
| 1049 |
+
"score": 0.3878234357113968,
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
+
"score_ci_low": 0.25965016638584715,
|
| 1052 |
+
"score_ci_high": 0.5435565274954791,
|
| 1053 |
+
"sacrebleu_ci_low": 0.25965016638584715,
|
| 1054 |
+
"sacrebleu_ci_high": 0.5435565274954791
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
+
155,
|
| 1060 |
+
80,
|
| 1061 |
+
43,
|
| 1062 |
+
25
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
+
235,
|
| 1066 |
+
229,
|
| 1067 |
+
223,
|
| 1068 |
+
217
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
+
0.6595744680851063,
|
| 1072 |
+
0.3493449781659389,
|
| 1073 |
+
0.19282511210762332,
|
| 1074 |
+
0.1152073732718894
|
| 1075 |
],
|
| 1076 |
+
"bp": 0.9665303748102905,
|
| 1077 |
+
"sys_len": 235,
|
| 1078 |
"ref_len": 243,
|
| 1079 |
+
"sacrebleu": 0.2585270907217383,
|
| 1080 |
+
"score": 0.2585270907217383,
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
+
"score_ci_low": 0.20941633241942087,
|
| 1083 |
+
"score_ci_high": 0.30626903457788784,
|
| 1084 |
+
"sacrebleu_ci_low": 0.20941633241942087,
|
| 1085 |
+
"sacrebleu_ci_high": 0.30626903457788784
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
+
157,
|
| 1091 |
+
107,
|
| 1092 |
+
76,
|
| 1093 |
+
56
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
+
220,
|
| 1097 |
214,
|
| 1098 |
208,
|
| 1099 |
+
202
|
|
|
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
+
0.7136363636363636,
|
| 1103 |
+
0.5,
|
| 1104 |
+
0.3653846153846154,
|
| 1105 |
+
0.27722772277227725
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
+
"sys_len": 220,
|
| 1109 |
"ref_len": 208,
|
| 1110 |
+
"sacrebleu": 0.43602207032130424,
|
| 1111 |
+
"score": 0.43602207032130424,
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
+
"score_ci_low": 0.2946660579225827,
|
| 1114 |
+
"score_ci_high": 0.5481080622130052,
|
| 1115 |
+
"sacrebleu_ci_low": 0.2946660579225827,
|
| 1116 |
+
"sacrebleu_ci_high": 0.5481080622130052
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
+
133,
|
| 1122 |
82,
|
| 1123 |
+
56,
|
| 1124 |
+
42
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
+
198,
|
| 1128 |
+
192,
|
| 1129 |
+
186,
|
| 1130 |
+
180
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
+
0.6717171717171717,
|
| 1134 |
+
0.42708333333333337,
|
| 1135 |
+
0.3010752688172043,
|
| 1136 |
+
0.2333333333333333
|
| 1137 |
],
|
| 1138 |
+
"bp": 0.950749126896934,
|
| 1139 |
+
"sys_len": 198,
|
| 1140 |
"ref_len": 208,
|
| 1141 |
+
"sacrebleu": 0.35822316846084,
|
| 1142 |
+
"score": 0.35822316846084,
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
+
"score_ci_low": 0.18839913329076022,
|
| 1145 |
+
"score_ci_high": 0.5446107832786825,
|
| 1146 |
+
"sacrebleu_ci_low": 0.18839913329076022,
|
| 1147 |
+
"sacrebleu_ci_high": 0.5446107832786825
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
+
129,
|
| 1153 |
63,
|
| 1154 |
+
36,
|
| 1155 |
+
23
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
+
201,
|
| 1159 |
+
195,
|
| 1160 |
+
189,
|
| 1161 |
+
183
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
+
0.6417910447761195,
|
| 1165 |
+
0.32307692307692304,
|
| 1166 |
+
0.19047619047619047,
|
| 1167 |
+
0.12568306010928962
|
| 1168 |
],
|
| 1169 |
+
"bp": 0.9657735711441044,
|
| 1170 |
+
"sys_len": 201,
|
| 1171 |
"ref_len": 208,
|
| 1172 |
+
"sacrebleu": 0.25634778841638817,
|
| 1173 |
+
"score": 0.25634778841638817,
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
+
"score_ci_low": 0.1694709590890647,
|
| 1176 |
+
"score_ci_high": 0.3945944559803188,
|
| 1177 |
+
"sacrebleu_ci_low": 0.1694709590890647,
|
| 1178 |
+
"sacrebleu_ci_high": 0.3945944559803188
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
+
148,
|
| 1184 |
+
100,
|
| 1185 |
+
73,
|
| 1186 |
+
53
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
+
213,
|
| 1190 |
+
207,
|
| 1191 |
+
201,
|
| 1192 |
+
195
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
+
0.6948356807511737,
|
| 1196 |
+
0.48309178743961356,
|
| 1197 |
+
0.36318407960199006,
|
| 1198 |
+
0.2717948717948718
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
+
"sys_len": 213,
|
| 1202 |
"ref_len": 208,
|
| 1203 |
+
"sacrebleu": 0.426648238456799,
|
| 1204 |
+
"score": 0.426648238456799,
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
+
"score_ci_low": 0.2592000591652009,
|
| 1207 |
+
"score_ci_high": 0.5677639298714758,
|
| 1208 |
+
"sacrebleu_ci_low": 0.2592000591652009,
|
| 1209 |
+
"sacrebleu_ci_high": 0.5677639298714758
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
+
148,
|
| 1215 |
+
92,
|
| 1216 |
+
65,
|
| 1217 |
+
47
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
+
215,
|
| 1221 |
+
209,
|
| 1222 |
+
203,
|
| 1223 |
+
197
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
+
0.6883720930232557,
|
| 1227 |
+
0.44019138755980863,
|
| 1228 |
+
0.32019704433497537,
|
| 1229 |
+
0.23857868020304568
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
+
"sys_len": 215,
|
| 1233 |
"ref_len": 208,
|
| 1234 |
+
"sacrebleu": 0.39005732387552927,
|
| 1235 |
+
"score": 0.39005732387552927,
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
+
"score_ci_low": 0.2645396605523872,
|
| 1238 |
+
"score_ci_high": 0.5798015480261387,
|
| 1239 |
+
"sacrebleu_ci_low": 0.2645396605523872,
|
| 1240 |
+
"sacrebleu_ci_high": 0.5798015480261387
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
+
142,
|
| 1246 |
+
83,
|
| 1247 |
+
50,
|
| 1248 |
+
36
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
+
228,
|
| 1252 |
+
222,
|
| 1253 |
+
216,
|
| 1254 |
+
210
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
+
0.6228070175438597,
|
| 1258 |
+
0.37387387387387383,
|
| 1259 |
+
0.23148148148148148,
|
| 1260 |
+
0.17142857142857143
|
| 1261 |
],
|
| 1262 |
"bp": 1.0,
|
| 1263 |
+
"sys_len": 228,
|
| 1264 |
"ref_len": 208,
|
| 1265 |
+
"sacrebleu": 0.3100412781680407,
|
| 1266 |
+
"score": 0.3100412781680407,
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
+
"score_ci_low": 0.22358091489071585,
|
| 1269 |
+
"score_ci_high": 0.4112037006871551,
|
| 1270 |
+
"sacrebleu_ci_low": 0.22358091489071585,
|
| 1271 |
+
"sacrebleu_ci_high": 0.4112037006871551
|
| 1272 |
},
"score": 0.34853282350682246,
"score_name": "subsets_mean",
"num_of_instances": 90
},
"score": 0.54262797605742,
"score_name": "subsets_mean",
"num_of_instances": 1537
}
results/bluebench/{2025-07-02T17-12-27_evaluation_results.json β 2025-07-03T15-51-24_evaluation_results.json}
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"environment_info": {
|
| 3 |
-
"timestamp_utc": "2025-07-
|
| 4 |
"command_line_invocation": [
|
| 5 |
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
|
| 6 |
"--tasks",
|
|
@@ -42,7 +42,7 @@
|
|
| 42 |
"cache_dir": null
|
| 43 |
},
|
| 44 |
"unitxt_version": "1.25.0",
|
| 45 |
-
"unitxt_commit_hash": "
|
| 46 |
"python_version": "3.10.18",
|
| 47 |
"system": "Linux",
|
| 48 |
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
@@ -176,13 +176,13 @@
|
|
| 176 |
"results": {
|
| 177 |
"bias": {
|
| 178 |
"safety_bbq_age": {
|
| 179 |
-
"accuracy": 0.
|
| 180 |
-
"accuracy_ci_low": 0.
|
| 181 |
"accuracy_ci_high": 1.0,
|
| 182 |
"score_name": "accuracy",
|
| 183 |
-
"score": 0.
|
| 184 |
"score_ci_high": 1.0,
|
| 185 |
-
"score_ci_low": 0.
|
| 186 |
"num_of_instances": 9
|
| 187 |
},
|
| 188 |
"safety_bbq_disability_status": {
|
|
@@ -266,61 +266,61 @@
|
|
| 266 |
"num_of_instances": 9
|
| 267 |
},
|
| 268 |
"safety_bbq_ses": {
|
| 269 |
-
"accuracy":
|
| 270 |
-
"accuracy_ci_low":
|
| 271 |
"accuracy_ci_high": 1.0,
|
| 272 |
"score_name": "accuracy",
|
| 273 |
-
"score":
|
| 274 |
"score_ci_high": 1.0,
|
| 275 |
-
"score_ci_low":
|
| 276 |
"num_of_instances": 9
|
| 277 |
},
|
| 278 |
"safety_bbq_sexual_orientation": {
|
| 279 |
-
"accuracy":
|
| 280 |
-
"accuracy_ci_low":
|
| 281 |
"accuracy_ci_high": 1.0,
|
| 282 |
"score_name": "accuracy",
|
| 283 |
-
"score":
|
| 284 |
"score_ci_high": 1.0,
|
| 285 |
-
"score_ci_low":
|
| 286 |
"num_of_instances": 9
|
| 287 |
},
|
| 288 |
-
"score": 0.
|
| 289 |
"score_name": "subsets_mean",
|
| 290 |
"num_of_instances": 99
|
| 291 |
},
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
| 296 |
-
"score": 0.
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
-
"score": 0.
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
-
"f1_Person": 0.
|
| 307 |
-
"f1_Organization": 0.
|
| 308 |
-
"f1_Location": 0.
|
| 309 |
-
"f1_macro": 0.
|
| 310 |
-
"recall_macro": 0.
|
| 311 |
-
"precision_macro": 0.
|
| 312 |
-
"in_classes_support": 0.
|
| 313 |
-
"f1_micro": 0.
|
| 314 |
-
"recall_micro": 0.
|
| 315 |
-
"precision_micro": 0.
|
| 316 |
-
"score": 0.
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
-
"score_ci_low": 0.
|
| 319 |
-
"score_ci_high": 0.
|
| 320 |
-
"f1_micro_ci_low": 0.
|
| 321 |
-
"f1_micro_ci_high": 0.
|
| 322 |
},
|
| 323 |
-
"score": 0.
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
|
@@ -386,13 +386,13 @@
|
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
| 389 |
-
"accuracy": 0.
|
| 390 |
-
"accuracy_ci_low": 0.
|
| 391 |
-
"accuracy_ci_high": 0.
|
| 392 |
"score_name": "accuracy",
|
| 393 |
-
"score": 0.
|
| 394 |
-
"score_ci_high": 0.
|
| 395 |
-
"score_ci_low": 0.
|
| 396 |
"num_of_instances": 7
|
| 397 |
},
|
| 398 |
"mmlu_pro_history": {
|
|
@@ -406,13 +406,13 @@
|
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
-
"accuracy": 0.
|
| 410 |
-
"accuracy_ci_low": 0.
|
| 411 |
"accuracy_ci_high": 1.0,
|
| 412 |
"score_name": "accuracy",
|
| 413 |
-
"score": 0.
|
| 414 |
"score_ci_high": 1.0,
|
| 415 |
-
"score_ci_low": 0.
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
|
@@ -465,7 +465,7 @@
|
|
| 465 |
"score_ci_low": 0.14285714285714285,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
-
"score": 0.
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
|
@@ -495,18 +495,18 @@
|
|
| 495 |
"f1_macro": 0.5978260869565217,
|
| 496 |
"f1_no": 0.6956521739130435,
|
| 497 |
"f1_yes": 0.5,
|
| 498 |
-
"f1_macro_ci_low": 0.
|
| 499 |
-
"f1_macro_ci_high": 0.
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
"score": 0.6285714285714286,
|
| 502 |
"score_ci_high": 0.8108108108108109,
|
| 503 |
-
"score_ci_low": 0.
|
| 504 |
"num_of_instances": 20,
|
| 505 |
"accuracy": 0.55,
|
| 506 |
-
"accuracy_ci_low": 0.
|
| 507 |
"accuracy_ci_high": 0.75,
|
| 508 |
"f1_micro": 0.6285714285714286,
|
| 509 |
-
"f1_micro_ci_low": 0.
|
| 510 |
"f1_micro_ci_high": 0.8108108108108109
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
|
@@ -574,139 +574,139 @@
|
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
-
"f1_macro": 0.
|
| 578 |
"f1_cars": 0.9090909090909091,
|
| 579 |
-
"f1_windows x": 0.
|
| 580 |
-
"f1_computer graphics": 0.
|
| 581 |
-
"f1_atheism": 0.
|
| 582 |
"f1_religion": 0.0,
|
| 583 |
"f1_medicine": 1.0,
|
| 584 |
-
"f1_christianity": 0.
|
| 585 |
"f1_microsoft windows": 0.8,
|
| 586 |
"f1_middle east": 0.5,
|
| 587 |
"f1_motorcycles": 0.6,
|
| 588 |
-
"f1_pc hardware": 0.
|
| 589 |
"f1_mac hardware": 0.8,
|
| 590 |
"f1_electronics": 0.6666666666666666,
|
| 591 |
"f1_for sale": 0.5714285714285714,
|
| 592 |
-
"f1_guns": 0.
|
|
|
|
| 593 |
"f1_space": 0.75,
|
| 594 |
-
"f1_cryptography": 0.
|
| 595 |
"f1_baseball": 0.9090909090909091,
|
| 596 |
-
"f1_politics": 0.4,
|
| 597 |
"f1_hockey": 0.75,
|
| 598 |
-
"f1_macro_ci_low": 0.
|
| 599 |
-
"f1_macro_ci_high": 0.
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
-
"score": 0.
|
| 602 |
-
"score_ci_high": 0.
|
| 603 |
-
"score_ci_low": 0.
|
| 604 |
"num_of_instances": 100,
|
| 605 |
-
"accuracy": 0.
|
| 606 |
-
"accuracy_ci_low": 0.
|
| 607 |
-
"accuracy_ci_high": 0.
|
| 608 |
-
"f1_micro": 0.
|
| 609 |
-
"f1_micro_ci_low": 0.
|
| 610 |
-
"f1_micro_ci_high": 0.
|
| 611 |
},
|
| 612 |
-
"score": 0.
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
-
"f1_macro": 0.
|
| 619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
| 620 |
-
"
|
| 621 |
-
"
|
| 622 |
-
"
|
| 623 |
-
"f1_debt collection": 0.
|
| 624 |
-
"
|
| 625 |
-
"
|
| 626 |
-
"f1_macro_ci_low": 0.
|
| 627 |
-
"f1_macro_ci_high": 0.
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
-
"score": 0.
|
| 630 |
-
"score_ci_high": 0.
|
| 631 |
-
"score_ci_low": 0.
|
| 632 |
"num_of_instances": 100,
|
| 633 |
-
"accuracy": 0.
|
| 634 |
-
"accuracy_ci_low": 0.
|
| 635 |
-
"accuracy_ci_high": 0.
|
| 636 |
-
"f1_micro": 0.
|
| 637 |
-
"f1_micro_ci_low": 0.
|
| 638 |
-
"f1_micro_ci_high": 0.
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
-
"f1_macro": 0.
|
| 642 |
"f1_mortgages and loans": 0.8695652173913043,
|
| 643 |
-
"f1_credit card": 0.
|
| 644 |
"f1_debt collection": 0.7777777777777778,
|
| 645 |
-
"f1_credit reporting": 0.
|
| 646 |
"f1_retail banking": 0.8333333333333334,
|
| 647 |
-
"f1_macro_ci_low": 0.
|
| 648 |
-
"f1_macro_ci_high": 0.
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
-
"score": 0.
|
| 651 |
"score_ci_high": 0.9,
|
| 652 |
-
"score_ci_low": 0.
|
| 653 |
"num_of_instances": 50,
|
| 654 |
-
"accuracy": 0.
|
| 655 |
-
"accuracy_ci_low": 0.
|
| 656 |
"accuracy_ci_high": 0.9,
|
| 657 |
-
"f1_micro": 0.
|
| 658 |
-
"f1_micro_ci_low": 0.
|
| 659 |
"f1_micro_ci_high": 0.9
|
| 660 |
},
|
| 661 |
-
"score": 0.
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
-
"program_accuracy": 0.
|
| 669 |
-
"score": 0.
|
| 670 |
"score_name": "program_accuracy",
|
| 671 |
-
"execution_accuracy": 0.
|
| 672 |
-
"program_accuracy_ci_low": 0.
|
| 673 |
-
"program_accuracy_ci_high": 0.
|
| 674 |
-
"score_ci_low": 0.
|
| 675 |
-
"score_ci_high": 0.
|
| 676 |
-
"execution_accuracy_ci_low": 0.
|
| 677 |
-
"execution_accuracy_ci_high": 0.
|
| 678 |
},
|
| 679 |
-
"score": 0.
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
-
"precision": 0.
|
| 686 |
-
"recall": 0.
|
| 687 |
-
"f1": 0.
|
| 688 |
-
"precision_ci_low": 0.
|
| 689 |
-
"precision_ci_high": 0.
|
| 690 |
-
"recall_ci_low": 0.
|
| 691 |
-
"recall_ci_high": 0.
|
| 692 |
-
"f1_ci_low": 0.
|
| 693 |
-
"f1_ci_high": 0.
|
| 694 |
"score_name": "f1",
|
| 695 |
-
"score": 0.
|
| 696 |
-
"score_ci_high": 0.
|
| 697 |
-
"score_ci_low": 0.
|
| 698 |
"num_of_instances": 100,
|
| 699 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
| 700 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
| 701 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
| 702 |
-
"faithfullness_f1_token_overlap": 0.
|
| 703 |
-
"faithfullness_recall_token_overlap": 0.
|
| 704 |
-
"faithfullness_precision_token_overlap": 0.
|
| 705 |
-
"correctness_f1_token_overlap": 0.
|
| 706 |
-
"correctness_recall_token_overlap": 0.
|
| 707 |
-
"correctness_precision_token_overlap": 0.
|
| 708 |
},
|
| 709 |
-
"score": 0.
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
|
@@ -739,68 +739,68 @@
|
|
| 739 |
"attaq_500": {
|
| 740 |
"severity_critical": 0.0,
|
| 741 |
"severity_high": 0.0,
|
| 742 |
-
"severity_medium":
|
| 743 |
-
"severity_low":
|
| 744 |
-
"category_harmful_info": 0.
|
| 745 |
-
"category_deception": 0.
|
| 746 |
-
"category_explicit_content": 0.
|
| 747 |
-
"category_violence": 0.
|
| 748 |
-
"category_discrimination": 0.
|
| 749 |
-
"category_substance_abuse": 0.
|
| 750 |
-
"category_pii": 0.
|
| 751 |
-
"safety": 0.
|
| 752 |
-
"safety_ci_low": 0.
|
| 753 |
-
"safety_ci_high": 0.
|
| 754 |
"score_name": "safety",
|
| 755 |
-
"score": 0.
|
| 756 |
-
"score_ci_high": 0.
|
| 757 |
-
"score_ci_low": 0.
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
-
"score": 0.
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
-
"
|
| 768 |
-
"
|
|
|
|
|
|
|
| 769 |
"score_name": "rougeL",
|
| 770 |
-
"
|
| 771 |
-
"
|
| 772 |
-
"
|
| 773 |
-
"
|
| 774 |
-
"
|
| 775 |
-
"
|
| 776 |
-
"
|
| 777 |
-
"
|
| 778 |
-
"
|
| 779 |
-
"
|
| 780 |
-
"
|
| 781 |
-
"rouge1_ci_low": 0.4091501139364316,
|
| 782 |
-
"rouge1_ci_high": 0.4544797414359922
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
-
"
|
| 787 |
-
"
|
|
|
|
|
|
|
| 788 |
"score_name": "rougeL",
|
| 789 |
-
"
|
| 790 |
-
"
|
| 791 |
-
"
|
| 792 |
-
"
|
| 793 |
-
"
|
| 794 |
-
"
|
| 795 |
-
"
|
| 796 |
-
"
|
| 797 |
-
"
|
| 798 |
-
"
|
| 799 |
-
"
|
| 800 |
-
"rouge1_ci_low": 0.10903177552761713,
|
| 801 |
-
"rouge1_ci_high": 0.142104165948435
|
| 802 |
},
|
| 803 |
-
"score": 0.
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
@@ -808,258 +808,258 @@
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
-
0.
|
| 824 |
-
0.
|
| 825 |
-
0.
|
| 826 |
-
0.
|
| 827 |
],
|
| 828 |
"bp": 1.0,
|
| 829 |
-
"sys_len":
|
| 830 |
"ref_len": 208,
|
| 831 |
-
"sacrebleu": 0.
|
| 832 |
-
"score": 0.
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
-
"score_ci_low": 0.
|
| 835 |
-
"score_ci_high": 0.
|
| 836 |
-
"sacrebleu_ci_low": 0.
|
| 837 |
-
"sacrebleu_ci_high": 0.
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
-
0.
|
| 855 |
-
0.
|
| 856 |
-
0.
|
| 857 |
-
0.
|
| 858 |
],
|
| 859 |
"bp": 1.0,
|
| 860 |
-
"sys_len":
|
| 861 |
"ref_len": 208,
|
| 862 |
-
"sacrebleu": 0.
|
| 863 |
-
"score": 0.
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
-
"score_ci_low": 0.
|
| 866 |
-
"score_ci_high": 0.
|
| 867 |
-
"sacrebleu_ci_low": 0.
|
| 868 |
-
"sacrebleu_ci_high": 0.
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
-
0.
|
| 886 |
-
0.
|
| 887 |
-
0.
|
| 888 |
-
0.
|
| 889 |
],
|
| 890 |
-
"bp": 0.
|
| 891 |
-
"sys_len":
|
| 892 |
"ref_len": 209,
|
| 893 |
-
"sacrebleu": 0.
|
| 894 |
-
"score": 0.
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
-
"score_ci_low": 0.
|
| 897 |
-
"score_ci_high": 0.
|
| 898 |
-
"sacrebleu_ci_low": 0.
|
| 899 |
-
"sacrebleu_ci_high": 0.
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
-
|
| 905 |
-
|
| 906 |
63,
|
| 907 |
45
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
-
0.
|
| 917 |
-
0.
|
| 918 |
-
0.
|
| 919 |
-
0.
|
| 920 |
],
|
| 921 |
"bp": 1.0,
|
| 922 |
-
"sys_len":
|
| 923 |
"ref_len": 216,
|
| 924 |
-
"sacrebleu": 0.
|
| 925 |
-
"score": 0.
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
-
"score_ci_low": 0.
|
| 928 |
-
"score_ci_high": 0.
|
| 929 |
-
"sacrebleu_ci_low": 0.
|
| 930 |
-
"sacrebleu_ci_high": 0.
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
-
0.
|
| 948 |
-
0.
|
| 949 |
-
0.
|
| 950 |
-
0.
|
| 951 |
],
|
| 952 |
"bp": 1.0,
|
| 953 |
-
"sys_len":
|
| 954 |
"ref_len": 235,
|
| 955 |
-
"sacrebleu": 0.
|
| 956 |
-
"score": 0.
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
-
"score_ci_low": 0.
|
| 959 |
-
"score_ci_high": 0.
|
| 960 |
-
"sacrebleu_ci_low": 0.
|
| 961 |
-
"sacrebleu_ci_high": 0.
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
-
|
| 967 |
94,
|
| 968 |
-
|
| 969 |
-
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
-
0.
|
| 979 |
-
0.
|
| 980 |
-
0.
|
| 981 |
-
0.
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
-
"sys_len":
|
| 985 |
"ref_len": 249,
|
| 986 |
-
"sacrebleu": 0.
|
| 987 |
-
"score": 0.
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
-
"score_ci_low": 0.
|
| 990 |
-
"score_ci_high": 0.
|
| 991 |
-
"sacrebleu_ci_low": 0.
|
| 992 |
-
"sacrebleu_ci_high": 0.
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
-
0.
|
| 1010 |
-
0.
|
| 1011 |
-
0.
|
| 1012 |
-
0.
|
| 1013 |
],
|
| 1014 |
"bp": 1.0,
|
| 1015 |
-
"sys_len":
|
| 1016 |
"ref_len": 222,
|
| 1017 |
-
"sacrebleu": 0.
|
| 1018 |
-
"score": 0.
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
-
"score_ci_low": 0.
|
| 1021 |
-
"score_ci_high": 0.
|
| 1022 |
-
"sacrebleu_ci_low": 0.
|
| 1023 |
-
"sacrebleu_ci_high": 0.
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
65
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
-
0.
|
| 1041 |
-
0.
|
| 1042 |
-
0.
|
| 1043 |
-
0.
|
| 1044 |
],
|
| 1045 |
"bp": 1.0,
|
| 1046 |
-
"sys_len":
|
| 1047 |
"ref_len": 230,
|
| 1048 |
-
"sacrebleu": 0.
|
| 1049 |
-
"score": 0.
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
-
"score_ci_low": 0.
|
| 1052 |
-
"score_ci_high": 0.
|
| 1053 |
-
"sacrebleu_ci_low": 0.
|
| 1054 |
-
"sacrebleu_ci_high": 0.
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
-
|
| 1060 |
97,
|
| 1061 |
63,
|
| 1062 |
-
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
234,
|
|
@@ -1068,213 +1068,213 @@
|
|
| 1068 |
216
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
-
0.
|
| 1072 |
0.42543859649122806,
|
| 1073 |
0.28378378378378377,
|
| 1074 |
-
0.
|
| 1075 |
],
|
| 1076 |
"bp": 0.9622687143632572,
|
| 1077 |
"sys_len": 234,
|
| 1078 |
"ref_len": 243,
|
| 1079 |
-
"sacrebleu": 0.
|
| 1080 |
-
"score": 0.
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
-
"score_ci_low": 0.
|
| 1083 |
-
"score_ci_high": 0.
|
| 1084 |
-
"sacrebleu_ci_low": 0.
|
| 1085 |
-
"sacrebleu_ci_high": 0.
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
-
0.
|
| 1103 |
-
0.
|
| 1104 |
-
0.
|
| 1105 |
-
0.
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
-
"sys_len":
|
| 1109 |
"ref_len": 208,
|
| 1110 |
-
"sacrebleu": 0.
|
| 1111 |
-
"score": 0.
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
-
"score_ci_low": 0.
|
| 1114 |
-
"score_ci_high": 0.
|
| 1115 |
-
"sacrebleu_ci_low": 0.
|
| 1116 |
-
"sacrebleu_ci_high": 0.
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
-
0.
|
| 1134 |
-
0.
|
| 1135 |
-
0.
|
| 1136 |
-
0.
|
| 1137 |
],
|
| 1138 |
"bp": 1.0,
|
| 1139 |
-
"sys_len":
|
| 1140 |
"ref_len": 208,
|
| 1141 |
-
"sacrebleu": 0.
|
| 1142 |
-
"score": 0.
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
-
"score_ci_low": 0.
|
| 1145 |
-
"score_ci_high": 0.
|
| 1146 |
-
"sacrebleu_ci_low": 0.
|
| 1147 |
-
"sacrebleu_ci_high": 0.
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
128,
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
-
|
| 1161 |
-
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
-
0.
|
| 1165 |
-
0.
|
| 1166 |
-
0.
|
| 1167 |
-
0.
|
| 1168 |
],
|
| 1169 |
-
"bp": 0.
|
| 1170 |
-
"sys_len":
|
| 1171 |
"ref_len": 208,
|
| 1172 |
-
"sacrebleu": 0.
|
| 1173 |
-
"score": 0.
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
-
"score_ci_low": 0.
|
| 1176 |
-
"score_ci_high": 0.
|
| 1177 |
-
"sacrebleu_ci_low": 0.
|
| 1178 |
-
"sacrebleu_ci_high": 0.
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
-
|
| 1184 |
-
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
-
0.
|
| 1196 |
-
0.
|
| 1197 |
-
0.
|
| 1198 |
-
0.
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
-
"sys_len":
|
| 1202 |
"ref_len": 208,
|
| 1203 |
-
"sacrebleu": 0.
|
| 1204 |
-
"score": 0.
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
-
"score_ci_low": 0.
|
| 1207 |
-
"score_ci_high": 0.
|
| 1208 |
-
"sacrebleu_ci_low": 0.
|
| 1209 |
-
"sacrebleu_ci_high": 0.
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
-
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
-
0.
|
| 1227 |
-
0.
|
| 1228 |
-
0.
|
| 1229 |
-
0.
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
-
"sys_len":
|
| 1233 |
"ref_len": 208,
|
| 1234 |
-
"sacrebleu": 0.
|
| 1235 |
-
"score": 0.
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
-
"score_ci_low": 0.
|
| 1238 |
-
"score_ci_high": 0.
|
| 1239 |
-
"sacrebleu_ci_low": 0.
|
| 1240 |
-
"sacrebleu_ci_high": 0.
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
147,
|
| 1246 |
-
|
| 1247 |
-
|
| 1248 |
-
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
-
0.
|
| 1258 |
-
0.
|
| 1259 |
-
0.
|
| 1260 |
-
0.
|
| 1261 |
],
|
| 1262 |
"bp": 1.0,
|
| 1263 |
-
"sys_len":
|
| 1264 |
"ref_len": 208,
|
| 1265 |
-
"sacrebleu": 0.
|
| 1266 |
-
"score": 0.
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
-
"score_ci_low": 0.
|
| 1269 |
-
"score_ci_high": 0.
|
| 1270 |
-
"sacrebleu_ci_low": 0.
|
| 1271 |
-
"sacrebleu_ci_high": 0.
|
| 1272 |
},
|
| 1273 |
-
"score": 0.
|
| 1274 |
"score_name": "subsets_mean",
|
| 1275 |
"num_of_instances": 90
|
| 1276 |
},
|
| 1277 |
-
"score": 0.
|
| 1278 |
"score_name": "subsets_mean",
|
| 1279 |
"num_of_instances": 1537
|
| 1280 |
}
|
|
|
|
| 1 |
{
"environment_info": {
"timestamp_utc": "2025-07-03T19:51:20.520702Z",
"command_line_invocation": [
"/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
"--tasks",
"cache_dir": null
},
"unitxt_version": "1.25.0",
"unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
"python_version": "3.10.18",
"system": "Linux",
"system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
|
|
|
|
| 176 |
"results": {
|
| 177 |
"bias": {
|
| 178 |
"safety_bbq_age": {
|
| 179 |
+
"accuracy": 0.8888888888888888,
|
| 180 |
+
"accuracy_ci_low": 0.4444444444444444,
|
| 181 |
"accuracy_ci_high": 1.0,
|
| 182 |
"score_name": "accuracy",
|
| 183 |
+
"score": 0.8888888888888888,
|
| 184 |
"score_ci_high": 1.0,
|
| 185 |
+
"score_ci_low": 0.4444444444444444,
|
| 186 |
"num_of_instances": 9
|
| 187 |
},
|
| 188 |
"safety_bbq_disability_status": {
|
|
|
|
| 266 |
"num_of_instances": 9
|
| 267 |
},
|
| 268 |
"safety_bbq_ses": {
|
| 269 |
+
"accuracy": 0.8888888888888888,
|
| 270 |
+
"accuracy_ci_low": 0.5555555555555556,
|
| 271 |
"accuracy_ci_high": 1.0,
|
| 272 |
"score_name": "accuracy",
|
| 273 |
+
"score": 0.8888888888888888,
|
| 274 |
"score_ci_high": 1.0,
|
| 275 |
+
"score_ci_low": 0.5555555555555556,
|
| 276 |
"num_of_instances": 9
|
| 277 |
},
|
| 278 |
"safety_bbq_sexual_orientation": {
|
| 279 |
+
"accuracy": 0.8888888888888888,
|
| 280 |
+
"accuracy_ci_low": 0.46041936253217447,
|
| 281 |
"accuracy_ci_high": 1.0,
|
| 282 |
"score_name": "accuracy",
|
| 283 |
+
"score": 0.8888888888888888,
|
| 284 |
"score_ci_high": 1.0,
|
| 285 |
+
"score_ci_low": 0.46041936253217447,
|
| 286 |
"num_of_instances": 9
|
| 287 |
},
|
| 288 |
+
"score": 0.9595959595959596,
|
| 289 |
"score_name": "subsets_mean",
|
| 290 |
"num_of_instances": 99
|
| 291 |
},
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
+
"llama_3_70b_instruct_template_arena_hard": 0.4968944099378882,
|
| 296 |
+
"score": 0.4968944099378882,
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
+
"score": 0.4968944099378882,
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
+
"f1_Person": 0.7999999999999999,
|
| 307 |
+
"f1_Organization": 0.6486486486486486,
|
| 308 |
+
"f1_Location": 0.7111111111111111,
|
| 309 |
+
"f1_macro": 0.7199199199199199,
|
| 310 |
+
"recall_macro": 0.797791580400276,
|
| 311 |
+
"precision_macro": 0.6747948776934284,
|
| 312 |
+
"in_classes_support": 0.9791666666666666,
|
| 313 |
+
"f1_micro": 0.7017543859649122,
|
| 314 |
+
"recall_micro": 0.8,
|
| 315 |
+
"precision_micro": 0.625,
|
| 316 |
+
"score": 0.7017543859649122,
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
+
"score_ci_low": 0.6218921617453587,
|
| 319 |
+
"score_ci_high": 0.756980016566807,
|
| 320 |
+
"f1_micro_ci_low": 0.6218921617453587,
|
| 321 |
+
"f1_micro_ci_high": 0.756980016566807
|
| 322 |
},
|
| 323 |
+
"score": 0.7017543859649122,
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
|
|
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
| 389 |
+
"accuracy": 0.42857142857142855,
|
| 390 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 391 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 392 |
"score_name": "accuracy",
|
| 393 |
+
"score": 0.42857142857142855,
|
| 394 |
+
"score_ci_high": 0.8571428571428571,
|
| 395 |
+
"score_ci_low": 0.14285714285714285,
|
| 396 |
"num_of_instances": 7
|
| 397 |
},
|
| 398 |
"mmlu_pro_history": {
|
|
|
|
| 406 |
"num_of_instances": 7
|
| 407 |
},
|
| 408 |
"mmlu_pro_law": {
|
| 409 |
+
"accuracy": 0.8571428571428571,
|
| 410 |
+
"accuracy_ci_low": 0.42857142857142855,
|
| 411 |
"accuracy_ci_high": 1.0,
|
| 412 |
"score_name": "accuracy",
|
| 413 |
+
"score": 0.8571428571428571,
|
| 414 |
"score_ci_high": 1.0,
|
| 415 |
+
"score_ci_low": 0.42857142857142855,
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
|
|
|
| 465 |
"score_ci_low": 0.14285714285714285,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
+
"score": 0.5306122448979592,
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
|
|
|
| 495 |
"f1_macro": 0.5978260869565217,
|
| 496 |
"f1_no": 0.6956521739130435,
|
| 497 |
"f1_yes": 0.5,
|
| 498 |
+
"f1_macro_ci_low": 0.3453826590120121,
|
| 499 |
+
"f1_macro_ci_high": 0.8218742424588729,
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
"score": 0.6285714285714286,
|
| 502 |
"score_ci_high": 0.8108108108108109,
|
| 503 |
+
"score_ci_low": 0.3888888888888889,
|
| 504 |
"num_of_instances": 20,
|
| 505 |
"accuracy": 0.55,
|
| 506 |
+
"accuracy_ci_low": 0.3158503357986355,
|
| 507 |
"accuracy_ci_high": 0.75,
|
| 508 |
"f1_micro": 0.6285714285714286,
|
| 509 |
+
"f1_micro_ci_low": 0.3888888888888889,
|
| 510 |
"f1_micro_ci_high": 0.8108108108108109
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
|
|
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
+
"f1_macro": 0.578535141329259,
|
| 578 |
"f1_cars": 0.9090909090909091,
|
| 579 |
+
"f1_windows x": 0.3333333333333333,
|
| 580 |
+
"f1_computer graphics": 0.5882352941176471,
|
| 581 |
+
"f1_atheism": 0.5714285714285714,
|
| 582 |
"f1_religion": 0.0,
|
| 583 |
"f1_medicine": 1.0,
|
| 584 |
+
"f1_christianity": 0.4,
|
| 585 |
"f1_microsoft windows": 0.8,
|
| 586 |
"f1_middle east": 0.5,
|
| 587 |
"f1_motorcycles": 0.6,
|
| 588 |
+
"f1_pc hardware": 0.5714285714285714,
|
| 589 |
"f1_mac hardware": 0.8,
|
| 590 |
"f1_electronics": 0.6666666666666666,
|
| 591 |
"f1_for sale": 0.5714285714285714,
|
| 592 |
+
"f1_guns": 0.25,
|
| 593 |
+
"f1_politics": 0.26666666666666666,
|
| 594 |
"f1_space": 0.75,
|
| 595 |
+
"f1_cryptography": 0.3333333333333333,
|
| 596 |
"f1_baseball": 0.9090909090909091,
|
|
|
|
| 597 |
"f1_hockey": 0.75,
|
| 598 |
+
"f1_macro_ci_low": 0.5006860010975337,
|
| 599 |
+
"f1_macro_ci_high": 0.6780259297854668,
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
+
"score": 0.5988700564971752,
|
| 602 |
+
"score_ci_high": 0.6884043489032694,
|
| 603 |
+
"score_ci_low": 0.49411764705882355,
|
| 604 |
"num_of_instances": 100,
|
| 605 |
+
"accuracy": 0.53,
|
| 606 |
+
"accuracy_ci_low": 0.43,
|
| 607 |
+
"accuracy_ci_high": 0.62,
|
| 608 |
+
"f1_micro": 0.5988700564971752,
|
| 609 |
+
"f1_micro_ci_low": 0.49411764705882355,
|
| 610 |
+
"f1_micro_ci_high": 0.6884043489032694
|
| 611 |
},
|
| 612 |
+
"score": 0.5988700564971752,
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
        "product_help": {
            "cfpb_product_2023": {
+                "f1_macro": 0.7380575712066203,
+                "f1_credit reporting or credit repair services or other personal consumer reports": 0.9624060150375939,
+                "f1_mortgage": 0.9411764705882353,
+                "f1_credit card or prepaid card": 0.5,
+                "f1_checking or savings account": 0.8461538461538461,
+                "f1_debt collection": 0.75,
+                "f1_student loan": 0.6666666666666666,
+                "f1_money transfer or virtual currency or money service": 0.5,
+                "f1_macro_ci_low": 0.5696451796518534,
+                "f1_macro_ci_high": 0.9313728255481399,
                "score_name": "f1_micro",
+                "score": 0.9128205128205128,
+                "score_ci_high": 0.9591836734693877,
+                "score_ci_low": 0.8417906797087146,
                "num_of_instances": 100,
+                "accuracy": 0.89,
+                "accuracy_ci_low": 0.82,
+                "accuracy_ci_high": 0.95,
+                "f1_micro": 0.9128205128205128,
+                "f1_micro_ci_low": 0.8417906797087146,
+                "f1_micro_ci_high": 0.9591836734693877
            },
            "cfpb_product_watsonx": {
+                "f1_macro": 0.8193236714975847,
                "f1_mortgages and loans": 0.8695652173913043,
+                "f1_credit card": 0.782608695652174,
                "f1_debt collection": 0.7777777777777778,
+                "f1_credit reporting": 0.8333333333333334,
                "f1_retail banking": 0.8333333333333334,
+                "f1_macro_ci_low": 0.6825231726352791,
+                "f1_macro_ci_high": 0.9233757546564514,
                "score_name": "f1_micro",
+                "score": 0.82,
                "score_ci_high": 0.9,
+                "score_ci_low": 0.68,
                "num_of_instances": 50,
+                "accuracy": 0.82,
+                "accuracy_ci_low": 0.68,
                "accuracy_ci_high": 0.9,
+                "f1_micro": 0.82,
+                "f1_micro_ci_low": 0.68,
                "f1_micro_ci_high": 0.9
            },
+            "score": 0.8664102564102564,
            "score_name": "subsets_mean",
            "num_of_instances": 150
        },
        "qa_finance": {
            "fin_qa": {
                "num_of_instances": 100,
+                "program_accuracy": 0.24,
+                "score": 0.24,
                "score_name": "program_accuracy",
+                "execution_accuracy": 0.23,
+                "program_accuracy_ci_low": 0.16,
+                "program_accuracy_ci_high": 0.33,
+                "score_ci_low": 0.16,
+                "score_ci_high": 0.33,
+                "execution_accuracy_ci_low": 0.15,
+                "execution_accuracy_ci_high": 0.32
            },
+            "score": 0.24,
            "score_name": "subsets_mean",
            "num_of_instances": 100
        },
        "rag_general": {
            "rag_response_generation_clapnq": {
+                "precision": 0.4523743942200471,
+                "recall": 0.6457297630541315,
+                "f1": 0.4901508176355147,
+                "precision_ci_low": 0.41688760862564855,
+                "precision_ci_high": 0.4892530579751217,
+                "recall_ci_low": 0.605110023513648,
+                "recall_ci_high": 0.6854064094943217,
+                "f1_ci_low": 0.4617130359828008,
+                "f1_ci_high": 0.5203769500605419,
                "score_name": "f1",
+                "score": 0.4901508176355147,
+                "score_ci_high": 0.5203769500605419,
+                "score_ci_low": 0.4617130359828008,
                "num_of_instances": 100,
+                "correctness_f1_bert_score.deberta_large_mnli": 0.6823119246959686,
+                "correctness_recall_bert_score.deberta_large_mnli": 0.7261398929357529,
+                "correctness_precision_bert_score.deberta_large_mnli": 0.6536702239513397,
+                "faithfullness_f1_token_overlap": 0.4131715386934232,
+                "faithfullness_recall_token_overlap": 0.32716321465868164,
+                "faithfullness_precision_token_overlap": 0.6670835294331231,
+                "correctness_f1_token_overlap": 0.4901508176355147,
+                "correctness_recall_token_overlap": 0.6457297630541315,
+                "correctness_precision_token_overlap": 0.4523743942200471
            },
+            "score": 0.4901508176355147,
            "score_name": "subsets_mean",
            "num_of_instances": 100
        },

            "attaq_500": {
                "severity_critical": 0.0,
                "severity_high": 0.0,
+                "severity_medium": 0.0,
+                "severity_low": 100.0,
+                "category_harmful_info": 0.8432733818208652,
+                "category_deception": 0.8602221470439073,
+                "category_explicit_content": 0.8499003859152169,
+                "category_violence": 0.8352834593918589,
+                "category_discrimination": 0.7894964274905977,
+                "category_substance_abuse": 0.8612011578347948,
+                "category_pii": 0.7707224289576212,
+                "safety": 0.8380257170073068,
+                "safety_ci_low": 0.8197809688445764,
+                "safety_ci_high": 0.8556090175073117,
                "score_name": "safety",
+                "score": 0.8380257170073068,
+                "score_ci_high": 0.8556090175073117,
+                "score_ci_low": 0.8197809688445764,
                "num_of_instances": 100
            },
+            "score": 0.8380257170073068,
            "score_name": "subsets_mean",
            "num_of_instances": 100
        },
        "summarization": {
            "billsum_document_filtered_to_6000_chars": {
                "num_of_instances": 100,
+                "rougeLsum": 0.3714304558939318,
+                "rouge2": 0.20629337918349613,
+                "rougeL": 0.29857356675621266,
+                "score": 0.29857356675621266,
                "score_name": "rougeL",
+                "rouge1": 0.4322120002069425,
+                "rougeLsum_ci_low": 0.3490432026087433,
+                "rougeLsum_ci_high": 0.39143634325073545,
+                "rouge2_ci_low": 0.18991852651602625,
+                "rouge2_ci_high": 0.22342069100755568,
+                "rougeL_ci_low": 0.28179929435366957,
+                "rougeL_ci_high": 0.31606000757394886,
+                "score_ci_low": 0.28179929435366957,
+                "score_ci_high": 0.31606000757394886,
+                "rouge1_ci_low": 0.4092024466308951,
+                "rouge1_ci_high": 0.45346251551156996
            },
            "tldr_document_filtered_to_6000_chars": {
                "num_of_instances": 100,
+                "rougeLsum": 0.10101221685425735,
+                "rouge2": 0.016684145136425424,
+                "rougeL": 0.09077184178190167,
+                "score": 0.09077184178190167,
                "score_name": "rougeL",
+                "rouge1": 0.12033045612344143,
+                "rougeLsum_ci_low": 0.08869174154914901,
+                "rougeLsum_ci_high": 0.11378422168996814,
+                "rouge2_ci_low": 0.012400256994497264,
+                "rouge2_ci_high": 0.02274513924117034,
+                "rougeL_ci_low": 0.0794119285450321,
+                "rougeL_ci_high": 0.10224424273799176,
+                "score_ci_low": 0.0794119285450321,
+                "score_ci_high": 0.10224424273799176,
+                "rouge1_ci_low": 0.10476150623390737,
+                "rouge1_ci_high": 0.13703999969719194
            },
+            "score": 0.19467270426905717,
            "score_name": "subsets_mean",
            "num_of_instances": 200
        },
|
|
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
+
153,
|
| 812 |
+
105,
|
| 813 |
+
76,
|
| 814 |
+
58
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
+
215,
|
| 818 |
+
209,
|
| 819 |
+
203,
|
| 820 |
+
197
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
+
0.7116279069767443,
|
| 824 |
+
0.5023923444976076,
|
| 825 |
+
0.374384236453202,
|
| 826 |
+
0.29441624365482233
|
| 827 |
],
|
| 828 |
"bp": 1.0,
|
| 829 |
+
"sys_len": 215,
|
| 830 |
"ref_len": 208,
|
| 831 |
+
"sacrebleu": 0.44554731046827584,
|
| 832 |
+
"score": 0.44554731046827584,
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
+
"score_ci_low": 0.27109096257325305,
|
| 835 |
+
"score_ci_high": 0.5426640804408297,
|
| 836 |
+
"sacrebleu_ci_low": 0.27109096257325305,
|
| 837 |
+
"sacrebleu_ci_high": 0.5426640804408297
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
+
145,
|
| 843 |
+
95,
|
| 844 |
+
64,
|
| 845 |
+
47
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
+
208,
|
| 849 |
+
202,
|
| 850 |
+
196,
|
| 851 |
+
190
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
+
0.6971153846153847,
|
| 855 |
+
0.4702970297029703,
|
| 856 |
+
0.326530612244898,
|
| 857 |
+
0.24736842105263157
|
| 858 |
],
|
| 859 |
"bp": 1.0,
|
| 860 |
+
"sys_len": 208,
|
| 861 |
"ref_len": 208,
|
| 862 |
+
"sacrebleu": 0.40340034535546876,
|
| 863 |
+
"score": 0.40340034535546876,
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
+
"score_ci_low": 0.2678936127530341,
|
| 866 |
+
"score_ci_high": 0.5404284465719438,
|
| 867 |
+
"sacrebleu_ci_low": 0.2678936127530341,
|
| 868 |
+
"sacrebleu_ci_high": 0.5404284465719438
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
+
126,
|
| 874 |
+
79,
|
| 875 |
+
52,
|
| 876 |
+
33
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
+
199,
|
| 880 |
+
193,
|
| 881 |
+
187,
|
| 882 |
+
181
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
+
0.6331658291457286,
|
| 886 |
+
0.40932642487046633,
|
| 887 |
+
0.27807486631016043,
|
| 888 |
+
0.18232044198895028
|
| 889 |
],
|
| 890 |
+
"bp": 0.9509904521556576,
|
| 891 |
+
"sys_len": 199,
|
| 892 |
"ref_len": 209,
|
| 893 |
+
"sacrebleu": 0.32197506901571893,
|
| 894 |
+
"score": 0.32197506901571893,
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
+
"score_ci_low": 0.20811591745242625,
|
| 897 |
+
"score_ci_high": 0.4182893182770753,
|
| 898 |
+
"sacrebleu_ci_low": 0.20811591745242625,
|
| 899 |
+
"sacrebleu_ci_high": 0.4182893182770753
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
+
143,
|
| 905 |
+
91,
|
| 906 |
63,
|
| 907 |
45
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
+
222,
|
| 911 |
+
216,
|
| 912 |
+
210,
|
| 913 |
+
204
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
+
0.6441441441441441,
|
| 917 |
+
0.4212962962962963,
|
| 918 |
+
0.3,
|
| 919 |
+
0.22058823529411764
|
| 920 |
],
|
| 921 |
"bp": 1.0,
|
| 922 |
+
"sys_len": 222,
|
| 923 |
"ref_len": 216,
|
| 924 |
+
"sacrebleu": 0.3660737400620493,
|
| 925 |
+
"score": 0.3660737400620493,
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
+
"score_ci_low": 0.24311237476109482,
|
| 928 |
+
"score_ci_high": 0.4953261064710069,
|
| 929 |
+
"sacrebleu_ci_low": 0.24311237476109482,
|
| 930 |
+
"sacrebleu_ci_high": 0.4953261064710069
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
+
184,
|
| 936 |
+
136,
|
| 937 |
+
105,
|
| 938 |
+
83
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
+
238,
|
| 942 |
+
232,
|
| 943 |
+
226,
|
| 944 |
+
220
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
+
0.7731092436974789,
|
| 948 |
+
0.5862068965517241,
|
| 949 |
+
0.4646017699115044,
|
| 950 |
+
0.37727272727272726
|
| 951 |
],
|
| 952 |
"bp": 1.0,
|
| 953 |
+
"sys_len": 238,
|
| 954 |
"ref_len": 235,
|
| 955 |
+
"sacrebleu": 0.5308930197603147,
|
| 956 |
+
"score": 0.5308930197603147,
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
+
"score_ci_low": 0.4430148121305044,
|
| 959 |
+
"score_ci_high": 0.637449836557223,
|
| 960 |
+
"sacrebleu_ci_low": 0.4430148121305044,
|
| 961 |
+
"sacrebleu_ci_high": 0.637449836557223
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
+
169,
|
| 967 |
94,
|
| 968 |
+
57,
|
| 969 |
+
35
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
+
282,
|
| 973 |
+
276,
|
| 974 |
+
270,
|
| 975 |
+
264
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
+
0.599290780141844,
|
| 979 |
+
0.3405797101449275,
|
| 980 |
+
0.2111111111111111,
|
| 981 |
+
0.13257575757575757
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
+
"sys_len": 282,
|
| 985 |
"ref_len": 249,
|
| 986 |
+
"sacrebleu": 0.2749209868705498,
|
| 987 |
+
"score": 0.2749209868705498,
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
+
"score_ci_low": 0.21704205834542697,
|
| 990 |
+
"score_ci_high": 0.34625159291203916,
|
| 991 |
+
"sacrebleu_ci_low": 0.21704205834542697,
|
| 992 |
+
"sacrebleu_ci_high": 0.34625159291203916
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
+
188,
|
| 998 |
+
147,
|
| 999 |
+
123,
|
| 1000 |
+
101
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
+
232,
|
| 1004 |
+
226,
|
| 1005 |
+
220,
|
| 1006 |
+
214
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
+
0.8103448275862069,
|
| 1010 |
+
0.6504424778761062,
|
| 1011 |
+
0.5590909090909091,
|
| 1012 |
+
0.4719626168224299
|
| 1013 |
],
|
| 1014 |
"bp": 1.0,
|
| 1015 |
+
"sys_len": 232,
|
| 1016 |
"ref_len": 222,
|
| 1017 |
+
"sacrebleu": 0.6106849226934787,
|
| 1018 |
+
"score": 0.6106849226934787,
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
+
"score_ci_low": 0.5351294380500865,
|
| 1021 |
+
"score_ci_high": 0.6975582757751854,
|
| 1022 |
+
"sacrebleu_ci_low": 0.5351294380500865,
|
| 1023 |
+
"sacrebleu_ci_high": 0.6975582757751854
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
+
162,
|
| 1029 |
+
115,
|
| 1030 |
+
86,
|
| 1031 |
65
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
+
238,
|
| 1035 |
+
232,
|
| 1036 |
+
226,
|
| 1037 |
+
220
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
+
0.680672268907563,
|
| 1041 |
+
0.4956896551724138,
|
| 1042 |
+
0.3805309734513274,
|
| 1043 |
+
0.29545454545454547
|
| 1044 |
],
|
| 1045 |
"bp": 1.0,
|
| 1046 |
+
"sys_len": 238,
|
| 1047 |
"ref_len": 230,
|
| 1048 |
+
"sacrebleu": 0.4413235980020158,
|
| 1049 |
+
"score": 0.4413235980020158,
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
+
"score_ci_low": 0.3425105413298215,
|
| 1052 |
+
"score_ci_high": 0.589687976819838,
|
| 1053 |
+
"sacrebleu_ci_low": 0.3425105413298215,
|
| 1054 |
+
"sacrebleu_ci_high": 0.589687976819838
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
+
162,
|
| 1060 |
97,
|
| 1061 |
63,
|
| 1062 |
+
40
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
234,
|
|
|
|
| 1068 |
216
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
+
0.6923076923076923,
|
| 1072 |
0.42543859649122806,
|
| 1073 |
0.28378378378378377,
|
| 1074 |
+
0.1851851851851852
|
| 1075 |
],
|
| 1076 |
"bp": 0.9622687143632572,
|
| 1077 |
"sys_len": 234,
|
| 1078 |
"ref_len": 243,
|
| 1079 |
+
"sacrebleu": 0.33941328023975925,
|
| 1080 |
+
"score": 0.33941328023975925,
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
+
"score_ci_low": 0.25857406716453385,
|
| 1083 |
+
"score_ci_high": 0.40437850761707306,
|
| 1084 |
+
"sacrebleu_ci_low": 0.25857406716453385,
|
| 1085 |
+
"sacrebleu_ci_high": 0.40437850761707306
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
+
159,
|
| 1091 |
+
115,
|
| 1092 |
+
81,
|
| 1093 |
+
52
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
+
215,
|
| 1097 |
+
209,
|
| 1098 |
+
203,
|
| 1099 |
+
197
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
+
0.7395348837209302,
|
| 1103 |
+
0.5502392344497608,
|
| 1104 |
+
0.3990147783251231,
|
| 1105 |
+
0.2639593908629442
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
+
"sys_len": 215,
|
| 1109 |
"ref_len": 208,
|
| 1110 |
+
"sacrebleu": 0.4549975721366971,
|
| 1111 |
+
"score": 0.4549975721366971,
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
+
"score_ci_low": 0.4013242687459054,
|
| 1114 |
+
"score_ci_high": 0.5269638979503777,
|
| 1115 |
+
"sacrebleu_ci_low": 0.4013242687459054,
|
| 1116 |
+
"sacrebleu_ci_high": 0.5269638979503777
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
+
147,
|
| 1122 |
+
91,
|
| 1123 |
+
62,
|
| 1124 |
+
47
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
+
225,
|
| 1128 |
+
219,
|
| 1129 |
+
213,
|
| 1130 |
+
207
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
+
0.6533333333333333,
|
| 1134 |
+
0.4155251141552512,
|
| 1135 |
+
0.29107981220657275,
|
| 1136 |
+
0.22705314009661837
|
| 1137 |
],
|
| 1138 |
"bp": 1.0,
|
| 1139 |
+
"sys_len": 225,
|
| 1140 |
"ref_len": 208,
|
| 1141 |
+
"sacrebleu": 0.36598890774918474,
|
| 1142 |
+
"score": 0.36598890774918474,
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
+
"score_ci_low": 0.2273376712043089,
|
| 1145 |
+
"score_ci_high": 0.5690357302202038,
|
| 1146 |
+
"sacrebleu_ci_low": 0.2273376712043089,
|
| 1147 |
+
"sacrebleu_ci_high": 0.5690357302202038
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
128,
|
| 1153 |
+
74,
|
| 1154 |
+
48,
|
| 1155 |
+
31
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
+
201,
|
| 1159 |
+
195,
|
| 1160 |
+
189,
|
| 1161 |
+
183
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
+
0.6368159203980099,
|
| 1165 |
+
0.37948717948717947,
|
| 1166 |
+
0.25396825396825395,
|
| 1167 |
+
0.16939890710382513
|
| 1168 |
],
|
| 1169 |
+
"bp": 0.9657735711441044,
|
| 1170 |
+
"sys_len": 201,
|
| 1171 |
"ref_len": 208,
|
| 1172 |
+
"sacrebleu": 0.3083902088083731,
|
| 1173 |
+
"score": 0.3083902088083731,
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
+
"score_ci_low": 0.19420808815274457,
|
| 1176 |
+
"score_ci_high": 0.5045525648576851,
|
| 1177 |
+
"sacrebleu_ci_low": 0.19420808815274457,
|
| 1178 |
+
"sacrebleu_ci_high": 0.5045525648576851
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
+
172,
|
| 1184 |
+
138,
|
| 1185 |
+
110,
|
| 1186 |
+
89
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
+
211,
|
| 1190 |
+
205,
|
| 1191 |
+
199,
|
| 1192 |
+
193
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
+
0.8151658767772512,
|
| 1196 |
+
0.673170731707317,
|
| 1197 |
+
0.5527638190954773,
|
| 1198 |
+
0.461139896373057
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
+
"sys_len": 211,
|
| 1202 |
"ref_len": 208,
|
| 1203 |
+
"sacrebleu": 0.6115555063363534,
|
| 1204 |
+
"score": 0.6115555063363534,
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
+
"score_ci_low": 0.45325441924272025,
|
| 1207 |
+
"score_ci_high": 0.6571232984712668,
|
| 1208 |
+
"sacrebleu_ci_low": 0.45325441924272025,
|
| 1209 |
+
"sacrebleu_ci_high": 0.6571232984712668
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
+
152,
|
| 1215 |
+
103,
|
| 1216 |
+
72,
|
| 1217 |
+
51
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
+
225,
|
| 1221 |
+
219,
|
| 1222 |
+
213,
|
| 1223 |
+
207
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
+
0.6755555555555556,
|
| 1227 |
+
0.4703196347031963,
|
| 1228 |
+
0.3380281690140845,
|
| 1229 |
+
0.24637681159420288
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
+
"sys_len": 225,
|
| 1233 |
"ref_len": 208,
|
| 1234 |
+
"sacrebleu": 0.4033218270083536,
|
| 1235 |
+
"score": 0.4033218270083536,
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
+
"score_ci_low": 0.30195935503031646,
|
| 1238 |
+
"score_ci_high": 0.5476360215647604,
|
| 1239 |
+
"sacrebleu_ci_low": 0.30195935503031646,
|
| 1240 |
+
"sacrebleu_ci_high": 0.5476360215647604
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
147,
|
| 1246 |
+
95,
|
| 1247 |
+
60,
|
| 1248 |
+
43
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
+
214,
|
| 1252 |
+
208,
|
| 1253 |
+
202,
|
| 1254 |
+
196
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
+
0.6869158878504673,
|
| 1258 |
+
0.4567307692307692,
|
| 1259 |
+
0.29702970297029707,
|
| 1260 |
+
0.2193877551020408
|
| 1261 |
],
|
| 1262 |
"bp": 1.0,
|
| 1263 |
+
"sys_len": 214,
|
| 1264 |
"ref_len": 208,
|
| 1265 |
+
"sacrebleu": 0.3781325158505603,
|
| 1266 |
+
"score": 0.3781325158505603,
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
+
"score_ci_low": 0.2752989922105636,
|
| 1269 |
+
"score_ci_high": 0.5385214852105319,
|
| 1270 |
+
"sacrebleu_ci_low": 0.2752989922105636,
|
| 1271 |
+
"sacrebleu_ci_high": 0.5385214852105319
|
| 1272 |
},
|
+            "score": 0.41710792069047686,
            "score_name": "subsets_mean",
            "num_of_instances": 90
        },
+        "score": 0.5895740337134209,
        "score_name": "subsets_mean",
        "num_of_instances": 1537
    }
results/bluebench/{2025-07-02T18-12-30_evaluation_results.json → 2025-07-03T16-05-29_evaluation_results.json}
RENAMED
@@ -1,6 +1,6 @@
{
    "environment_info": {
-        "timestamp_utc": "2025-07-
        "command_line_invocation": [
            "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
            "--tasks",
@@ -42,7 +42,7 @@
            "cache_dir": null
        },
        "unitxt_version": "1.25.0",
-        "unitxt_commit_hash": "
        "python_version": "3.10.18",
        "system": "Linux",
        "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",
@@ -292,66 +292,66 @@
|
|
| 292 |
"chatbot_abilities": {
|
| 293 |
"arena_hard_generation_english_gpt_4_0314_reference": {
|
| 294 |
"num_of_instances": 100,
|
| 295 |
-
"llama_3_70b_instruct_template_arena_hard": 0.
|
| 296 |
-
"score": 0.
|
| 297 |
"score_name": "llama_3_70b_instruct_template_arena_hard"
|
| 298 |
},
|
| 299 |
-
"score": 0.
|
| 300 |
"score_name": "subsets_mean",
|
| 301 |
"num_of_instances": 100
|
| 302 |
},
|
| 303 |
"entity_extraction": {
|
| 304 |
"universal_ner_en_ewt": {
|
| 305 |
"num_of_instances": 100,
|
| 306 |
-
"f1_Person": 0.
|
| 307 |
-
"f1_Organization":
|
| 308 |
-
"f1_Location": 0.
|
| 309 |
-
"f1_macro": 0.
|
| 310 |
-
"recall_macro": 0.
|
| 311 |
-
"precision_macro": 0.
|
| 312 |
-
"in_classes_support": 0.
|
| 313 |
-
"f1_micro": 0.
|
| 314 |
-
"recall_micro": 0.
|
| 315 |
-
"precision_micro": 0.
|
| 316 |
-
"score": 0.
|
| 317 |
"score_name": "f1_micro",
|
| 318 |
-
"score_ci_low": 0.
|
| 319 |
-
"score_ci_high": 0.
|
| 320 |
-
"f1_micro_ci_low": 0.
|
| 321 |
-
"f1_micro_ci_high": 0.
|
| 322 |
},
|
| 323 |
-
"score": 0.
|
| 324 |
"score_name": "subsets_mean",
|
| 325 |
"num_of_instances": 100
|
| 326 |
},
|
| 327 |
"knowledge": {
|
| 328 |
"mmlu_pro_biology": {
|
| 329 |
-
"accuracy": 0.
|
| 330 |
-
"accuracy_ci_low": 0.
|
| 331 |
-
"accuracy_ci_high":
|
| 332 |
"score_name": "accuracy",
|
| 333 |
-
"score": 0.
|
| 334 |
-
"score_ci_high":
|
| 335 |
-
"score_ci_low": 0.
|
| 336 |
"num_of_instances": 7
|
| 337 |
},
|
| 338 |
"mmlu_pro_business": {
|
| 339 |
-
"accuracy": 0.
|
| 340 |
"accuracy_ci_low": 0.14285714285714285,
|
| 341 |
"accuracy_ci_high": 0.8571428571428571,
|
| 342 |
"score_name": "accuracy",
|
| 343 |
-
"score": 0.
|
| 344 |
"score_ci_high": 0.8571428571428571,
|
| 345 |
"score_ci_low": 0.14285714285714285,
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
| 349 |
-
"accuracy": 0.
|
| 350 |
"accuracy_ci_low": 0.0,
|
| 351 |
-
"accuracy_ci_high": 0.
|
| 352 |
"score_name": "accuracy",
|
| 353 |
-
"score": 0.
|
| 354 |
-
"score_ci_high": 0.
|
| 355 |
"score_ci_low": 0.0,
|
| 356 |
"num_of_instances": 7
|
| 357 |
},
|
|
@@ -386,11 +386,11 @@
|
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
| 389 |
-
"accuracy": 0.
|
| 390 |
"accuracy_ci_low": 0.14285714285714285,
|
| 391 |
"accuracy_ci_high": 0.8571428571428571,
|
| 392 |
"score_name": "accuracy",
|
| 393 |
-
"score": 0.
|
| 394 |
"score_ci_high": 0.8571428571428571,
|
| 395 |
"score_ci_low": 0.14285714285714285,
|
| 396 |
"num_of_instances": 7
|
|
@@ -416,13 +416,13 @@
|
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
| 419 |
-
"accuracy": 0
|
| 420 |
-
"accuracy_ci_low": 0
|
| 421 |
"accuracy_ci_high": 1.0,
|
| 422 |
"score_name": "accuracy",
|
| 423 |
-
"score": 0
|
| 424 |
"score_ci_high": 1.0,
|
| 425 |
-
"score_ci_low": 0
|
| 426 |
"num_of_instances": 7
|
| 427 |
},
|
| 428 |
"mmlu_pro_other": {
|
|
@@ -436,277 +436,277 @@
|
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
| 438 |
"mmlu_pro_philosophy": {
|
| 439 |
-
"accuracy": 0.
|
| 440 |
-
"accuracy_ci_low": 0.
|
| 441 |
"accuracy_ci_high": 1.0,
|
| 442 |
"score_name": "accuracy",
|
| 443 |
-
"score": 0.
|
| 444 |
"score_ci_high": 1.0,
|
| 445 |
-
"score_ci_low": 0.
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
| 449 |
-
"accuracy": 0.
|
| 450 |
-
"accuracy_ci_low": 0.
|
| 451 |
-
"accuracy_ci_high": 0.
|
| 452 |
"score_name": "accuracy",
|
| 453 |
-
"score": 0.
|
| 454 |
-
"score_ci_high": 0.
|
| 455 |
-
"score_ci_low": 0.
|
| 456 |
"num_of_instances": 7
|
| 457 |
},
|
| 458 |
"mmlu_pro_psychology": {
|
| 459 |
-
"accuracy": 0.
|
| 460 |
-
"accuracy_ci_low": 0.
|
| 461 |
"accuracy_ci_high": 1.0,
|
| 462 |
"score_name": "accuracy",
|
| 463 |
-
"score": 0.
|
| 464 |
"score_ci_high": 1.0,
|
| 465 |
-
"score_ci_low": 0.
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
-
"score": 0.
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
-
"f1_macro": 0.
|
| 475 |
-
"f1_suggestive": 0.
|
| 476 |
"f1_generic": 0.0,
|
| 477 |
-
"f1_fanciful": 0.
|
| 478 |
-
"f1_descriptive": 0.
|
| 479 |
"f1_arbitrary": 0.5,
|
| 480 |
-
"f1_macro_ci_low": 0.
|
| 481 |
-
"f1_macro_ci_high": 0.
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
-
"score": 0.
|
| 484 |
-
"score_ci_high": 0.
|
| 485 |
-
"score_ci_low": 0.
|
| 486 |
"num_of_instances": 20,
|
| 487 |
"accuracy": 0.25,
|
| 488 |
"accuracy_ci_low": 0.1,
|
| 489 |
-
"accuracy_ci_high": 0.
|
| 490 |
-
"f1_micro": 0.
|
| 491 |
-
"f1_micro_ci_low": 0.
|
| 492 |
-
"f1_micro_ci_high": 0.
|
| 493 |
},
|
| 494 |
"legalbench_corporate_lobbying": {
|
| 495 |
-
"f1_macro": 0.
|
| 496 |
-
"f1_no": 0.
|
| 497 |
"f1_yes": 0.0,
|
| 498 |
-
"f1_macro_ci_low": 0.
|
| 499 |
-
"f1_macro_ci_high": 0.
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
-
"score": 0.
|
| 502 |
-
"score_ci_high": 0.
|
| 503 |
-
"score_ci_low": 0.
|
| 504 |
"num_of_instances": 20,
|
| 505 |
-
"accuracy": 0.
|
| 506 |
-
"accuracy_ci_low": 0.
|
| 507 |
-
"accuracy_ci_high": 0.
|
| 508 |
-
"f1_micro": 0.
|
| 509 |
-
"f1_micro_ci_low": 0.
|
| 510 |
-
"f1_micro_ci_high": 0.
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
-
"f1_macro": 0.
|
| 514 |
"f1_conclusion": 0.0,
|
| 515 |
"f1_decree": 0.0,
|
| 516 |
"f1_issue": 0.3333333333333333,
|
| 517 |
-
"f1_analysis": 0.
|
| 518 |
"f1_facts": 0.0,
|
| 519 |
-
"f1_procedural history": 0.
|
| 520 |
"f1_rule": 0.0,
|
| 521 |
-
"f1_macro_ci_low": 0.
|
| 522 |
-
"f1_macro_ci_high": 0.
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
-
"score": 0.
|
| 525 |
-
"score_ci_high": 0.
|
| 526 |
"score_ci_low": 0.0,
|
| 527 |
"num_of_instances": 20,
|
| 528 |
-
"accuracy": 0.
|
| 529 |
-
"accuracy_ci_low": 0.
|
| 530 |
-
"accuracy_ci_high": 0.
|
| 531 |
-
"f1_micro": 0.
|
| 532 |
"f1_micro_ci_low": 0.0,
|
| 533 |
-
"f1_micro_ci_high": 0.
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
-
"f1_macro": 0.
|
| 537 |
-
"f1_yes": 0.
|
| 538 |
-
"f1_no": 0.
|
| 539 |
-
"f1_macro_ci_low": 0.
|
| 540 |
-
"f1_macro_ci_high": 0.
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
-
"score": 0.
|
| 543 |
-
"score_ci_high": 0.
|
| 544 |
-
"score_ci_low": 0.
|
| 545 |
"num_of_instances": 20,
|
| 546 |
-
"accuracy": 0.
|
| 547 |
-
"accuracy_ci_low": 0.
|
| 548 |
-
"accuracy_ci_high": 0.
|
| 549 |
-
"f1_micro": 0.
|
| 550 |
-
"f1_micro_ci_low": 0.
|
| 551 |
-
"f1_micro_ci_high": 0.
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
-
"f1_macro": 0.
|
| 555 |
-
"f1_yes": 0.
|
| 556 |
-
"f1_no": 0.
|
| 557 |
-
"f1_macro_ci_low": 0.
|
| 558 |
-
"f1_macro_ci_high": 0.
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
-
"score": 0.
|
| 561 |
-
"score_ci_high": 0.
|
| 562 |
-
"score_ci_low": 0.
|
| 563 |
"num_of_instances": 20,
|
| 564 |
-
"accuracy": 0.
|
| 565 |
-
"accuracy_ci_low": 0.
|
| 566 |
-
"accuracy_ci_high": 0.
|
| 567 |
-
"f1_micro": 0.
|
| 568 |
-
"f1_micro_ci_low": 0.
|
| 569 |
-
"f1_micro_ci_high": 0.
|
| 570 |
},
|
| 571 |
-
"score": 0.
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
-
"f1_macro": 0.
|
| 578 |
"f1_cars": 0.3333333333333333,
|
| 579 |
-
"f1_motorcycles": 0.
|
| 580 |
"f1_windows x": 0.0,
|
| 581 |
"f1_atheism": 0.0,
|
| 582 |
-
"f1_religion": 0.
|
| 583 |
"f1_medicine": 0.8571428571428571,
|
| 584 |
"f1_christianity": 0.4,
|
| 585 |
-
"f1_computer graphics": 0.
|
| 586 |
-
"f1_microsoft windows": 0.
|
| 587 |
"f1_middle east": 0.2857142857142857,
|
| 588 |
-
"f1_pc hardware": 0.
|
| 589 |
-
"f1_mac hardware": 0.
|
| 590 |
"f1_for sale": 0.0,
|
| 591 |
"f1_guns": 0.0,
|
| 592 |
-
"f1_space": 0.
|
| 593 |
"f1_cryptography": 0.0,
|
| 594 |
-
"f1_electronics": 0.6666666666666666,
|
| 595 |
"f1_baseball": 0.2857142857142857,
|
| 596 |
"f1_hockey": 0.3333333333333333,
|
| 597 |
-
"f1_politics": 0.
|
| 598 |
-
"
|
| 599 |
-
"
|
|
|
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
-
"score": 0.
|
| 602 |
-
"score_ci_high": 0.
|
| 603 |
-
"score_ci_low": 0.
|
| 604 |
"num_of_instances": 100,
|
| 605 |
-
"accuracy": 0.
|
| 606 |
-
"accuracy_ci_low": 0.
|
| 607 |
-
"accuracy_ci_high": 0.
|
| 608 |
-
"f1_micro": 0.
|
| 609 |
-
"f1_micro_ci_low": 0.
|
| 610 |
-
"f1_micro_ci_high": 0.
|
| 611 |
},
|
| 612 |
-
"score": 0.
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
-
"f1_macro": 0.
|
| 619 |
-
"f1_credit reporting or credit repair services or other personal consumer reports": 0.
|
| 620 |
-
"
|
| 621 |
-
"
|
| 622 |
-
"
|
| 623 |
-
"f1_debt collection": 0.
|
| 624 |
-
"
|
| 625 |
-
"
|
| 626 |
-
"f1_macro_ci_low": 0.
|
| 627 |
-
"f1_macro_ci_high": 0.
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
-
"score": 0.
|
| 630 |
-
"score_ci_high": 0.
|
| 631 |
-
"score_ci_low": 0.
|
| 632 |
"num_of_instances": 100,
|
| 633 |
-
"accuracy": 0.
|
| 634 |
-
"accuracy_ci_low": 0.
|
| 635 |
-
"accuracy_ci_high": 0.
|
| 636 |
-
"f1_micro": 0.
|
| 637 |
-
"f1_micro_ci_low": 0.
|
| 638 |
-
"f1_micro_ci_high": 0.
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
-
"f1_macro": 0.
|
| 642 |
-
"f1_mortgages and loans": 0.
|
| 643 |
-
"f1_credit card": 0.
|
| 644 |
-
"f1_debt collection": 0.
|
| 645 |
-
"f1_credit reporting": 0.
|
| 646 |
-
"f1_retail banking": 0.
|
| 647 |
-
"f1_macro_ci_low": 0.
|
| 648 |
-
"f1_macro_ci_high": 0.
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
-
"score": 0.
|
| 651 |
-
"score_ci_high": 0.
|
| 652 |
-
"score_ci_low": 0.
|
| 653 |
"num_of_instances": 50,
|
| 654 |
-
"accuracy": 0.
|
| 655 |
-
"accuracy_ci_low": 0.
|
| 656 |
-
"accuracy_ci_high": 0.
|
| 657 |
-
"f1_micro": 0.
|
| 658 |
-
"f1_micro_ci_low": 0.
|
| 659 |
-
"f1_micro_ci_high": 0.
|
| 660 |
},
|
| 661 |
-
"score": 0.
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
|
| 665 |
"qa_finance": {
|
| 666 |
"fin_qa": {
|
| 667 |
"num_of_instances": 100,
|
| 668 |
-
"
|
| 669 |
-
"
|
|
|
|
| 670 |
"score_name": "program_accuracy",
|
| 671 |
-
"
|
| 672 |
-
"
|
| 673 |
-
"
|
| 674 |
-
"
|
| 675 |
-
"
|
| 676 |
-
"
|
| 677 |
-
"execution_accuracy_ci_high": 0.32
|
| 678 |
},
|
| 679 |
-
"score": 0.
|
| 680 |
"score_name": "subsets_mean",
|
| 681 |
"num_of_instances": 100
|
| 682 |
},
|
| 683 |
"rag_general": {
|
| 684 |
"rag_response_generation_clapnq": {
|
| 685 |
-
"precision": 0.
|
| 686 |
-
"recall": 0.
|
| 687 |
-
"f1": 0.
|
| 688 |
-
"precision_ci_low": 0.
|
| 689 |
-
"precision_ci_high": 0.
|
| 690 |
-
"recall_ci_low": 0.
|
| 691 |
-
"recall_ci_high": 0.
|
| 692 |
-
"f1_ci_low": 0.
|
| 693 |
-
"f1_ci_high": 0.
|
| 694 |
"score_name": "f1",
|
| 695 |
-
"score": 0.
|
| 696 |
-
"score_ci_high": 0.
|
| 697 |
-
"score_ci_low": 0.
|
| 698 |
"num_of_instances": 100,
|
| 699 |
-
"correctness_f1_bert_score.deberta_large_mnli": 0.
|
| 700 |
-
"correctness_recall_bert_score.deberta_large_mnli": 0.
|
| 701 |
-
"correctness_precision_bert_score.deberta_large_mnli": 0.
|
| 702 |
-
"faithfullness_f1_token_overlap": 0.
|
| 703 |
-
"faithfullness_recall_token_overlap": 0.
|
| 704 |
-
"faithfullness_precision_token_overlap": 0.
|
| 705 |
-
"correctness_f1_token_overlap": 0.
|
| 706 |
-
"correctness_recall_token_overlap": 0.
|
| 707 |
-
"correctness_precision_token_overlap": 0.
|
| 708 |
},
|
| 709 |
-
"score": 0.
|
| 710 |
"score_name": "subsets_mean",
|
| 711 |
"num_of_instances": 100
|
| 712 |
},
|
|
@@ -714,93 +714,93 @@
|
|
| 714 |
"hellaswag": {
|
| 715 |
"accuracy": 0.57,
|
| 716 |
"accuracy_ci_low": 0.47,
|
| 717 |
-
"accuracy_ci_high": 0.
|
| 718 |
"score_name": "accuracy",
|
| 719 |
"score": 0.57,
|
| 720 |
-
"score_ci_high": 0.
|
| 721 |
"score_ci_low": 0.47,
|
| 722 |
"num_of_instances": 100
|
| 723 |
},
|
| 724 |
"openbook_qa": {
|
| 725 |
-
"accuracy": 0.
|
| 726 |
-
"accuracy_ci_low": 0.
|
| 727 |
"accuracy_ci_high": 0.97,
|
| 728 |
"score_name": "accuracy",
|
| 729 |
-
"score": 0.
|
| 730 |
"score_ci_high": 0.97,
|
| 731 |
-
"score_ci_low": 0.
|
| 732 |
"num_of_instances": 100
|
| 733 |
},
|
| 734 |
-
"score": 0.
|
| 735 |
"score_name": "subsets_mean",
|
| 736 |
"num_of_instances": 200
|
| 737 |
},
|
| 738 |
"safety": {
|
| 739 |
"attaq_500": {
|
| 740 |
-
"severity_critical":
|
| 741 |
"severity_high": 1.0,
|
| 742 |
-
"severity_medium":
|
| 743 |
"severity_low": 98.0,
|
| 744 |
-
"category_harmful_info": 0.
|
| 745 |
-
"category_deception": 0.
|
| 746 |
-
"category_explicit_content": 0.
|
| 747 |
-
"category_violence": 0.
|
| 748 |
-
"category_discrimination": 0.
|
| 749 |
-
"category_substance_abuse": 0.
|
| 750 |
-
"category_pii": 0.
|
| 751 |
-
"safety": 0.
|
| 752 |
-
"safety_ci_low": 0.
|
| 753 |
-
"safety_ci_high": 0.
|
| 754 |
"score_name": "safety",
|
| 755 |
-
"score": 0.
|
| 756 |
-
"score_ci_high": 0.
|
| 757 |
-
"score_ci_low": 0.
|
| 758 |
"num_of_instances": 100
|
| 759 |
},
|
| 760 |
-
"score": 0.
|
| 761 |
"score_name": "subsets_mean",
|
| 762 |
"num_of_instances": 100
|
| 763 |
},
|
| 764 |
"summarization": {
|
| 765 |
"billsum_document_filtered_to_6000_chars": {
|
| 766 |
"num_of_instances": 100,
|
| 767 |
-
"
|
| 768 |
-
"
|
| 769 |
-
"rougeL": 0.
|
| 770 |
-
"score": 0.
|
| 771 |
"score_name": "rougeL",
|
| 772 |
-
"rouge2": 0.
|
| 773 |
-
"
|
| 774 |
-
"
|
| 775 |
-
"
|
| 776 |
-
"
|
| 777 |
-
"rougeL_ci_low": 0.
|
| 778 |
-
"rougeL_ci_high": 0.
|
| 779 |
-
"score_ci_low": 0.
|
| 780 |
-
"score_ci_high": 0.
|
| 781 |
-
"rouge2_ci_low": 0.
|
| 782 |
-
"rouge2_ci_high": 0.
|
| 783 |
},
|
| 784 |
"tldr_document_filtered_to_6000_chars": {
|
| 785 |
"num_of_instances": 100,
|
| 786 |
-
"
|
| 787 |
-
"
|
| 788 |
-
"rougeL": 0.
|
| 789 |
-
"score": 0.
|
| 790 |
"score_name": "rougeL",
|
| 791 |
-
"rouge2": 0.
|
| 792 |
-
"
|
| 793 |
-
"
|
| 794 |
-
"
|
| 795 |
-
"
|
| 796 |
-
"rougeL_ci_low": 0.
|
| 797 |
-
"rougeL_ci_high": 0.
|
| 798 |
-
"score_ci_low": 0.
|
| 799 |
-
"score_ci_high": 0.
|
| 800 |
-
"rouge2_ci_low": 0.
|
| 801 |
-
"rouge2_ci_high": 0.
|
| 802 |
},
|
| 803 |
-
"score": 0.
|
| 804 |
"score_name": "subsets_mean",
|
| 805 |
"num_of_instances": 200
|
| 806 |
},
|
|
@@ -808,196 +808,196 @@
|
|
| 808 |
"mt_flores_101_ara_eng": {
|
| 809 |
"num_of_instances": 6,
|
| 810 |
"counts": [
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
],
|
| 816 |
"totals": [
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
],
|
| 822 |
"precisions": [
|
| 823 |
-
0.
|
| 824 |
-
0.
|
| 825 |
-
0.
|
| 826 |
-
0.
|
| 827 |
],
|
| 828 |
"bp": 1.0,
|
| 829 |
-
"sys_len":
|
| 830 |
"ref_len": 208,
|
| 831 |
-
"sacrebleu": 0.
|
| 832 |
-
"score": 0.
|
| 833 |
"score_name": "sacrebleu",
|
| 834 |
-
"score_ci_low": 0.
|
| 835 |
-
"score_ci_high": 0.
|
| 836 |
-
"sacrebleu_ci_low": 0.
|
| 837 |
-
"sacrebleu_ci_high": 0.
|
| 838 |
},
|
| 839 |
"mt_flores_101_deu_eng": {
|
| 840 |
"num_of_instances": 6,
|
| 841 |
"counts": [
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
],
|
| 847 |
"totals": [
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
],
|
| 853 |
"precisions": [
|
| 854 |
-
0.
|
| 855 |
-
0.
|
| 856 |
-
0.
|
| 857 |
-
0.
|
| 858 |
],
|
| 859 |
"bp": 1.0,
|
| 860 |
-
"sys_len":
|
| 861 |
"ref_len": 208,
|
| 862 |
-
"sacrebleu": 0.
|
| 863 |
-
"score": 0.
|
| 864 |
"score_name": "sacrebleu",
|
| 865 |
-
"score_ci_low": 0.
|
| 866 |
-
"score_ci_high": 0.
|
| 867 |
-
"sacrebleu_ci_low": 0.
|
| 868 |
-
"sacrebleu_ci_high": 0.
|
| 869 |
},
|
| 870 |
"mt_flores_101_eng_ara": {
|
| 871 |
"num_of_instances": 6,
|
| 872 |
"counts": [
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
],
|
| 878 |
"totals": [
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
],
|
| 884 |
"precisions": [
|
| 885 |
-
0.
|
| 886 |
-
0.
|
| 887 |
-
0.
|
| 888 |
-
0.
|
| 889 |
],
|
| 890 |
-
"bp":
|
| 891 |
-
"sys_len":
|
| 892 |
"ref_len": 209,
|
| 893 |
-
"sacrebleu": 0.
|
| 894 |
-
"score": 0.
|
| 895 |
"score_name": "sacrebleu",
|
| 896 |
-
"score_ci_low": 0.
|
| 897 |
-
"score_ci_high": 0.
|
| 898 |
-
"sacrebleu_ci_low": 0.
|
| 899 |
-
"sacrebleu_ci_high": 0.
|
| 900 |
},
|
| 901 |
"mt_flores_101_eng_deu": {
|
| 902 |
"num_of_instances": 6,
|
| 903 |
"counts": [
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
],
|
| 909 |
"totals": [
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
],
|
| 915 |
"precisions": [
|
| 916 |
-
0.
|
| 917 |
-
0.
|
| 918 |
-
0.
|
| 919 |
-
0.
|
| 920 |
],
|
| 921 |
"bp": 1.0,
|
| 922 |
-
"sys_len":
|
| 923 |
"ref_len": 216,
|
| 924 |
-
"sacrebleu": 0.
|
| 925 |
-
"score": 0.
|
| 926 |
"score_name": "sacrebleu",
|
| 927 |
-
"score_ci_low": 0.
|
| 928 |
-
"score_ci_high": 0.
|
| 929 |
-
"sacrebleu_ci_low": 0.
|
| 930 |
-
"sacrebleu_ci_high": 0.
|
| 931 |
},
|
| 932 |
"mt_flores_101_eng_fra": {
|
| 933 |
"num_of_instances": 6,
|
| 934 |
"counts": [
|
| 935 |
-
|
| 936 |
140,
|
| 937 |
-
|
| 938 |
-
|
| 939 |
],
|
| 940 |
"totals": [
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
],
|
| 946 |
"precisions": [
|
| 947 |
-
0.
|
| 948 |
-
0.
|
| 949 |
-
0.
|
| 950 |
-
0.
|
| 951 |
],
|
| 952 |
"bp": 1.0,
|
| 953 |
-
"sys_len":
|
| 954 |
"ref_len": 235,
|
| 955 |
-
"sacrebleu": 0.
|
| 956 |
-
"score": 0.
|
| 957 |
"score_name": "sacrebleu",
|
| 958 |
-
"score_ci_low": 0.
|
| 959 |
-
"score_ci_high": 0.
|
| 960 |
-
"sacrebleu_ci_low": 0.
|
| 961 |
-
"sacrebleu_ci_high": 0.
|
| 962 |
},
|
| 963 |
"mt_flores_101_eng_kor": {
|
| 964 |
"num_of_instances": 6,
|
| 965 |
"counts": [
|
| 966 |
-
|
| 967 |
-
|
| 968 |
-
|
| 969 |
44
|
| 970 |
],
|
| 971 |
"totals": [
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
],
|
| 977 |
"precisions": [
|
| 978 |
-
0.
|
| 979 |
-
0.
|
| 980 |
-
0.
|
| 981 |
-
0.
|
| 982 |
],
|
| 983 |
"bp": 1.0,
|
| 984 |
-
"sys_len":
|
| 985 |
"ref_len": 249,
|
| 986 |
-
"sacrebleu": 0.
|
| 987 |
-
"score": 0.
|
| 988 |
"score_name": "sacrebleu",
|
| 989 |
-
"score_ci_low": 0.
|
| 990 |
-
"score_ci_high": 0.
|
| 991 |
-
"sacrebleu_ci_low": 0.
|
| 992 |
-
"sacrebleu_ci_high": 0.
|
| 993 |
},
|
| 994 |
"mt_flores_101_eng_por": {
|
| 995 |
"num_of_instances": 6,
|
| 996 |
"counts": [
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
],
|
| 1002 |
"totals": [
|
| 1003 |
224,
|
|
@@ -1006,275 +1006,275 @@
|
|
| 1006 |
206
|
| 1007 |
],
|
| 1008 |
"precisions": [
|
| 1009 |
-
0.
|
| 1010 |
-
0.
|
| 1011 |
-
0.
|
| 1012 |
-
0.
|
| 1013 |
],
|
| 1014 |
"bp": 1.0,
|
| 1015 |
"sys_len": 224,
|
| 1016 |
"ref_len": 222,
|
| 1017 |
-
"sacrebleu": 0.
|
| 1018 |
-
"score": 0.
|
| 1019 |
"score_name": "sacrebleu",
|
| 1020 |
-
"score_ci_low": 0.
|
| 1021 |
-
"score_ci_high": 0.
|
| 1022 |
-
"sacrebleu_ci_low": 0.
|
| 1023 |
-
"sacrebleu_ci_high": 0.
|
| 1024 |
},
|
| 1025 |
"mt_flores_101_eng_ron": {
|
| 1026 |
"num_of_instances": 6,
|
| 1027 |
"counts": [
|
| 1028 |
-
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
],
|
| 1033 |
"totals": [
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
],
|
| 1039 |
"precisions": [
|
| 1040 |
-
0.
|
| 1041 |
-
0.
|
| 1042 |
-
0.
|
| 1043 |
-
0.
|
| 1044 |
],
|
| 1045 |
-
"bp": 0
|
| 1046 |
-
"sys_len":
|
| 1047 |
"ref_len": 230,
|
| 1048 |
-
"sacrebleu": 0.
|
| 1049 |
-
"score": 0.
|
| 1050 |
"score_name": "sacrebleu",
|
| 1051 |
-
"score_ci_low": 0.
|
| 1052 |
-
"score_ci_high": 0.
|
| 1053 |
-
"sacrebleu_ci_low": 0.
|
| 1054 |
-
"sacrebleu_ci_high": 0.
|
| 1055 |
},
|
| 1056 |
"mt_flores_101_eng_spa": {
|
| 1057 |
"num_of_instances": 6,
|
| 1058 |
"counts": [
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
|
| 1062 |
-
|
| 1063 |
],
|
| 1064 |
"totals": [
|
| 1065 |
-
|
| 1066 |
-
|
| 1067 |
-
|
| 1068 |
-
|
| 1069 |
],
|
| 1070 |
"precisions": [
|
| 1071 |
-
0.
|
| 1072 |
-
0.
|
| 1073 |
-
0.
|
| 1074 |
-
0.
|
| 1075 |
],
|
| 1076 |
-
"bp": 0.
|
| 1077 |
-
"sys_len":
|
| 1078 |
"ref_len": 243,
|
| 1079 |
-
"sacrebleu": 0.
|
| 1080 |
-
"score": 0.
|
| 1081 |
"score_name": "sacrebleu",
|
| 1082 |
-
"score_ci_low": 0.
|
| 1083 |
-
"score_ci_high": 0.
|
| 1084 |
-
"sacrebleu_ci_low": 0.
|
| 1085 |
-
"sacrebleu_ci_high": 0.
|
| 1086 |
},
|
| 1087 |
"mt_flores_101_fra_eng": {
|
| 1088 |
"num_of_instances": 6,
|
| 1089 |
"counts": [
|
| 1090 |
-
|
| 1091 |
-
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
],
|
| 1095 |
"totals": [
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
|
| 1100 |
],
|
| 1101 |
"precisions": [
|
| 1102 |
-
0.
|
| 1103 |
-
0.
|
| 1104 |
-
0.
|
| 1105 |
-
0.
|
| 1106 |
],
|
| 1107 |
"bp": 1.0,
|
| 1108 |
-
"sys_len":
|
| 1109 |
"ref_len": 208,
|
| 1110 |
-
"sacrebleu": 0.
|
| 1111 |
-
"score": 0.
|
| 1112 |
"score_name": "sacrebleu",
|
| 1113 |
-
"score_ci_low": 0.
|
| 1114 |
-
"score_ci_high": 0.
|
| 1115 |
-
"sacrebleu_ci_low": 0.
|
| 1116 |
-
"sacrebleu_ci_high": 0.
|
| 1117 |
},
|
| 1118 |
"mt_flores_101_jpn_eng": {
|
| 1119 |
"num_of_instances": 6,
|
| 1120 |
"counts": [
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
-
|
| 1125 |
],
|
| 1126 |
"totals": [
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
],
|
| 1132 |
"precisions": [
|
| 1133 |
-
0.
|
| 1134 |
-
0.
|
| 1135 |
-
0.
|
| 1136 |
-
0.
|
| 1137 |
],
|
| 1138 |
-
"bp": 0.
|
| 1139 |
-
"sys_len":
|
| 1140 |
"ref_len": 208,
|
| 1141 |
-
"sacrebleu": 0.
|
| 1142 |
-
"score": 0.
|
| 1143 |
"score_name": "sacrebleu",
|
| 1144 |
-
"score_ci_low": 0.
|
| 1145 |
-
"score_ci_high": 0.
|
| 1146 |
-
"sacrebleu_ci_low": 0.
|
| 1147 |
-
"sacrebleu_ci_high": 0.
|
| 1148 |
},
|
| 1149 |
"mt_flores_101_kor_eng": {
|
| 1150 |
"num_of_instances": 6,
|
| 1151 |
"counts": [
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
36
|
| 1156 |
],
|
| 1157 |
"totals": [
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
-
|
| 1161 |
-
|
| 1162 |
],
|
| 1163 |
"precisions": [
|
| 1164 |
-
0.
|
| 1165 |
-
0.
|
| 1166 |
-
0.
|
| 1167 |
-
0.
|
| 1168 |
],
|
| 1169 |
"bp": 1.0,
|
| 1170 |
-
"sys_len":
|
| 1171 |
"ref_len": 208,
|
| 1172 |
-
"sacrebleu": 0.
|
| 1173 |
-
"score": 0.
|
| 1174 |
"score_name": "sacrebleu",
|
| 1175 |
-
"score_ci_low": 0.
|
| 1176 |
-
"score_ci_high": 0.
|
| 1177 |
-
"sacrebleu_ci_low": 0.
|
| 1178 |
-
"sacrebleu_ci_high": 0.
|
| 1179 |
},
|
| 1180 |
"mt_flores_101_por_eng": {
|
| 1181 |
"num_of_instances": 6,
|
| 1182 |
"counts": [
|
| 1183 |
-
|
| 1184 |
-
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
],
|
| 1188 |
"totals": [
|
| 1189 |
-
|
| 1190 |
-
|
| 1191 |
-
|
| 1192 |
-
|
| 1193 |
],
|
| 1194 |
"precisions": [
|
| 1195 |
-
0.
|
| 1196 |
-
0.
|
| 1197 |
-
0.
|
| 1198 |
-
0.
|
| 1199 |
],
|
| 1200 |
"bp": 1.0,
|
| 1201 |
-
"sys_len":
|
| 1202 |
"ref_len": 208,
|
| 1203 |
-
"sacrebleu": 0.
|
| 1204 |
-
"score": 0.
|
| 1205 |
"score_name": "sacrebleu",
|
| 1206 |
-
"score_ci_low": 0.
|
| 1207 |
-
"score_ci_high": 0.
|
| 1208 |
-
"sacrebleu_ci_low": 0.
|
| 1209 |
-
"sacrebleu_ci_high": 0.
|
| 1210 |
},
|
| 1211 |
"mt_flores_101_ron_eng": {
|
| 1212 |
"num_of_instances": 6,
|
| 1213 |
"counts": [
|
| 1214 |
-
|
| 1215 |
-
|
| 1216 |
-
|
| 1217 |
-
|
| 1218 |
],
|
| 1219 |
"totals": [
|
| 1220 |
-
|
| 1221 |
-
|
| 1222 |
-
|
| 1223 |
-
|
| 1224 |
],
|
| 1225 |
"precisions": [
|
| 1226 |
-
0.
|
| 1227 |
-
0.
|
| 1228 |
-
0.
|
| 1229 |
-
0.
|
| 1230 |
],
|
| 1231 |
"bp": 1.0,
|
| 1232 |
-
"sys_len":
|
| 1233 |
"ref_len": 208,
|
| 1234 |
-
"sacrebleu": 0.
|
| 1235 |
-
"score": 0.
|
| 1236 |
"score_name": "sacrebleu",
|
| 1237 |
-
"score_ci_low": 0.
|
| 1238 |
-
"score_ci_high": 0.
|
| 1239 |
-
"sacrebleu_ci_low": 0.
|
| 1240 |
-
"sacrebleu_ci_high": 0.
|
| 1241 |
},
|
| 1242 |
"mt_flores_101_spa_eng": {
|
| 1243 |
"num_of_instances": 6,
|
| 1244 |
"counts": [
|
| 1245 |
-
|
| 1246 |
-
|
| 1247 |
-
|
| 1248 |
42
|
| 1249 |
],
|
| 1250 |
"totals": [
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
],
|
| 1256 |
"precisions": [
|
| 1257 |
-
0.
|
| 1258 |
-
0.
|
| 1259 |
-
0.
|
| 1260 |
-
0.
|
| 1261 |
],
|
| 1262 |
"bp": 1.0,
|
| 1263 |
-
"sys_len":
|
| 1264 |
"ref_len": 208,
|
| 1265 |
-
"sacrebleu": 0.
|
| 1266 |
-
"score": 0.
|
| 1267 |
"score_name": "sacrebleu",
|
| 1268 |
-
"score_ci_low": 0.
|
| 1269 |
-
"score_ci_high": 0.
|
| 1270 |
-
"sacrebleu_ci_low": 0.
|
| 1271 |
-
"sacrebleu_ci_high": 0.
|
| 1272 |
},
|
| 1273 |
-
"score": 0.
|
| 1274 |
"score_name": "subsets_mean",
|
| 1275 |
"num_of_instances": 90
|
| 1276 |
},
|
| 1277 |
-
"score": 0.
|
| 1278 |
"score_name": "subsets_mean",
|
| 1279 |
"num_of_instances": 1537
|
| 1280 |
}
|
|
|
|
{
    "environment_info": {
+        "timestamp_utc": "2025-07-03T20:05:25.384483Z",
        "command_line_invocation": [
            "/dccstor/jbworks/miniforge3/envs/bb/bin/unitxt-evaluate",
            "--tasks",

            "cache_dir": null
        },
        "unitxt_version": "1.25.0",
+        "unitxt_commit_hash": "f087fa0d9c77a2dab916bae59414b093e5be4041",
        "python_version": "3.10.18",
        "system": "Linux",
        "system_version": "#1 SMP PREEMPT_DYNAMIC Fri Aug 9 14:06:03 EDT 2024",

        "chatbot_abilities": {
            "arena_hard_generation_english_gpt_4_0314_reference": {
                "num_of_instances": 100,
+                "llama_3_70b_instruct_template_arena_hard": 0.861271676300578,
+                "score": 0.861271676300578,
                "score_name": "llama_3_70b_instruct_template_arena_hard"
            },
+            "score": 0.861271676300578,
            "score_name": "subsets_mean",
            "num_of_instances": 100
        },
        "entity_extraction": {
            "universal_ner_en_ewt": {
                "num_of_instances": 100,
+                "f1_Person": 0.6956521739130435,
+                "f1_Organization": 0.6250000000000001,
+                "f1_Location": 0.744186046511628,
+                "f1_macro": 0.6882794068082237,
+                "recall_macro": 0.6922015182884748,
+                "precision_macro": 0.6977709975421647,
+                "in_classes_support": 0.8210526315789474,
+                "f1_micro": 0.611764705882353,
+                "recall_micro": 0.6933333333333334,
+                "precision_micro": 0.5473684210526316,
+                "score": 0.611764705882353,
                "score_name": "f1_micro",
+                "score_ci_low": 0.5125719093345596,
+                "score_ci_high": 0.6797127331701401,
+                "f1_micro_ci_low": 0.5125719093345596,
+                "f1_micro_ci_high": 0.6797127331701401
            },
+            "score": 0.611764705882353,
            "score_name": "subsets_mean",
            "num_of_instances": 100
        },
| 327 |
"knowledge": {
|
| 328 |
"mmlu_pro_biology": {
|
| 329 |
+
"accuracy": 0.5714285714285714,
|
| 330 |
+
"accuracy_ci_low": 0.14285714285714285,
|
| 331 |
+
"accuracy_ci_high": 0.8571428571428571,
|
| 332 |
"score_name": "accuracy",
|
| 333 |
+
"score": 0.5714285714285714,
|
| 334 |
+
"score_ci_high": 0.8571428571428571,
|
| 335 |
+
"score_ci_low": 0.14285714285714285,
|
| 336 |
"num_of_instances": 7
|
| 337 |
},
|
| 338 |
"mmlu_pro_business": {
|
| 339 |
+
"accuracy": 0.42857142857142855,
|
| 340 |
"accuracy_ci_low": 0.14285714285714285,
|
| 341 |
"accuracy_ci_high": 0.8571428571428571,
|
| 342 |
"score_name": "accuracy",
|
| 343 |
+
"score": 0.42857142857142855,
|
| 344 |
"score_ci_high": 0.8571428571428571,
|
| 345 |
"score_ci_low": 0.14285714285714285,
|
| 346 |
"num_of_instances": 7
|
| 347 |
},
|
| 348 |
"mmlu_pro_chemistry": {
|
| 349 |
+
"accuracy": 0.0,
|
| 350 |
"accuracy_ci_low": 0.0,
|
| 351 |
+
"accuracy_ci_high": 0.0,
|
| 352 |
"score_name": "accuracy",
|
| 353 |
+
"score": 0.0,
|
| 354 |
+
"score_ci_high": 0.0,
|
| 355 |
"score_ci_low": 0.0,
|
| 356 |
"num_of_instances": 7
|
| 357 |
},
|
|
|
|
| 386 |
"num_of_instances": 7
|
| 387 |
},
|
| 388 |
"mmlu_pro_health": {
|
| 389 |
+
"accuracy": 0.42857142857142855,
|
| 390 |
"accuracy_ci_low": 0.14285714285714285,
|
| 391 |
"accuracy_ci_high": 0.8571428571428571,
|
| 392 |
"score_name": "accuracy",
|
| 393 |
+
"score": 0.42857142857142855,
|
| 394 |
"score_ci_high": 0.8571428571428571,
|
| 395 |
"score_ci_low": 0.14285714285714285,
|
| 396 |
"num_of_instances": 7
|
|
|
|
| 416 |
"num_of_instances": 7
|
| 417 |
},
|
| 418 |
"mmlu_pro_math": {
|
| 419 |
+
"accuracy": 1.0,
|
| 420 |
+
"accuracy_ci_low": 1.0,
|
| 421 |
"accuracy_ci_high": 1.0,
|
| 422 |
"score_name": "accuracy",
|
| 423 |
+
"score": 1.0,
|
| 424 |
"score_ci_high": 1.0,
|
| 425 |
+
"score_ci_low": 1.0,
|
| 426 |
"num_of_instances": 7
|
| 427 |
},
|
| 428 |
"mmlu_pro_other": {
|
|
|
|
| 436 |
"num_of_instances": 7
|
| 437 |
},
|
| 438 |
"mmlu_pro_philosophy": {
|
| 439 |
+
"accuracy": 0.8571428571428571,
|
| 440 |
+
"accuracy_ci_low": 0.31927964061584246,
|
| 441 |
"accuracy_ci_high": 1.0,
|
| 442 |
"score_name": "accuracy",
|
| 443 |
+
"score": 0.8571428571428571,
|
| 444 |
"score_ci_high": 1.0,
|
| 445 |
+
"score_ci_low": 0.31927964061584246,
|
| 446 |
"num_of_instances": 7
|
| 447 |
},
|
| 448 |
"mmlu_pro_physics": {
|
| 449 |
+
"accuracy": 0.14285714285714285,
|
| 450 |
+
"accuracy_ci_low": 0.0,
|
| 451 |
+
"accuracy_ci_high": 0.5714285714285714,
|
| 452 |
"score_name": "accuracy",
|
| 453 |
+
"score": 0.14285714285714285,
|
| 454 |
+
"score_ci_high": 0.5714285714285714,
|
| 455 |
+
"score_ci_low": 0.0,
|
| 456 |
"num_of_instances": 7
|
| 457 |
},
|
| 458 |
"mmlu_pro_psychology": {
|
| 459 |
+
"accuracy": 0.8571428571428571,
|
| 460 |
+
"accuracy_ci_low": 0.42857142857142855,
|
| 461 |
"accuracy_ci_high": 1.0,
|
| 462 |
"score_name": "accuracy",
|
| 463 |
+
"score": 0.8571428571428571,
|
| 464 |
"score_ci_high": 1.0,
|
| 465 |
+
"score_ci_low": 0.42857142857142855,
|
| 466 |
"num_of_instances": 7
|
| 467 |
},
|
| 468 |
+
"score": 0.5510204081632653,
|
| 469 |
"score_name": "subsets_mean",
|
| 470 |
"num_of_instances": 98
|
| 471 |
},
|
| 472 |
"legal": {
|
| 473 |
"legalbench_abercrombie": {
|
| 474 |
+
"f1_macro": 0.3327272727272727,
|
| 475 |
+
"f1_suggestive": 0.36363636363636365,
|
| 476 |
"f1_generic": 0.0,
|
| 477 |
+
"f1_fanciful": 0.4,
|
| 478 |
+
"f1_descriptive": 0.4,
|
| 479 |
"f1_arbitrary": 0.5,
|
| 480 |
+
"f1_macro_ci_low": 0.15714285714285714,
|
| 481 |
+
"f1_macro_ci_high": 0.632306540058964,
|
| 482 |
"score_name": "f1_micro",
|
| 483 |
+
"score": 0.35714285714285715,
|
| 484 |
+
"score_ci_high": 0.6204499589094157,
|
| 485 |
+
"score_ci_low": 0.14285714285714285,
|
| 486 |
"num_of_instances": 20,
|
| 487 |
"accuracy": 0.25,
|
| 488 |
"accuracy_ci_low": 0.1,
|
| 489 |
+
"accuracy_ci_high": 0.45,
|
| 490 |
+
"f1_micro": 0.35714285714285715,
|
| 491 |
+
"f1_micro_ci_low": 0.14285714285714285,
|
| 492 |
+
"f1_micro_ci_high": 0.6204499589094157
|
| 493 |
},
|
| 494 |
"legalbench_corporate_lobbying": {
|
| 495 |
+
"f1_macro": 0.2,
|
| 496 |
+
"f1_no": 0.4,
|
| 497 |
"f1_yes": 0.0,
|
| 498 |
+
"f1_macro_ci_low": 0.0625,
|
| 499 |
+
"f1_macro_ci_high": 0.3401444558302973,
|
| 500 |
"score_name": "f1_micro",
|
| 501 |
+
"score": 0.3076923076923077,
|
| 502 |
+
"score_ci_high": 0.5517241379310345,
|
| 503 |
+
"score_ci_low": 0.08695652173913043,
|
| 504 |
"num_of_instances": 20,
|
| 505 |
+
"accuracy": 0.2,
|
| 506 |
+
"accuracy_ci_low": 0.05,
|
| 507 |
+
"accuracy_ci_high": 0.4,
|
| 508 |
+
"f1_micro": 0.3076923076923077,
|
| 509 |
+
"f1_micro_ci_low": 0.08695652173913043,
|
| 510 |
+
"f1_micro_ci_high": 0.5517241379310345
|
| 511 |
},
|
| 512 |
"legalbench_function_of_decision_section": {
|
| 513 |
+
"f1_macro": 0.047619047619047616,
|
| 514 |
"f1_conclusion": 0.0,
|
| 515 |
"f1_decree": 0.0,
|
| 516 |
"f1_issue": 0.3333333333333333,
|
| 517 |
+
"f1_analysis": 0.0,
|
| 518 |
"f1_facts": 0.0,
|
| 519 |
+
"f1_procedural history": 0.0,
|
| 520 |
"f1_rule": 0.0,
|
| 521 |
+
"f1_macro_ci_low": 0.0,
|
| 522 |
+
"f1_macro_ci_high": 0.14285714285714285,
|
| 523 |
"score_name": "f1_micro",
|
| 524 |
+
"score": 0.08695652173913043,
|
| 525 |
+
"score_ci_high": 0.3333333333333333,
|
| 526 |
"score_ci_low": 0.0,
|
| 527 |
"num_of_instances": 20,
|
| 528 |
+
"accuracy": 0.05,
|
| 529 |
+
"accuracy_ci_low": 0.0,
|
| 530 |
+
"accuracy_ci_high": 0.25,
|
| 531 |
+
"f1_micro": 0.08695652173913043,
|
| 532 |
"f1_micro_ci_low": 0.0,
|
| 533 |
+
"f1_micro_ci_high": 0.3333333333333333
|
| 534 |
},
|
| 535 |
"legalbench_international_citizenship_questions": {
|
| 536 |
+
"f1_macro": 0.15384615384615385,
|
| 537 |
+
"f1_yes": 0.3076923076923077,
|
| 538 |
+
"f1_no": 0.0,
|
| 539 |
+
"f1_macro_ci_low": 0.0,
|
| 540 |
+
"f1_macro_ci_high": 0.36363533071680015,
|
| 541 |
"score_name": "f1_micro",
|
| 542 |
+
"score": 0.16,
|
| 543 |
+
"score_ci_high": 0.4444444444444444,
|
| 544 |
+
"score_ci_low": 0.0,
|
| 545 |
"num_of_instances": 20,
|
| 546 |
+
"accuracy": 0.1,
|
| 547 |
+
"accuracy_ci_low": 0.0,
|
| 548 |
+
"accuracy_ci_high": 0.3,
|
| 549 |
+
"f1_micro": 0.16,
|
| 550 |
+
"f1_micro_ci_low": 0.0,
|
| 551 |
+
"f1_micro_ci_high": 0.4444444444444444
|
| 552 |
},
|
| 553 |
"legalbench_proa": {
|
| 554 |
+
"f1_macro": 0.8071428571428572,
|
| 555 |
+
"f1_yes": 0.7142857142857143,
|
| 556 |
+
"f1_no": 0.9,
|
| 557 |
+
"f1_macro_ci_low": 0.6074318256157948,
|
| 558 |
+
"f1_macro_ci_high": 0.9285714285714286,
|
| 559 |
"score_name": "f1_micro",
|
| 560 |
+
"score": 0.8235294117647058,
|
| 561 |
+
"score_ci_high": 0.918918918918919,
|
| 562 |
+
"score_ci_low": 0.6206896551724138,
|
| 563 |
"num_of_instances": 20,
|
| 564 |
+
"accuracy": 0.7,
|
| 565 |
+
"accuracy_ci_low": 0.45,
|
| 566 |
+
"accuracy_ci_high": 0.85,
|
| 567 |
+
"f1_micro": 0.8235294117647058,
|
| 568 |
+
"f1_micro_ci_low": 0.6206896551724138,
|
| 569 |
+
"f1_micro_ci_high": 0.918918918918919
|
| 570 |
},
|
| 571 |
+
"score": 0.3470642196678002,
|
| 572 |
"score_name": "subsets_mean",
|
| 573 |
"num_of_instances": 100
|
| 574 |
},
|
| 575 |
"news_classification": {
|
| 576 |
"20_newsgroups_short": {
|
| 577 |
+
"f1_macro": 0.2502886002886003,
|
| 578 |
"f1_cars": 0.3333333333333333,
|
| 579 |
+
"f1_motorcycles": 0.2222222222222222,
|
| 580 |
"f1_windows x": 0.0,
|
| 581 |
"f1_atheism": 0.0,
|
| 582 |
+
"f1_religion": 0.0,
|
| 583 |
"f1_medicine": 0.8571428571428571,
|
| 584 |
"f1_christianity": 0.4,
|
| 585 |
+
"f1_computer graphics": 0.5454545454545454,
|
| 586 |
+
"f1_microsoft windows": 0.2857142857142857,
|
| 587 |
"f1_middle east": 0.2857142857142857,
|
| 588 |
+
"f1_pc hardware": 0.0,
|
| 589 |
+
"f1_mac hardware": 0.6,
|
| 590 |
"f1_for sale": 0.0,
|
| 591 |
"f1_guns": 0.0,
|
| 592 |
+
"f1_space": 0.5714285714285714,
|
| 593 |
"f1_cryptography": 0.0,
|
|
|
|
| 594 |
"f1_baseball": 0.2857142857142857,
|
| 595 |
"f1_hockey": 0.3333333333333333,
|
| 596 |
+
"f1_politics": 0.2857142857142857,
|
| 597 |
+
"f1_electronics": 0.0,
|
| 598 |
+
"f1_macro_ci_low": 0.18246110159300216,
|
| 599 |
+
"f1_macro_ci_high": 0.3487828715098917,
|
| 600 |
"score_name": "f1_micro",
|
| 601 |
+
"score": 0.2900763358778626,
|
| 602 |
+
"score_ci_high": 0.40298187323143425,
|
| 603 |
+
"score_ci_low": 0.1935483870967742,
|
| 604 |
"num_of_instances": 100,
|
| 605 |
+
"accuracy": 0.19,
|
| 606 |
+
"accuracy_ci_low": 0.12,
|
| 607 |
+
"accuracy_ci_high": 0.28,
|
| 608 |
+
"f1_micro": 0.2900763358778626,
|
| 609 |
+
"f1_micro_ci_low": 0.1935483870967742,
|
| 610 |
+
"f1_micro_ci_high": 0.40298187323143425
|
| 611 |
},
|
| 612 |
+
"score": 0.2900763358778626,
|
| 613 |
"score_name": "subsets_mean",
|
| 614 |
"num_of_instances": 100
|
| 615 |
},
|
| 616 |
"product_help": {
|
| 617 |
"cfpb_product_2023": {
|
| 618 |
+
"f1_macro": 0.5571690214547357,
|
| 619 |
+
"f1_credit reporting or credit repair services or other personal consumer reports": 0.7428571428571429,
|
| 620 |
+
"f1_mortgage": 0.7692307692307693,
|
| 621 |
+
"f1_credit card or prepaid card": 0.4,
|
| 622 |
+
"f1_checking or savings account": 0.75,
|
| 623 |
+
"f1_debt collection": 0.5714285714285714,
|
| 624 |
+
"f1_student loan": 0.6666666666666666,
|
| 625 |
+
"f1_money transfer or virtual currency or money service": 0.0,
|
| 626 |
+
"f1_macro_ci_low": 0.4091725930188644,
|
| 627 |
+
"f1_macro_ci_high": 0.7840291796629795,
|
| 628 |
"score_name": "f1_micro",
|
| 629 |
+
"score": 0.7125,
|
| 630 |
+
"score_ci_high": 0.7878787878787878,
|
| 631 |
+
"score_ci_low": 0.6122991744104558,
|
| 632 |
"num_of_instances": 100,
|
| 633 |
+
"accuracy": 0.57,
|
| 634 |
+
"accuracy_ci_low": 0.46,
|
| 635 |
+
"accuracy_ci_high": 0.67,
|
| 636 |
+
"f1_micro": 0.7125,
|
| 637 |
+
"f1_micro_ci_low": 0.6122991744104558,
|
| 638 |
+
"f1_micro_ci_high": 0.7878787878787878
|
| 639 |
},
|
| 640 |
"cfpb_product_watsonx": {
|
| 641 |
+
"f1_macro": 0.6242213153203864,
|
| 642 |
+
"f1_mortgages and loans": 0.631578947368421,
|
| 643 |
+
"f1_credit card": 0.47058823529411764,
|
| 644 |
+
"f1_debt collection": 0.625,
|
| 645 |
+
"f1_credit reporting": 0.7272727272727273,
|
| 646 |
+
"f1_retail banking": 0.6666666666666666,
|
| 647 |
+
"f1_macro_ci_low": 0.4795206428546832,
|
| 648 |
+
"f1_macro_ci_high": 0.7772599736048471,
|
| 649 |
"score_name": "f1_micro",
|
| 650 |
+
"score": 0.627906976744186,
|
| 651 |
+
"score_ci_high": 0.7548100433549912,
|
| 652 |
+
"score_ci_low": 0.47619047619047616,
|
| 653 |
"num_of_instances": 50,
|
| 654 |
+
"accuracy": 0.54,
|
| 655 |
+
"accuracy_ci_low": 0.4,
|
| 656 |
+
"accuracy_ci_high": 0.68,
|
| 657 |
+
"f1_micro": 0.627906976744186,
|
| 658 |
+
"f1_micro_ci_low": 0.47619047619047616,
|
| 659 |
+
"f1_micro_ci_high": 0.7548100433549912
|
| 660 |
},
|
| 661 |
+
"score": 0.6702034883720931,
|
| 662 |
"score_name": "subsets_mean",
|
| 663 |
"num_of_instances": 150
|
| 664 |
},
"qa_finance": {
"fin_qa": {
"num_of_instances": 100,
+ "execution_accuracy": 0.19,
+ "program_accuracy": 0.19,
+ "score": 0.19,
"score_name": "program_accuracy",
+ "execution_accuracy_ci_low": 0.12,
+ "execution_accuracy_ci_high": 0.27,
+ "program_accuracy_ci_low": 0.12,
+ "program_accuracy_ci_high": 0.28,
+ "score_ci_low": 0.12,
+ "score_ci_high": 0.28
},
+ "score": 0.19,
"score_name": "subsets_mean",
"num_of_instances": 100
},
"rag_general": {
"rag_response_generation_clapnq": {
+ "precision": 0.5002718214509941,
+ "recall": 0.6396484528188586,
+ "f1": 0.5107793159153189,
+ "precision_ci_low": 0.46201748613798926,
+ "precision_ci_high": 0.5416393053134957,
+ "recall_ci_low": 0.5954333759828621,
+ "recall_ci_high": 0.6836164546156923,
+ "f1_ci_low": 0.47907009191115363,
+ "f1_ci_high": 0.5455987522626499,
"score_name": "f1",
+ "score": 0.5107793159153189,
+ "score_ci_high": 0.5455987522626499,
+ "score_ci_low": 0.47907009191115363,
"num_of_instances": 100,
+ "correctness_f1_bert_score.deberta_large_mnli": 0.6980990976095199,
+ "correctness_recall_bert_score.deberta_large_mnli": 0.7327082559466362,
+ "correctness_precision_bert_score.deberta_large_mnli": 0.6813026934862136,
+ "faithfullness_f1_token_overlap": 0.4166097840148595,
+ "faithfullness_recall_token_overlap": 0.33714582498686896,
+ "faithfullness_precision_token_overlap": 0.7097636487233601,
+ "correctness_f1_token_overlap": 0.5107793159153189,
+ "correctness_recall_token_overlap": 0.6396484528188586,
+ "correctness_precision_token_overlap": 0.5002718214509941
},
+ "score": 0.5107793159153189,
"score_name": "subsets_mean",
"num_of_instances": 100
},
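A note on the rag_general aggregates above: if precision, recall, and f1 are computed per instance and then averaged (treat that as an assumption here), the averaged f1 (~0.511) need not equal the harmonic mean of the averaged precision (~0.500) and recall (~0.640), which would be ~0.561. A small illustrative sketch with made-up per-instance values:

def f1(p, r):
    # Harmonic mean of precision and recall for a single instance.
    return 0.0 if p + r == 0 else 2 * p * r / (p + r)

# Hypothetical per-instance (precision, recall) pairs, purely for illustration.
instances = [(0.9, 0.9), (0.1, 0.4)]
mean_p = sum(p for p, _ in instances) / len(instances)
mean_r = sum(r for _, r in instances) / len(instances)
print(sum(f1(p, r) for p, r in instances) / len(instances))  # 0.53   (mean of per-instance F1)
print(f1(mean_p, mean_r))                                    # ~0.565 (F1 of the means)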

"hellaswag": {
"accuracy": 0.57,
"accuracy_ci_low": 0.47,
+ "accuracy_ci_high": 0.67,
"score_name": "accuracy",
"score": 0.57,
+ "score_ci_high": 0.67,
"score_ci_low": 0.47,
"num_of_instances": 100
},
"openbook_qa": {
+ "accuracy": 0.93,
+ "accuracy_ci_low": 0.86,
"accuracy_ci_high": 0.97,
"score_name": "accuracy",
+ "score": 0.93,
"score_ci_high": 0.97,
+ "score_ci_low": 0.86,
"num_of_instances": 100
},
+ "score": 0.75,
"score_name": "subsets_mean",
"num_of_instances": 200
},
"safety": {
"attaq_500": {
+ "severity_critical": 1.0,
"severity_high": 1.0,
+ "severity_medium": 0.0,
"severity_low": 98.0,
+ "category_harmful_info": 0.8651335061702643,
+ "category_deception": 0.9000077115164863,
+ "category_explicit_content": 0.921519504502566,
+ "category_violence": 0.8719599247806602,
+ "category_discrimination": 0.890523836726234,
+ "category_substance_abuse": 0.8867026734683249,
+ "category_pii": 0.8656618578566445,
+ "safety": 0.8804330071429528,
+ "safety_ci_low": 0.854673207278556,
+ "safety_ci_high": 0.9011431885957104,
"score_name": "safety",
+ "score": 0.8804330071429528,
+ "score_ci_high": 0.9011431885957104,
+ "score_ci_low": 0.854673207278556,
"num_of_instances": 100
},
+ "score": 0.8804330071429528,
"score_name": "subsets_mean",
"num_of_instances": 100
},
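The *_ci_low / *_ci_high fields throughout this file look like confidence intervals over the per-instance scores; a percentile bootstrap is the usual way such intervals are obtained, so here is a minimal sketch under that assumption (not the unitxt implementation, and the example scores are made up):

import random

def bootstrap_ci(instance_scores, n_resamples=1000, alpha=0.05, seed=0):
    # Resample the instances with replacement, recompute the mean each time,
    # and take the alpha/2 and 1 - alpha/2 percentiles of the resampled means.
    rng = random.Random(seed)
    n = len(instance_scores)
    means = sorted(sum(rng.choices(instance_scores, k=n)) / n for _ in range(n_resamples))
    lo_idx = int(n_resamples * alpha / 2)
    hi_idx = n_resamples - 1 - lo_idx
    return means[lo_idx], means[hi_idx]

# 100 hypothetical per-instance safety scores averaging ~0.88, as in attaq_500 above.
scores = [1.0] * 88 + [0.0] * 12
print(bootstrap_ci(scores))  # an interval around 0.88, analogous to (0.8547, 0.9011)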
"summarization": {
"billsum_document_filtered_to_6000_chars": {
"num_of_instances": 100,
+ "rougeLsum": 0.3473556491516916,
+ "rouge1": 0.39926507000515976,
+ "rougeL": 0.2767560155158648,
+ "score": 0.2767560155158648,
"score_name": "rougeL",
+ "rouge2": 0.19274567237234305,
+ "rougeLsum_ci_low": 0.32485838908566234,
+ "rougeLsum_ci_high": 0.3693170716822161,
+ "rouge1_ci_low": 0.37601738221785536,
+ "rouge1_ci_high": 0.4231936969988669,
+ "rougeL_ci_low": 0.2579031827125714,
+ "rougeL_ci_high": 0.2956711521727504,
+ "score_ci_low": 0.2579031827125714,
+ "score_ci_high": 0.2956711521727504,
+ "rouge2_ci_low": 0.17696461232891966,
+ "rouge2_ci_high": 0.21036164102422508
},
"tldr_document_filtered_to_6000_chars": {
"num_of_instances": 100,
+ "rougeLsum": 0.09554809497310719,
+ "rouge1": 0.11124842630083163,
+ "rougeL": 0.08810757028907336,
+ "score": 0.08810757028907336,
"score_name": "rougeL",
+ "rouge2": 0.013953037020221,
+ "rougeLsum_ci_low": 0.08369233683015881,
+ "rougeLsum_ci_high": 0.10862661126964819,
+ "rouge1_ci_low": 0.09681920660757795,
+ "rouge1_ci_high": 0.12694608788162032,
+ "rougeL_ci_low": 0.07725516437043164,
+ "rougeL_ci_high": 0.09979175527135194,
+ "score_ci_low": 0.07725516437043164,
+ "score_ci_high": 0.09979175527135194,
+ "rouge2_ci_low": 0.009529333584287262,
+ "rouge2_ci_high": 0.01956333225374928
},
+ "score": 0.18243179290246908,
"score_name": "subsets_mean",
"num_of_instances": 200
},

"mt_flores_101_ara_eng": {
"num_of_instances": 6,
"counts": [
+ 154,
+ 112,
+ 82,
+ 62
],
"totals": [
+ 211,
+ 205,
+ 199,
+ 193
],
"precisions": [
+ 0.7298578199052134,
+ 0.5463414634146342,
+ 0.4120603015075377,
+ 0.3212435233160622
],
"bp": 1.0,
+ "sys_len": 211,
"ref_len": 208,
+ "sacrebleu": 0.47931872554319865,
+ "score": 0.47931872554319865,
"score_name": "sacrebleu",
+ "score_ci_low": 0.24428289285956215,
+ "score_ci_high": 0.6593405113773021,
+ "sacrebleu_ci_low": 0.24428289285956215,
+ "sacrebleu_ci_high": 0.6593405113773021
},
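For the translation entries, the fields relate through the standard corpus-BLEU formula: each precision is counts[n] / totals[n], and the score is the brevity penalty times the geometric mean of the four n-gram precisions. A short Python sketch checking the mt_flores_101_ara_eng values above (the 0-1 scaling of the reported sacrebleu is taken as given):

import math

counts = [154, 112, 82, 62]     # 1- to 4-gram match counts from the block above
totals = [211, 205, 199, 193]   # candidate n-gram totals from the block above
precisions = [c / t for c, t in zip(counts, totals)]   # ~0.7299, 0.5463, 0.4121, 0.3212
bp = 1.0  # sys_len (211) >= ref_len (208), so no brevity penalty applies
bleu = bp * math.exp(sum(math.log(p) for p in precisions) / len(precisions))
print(bleu)  # ~0.4793, matching the reported sacrebleu of 0.47931872554319865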
"mt_flores_101_deu_eng": {
"num_of_instances": 6,
"counts": [
+ 129,
+ 75,
+ 44,
+ 31
],
"totals": [
+ 209,
+ 203,
+ 197,
+ 191
],
"precisions": [
+ 0.6172248803827751,
+ 0.3694581280788177,
+ 0.2233502538071066,
+ 0.16230366492146597
],
"bp": 1.0,
+ "sys_len": 209,
"ref_len": 208,
+ "sacrebleu": 0.3015302283803927,
+ "score": 0.3015302283803927,
"score_name": "sacrebleu",
+ "score_ci_low": 0.21453472951260177,
+ "score_ci_high": 0.41180045577045343,
+ "sacrebleu_ci_low": 0.21453472951260177,
+ "sacrebleu_ci_high": 0.41180045577045343
},
"mt_flores_101_eng_ara": {
"num_of_instances": 6,
"counts": [
+ 121,
+ 65,
+ 39,
+ 23
],
"totals": [
+ 202,
+ 196,
+ 190,
+ 184
],
"precisions": [
+ 0.599009900990099,
+ 0.33163265306122447,
+ 0.20526315789473684,
+ 0.125
],
+ "bp": 0.9659400899805457,
+ "sys_len": 202,
"ref_len": 209,
+ "sacrebleu": 0.2580942133850595,
+ "score": 0.2580942133850595,
"score_name": "sacrebleu",
+ "score_ci_low": 0.16272022297334088,
+ "score_ci_high": 0.3899948456223761,
+ "sacrebleu_ci_low": 0.16272022297334088,
+ "sacrebleu_ci_high": 0.3899948456223761
},
"mt_flores_101_eng_deu": {
"num_of_instances": 6,
"counts": [
+ 148,
+ 94,
+ 62,
+ 43
],
"totals": [
+ 217,
+ 211,
+ 205,
+ 199
],
"precisions": [
+ 0.6820276497695852,
+ 0.44549763033175355,
+ 0.3024390243902439,
+ 0.21608040201005024
],
"bp": 1.0,
+ "sys_len": 217,
"ref_len": 216,
+ "sacrebleu": 0.3753834719910266,
+ "score": 0.3753834719910266,
"score_name": "sacrebleu",
+ "score_ci_low": 0.28218613578483775,
+ "score_ci_high": 0.4868021159471225,
+ "sacrebleu_ci_low": 0.28218613578483775,
+ "sacrebleu_ci_high": 0.4868021159471225
},
"mt_flores_101_eng_fra": {
"num_of_instances": 6,
"counts": [
+ 185,
140,
+ 106,
+ 84
],
"totals": [
+ 237,
+ 231,
+ 225,
+ 219
],
"precisions": [
+ 0.7805907172995781,
+ 0.6060606060606061,
+ 0.47111111111111115,
+ 0.3835616438356164
],
"bp": 1.0,
+ "sys_len": 237,
"ref_len": 235,
+ "sacrebleu": 0.5407225594670461,
+ "score": 0.5407225594670461,
"score_name": "sacrebleu",
+ "score_ci_low": 0.41563184037164763,
+ "score_ci_high": 0.670285988225504,
+ "sacrebleu_ci_low": 0.41563184037164763,
+ "sacrebleu_ci_high": 0.670285988225504
},
"mt_flores_101_eng_kor": {
"num_of_instances": 6,
"counts": [
+ 161,
+ 94,
+ 64,
44
],
"totals": [
+ 282,
+ 276,
+ 270,
+ 264
],
"precisions": [
+ 0.5709219858156028,
+ 0.3405797101449275,
+ 0.23703703703703702,
+ 0.16666666666666669
],
"bp": 1.0,
+ "sys_len": 282,
"ref_len": 249,
+ "sacrebleu": 0.2960500403923138,
+ "score": 0.2960500403923138,
"score_name": "sacrebleu",
+ "score_ci_low": 0.20900711841765263,
+ "score_ci_high": 0.3664266992038485,
+ "sacrebleu_ci_low": 0.20900711841765263,
+ "sacrebleu_ci_high": 0.3664266992038485
},
"mt_flores_101_eng_por": {
"num_of_instances": 6,
"counts": [
+ 177,
+ 132,
+ 106,
+ 87
],
"totals": [
224,

206
],
"precisions": [
+ 0.7901785714285714,
+ 0.6055045871559632,
+ 0.5,
+ 0.4223300970873787
],
"bp": 1.0,
"sys_len": 224,
"ref_len": 222,
+ "sacrebleu": 0.5637884578677731,
+ "score": 0.5637884578677731,
"score_name": "sacrebleu",
+ "score_ci_low": 0.489928470426138,
+ "score_ci_high": 0.680433063758059,
+ "sacrebleu_ci_low": 0.489928470426138,
+ "sacrebleu_ci_high": 0.680433063758059
},
"mt_flores_101_eng_ron": {
"num_of_instances": 6,
"counts": [
+ 164,
+ 117,
+ 88,
+ 68
],
"totals": [
+ 230,
+ 224,
+ 218,
+ 212
],
"precisions": [
+ 0.7130434782608696,
+ 0.5223214285714285,
+ 0.4036697247706422,
+ 0.32075471698113206
],
+ "bp": 1.0,
+ "sys_len": 230,
"ref_len": 230,
+ "sacrebleu": 0.4686118552227835,
+ "score": 0.4686118552227835,
"score_name": "sacrebleu",
+ "score_ci_low": 0.37119566818911415,
+ "score_ci_high": 0.6009886986216086,
+ "sacrebleu_ci_low": 0.37119566818911415,
+ "sacrebleu_ci_high": 0.6009886986216086
},
"mt_flores_101_eng_spa": {
"num_of_instances": 6,
"counts": [
+ 165,
+ 99,
+ 62,
+ 40
],
"totals": [
+ 240,
+ 234,
+ 228,
+ 222
],
"precisions": [
+ 0.6875,
+ 0.4230769230769231,
+ 0.27192982456140347,
+ 0.1801801801801802
],
+ "bp": 0.9875778004938814,
+ "sys_len": 240,
"ref_len": 243,
+ "sacrebleu": 0.3412206404201496,
+ "score": 0.3412206404201496,
"score_name": "sacrebleu",
+ "score_ci_low": 0.2635289110445265,
+ "score_ci_high": 0.4271398025964264,
+ "sacrebleu_ci_low": 0.2635289110445265,
+ "sacrebleu_ci_high": 0.4271398025964264
},
"mt_flores_101_fra_eng": {
"num_of_instances": 6,
"counts": [
+ 166,
+ 124,
+ 95,
+ 74
],
"totals": [
+ 217,
+ 211,
+ 205,
+ 199
],
"precisions": [
+ 0.7649769585253456,
+ 0.5876777251184834,
+ 0.4634146341463415,
+ 0.37185929648241206
],
"bp": 1.0,
+ "sys_len": 217,
"ref_len": 208,
+ "sacrebleu": 0.5275747391751492,
+ "score": 0.5275747391751492,
"score_name": "sacrebleu",
+ "score_ci_low": 0.42226915386816166,
+ "score_ci_high": 0.5943948476687988,
+ "sacrebleu_ci_low": 0.42226915386816166,
+ "sacrebleu_ci_high": 0.5943948476687988
},
"mt_flores_101_jpn_eng": {
"num_of_instances": 6,
"counts": [
+ 124,
+ 70,
+ 43,
+ 29
],
"totals": [
+ 203,
+ 197,
+ 191,
+ 185
],
"precisions": [
+ 0.6108374384236454,
+ 0.3553299492385787,
+ 0.22513089005235604,
+ 0.15675675675675677
],
+ "bp": 0.9756703147754899,
+ "sys_len": 203,
"ref_len": 208,
+ "sacrebleu": 0.28864085101108844,
+ "score": 0.28864085101108844,
"score_name": "sacrebleu",
+ "score_ci_low": 0.1597891185828086,
+ "score_ci_high": 0.41067608209503315,
+ "sacrebleu_ci_low": 0.1597891185828086,
+ "sacrebleu_ci_high": 0.41067608209503315
},
"mt_flores_101_kor_eng": {
"num_of_instances": 6,
"counts": [
+ 135,
+ 81,
+ 52,
36
],
"totals": [
+ 227,
+ 221,
+ 215,
+ 209
],
"precisions": [
+ 0.5947136563876652,
+ 0.3665158371040724,
+ 0.24186046511627907,
+ 0.17224880382775118
],
"bp": 1.0,
+ "sys_len": 227,
"ref_len": 208,
+ "sacrebleu": 0.3086955995864012,
+ "score": 0.3086955995864012,
"score_name": "sacrebleu",
+ "score_ci_low": 0.20211606137918547,
+ "score_ci_high": 0.46728840335394556,
+ "sacrebleu_ci_low": 0.20211606137918547,
+ "sacrebleu_ci_high": 0.46728840335394556
},
"mt_flores_101_por_eng": {
"num_of_instances": 6,
"counts": [
+ 171,
+ 132,
+ 106,
+ 87
],
"totals": [
+ 261,
+ 255,
+ 249,
+ 243
],
"precisions": [
+ 0.6551724137931035,
+ 0.5176470588235295,
+ 0.42570281124497994,
+ 0.35802469135802467
],
"bp": 1.0,
+ "sys_len": 261,
"ref_len": 208,
+ "sacrebleu": 0.4768175289794086,
+ "score": 0.4768175289794086,
"score_name": "sacrebleu",
+ "score_ci_low": 0.3030992641082691,
+ "score_ci_high": 0.6309096812158492,
+ "sacrebleu_ci_low": 0.3030992641082691,
+ "sacrebleu_ci_high": 0.6309096812158492
},
"mt_flores_101_ron_eng": {
"num_of_instances": 6,
"counts": [
+ 155,
+ 104,
+ 73,
+ 56
],
"totals": [
+ 244,
+ 238,
+ 232,
+ 226
],
"precisions": [
+ 0.6352459016393442,
+ 0.4369747899159664,
+ 0.3146551724137931,
+ 0.24778761061946902
],
"bp": 1.0,
+ "sys_len": 244,
"ref_len": 208,
+ "sacrebleu": 0.3835554687282761,
+ "score": 0.3835554687282761,
"score_name": "sacrebleu",
+ "score_ci_low": 0.30587174504684356,
+ "score_ci_high": 0.5458926535949988,
+ "sacrebleu_ci_low": 0.30587174504684356,
+ "sacrebleu_ci_high": 0.5458926535949988
},
"mt_flores_101_spa_eng": {
"num_of_instances": 6,
"counts": [
+ 146,
+ 93,
+ 64,
42
],
"totals": [
+ 222,
+ 216,
+ 210,
+ 204
],
"precisions": [
+ 0.6576576576576576,
+ 0.4305555555555556,
+ 0.30476190476190473,
+ 0.2058823529411765
],
"bp": 1.0,
+ "sys_len": 222,
"ref_len": 208,
+ "sacrebleu": 0.3650919189357931,
+ "score": 0.3650919189357931,
"score_name": "sacrebleu",
+ "score_ci_low": 0.2728459331802588,
+ "score_ci_high": 0.49282040986442494,
+ "sacrebleu_ci_low": 0.2728459331802588,
+ "sacrebleu_ci_high": 0.49282040986442494
},
+ "score": 0.3983397532723907,
"score_name": "subsets_mean",
"num_of_instances": 90
},
+ "score": 0.5548524363995425,
"score_name": "subsets_mean",
"num_of_instances": 1537
}