| import fire | |
| import sys | |
| from .data import HUMAN_EVAL | |
| from .evaluation import evaluate_functional_correctness | |
| def entry_point( | |
| sample_file: str, | |
| k: str = "1,10,100", | |
| n_workers: int = 4, | |
| timeout: float = 3.0, | |
| problem_file: str = "", | |
| is_mbpp: bool = False, | |
| ): | |
| """ | |
| Evaluates the functional correctness of generated samples, and writes | |
| results to f"{sample_file}_results.jsonl.gz" | |
| """ | |
| k = list(map(int, k.split(","))) | |
| results = evaluate_functional_correctness(sample_file, k, n_workers, timeout, problem_file, is_mbpp) | |
| print(results) | |
| def main(): | |
| fire.Fire(entry_point) | |
| sys.exit(main()) | |