
Commit 2dcf6b2

optional json export of bench results
1 parent 81c980f commit 2dcf6b2

4 files changed (+38 -6 lines)


.gitignore

+4
@@ -92,3 +92,7 @@ webapp/.next/
 # locally saved datasets
 gpt_engineer/benchmark/benchmarks/apps/dataset
 gpt_engineer/benchmark/benchmarks/mbpp/dataset
+
+gpt_engineer/benchmark/minimal_bench_config.toml
+
+test.json

gpt_engineer/benchmark/__main__.py

+15 -6
@@ -32,7 +32,7 @@
 from gpt_engineer.applications.cli.main import load_env_if_needed
 from gpt_engineer.benchmark.bench_config import BenchConfig
 from gpt_engineer.benchmark.benchmarks.load import get_benchmark
-from gpt_engineer.benchmark.run import print_results, run
+from gpt_engineer.benchmark.run import export_json_results, print_results, run

 app = typer.Typer()  # creates a CLI app

@@ -72,8 +72,12 @@ def main(
         ),
     ],
     bench_config: Annotated[
-        Optional[str], typer.Argument(help="optional task name in benchmark")
+        str, typer.Argument(help="optional task name in benchmark")
     ] = os.path.join(os.path.dirname(__file__), "default_bench_config.toml"),
+    json_output: Annotated[
+        Optional[str],
+        typer.Option(help="print results for each task", show_default=False),
+    ] = None,
     verbose: Annotated[
         bool, typer.Option(help="print results for each task", show_default=False)
     ] = False,
@@ -85,13 +89,12 @@ def main(
     ----------
     path_to_agent : str
         The file path to the Python module that contains a function called 'default_config_agent'.
-    benchmarks : str
-        A comma-separated string of benchmark names to run.
-    bench_config : Optional[str], default=default_bench_config.toml
+    bench_config : str, default=default_bench_config.toml
         Configuration file for choosing which benchmark problems to run. See default config for more details.
+    json_output: Optional[str], default=None
+        Pass a path to a json file to have results written to file.
     verbose : bool, default=False
         A flag to indicate whether to print results for each task.
-
     Returns
     -------
     None
@@ -101,6 +104,7 @@ def main(
     config = BenchConfig.from_toml(bench_config)
     print("using config file: " + bench_config)
     benchmarks = list()
+    benchmark_results = dict()
     for specific_config_name in vars(config):
         specific_config = getattr(config, specific_config_name)
         if hasattr(specific_config, "active"):
@@ -124,6 +128,11 @@ def main(
         )
         print_results(results)
         print()
+        benchmark_results[benchmark_name] = {
+            "detailed": [result.to_dict() for result in results]
+        }
+    if json_output is not None:
+        export_json_results(json_output, benchmark_results)


 if __name__ == "__main__":

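With this change, main() accumulates each benchmark's per-task results in benchmark_results and, if a path was given, hands the whole dict to export_json_results at the end of the run. Typer derives CLI option names from parameter names, so the export would normally be requested with --json-output <path>. A minimal sketch of the structure being built, with made-up task names and values (the real entries come from TaskResult.to_dict(), see types.py below):

# Hypothetical contents of benchmark_results just before export;
# benchmark name, task names and numbers are illustrative, not actual output.
benchmark_results = {
    "mbpp": {
        "detailed": [
            {"task_name": "task_1", "duration": 12.3, "solved": 1.0},
            {"task_name": "task_2", "duration": 8.7, "solved": 0.5},
        ]
    }
}
# export_json_results(json_output, benchmark_results) then adds a
# per-benchmark "fully_solved" fraction and writes the dict to json_output.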
gpt_engineer/benchmark/run.py

+14
@@ -12,6 +12,7 @@
 print_results : function
     Prints the results of the benchmark tasks to the console.
 """
+import json
 import time

 from typing import List
@@ -132,3 +133,16 @@ def print_results(results: list[TaskResult]):
     print(f"Average success rate: {avg_success_rate * 100}% on {len(results)} tasks")
     print("--- Results ---")
     print()
+
+
+def export_json_results(json_path, complete_results):
+    for results in complete_results.values():
+        correct_tasks = [
+            task_result
+            for task_result in results["detailed"]
+            if task_result["solved"] == 1.0
+        ]
+        fraction_correct = len(correct_tasks) / len(results["detailed"])
+        results["fully_solved"] = fraction_correct
+    with open(json_path, "w") as f:
+        json.dump(complete_results, f, indent=4)

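The new export_json_results helper computes, for each benchmark, the fraction of tasks whose "solved" value is exactly 1.0, stores it under "fully_solved" (mutating the results dict in place), and dumps everything to json_path with an indent of 4. A self-contained sketch of the same logic on invented data, assuming the dict shape assembled in __main__.py:

# Sketch mirroring export_json_results on made-up results; the benchmark
# name and "solved" values are illustrative only.
import json

complete_results = {
    "example_bench": {
        "detailed": [
            {"task_name": "a", "solved": 1.0},
            {"task_name": "b", "solved": 0.0},
            {"task_name": "c", "solved": 1.0},
        ]
    }
}

for results in complete_results.values():
    solved = [t for t in results["detailed"] if t["solved"] == 1.0]
    results["fully_solved"] = len(solved) / len(results["detailed"])  # 2/3 here

with open("test.json", "w") as f:  # test.json is the name added to .gitignore above
    json.dump(complete_results, f, indent=4)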
gpt_engineer/benchmark/types.py

+5
@@ -87,3 +87,8 @@ def success_rate(self) -> float:
         )

         return succeeded / len(self.assertion_results)
+
+    def to_dict(self) -> dict:
+        out_dict = {key: value for key, value in self.__dict__.items()}
+        out_dict["solved"] = self.success_rate
+        return out_dict

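TaskResult.to_dict() copies the instance's __dict__ and adds the computed success_rate property under the "solved" key, which is the field export_json_results later compares against 1.0. A stand-alone illustration with a simplified stand-in class; the field names and the assertion_results type are assumptions for the example, not taken from the diff:

# DemoTaskResult is a hypothetical stand-in for TaskResult, only to show
# what to_dict() adds on top of __dict__.
from dataclasses import dataclass

@dataclass
class DemoTaskResult:
    task_name: str
    assertion_results: dict  # assertion name -> bool (assumed shape)
    duration: float

    @property
    def success_rate(self) -> float:
        succeeded = len([v for v in self.assertion_results.values() if v is True])
        return succeeded / len(self.assertion_results)

    def to_dict(self) -> dict:
        out_dict = {key: value for key, value in self.__dict__.items()}
        out_dict["solved"] = self.success_rate  # properties are not part of __dict__
        return out_dict

result = DemoTaskResult("demo_task", {"runs": True, "passes_tests": False}, 3.5)
print(result.to_dict())
# {'task_name': 'demo_task', 'assertion_results': {'runs': True, 'passes_tests': False},
#  'duration': 3.5, 'solved': 0.5}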