
Commit 2dcf6b2

optional json export of bench results
1 parent 81c980f commit 2dcf6b2

4 files changed (+38 -6 lines)


.gitignore

+4
@@ -92,3 +92,7 @@ webapp/.next/
 # locally saved datasets
 gpt_engineer/benchmark/benchmarks/apps/dataset
 gpt_engineer/benchmark/benchmarks/mbpp/dataset
+
+gpt_engineer/benchmark/minimal_bench_config.toml
+
+test.json

gpt_engineer/benchmark/__main__.py

+15 -6
@@ -32,7 +32,7 @@
 from gpt_engineer.applications.cli.main import load_env_if_needed
 from gpt_engineer.benchmark.bench_config import BenchConfig
 from gpt_engineer.benchmark.benchmarks.load import get_benchmark
-from gpt_engineer.benchmark.run import print_results, run
+from gpt_engineer.benchmark.run import export_json_results, print_results, run

 app = typer.Typer()  # creates a CLI app

@@ -72,8 +72,12 @@ def main(
         ),
     ],
     bench_config: Annotated[
-        Optional[str], typer.Argument(help="optional task name in benchmark")
+        str, typer.Argument(help="optional task name in benchmark")
     ] = os.path.join(os.path.dirname(__file__), "default_bench_config.toml"),
+    json_output: Annotated[
+        Optional[str],
+        typer.Option(help="print results for each task", show_default=False),
+    ] = None,
     verbose: Annotated[
         bool, typer.Option(help="print results for each task", show_default=False)
     ] = False,
@@ -85,13 +89,12 @@ def main(
     ----------
     path_to_agent : str
         The file path to the Python module that contains a function called 'default_config_agent'.
-    benchmarks : str
-        A comma-separated string of benchmark names to run.
-    bench_config : Optional[str], default=default_bench_config.toml
+    bench_config : str, default=default_bench_config.toml
         Configuration file for choosing which benchmark problems to run. See default config for more details.
+    json_output: Optional[str], default=None
+        Pass a path to a json file to have results written to file.
     verbose : bool, default=False
         A flag to indicate whether to print results for each task.
-
     Returns
     -------
     None
@@ -101,6 +104,7 @@ def main(
     config = BenchConfig.from_toml(bench_config)
     print("using config file: " + bench_config)
     benchmarks = list()
+    benchmark_results = dict()
     for specific_config_name in vars(config):
         specific_config = getattr(config, specific_config_name)
         if hasattr(specific_config, "active"):
@@ -124,6 +128,11 @@ def main(
         )
         print_results(results)
         print()
+        benchmark_results[benchmark_name] = {
+            "detailed": [result.to_dict() for result in results]
+        }
+    if json_output is not None:
+        export_json_results(json_output, benchmark_results)


 if __name__ == "__main__":

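With this change, main() accumulates each benchmark's per-task results in benchmark_results and, if a path was given, hands the whole dict to export_json_results at the end of the run. Typer derives CLI option names from parameter names, so the export would normally be requested with --json-output <path>. A minimal sketch of the structure being built, with made-up task names and values (the real entries come from TaskResult.to_dict(), see types.py below):

# Hypothetical contents of benchmark_results just before export;
# benchmark name, task names and numbers are illustrative, not actual output.
benchmark_results = {
    "mbpp": {
        "detailed": [
            {"task_name": "task_1", "duration": 12.3, "solved": 1.0},
            {"task_name": "task_2", "duration": 8.7, "solved": 0.5},
        ]
    }
}
# export_json_results(json_output, benchmark_results) then adds a
# per-benchmark "fully_solved" fraction and writes the dict to json_output.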
gpt_engineer/benchmark/run.py

+14
@@ -12,6 +12,7 @@
 print_results : function
     Prints the results of the benchmark tasks to the console.
 """
+import json
 import time

 from typing import List
@@ -132,3 +133,16 @@ def print_results(results: list[TaskResult]):
     print(f"Average success rate: {avg_success_rate * 100}% on {len(results)} tasks")
     print("--- Results ---")
     print()
+
+
+def export_json_results(json_path, complete_results):
+    for results in complete_results.values():
+        correct_tasks = [
+            task_result
+            for task_result in results["detailed"]
+            if task_result["solved"] == 1.0
+        ]
+        fraction_correct = len(correct_tasks) / len(results["detailed"])
+        results["fully_solved"] = fraction_correct
+    with open(json_path, "w") as f:
+        json.dump(complete_results, f, indent=4)

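The new export_json_results helper computes, for each benchmark, the fraction of tasks whose "solved" value is exactly 1.0, stores it under "fully_solved" (mutating the results dict in place), and dumps everything to json_path with an indent of 4. A self-contained sketch of the same logic on invented data, assuming the dict shape assembled in __main__.py:

# Sketch mirroring export_json_results on made-up results; the benchmark
# name and "solved" values are illustrative only.
import json

complete_results = {
    "example_bench": {
        "detailed": [
            {"task_name": "a", "solved": 1.0},
            {"task_name": "b", "solved": 0.0},
            {"task_name": "c", "solved": 1.0},
        ]
    }
}

for results in complete_results.values():
    solved = [t for t in results["detailed"] if t["solved"] == 1.0]
    results["fully_solved"] = len(solved) / len(results["detailed"])  # 2/3 here

with open("test.json", "w") as f:  # test.json is the name added to .gitignore above
    json.dump(complete_results, f, indent=4)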
gpt_engineer/benchmark/types.py

+5
@@ -87,3 +87,8 @@ def success_rate(self) -> float:
         )

         return succeeded / len(self.assertion_results)
+
+    def to_dict(self) -> dict:
+        out_dict = {key: value for key, value in self.__dict__.items()}
+        out_dict["solved"] = self.success_rate
+        return out_dict

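TaskResult.to_dict() copies the instance's __dict__ and adds the computed success_rate property under the "solved" key, which is the field export_json_results later compares against 1.0. A stand-alone illustration with a simplified stand-in class; the field names and the assertion_results type are assumptions for the example, not taken from the diff:

# DemoTaskResult is a hypothetical stand-in for TaskResult, only to show
# what to_dict() adds on top of __dict__.
from dataclasses import dataclass

@dataclass
class DemoTaskResult:
    task_name: str
    assertion_results: dict  # assertion name -> bool (assumed shape)
    duration: float

    @property
    def success_rate(self) -> float:
        succeeded = len([v for v in self.assertion_results.values() if v is True])
        return succeeded / len(self.assertion_results)

    def to_dict(self) -> dict:
        out_dict = {key: value for key, value in self.__dict__.items()}
        out_dict["solved"] = self.success_rate  # properties are not part of __dict__
        return out_dict

result = DemoTaskResult("demo_task", {"runs": True, "passes_tests": False}, 3.5)
print(result.to_dict())
# {'task_name': 'demo_task', 'assertion_results': {'runs': True, 'passes_tests': False},
#  'duration': 3.5, 'solved': 0.5}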