Commit 88314e6

Add --output-iter-metrics flag to cpu userbenchmark scripts

Adds a new `--output-iter-metrics` flag that includes per-iteration metrics in the benchmark result JSON files.

1 parent 50e2f74 · commit 88314e6

2 files changed: +23 -2 lines changed
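For orientation, the sketch below shows the kind of per-test metrics dictionary this change is intended to emit. The key names (`latency`, `iter_latencies`, `throughput`, `iter_throughputs`, `cpu_peak_mem`) are taken from the diffs below; the numeric values and the `json.dumps` call are purely illustrative.

```python
# Hypothetical per-test result entry after this change. Key names come from
# the diffs below; the numbers and the JSON dump are illustrative only.
import json

metrics_dict = {
    "latency": 12.4,                         # median latency across iterations
    "iter_latencies": [12.1, 12.4, 12.7],    # raw per-iteration latencies (new)
    "throughput": 80.6,                      # median throughput
    "iter_throughputs": [79.9, 80.6, 81.2],  # raw per-iteration throughputs (new)
    "cpu_peak_mem": 1024.0,
}
print(json.dumps(metrics_dict, indent=2))
```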

userbenchmark/cpu/cpu_utils.py

Lines changed: 12 additions & 0 deletions
@@ -119,22 +119,34 @@ def add_test_results(runs, result_metrics):
         ins_number = len(run["results"])
         assert ins_number
         latency_metric = "latency" in run["results"][0]["metrics"]
+        iter_latencies_metric = "iter_latencies" in run["results"][0]["metrics"]
         throughput_metric = "throughput" in run["results"][0]["metrics"]
+        iter_throughputs_metric = "iter_throughputs" in run["results"][0]["metrics"]
         cmem_metric = "cpu_peak_mem" in run["results"][0]["metrics"]
         latency_sum = 0
+        iter_latencies = []
         throughput_sum = 0
+        iter_throughputs = []
         cmem_sum = 0
         for ins_res in run["results"]:
             if latency_metric:
                 latency_sum += ins_res["metrics"]["latency"]
+            if iter_latencies_metric:
+                iter_latencies += ins_res["metrics"]["iter_latencies"]
             if throughput_metric:
                 throughput_sum += ins_res["metrics"]["throughput"]
+            if iter_throughputs_metric:
+                iter_throughputs += ins_res["metrics"]["iter_throughputs"]
             if cmem_metric:
                 cmem_sum += ins_res["metrics"]["cpu_peak_mem"]
         if latency_metric:
             result_metrics[f"{run_base_name}_latency"] = latency_sum / ins_number
+        if iter_latencies_metric:
+            result_metrics[f"{run_base_name}_iter_latencies"] = iter_latencies
         if throughput_metric:
             result_metrics[f"{run_base_name}_throughput"] = throughput_sum
+        if iter_throughputs_metric:
+            result_metrics[f"{run_base_name}_iter_throughputs"] = iter_throughputs
         if cmem_metric:
             result_metrics[f"{run_base_name}_cmem"] = cmem_sum / ins_number
     return result_metrics
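The aggregation in `add_test_results` above averages `latency` over the instances of a run, sums `throughput`, and simply concatenates the new per-iteration lists from every instance. A self-contained sketch of those rules on made-up instance data (the toy `run` structure only mirrors the fields used in the diff):

```python
# Toy reproduction of the aggregation rules in add_test_results above.
# The "run" structure and all numbers are invented for illustration.
run = {
    "results": [
        {"metrics": {"latency": 10.0, "throughput": 50.0,
                     "iter_latencies": [9.8, 10.0, 10.2],
                     "iter_throughputs": [49.0, 50.0, 51.0]}},
        {"metrics": {"latency": 12.0, "throughput": 40.0,
                     "iter_latencies": [11.9, 12.0, 12.1],
                     "iter_throughputs": [39.5, 40.0, 40.5]}},
    ]
}

ins_number = len(run["results"])
latency_avg = sum(r["metrics"]["latency"] for r in run["results"]) / ins_number
throughput_total = sum(r["metrics"]["throughput"] for r in run["results"])
iter_latencies = [x for r in run["results"] for x in r["metrics"]["iter_latencies"]]
iter_throughputs = [x for r in run["results"] for x in r["metrics"]["iter_throughputs"]]

print(latency_avg)        # 11.0 -> stored as "<run>_latency"
print(throughput_total)   # 90.0 -> stored as "<run>_throughput"
print(iter_latencies)     # [9.8, 10.0, 10.2, 11.9, 12.0, 12.1] -> "<run>_iter_latencies"
print(iter_throughputs)   # [49.0, 50.0, 51.0, 39.5, 40.0, 40.5] -> "<run>_iter_throughputs"
```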

userbenchmark/cpu/run_config.py

Lines changed: 11 additions & 2 deletions
@@ -39,7 +39,7 @@
 CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))
 
 def result_to_output_metrics(
-    metrics: List[str], metrics_res: TorchBenchModelMetrics
+    metrics: List[str], metrics_res: TorchBenchModelMetrics, output_iter_metrics: bool
 ) -> Dict[str, float]:
     result_metrics = {}
     if metrics_res:
@@ -48,11 +48,17 @@ def result_to_output_metrics(
             median_latency = numpy.median(metrics_res.latencies)
             assert median_latency, f"Run failed for metric {latency_metric}"
             result_metrics[latency_metric] = median_latency
+            if output_iter_metrics:
+                iter_latencies_metric = "iter_latencies"
+                result_metrics[iter_latencies_metric] = list(metrics_res.latencies)
         if "throughputs" in metrics and metrics_res.throughputs:
             throughput_metric = "throughput"
             median_throughput = numpy.median(metrics_res.throughputs)
             assert median_throughput, f"Run failed for metric {throughput_metric}"
             result_metrics[throughput_metric] = median_throughput
+            if output_iter_metrics:
+                iter_throughputs_metric = "iter_throughputs"
+                result_metrics[iter_throughputs_metric] = list(metrics_res.throughputs)
         if "cpu_peak_mem" in metrics and metrics_res.cpu_peak_mem:
             cpu_peak_mem = "cpu_peak_mem"
             result_metrics[cpu_peak_mem] = metrics_res.cpu_peak_mem
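When `output_iter_metrics` is set, the full latency/throughput samples are stored next to the existing medians, and `list(...)` keeps the emitted value a plain list whatever sequence type `metrics_res` holds. A minimal sketch of that median-plus-raw-list pattern, with invented latencies standing in for `TorchBenchModelMetrics.latencies`:

```python
# Minimal sketch of the median-plus-raw-list pattern used above.
# The latencies are invented; numpy.median is the same call the script makes.
import json
import numpy

latencies = [12.1, 12.4, 12.7, 12.3]

# median, as in the script; the float() cast is added only in this sketch
result_metrics = {"latency": float(numpy.median(latencies))}

output_iter_metrics = True  # stands in for the parsed --output-iter-metrics flag
if output_iter_metrics:
    result_metrics["iter_latencies"] = list(latencies)  # plain, JSON-serializable list

print(json.dumps(result_metrics))
# {"latency": 12.35, "iter_latencies": [12.1, 12.4, 12.7, 12.3]}
```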
@@ -118,7 +124,7 @@ def run(args: List[str], extra_args: List[str]):
     args.output = args.output if args.output else get_output_dir(BM_NAME)
     target_dir = Path(args.output).joinpath(f"{config.name}-{config.test}")
     target_dir.mkdir(exist_ok=True, parents=True)
-    metrics_dict = result_to_output_metrics(metrics, metrics_res)
+    metrics_dict = result_to_output_metrics(metrics, metrics_res, args.output_iter_metrics)
     dump_result_to_json(metrics_dict, target_dir)
 
 if __name__ == "__main__":
@@ -143,6 +149,9 @@ def run(args: List[str], extra_args: List[str]):
     parser.add_argument(
         "--metrics", default="latencies", help="Benchmark metrics, split by comma."
     )
+    parser.add_argument(
+        "--output-iter-metrics", action=argparse.BooleanOptionalAction, help="Enable per-iteration benchmark metrics"
+    )
     parser.add_argument(
         "--nwarmup", default=20, help="Benchmark warmup iteration number."
     )
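The new argument uses `argparse.BooleanOptionalAction` (available since Python 3.9), which also generates a `--no-output-iter-metrics` form and leaves the attribute at `None` (falsy) when neither switch is given, so per-iteration output stays off unless requested. A throwaway parser demonstrating that behaviour; this is not the benchmark's own parser:

```python
# Standalone demo of argparse.BooleanOptionalAction as used by the new flag.
# The parser here is a throwaway for illustration, not the benchmark's parser.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--output-iter-metrics",
    action=argparse.BooleanOptionalAction,
    help="Enable per-iteration benchmark metrics",
)

print(parser.parse_args([]).output_iter_metrics)                            # None -> treated as off
print(parser.parse_args(["--output-iter-metrics"]).output_iter_metrics)     # True
print(parser.parse_args(["--no-output-iter-metrics"]).output_iter_metrics)  # False
```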
