
Commit 49c3f18

murste01 authored and facebook-github-bot committed
Add --output-iter-metrics flag to cpu userbenchmark scripts (#2600)
Summary: Adds a new `--output-iter-metrics` flag that includes per-iteration metrics in the benchmark result JSON files. This allows us to do our own statistical analysis and comparison of latency/throughput.

Pull Request resolved: #2600
Reviewed By: xuzhao9
Differential Revision: D71902373
Pulled By: FindHao
fbshipit-source-id: 8216ff91f03e220ff6b7631c038d369206736935
1 parent a2b6092 commit 49c3f18
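
To illustrate the "own statistical analysis" use case, here is a minimal sketch of post-processing a result file produced with the new flag. The file name, the `resnet50-eval` key prefix, and the top-level `metrics` field of the JSON are assumptions for illustration; the `_iter_latencies` key naming follows the `cpu_utils.py` change below.

```python
import json

import numpy as np

# Hypothetical path to a result JSON produced with --output-iter-metrics enabled.
result_path = "metrics-cpu-example.json"

with open(result_path) as f:
    result = json.load(f)

# Assumed layout: aggregated and per-iteration values live side by side
# under a top-level "metrics" field, keyed by "<model>-<test>_<metric>".
metrics = result["metrics"]
iter_latencies = metrics["resnet50-eval_iter_latencies"]  # raw list, one entry per iteration

# With the raw list available, any statistic can be computed after the fact,
# not just the median stored under "resnet50-eval_latency".
print("p50:", np.percentile(iter_latencies, 50))
print("p95:", np.percentile(iter_latencies, 95))
print("stddev:", np.std(iter_latencies))
```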

File tree

3 files changed: +35 −2 lines changed

- userbenchmark/cpu/README.md
- userbenchmark/cpu/cpu_utils.py
- userbenchmark/cpu/run_config.py


userbenchmark/cpu/README.md

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ All parameters of `cpu` userbenchmark as below,
 - `--niter` benchmark iteration number. Default value is 30.
 - `--output, -o` output dir. By default will create folder under
   `.userbenchmark/cpu`.
+- `--output-iter-metrics` include per-iteration metrics in output.
 - `--timeout` limit single model test run time. Default `None` means no
   limitation.
 - `--launcher` whether to use `torch.backends.xeon.run_cpu` to get the peak
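
For reference, a hedged sketch of how these options might be combined in one run; the `run_benchmark.py cpu` entry point is an assumption about the usual TorchBench userbenchmark workflow and is not part of this change:

```python
# Hypothetical invocation, assembled in Python for clarity; equivalent to
# typing the command in a shell. Only flags documented above are used.
import subprocess

cmd = [
    "python", "run_benchmark.py", "cpu",  # assumed TorchBench userbenchmark driver
    "--niter", "30",                      # documented default, shown explicitly
    "--output-iter-metrics",              # new flag added by this commit
    "--output", ".userbenchmark/cpu",     # documented default output location
]
subprocess.run(cmd, check=True)
```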

userbenchmark/cpu/cpu_utils.py

Lines changed: 12 additions & 0 deletions
@@ -119,22 +119,34 @@ def add_test_results(runs, result_metrics):
         ins_number = len(run["results"])
         assert ins_number
         latency_metric = "latency" in run["results"][0]["metrics"]
+        iter_latencies_metric = "iter_latencies" in run["results"][0]["metrics"]
         throughput_metric = "throughput" in run["results"][0]["metrics"]
+        iter_throughputs_metric = "iter_throughputs" in run["results"][0]["metrics"]
         cmem_metric = "cpu_peak_mem" in run["results"][0]["metrics"]
         latency_sum = 0
+        iter_latencies = []
         throughput_sum = 0
+        iter_throughputs = []
         cmem_sum = 0
         for ins_res in run["results"]:
             if latency_metric:
                 latency_sum += ins_res["metrics"]["latency"]
+            if iter_latencies_metric:
+                iter_latencies += ins_res["metrics"]["iter_latencies"]
             if throughput_metric:
                 throughput_sum += ins_res["metrics"]["throughput"]
+            if iter_throughputs_metric:
+                iter_throughputs += ins_res["metrics"]["iter_throughputs"]
             if cmem_metric:
                 cmem_sum += ins_res["metrics"]["cpu_peak_mem"]
         if latency_metric:
             result_metrics[f"{run_base_name}_latency"] = latency_sum / ins_number
+        if iter_latencies_metric:
+            result_metrics[f"{run_base_name}_iter_latencies"] = iter_latencies
         if throughput_metric:
             result_metrics[f"{run_base_name}_throughput"] = throughput_sum
+        if iter_throughputs_metric:
+            result_metrics[f"{run_base_name}_iter_throughputs"] = iter_throughputs
         if cmem_metric:
             result_metrics[f"{run_base_name}_cmem"] = cmem_sum / ins_number
     return result_metrics
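
A minimal sketch of what the aggregation above produces for one run; the numbers and the `resnet50-eval` base name are fabricated for illustration (`run_base_name` is built outside this hunk):

```python
# Toy per-instance results mirroring the structure iterated over in
# add_test_results (two instances, three iterations each; values made up).
instance_results = [
    {"metrics": {"latency": 10.0, "iter_latencies": [9.8, 10.0, 10.2]}},
    {"metrics": {"latency": 12.0, "iter_latencies": [11.9, 12.0, 12.1]}},
]

# With the change above, result_metrics would gain (hypothetical base name):
#   "resnet50-eval_latency":        11.0   # (10.0 + 12.0) / 2 instances
#   "resnet50-eval_iter_latencies": [9.8, 10.0, 10.2, 11.9, 12.0, 12.1]
# i.e. the per-instance iteration lists are concatenated into one flat list,
# while the existing aggregate keys are left unchanged.
```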

userbenchmark/cpu/run_config.py

Lines changed: 22 additions & 2 deletions
@@ -38,8 +38,12 @@
 BM_NAME = "cpu"
 CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))
 
+# output_iter_metrics is True only when '--output-iter-metrics' is given,
+# otherwise it is False by default.
 def result_to_output_metrics(
-    metrics: List[str], metrics_res: TorchBenchModelMetrics
+    metrics: List[str],
+    metrics_res: TorchBenchModelMetrics,
+    output_iter_metrics: bool,
 ) -> Dict[str, float]:
     result_metrics = {}
     if metrics_res:
@@ -48,11 +52,19 @@ def result_to_output_metrics(
             median_latency = numpy.median(metrics_res.latencies)
             assert median_latency, f"Run failed for metric {latency_metric}"
             result_metrics[latency_metric] = median_latency
+            if output_iter_metrics:
+                iter_latencies_metric = "iter_latencies"
+                result_metrics[iter_latencies_metric] = list(metrics_res.latencies)
         if "throughputs" in metrics and metrics_res.throughputs:
             throughput_metric = "throughput"
             median_throughput = numpy.median(metrics_res.throughputs)
             assert median_throughput, f"Run failed for metric {throughput_metric}"
             result_metrics[throughput_metric] = median_throughput
+            if output_iter_metrics:
+                iter_throughputs_metric = "iter_throughputs"
+                result_metrics[iter_throughputs_metric] = list(
+                    metrics_res.throughputs
+                )
         if "cpu_peak_mem" in metrics and metrics_res.cpu_peak_mem:
             cpu_peak_mem = "cpu_peak_mem"
             result_metrics[cpu_peak_mem] = metrics_res.cpu_peak_mem
@@ -118,7 +130,9 @@ def run(args: List[str], extra_args: List[str]):
     args.output = args.output if args.output else get_output_dir(BM_NAME)
     target_dir = Path(args.output).joinpath(f"{config.name}-{config.test}")
     target_dir.mkdir(exist_ok=True, parents=True)
-    metrics_dict = result_to_output_metrics(metrics, metrics_res)
+    metrics_dict = result_to_output_metrics(
+        metrics, metrics_res, args.output_iter_metrics
+    )
     dump_result_to_json(metrics_dict, target_dir)
 
 if __name__ == "__main__":
@@ -143,6 +157,12 @@ def run(args: List[str], extra_args: List[str]):
     parser.add_argument(
         "--metrics", default="latencies", help="Benchmark metrics, split by comma."
     )
+    parser.add_argument(
+        "--output-iter-metrics",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Enable per-iteration benchmark metrics",
+    )
     parser.add_argument(
         "--nwarmup", default=20, help="Benchmark warmup iteration number."
     )
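
As a side note on the flag semantics chosen here, a small standalone sketch: `argparse.BooleanOptionalAction` (available since Python 3.9) defaults the option to False and also auto-generates a `--no-output-iter-metrics` negation.

```python
import argparse

# Mirrors the parser change above in isolation.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--output-iter-metrics",
    action=argparse.BooleanOptionalAction,
    default=False,
    help="Enable per-iteration benchmark metrics",
)

print(parser.parse_args([]).output_iter_metrics)                            # False (default)
print(parser.parse_args(["--output-iter-metrics"]).output_iter_metrics)     # True
print(parser.parse_args(["--no-output-iter-metrics"]).output_iter_metrics)  # False (auto negation)
```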
