 import json
 import logging
 import os
+import random
 import shutil
 import signal
 import subprocess
@@ -690,17 +691,52 @@ def timed(
     times=1,
     return_result=False,
     collect_outputs=False,
+    batch_size=None,
 ):
     use_xla = tensor_is_on_xla(example_inputs)
     synchronize()

+    if batch_size:
+        patch_torch_manual_seed()
+
     if use_xla:
         xm.mark_step()
         xm.wait_device_ops()

+    def vary_batch(t: torch.Tensor, new_batch_size) -> torch.Tensor:
+        for i, s in enumerate(t.size()):
+            if s == batch_size:
+                # If new batch is smaller, we truncate
+                if new_batch_size < batch_size:
+                    indexer = [slice(None)] * t.ndim
+                    indexer[i] = slice(0, new_batch_size)
+                    t = t[tuple(indexer)]
+                # If new batch is greater, we just duplicate the last row
+                # over and over until we hit the desired batch size
+                elif new_batch_size > batch_size:
+                    indexer = [slice(None)] * t.ndim
+                    indexer[i] = -1
+                    last_slice = t[tuple(indexer)].unsqueeze(i)
+                    repeat_shape = list(t.shape)
+                    repeat_shape[i] = new_batch_size - batch_size
+                    padding = last_slice.expand(*repeat_shape)
+                    t = torch.cat([t, padding], dim=i)
+                break
+        return t
+
     time_total = 0
     # Dont collect outputs to correctly measure timing
     for _ in range(times):
+        # If batch_size is 1, it too often collides with other non batch size
+        # dimensions resulting in errors.
+        if batch_size and batch_size > 1:
+            # Calculate new batch size by varying the original batch size by up to 20%
+            # Ensure it's at least greater than 1
+            variation = random.uniform(0.8, 1.2)
+            new_batch_size = max(2, int(batch_size * variation))
+            example_inputs = tree_map_only(
+                torch.Tensor, lambda x: vary_batch(x, new_batch_size), example_inputs
+            )
         # Put this call inside the loop to reset the seed for each iteration.
         # Don't include reset_rng_state() to correctly measure timing
         reset_rng_state(use_xla)
@@ -1071,6 +1107,7 @@ def maybe_mark_profile(*args, **kwargs):
                 return_result=True,
                 times=times,
                 collect_outputs=args.collect_outputs,
+                batch_size=kwargs.get("batch_size"),
             )

         # call mark_step between the 2 calls to make the comparison fair.
@@ -2478,7 +2515,14 @@ def warmup(fn, model, example_inputs, mode, niters=10):
         return " ".join(map(str, results))

     def run_performance_test(
-        self, name, model, example_inputs, optimize_ctx, experiment, tag=None
+        self,
+        name,
+        model,
+        example_inputs,
+        optimize_ctx,
+        experiment,
+        tag=None,
+        batch_size=None,
     ):
         if self.args.xla:
             with self.pick_grad(name, self.args.training):
@@ -2536,6 +2580,7 @@ def warmup(fn, model, example_inputs, mode, niters=5):
         with self.pick_grad(name, self.args.training), ctx:
             ok, total = Stats.reset_counters()
             experiment_kwargs = {}
+            experiment_kwargs["batch_size"] = batch_size
             if tag is not None:
                 experiment_kwargs["tag"] = tag
             results = []
@@ -2699,6 +2744,7 @@ def run_one_model(
         experiment,
         explain=False,
         tag=None,
+        batch_size=None,
     ):
         mode = "train" if self.args.training else "eval"
         msg = f"{current_device:4} {mode:5} {current_name:34} "
@@ -2727,7 +2773,13 @@ def run_one_model(
             )
         else:
             status = self.run_performance_test(
-                name, model, example_inputs, optimize_ctx, experiment, tag
+                name,
+                model,
+                example_inputs,
+                optimize_ctx,
+                experiment,
+                tag,
+                batch_size=batch_size,
             )
             print(status)
             empty_gpu_cache(current_device)
@@ -4064,6 +4116,7 @@ def detect_and_mark_batch(t):
             experiment,
             explain=args.explain,
             tag=args.tag,
+            batch_size=batch_size if args.dynamic_batch_only else None,
         )
         if args.generate_aot_autograd_stats:
             stats_file = output_filename.split(".csv")[0] + "_stats.csv"
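For illustration, a minimal standalone sketch of the vary_batch logic added inside timed(). Here batch_size is passed explicitly rather than captured from the enclosing scope, and the toy input shape is an assumption; the benchmark itself applies the same transform to every tensor in example_inputs via tree_map_only after drawing a new batch size within roughly 20% of the original:

import random

import torch


def vary_batch(t: torch.Tensor, batch_size: int, new_batch_size: int) -> torch.Tensor:
    # Find the dimension whose size matches the original batch size, then
    # either truncate it or pad it (by repeating its last slice) to new_batch_size.
    for i, s in enumerate(t.size()):
        if s == batch_size:
            if new_batch_size < batch_size:
                indexer = [slice(None)] * t.ndim
                indexer[i] = slice(0, new_batch_size)
                t = t[tuple(indexer)]
            elif new_batch_size > batch_size:
                indexer = [slice(None)] * t.ndim
                indexer[i] = -1
                last_slice = t[tuple(indexer)].unsqueeze(i)
                repeat_shape = list(t.shape)
                repeat_shape[i] = new_batch_size - batch_size
                t = torch.cat([t, last_slice.expand(*repeat_shape)], dim=i)
            break
    return t


batch_size = 8  # assumed toy value
x = torch.randn(batch_size, 3, 224, 224)
# Same +/-20% variation as the benchmark loop, clamped to at least 2.
new_batch_size = max(2, int(batch_size * random.uniform(0.8, 1.2)))
print(vary_batch(x, batch_size, new_batch_size).shape)  # e.g. torch.Size([7, 3, 224, 224])

When args.dynamic_batch_only is set, run_one_model forwards batch_size through run_performance_test and experiment_kwargs into timed(), so each timing iteration runs on a slightly different batch dimension.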