@@ -2005,33 +2005,6 @@ def get_dynamo_stats():
     )
 
 
-def maybe_fresh_cache(fn, is_cold_start):
-    def inner(*args, **kwargs):
-        cache_minder = contextlib.nullcontext()
-        if is_cold_start:
-            cache_entries = {}
-            cache_minder = fresh_inductor_cache(cache_entries)
-
-        try:
-            with cache_minder:
-                return fn(*args, **kwargs)
-        finally:
-            dump_cache = False
-            if dump_cache and is_cold_start:
-                output_csv(
-                    output_filename[:-4] + "_triton_cache.csv",
-                    ["dev", "name", "batch_size", "triton_cache"],
-                    [
-                        current_device,
-                        current_name,
-                        current_batch_size,
-                        cache_entries,
-                    ],
-                )
-
-    return inner
-
-
 @contextmanager
 def maybe_init_distributed(should_init_distributed, rank, world_size, port="6789"):
     try:
@@ -3297,12 +3270,6 @@ def get_example_inputs(self):
         action="store_true",
         help="print dataframe result used for calculating accuracy",
     )
-    parser.add_argument(
-        "--cold-start-latency",
-        "--cold_start_latency",
-        action="store_true",
-        help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
-    )
     parser.add_argument(
         "--disable-cudagraphs",
         action="store_true",
@@ -3415,6 +3382,19 @@ def get_example_inputs(self):
         help="Enables Memory Snapshot tool for memory deep dives: https://pytorch.org/blog/understanding-gpu-memory-1/",
     )
 
+    group_latency = parser.add_mutually_exclusive_group()
+    group_latency.add_argument(
+        "--cold-start-latency",
+        "--cold_start_latency",
+        action="store_true",
+        help="Use a fresh triton cachedir when running each model, to force cold-start compile.",
+    )
+    group_latency.add_argument(
+        "--warm-start-latency",
+        action="store_true",
+        help="Run model(s) twice and preserve caches in between to enable a 'warm start' on the 2nd run",
+    )
+
     group_fuser = parser.add_mutually_exclusive_group()
     # --nvfuser is now the default, keep the option to not break scripts
     group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS)
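
(Note: because both flags are registered in the same mutually exclusive group, argparse will reject an invocation that passes --cold-start-latency together with --warm-start-latency; at most one of them can be supplied per run.)
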
@@ -3571,9 +3551,17 @@ def process_entry(rank, runner, original_dir, args):
         world_size=args.world_size,
         port=args.distributed_master_port,
     ):
-        return maybe_fresh_cache(
-            run, (args.cold_start_latency and args.only) or args.ci
-        )(runner, args, original_dir)
+        return run(runner, args, original_dir)
+
+
+def maybe_fresh_cache(args):
+    cache_dir_assigned = "TORCHINDUCTOR_CACHE_DIR" in os.environ
+    if not cache_dir_assigned and (
+        args.cold_start_latency or args.warm_start_latency or args.ci
+    ):
+        return fresh_inductor_cache()
+    else:
+        return contextlib.nullcontext()
 
 
 def main(runner, original_dir=None, args=None):
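
(Note: the new maybe_fresh_cache only creates a fresh cache when TORCHINDUCTOR_CACHE_DIR is not already set, so a caller, including the warm-start parent process added in the next hunk, can pin a cache directory up front and have subsequent runs reuse it instead of starting cold.)
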
@@ -3598,23 +3586,39 @@ def main(runner, original_dir=None, args=None):
             f"--diff-branch: current branch is same as {args.diff_branch} branch, what are you diffing?"
         )
 
-    args.init_distributed = args.only and args.multiprocess
-    if args.init_distributed:
-        # NB: Do NOT query device count before CUDA initialization; we're
-        # going to overwrite CUDA_VISIBLE_DEVICES and this will result in
-        # https://github.com/pytorch/pytorch/issues/107300
-        device_count = torch.cuda.device_count()
-        if device_count <= 1:
-            log.warning(
-                "The use multiprocess flag is set but there are <= 1 devices available."
+    with maybe_fresh_cache(args):
+        args.init_distributed = args.only and args.multiprocess
+        if args.init_distributed:
+            # NB: Do NOT query device count before CUDA initialization; we're
+            # going to overwrite CUDA_VISIBLE_DEVICES and this will result in
+            # https://github.com/pytorch/pytorch/issues/107300
+            device_count = torch.cuda.device_count()
+            if device_count <= 1:
+                log.warning(
+                    "The use multiprocess flag is set but there are <= 1 devices available."
+                )
+            # multiprocess path
+            args.world_size = device_count
+            mp.spawn(
+                process_entry, args=(runner, original_dir, args), nprocs=device_count
             )
-        # multiprocess path
-        args.world_size = device_count
-        mp.spawn(process_entry, args=(runner, original_dir, args), nprocs=device_count)
-    else:
-        # single process path just uses the main process
-        args.world_size = 1
-        process_entry(0, runner, original_dir, args)
+        elif args.only and args.warm_start_latency:
+            # Warm start mode. Enable FX graph caching and perform back-to-back runs in
+            # separate processes (but ensure the inductor cache is preserved across runs).
+            env = os.environ.copy()
+            env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
+            cmd = [sys.executable] + sys.argv
+            cmd.remove("--warm-start-latency")
+
+            print(f"Executing cold-start run for {args.only}")
+            subprocess.check_call(cmd, timeout=args.timeout, env=env)
+
+            print(f"Executing warm-start run for {args.only}")
+            subprocess.check_call(cmd, timeout=args.timeout, env=env)
+        else:
+            # single process path just uses the main process
+            args.world_size = 1
+            process_entry(0, runner, original_dir, args)
 
 
 def write_csv_when_exception(args, name: str, status: str, device=None):
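
For context, below is a minimal, self-contained sketch (not part of the PR) of the warm-start flow the hunks above implement. The helper names pinned_cache_dir and run_warm_start are hypothetical, and a tempfile-based stand-in replaces the real fresh_inductor_cache(); the point is only the pattern: pin one cache directory, enable TORCHINDUCTOR_FX_GRAPH_CACHE, and re-run the same benchmark command twice so the second run hits warm caches.

import contextlib
import os
import subprocess
import sys
import tempfile


@contextlib.contextmanager
def pinned_cache_dir():
    # Simplified stand-in for fresh_inductor_cache(): point the Inductor cache
    # at a brand-new temporary directory for the duration of the block.
    with tempfile.TemporaryDirectory() as cache_dir:
        previous = os.environ.get("TORCHINDUCTOR_CACHE_DIR")
        os.environ["TORCHINDUCTOR_CACHE_DIR"] = cache_dir
        try:
            yield cache_dir
        finally:
            if previous is None:
                os.environ.pop("TORCHINDUCTOR_CACHE_DIR", None)
            else:
                os.environ["TORCHINDUCTOR_CACHE_DIR"] = previous


def run_warm_start(benchmark_cmd):
    # The parent process pins one cache dir, enables the FX graph cache, and
    # runs the same command twice: the first run populates the caches (cold
    # start), the second run reuses them (warm start).
    with pinned_cache_dir():
        env = os.environ.copy()
        env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
        for label in ("cold-start", "warm-start"):
            print(f"Executing {label} run")
            subprocess.check_call(benchmark_cmd, env=env)


if __name__ == "__main__":
    # Placeholder command; the real harness rebuilds cmd from sys.argv with
    # "--warm-start-latency" removed before re-executing itself.
    run_warm_start([sys.executable, "-c", "print('benchmark placeholder')"])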