@@ -1383,12 +1383,11 @@ def _produce_dynamic_shapes_for_export(path, x):
 
 
 class AOTInductorModelCache:
-    cache = {}
+    cache: dict[weakref.ref, tuple[Any, float]] = {}
 
     @classmethod
     def load(cls, model, example_inputs, mode):
         import torch._inductor
-        import torch.export._trace
         from torch.export.dynamic_shapes import _combine_args, _tree_map_with_path
 
         key = weakref.ref(model)
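Reviewer note (not part of the diff): the cache is keyed by weakref.ref(model) so that holding a compiled artifact never keeps the eager model alive. Two weakref.ref objects created from the same live object compare and hash equal, which is what lets get_excess_memory (added further down) look entries up with a fresh ref. A minimal sketch of that behavior:

    import weakref

    class Model:  # hypothetical stand-in for an nn.Module
        pass

    m = Model()
    cache = {}
    cache[weakref.ref(m)] = ("compiled-artifact", 0.25)

    # A fresh ref to the same live object hashes equal to the stored key,
    # so the lookup succeeds without a strong reference to the model.
    assert weakref.ref(m) in cache
    assert cache[weakref.ref(m)][1] == 0.25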
@@ -1419,16 +1418,40 @@ def load(cls, model, example_inputs, mode):
             # delete example_outputs and reset memory stats here
             del example_outputs
             if current_device == "cuda":
-                torch.cuda.reset_peak_memory_stats()
                 empty_gpu_cache(current_device)
+                torch.cuda.reset_peak_memory_stats()
+                pre_clone_memory_used = torch.cuda.max_memory_allocated()
             elif current_device == "hpu":
                 torch.hpu.reset_peak_memory_stats()
+                pre_clone_memory_used = torch.hpu.max_memory_allocated()
+
+            # Clone the model pre-exporting. This prevents scenarios observed in a few
+            # models, where the forward pass modifies model state while exporting, and
+            # FakeTensors are thus saved as model data members. This invalidates model
+            # reuse in eager mode, so it's safest to export a model clone.
+            model_clone = copy.deepcopy(model)
+
+            # Since CPU doesn't monitor max memory allocation, anything measuring peak
+            # memory will miss our transient model clone on CPU anyway.
+            #
+            # The justification for tracking this value (in order to remove it from the
+            # AOTInductor memory measurements) is that normal usage of AOTInductor would
+            # not clone the model, since the eager model would be unused post-export.
+            clone_memory_used = 0.0
+            if current_device == "cuda":
+                clone_memory_used = (
+                    torch.cuda.max_memory_allocated() - pre_clone_memory_used
+                ) / 1e9
+            elif current_device == "hpu":
+                clone_memory_used = (
+                    torch.hpu.max_memory_allocated() - pre_clone_memory_used
+                ) / 1e9
 
             inductor_configs = {}
             if mode == "max-autotune":
                 inductor_configs["max_autotune"] = True
             ep = torch.export.export(
-                model,
+                model_clone,
                 example_args,
                 example_kwargs,
                 dynamic_shapes=dynamic_shapes,
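Reviewer note (illustrative, not part of the diff): the clone cost is measured as the growth of the allocator's peak-allocation counter across the deepcopy, divided by 1e9 to convert bytes to GB. A minimal sketch of the same pattern in isolation, assuming a CUDA device is available:

    import copy

    import torch

    model = torch.nn.Linear(4096, 4096).cuda()  # hypothetical eager model

    torch.cuda.reset_peak_memory_stats()
    pre_clone = torch.cuda.max_memory_allocated()

    model_clone = copy.deepcopy(model)  # transient copy, used only for export

    # The peak counter grew by roughly the size of the cloned parameters; this
    # delta is what the benchmark later subtracts from its peak-memory numbers.
    clone_gb = (torch.cuda.max_memory_allocated() - pre_clone) / 1e9
    print(f"clone cost: {clone_gb:.3f} GB")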
@@ -1439,9 +1462,16 @@ def load(cls, model, example_inputs, mode):
                     ep, inductor_configs=inductor_configs
                 )  # type: ignore[arg-type]
 
-            cls.cache[key] = torch._inductor.aoti_load_package(package_path)
+            cls.cache[key] = (
+                torch._inductor.aoti_load_package(package_path),
+                clone_memory_used,
+            )
 
-        return cls.cache[key]
+        return cls.cache[key][0]
+
+    @classmethod
+    def get_excess_memory(cls, model) -> float:
+        return cls.cache.get(weakref.ref(model), (None, 0.0))[1]
 
 
 def export(model, example_inputs):
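Reviewer note (hypothetical usage, not part of the diff): load() now returns only the compiled runner (element 0 of the cached tuple), while get_excess_memory() exposes element 1 and falls back to 0.0 for models that were never loaded, so callers can subtract it unconditionally. A simplified mirror of just that lookup logic:

    import weakref
    from typing import Any

    class CacheSketch:  # simplified stand-in for AOTInductorModelCache
        cache: dict[weakref.ref, tuple[Any, float]] = {}

        @classmethod
        def get_excess_memory(cls, model) -> float:
            # The (None, 0.0) default means "no correction" for unseen models.
            return cls.cache.get(weakref.ref(model), (None, 0.0))[1]

    class Dummy:
        pass

    m = Dummy()
    assert CacheSketch.get_excess_memory(m) == 0.0
    CacheSketch.cache[weakref.ref(m)] = (object(), 0.75)
    assert CacheSketch.get_excess_memory(m) == 0.75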
@@ -1456,6 +1486,9 @@ def export(model, example_inputs):
         _produce_dynamic_shapes_for_export, combined_args
     )
 
+    # NOTE: if args.export is ever enabled for --performance mode (rather than solely
+    # --accuracy), we'll need to clone the model and subtract out extra memory usage, as
+    # done in AOTInductorModelCache.
     ep = torch.export.export(
         model, example_args, example_kwargs, dynamic_shapes=dynamic_shapes, strict=True
     )
@@ -2468,6 +2501,11 @@ def warmup(fn, model, example_inputs, mode, niters=10):
                     "dynamo",
                     niters=1,
                 )
+            # If we use warm peak memory, the AOT model loading transient memory
+            # won't be present on the warm measurement. We only have to account for
+            # it when using cold memory.
+            elif self.args.export_aot_inductor:
+                dynamo_peak_mem -= AOTInductorModelCache.get_excess_memory(model)
 
         if self.args.profile_dynamo_cache_lookup:
             with torch.profiler.profile(
@@ -2616,6 +2654,11 @@ def warmup(fn, model, example_inputs, mode, niters=5):
                     "dynamo",
                     niters=1,
                 )
+            # If we use warm peak memory, the AOT model loading transient memory
+            # won't be present on the warm measurement. We only have to account for
+            # it when using cold memory.
+            elif self.args.export_aot_inductor:
+                dynamo_peak_mem -= AOTInductorModelCache.get_excess_memory(model)
 
         if self.args.profile_dynamo_cache_lookup:
             with torch.profiler.profile(
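Reviewer note (hypothetical numbers, not part of the diff): the correction applies only to cold peak-memory measurements; with use_warm_peak_memory the peak is re-measured after loading, so the transient clone never shows up in it. The arithmetic is just:

    # Values in GB; both names here are illustrative.
    cold_peak_gb = 12.40   # measured peak, includes the deepcopy made in load()
    clone_cost_gb = 0.75   # AOTInductorModelCache.get_excess_memory(model)

    corrected_gb = cold_peak_gb - clone_cost_gb
    print(corrected_gb)  # 11.65, comparable to a run that never cloned the model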