Skip to content

Commit e759c47

Browse files
generatedunixname499836121facebook-github-bot
generatedunixname499836121
authored and committed
Use cuBLAS default workspace size in Lt (#153556)
Summary: Also enables unified workspaces by default for non-FBCODE use cases. Default Lt workspace size is also updated to match cuBLAS logic for default, including for Blackwell (SM 10.0) and GeForce Blackwell (SM 12.0). Recommended defaults are documented here: https://docs.nvidia.com/cuda/cublas/#cublassetworkspace X-link: pytorch/pytorch#153556 Approved by: https://github.com/Skylion007, https://github.com/ngimel Reviewed By: izaitsevfb Differential Revision: D75387537 fbshipit-source-id: 7ec68ba7362c49a3cfc16fa7e46c0ba490a601c1
1 parent 4617d91 commit e759c47

File tree

2 files changed

+10
-10
lines changed

2 files changed

+10
-10
lines changed

userbenchmark/dynamo/dynamobench/common.py

Lines changed: 9 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -3609,16 +3609,15 @@ def run(runner, args, original_dir=None):
36093609
if args.devices == ["xpu"]:
36103610
torch.use_deterministic_algorithms(True, warn_only=True)
36113611
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
3612-
# TODO(eqy): revisit when cuBLASLt workspace size is bumped
3613-
# if args.only is not None and args.only in {
3614-
# "DebertaForQuestionAnswering",
3615-
# "RobertaForQuestionAnswering",
3616-
# "nvidia_deeprecommender",
3617-
# "volo_d1_224",
3618-
# }:
3619-
# # These seem unhappy with numerics of larger cuBLASLt workspace
3620-
# # sizes following #145130 (due to enabling split-k?)
3621-
# torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
3612+
if args.only is not None and args.only in {
3613+
"DebertaForQuestionAnswering",
3614+
"nvidia_deeprecommender",
3615+
"crossvit_9_240",
3616+
}:
3617+
# These seem unhappy with numerics of larger cuBLASLt workspace
3618+
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
3619+
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
3620+
36223621
torch.backends.cudnn.deterministic = True
36233622
torch.backends.cudnn.allow_tf32 = False
36243623
torch.backends.cudnn.benchmark = False

userbenchmark/dynamo/dynamobench/timm_models.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -71,6 +71,7 @@ def pip_install(package):
7171
}
7272

7373
REQUIRE_HIGHER_TOLERANCE = {
74+
"crossvit_9_240",
7475
"fbnetv3_b",
7576
"gmixer_24_224",
7677
"hrnet_w18",

0 commit comments

Comments (0)