Commit a5f1710

int3 authored and facebook-github-bot committed
Add CUTLASS + PT2-Triton kernels to gemm benchmark
Summary: This works by simply setting the max_autotune GEMM backend to only CUTLASS or TRITON, as needed. I also modified the baseline benchmark to explicitly disable autotuning, so that we can be more confident that it is invoking the ATen kernel.

Reviewed By: bertmaher, xuzhao9, chenyang78

Differential Revision: D56685216

fbshipit-source-id: 1638266254690b929f8c5591a194127c6a7c7be8
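The baseline change the summary mentions is not part of the diff shown below. As a rough sketch of the mechanism, assuming the baseline also runs under torch.compile (the config flag is a real torch._inductor.config option; the wrapper function itself is hypothetical and not from this commit):

import torch
import torch._inductor.config as inductor_config

# Hypothetical sketch, not from this commit: compile the baseline with
# autotuning disabled, so Inductor lowers the matmul to the ATen kernel
# rather than to an autotuned Triton/CUTLASS template.
def aten_matmul_baseline(a, b):
    torch._dynamo.reset()
    with inductor_config.patch(max_autotune=False):
        compiled = torch.compile(lambda x, y: x.matmul(y), dynamic=False)
        compiled(a, b)  # warm-up call compiles under the patched config
    return lambda: compiled(a, b)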
1 parent d21607d commit a5f1710

File tree

1 file changed (+34, -0)


torchbenchmark/operators/gemm/operator.py

Lines changed: 34 additions & 0 deletions
@@ -20,6 +20,7 @@
 from .data_io import parse_args, read_shapes_from_csv
 from .triton_matmul import matmul as triton_matmul
 from .triton_matmul import matmul_kernel as triton_matmul_kernel
+import torch._inductor.config as inductor_config
 
 import inspect
 try:
@@ -128,6 +129,39 @@ def colfax_cutlass_matmul(self, a, b, bias) -> Callable:
         else:
             return lambda: colfax_gemm(a, b, alpha=1.0, beta=1.0)
 
+    @register_benchmark()
+    def pt2_triton_matmul(self, a, b, bias) -> Callable:
+        torch._dynamo.reset()
+        with inductor_config.patch(
+            max_autotune=True,
+            max_autotune_gemm_backends="TRITON",
+            autotune_fallback_to_aten=False,
+        ):
+            if bias is not None:
+                f = lambda a, b: a.matmul(b) + bias
+            else:
+                f = lambda a, b: a.matmul(b)
+            compiled = torch.compile(f, dynamic=False)
+            compiled(a, b)
+            return lambda: compiled(a, b)
+
+    @register_benchmark()
+    def pt2_cutlass_matmul(self, a, b, bias) -> Callable:
+        torch._dynamo.reset()
+        with inductor_config.patch(
+            max_autotune=True,
+            max_autotune_gemm_backends="CUTLASS",
+            autotune_fallback_to_aten=False,
+        ):
+            if bias is not None:
+                f = lambda a, b: a.matmul(b) + bias
+            else:
+                f = lambda a, b: a.matmul(b)
+            # cutlass needs to know the static shape, so set dynamic to False
+            compiled = torch.compile(f, dynamic=False)
+            compiled(a, b)
+            return lambda: compiled(a, b)
+
     @register_x_val(label="(M, N, K)")
     def get_x_val(self, example_inputs) -> Tuple[int, int, int]:
         # x-value: computation intensity
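Outside the harness, the same backend-pinning pattern can be exercised standalone. A minimal sketch, assuming a CUDA device with Triton available (the shapes and dtype are arbitrary; autotune_fallback_to_aten is taken from the diff above and its availability depends on the PyTorch version):

import torch
import torch._inductor.config as inductor_config

a = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)
b = torch.randn(1024, 1024, device="cuda", dtype=torch.float16)

torch._dynamo.reset()
with inductor_config.patch(
    max_autotune=True,
    max_autotune_gemm_backends="TRITON",  # or "CUTLASS"
    autotune_fallback_to_aten=False,  # error out rather than silently using ATen
):
    # The first call compiles and autotunes over the pinned backend only.
    compiled = torch.compile(lambda x, y: x.matmul(y), dynamic=False)
    compiled(a, b)

# Steady-state callable, mirroring what the new benchmark methods return.
run = lambda: compiled(a, b)
run()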
