
Commit d5d2762

int3 authored and facebook-github-bot committed
Log more errors + make CSV writing more robust
Summary:
Previously we only caught specific CUDA OOM errors, but benchmarks can fail in other ways too. Let's make this more robust by catching and logging all exceptions.

While there was already code to log exception messages, it often produced malformed CSVs because no quoting was applied; we now use Python's csv module to avoid this issue. Additionally, the previous logic recorded the error message in every metric column of the failed benchmark, which was redundant, so I've changed it to emit the message only once per row.

Finally, since Python's csv writer writes directly to a file instead of building a string first, the previous naming convention of using the hash of the CSV's contents in the file name no longer applies. Instead, I've used NamedTemporaryFile to get a unique file name.

Reviewed By: chenyang78

Differential Revision: D57785120

fbshipit-source-id: 73c76bba7661b60a7357aaba3d5b9659b533479e
1 parent d1a6363 commit d5d2762
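
To illustrate the quoting problem the summary describes (this snippet is illustrative only and not part of the commit; the header and metric values are made up): a metric or error value that itself contains the "; " delimiter silently shifts every later column under a naive string join, while csv.QUOTE_MINIMAL wraps just that field in quotes.

import csv
import io

headers = ["x_val", "latency", "error_msg"]
row = ["(1024, 1024)", None, "Expected size 1024; got 512"]  # value contains the delimiter

# Naive join (the old behavior): the embedded "; " turns 3 fields into 4.
print("; ".join(str(v) for v in row))

# csv module with minimal quoting: the offending field is quoted, columns stay aligned.
buf = io.StringIO()
writer = csv.writer(buf, delimiter=";", quoting=csv.QUOTE_MINIMAL)
writer.writerow(headers)
writer.writerow(row)
print(buf.getvalue())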

File tree

2 files changed (+32, -19 lines)

torchbenchmark/util/triton_op.py

Lines changed: 28 additions & 8 deletions
@@ -3,6 +3,7 @@
 import functools
 import gc
 import json
+import os
 import random
 import time
 import warnings
@@ -206,10 +207,11 @@ def select_metric(m):
             row.append(x_only_metric_dict[x_only_metric])
         for k in y_val_keys:
             metrics_dict = asdict(y_val[k])
+            if metrics_dict["error_msg"]:
+                row.append(metrics_dict["error_msg"])
+                row.extend([None] * (len(key_metrics[k]) - 1))
+                continue
             for metric in key_metrics[k]:
-                if metrics_dict["error_msg"]:
-                    row.append(metrics_dict["error_msg"])
-                    continue
                 _metrics_dict = (
                     metrics_dict["extra_metrics"]
                     if metric in metrics_dict["extra_metrics"]
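
A small sketch of what the new early-exit branch in this hunk produces (the variable contents here are made up): when a backend failed, its first metric column holds the error message and the remaining metric columns are padded with None, so each row keeps the same width as the header.

# Suppose backend "triton_foo" reports the metrics ["latency", "tflops", "speedup"].
key_metrics = {"triton_foo": ["latency", "tflops", "speedup"]}

row = ["x_val_0"]   # the x-value column(s) appended earlier
error_msg = "CUDA OOM"

k = "triton_foo"
row.append(error_msg)                           # error message emitted once
row.extend([None] * (len(key_metrics[k]) - 1))  # pad the remaining metric columns
print(row)  # ['x_val_0', 'CUDA OOM', None, None]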
@@ -224,12 +226,28 @@ def select_metric(m):
             table.append(row)
         return headers, table
 
-    @property
-    def csv(self):
+    def write_csv_to_file(self, fileobj):
+        import csv
+
         headers, table = self._table()
-        headers = "; ".join(headers)
-        table = "\n".join(["; ".join([str(v) for v in row]) for row in table])
-        return f"{headers}\n{table}"
+        writer = csv.writer(fileobj, delimiter=";", quoting=csv.QUOTE_MINIMAL)
+        writer.writerow(headers)
+        writer.writerows(table)
+
+    def write_csv(self, dir_path):
+        import tempfile
+
+        # This is just a way to create a unique filename. It's not actually a
+        # temporary file (since delete=False).
+        with tempfile.NamedTemporaryFile(
+            mode='w',
+            prefix=os.path.join(dir_path, f"op_{self.op_name}_"),
+            suffix=".csv",
+            newline="",
+            delete=False,
+        ) as fileobj:
+            self.write_csv_to_file(fileobj)
+        return fileobj.name
 
     @property
     def x_vals(self):
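
As an aside on the write_csv method above, a minimal standalone sketch of the unique-filename trick (the directory and prefix below are examples, not the real dump path): with delete=False, NamedTemporaryFile only contributes a collision-free name under the given prefix, and the file survives the with block.

import os
import tempfile

dump_dir = "/tmp/tb_csv_demo"  # example directory
os.makedirs(dump_dir, exist_ok=True)

with tempfile.NamedTemporaryFile(
    mode="w",
    # an absolute dump_dir keeps the file under it; tempfile would otherwise
    # join the prefix onto its own default temp directory
    prefix=os.path.join(dump_dir, "op_example_"),
    suffix=".csv",
    newline="",
    delete=False,  # keep the file; we only want a unique name
) as fileobj:
    fileobj.write("metric;value\n")

print(fileobj.name)  # e.g. /tmp/tb_csv_demo/op_example_ab12cd34.csv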
@@ -779,6 +797,8 @@ def _init_extra_metrics() -> Dict[str, Any]:
             metrics.extra_metrics[metric_name] = func(fn, self.example_inputs, metrics)
         except torch.cuda.OutOfMemoryError:
             metrics.error_msg = "CUDA OOM"
+        except Exception as e:
+            metrics.error_msg = str(e)
         return metrics
 
     def get_peak_mem(
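
A minimal sketch of the exception ordering above, using stdlib exceptions as stand-ins (the helper names are hypothetical): the specific handler has to come before the blanket except Exception, or it would be unreachable, and any other failure is stringified into error_msg instead of aborting the run.

class _Metrics:
    error_msg = None

def _run_metric(metrics, fn):
    try:
        return fn()
    except MemoryError:        # stand-in for torch.cuda.OutOfMemoryError
        metrics.error_msg = "CUDA OOM"
    except Exception as e:     # any other benchmark failure is recorded, not fatal
        metrics.error_msg = str(e)

def _failing_benchmark():
    raise ValueError("shape mismatch")

m = _Metrics()
_run_metric(m, _failing_benchmark)
print(m.error_msg)  # shape mismatch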

userbenchmark/triton/run.py

Lines changed: 4 additions & 11 deletions
@@ -59,7 +59,7 @@ def _run(args: argparse.Namespace, extra_args: List[str]) -> BenchmarkOperatorRe
     metrics = opbench.run(args.warmup, args.iter)
     if not args.skip_print:
         if args.csv:
-            print(metrics.csv)
+            metrics.write_csv_to_file(sys.stdout)
         else:
             print(metrics)
     if not hasattr(torch_version, "git_version") and args.log_scuba:
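
The reason write_csv_to_file can target sys.stdout here is that csv.writer only needs an object with a write() method; a quick standalone check (column names are made up):

import csv
import sys

writer = csv.writer(sys.stdout, delimiter=";", quoting=csv.QUOTE_MINIMAL)
writer.writerow(["x_val", "latency"])
writer.writerow(["(1024, 1024)", 0.123])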
@@ -73,16 +73,9 @@ def _run(args: argparse.Namespace, extra_args: List[str]) -> BenchmarkOperatorRe
         print(f"Plotting is not implemented for {args.op}")
 
     if args.dump_csv:
-        if not os.path.exists(TRITON_BENCH_CSV_DUMP_PATH):
-            os.mkdir(TRITON_BENCH_CSV_DUMP_PATH)
-
-        csv_str = metrics.csv
-        csv_str_hash = abs(hash(csv_str)) % (10**8)
-        file_name = f"op_{args.op}_{csv_str_hash}.csv"
-        file_path = os.path.join(TRITON_BENCH_CSV_DUMP_PATH, file_name)
-        with open(file_path, "w") as f:
-            f.write(csv_str)
-        print(f"[TritonBench] Dumped csv to {file_path}")
+        os.makedirs(TRITON_BENCH_CSV_DUMP_PATH, exist_ok=True)
+        path = metrics.write_csv(TRITON_BENCH_CSV_DUMP_PATH)
+        print(f"[TritonBench] Dumped csv to {path}")
     return metrics
 
 def run(args: List[str] = []):
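
A short note on the os.makedirs change in this hunk (the path below is an example): the single call with exist_ok=True replaces the check-then-create pair, which could race with another process and cannot create missing parent directories.

import os

dump_dir = "/tmp/tritonbench_csv_demo"  # example path, not the real TRITON_BENCH_CSV_DUMP_PATH

# Old pattern: exists() followed by mkdir() can fail with FileExistsError if the
# directory appears in between, and mkdir() does not create missing parents.
if not os.path.exists(dump_dir):
    os.mkdir(dump_dir)

# New pattern: idempotent, creates parents as needed.
os.makedirs(dump_dir, exist_ok=True)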
