@@ -24,7 +24,7 @@
 REGISTERED_BENCHMARKS: Dict[str, List[str]] = {}
 REGISTERED_METRICS: Dict[str, List[str]] = {}
 BASELINE_BENCHMARKS: Dict[str, str] = {}
-BUILTIN_METRICS = ["latency", "tflops", "speedup", "accuracy", "compile_time"]
+BUILTIN_METRICS = ["latency", "tflops", "speedup", "accuracy", "compile_time", "ncu_trace"]
 BASELINE_SKIP_METRICS = ["speedup", "accuracy"]
 PRECISION_DTYPE_MAPPING = {
     "fp32": torch.float32,
@@ -70,6 +70,15 @@ def do_bench_walltime(fn, warmup=25, rep=100):
     wall_time_ms = (end_time - start_time) * 1e3 / n_repeat
     return wall_time_ms
 
+def _find_param_loc(l, key: str) -> int:
+    try:
+        return l.index(key)
+    except ValueError:
+        return -1
+def _remove_params(l, loc):
+    if loc == -1:
+        return l
+    return l[:loc] + l[loc + 2:]
 
 
 @dataclass
 class BenchmarkOperatorMetrics:
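These two helpers are hoisted to module level (replacing the nested `_find_loc`/`_remove_element` inside `compile_time`, removed further down) so that both `compile_time` and the new `ncu_trace` can rewrite an argv-style list. A quick sketch of their behavior on a hypothetical argument list:

```python
# Hypothetical argv; only the helper behavior is illustrated here.
args = ["run.py", "--op", "softmax", "--batch-id", "3", "--metrics", "latency"]
loc = _find_param_loc(args, "--batch-id")   # -> 3, the index of the flag
args = _remove_params(args, loc)            # drops "--batch-id" and its value
assert args == ["run.py", "--op", "softmax", "--metrics", "latency"]
assert _find_param_loc(args, "--only") == -1  # absent flag: list left as-is
```

Note that `_remove_params` assumes every flag carries exactly one value (`l[loc + 2:]`), which holds for the three overridden options.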
@@ -85,6 +94,8 @@ class BenchmarkOperatorMetrics:
     walltime: Optional[float]
     # compile time
     compile_time: Optional[float]
+    # ncu trace file
+    ncu_trace: Optional[str]
     # error message
     error_msg: Optional[str]
     # extra metrics
@@ -544,13 +555,16 @@ def _do_bench(
                 accuracy=accuracy,
                 walltime=walltime,
                 compile_time=None,
+                ncu_trace=None,
                 error_msg=error_msg,
                 extra_metrics={},
             )
             if "tflops" in self.required_metrics:
                 metric.tflops = self.tflops(fn_name, self.example_inputs, metric)
             if "compile_time" in self.required_metrics:
                 metric.compile_time = self.compile_time(batch_id, fn_name, metric)
+            if "ncu_trace" in self.required_metrics:
+                metric.ncu_trace = self.ncu_trace(batch_id, fn_name)
             extra_metrics = {}
             # run the hidden metric "_compile_time_in_task"
             # to get the compile time in parent process
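The `ncu_trace` metric follows the same hidden-metric pattern the comments above describe for compile time: the parent process computes the public metric by re-launching the benchmark as a child whose only required metric is the hidden `_..._in_task` variant. Roughly, as a comment schematic assembled from this diff (the surrounding plumbing is simplified):

```python
# Parent: user requested --metrics ncu_trace
#   _do_bench -> self.ncu_trace(batch_id, fn_name)
#     -> subprocess: ncu <flags> <original argv with --only/--batch-id/--metrics
#        replaced by: --only fn_name --batch-id id --metrics _ncu_trace_in_task>
# Child: required_metrics == ["_ncu_trace_in_task"]
#   _do_bench -> do_bench_ncu_in_task(fn, ...)  # runs fn under the profiler
#   extra_metrics["_ncu_trace_in_task"] = "success"
```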
@@ -559,6 +573,13 @@ def _do_bench(
                     "_compile_time_in_task must be measured by itself. " \
                     f"required_metrics: {self.required_metrics}, _only: {self._only}, _batch_id: {self._batch_id}"
                 extra_metrics["_compile_time_in_task"] = self._compile_time_in_task(fn)
+            if "_ncu_trace_in_task" in self.required_metrics:
+                assert self.required_metrics == ["_ncu_trace_in_task"] and self._only and (self._batch_id is not None), \
+                    "_ncu_trace_in_task must be measured by itself. " \
+                    f"required_metrics: {self.required_metrics}, _only: {self._only}, _batch_id: {self._batch_id}"
+                from torchbenchmark._components.ncu import do_bench_ncu_in_task
+                do_bench_ncu_in_task(fn=fn, warmup=warmup, grad_to_none=self.get_grad_to_none(self.example_inputs))
+                extra_metrics["_ncu_trace_in_task"] = "success"
             # generate customized metrics
             if self.name in REGISTERED_METRICS:
                 for metric_name in REGISTERED_METRICS[self.name]:
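The imported `do_bench_ncu_in_task` lives in `torchbenchmark._components.ncu`, outside this diff. A plausible minimal sketch is shown below, assuming it brackets one measured iteration with CUDA profiler start/stop markers so that `ncu --replay-mode range` (used by `ncu_trace` later in the diff) captures exactly that range. The signature matches the call site above; everything else is an assumption, not the actual implementation:

```python
# Hedged sketch of do_bench_ncu_in_task; NOT the real
# torchbenchmark._components.ncu code, just one way to satisfy the call site.
import torch

def do_bench_ncu_in_task(fn, warmup=25, grad_to_none=None) -> None:
    # Warm up outside the profiled range so compilation and autotuning are
    # excluded (warmup is treated as an iteration count here; an assumption).
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    # Clear gradients so the measured call performs accumulation itself.
    if grad_to_none is not None:
        for x in grad_to_none:
            x.grad = None
    # cudaProfilerStart/Stop delimit the range that
    # `ncu --replay-mode range` profiles.
    torch.cuda.cudart().cudaProfilerStart()
    fn()
    torch.cuda.cudart().cudaProfilerStop()
    torch.cuda.synchronize()
```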
@@ -577,29 +598,47 @@ def _do_bench(
                 accuracy=None,
                 walltime=None,
                 compile_time=None,
+                ncu_trace=None,
                 error_msg="CUDA OOM",
                 extra_metrics={},
             )
         return metric
 
 
+    @register_metric()
+    def ncu_trace(self, batch_id: int, fn_name: str) -> str:
+        # collect the ncu trace
+        import sys
+        import subprocess
+        from pathlib import Path
+        op_task_args = copy.deepcopy(sys.argv)
+        for override_option in ["--only", "--batch-id", "--metrics"]:
+            op_task_args = _remove_params(op_task_args, _find_param_loc(op_task_args, override_option))
+        op_task_args.extend(["--only", fn_name, "--batch-id", str(batch_id), "--metrics", "_ncu_trace_in_task"])
+        # Disable DCGM
+        try:
+            disable_dcgm = ["sudo", "dyno", "dcgm_profiling", "--mute=true", "--duration=1000_s"]
+            subprocess.run(disable_dcgm, check=True)
+        except subprocess.SubprocessError:
+            warnings.warn("Cannot find dyno to disable DCGM. Proceed to collect NCU Trace.")
+        ncu_output_dir = Path(f"/tmp/tritonbench_{self.name}_{fn_name}_{batch_id}")
+        ncu_output_dir.mkdir(parents=True, exist_ok=True)
+        ncu_output_file = ncu_output_dir.joinpath("ncu_output.csv").resolve()
+        ncu_args = ["ncu", "--set", "full", "--replay-mode", "range", "--target-processes", "all", \
+                    "--csv", "-f", "--log-file", str(ncu_output_file.resolve())]
+        ncu_args.extend(op_task_args)
+        subprocess.check_call(ncu_args)
+        return str(ncu_output_file.resolve())
+
+
     @register_metric()
     def compile_time(self, batch_id: int, fn_name: str, metrics: BenchmarkOperatorMetrics) -> float:
         # We need to spawn a subprocess when user wants to measure the compile time
         # of multiple batches and backends.
-        def _find_loc(l, key: str) -> int:
-            try:
-                return l.index(key)
-            except ValueError:
-                return -1
-        def _remove_element(l, loc):
-            if loc == -1:
-                return l
-            return l[:loc] + l[loc + 2:]
         from torchbenchmark.operators.op_task import OpTask
         op_task_args = copy.deepcopy(self._raw_extra_args)
         for override_option in ["--only", "--batch-id", "--metrics"]:
-            op_task_args = _remove_element(op_task_args, _find_loc(op_task_args, override_option))
+            op_task_args = _remove_params(op_task_args, _find_param_loc(op_task_args, override_option))
         op_task_args.extend(["--only", fn_name, "--batch-id", str(batch_id), "--metrics", "_compile_time_in_task"])
         op_task = OpTask(name=self.name)
         op_task.make_operator_instance(mode=self.mode.value, device=self.device, extra_args=op_task_args)
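For reference, with a hypothetical invocation `python run_benchmark.py triton --op softmax --metrics ncu_trace`, the command that `ncu_trace()` assembles for batch 0 of a backend named `triton_softmax` would look roughly like this (every concrete operator and backend name below is made up for illustration):

```python
# Illustrative only; assembled from the logic of ncu_trace() above.
ncu_args = [
    "ncu", "--set", "full", "--replay-mode", "range",
    "--target-processes", "all", "--csv", "-f",
    "--log-file", "/tmp/tritonbench_softmax_triton_softmax_0/ncu_output.csv",
    # op_task_args: a copy of sys.argv with --only/--batch-id/--metrics replaced
    "run_benchmark.py", "triton", "--op", "softmax",
    "--only", "triton_softmax", "--batch-id", "0",
    "--metrics", "_ncu_trace_in_task",
]
```

Since the child is just the benchmark script re-run with `--metrics _ncu_trace_in_task`, the hidden-metric branch added earlier in `_do_bench` is what actually executes under the profiler, and the parent returns the path to the resulting CSV log as the metric value.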