11
11
from typing import Any , Optional
12
12
13
13
import torch .cuda
14
+ from torch .cuda .nvtx import range as nvtx_range
14
15
15
16
from utils import set_seed , clear_l2_cache
16
17
@@ -505,13 +506,16 @@ def _run_single_profile(test: TestCase) -> str:
505
506
"""
506
507
from submission import custom_kernel
507
508
from torch .profiler import profile , ProfilerActivity
508
- data = generate_input (** test .args )
509
- torch .cuda .synchronize ()
510
509
511
- with profile ( activities = [ ProfilerActivity . CPU , ProfilerActivity . CUDA ]) as prof :
512
- submission_output = custom_kernel ( _clone_data ( data , 0 ) )
510
+ with nvtx_range ( "generate input" ) :
511
+ data = generate_input ( ** test . args )
513
512
torch .cuda .synchronize ()
514
513
514
+ with profile (activities = [ProfilerActivity .CPU , ProfilerActivity .CUDA ]) as prof :
515
+ with nvtx_range ("custom_kernel" ):
516
+ submission_output = custom_kernel (_clone_data (data , 0 ))
517
+ torch .cuda .synchronize ()
518
+
515
519
return prof .key_averages ().table (sort_by = "self_cuda_time_total" , row_limit = 20 )
516
520
517
521
@@ -522,19 +526,25 @@ def _run_distributed_profile(test: TestCase, rank: int) -> "EventList":
522
526
from submission import custom_kernel
523
527
from torch .profiler import profile , ProfilerActivity
524
528
import torch .distributed as dist
525
- world_size = test .args ["world_size" ]
526
- os .environ ["MASTER_ADDR" ] = "127.0.0.1"
527
- os .environ ["MASTER_PORT" ] = "12356"
528
- dist .init_process_group ("nccl" , init_method = "env://" , rank = rank , world_size = world_size , device_id = torch .device (f'cuda:{ rank } ' ))
529
+
530
+ with nvtx_range (f"init nccl, rank { rank } " ):
531
+ world_size = test .args ["world_size" ]
532
+ os .environ ["MASTER_ADDR" ] = "127.0.0.1"
533
+ os .environ ["MASTER_PORT" ] = "12356"
534
+ dist .init_process_group ("nccl" , init_method = "env://" , rank = rank , world_size = world_size , device_id = torch .device (f'cuda:{ rank } ' ))
529
535
530
536
try :
531
- data = generate_input (** test .args , rank = rank )
532
- data = _clone_data (data , rank )
533
- torch .cuda .synchronize ()
537
+ with nvtx_range (f"generate input, rank { rank } " ):
538
+ data = generate_input (** test .args , rank = rank )
539
+ data = _clone_data (data , rank )
540
+ torch .cuda .synchronize ()
541
+ dist .barrier ()
534
542
535
543
with profile (activities = [ProfilerActivity .CPU , ProfilerActivity .CUDA ]) as prof :
536
- submission_output = custom_kernel (data )
537
- torch .cuda .synchronize ()
544
+ with nvtx_range (f"custom_kernel, rank { rank } " ):
545
+ submission_output = custom_kernel (data )
546
+ torch .cuda .synchronize ()
547
+ dist .barrier ()
538
548
539
549
return prof .events ()
540
550
0 commit comments