Skip to content

Commit 0d685b5

Browse files
committed
example: emit nvtx markers
1 parent bf74236 commit 0d685b5

File tree

1 file changed

+23 -13 lines changed

1 file changed

+23 -13 lines changed

examples/eval.py

Lines changed: 23 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
from typing import Any, Optional
1212

1313
import torch.cuda
14+
from torch.cuda.nvtx import range as nvtx_range
1415

1516
from utils import set_seed, clear_l2_cache
1617

@@ -505,13 +506,16 @@ def _run_single_profile(test: TestCase) -> str:
505506
"""
506507
from submission import custom_kernel
507508
from torch.profiler import profile, ProfilerActivity
508-
data = generate_input(**test.args)
509-
torch.cuda.synchronize()
510509

511-
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
512-
submission_output = custom_kernel(_clone_data(data, 0))
510+
with nvtx_range("generate input"):
511+
data = generate_input(**test.args)
513512
torch.cuda.synchronize()
514513

514+
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
515+
with nvtx_range("custom_kernel"):
516+
submission_output = custom_kernel(_clone_data(data, 0))
517+
torch.cuda.synchronize()
518+
515519
return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)
516520

517521

@@ -522,19 +526,25 @@ def _run_distributed_profile(test: TestCase, rank: int) -> "EventList":
522526
from submission import custom_kernel
523527
from torch.profiler import profile, ProfilerActivity
524528
import torch.distributed as dist
525-
world_size = test.args["world_size"]
526-
os.environ["MASTER_ADDR"] = "127.0.0.1"
527-
os.environ["MASTER_PORT"] = "12356"
528-
dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
529+
530+
with nvtx_range(f"init nccl, rank {rank}"):
531+
world_size = test.args["world_size"]
532+
os.environ["MASTER_ADDR"] = "127.0.0.1"
533+
os.environ["MASTER_PORT"] = "12356"
534+
dist.init_process_group("nccl", init_method="env://", rank=rank, world_size=world_size, device_id=torch.device(f'cuda:{rank}'))
529535

530536
try:
531-
data = generate_input(**test.args, rank=rank)
532-
data = _clone_data(data, rank)
533-
torch.cuda.synchronize()
537+
with nvtx_range(f"generate input, rank {rank}"):
538+
data = generate_input(**test.args, rank=rank)
539+
data = _clone_data(data, rank)
540+
torch.cuda.synchronize()
541+
dist.barrier()
534542

535543
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
536-
submission_output = custom_kernel(data)
537-
torch.cuda.synchronize()
544+
with nvtx_range(f"custom_kernel, rank {rank}"):
545+
submission_output = custom_kernel(data)
546+
torch.cuda.synchronize()
547+
dist.barrier()
538548

539549
return prof.events()
540550

0 commit comments

Comments
 (0)