Skip to content

Commit 1fa1843

Browse files
committed
split profiling into rocm/ncu; small code improvements
1 parent 1de31fd commit 1fa1843

2 files changed

Lines changed: 134 additions & 77 deletions

File tree

examples/eval.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -511,9 +511,10 @@ def _run_single_profile(test: TestCase) -> str:
511511
data = generate_input(**test.args)
512512
torch.cuda.synchronize()
513513

514+
cloned = _clone_data(data, 0)
514515
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
515516
with nvtx_range("custom_kernel"):
516-
submission_output = custom_kernel(_clone_data(data, 0))
517+
submission_output = custom_kernel(cloned)
517518
torch.cuda.synchronize()
518519

519520
return prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=20)

src/libkernelbot/run_eval.py

Lines changed: 132 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,128 @@ def run_program(
305305
)
306306

307307

308+
def profile_program_roc(
    call: list[str],
    seed: Optional[int],
    timeout: int,
    multi_gpu: bool,
    output_dir: Path,
) -> tuple[RunResult, Optional[ProfileResult]]:
    """Run ``call`` under ``rocprofv3`` and gather the profiling artifacts.

    The command is wrapped in a ``rocprofv3`` invocation that writes its
    pftrace/csv output into ``output_dir``. If the run succeeds, the
    per-process ``*.pftrace`` files are concatenated into
    ``output_dir / "combined.pftrace"`` (the individual files are then
    deleted), and any ``_code_object*.o`` files found in the current working
    directory are moved into ``output_dir``.

    Args:
        call: Command line of the program to profile.
        seed: Forwarded unchanged to ``run_program``.
        timeout: Forwarded unchanged to ``run_program``.
        multi_gpu: Forwarded unchanged to ``run_program``.
        output_dir: Directory that receives the profiler output.

    Returns:
        ``(run_result, profile_result)``. ``profile_result`` is ``None`` when
        the run did not succeed; otherwise it is a ``ProfileResult`` whose
        ``download_url`` is left as ``None`` by this function.
    """
    # Wrap program in rocprof
    call = [
        "rocprofv3",
        "--log-level",
        "fatal",
        "--hip-trace",
        "--kernel-trace",
        "--rccl-trace",
        "--marker-trace",
        "--memory-copy-trace",
        # New? Doesn't work in the runner
        # "--memory-allocation-trace",
        "--scratch-memory-trace",
        # The HSA trace output is very large, so skip it for now
        # "--hsa-trace",
        "--output-format",
        "pftrace",
        "csv",
        "-d",
        str(output_dir),
        # Just store the files as %pid%_tracename.ext instead of putting them in an
        # additional directory named after the hostname.
        "-o",
        "%pid%",
        "--",
    ] + call

    run_result = run_program(
        call,
        seed=seed,
        timeout=timeout,
        multi_gpu=multi_gpu,
        # Same variable the original code set; presumably this is what makes the
        # runtime emit the _code_object*.o files collected below.
        extra_env={"GPU_DUMP_CODE_OBJECT": "1"},
    )

    if not run_result.success:
        return run_result, None

    # Post-process trace data.
    # rocPROF generates one trace for every process, but it's more useful to
    # have all traces be in the same file. Fortunately we can do that by
    # concatenating.
    combined_path = output_dir / "combined.pftrace"
    # Skip a combined trace left over from an earlier run: opening it for
    # writing truncates it, so it must not also be used as an input (and must
    # not be unlinked afterwards). Sorting makes the concatenation order
    # deterministic.
    traces = sorted(
        path for path in output_dir.glob("*.pftrace") if path.name != combined_path.name
    )
    with combined_path.open("wb") as combined:
        for trace_path in traces:
            with trace_path.open("rb") as trace:
                shutil.copyfileobj(trace, combined)

            # Once its contents are in the combined trace, there is no point
            # in keeping the individual trace around.
            trace_path.unlink()

    # Also move the code objects to the profiling output directory.
    # Snapshot the matches first because the loop moves files out of the
    # directory being globbed.
    for code_obj in list(Path.cwd().glob("_code_object*.o")):
        code_obj.rename(output_dir / code_obj.name)

    profile_result = ProfileResult(
        profiler="rocPROF",
        download_url=None,
    )
    return run_result, profile_result
377+
378+
379+
def profile_program_ncu(
    call: list[str],
    seed: Optional[int],
    timeout: int,
    multi_gpu: bool,
    output_dir: Path,
) -> tuple[RunResult, Optional[ProfileResult]]:
    """Run ``call`` under NVIDIA Nsight Compute (``ncu``).

    The command is wrapped in an ``ncu`` invocation whose report is written to
    ``output_dir / "profile.ncu-rep"``.

    Args:
        call: Command line of the program to profile.
        seed: Forwarded unchanged to ``run_program``.
        timeout: Forwarded unchanged to ``run_program``.
        multi_gpu: Must be ``False``; multi-GPU profiling is not supported here.
        output_dir: Directory that receives the ncu report.

    Returns:
        ``(run_result, profile_result)``. ``profile_result`` is ``None`` when
        the run did not succeed; otherwise it is a ``ProfileResult`` whose
        ``download_url`` is left as ``None`` by this function.

    Raises:
        AssertionError: If ``multi_gpu`` is true.
    """
    assert not multi_gpu, "Multi-GPU profiling not supported for ncu."

    # Wrap program in ncu.
    # Every argument needs its own trailing comma: adjacent string literals
    # are silently concatenated, which would merge "custom_kernel/" with the
    # next flag and swallow the "--" separator.
    call = [
        "ncu",
        "--set",
        "full",
        "--nvtx",
        # Restrict profiling to the NVTX range named "custom_kernel"
        # (see the Nsight Compute CLI docs for the filter syntax).
        "--nvtx-include",
        "custom_kernel/",
        "--import-source",
        "1",
        # Write the report into the directory chosen by the caller instead of
        # a hard-coded path that may not exist.
        "-o",
        str(output_dir / "profile.ncu-rep"),
        "--",
    ] + call

    run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu)

    profile_result = None
    if run_result.success:
        # No trace concatenation or code-object collection here: those steps
        # apply to rocprofv3 output, not to the single report written by ncu.
        profile_result = ProfileResult(
            # NOTE(review): label chosen to name the tool actually used;
            # confirm against whatever consumes ProfileResult.profiler.
            profiler="Nsight-Compute",
            download_url=None,
        )

    return run_result, profile_result
428+
429+
308430
def profile_program(
309431
system: SystemInfo,
310432
call: list[str],
@@ -315,89 +437,25 @@ def profile_program(
315437
# The runner-specific configuration should implement logic
316438
# to fetch the data in this directory and return it as
317439
# ProfileResult.download_url.
318-
# Insert an extra nested nested path here so that the resulting zip has all files
440+
# Insert an extra nested path here so that the resulting zip has all files
319441
# in the profile_data/ directory rather than directly in the root.
320442
output_dir = Path(".") / "profile_data" / "profile_data"
321443
output_dir.mkdir(parents=True, exist_ok=True)
322444

323445
if system.runtime == "ROCm":
324-
# Wrap program in rocprof
325-
call = [
326-
"rocprofv3",
327-
"--log-level",
328-
"fatal",
329-
"--hip-trace",
330-
"--kernel-trace",
331-
"--rccl-trace",
332-
"--marker-trace",
333-
"--hip-trace",
334-
"--memory-copy-trace",
335-
# New? Doesn't work in the runner
336-
# "--memory-allocation-trace",
337-
"--scratch-memory-trace",
338-
# The HSA trace output is very large, so skip it for now
339-
# "--hsa-trace",
340-
"--output-format",
341-
"pftrace",
342-
"csv",
343-
"-d",
344-
str(output_dir),
345-
# Just store the files as %pid%_tracename.ext instead of putting them in an
346-
# additional directory named after the hostname.
347-
"-o",
348-
# Insert an extra path here so that the resulting zip has all files
349-
# in the profile_data/ directory rather than the root.
350-
"%pid%",
351-
"--",
352-
] + call
353-
354-
run_result = run_program(
355-
call,
356-
seed=seed,
357-
timeout=timeout,
358-
multi_gpu=multi_gpu,
359-
extra_env={
360-
"GPU_DUMP_CODE_OBJECT": "1",
361-
},
362-
)
363-
364-
profile_result = None
365-
366-
if run_result.success:
367-
# Post-process trace data.
368-
# rocPROF generates one trace for every process, but its more useful to
369-
# have all traces be in the same file. Fortunately we can do that by
370-
# concatenating.
371-
traces = list(output_dir.glob("*.pftrace"))
372-
with (output_dir / "combined.pftrace").open("wb") as combined:
373-
for trace_path in traces:
374-
with trace_path.open("rb") as trace:
375-
shutil.copyfileobj(trace, combined)
376-
377-
# After we've created the combined trace, there is no point in
378-
# keeping the individual traces around.
379-
trace_path.unlink()
380-
381-
# Also move the code objects to the profiling output directory.
382-
for code_obj in list(Path.cwd().glob("_code_object*.o")):
383-
code_obj.rename(output_dir / code_obj.name)
384-
385-
profile_result = ProfileResult(
386-
profiler="rocPROF",
387-
download_url=None,
388-
)
389-
390-
return run_result, profile_result
446+
return profile_program_roc(call, seed, timeout, multi_gpu, output_dir)
447+
elif system.runtime == "CUDA":
448+
return profile_program_ncu(call, seed, timeout, multi_gpu, output_dir)
391449
else:
392-
# TODO: Implement profiling for other platforms
393-
return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None
450+
raise ValueError(f"Unknown runtime {system.runtime}")
451+
394452

395453

396454
def run_single_evaluation(
397-
system: SystemInfo,
398455
call: list[str],
399456
mode: str,
400457
*,
458+
system: SystemInfo,
401459
multi_gpu: bool = False,
402460
tests: Optional[str] = None,
403461
benchmarks: Optional[str] = None,
@@ -426,7 +484,7 @@ def run_single_evaluation(
426484

427485
cases.flush()
428486

429-
call += [mode, cases.name]
487+
call = call + [mode, cases.name]
430488

431489
if mode == "profile":
432490
return profile_program(system, call, seed=seed, timeout=timeout, multi_gpu=multi_gpu)
@@ -498,7 +556,6 @@ def make_system_info() -> SystemInfo: # noqa: C901
498556

499557

500558
def run_cuda_script( # # noqa: C901
501-
system: SystemInfo,
502559
sources: dict[str, str],
503560
headers: Optional[dict[str, str]] = None,
504561
arch: Optional[int] = None,
@@ -559,7 +616,7 @@ def run_cuda_script( # # noqa: C901
559616
if os.path.exists(f):
560617
os.remove(f)
561618

562-
run_result, profile_result = run_single_evaluation(system, ["./eval.out"], **kwargs)
619+
run_result, profile_result = run_single_evaluation(["./eval.out"], **kwargs)
563620
return EvalResult(
564621
start=start,
565622
end=datetime.datetime.now(),
@@ -570,7 +627,6 @@ def run_cuda_script( # # noqa: C901
570627

571628

572629
def run_pytorch_script( # noqa: C901
573-
system: SystemInfo,
574630
sources: dict[str, str],
575631
main: str,
576632
**kwargs,
@@ -622,7 +678,7 @@ def run_pytorch_script( # noqa: C901
622678
exit_code=e.returncode,
623679
)
624680

625-
run, profile = run_single_evaluation(system, ["python3", main], **kwargs)
681+
run, profile = run_single_evaluation(["python3", main], **kwargs)
626682

627683
return EvalResult(
628684
start=start,

0 commit comments

Comments
 (0)