@@ -305,6 +305,128 @@ def run_program(
305305 )
306306
307307
def profile_program_roc(
    call: list[str],
    seed: Optional[int],
    timeout: int,
    multi_gpu: bool,
    output_dir: Path,
) -> tuple[RunResult, Optional[ProfileResult]]:
    """Run ``call`` under rocprofv3 and gather the profiling artifacts.

    The program is wrapped in a ``rocprofv3`` invocation that is pointed at
    ``output_dir`` (``-d``), and it is executed with ``GPU_DUMP_CODE_OBJECT=1``
    in its environment. When the run succeeds, all per-process ``*.pftrace``
    files in ``output_dir`` are concatenated into ``combined.pftrace`` and any
    ``_code_object*.o`` files found in the current working directory are moved
    into ``output_dir``.

    Args:
        call: Command line of the program to profile.
        seed: Forwarded unchanged to ``run_program``.
        timeout: Forwarded unchanged to ``run_program``.
        multi_gpu: Forwarded unchanged to ``run_program``.
        output_dir: Directory that receives the profiling output. It is not
            created here, so it must already exist.

    Returns:
        A ``(run_result, profile_result)`` tuple. ``profile_result`` is
        ``None`` when the run was not successful; otherwise it is a
        ``ProfileResult`` whose ``download_url`` is left as ``None``.
    """
    # Wrap program in rocprof
    call = [
        "rocprofv3",
        "--log-level",
        "fatal",
        "--hip-trace",
        "--kernel-trace",
        "--rccl-trace",
        "--marker-trace",
        "--memory-copy-trace",
        # "--memory-allocation-trace" is left out: it does not work in the runner.
        "--scratch-memory-trace",
        # "--hsa-trace" is left out: its output is very large, so skip it for now.
        "--output-format",
        "pftrace",
        "csv",
        "-d",
        str(output_dir),
        # Just store the files as %pid%_tracename.ext instead of putting them in an
        # additional directory named after the hostname.
        "-o",
        "%pid%",
        "--",
    ] + call

    run_result = run_program(
        call,
        seed=seed,
        timeout=timeout,
        multi_gpu=multi_gpu,
        extra_env={
            "GPU_DUMP_CODE_OBJECT": "1",
        },
    )

    profile_result = None

    if run_result.success:
        # Post-process trace data.
        # rocPROF generates one trace for every process, but it's more useful to
        # have all traces be in the same file. Fortunately we can do that by
        # concatenating.
        combined_path = output_dir / "combined.pftrace"
        # Collect the inputs before the combined file is (re)created, and skip
        # a combined file left over from an earlier run: otherwise it would be
        # truncated, appended to itself and then deleted together with the
        # per-process traces. Sorting makes the concatenation order stable.
        traces = sorted(
            path
            for path in output_dir.glob("*.pftrace")
            if path.name != combined_path.name
        )
        with combined_path.open("wb") as combined:
            for trace_path in traces:
                with trace_path.open("rb") as trace:
                    shutil.copyfileobj(trace, combined)

                # Once its content is part of the combined trace, there is no
                # point in keeping the individual trace around.
                trace_path.unlink()

        # Also move the code objects to the profiling output directory.
        # The glob is materialized first because the loop renames its results.
        for code_obj in list(Path.cwd().glob("_code_object*.o")):
            code_obj.rename(output_dir / code_obj.name)

        profile_result = ProfileResult(
            profiler="rocPROF",
            download_url=None,
        )

    return run_result, profile_result
377+
378+
def profile_program_ncu(
    call: list[str],
    seed: Optional[int],
    timeout: int,
    multi_gpu: bool,
    output_dir: Path,
) -> tuple[RunResult, Optional[ProfileResult]]:
    """Run ``call`` under NVIDIA Nsight Compute (``ncu``).

    The program is wrapped in an ``ncu`` invocation that collects the ``full``
    section set, applies the NVTX include filter ``custom_kernel/`` and writes
    its report to ``output_dir / "profile.ncu-rep"``.

    Args:
        call: Command line of the program to profile.
        seed: Forwarded unchanged to ``run_program``.
        timeout: Forwarded unchanged to ``run_program``.
        multi_gpu: Must be ``False``; multi-GPU profiling is not supported
            for ncu.
        output_dir: Directory that receives the report. It is not created
            here, so it must already exist.

    Returns:
        A ``(run_result, profile_result)`` tuple. ``profile_result`` is
        ``None`` when the run was not successful; otherwise it is a
        ``ProfileResult`` whose ``download_url`` is left as ``None``.

    Raises:
        AssertionError: If ``multi_gpu`` is true.
    """
    assert not multi_gpu, "Multi-GPU profiling not supported for ncu."

    # Wrap program in ncu. Every argument is its own list element: a missing
    # comma between two adjacent string literals silently concatenates them
    # and corrupts the command line.
    call = [
        "ncu",
        "--set",
        "full",
        "--nvtx",
        "--nvtx-include",
        "custom_kernel/",
        "--import-source",
        "1",
        "-o",
        # Write the report into the directory handed in by the caller rather
        # than a hard-coded path, so it ends up next to the other profile data.
        str(output_dir / "profile.ncu-rep"),
        "--",
    ] + call

    run_result = run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu)

    profile_result = None

    if run_result.success:
        # ncu writes a single report file, so no rocprof-style merging of
        # per-process traces or moving of code objects is done here.
        profile_result = ProfileResult(
            profiler="Nsight Compute",
            download_url=None,
        )

    return run_result, profile_result
428+
429+
308430def profile_program (
309431 system : SystemInfo ,
310432 call : list [str ],
@@ -315,89 +437,25 @@ def profile_program(
315437 # The runner-specific configuration should implement logic
316438 # to fetch the data in this directory and return it as
317439 # ProfileResult.download_url.
318- # Insert an extra nested nested path here so that the resulting zip has all files
440+ # Insert an extra nested path here so that the resulting zip has all files
319441 # in the profile_data/ directory rather than directly in the root.
320442 output_dir = Path ("." ) / "profile_data" / "profile_data"
321443 output_dir .mkdir (parents = True , exist_ok = True )
322444
323445 if system .runtime == "ROCm" :
324- # Wrap program in rocprof
325- call = [
326- "rocprofv3" ,
327- "--log-level" ,
328- "fatal" ,
329- "--hip-trace" ,
330- "--kernel-trace" ,
331- "--rccl-trace" ,
332- "--marker-trace" ,
333- "--hip-trace" ,
334- "--memory-copy-trace" ,
335- # New? Doesn't work in the runner
336- # "--memory-allocation-trace",
337- "--scratch-memory-trace" ,
338- # The HSA trace output is very large, so skip it for now
339- # "--hsa-trace",
340- "--output-format" ,
341- "pftrace" ,
342- "csv" ,
343- "-d" ,
344- str (output_dir ),
345- # Just store the files as %pid%_tracename.ext instead of putting them in an
346- # additional directory named after the hostname.
347- "-o" ,
348- # Insert an extra path here so that the resulting zip has all files
349- # in the profile_data/ directory rather than the root.
350- "%pid%" ,
351- "--" ,
352- ] + call
353-
354- run_result = run_program (
355- call ,
356- seed = seed ,
357- timeout = timeout ,
358- multi_gpu = multi_gpu ,
359- extra_env = {
360- "GPU_DUMP_CODE_OBJECT" : "1" ,
361- },
362- )
363-
364- profile_result = None
365-
366- if run_result .success :
367- # Post-process trace data.
368- # rocPROF generates one trace for every process, but its more useful to
369- # have all traces be in the same file. Fortunately we can do that by
370- # concatenating.
371- traces = list (output_dir .glob ("*.pftrace" ))
372- with (output_dir / "combined.pftrace" ).open ("wb" ) as combined :
373- for trace_path in traces :
374- with trace_path .open ("rb" ) as trace :
375- shutil .copyfileobj (trace , combined )
376-
377- # After we've created the combined trace, there is no point in
378- # keeping the individual traces around.
379- trace_path .unlink ()
380-
381- # Also move the code objects to the profiling output directory.
382- for code_obj in list (Path .cwd ().glob ("_code_object*.o" )):
383- code_obj .rename (output_dir / code_obj .name )
384-
385- profile_result = ProfileResult (
386- profiler = "rocPROF" ,
387- download_url = None ,
388- )
389-
390- return run_result , profile_result
446+ return profile_program_roc (call , seed , timeout , multi_gpu , output_dir )
447+ elif system .runtime == "CUDA" :
448+ return profile_program_ncu (call , seed , timeout , multi_gpu , output_dir )
391449 else :
392- # TODO: Implement profiling for other platforms
393- return run_program ( call , seed = seed , timeout = timeout , multi_gpu = multi_gpu ), None
450+ raise ValueError ( f"Unknown runtime { system . runtime } " )
451+
394452
395453
396454def run_single_evaluation (
397- system : SystemInfo ,
398455 call : list [str ],
399456 mode : str ,
400457 * ,
458+ system : SystemInfo ,
401459 multi_gpu : bool = False ,
402460 tests : Optional [str ] = None ,
403461 benchmarks : Optional [str ] = None ,
@@ -426,7 +484,7 @@ def run_single_evaluation(
426484
427485 cases .flush ()
428486
429- call += [mode , cases .name ]
487+ call = call + [mode , cases .name ]
430488
431489 if mode == "profile" :
432490 return profile_program (system , call , seed = seed , timeout = timeout , multi_gpu = multi_gpu )
@@ -498,7 +556,6 @@ def make_system_info() -> SystemInfo: # noqa: C901
498556
499557
500558def run_cuda_script ( # # noqa: C901
501- system : SystemInfo ,
502559 sources : dict [str , str ],
503560 headers : Optional [dict [str , str ]] = None ,
504561 arch : Optional [int ] = None ,
@@ -559,7 +616,7 @@ def run_cuda_script( # # noqa: C901
559616 if os .path .exists (f ):
560617 os .remove (f )
561618
562- run_result , profile_result = run_single_evaluation (system , ["./eval.out" ], ** kwargs )
619+ run_result , profile_result = run_single_evaluation (["./eval.out" ], ** kwargs )
563620 return EvalResult (
564621 start = start ,
565622 end = datetime .datetime .now (),
@@ -570,7 +627,6 @@ def run_cuda_script( # # noqa: C901
570627
571628
572629def run_pytorch_script ( # noqa: C901
573- system : SystemInfo ,
574630 sources : dict [str , str ],
575631 main : str ,
576632 ** kwargs ,
@@ -622,7 +678,7 @@ def run_pytorch_script( # noqa: C901
622678 exit_code = e .returncode ,
623679 )
624680
625- run , profile = run_single_evaluation (system , ["python3" , main ], ** kwargs )
681+ run , profile = run_single_evaluation (["python3" , main ], ** kwargs )
626682
627683 return EvalResult (
628684 start = start ,
0 commit comments