From 2948ef479c412cd35de5ca0180314c2043929f3d Mon Sep 17 00:00:00 2001 From: alexandermote Date: Fri, 18 Apr 2025 12:40:28 -0700 Subject: [PATCH 1/2] Add trace functionality --- mcdc/config.py | 9 ++ mcdc/kernel.py | 291 ++++++++++++++++++++++++++---------------------- mcdc/loop.py | 39 +++---- mcdc/main.py | 46 +++++--- mcdc/trace.py | 293 +++++++++++++++++++++++++++++++++++++++++++++++++ mcdc/type_.py | 31 ++++++ 6 files changed, 546 insertions(+), 163 deletions(-) create mode 100644 mcdc/trace.py diff --git a/mcdc/config.py b/mcdc/config.py index 3b80f3d04..4b3b2b5f2 100644 --- a/mcdc/config.py +++ b/mcdc/config.py @@ -66,6 +66,13 @@ parser.add_argument("--clear_cache", action="store_true") parser.add_argument("--caching", action="store_true") parser.add_argument("--no_caching", dest="caching", action="store_false") +parser.add_argument("--trace", action="store_true") +parser.add_argument( + "--trace_slot_limit", + type=int, + help="Maximum number of functions that may be traced.", + default=4096, +) parser.add_argument("--runtime_output", default=False, action="store_true") parser.set_defaults(caching=False) args, unargs = parser.parse_known_args() @@ -75,6 +82,8 @@ target = args.target caching = args.caching clear_cache = args.clear_cache +trace = args.trace +trace_slot_limit = args.trace_slot_limit from mpi4py import MPI import shutil diff --git a/mcdc/kernel.py b/mcdc/kernel.py index 8305716e7..d08bc3e1f 100644 --- a/mcdc/kernel.py +++ b/mcdc/kernel.py @@ -15,6 +15,7 @@ import mcdc.src.physics as physics import mcdc.src.surface as surface_ import mcdc.type_ as type_ +import mcdc.trace as trace from mcdc.adapt import toggle, for_cpu, for_gpu from mcdc.constant import * @@ -22,7 +23,7 @@ from mcdc.src.algorithm import binary_search, binary_search_with_length -@njit +@trace.njit() def round(float_val): return float_val # int_val = np.float64(float_val).view(np.uint64) @@ -128,12 +129,12 @@ def clear_requests(): requests = [] -@njit +@trace.njit() def dd_check_halt(mcdc): return mcdc["domain_decomp"]["work_done"] -@njit +@trace.njit() def dd_check_in(mcdc): mcdc["domain_decomp"]["send_count"] = 0 mcdc["domain_decomp"]["recv_count"] = 0 @@ -150,7 +151,7 @@ def dd_check_in(mcdc): mcdc["domain_decomp"]["busy_total"] = 0 -@njit +@trace.njit() def dd_check_out(mcdc): with objmode(): rank = MPI.COMM_WORLD.Get_rank() @@ -193,7 +194,7 @@ def dd_check_out(mcdc): clear_requests() -@njit +@trace.njit() def dd_signal_halt(mcdc): with objmode(): @@ -204,7 +205,7 @@ def dd_signal_halt(mcdc): mcdc["domain_decomp"]["work_done"] = True -@njit +@trace.njit() def dd_signal_block(mcdc): with objmode(rank="int64"): @@ -237,7 +238,7 @@ def dd_signal_block(mcdc): dd_signal_halt(mcdc) -@njit +@trace.njit() def dd_signal_unblock(mcdc): with objmode(rank="int64"): @@ -267,7 +268,7 @@ def dd_signal_unblock(mcdc): mcdc["domain_decomp"]["recv_count"] = 0 -@njit +@trace.njit() def dd_distribute_bank(mcdc, bank, dest_list): with objmode(send_delta="int64"): @@ -303,7 +304,7 @@ def dd_initiate_particle_send(prog): dd_particle_send(prog) -@njit +@trace.njit() def dd_particle_send(prog): mcdc = adapt.mcdc_global(prog) dd_distribute_bank( @@ -331,7 +332,7 @@ def dd_particle_send(prog): # ============================================================================= -@njit +@trace.njit() def dd_get_recv_tag(): with objmode(tag="int64"): @@ -342,7 +343,7 @@ def dd_get_recv_tag(): return tag -@njit +@trace.njit() def dd_recv_particles(mcdc): buff = np.zeros( @@ -370,7 +371,7 @@ def dd_recv_particles(mcdc): mcdc["domain_decomp"]["rank_busy"] = True -@njit +@trace.njit() def dd_recv_turnstile(mcdc): with objmode(busy_delta="int64", send_delta="int64"): @@ -391,7 +392,7 @@ def dd_recv_turnstile(mcdc): dd_signal_halt(mcdc) -@njit +@trace.njit() def dd_recv_halt(mcdc): with objmode(): @@ -403,7 +404,7 @@ def dd_recv_halt(mcdc): mcdc["domain_decomp"]["work_done"] = True -@njit +@trace.njit() def dd_recv(mcdc): if mcdc["domain_decomp"]["rank_busy"]: @@ -427,7 +428,7 @@ def dd_recv(mcdc): # Check if particle is in domain -@njit +@trace.njit() def particle_in_domain(P_arr, mcdc): P = P_arr[0] d_idx = mcdc["dd_idx"] @@ -455,7 +456,7 @@ def particle_in_domain(P_arr, mcdc): # Check for source in domain -@njit +@trace.njit() def source_in_domain(source, domain_mesh, d_idx): d_Nx = domain_mesh["x"].size - 1 d_Ny = domain_mesh["y"].size - 1 @@ -498,7 +499,7 @@ def source_in_domain(source, domain_mesh, d_idx): # ============================================================================= -@njit +@trace.njit() def domain_work(mcdc, domain, N): domain_mesh = mcdc["technique"]["dd_mesh"] @@ -622,7 +623,7 @@ def domain_work(mcdc, domain, N): # ============================================================================= -@njit() +@trace.njit() def source_particle_dd(seed, mcdc): domain_mesh = mcdc["technique"]["dd_mesh"] d_idx = mcdc["dd_idx"] @@ -698,28 +699,32 @@ def source_particle_dd(seed, mcdc): return P -@njit -def distribute_work_dd(N, mcdc, precursor=False): +@trace.njit() +def distribute_work_dd(N, mcdc): work_start = 0 work_size = N work_size_total = N - if not precursor: - mcdc["mpi_work_start"] = work_start - mcdc["mpi_work_size"] = work_size - mcdc["mpi_work_size_total"] = work_size_total - else: - mcdc["mpi_work_start_precursor"] = work_start - mcdc["mpi_work_size_precursor"] = work_size - mcdc["mpi_work_size_total_precursor"] = work_size_total + mcdc["mpi_work_start"] = work_start + mcdc["mpi_work_size"] = work_size + mcdc["mpi_work_size_total"] = work_size_total +@trace.njit() +def distribute_work_dd_precursor(N, mcdc): + work_start = 0 + work_size = N + work_size_total = N + + mcdc["mpi_work_start_precursor"] = work_start + mcdc["mpi_work_size_precursor"] = work_size + mcdc["mpi_work_size_total_precursor"] = work_size_total # ============================================================================= # Random sampling # ============================================================================= -@njit +@trace.njit() def sample_isotropic_direction(P_arr): P = P_arr[0] # Sample polar cosine and azimuthal angle uniformly @@ -734,7 +739,7 @@ def sample_isotropic_direction(P_arr): return x, y, z -@njit +@trace.njit() def sample_white_direction(nx, ny, nz, P_arr): P = P_arr[0] # Sample polar cosine @@ -765,14 +770,14 @@ def sample_white_direction(nx, ny, nz, P_arr): return x, y, z -@njit +@trace.njit() def sample_uniform(a, b, P_arr): P = P_arr[0] return a + rng(P_arr) * (b - a) # TODO: use cummulative density function and binary search -@njit +@trace.njit() def sample_discrete(group, P_arr): P = P_arr[0] tot = 0.0 @@ -783,7 +788,7 @@ def sample_discrete(group, P_arr): return i -@njit +@trace.njit() def sample_piecewise_linear(cdf, P_arr): P = P_arr[0] xi = rng(P_arr) @@ -805,12 +810,12 @@ def sample_piecewise_linear(cdf, P_arr): # ============================================================================= -@njit +@trace.njit() def wrapping_mul(a, b): return a * b -@njit +@trace.njit() def wrapping_add(a, b): return a + b @@ -836,7 +841,7 @@ def adapt_rng(object_mode=False): wrapping_mul = wrapping_mul_python -@njit +@trace.njit() def split_seed(key, seed): """murmur_hash64a""" multiplier = uint64(0xC6A4A7935BD1E995) @@ -859,25 +864,25 @@ def split_seed(key, seed): return hash_value -@njit +@trace.njit() def rng_(seed): seed = uint64(seed) return wrapping_add(wrapping_mul(RNG_G, seed), RNG_C) & RNG_MOD_MASK -@njit +@trace.njit() def rng(state_arr): state = state_arr[0] state["rng_seed"] = rng_(state["rng_seed"]) return state["rng_seed"] / RNG_MOD -@njit +@trace.njit() def rng_from_seed(seed): return rng_(seed) / RNG_MOD -@njit +@trace.njit() def rng_array(seed, shape, size): xi = np.zeros(size) for i in range(size): @@ -892,7 +897,7 @@ def rng_array(seed, shape, size): # ============================================================================= -@njit +@trace.njit() def source_particle(P_rec_arr, seed, mcdc): P_rec = P_rec_arr[0] P_rec["rng_seed"] = seed @@ -956,17 +961,17 @@ def source_particle(P_rec_arr, seed, mcdc): # ============================================================================= -@njit +@trace.njit() def get_bank_size(bank): return bank["size"][0] -@njit +@trace.njit() def set_bank_size(bank, value): bank["size"][0] = value -@njit +@trace.njit() def add_bank_size(bank, value): return adapt.global_add(bank["size"], 0, value) @@ -982,7 +987,7 @@ def full_bank_print(bank): pass -@njit +@trace.njit() def add_particle(P_arr, bank): P = P_arr[0] @@ -996,7 +1001,7 @@ def add_particle(P_arr, bank): copy_recordlike(bank["particles"][idx : idx + 1], P_arr) -@njit +@trace.njit() def get_particle(P_arr, bank, mcdc): P = P_arr[0] @@ -1035,7 +1040,7 @@ def get_particle(P_arr, bank, mcdc): return True -@njit +@trace.njit() def check_future_bank(mcdc): # Get the data needed bank_future = mcdc["bank_future"] @@ -1066,7 +1071,7 @@ def check_future_bank(mcdc): ) -@njit +@trace.njit() def manage_particle_banks(seed, mcdc): # Record time if mcdc["mpi_master"]: @@ -1110,7 +1115,7 @@ def manage_particle_banks(seed, mcdc): mcdc["runtime_bank_management"] += time_end - time_start -@njit +@trace.njit() def manage_IC_bank(mcdc): # Buffer bank buff_n = np.zeros( @@ -1162,7 +1167,7 @@ def manage_IC_bank(mcdc): set_bank_size(mcdc["technique"]["IC_bank_precursor_local"], 0) -@njit +@trace.njit() def bank_scanning(bank, mcdc): N_local = get_bank_size(bank) @@ -1181,7 +1186,7 @@ def bank_scanning(bank, mcdc): return idx_start, N_local, N_global -@njit +@trace.njit() def bank_scanning_weight(bank, mcdc): # Local weight CDF N_local = get_bank_size(bank) @@ -1206,7 +1211,7 @@ def bank_scanning_weight(bank, mcdc): return w_start, w_cdf, W_global -@njit +@trace.njit() def bank_scanning_DNP(bank, mcdc): N_DNP_local = get_bank_size(bank) @@ -1231,7 +1236,7 @@ def bank_scanning_DNP(bank, mcdc): return idx_start, N_local, N_global -@njit +@trace.njit() def normalize_weight(bank, norm): # Get total weight W = total_weight(bank) @@ -1241,7 +1246,7 @@ def normalize_weight(bank, norm): P["w"] *= norm / W -@njit +@trace.njit() def total_weight(bank): # Local total weight W_local = np.zeros(1) @@ -1255,7 +1260,7 @@ def total_weight(bank): return buff[0] -@njit +@trace.njit() def allreduce(value): total = np.zeros(1, np.float64) with objmode(): @@ -1263,7 +1268,7 @@ def allreduce(value): return total[0] -@njit +@trace.njit() def allreduce_array(array): buff = np.zeros_like(array) with objmode(): @@ -1271,7 +1276,7 @@ def allreduce_array(array): array[:] = buff -@njit +@trace.njit() def bank_rebalance(mcdc): # Scan the bank idx_start, N_local, N = bank_scanning(mcdc["bank_source"], mcdc) @@ -1356,8 +1361,8 @@ def bank_rebalance(mcdc): mcdc["bank_source"]["particles"][i] = buff[i] -@njit -def distribute_work(N, mcdc, precursor=False): +@trace.njit() +def distribute_work(N, mcdc): size = mcdc["mpi_size"] rank = mcdc["mpi_rank"] @@ -1380,14 +1385,38 @@ def distribute_work(N, mcdc, precursor=False): else: work_start += rem - if not precursor: - mcdc["mpi_work_start"] = work_start - mcdc["mpi_work_size"] = work_size - mcdc["mpi_work_size_total"] = work_size_total + mcdc["mpi_work_start"] = work_start + mcdc["mpi_work_size"] = work_size + mcdc["mpi_work_size_total"] = work_size_total + + +@trace.njit() +def distribute_work_precursor(N, mcdc): + size = mcdc["mpi_size"] + rank = mcdc["mpi_rank"] + + # Total # of work + work_size_total = N + + # Evenly distribute work + work_size = math.floor(N / size) + + # Starting index (based on even distribution) + work_start = work_size * rank + + # Count reminder + rem = N % size + + # Assign reminder and update starting index + if rank < rem: + work_size += 1 + work_start += rank else: - mcdc["mpi_work_start_precursor"] = work_start - mcdc["mpi_work_size_precursor"] = work_size - mcdc["mpi_work_size_total_precursor"] = work_size_total + work_start += rem + + mcdc["mpi_work_start_precursor"] = work_start + mcdc["mpi_work_size_precursor"] = work_size + mcdc["mpi_work_size_total_precursor"] = work_size_total # ============================================================================= @@ -1417,7 +1446,7 @@ def pp_over_one(): pass -@njit +@trace.njit() def bank_IC(P_arr, prog): P = P_arr[0] @@ -1529,7 +1558,7 @@ def bank_IC(P_arr, prog): # required due to pure-Python behavior of taking things by reference. -@njit +@trace.njit() def population_control(seed, mcdc): if mcdc["technique"]["pct"] == PCT_COMBING: pct_combing(seed, mcdc) @@ -1541,7 +1570,7 @@ def population_control(seed, mcdc): pct_splitting_roulette_weight(seed, mcdc) -@njit +@trace.njit() def pct_combing(seed, mcdc): bank_census = mcdc["bank_census"] M = mcdc["setting"]["N_particle"] @@ -1584,7 +1613,7 @@ def pct_combing(seed, mcdc): adapt.add_source(P_rec_arr, mcdc) -@njit +@trace.njit() def pct_combing_weight(seed, mcdc): bank_census = mcdc["bank_census"] M = mcdc["setting"]["N_particle"] @@ -1629,7 +1658,7 @@ def pct_combing_weight(seed, mcdc): adapt.add_source(P_rec_arr, mcdc) -@njit +@trace.njit() def pct_splitting_roulette(seed, mcdc): bank_census = mcdc["bank_census"] M = mcdc["setting"]["N_particle"] @@ -1678,7 +1707,7 @@ def pct_splitting_roulette(seed, mcdc): adapt.add_source(P_rec_arr, mcdc) -@njit +@trace.njit() def pct_splitting_roulette_weight(seed, mcdc): bank_census = mcdc["bank_census"] M = mcdc["setting"]["N_particle"] @@ -1730,7 +1759,7 @@ def pct_splitting_roulette_weight(seed, mcdc): # ============================================================================= -@njit +@trace.njit() def move_particle(P_arr, distance, mcdc): P = P_arr[0] P["x"] += P["ux"] * distance @@ -1739,7 +1768,7 @@ def move_particle(P_arr, distance, mcdc): P["t"] += distance / physics.get_speed(P_arr, mcdc) -@njit +@trace.njit() def copy_recordlike(P_new_arr, P_rec_arr): P_new = P_new_arr[0] P_rec = P_rec_arr[0] @@ -1757,7 +1786,7 @@ def copy_recordlike(P_new_arr, P_rec_arr): P_new["iqmc"]["w"] = P_rec["iqmc"]["w"] -@njit +@trace.njit() def copy_particle(P_new_arr, P_arr): P_new = P_new_arr[0] P = P_arr[0] @@ -1781,7 +1810,7 @@ def copy_particle(P_new_arr, P_arr): P_new["iqmc"]["w"] = P["iqmc"]["w"] -@njit +@trace.njit() def recordlike_to_particle(P_new_arr, P_rec_arr): P_new = P_new_arr[0] P_rec = P_rec_arr[0] @@ -1794,7 +1823,7 @@ def recordlike_to_particle(P_new_arr, P_rec_arr): P_new["event"] = -1 -@njit +@trace.njit() def split_as_record(P_new_rec_arr, P_rec_arr): P_rec = P_rec_arr[0] P_new_rec = P_new_rec_arr[0] @@ -1808,7 +1837,7 @@ def split_as_record(P_new_rec_arr, P_rec_arr): # ============================================================================= -@njit +@trace.njit() def mesh_get_angular_index(P_arr, mesh): P = P_arr[0] ux = P["ux"] @@ -1825,7 +1854,7 @@ def mesh_get_angular_index(P_arr, mesh): return mu, azi -@njit +@trace.njit() def mesh_get_energy_index(P_arr, mesh, mode_MG): P = P_arr[0] # Check if outside grid @@ -1849,7 +1878,7 @@ def mesh_get_energy_index(P_arr, mesh, mode_MG): # ============================================================================= -@njit +@trace.njit() def score_mesh_tally(P_arr, distance, tally, data, mcdc): P = P_arr[0] tally_bin = data[TALLY] @@ -2026,7 +2055,7 @@ def score_mesh_tally(P_arr, distance, tally, data, mcdc): idx += stride["t"] -@njit +@trace.njit() def score_surface_tally(P_arr, surface, tally, data, mcdc): # TODO: currently not supporting filters P = P_arr[0] @@ -2052,7 +2081,7 @@ def score_surface_tally(P_arr, surface, tally, data, mcdc): adapt.global_add(tally_bin, (TALLY_SCORE, idx + i), round(score)) -@njit +@trace.njit() def score_cell_tally(P_arr, distance, tally, data, mcdc): P = P_arr[0] tally_bin = data[TALLY] @@ -2128,7 +2157,7 @@ def score_cell_tally(P_arr, distance, tally, data, mcdc): idx += stride["t"] -@njit +@trace.njit() def score_cs_tally(P_arr, distance, tally, data, mcdc): # Each time that this function is called, EVERY cs bin needs to be checked to see if the particle is in it. # The particle needs to score into all the bins that it is within @@ -2212,7 +2241,7 @@ def score_cs_tally(P_arr, distance, tally, data, mcdc): ) -@njit +@trace.njit() def cs_clip(p, q, t0, t1): if p < 0: t = q / p @@ -2231,7 +2260,7 @@ def cs_clip(p, q, t0, t1): return True, t0, t1 -@njit +@trace.njit() def cs_tracklength_in_box(start, end, x_min, x_max, y_min, y_max): # Uses Liang-Barsky algorithm for finding tracklength in box t0, t1 = 0.0, 1.0 @@ -2266,7 +2295,7 @@ def cs_tracklength_in_box(start, end, x_min, x_max, y_min, y_max): return math.sqrt(X**2 + Y**2) -@njit +@trace.njit() def calculate_distance_in_coarse_bin(start, end, distance, center, cs_bin_size): # Edges of the coarse bin x_min = center[0] - cs_bin_size[0] / 2 @@ -2279,7 +2308,7 @@ def calculate_distance_in_coarse_bin(start, end, distance, center, cs_bin_size): return distance_inside -@njit +@trace.njit() def dd_reduce(data, mcdc): tally_bin = data[TALLY] @@ -2311,7 +2340,7 @@ def dd_reduce(data, mcdc): dd_comm.Free() -@njit +@trace.njit() def tally_reduce(data, mcdc): tally_bin = data[TALLY] N_bin = tally_bin.shape[1] @@ -2339,7 +2368,7 @@ def tally_reduce(data, mcdc): dd_reduce(data, mcdc) -@njit +@trace.njit() def tally_accumulate(data, mcdc): tally_bin = data[TALLY] N_bin = tally_bin.shape[1] @@ -2354,7 +2383,7 @@ def tally_accumulate(data, mcdc): tally_bin[TALLY_SCORE, i] = 0.0 -@njit +@trace.njit() def census_based_tally_output(data, mcdc): idx_batch = mcdc["idx_batch"] idx_census = mcdc["idx_census"] @@ -2456,7 +2485,7 @@ def census_based_tally_output(data, mcdc): f.close() -@njit +@trace.njit() def dd_closeout(data, mcdc): tally_bin = data[TALLY] @@ -2492,7 +2521,7 @@ def dd_closeout(data, mcdc): dd_comm.Free() -@njit +@trace.njit() def tally_closeout(data, mcdc): tally = data[TALLY] N_history = mcdc["setting"]["N_particle"] @@ -2544,7 +2573,7 @@ def tally_closeout(data, mcdc): # ============================================================================= -@njit +@trace.njit() def eigenvalue_tally(P_arr, distance, mcdc): P = P_arr[0] material = mcdc["materials"][P["material_ID"]] @@ -2600,7 +2629,7 @@ def eigenvalue_tally(P_arr, distance, mcdc): mcdc["C_max"] = C_density -@njit +@trace.njit() def eigenvalue_tally_closeout_history(mcdc): N_particle = mcdc["setting"]["N_particle"] @@ -2743,7 +2772,7 @@ def eigenvalue_tally_closeout_history(mcdc): mcdc["gyration_radius"][idx_cycle] = rms -@njit +@trace.njit() def eigenvalue_tally_closeout(mcdc): N = mcdc["setting"]["N_active"] mcdc["n_avg"] /= N @@ -2761,7 +2790,7 @@ def eigenvalue_tally_closeout(mcdc): # ====================================================================================== -@njit +@trace.njit() def move_to_event(P_arr, data, mcdc): # ================================================================================== # Preparation (as needed) @@ -2879,7 +2908,7 @@ def move_to_event(P_arr, data, mcdc): move_particle(P_arr, distance, mcdc) -@njit +@trace.njit() def distance_to_collision(P_arr, mcdc): P = P_arr[0] # Get total cross-section @@ -2901,7 +2930,7 @@ def distance_to_collision(P_arr, mcdc): # ============================================================================= -@njit +@trace.njit() def surface_crossing(P_arr, data, prog): P = P_arr[0] mcdc = adapt.mcdc_global(prog) @@ -2931,7 +2960,7 @@ def surface_crossing(P_arr, data, prog): # ============================================================================= -@njit +@trace.njit() def collision(P_arr, mcdc): P = P_arr[0] # Get the reaction cross-sections @@ -2965,7 +2994,7 @@ def collision(P_arr, mcdc): # ============================================================================= -@njit +@trace.njit() def scattering(P_arr, prog): P = P_arr[0] mcdc = adapt.mcdc_global(prog) @@ -3015,7 +3044,7 @@ def scattering(P_arr, prog): adapt.add_active(P_new_arr, prog) -@njit +@trace.njit() def sample_phasespace_scattering(P_arr, material, P_new_arr, mcdc): P_new = P_new_arr[0] P = P_arr[0] @@ -3031,7 +3060,7 @@ def sample_phasespace_scattering(P_arr, material, P_new_arr, mcdc): scattering_CE(P_arr, material, P_new_arr, mcdc) -@njit +@trace.njit() def sample_phasespace_scattering_nuclide(P_arr, nuclide, P_new_arr): P_new = P_new_arr[0] P = P_arr[0] @@ -3044,7 +3073,7 @@ def sample_phasespace_scattering_nuclide(P_arr, nuclide, P_new_arr): scattering_MG(P_arr, nuclide, P_new_arr) -@njit +@trace.njit() def scattering_MG(P_arr, material, P_new_arr): P_new = P_new_arr[0] P = P_arr[0] @@ -3072,7 +3101,7 @@ def scattering_MG(P_arr, material, P_new_arr): P_new["g"] = g_out -@njit +@trace.njit() def scattering_CE(P_arr, material, P_new_arr, mcdc): P_new = P_new_arr[0] P = P_arr[0] @@ -3165,7 +3194,7 @@ def scattering_CE(P_arr, material, P_new_arr, mcdc): P_new["uz"] = vz / P_speed -@njit +@trace.njit() def sample_nucleus_speed(A, P_arr, mcdc): P = P_arr[0] # Particle speed @@ -3207,7 +3236,7 @@ def sample_nucleus_speed(A, P_arr, mcdc): return Vx, Vy, Vz -@njit +@trace.njit() def scatter_direction(ux, uy, uz, mu0, azi): cos_azi = math.cos(azi) sin_azi = math.sin(azi) @@ -3238,7 +3267,7 @@ def scatter_direction(ux, uy, uz, mu0, azi): # ============================================================================= -@njit +@trace.njit() def fission(P_arr, prog): P = P_arr[0] mcdc = adapt.mcdc_global(prog) @@ -3326,7 +3355,7 @@ def fission(P_arr, prog): adapt.add_future(P_new_arr, prog) -@njit +@trace.njit() def sample_phasespace_fission(P_arr, material, P_new_arr, mcdc): P_new = P_new_arr[0] P = P_arr[0] @@ -3397,7 +3426,7 @@ def sample_phasespace_fission(P_arr, material, P_new_arr, mcdc): P_new["t"] -= math.log(xi) / decay -@njit +@trace.njit() def sample_phasespace_fission_nuclide(P_arr, nuclide, P_new_arr, mcdc): P_new = P_new_arr[0] P = P_arr[0] @@ -3416,7 +3445,7 @@ def sample_phasespace_fission_nuclide(P_arr, nuclide, P_new_arr, mcdc): fission_CE(P_arr, nuclide, P_new_arr) -@njit +@trace.njit() def fission_MG(P_arr, nuclide, P_new_arr): P_new = P_new_arr[0] P = P_arr[0] @@ -3461,7 +3490,7 @@ def fission_MG(P_arr, nuclide, P_new_arr): P_new["t"] -= math.log(xi) / decay -@njit +@trace.njit() def fission_CE(P_arr, nuclide, P_new_arr): P_new = P_new_arr[0] P = P_arr[0] @@ -3552,7 +3581,7 @@ def fission_CE(P_arr, nuclide, P_new_arr): # ============================================================================= -@njit +@trace.njit() def branchless_collision(P_arr, prog): P = P_arr[0] mcdc = adapt.mcdc_global(prog) @@ -3593,7 +3622,7 @@ def branchless_collision(P_arr, prog): # ============================================================================= -@njit +@trace.njit() def weight_window(P_arr, prog): P = P_arr[0] mcdc = adapt.mcdc_global(prog) @@ -3645,7 +3674,7 @@ def weight_window(P_arr, prog): P["w"] = w_target -@njit +@trace.njit() def update_weight_windows(data, mcdc): idx_batch = mcdc["idx_batch"] idx_census = mcdc["idx_census"] @@ -3697,7 +3726,7 @@ def update_weight_windows(data, mcdc): # ============================================================================= -@njit +@trace.njit() def weight_roulette(P_arr, mcdc): P = P_arr[0] w_survive = mcdc["technique"]["wr_survive"] @@ -3715,7 +3744,7 @@ def weight_roulette(P_arr, mcdc): # ============================================================================= -@njit +@trace.njit() def get_MacroXS(type_, material, P_arr, mcdc): P = P_arr[0] # Multigroup XS @@ -3772,7 +3801,7 @@ def get_MacroXS(type_, material, P_arr, mcdc): return MacroXS -@njit +@trace.njit() def get_microXS(type_, nuclide, E): # Cross sections if type_ == XS_TOTAL: @@ -3819,7 +3848,7 @@ def get_microXS(type_, nuclide, E): return nu * xs -@njit +@trace.njit() def get_XS(data, E, E_grid, NE): # Search XS energy bin index idx = binary_search_with_length(E, E_grid, NE) @@ -3839,7 +3868,7 @@ def get_XS(data, E, E_grid, NE): return XS1 + (E - E1) * (XS2 - XS1) / (E2 - E1) -@njit +@trace.njit() def get_nu_group(type_, nuclide, E, group): if type_ == NU_FISSION: nu = get_XS(nuclide["ce_nu_p"], E, nuclide["E_nu_p"], nuclide["NE_nu_p"]) @@ -3866,12 +3895,12 @@ def get_nu_group(type_, nuclide, E, group): ) -@njit +@trace.njit() def get_nu(type_, nuclide, E): return get_nu_group(type_, nuclide, E, -1) -@njit +@trace.njit() def sample_nuclide(material, P_arr, type_, mcdc): P = P_arr[0] xi = rng(P_arr) * get_MacroXS(type_, material, P_arr, mcdc) @@ -3888,7 +3917,7 @@ def sample_nuclide(material, P_arr, type_, mcdc): return nuclide -@njit +@trace.njit() def sample_Eout(P_new_arr, E_grid, NE, chi): P_new = P_new_arr[0] xi = rng(P_new_arr) @@ -3909,7 +3938,7 @@ def sample_Eout(P_new_arr, E_grid, NE, chi): # ============================================================================= -@njit +@trace.njit() def lartg(f, g): """ Originally a Lapack routine to generate a plane rotation with @@ -3937,7 +3966,7 @@ def lartg(f, g): return c, s, r -@njit +@trace.njit() def modified_gram_schmidt(V, u): """ Modified Gram Schmidt routine @@ -3958,7 +3987,7 @@ def modified_gram_schmidt(V, u): # ============================================================================= -@njit +@trace.njit() def uq_resample(mean, delta, info): # Currently only uniform distribution shape = mean.shape @@ -3968,7 +3997,7 @@ def uq_resample(mean, delta, info): return mean + (2 * xi - 1) * delta -@njit +@trace.njit() def reset_material(mcdc, idm, material_uq): # Assumes all nuclides have already been re-sampled # Basic XS @@ -4039,7 +4068,7 @@ def reset_material(mcdc, idm, material_uq): material["nu_f"] += material["nu_d"][:, j] -@njit +@trace.njit() def reset_nuclide(nuclide, nuclide_uq): for name in literal_unroll( ("speed", "decay", "capture", "fission", "nu_s", "nu_p") @@ -4105,7 +4134,7 @@ def reset_nuclide(nuclide, nuclide_uq): nuclide["chi_d"][dg, :] /= np.sum(nuclide["chi_d"][dg, :]) -@njit +@trace.njit() def uq_reset(mcdc, seed): # Types of uq parameters: materials, nuclides N = len(mcdc["technique"]["uq_"]["nuclides"]) @@ -4125,7 +4154,7 @@ def uq_reset(mcdc, seed): reset_material(mcdc, idm, mcdc["technique"]["uq_"]["materials"][i]) -@njit +@trace.njit() def uq_tally_closeout_history(data, mcdc): tally_bin = data[TALLY] @@ -4136,7 +4165,7 @@ def uq_tally_closeout_history(data, mcdc): tally_bin[TALLY_UQ_BATCH] = tally_bin[TALLY_SCORE] -@njit +@trace.njit() def uq_tally_closeout_batch(data, mcdc): tally_bin = data[TALLY] @@ -4153,7 +4182,7 @@ def uq_tally_closeout_batch(data, mcdc): tally_bin[TALLY_UQ_BATCH_VAR][:] = buff -@njit +@trace.njit() def uq_tally_closeout(data, mcdc): tally_bin = data[TALLY] diff --git a/mcdc/loop.py b/mcdc/loop.py index 6a603026e..c8a878ddf 100644 --- a/mcdc/loop.py +++ b/mcdc/loop.py @@ -9,6 +9,7 @@ import mcdc.kernel as kernel import mcdc.print_ as print_module import mcdc.type_ as type_ +import mcdc.trace as trace from mcdc.constant import * from mcdc.print_ import ( @@ -44,12 +45,12 @@ # If GPU execution is supported and selected, the functions shown below will # be redefined to overwrite the above symbols and perform initialization/ # finalization of GPU state -@njit +@trace.njit() def setup_gpu(mcdc): pass -@njit +@trace.njit() def teardown_gpu(mcdc): pass @@ -59,7 +60,7 @@ def teardown_gpu(mcdc): # ========================================================================= -@njit +@trace.njit() def loop_fixed_source(data_arr, mcdc_arr): # Ensure `data` and `mcdc` exist for the lifetime of the program @@ -177,7 +178,7 @@ def loop_fixed_source(data_arr, mcdc_arr): # ========================================================================= -@njit +@trace.njit() def loop_eigenvalue(data_arr, mcdc_arr): # Ensure `data` and `mcdc` exist for the lifetime of the program # by intentionally leaking their memory @@ -228,7 +229,7 @@ def loop_eigenvalue(data_arr, mcdc_arr): # ============================================================================= -@njit +@trace.njit() def generate_source_particle(work_start, idx_work, seed, prog): mcdc = adapt.mcdc_global(prog) @@ -315,7 +316,7 @@ def generate_source_particle(work_start, idx_work, seed, prog): """ -@njit +@trace.njit() def prep_particle(P_arr, prog): P = P_arr[0] mcdc = adapt.mcdc_global(prog) @@ -325,7 +326,7 @@ def prep_particle(P_arr, prog): kernel.weight_window(P_arr, prog) -@njit +@trace.njit() def exhaust_active_bank(data, prog): mcdc = adapt.mcdc_global(prog) P_arr = adapt.local_array(1, type_.particle) @@ -342,7 +343,7 @@ def exhaust_active_bank(data, prog): loop_particle(P_arr, data, mcdc) -@njit +@trace.njit() def source_closeout(prog, idx_work, N_prog, data): mcdc = adapt.mcdc_global(prog) @@ -363,7 +364,7 @@ def source_closeout(prog, idx_work, N_prog, data): print_progress(percent, mcdc) -@njit +@trace.njit() def source_dd_resolution(data, prog): mcdc = adapt.mcdc_global(prog) @@ -418,7 +419,7 @@ def source_dd_resolution(data, prog): terminated = True -@njit +@trace.njit() def loop_source(seed, data, mcdc): # Progress bar indicator N_prog = 0 @@ -487,7 +488,7 @@ def step(prog: nb.uintp, P_input: adapt.particle_gpu): ASYNC_EXECUTION = config.args.gpu_strat == "async" -@njit(cache=caching) +@trace.njit(cache=caching) def gpu_loop_source(seed, data, mcdc): # Progress bar indicator @@ -562,7 +563,7 @@ def gpu_loop_source(seed, data, mcdc): # ========================================================================= -@njit +@trace.njit() def loop_particle(P_arr, data, prog): P = P_arr[0] mcdc = adapt.mcdc_global(prog) @@ -571,7 +572,7 @@ def loop_particle(P_arr, data, prog): step_particle(P_arr, data, prog) -@njit +@trace.njit() def step_particle(P_arr, data, prog): P = P_arr[0] mcdc = adapt.mcdc_global(prog) @@ -643,7 +644,7 @@ def step_particle(P_arr, data, prog): # ============================================================================= -@njit +@trace.njit() def generate_precursor_particle(DNP_arr, particle_idx, seed, prog): mcdc = adapt.mcdc_global(prog) DNP = DNP_arr[0] @@ -728,7 +729,7 @@ def generate_precursor_particle(DNP_arr, particle_idx, seed, prog): adapt.add_active(P_new_arr, prog) -@njit +@trace.njit() def source_precursor_closeout(prog, idx_work, N_prog, data): mcdc = adapt.mcdc_global(prog) @@ -744,7 +745,7 @@ def source_precursor_closeout(prog, idx_work, N_prog, data): print_progress(percent, mcdc) -@njit +@trace.njit() def loop_source_precursor(seed, data, mcdc): # Progress bar indicator N_prog = 0 @@ -836,7 +837,7 @@ def step(prog: nb.uintp, P_input: adapt.particle_gpu): ) -@njit(cache=caching) +@trace.njit(cache=caching) def gpu_loop_source_precursor(seed, data, mcdc): # Progress bar indicator @@ -959,7 +960,7 @@ def build_gpu_progs(input_deck, args): pre_complete = pre_fns["complete"] pre_clear_flags = pre_fns["clear_flags"] - @njit + @trace.njit() def real_setup_gpu(mcdc): src_set_device(device_id) arena_size = ARENA_SIZE @@ -974,7 +975,7 @@ def real_setup_gpu(mcdc): pre_init_program(mcdc["precursor_program_pointer"], BLOCK_COUNT) return - @njit + @trace.njit() def real_teardown_gpu(mcdc): src_free_program(adapt.cast_uintp_to_voidptr(mcdc["source_program_pointer"])) pre_free_program(adapt.cast_uintp_to_voidptr(mcdc["precursor_program_pointer"])) diff --git a/mcdc/main.py b/mcdc/main.py index a0c962747..62ba53e7b 100644 --- a/mcdc/main.py +++ b/mcdc/main.py @@ -41,6 +41,7 @@ # Get input_deck import mcdc.global_ as mcdc_ +import mcdc.trace as trace input_deck = mcdc_.input_deck @@ -100,6 +101,9 @@ def run(): MPI.COMM_WORLD.Barrier() mcdc["runtime_total"] = MPI.Wtime() - total_start + if config.trace: + trace.output_report(mcdc) + # Closout closeout(mcdc) @@ -408,6 +412,12 @@ def dd_mesh_bounds(idx): xmesh_idx = d_idx % d_Nx # find spatial boundaries of subdomain + xlen = len(input_deck.mesh_tallies[idx].x) + ylen = len(input_deck.mesh_tallies[idx].y) + zlen = len(input_deck.mesh_tallies[idx].z) + if (xlen == 2) and (ylen == 2) and (zlen == 2): + return 0, 1, 0, 1, 0, 1 + xn = input_deck.technique["dd_mesh"]["x"][xmesh_idx] xp = input_deck.technique["dd_mesh"]["x"][xmesh_idx + 1] yn = input_deck.technique["dd_mesh"]["y"][ymesh_idx] @@ -510,6 +520,8 @@ def prepare(): # Make types # ========================================================================= + type_.make_type_trace_slot() + type_.make_type_trace(config.trace_slot_limit) type_.make_type_particle(input_deck) type_.make_type_particle_record(input_deck) type_.make_type_nuclide(input_deck) @@ -848,6 +860,10 @@ def prepare(): tally_size = 0 # Mesh tallies + mcdc["technique"]["dd_xsum"] = 1 + mcdc["technique"]["dd_ysum"] = 1 + mcdc["technique"]["dd_zsum"] = 1 + for i in range(N_mesh_tally): # Direct assignment copy_field(mcdc["mesh_tallies"][i], input_deck.mesh_tallies[i], "N_bin") @@ -861,9 +877,9 @@ def prepare(): ) else: # decomposed mesh filters - mcdc["technique"]["dd_xsum"] = len(input_deck.mesh_tallies[i].x) - 1 - mcdc["technique"]["dd_ysum"] = len(input_deck.mesh_tallies[i].y) - 1 - mcdc["technique"]["dd_zsum"] = len(input_deck.mesh_tallies[i].z) - 1 + mcdc["technique"]["dd_xsum"] = max(mcdc["technique"]["dd_xsum"],len(input_deck.mesh_tallies[i].x) - 1) + mcdc["technique"]["dd_ysum"] = max(mcdc["technique"]["dd_ysum"],len(input_deck.mesh_tallies[i].y) - 1) + mcdc["technique"]["dd_zsum"] = max(mcdc["technique"]["dd_zsum"],len(input_deck.mesh_tallies[i].z) - 1) mxn, mxp, myn, myp, mzn, mzp = dd_mesh_bounds(i) @@ -1738,7 +1754,7 @@ def dd_mergetally(mcdc, data): return dd_tally -def dd_mergemesh(mcdc, data): +def dd_mergemesh(mcdc): """ Performs mesh recombination on domain-decomposed mesh tallies. Gathers and re-organizes mesh data into a single array as it @@ -1790,10 +1806,7 @@ def dd_mergemesh(mcdc, data): if d_Nz > 1: sendcounts = np.array( - MPI.COMM_WORLD.gather( - len(mcdc["mesh_tallies"][0]["filter"]["z"]) - 1, root=0 - ) - ) + MPI.COMM_WORLD.gather(len(mcdc["mesh_tallies"][0]["filter"]["z"]), root=0)) if mcdc["mpi_master"]: z_filter = np.zeros((mcdc["mesh_tallies"].shape[0], sum(sendcounts))) else: @@ -1828,10 +1841,16 @@ def dd_mergemesh(mcdc, data): def generate_hdf5(data, mcdc): - if mcdc["technique"]["domain_decomposition"]: dd_tally = dd_mergetally(mcdc, data) - dd_mesh = dd_mergemesh(mcdc, data) + dd_mesh = dd_mergemesh(mcdc) + if mcdc["mpi_master"]: + print(dd_tally) + np.save("tally_dd.npy", dd_tally[1]) + else: + if mcdc["mpi_master"]: + print(data[TALLY]) + np.save("tally_nondd.npy", data[TALLY][1]) if mcdc["mpi_master"]: if mcdc["setting"]["progress_bar"]: @@ -1913,9 +1932,10 @@ def generate_hdf5(data, mcdc): # Set tally shape N_score = tally["N_score"] if mcdc["technique"]["domain_decomposition"]: - Nx = mcdc["technique"]["dd_xsum"] - Ny = mcdc["technique"]["dd_ysum"] - Nz = mcdc["technique"]["dd_zsum"] + if Nx or Ny or Nz: # check if spatial mesh exists + Nx = mcdc["technique"]["dd_xsum"] + Ny = mcdc["technique"]["dd_ysum"] + Nz = mcdc["technique"]["dd_zsum"] if not mcdc["technique"]["uq"]: shape = (3, Nmu, N_azi, Ng, Nt, Nx, Ny, Nz, N_score) else: diff --git a/mcdc/trace.py b/mcdc/trace.py new file mode 100644 index 000000000..0326e3a5c --- /dev/null +++ b/mcdc/trace.py @@ -0,0 +1,293 @@ +import inspect +import mcdc.config as config +import mcdc.adapt as adapt +import mcdc.type_ as type_ +import numba +import ctypes +import time +import subprocess +import os +from mpi4py import MPI +from llvmlite import binding +import numpy as np + + +CACH_PATH = './__trace_cache__' + +time_code = """ +#include +#include +#include +extern "C" +int64_t mono_clock() { + auto now = std::chrono::steady_clock::now(); + auto dur = std::chrono::duration_cast(now.time_since_epoch()).count(); + return dur; +} +""" + + + +mono_clock = None + +@numba.njit() +def extern_gpu_clock_rate (): + return 1000000000 + +if config.trace: + if not os.path.exists(CACH_PATH): + os.makedirs(CACH_PATH) + base_path = f"{CACH_PATH}/trace" + code_path = f"{base_path}.cpp" + lib_path = f"{base_path}.so" + file = open(code_path,"w") + file.write(time_code) + file.close() + cmd = f"g++ {code_path} --shared -fPIC -o {lib_path}" + subprocess.run(cmd.split(),shell=False,check=True) + abs_lib_path = os.path.abspath(lib_path) + binding.load_library_permanently(abs_lib_path) + sig = numba.types.int64() + mono_clock = numba.types.ExternalFunction("mono_clock", sig) + extern_gpu_clock_rate = numba.types.ExternalFunction("wall_clock_rate", sig) + + +@numba.njit() +def gpu_clock_rate(): + return extern_gpu_clock_rate() + + + +trace_roster = {} + +trace_wrapper_template = """ +def trace_{id}_{name} ({arg_str}) : + {trace_state_extractor} + t0 = trace_get_clock() + result = func ({arg_str}) + t1 = trace_get_clock() + platform_index = trace_platform_index() + adapt.global_add(trace['slots'][{id}]['runtime_total'],platform_index, t1 - t0) + adapt.global_add(trace['slots'][{id}]['call_total'],platform_index,1) + return result +""" + +trace_wrapper_name_template = "trace_{name}" + + + +sig = numba.core.typing.signature +ext_fn = numba.types.ExternalFunction +gpu_get_wall_clock = ext_fn("get_wall_clock",sig(numba.types.int64)) + + +def get_clock(): + return time.monotonic_ns() + +@numba.core.extending.overload(get_clock, target="cpu") +def cpu_get_clock(): + def inner_get_clock(): + return mono_clock() + return inner_get_clock + +@numba.core.extending.overload(get_clock, target="gpu") +def gpu_get_clock(): + def inner_get_clock(): + return gpu_get_wall_clock() + return inner_get_clock + + +def platform_index(): + return 0 + +@numba.core.extending.overload(platform_index, target="cpu") +def cpu_platform_index(): + def inner_platform_index(): + return 1 + return inner_platform_index + +@numba.core.extending.overload(platform_index, target="gpu") +def gpu_platform_index(): + def inner_platform_index(): + return 2 + return inner_platform_index + + +def trace(transforms=[]): + + def trace_inner(func): + global get_clock + global platform_index + trace_get_clock = get_clock + trace_platform_index = platform_index + + name = func.__name__ + arg_set = inspect.signature(func).parameters + + trace_state_extractors = { + "mcdc": "trace = mcdc['trace']", + "prog": "trace = adapt.mcdc_global(prog)['trace']", + "mcdc_arr": "trace = mcdc_arr[0]['trace']", + } + + extractor_target = None + for target, extractor in trace_state_extractors.items(): + if target in arg_set: + extractor_target = target + break + + if config.trace and (extractor_target != None): + + global trace_roster + import mcdc.adapt as adapt + + for tr in transforms: + func = tr(func) + + if func not in trace_roster: + trace_roster[name] = {'id': len(trace_roster)} + + func_id = trace_roster[name]['id'] + arg_str = ",".join([arg for arg in arg_set]) + + trace_wrapper_source = trace_wrapper_template.format( + name=name, + arg_str=arg_str, + id=func_id, + trace_state_extractor=trace_state_extractors[extractor_target] + ) + exec(trace_wrapper_source,locals(),locals()) + trace_func = eval(f"trace_{func_id}_{name}") + return trace_func + else: + return func + + return trace_inner + + + +def njit(*args,**kwargs): + + def trace_njit_inner(func): + trace_func = trace(transforms=[numba.njit(*args,**kwargs)])(func) + if (trace_func == func): + return numba.njit(*args,**kwargs)(func) + else: + return numba.njit()(trace_func) + + return trace_njit_inner + + +def dd_mergetrace(mcdc): + d_Nx = mcdc["technique"]["dd_mesh"]["x"].size - 1 + d_Ny = mcdc["technique"]["dd_mesh"]["y"].size - 1 + d_Nz = mcdc["technique"]["dd_mesh"]["z"].size - 1 + i = 0 + for n in range(d_Nx * d_Ny * d_Nz): + dd_ranks = [] + for r in range(int(mcdc["technique"]["dd_work_ratio"][n])): + dd_ranks.append(i) + i += 1 + # create MPI Comm group out of subdomain processors + dd_group = MPI.COMM_WORLD.group.Incl(dd_ranks) + dd_comm = MPI.COMM_WORLD.Create(dd_group) + # MPI Reduce on subdomain processors + for name, info in trace_roster.items(): + func_id = info['id'] + + if MPI.COMM_NULL != dd_comm: + python_nsecs = dd_comm.reduce(mcdc['trace']['slots'][func_id]['runtime_total'][0], MPI.SUM) + python_calls = dd_comm.reduce(mcdc['trace']['slots'][func_id]['call_total'][0], MPI.SUM) + cpu_nsecs = dd_comm.reduce(mcdc['trace']['slots'][func_id]['runtime_total'][1], MPI.SUM) + cpu_calls = dd_comm.reduce(mcdc['trace']['slots'][func_id]['call_total'][1], MPI.SUM) + if mcdc["dd_local_rank"] == 0: + mcdc['trace']['slots'][func_id]['runtime_total'][0] = python_nsecs + mcdc['trace']['slots'][func_id]['call_total'][0] = python_calls + mcdc['trace']['slots'][func_id]['runtime_total'][1] = cpu_nsecs + mcdc['trace']['slots'][func_id]['call_total'][1] = cpu_calls + + # free comm group + dd_group.Free() + if MPI.COMM_NULL != dd_comm: + dd_comm.Free() + + +def output_report(mcdc): + + if not mcdc["technique"]["domain_decomposition"]: + report = open("report.csv","w") + report.write("function name, ") + report.write("python total runtime (ns), python total calls, ") + report.write("cpu total runtime (ns), cpu total calls, ") + report.write("gpu total runtime (mystery units), gpu total calls, ") + report.write("\n") + + gpu_rate = 1000000000 + if config.target == "gpu": + gpu_rate = gpu_clock_rate() + + multi_rank = True + + for name, info in trace_roster.items(): + func_id = info['id'] + slot = mcdc['trace']['slots'][func_id] + + if multi_rank: + slot_arr = np.empty((1,),type_.trace_slot) + MPI.COMM_WORLD.Allreduce(slot['runtime_total'],slot_arr[0]['runtime_total']) + MPI.COMM_WORLD.Allreduce(slot['call_total'],slot_arr[0]['call_total']) + slot['runtime_total'] = slot_arr[0]['runtime_total'] + slot['call_total'] = slot_arr[0]['call_total'] + + python_nsecs = slot['runtime_total'][0] + python_calls = slot['call_total'][0] + cpu_nsecs = slot['runtime_total'][1] + cpu_calls = slot['call_total'][1] + gpu_nsecs = slot['runtime_total'][2] * 1000000000.0 / gpu_rate + gpu_calls = slot['call_total'][2] + report.write(f"{name},") + report.write(f"{python_nsecs},{python_calls},") + report.write(f"{cpu_nsecs},{cpu_calls},") + report.write(f"{gpu_nsecs},{gpu_calls},") + report.write("\n") + report.close() + + else: # write report for each subdomain + dd_mergetrace(mcdc) + d_Nx = mcdc["technique"]["dd_mesh"]["x"].size - 1 + d_Ny = mcdc["technique"]["dd_mesh"]["y"].size - 1 + d_Nz = mcdc["technique"]["dd_mesh"]["z"].size - 1 + + i = 0 + for n in range(d_Nx * d_Ny * d_Nz): + if mcdc["dd_local_rank"] == 0 and mcdc["dd_idx"] == n: + report_name = f"report{n}.csv" + report = open(report_name, "w") + report.write("function name, ") + report.write("python total runtime (ns), python total calls, ") + report.write("cpu total runtime (ns), cpu total calls, ") + report.write("gpu total runtime (mystery units), gpu total calls, ") + report.write("\n") + + gpu_rate = 1000000000 + if config.target == "gpu": + gpu_rate = gpu_clock_rate() + + for name, info in trace_roster.items(): + func_id = info['id'] + slot = mcdc['trace']['slots'][func_id] + + python_nsecs = slot['runtime_total'][0] + python_calls = slot['call_total'][0] + cpu_nsecs = slot['runtime_total'][1] + cpu_calls = slot['call_total'][1] + gpu_nsecs = slot['runtime_total'][2] * 1000000000.0 / gpu_rate + gpu_calls = slot['call_total'][2] + report.write(f"{name},") + report.write(f"{python_nsecs},{python_calls},") + report.write(f"{cpu_nsecs},{cpu_calls},") + report.write(f"{gpu_nsecs},{gpu_calls},") + report.write("\n") + report.close() + + diff --git a/mcdc/type_.py b/mcdc/type_.py index d005d057b..6e8a45662 100644 --- a/mcdc/type_.py +++ b/mcdc/type_.py @@ -51,6 +51,9 @@ global_ = None tally = None +trace = None +trace_slot = None + # ============================================================================== # MC/DC Member Array Sizes @@ -727,6 +730,11 @@ def dd_meshtally(input_deck): Ny = 2 Nz = 2 for card in input_deck.mesh_tallies: + xlen = len(card.x) + ylen = len(card.y) + zlen = len(card.z) + if (xlen == 2) and (ylen == 2) and (zlen == 2): + continue # find boundary indices in tally mesh mesh_xn = int(np.where(card.x == xn)[0]) mesh_xp = int(np.where(card.x == xp)[0]) + 1 @@ -1582,6 +1590,7 @@ def make_type_global(input_deck): ("source_program_pointer", uintp), ("precursor_program_pointer", uintp), ("source_seed", uint64), + ("trace",trace) ] ) @@ -1696,3 +1705,25 @@ def make_type_mesh_(card): mesh_names = ["x", "y", "z", "t", "mu", "azi", "g"] + + +def make_type_trace_slot(): + global trace_slot + + trace_slot = into_dtype( + [ + ("runtime_total",int64,(3,)), + ("call_total",int64,(3,)) + ] + ) + + +def make_type_trace(trace_slot_limit): + global trace + + trace = into_dtype( + [ + ("slots", trace_slot, (trace_slot_limit,)), + ("slot_limit", int64 ), + ] + ) From 74f3f502a43cae45edb66e51cd85ce551f8049f2 Mon Sep 17 00:00:00 2001 From: alexandermote Date: Tue, 22 Apr 2025 17:43:24 -0700 Subject: [PATCH 2/2] back in black --- mcdc/adapt.py | 8 --- mcdc/kernel.py | 12 +---- mcdc/loop.py | 6 --- mcdc/main.py | 52 +++++++++++-------- mcdc/trace.py | 134 ++++++++++++++++++++++++++----------------------- mcdc/type_.py | 9 ++-- 6 files changed, 107 insertions(+), 114 deletions(-) diff --git a/mcdc/adapt.py b/mcdc/adapt.py index f84fd9dd2..9c0a54d0d 100644 --- a/mcdc/adapt.py +++ b/mcdc/adapt.py @@ -106,11 +106,9 @@ def local_array(shape, dtype): @numba.extending.type_callable(local_array) def type_local_array(context): - from numba.core.typing.npydecl import parse_dtype, parse_shape if isinstance(context, numba.core.typing.context.Context): - # Function repurposed from Numba's ol_np_empty. def typer(shape, dtype): numba.np.arrayobj._check_const_str_dtype("empty", dtype) @@ -148,10 +146,8 @@ def typer(shape, dtype): return typer elif isinstance(context, numba.cuda.target.CUDATypingContext): - # Function repurposed from Numba's Cuda_array_decl. def typer(shape, dtype): - # Only integer literals and tuples of integer literals are valid # shapes if isinstance(shape, types.Integer): @@ -200,14 +196,12 @@ def typer(shape, dtype): @numba.extending.lower_builtin(local_array, types.IntegerLiteral, types.Any) def builtin_local_array(context, builder, sig, args): - shape, dtype = sig.args from numba.core.typing.npydecl import parse_dtype, parse_shape import numba.np.arrayobj as arrayobj if isinstance(context, numba.core.cpu.CPUContext): - # No default arguments. nb_dtype = parse_dtype(dtype) nb_shape = parse_shape(shape) @@ -420,7 +414,6 @@ def nopython_mode(is_on): def gpu_forward_declare(args): - if args.gpu_rocm_path != None: harm.config.set_rocm_path(args.gpu_rocm_path) @@ -651,7 +644,6 @@ def finalize(prog: numba.uintp): final_fn(prog) def step(prog: numba.uintp, arg: arg_type): - step_async() (step_async,) = harm.RuntimeSpec.async_dispatch(step) diff --git a/mcdc/kernel.py b/mcdc/kernel.py index d08bc3e1f..756ea0a55 100644 --- a/mcdc/kernel.py +++ b/mcdc/kernel.py @@ -196,7 +196,6 @@ def dd_check_out(mcdc): @trace.njit() def dd_signal_halt(mcdc): - with objmode(): for rank in range(1, MPI.COMM_WORLD.Get_size()): dummy_buff = np.zeros((1,), dtype=np.int32) @@ -207,7 +206,6 @@ def dd_signal_halt(mcdc): @trace.njit() def dd_signal_block(mcdc): - with objmode(rank="int64"): rank = MPI.COMM_WORLD.Get_rank() @@ -240,7 +238,6 @@ def dd_signal_block(mcdc): @trace.njit() def dd_signal_unblock(mcdc): - with objmode(rank="int64"): rank = MPI.COMM_WORLD.Get_rank() @@ -270,7 +267,6 @@ def dd_signal_unblock(mcdc): @trace.njit() def dd_distribute_bank(mcdc, bank, dest_list): - with objmode(send_delta="int64"): dest_count = len(dest_list) send_delta = 0 @@ -334,7 +330,6 @@ def dd_particle_send(prog): @trace.njit() def dd_get_recv_tag(): - with objmode(tag="int64"): status = MPI.Status() MPI.COMM_WORLD.Probe(status=status) @@ -345,7 +340,6 @@ def dd_get_recv_tag(): @trace.njit() def dd_recv_particles(mcdc): - buff = np.zeros( mcdc["domain_decomp"]["bank_zp"]["particles"].shape[0], dtype=type_.particle_record, @@ -373,7 +367,6 @@ def dd_recv_particles(mcdc): @trace.njit() def dd_recv_turnstile(mcdc): - with objmode(busy_delta="int64", send_delta="int64"): event_buff = np.zeros((1,), dtype=type_.dd_turnstile_event) MPI.COMM_WORLD.Recv([event_buff, type_.dd_turnstile_event_mpi]) @@ -394,7 +387,6 @@ def dd_recv_turnstile(mcdc): @trace.njit() def dd_recv_halt(mcdc): - with objmode(): dummy_buff = np.zeros((1,), dtype=np.int32) MPI.COMM_WORLD.Recv(dummy_buff) @@ -406,7 +398,6 @@ def dd_recv_halt(mcdc): @trace.njit() def dd_recv(mcdc): - if mcdc["domain_decomp"]["rank_busy"]: dd_signal_block(mcdc) mcdc["domain_decomp"]["rank_busy"] = False @@ -719,6 +710,8 @@ def distribute_work_dd_precursor(N, mcdc): mcdc["mpi_work_start_precursor"] = work_start mcdc["mpi_work_size_precursor"] = work_size mcdc["mpi_work_size_total_precursor"] = work_size_total + + # ============================================================================= # Random sampling # ============================================================================= @@ -2117,7 +2110,6 @@ def score_cell_tally(P_arr, distance, tally, data, mcdc): # Sweep through the distance distance_swept = 0.0 while distance_swept < distance - COINCIDENCE_TOLERANCE: - # Find distance to mesh grids dt = (min(mesh["t"][it + 1], t_final) - t) / ut diff --git a/mcdc/loop.py b/mcdc/loop.py index c8a878ddf..1fdfe36f7 100644 --- a/mcdc/loop.py +++ b/mcdc/loop.py @@ -62,7 +62,6 @@ def teardown_gpu(mcdc): @trace.njit() def loop_fixed_source(data_arr, mcdc_arr): - # Ensure `data` and `mcdc` exist for the lifetime of the program # by intentionally leaking their memory adapt.leak(data_arr) @@ -382,7 +381,6 @@ def source_dd_resolution(data, prog): if kernel.get_bank_size(mcdc["bank_active"]) > 0: # Loop until active bank is exhausted while kernel.get_bank_size(mcdc["bank_active"]) > 0: - kernel.get_particle(P_arr, mcdc["bank_active"], mcdc) if not kernel.particle_in_domain(P_arr, mcdc) and P["alive"] == True: print(f"recieved particle not in domain") @@ -490,7 +488,6 @@ def step(prog: nb.uintp, P_input: adapt.particle_gpu): @trace.njit(cache=caching) def gpu_loop_source(seed, data, mcdc): - # Progress bar indicator N_prog = 0 @@ -514,7 +511,6 @@ def gpu_loop_source(seed, data, mcdc): phase_count = (full_work_size + phase_size - 1) // phase_size for phase in range(phase_count): - mcdc["mpi_work_iter"][0] = phase_size * phase mcdc["mpi_work_size"] = min(phase_size * (phase + 1), full_work_size) mcdc["source_seed"] = seed @@ -839,7 +835,6 @@ def step(prog: nb.uintp, P_input: adapt.particle_gpu): @trace.njit(cache=caching) def gpu_loop_source_precursor(seed, data, mcdc): - # Progress bar indicator N_prog = 0 @@ -898,7 +893,6 @@ def gpu_loop_source_precursor(seed, data, mcdc): def build_gpu_progs(input_deck, args): - STRAT = args.gpu_strat src_spec = gpu_sources_spec() diff --git a/mcdc/main.py b/mcdc/main.py index 62ba53e7b..176a291ea 100644 --- a/mcdc/main.py +++ b/mcdc/main.py @@ -417,7 +417,7 @@ def dd_mesh_bounds(idx): zlen = len(input_deck.mesh_tallies[idx].z) if (xlen == 2) and (ylen == 2) and (zlen == 2): return 0, 1, 0, 1, 0, 1 - + xn = input_deck.technique["dd_mesh"]["x"][xmesh_idx] xp = input_deck.technique["dd_mesh"]["x"][xmesh_idx + 1] yn = input_deck.technique["dd_mesh"]["y"][ymesh_idx] @@ -645,9 +645,9 @@ def prepare(): for i in range(N_material): for name in type_.material.names: if name in ["nuclide_IDs", "nuclide_densities"]: - mcdc["materials"][i][name][: mcdc["materials"][i]["N_nuclide"]] = ( - getattr(input_deck.materials[i], name) - ) + mcdc["materials"][i][name][ + : mcdc["materials"][i]["N_nuclide"] + ] = getattr(input_deck.materials[i], name) else: copy_field(mcdc["materials"][i], input_deck.materials[i], name) @@ -877,9 +877,15 @@ def prepare(): ) else: # decomposed mesh filters - mcdc["technique"]["dd_xsum"] = max(mcdc["technique"]["dd_xsum"],len(input_deck.mesh_tallies[i].x) - 1) - mcdc["technique"]["dd_ysum"] = max(mcdc["technique"]["dd_ysum"],len(input_deck.mesh_tallies[i].y) - 1) - mcdc["technique"]["dd_zsum"] = max(mcdc["technique"]["dd_zsum"],len(input_deck.mesh_tallies[i].z) - 1) + mcdc["technique"]["dd_xsum"] = max( + mcdc["technique"]["dd_xsum"], len(input_deck.mesh_tallies[i].x) - 1 + ) + mcdc["technique"]["dd_ysum"] = max( + mcdc["technique"]["dd_ysum"], len(input_deck.mesh_tallies[i].y) - 1 + ) + mcdc["technique"]["dd_zsum"] = max( + mcdc["technique"]["dd_zsum"], len(input_deck.mesh_tallies[i].z) - 1 + ) mxn, mxp, myn, myp, mzn, mzp = dd_mesh_bounds(i) @@ -1404,9 +1410,9 @@ def prepare(): for i in range(M): idm = input_deck.uq_deltas["materials"][i].ID mcdc["technique"]["uq_"]["materials"][i]["info"]["ID"] = idm - mcdc["technique"]["uq_"]["materials"][i]["info"]["distribution"] = ( - input_deck.uq_deltas["materials"][i].distribution - ) + mcdc["technique"]["uq_"]["materials"][i]["info"][ + "distribution" + ] = input_deck.uq_deltas["materials"][i].distribution for name in input_deck.uq_deltas["materials"][i].flags: mcdc["technique"]["uq_"]["materials"][i]["flags"][name] = True mcdc["technique"]["uq_"]["materials"][i]["delta"][name] = getattr( @@ -1420,15 +1426,15 @@ def prepare(): flags["nu_f"] = True if mcdc["materials"][idm]["N_nuclide"] > 1: for name in type_.uq_mat.names: - mcdc["technique"]["uq_"]["materials"][i]["mean"][name] = ( - input_deck.materials[idm][name] - ) + mcdc["technique"]["uq_"]["materials"][i]["mean"][ + name + ] = input_deck.materials[idm][name] N = len(input_deck.uq_deltas["nuclides"]) for i in range(N): - mcdc["technique"]["uq_"]["nuclides"][i]["info"]["distribution"] = ( - input_deck.uq_deltas["nuclides"][i].distribution - ) + mcdc["technique"]["uq_"]["nuclides"][i]["info"][ + "distribution" + ] = input_deck.uq_deltas["nuclides"][i].distribution idn = input_deck.uq_deltas["nuclides"][i].ID mcdc["technique"]["uq_"]["nuclides"][i]["info"]["ID"] = idn for name in type_.uq_nuc.names: @@ -1806,7 +1812,8 @@ def dd_mergemesh(mcdc): if d_Nz > 1: sendcounts = np.array( - MPI.COMM_WORLD.gather(len(mcdc["mesh_tallies"][0]["filter"]["z"]), root=0)) + MPI.COMM_WORLD.gather(len(mcdc["mesh_tallies"][0]["filter"]["z"]), root=0) + ) if mcdc["mpi_master"]: z_filter = np.zeros((mcdc["mesh_tallies"].shape[0], sum(sendcounts))) else: @@ -1932,7 +1939,7 @@ def generate_hdf5(data, mcdc): # Set tally shape N_score = tally["N_score"] if mcdc["technique"]["domain_decomposition"]: - if Nx or Ny or Nz: # check if spatial mesh exists + if Nx or Ny or Nz: # check if spatial mesh exists Nx = mcdc["technique"]["dd_xsum"] Ny = mcdc["technique"]["dd_ysum"] Nz = mcdc["technique"]["dd_zsum"] @@ -2386,7 +2393,6 @@ def recombine_tallies(file="output.h5"): def closeout(mcdc): - loop.teardown_gpu(mcdc) # Runtime @@ -2523,9 +2529,11 @@ def visualize( particle["t"] = t # Random direction - particle["ux"], particle["uy"], particle["uz"] = ( - kernel.sample_isotropic_direction(particle_container) - ) + ( + particle["ux"], + particle["uy"], + particle["uz"], + ) = kernel.sample_isotropic_direction(particle_container) # RGB color data for each pixels data = np.zeros(pixels + (3,)) diff --git a/mcdc/trace.py b/mcdc/trace.py index 0326e3a5c..8ec90fabe 100644 --- a/mcdc/trace.py +++ b/mcdc/trace.py @@ -12,7 +12,7 @@ import numpy as np -CACH_PATH = './__trace_cache__' +CACH_PATH = "./__trace_cache__" time_code = """ #include @@ -27,24 +27,25 @@ """ - mono_clock = None + @numba.njit() -def extern_gpu_clock_rate (): +def extern_gpu_clock_rate(): return 1000000000 + if config.trace: if not os.path.exists(CACH_PATH): os.makedirs(CACH_PATH) base_path = f"{CACH_PATH}/trace" code_path = f"{base_path}.cpp" - lib_path = f"{base_path}.so" - file = open(code_path,"w") + lib_path = f"{base_path}.so" + file = open(code_path, "w") file.write(time_code) file.close() cmd = f"g++ {code_path} --shared -fPIC -o {lib_path}" - subprocess.run(cmd.split(),shell=False,check=True) + subprocess.run(cmd.split(), shell=False, check=True) abs_lib_path = os.path.abspath(lib_path) binding.load_library_permanently(abs_lib_path) sig = numba.types.int64() @@ -57,7 +58,6 @@ def gpu_clock_rate(): return extern_gpu_clock_rate() - trace_roster = {} trace_wrapper_template = """ @@ -75,46 +75,52 @@ def trace_{id}_{name} ({arg_str}) : trace_wrapper_name_template = "trace_{name}" - -sig = numba.core.typing.signature -ext_fn = numba.types.ExternalFunction -gpu_get_wall_clock = ext_fn("get_wall_clock",sig(numba.types.int64)) +sig = numba.core.typing.signature +ext_fn = numba.types.ExternalFunction +gpu_get_wall_clock = ext_fn("get_wall_clock", sig(numba.types.int64)) def get_clock(): return time.monotonic_ns() + @numba.core.extending.overload(get_clock, target="cpu") def cpu_get_clock(): def inner_get_clock(): return mono_clock() + return inner_get_clock + @numba.core.extending.overload(get_clock, target="gpu") def gpu_get_clock(): def inner_get_clock(): return gpu_get_wall_clock() + return inner_get_clock def platform_index(): return 0 + @numba.core.extending.overload(platform_index, target="cpu") def cpu_platform_index(): def inner_platform_index(): return 1 + return inner_platform_index + @numba.core.extending.overload(platform_index, target="gpu") def gpu_platform_index(): def inner_platform_index(): return 2 + return inner_platform_index def trace(transforms=[]): - def trace_inner(func): global get_clock global platform_index @@ -137,7 +143,6 @@ def trace_inner(func): break if config.trace and (extractor_target != None): - global trace_roster import mcdc.adapt as adapt @@ -145,18 +150,18 @@ def trace_inner(func): func = tr(func) if func not in trace_roster: - trace_roster[name] = {'id': len(trace_roster)} + trace_roster[name] = {"id": len(trace_roster)} - func_id = trace_roster[name]['id'] + func_id = trace_roster[name]["id"] arg_str = ",".join([arg for arg in arg_set]) trace_wrapper_source = trace_wrapper_template.format( name=name, arg_str=arg_str, id=func_id, - trace_state_extractor=trace_state_extractors[extractor_target] + trace_state_extractor=trace_state_extractors[extractor_target], ) - exec(trace_wrapper_source,locals(),locals()) + exec(trace_wrapper_source, locals(), locals()) trace_func = eval(f"trace_{func_id}_{name}") return trace_func else: @@ -165,13 +170,11 @@ def trace_inner(func): return trace_inner - -def njit(*args,**kwargs): - +def njit(*args, **kwargs): def trace_njit_inner(func): - trace_func = trace(transforms=[numba.njit(*args,**kwargs)])(func) - if (trace_func == func): - return numba.njit(*args,**kwargs)(func) + trace_func = trace(transforms=[numba.njit(*args, **kwargs)])(func) + if trace_func == func: + return numba.njit(*args, **kwargs)(func) else: return numba.njit()(trace_func) @@ -193,29 +196,36 @@ def dd_mergetrace(mcdc): dd_comm = MPI.COMM_WORLD.Create(dd_group) # MPI Reduce on subdomain processors for name, info in trace_roster.items(): - func_id = info['id'] + func_id = info["id"] if MPI.COMM_NULL != dd_comm: - python_nsecs = dd_comm.reduce(mcdc['trace']['slots'][func_id]['runtime_total'][0], MPI.SUM) - python_calls = dd_comm.reduce(mcdc['trace']['slots'][func_id]['call_total'][0], MPI.SUM) - cpu_nsecs = dd_comm.reduce(mcdc['trace']['slots'][func_id]['runtime_total'][1], MPI.SUM) - cpu_calls = dd_comm.reduce(mcdc['trace']['slots'][func_id]['call_total'][1], MPI.SUM) + python_nsecs = dd_comm.reduce( + mcdc["trace"]["slots"][func_id]["runtime_total"][0], MPI.SUM + ) + python_calls = dd_comm.reduce( + mcdc["trace"]["slots"][func_id]["call_total"][0], MPI.SUM + ) + cpu_nsecs = dd_comm.reduce( + mcdc["trace"]["slots"][func_id]["runtime_total"][1], MPI.SUM + ) + cpu_calls = dd_comm.reduce( + mcdc["trace"]["slots"][func_id]["call_total"][1], MPI.SUM + ) if mcdc["dd_local_rank"] == 0: - mcdc['trace']['slots'][func_id]['runtime_total'][0] = python_nsecs - mcdc['trace']['slots'][func_id]['call_total'][0] = python_calls - mcdc['trace']['slots'][func_id]['runtime_total'][1] = cpu_nsecs - mcdc['trace']['slots'][func_id]['call_total'][1] = cpu_calls - + mcdc["trace"]["slots"][func_id]["runtime_total"][0] = python_nsecs + mcdc["trace"]["slots"][func_id]["call_total"][0] = python_calls + mcdc["trace"]["slots"][func_id]["runtime_total"][1] = cpu_nsecs + mcdc["trace"]["slots"][func_id]["call_total"][1] = cpu_calls + # free comm group dd_group.Free() if MPI.COMM_NULL != dd_comm: dd_comm.Free() - -def output_report(mcdc): +def output_report(mcdc): if not mcdc["technique"]["domain_decomposition"]: - report = open("report.csv","w") + report = open("report.csv", "w") report.write("function name, ") report.write("python total runtime (ns), python total calls, ") report.write("cpu total runtime (ns), cpu total calls, ") @@ -229,22 +239,24 @@ def output_report(mcdc): multi_rank = True for name, info in trace_roster.items(): - func_id = info['id'] - slot = mcdc['trace']['slots'][func_id] + func_id = info["id"] + slot = mcdc["trace"]["slots"][func_id] if multi_rank: - slot_arr = np.empty((1,),type_.trace_slot) - MPI.COMM_WORLD.Allreduce(slot['runtime_total'],slot_arr[0]['runtime_total']) - MPI.COMM_WORLD.Allreduce(slot['call_total'],slot_arr[0]['call_total']) - slot['runtime_total'] = slot_arr[0]['runtime_total'] - slot['call_total'] = slot_arr[0]['call_total'] - - python_nsecs = slot['runtime_total'][0] - python_calls = slot['call_total'][0] - cpu_nsecs = slot['runtime_total'][1] - cpu_calls = slot['call_total'][1] - gpu_nsecs = slot['runtime_total'][2] * 1000000000.0 / gpu_rate - gpu_calls = slot['call_total'][2] + slot_arr = np.empty((1,), type_.trace_slot) + MPI.COMM_WORLD.Allreduce( + slot["runtime_total"], slot_arr[0]["runtime_total"] + ) + MPI.COMM_WORLD.Allreduce(slot["call_total"], slot_arr[0]["call_total"]) + slot["runtime_total"] = slot_arr[0]["runtime_total"] + slot["call_total"] = slot_arr[0]["call_total"] + + python_nsecs = slot["runtime_total"][0] + python_calls = slot["call_total"][0] + cpu_nsecs = slot["runtime_total"][1] + cpu_calls = slot["call_total"][1] + gpu_nsecs = slot["runtime_total"][2] * 1000000000.0 / gpu_rate + gpu_calls = slot["call_total"][2] report.write(f"{name},") report.write(f"{python_nsecs},{python_calls},") report.write(f"{cpu_nsecs},{cpu_calls},") @@ -252,7 +264,7 @@ def output_report(mcdc): report.write("\n") report.close() - else: # write report for each subdomain + else: # write report for each subdomain dd_mergetrace(mcdc) d_Nx = mcdc["technique"]["dd_mesh"]["x"].size - 1 d_Ny = mcdc["technique"]["dd_mesh"]["y"].size - 1 @@ -274,20 +286,18 @@ def output_report(mcdc): gpu_rate = gpu_clock_rate() for name, info in trace_roster.items(): - func_id = info['id'] - slot = mcdc['trace']['slots'][func_id] - - python_nsecs = slot['runtime_total'][0] - python_calls = slot['call_total'][0] - cpu_nsecs = slot['runtime_total'][1] - cpu_calls = slot['call_total'][1] - gpu_nsecs = slot['runtime_total'][2] * 1000000000.0 / gpu_rate - gpu_calls = slot['call_total'][2] + func_id = info["id"] + slot = mcdc["trace"]["slots"][func_id] + + python_nsecs = slot["runtime_total"][0] + python_calls = slot["call_total"][0] + cpu_nsecs = slot["runtime_total"][1] + cpu_calls = slot["call_total"][1] + gpu_nsecs = slot["runtime_total"][2] * 1000000000.0 / gpu_rate + gpu_calls = slot["call_total"][2] report.write(f"{name},") report.write(f"{python_nsecs},{python_calls},") report.write(f"{cpu_nsecs},{cpu_calls},") report.write(f"{gpu_nsecs},{gpu_calls},") report.write("\n") report.close() - - diff --git a/mcdc/type_.py b/mcdc/type_.py index 6e8a45662..6fa0000f2 100644 --- a/mcdc/type_.py +++ b/mcdc/type_.py @@ -1590,7 +1590,7 @@ def make_type_global(input_deck): ("source_program_pointer", uintp), ("precursor_program_pointer", uintp), ("source_seed", uint64), - ("trace",trace) + ("trace", trace), ] ) @@ -1711,10 +1711,7 @@ def make_type_trace_slot(): global trace_slot trace_slot = into_dtype( - [ - ("runtime_total",int64,(3,)), - ("call_total",int64,(3,)) - ] + [("runtime_total", int64, (3,)), ("call_total", int64, (3,))] ) @@ -1724,6 +1721,6 @@ def make_type_trace(trace_slot_limit): trace = into_dtype( [ ("slots", trace_slot, (trace_slot_limit,)), - ("slot_limit", int64 ), + ("slot_limit", int64), ] )