diff --git a/parsec/arena.c b/parsec/arena.c index ca7def080..321c94971 100644 --- a/parsec/arena.c +++ b/parsec/arena.c @@ -223,7 +223,7 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy, assert(0 == (((ptrdiff_t)chunk->data) % arena->alignment)); assert((arena->elem_size + (ptrdiff_t)chunk->data) <= (size + (ptrdiff_t)chunk)); - data->nb_elts = count * arena->elem_size; + data->span = count * arena->elem_size; copy->flags = PARSEC_DATA_FLAG_ARENA | PARSEC_DATA_FLAG_PARSEC_OWNED | diff --git a/parsec/data.c b/parsec/data.c index fa4fe9d2a..a944a0203 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -69,7 +69,7 @@ static void parsec_data_construct(parsec_data_t* obj ) obj->owner_device = -1; obj->preferred_device = -1; obj->key = 0; - obj->nb_elts = 0; + obj->span = 0; for( uint32_t i = 0; i < parsec_nb_devices; obj->device_copies[i] = NULL, i++ ); obj->dc = NULL; @@ -509,7 +509,7 @@ parsec_data_create( parsec_data_t **holder, data->owner_device = 0; data->key = key; data->dc = desc; - data->nb_elts = size; + data->span = size; parsec_data_copy_attach(data, data_copy, 0); if( !parsec_atomic_cas_ptr(holder, NULL, data) ) { @@ -546,7 +546,7 @@ parsec_data_create_with_type( parsec_data_collection_t *desc, clone->owner_device = 0; clone->key = key; clone->dc = desc; - clone->nb_elts = size; + clone->span = size; parsec_data_copy_attach(clone, data_copy, 0); return clone; diff --git a/parsec/data_dist/matrix/broadcast.jdf b/parsec/data_dist/matrix/broadcast.jdf index 6938132d6..114750785 100644 --- a/parsec/data_dist/matrix/broadcast.jdf +++ b/parsec/data_dist/matrix/broadcast.jdf @@ -59,7 +59,7 @@ static parsec_data_t* data_of(parsec_data_collection_t *desc, ...) data->owner_device = 0; data->key = k; data->dc = (parsec_data_collection_t*)desc; - data->nb_elts = 1; + data->span = 1; parsec_data_copy_t* data_copy = (parsec_data_copy_t*)PARSEC_OBJ_NEW(parsec_data_copy_t); parsec_data_copy_attach(data, data_copy, 0); data_copy->device_private = NULL; diff --git a/parsec/data_internal.h b/parsec/data_internal.h index 85cc103ca..f20ada28a 100644 --- a/parsec/data_internal.h +++ b/parsec/data_internal.h @@ -36,7 +36,7 @@ struct parsec_data_s { * which device this data should be modified RW when there * are multiple choices. -1 means no preference. */ struct parsec_data_collection_s* dc; - size_t nb_elts; /* size in bytes of the memory layout */ + size_t span; /* size in bytes of the memory layout */ struct parsec_data_copy_s *device_copies[]; /* this array allocated according to the number of devices * (parsec_nb_devices). It points to the most recent * version of the data. 
diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index af84dba73..d0c6ae5cb 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -2280,20 +2280,18 @@ static parsec_hook_return_t parsec_dtd_gpu_task_submit(parsec_execution_stream_t #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT) || defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) parsec_dtd_task_t *dtd_task = (parsec_dtd_task_t *)this_task; parsec_dtd_task_class_t *dtd_tc = (parsec_dtd_task_class_t*)this_task->task_class; - parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t *) calloc(1, sizeof(parsec_gpu_task_t)); - PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); - gpu_task->release_device_task = free; /* by default free the device task */ + parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t); gpu_task->ec = (parsec_task_t *) this_task; gpu_task->submit = dtd_tc->gpu_func_ptr; gpu_task->task_type = 0; - gpu_task->last_data_check_epoch = -1; /* force at least one validation for the task */ gpu_task->pushout = 0; + gpu_task->nb_flows = dtd_tc->super.nb_flows; /* inherit the flows from the task class */ for(int i = 0; i < dtd_tc->super.nb_flows; i++) { parsec_dtd_flow_info_t *flow = FLOW_OF(dtd_task, i); if(flow->op_type & PARSEC_PUSHOUT) gpu_task->pushout |= 1<<i; - gpu_task->flow[i] = dtd_tc->super.in[i]; - gpu_task->flow_nb_elts[i] = this_task->data[i].data_in->original->nb_elts; + gpu_task->flow_info[i].flow = dtd_tc->super.in[i]; + gpu_task->flow_info[i].flow_span = this_task->data[i].data_in->original->span; } parsec_device_module_t *device = this_task->selected_device; diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 3c81a0158..ee41b2ea6 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2009-2024 The University of Tennessee and The University + * Copyright (c) 2009-2025 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. @@ -6809,43 +6809,33 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, " assert(NULL != dev);\n" " assert(PARSEC_DEV_IS_GPU(dev->type));\n" "\n" - " gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t));\n" - " PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);\n" - " gpu_task->release_device_task = free; /* by default free the device task */\n" + " gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t);\n" " gpu_task->ec = (parsec_task_t*)this_task;\n" " gpu_task->submit = &%s_kernel_submit_%s_%s;\n" - " gpu_task->task_type = 0;\n" - " gpu_task->last_data_check_epoch = -1; /* force at least one validation for the task */\n", + " gpu_task->task_type = PARSEC_GPU_TASK_TYPE_KERNEL;\n", dev_lower, jdf_basename, f->fname); /* Set up stage in/out callbacks */ jdf_find_property(body->properties, "stage_in", &stage_in_property); - jdf_find_property(body->properties, "stage_out", &stage_out_property); - - if(stage_in_property == NULL) { - coutput(" gpu_task->stage_in = parsec_default_gpu_stage_in;\n"); - }else{ - coutput(" gpu_task->stage_in = %s;\n", dump_expr((void**)stage_in_property->expr, &info)); - } + coutput(" gpu_task->stage_in = %s;\n", (NULL == stage_in_property) ? 
"parsec_default_gpu_stage_in" + : dump_expr((void **)stage_in_property->expr, &info)); - if(stage_out_property == NULL) { - coutput(" gpu_task->stage_out = parsec_default_gpu_stage_out;\n"); - }else{ - coutput(" gpu_task->stage_out = %s;\n", dump_expr((void**)stage_out_property->expr, &info)); - } + jdf_find_property(body->properties, "stage_out", &stage_out_property); + coutput(" gpu_task->stage_out = %s;\n", (NULL == stage_out_property) ? "parsec_default_gpu_stage_out" + : dump_expr((void **)stage_out_property->expr, &info)); /* Dump the dataflow */ coutput(" gpu_task->pushout = 0;\n"); for(fl = f->dataflow, di = 0; fl != NULL; fl = fl->next, di++) { - coutput(" gpu_task->flow[%d] = &%s;\n", + coutput(" gpu_task->flow_info[%d].flow = &%s;\n", di, JDF_OBJECT_ONAME( fl )); sprintf(sa->ptr, "%s.dc", fl->varname); jdf_find_property(body->properties, sa->ptr, &desc_property); - if(desc_property == NULL){ - coutput(" gpu_task->flow_dc[%d] = NULL;\n", di); + if(desc_property == NULL) { + coutput(" gpu_task->flow_info[%d].flow_dc = NULL;\n", di); }else{ - coutput(" gpu_task->flow_dc[%d] = (parsec_data_collection_t *)%s;\n", di, + coutput(" gpu_task->flow_info[%d].flow_dc = (parsec_data_collection_t *)%s;\n", di, dump_expr((void**)desc_property->expr, &info)); } @@ -6853,22 +6843,22 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, jdf_find_property(body->properties, sa->ptr, &size_property); if(fl->flow_flags & JDF_FLOW_TYPE_CTL) { - if(size_property != NULL){ + if(size_property != NULL) { fprintf(stderr, "Error: specifying GPU buffer size for CTL flow %s at line %d\n", fl->varname, JDF_OBJECT_LINENO(fl)); exit(-1); } - coutput(" gpu_task->flow_nb_elts[%d] = 0;\n", di); - }else{ + coutput(" gpu_task->flow_info[%d].flow_span = 0;\n", di); + } else { coutput(" // A shortcut to check if the flow exists\n"); coutput(" if (gpu_task->ec->data[%d].data_in != NULL) {\n", di); if(size_property == NULL){ - coutput(" gpu_task->flow_nb_elts[%d] = gpu_task->ec->data[%d].data_in->original->nb_elts;\n", di, di); - }else{ - coutput(" gpu_task->flow_nb_elts[%d] = %s;\n", - di, dump_expr((void**)size_property->expr, &info)); + coutput(" gpu_task->flow_info[%d].flow_span = gpu_task->ec->data[%d].data_in->original->span;\n", di, di); + } else { + coutput(" gpu_task->flow_info[%d].flow_span = %s;\n", + di, dump_expr((void **)size_property->expr, &info)); if( (stage_in_property == NULL) || ( stage_out_property == NULL )){ - coutput(" assert(gpu_task->ec->data[%d].data_in->original->nb_elts <= %s);\n", + coutput(" assert(gpu_task->ec->data[%d].data_in->original->span <= %s);\n", di, dump_expr((void**)size_property->expr, &info)); } @@ -6936,6 +6926,7 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, } } string_arena_free(info.sa); + coutput(" gpu_task->nb_flows = %d; /* inherit the flows from the task_class */\n", di); coutput("\n" " return dev->kernel_scheduler(dev, es, gpu_task);\n" diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index d57ac904e..dfaabc483 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -1,6 +1,5 @@ /* - * - * Copyright (c) 2021-2024 The University of Tennessee and The University + * Copyright (c) 2021-2025 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. 
@@ -39,22 +38,66 @@ static int parsec_gpu_profiling_initiated = 0; int parsec_gpu_output_stream = -1; int parsec_gpu_verbosity; +/** + * This is a special function to release standard device tasks instead of calling + * PARSEC_OBJ_RELEASE on them directly. If we used the PARSEC_OBJ_RELEASE route, the memory pointed + * to by the task would be freed. In some cases we don't want that to happen, but we still + * want to inform the DSL that the device is done with the task. + */ +static void parsec_device_release_gpu_task(parsec_gpu_task_t *gpu_task) +{ + PARSEC_OBJ_RELEASE(gpu_task); +} + +static void parsec_device_task_t_constructor(parsec_gpu_task_t *gpu_task) +{ + gpu_task->task_type = PARSEC_GPU_TASK_TYPE_INVALID; /* needs to be set later */ + gpu_task->pushout = 0; + gpu_task->last_status = 0; + gpu_task->submit = NULL; + gpu_task->complete_stage = NULL; + gpu_task->stage_in = NULL; + gpu_task->stage_out = NULL; + gpu_task->release_device_task = NULL; +#if defined(PARSEC_PROF_TRACE) + gpu_task->prof_key_end = 0; + gpu_task->prof_event_id = 0; + gpu_task->prof_tp_id = 0; +#endif + gpu_task->ec = NULL; + gpu_task->last_data_check_epoch = UINT64_MAX; /* force at least one validation for the task */ + gpu_task->nb_flows = 0; + gpu_task->flow_info = NULL; + /* Default release mechanism, can be replaced by the DSL */ + gpu_task->release_device_task = parsec_device_release_gpu_task; +} +PARSEC_OBJ_CLASS_INSTANCE(parsec_gpu_task_t, parsec_list_item_t, + parsec_device_task_t_constructor, NULL); + +static void parsec_device_dsl_task_t_constructor(parsec_gpu_dsl_task_t *gpu_dsl_task) +{ + gpu_dsl_task->super.flow_info = gpu_dsl_task->flows; +} + +PARSEC_OBJ_CLASS_INSTANCE(parsec_gpu_dsl_task_t, parsec_gpu_task_t, + parsec_device_dsl_task_t_constructor, NULL); + static inline int parsec_device_check_space_needed(parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *gpu_task) { - int i; int space_needed = 0; parsec_task_t *this_task = gpu_task->ec; parsec_data_t *original; parsec_data_copy_t *data; const parsec_flow_t *flow; - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + /* would have been this_task->task_class->nb_flows for classical DSLs */ + for( uint32_t i = 0; i < gpu_task->nb_flows; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; - flow = gpu_task->flow[i]; + flow = gpu_task->flow_info[i].flow; if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; data = this_task->data[i].data_in; @@ -466,15 +509,13 @@ parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int gpu_device->super.device_index, gpu_device->super.name, __func__, __LINE__); return PARSEC_ERROR; } - parsec_gpu_task_t* gpu_task = NULL; - gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t)); + parsec_gpu_task_t* gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t); gpu_task->task_type = PARSEC_GPU_TASK_TYPE_PREFETCH; - gpu_task->release_device_task = free; /* by default free the device task */ gpu_task->ec = calloc(1, sizeof(parsec_task_t)); PARSEC_OBJ_CONSTRUCT(gpu_task->ec, parsec_task_t); gpu_task->ec->task_class = &parsec_device_data_prefetch_tc; - gpu_task->flow[0] = &parsec_device_data_prefetch_flow; - gpu_task->flow_nb_elts[0] = data->device_copies[ data->owner_device ]->original->nb_elts; + gpu_task->flow_info[0].flow = &parsec_device_data_prefetch_flow; + gpu_task->flow_info[0].flow_span = data->device_copies[ data->owner_device ]->original->span; gpu_task->stage_in = parsec_default_gpu_stage_in; 
gpu_task->stage_out = parsec_default_gpu_stage_out; PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p [ref_count %d]", @@ -844,8 +885,9 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, parsec_gpu_data_copy_t* temp_loc[MAX_PARAM_COUNT], *gpu_elem, *lru_gpu_elem; parsec_data_t* master, *oldmaster; const parsec_flow_t *flow; - int i, j, data_avail_epoch = 0, copy_readers_update = 0; + int data_avail_epoch = 0, copy_readers_update = 0; parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL; + uint32_t i, j; #if defined(PARSEC_DEBUG_NOISIER) char task_name[MAX_TASK_STRLEN]; @@ -858,8 +900,8 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, * Parse all the input and output flows of data and ensure all have * corresponding data on the GPU available. */ - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { - flow = gpu_task->flow[i]; + for (i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++) { + flow = gpu_task->flow_info[i].flow; assert( flow && (flow->flow_index == i) ); /* Skip CTL flows only */ @@ -920,12 +962,12 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%d:%s]:%s: Allocate GPU copy %p sz %zu [ref_count %d] for data %p", gpu_device->super.device_index, gpu_device->super.name, task_name, - gpu_elem, gpu_task->flow_nb_elts[i], gpu_elem->super.super.obj_reference_count, master); + gpu_elem, gpu_task->flow_info[i].flow_span, gpu_elem->super.super.obj_reference_count, master); gpu_elem->flags = PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED; malloc_data: copy_readers_update = 0; assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); - gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]); + gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_info[i].flow_span); if( NULL == gpu_elem->device_private ) { #endif @@ -942,7 +984,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%d:%s]:%s:\tRequest space on GPU failed for flow %s index %d/%d for task %s", gpu_device->super.device_index, gpu_device->super.name, task_name, - flow->name, i, this_task->task_class->nb_flows, task_name ); + flow->name, i, gpu_task->nb_flows, task_name ); #endif /* defined(PARSEC_DEBUG_NOISIER) */ for( j = 0; j <= i; j++ ) { /* This flow could be a control flow */ @@ -1130,7 +1172,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling, parsec_gpu_allocate_memory_key, (int64_t)gpu_elem->device_private, gpu_device->super.device_index, - &gpu_task->flow_nb_elts[i], PARSEC_PROFILING_EVENT_COUNTER|PARSEC_PROFILING_EVENT_HAS_INFO); + &gpu_task->flow_info[i].flow_span, PARSEC_PROFILING_EVENT_COUNTER|PARSEC_PROFILING_EVENT_HAS_INFO); } #endif #else @@ -1195,9 +1237,9 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, size_t count; parsec_device_transfer_direction_t dir; - for(int i = 0; i < task->task_class->nb_flows; i++) { + for(uint32_t i = 0; i < gtask->nb_flows /* not task->task_class->nb_flows */; i++) { if( !(flow_mask & (1U << i)) ) continue; - source = gtask->sources[i]; + source = gtask->flow_info[i].source; assert(source->device_private != NULL); dest = task->data[i].data_out; src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); @@ -1210,8 +1252,8 @@ 
parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, dir = parsec_device_gpu_transfer_direction_h2d; } - count = (source->original->nb_elts <= dest->original->nb_elts) ? - source->original->nb_elts : dest->original->nb_elts; + count = (source->original->span <= dest->original->span) ? + source->original->span : dest->original->span; ret = dst_dev->memcpy_async( dst_dev, gpu_stream, dest->device_private, source->device_private, @@ -1244,16 +1286,16 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, parsec_task_t *task = gtask->ec; size_t count; parsec_device_transfer_direction_t dir; - int i; - for(i = 0; i < task->task_class->nb_flows; i++){ + + for(uint32_t i = 0; i < gtask->nb_flows /* not task->task_class->nb_flows */; i++){ if(flow_mask & (1U << i)){ source = task->data[i].data_out; dest = source->original->device_copies[0]; dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); - count = (source->original->nb_elts <= dest->original->nb_elts) ? source->original->nb_elts : - dest->original->nb_elts; + count = (source->original->span <= dest->original->span) ? source->original->span : + dest->original->span; if( src_dev->super.type == dst_dev->super.type ) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; @@ -1298,7 +1340,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t *candidate = task_data->data_in; /* best candidate for now */ parsec_data_t* original = candidate->original; parsec_gpu_data_copy_t* gpu_elem = task_data->data_out; - size_t nb_elts = gpu_task->flow_nb_elts[flow->flow_index]; + size_t span = gpu_task->flow_info[flow->flow_index].flow_span; int transfer_from = -1; if( gpu_task->task_type == PARSEC_GPU_TASK_TYPE_PREFETCH ) { @@ -1309,7 +1351,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, parsec_atomic_lock( &original->lock ); - gpu_task->sources[flow->flow_index] = candidate; /* default source for the transfer */ + gpu_task->flow_info[flow->flow_index].source = candidate; /* default source for the transfer */ /** * If the data will be accessed in write mode, remove it from any GPU data management * lists until the task is completed. 
@@ -1343,7 +1385,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, transfer_from = -1; /* Update the transferred required_data_in size */ - gpu_device->super.required_data_in += original->nb_elts; + gpu_device->super.required_data_in += original->span; if( -1 == transfer_from ) { /* Do not need to be transferred */ gpu_elem->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; @@ -1368,7 +1410,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]:\t\tMove data copy %p [ref_count %d, key %x] of %zu bytes: data copy is already under transfer, ignoring double request", gpu_device->super.device_index, gpu_device->super.name, - gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, nb_elts); + gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, span); parsec_atomic_unlock( &original->lock ); return 1; /* positive returns have special meaning and are used for optimizations */ } @@ -1473,7 +1515,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, "GPU[%d:%s]:\t\tMove %s data copy %p [ref_count %d, key %x] of %zu bytes\t(src dev: %d, v:%d, ptr:%p, copy:%p [ref_count %d, under_transfer: %d, coherency_state: %d] / dst dev: %d, v:%d, ptr:%p)", gpu_device->super.device_index, gpu_device->super.name, PARSEC_DEV_IS_GPU(candidate_dev->super.type) ? "D2D": "H2D", - gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, nb_elts, + gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, span, candidate_dev->super.device_index, candidate->version, (void*)candidate->device_private, candidate, candidate->super.super.obj_reference_count, candidate->data_transfer_status, candidate->coherency_state, gpu_device->super.device_index, gpu_elem->version, (void*)gpu_elem->device_private); @@ -1511,7 +1553,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, } if(gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) { parsec_device_gpu_memory_prof_info_t _info; - _info.size = (uint64_t)nb_elts; + _info.size = (uint64_t)span; _info.data_key = gpu_elem->original->key; _info.dc_id = (uint64_t)(gpu_elem->original->dc); parsec_profiling_trace_flags(gpu_stream->profiling, @@ -1522,7 +1564,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, } } #endif - gpu_task->sources[flow->flow_index] = candidate; /* save the candidate for release on transfer completion */ + gpu_task->flow_info[flow->flow_index].source = candidate; /* save the candidate for release on transfer completion */ /* Push data into the GPU from the source device */ int rc = gpu_task->stage_in ? gpu_task->stage_in(gpu_task, (1U << flow->flow_index), gpu_stream): PARSEC_SUCCESS; if(PARSEC_SUCCESS != rc) { @@ -1531,15 +1573,15 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, gpu_device->super.device_index, gpu_device->super.name, rc, __func__, __LINE__, candidate->device_private, candidate_dev->super.device_index, candidate_dev->super.name, gpu_elem->device_private, gpu_device->super.device_index, gpu_device->super.name, - nb_elts, (candidate_dev->super.type != gpu_device->super.type)? "H2D": "D2D"); + span, (candidate_dev->super.type != gpu_device->super.type)? 
"H2D": "D2D"); parsec_atomic_unlock( &original->lock ); assert(0); return PARSEC_HOOK_RETURN_ERROR; } assert(candidate_dev->super.device_index < gpu_device->super.data_in_array_size); - gpu_device->super.data_in_from_device[candidate_dev->super.device_index] += nb_elts; + gpu_device->super.data_in_from_device[candidate_dev->super.device_index] += span; if( PARSEC_GPU_TASK_TYPE_KERNEL == gpu_task->task_type ) - gpu_device->super.nb_data_faults += nb_elts; + gpu_device->super.nb_data_faults += span; /* We assign the version of the data preemptively (i.e. before the task is executing) * For read-only data, the GPU copy will get the same version as the source @@ -1616,15 +1658,13 @@ parsec_device_send_transfercomplete_cmd_to_device(parsec_data_copy_t *copy, parsec_device_module_t *current_dev, parsec_device_module_t *dst_dev) { - parsec_gpu_task_t* gpu_task = NULL; - gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t)); + parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t *)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t); gpu_task->task_type = PARSEC_GPU_TASK_TYPE_D2D_COMPLETE; - gpu_task->release_device_task = free; /* by default free the device task */ gpu_task->ec = calloc(1, sizeof(parsec_task_t)); PARSEC_OBJ_CONSTRUCT(gpu_task->ec, parsec_task_t); gpu_task->ec->task_class = &parsec_device_d2d_complete_tc; - gpu_task->flow[0] = &parsec_device_d2d_complete_flow; - gpu_task->flow_nb_elts[0] = copy->original->nb_elts; + gpu_task->flow_info[0].flow = &parsec_device_d2d_complete_flow; + gpu_task->flow_info[0].flow_span = copy->original->span; gpu_task->stage_in = parsec_default_gpu_stage_in; gpu_task->stage_out = parsec_default_gpu_stage_out; gpu_task->ec->data[0].data_in = copy; /* We need to set not-null in data_in, so that the fake flow is @@ -1654,7 +1694,7 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *gtask = *gpu_task; parsec_task_t *task; - int32_t i; + uint32_t i; #if defined(PARSEC_DEBUG_NOISIER) char task_str[MAX_TASK_STRLEN]; #endif @@ -1671,14 +1711,15 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, "GPU[%d:%s]: parsec_device_callback_complete_push, PUSH of %s", gpu_device->super.device_index, gpu_device->super.name, parsec_task_snprintf(task_str, MAX_TASK_STRLEN, task)); - for( i = 0; i < task->task_class->nb_flows; i++ ) { + for (i = 0; i < gtask->nb_flows /* not task->task_class->nb_flows */; i++) + { /* Make sure data_in is not NULL */ if( NULL == task->data[i].data_in ) continue; /* We also don't push back non-parsec-owned copies */ if(NULL != task->data[i].data_out && 0 == (task->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED)) continue; - flow = gtask->flow[i]; + flow = gtask->flow_info[i].flow; assert( flow ); assert( flow->flow_index == i ); if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; @@ -1690,9 +1731,6 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, gpu_device->super.device_index, flow->flow_flags); - parsec_data_copy_t* source = gtask->sources[i]; - parsec_device_gpu_module_t *src_device = - (parsec_device_gpu_module_t*)parsec_mca_device_get( source->device_index ); if (task->data[i].data_in->flags & PARSEC_DATA_FLAG_EVICTED) { /** * The device copy had been evicted to the host and brought back in. 
@@ -1729,8 +1767,6 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, task->data[i].data_in->flags ^= PARSEC_DATA_FLAG_EVICTED; } - parsec_atomic_unlock(&task->data[i].data_out->original->lock); - #if defined(PARSEC_PROF_TRACE) if(gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_DATA_IN) { PARSEC_PROFILING_TRACE(gpu_stream->profiling, @@ -1740,6 +1776,11 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, NULL); } #endif + + parsec_atomic_unlock(&task->data[i].data_out->original->lock); + parsec_data_copy_t *source = gtask->flow_info[i].source; + parsec_device_gpu_module_t *src_device = + (parsec_device_gpu_module_t*)parsec_mca_device_get( source->device_index ); if( PARSEC_DEV_IS_GPU(src_device->super.type) ) { int om; while(1) { @@ -2018,7 +2059,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, { parsec_task_t *this_task = gpu_task->ec; const parsec_flow_t *flow; - int i, ret = 0; + int ret = 0; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; #endif @@ -2062,9 +2103,9 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, return ret; } - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { - flow = gpu_task->flow[i]; + flow = gpu_task->flow_info[i].flow; /* Skip CTL flows */ if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; @@ -2140,11 +2181,11 @@ parsec_device_kernel_exec( parsec_device_gpu_module_t *gpu_device, #if defined(PARSEC_DEBUG_PARANOID) const parsec_flow_t *flow; - for( int i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; - flow = gpu_task->flow[i]; + flow = gpu_task->flow_info[i].flow; if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; if( 0 == (this_task->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; assert(this_task->data[i].data_out->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER); @@ -2170,15 +2211,15 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, parsec_task_t *this_task = gpu_task->ec; parsec_gpu_data_copy_t *gpu_copy; parsec_data_t *original; - size_t nb_elts; + size_t span; const parsec_flow_t *flow; - int return_code = 0, rc, how_many = 0, i, update_data_epoch = 0; + int return_code = 0, rc, how_many = 0, update_data_epoch = 0; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; #endif if (gpu_task->task_type == PARSEC_GPU_TASK_TYPE_D2HTRANSFER) { - for( i = 0; i < this_task->locals[0].value; i++ ) { + for( int i = 0; i < this_task->locals[0].value; i++ ) { gpu_copy = this_task->data[i].data_out; /* If the gpu copy is not owned by parsec, we don't manage it at all */ if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; @@ -2202,13 +2243,13 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, gpu_device->super.device_index, gpu_device->super.name, parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { /* We need to manage all data that has been used as input, even if they were read only */ /* Make sure data_in is not NULL */ if( NULL == 
this_task->data[i].data_in ) continue; - flow = gpu_task->flow[i]; + flow = gpu_task->flow_info[i].flow; if( PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags) ) continue; /* control flow */ gpu_copy = this_task->data[i].data_out; @@ -2217,7 +2258,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; original = gpu_copy->original; - nb_elts = gpu_task->flow_nb_elts[i]; + span = gpu_task->flow_info[i].flow_span; assert( this_task->data[i].data_in == NULL || original == this_task->data[i].data_in->original ); @@ -2267,7 +2308,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->super.super.obj_reference_count, flow->name); /* Stage the transfer of the data back to main memory */ - gpu_device->super.required_data_out += nb_elts; + gpu_device->super.required_data_out += span; assert( ((parsec_list_item_t*)gpu_copy)->list_next == (parsec_list_item_t*)gpu_copy ); assert( ((parsec_list_item_t*)gpu_copy)->list_prev == (parsec_list_item_t*)gpu_copy ); @@ -2315,7 +2356,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, parsec_atomic_unlock(&original->lock); goto release_and_return_error; } - gpu_device->super.data_out_to_host += nb_elts; /* TODO: not hardcoded, use datatype size */ + gpu_device->super.data_out_to_host += span; /* TODO: not hardcoded, use datatype size */ how_many++; } else { assert( 0 == gpu_copy->readers ); @@ -2346,7 +2387,6 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, parsec_task_t *this_task = gpu_task->ec; parsec_gpu_data_copy_t *gpu_copy, *cpu_copy; parsec_data_t *original; - int i; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; @@ -2356,7 +2396,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); #endif - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; @@ -2364,7 +2404,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, if(NULL == this_task->data[i].data_out) continue; - if( !(gpu_task->flow[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { + if( !(gpu_task->flow_info[i].flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { /* Warning data_out for read only flows has been overwritten in pop */ continue; } @@ -2449,7 +2489,7 @@ parsec_device_kernel_cleanout( parsec_device_gpu_module_t *gpu_device, parsec_task_t *this_task = gpu_task->ec; parsec_gpu_data_copy_t *gpu_copy, *cpu_copy; parsec_data_t *original; - int i, data_avail_epoch = 0; + int data_avail_epoch = 0; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; @@ -2459,13 +2499,13 @@ parsec_device_kernel_cleanout( parsec_device_gpu_module_t *gpu_device, parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); #endif - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; /* Don't bother if there is no real data (aka. 
CTL or no output) */ if(NULL == this_task->data[i].data_out) continue; - if( !(gpu_task->flow[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { + if( !(gpu_task->flow_info[i].flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { /* Warning data_out for read only flows has been overwritten in pop */ continue; } @@ -2709,9 +2749,9 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: gpu_task %p freed", gpu_device->super.device_index, gpu_device->super.name, gpu_task); - if (NULL != gpu_task->release_device_task) { - gpu_task->release_device_task(gpu_task); - } + /* Release the GPU task */ + gpu_task->release_device_task(gpu_task); + rc = parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) ); if( 1 == rc ) { /* I was the last one */ #if defined(PARSEC_PROF_TRACE) diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index fa25b87a3..167b16c57 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024 The University of Tennessee and The University + * Copyright (c) 2021-2025 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. @@ -81,7 +81,25 @@ typedef int (parsec_stage_out_function_t)(parsec_gpu_task_t *gtask, * and this function allows the device engine to delegate the release of such tasks back into * the DSL. Once this task called, the device task should not be accessed by the device. */ -typedef void (*parsec_release_device_task_function_t)(void*); +typedef void (*parsec_release_device_task_function_t)(parsec_gpu_task_t*); +typedef struct parsec_gpu_flow_info_s { + const parsec_flow_t *flow; /* Some DSLs might not have a task_class, but they still need to provide a flow. */ + size_t flow_span; /* the span of the data on the device. For a contiguous layout this is equal to the + * size of the data; for all other copies this should be the amount of memory + * needed on the device. + */ + parsec_data_collection_t *flow_dc; /* the data collection from which the data originates. When the data copy is local, the data + * collection can be accessed via the data_t, but for all copies coming from the network there + * is no known data collection. Thus, for such cases the DSL needs to provide a reference to the + * local data collection to be used for all transfers. This data collection is needed to get + * access to the mtype and thus the memory layout of the copy. + */ + /* This is private to the device code and should not be used outside the device driver */ + parsec_data_copy_t *source; /* If the driver decides to acquire the data from a different + * source, it will temporarily store the best candidate here. + */ + +} parsec_gpu_flow_info_t; struct parsec_gpu_task_s { parsec_list_item_t list_item; @@ -102,23 +120,8 @@ struct parsec_gpu_task_s { struct { parsec_task_t *ec; uint64_t last_data_check_epoch; - const parsec_flow_t *flow[MAX_PARAM_COUNT]; /* There is no consistent way to access the flows from the task_class, - * so the DSL need to provide these flows here. - */ - size_t flow_nb_elts[MAX_PARAM_COUNT]; /* for each flow, size of the data to be allocated - * on the GPU. - */ - parsec_data_collection_t *flow_dc[MAX_PARAM_COUNT]; /* for each flow, data collection from which the data - * to be transferred logically belongs to. 
- * This gives the user the chance to indicate on the JDF - * a data collection to inspect during GPU transfer. - * User may want info from the DC (e.g. mtype), - * & otherwise remote copies don't have any info. - */ - /* These are private and should not be used outside the device driver */ - parsec_data_copy_t *sources[MAX_PARAM_COUNT]; /* If the driver decides to acquire the data from a different - * source, it will temporary store the best candidate here. - */ + uint32_t nb_flows; + parsec_gpu_flow_info_t *flow_info; }; struct { parsec_data_copy_t *copy; @@ -126,6 +129,20 @@ struct parsec_gpu_task_s { }; }; +PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_gpu_task_t); + +/** + * Specialized GPU tasks for the PTG and DTD DSLs. Since the maximum number of flows is MAX_PARAM_COUNT, we + * can make the gpu_flow_info array part of the struct and allocate the gpu_task as a single, + * contiguous block of memory. + */ +typedef struct parsec_gpu_dsl_task_s { + parsec_gpu_task_t super; + parsec_gpu_flow_info_t flows[MAX_PARAM_COUNT]; /* All the flow info necessary for the PTG and DTD DSL */ +} parsec_gpu_dsl_task_t; + +PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_gpu_dsl_task_t); + typedef enum parsec_device_transfer_direction_e { parsec_device_gpu_transfer_direction_h2d, parsec_device_gpu_transfer_direction_d2h, @@ -318,6 +335,7 @@ char *parsec_device_describe_gpu_task( char *tmp, size_t len, parsec_gpu_task_t #define PARSEC_GPU_TASK_TYPE_PREFETCH 0x2000 #define PARSEC_GPU_TASK_TYPE_WARMUP 0x4000 #define PARSEC_GPU_TASK_TYPE_D2D_COMPLETE 0x8000 +#define PARSEC_GPU_TASK_TYPE_INVALID 0xf000 #if defined(PARSEC_PROF_TRACE) #define PARSEC_PROFILE_GPU_TRACK_DATA_IN 0x0001 diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c index 50b0d886e..0f45bcdfe 100644 --- a/parsec/mca/device/transfer_gpu.c +++ b/parsec/mca/device/transfer_gpu.c @@ -276,9 +276,7 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, d2h_task->taskpool = NULL; d2h_task->locals[0].value = nb_cleaned; - w2r_task = (parsec_gpu_task_t *)malloc(sizeof(parsec_gpu_task_t)); - PARSEC_OBJ_CONSTRUCT(w2r_task, parsec_list_item_t); - w2r_task->release_device_task = free; /* by default free the device task */ + w2r_task = (parsec_gpu_task_t *)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t); w2r_task->ec = (parsec_task_t*)d2h_task; w2r_task->task_type = PARSEC_GPU_TASK_TYPE_D2HTRANSFER; w2r_task->last_data_check_epoch = gpu_device->data_avail_epoch - 1; @@ -309,7 +307,7 @@ int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_atomic_lock(&gpu_copy->original->lock); gpu_copy->readers--; gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; - gpu_device->super.data_out_to_host += gpu_copy->original->nb_elts; /* TODO: not hardcoded, use datatype size */ + gpu_device->super.data_out_to_host += gpu_copy->original->span; /* TODO: not hardcoded, use datatype size */ assert(gpu_copy->readers >= 0); original = gpu_copy->original; @@ -343,7 +341,7 @@ int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_atomic_unlock(&gpu_copy->original->lock); } parsec_thread_mempool_free(es->context_mempool, task); - free(gpu_task); + PARSEC_OBJ_RELEASE(gpu_task); /* no need to call release_device_task, just release the task */ gpu_device->data_avail_epoch++; return 0; } diff --git a/tests/dsl/ptg/choice/choice_data.c b/tests/dsl/ptg/choice/choice_data.c index a6c7dcd77..70281f3ae 100644 --- a/tests/dsl/ptg/choice/choice_data.c +++ 
b/tests/dsl/ptg/choice/choice_data.c @@ -50,7 +50,7 @@ get_or_create_data(my_datatype_t* dat, uint32_t pos) data->owner_device = 0; data->key = pos; - data->nb_elts = 1; + data->span = 1; data->device_copies[0] = data_copy; if( !parsec_atomic_cas_ptr(&dat->data_map[pos], NULL, data) ) { diff --git a/tests/runtime/cuda/nvlink_wrapper.c b/tests/runtime/cuda/nvlink_wrapper.c index abc4b19c9..a641ed053 100644 --- a/tests/runtime/cuda/nvlink_wrapper.c +++ b/tests/runtime/cuda/nvlink_wrapper.c @@ -189,7 +189,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb /* And copy the tile from CPU to GPU */ status = (cudaError_t)cudaMemcpy( gpu_copy->device_private, cpu_copy->device_private, - dta->nb_elts, + dta->span, cudaMemcpyHostToDevice ); PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaMemcpy", status, {return NULL;} ); g++; diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf index 7df99800f..0878746c6 100644 --- a/tests/runtime/cuda/stage_custom.jdf +++ b/tests/runtime/cuda/stage_custom.jdf @@ -37,12 +37,13 @@ stage_stride_in(parsec_gpu_task_t *gtask, parsec_device_gpu_module_t *in_elem_dev; parsec_tiled_matrix_t * dc; int elem_sz; - int i; - for(i = 0; i < task->task_class->nb_flows; i++){ + + assert(gtask->nb_flows == task->task_class->nb_flows); + for(uint32_t i = 0; i < gtask->nb_flows; i++){ if(flow_mask & (1U << i)){ copy_in = task->data[i].data_in; copy_out = task->data[i].data_out; - dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i]; + dc = (parsec_tiled_matrix_t*)gtask->flow_info[i].flow_dc; elem_sz = parsec_datadist_getsizeoftype(dc->mtype); in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index); if(in_elem_dev->super.type != PARSEC_DEV_CUDA ){ @@ -62,7 +63,7 @@ stage_stride_in(parsec_gpu_task_t *gtask, }else{ ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, copy_in->device_private, - copy_in->original->nb_elts, + copy_in->original->span, cudaMemcpyDeviceToDevice, cuda_stream->cuda_stream ); PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync", ret, { return PARSEC_ERROR; } ); @@ -89,7 +90,7 @@ stage_stride_out(parsec_gpu_task_t *gtask, if(flow_mask & (1U << i)){ copy_in = task->data[i].data_out; copy_out = copy_in->original->device_copies[0]; - dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i]; + dc = (parsec_tiled_matrix_t*)gtask->flow_info[i].flow_dc; elem_sz = parsec_datadist_getsizeoftype(dc->mtype); /* copy width bytes heigth times, skipping pitch - width bytes every time */ size_t dpitch = dc->llm * elem_sz;