From ea73d65e5b53cab8ab30c601b92112479b1622b7 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 6 Nov 2024 14:21:23 -0800 Subject: [PATCH 1/4] Derive the device_task_t from a parsec_object_t Introduce the parsec_gpu_flow_info_s info structure to combine the flow information needed by the GPU code. Allow the standard device tasks (aka. parsec_gpu_dsl_task_t) to contain the flow_info array inside the task, while allowing other DSLs to have their own type of device task (derived from parsec_gpu_task_t) Enhance the mechanism to release the device tasks via the release_device_task function pointer. The device code will call this function to let the DSL decide how the device task release should be handled. Some DSLs (PTG and DTD as of now) will call OBJ_RELEASE on it (and the free is automatic), while others (TTG, for example) will have their own handling. Signed-off-by: George Bosilca --- parsec/arena.c | 2 +- parsec/data.c | 6 +- parsec/data_dist/matrix/broadcast.jdf | 2 +- parsec/data_internal.h | 2 +- parsec/interfaces/dtd/insert_function.c | 9 +- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 46 +++--- parsec/mca/device/device_gpu.c | 154 ++++++++++++--------- parsec/mca/device/device_gpu.h | 53 ++++--- parsec/mca/device/transfer_gpu.c | 8 +- tests/dsl/ptg/choice/choice_data.c | 2 +- tests/runtime/cuda/nvlink_wrapper.c | 2 +- tests/runtime/cuda/stage_custom.jdf | 11 +- 12 files changed, 165 insertions(+), 132 deletions(-) diff --git a/parsec/arena.c b/parsec/arena.c index 8d518d00a..336874a6b 100644 --- a/parsec/arena.c +++ b/parsec/arena.c @@ -223,7 +223,7 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy, assert(0 == (((ptrdiff_t)chunk->data) % arena->alignment)); assert((arena->elem_size + (ptrdiff_t)chunk->data) <= (size + (ptrdiff_t)chunk)); - data->nb_elts = count * arena->elem_size; + data->span = count * arena->elem_size; copy->flags = PARSEC_DATA_FLAG_ARENA | PARSEC_DATA_FLAG_PARSEC_OWNED | diff --git a/parsec/data.c b/parsec/data.c index 8dffaa027..eec68c285 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -63,7 +63,7 @@ static void parsec_data_construct(parsec_data_t* obj ) obj->owner_device = -1; obj->preferred_device = -1; obj->key = 0; - obj->nb_elts = 0; + obj->span = 0; for( uint32_t i = 0; i < parsec_nb_devices; obj->device_copies[i] = NULL, i++ ); obj->dc = NULL; @@ -503,7 +503,7 @@ parsec_data_create( parsec_data_t **holder, data->owner_device = 0; data->key = key; data->dc = desc; - data->nb_elts = size; + data->span = size; parsec_data_copy_attach(data, data_copy, 0); if( !parsec_atomic_cas_ptr(holder, NULL, data) ) { @@ -540,7 +540,7 @@ parsec_data_create_with_type( parsec_data_collection_t *desc, clone->owner_device = 0; clone->key = key; clone->dc = desc; - clone->nb_elts = size; + clone->span = size; parsec_data_copy_attach(clone, data_copy, 0); return clone; diff --git a/parsec/data_dist/matrix/broadcast.jdf b/parsec/data_dist/matrix/broadcast.jdf index b476834b7..3dabba924 100644 --- a/parsec/data_dist/matrix/broadcast.jdf +++ b/parsec/data_dist/matrix/broadcast.jdf @@ -59,7 +59,7 @@ static parsec_data_t* data_of(parsec_data_collection_t *desc, ...)
data->owner_device = 0; data->key = k; data->dc = (parsec_data_collection_t*)desc; - data->nb_elts = 1; + data->span = 1; parsec_data_copy_t* data_copy = (parsec_data_copy_t*)PARSEC_OBJ_NEW(parsec_data_copy_t); parsec_data_copy_attach(data, data_copy, 0); data_copy->device_private = NULL; diff --git a/parsec/data_internal.h b/parsec/data_internal.h index 49b3a3c7f..bc3a6b962 100644 --- a/parsec/data_internal.h +++ b/parsec/data_internal.h @@ -36,7 +36,7 @@ struct parsec_data_s { * which device this data should be modified RW when there * are multiple choices. -1 means no preference. */ struct parsec_data_collection_s* dc; - size_t nb_elts; /* size in bytes of the memory layout */ + size_t span; /* size in bytes of the memory layout */ struct parsec_data_copy_s *device_copies[]; /* this array allocated according to the number of devices * (parsec_nb_devices). It points to the most recent * version of the data. diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index 781e47763..510b7691c 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -2279,20 +2279,19 @@ static parsec_hook_return_t parsec_dtd_gpu_task_submit(parsec_execution_stream_t #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT) || defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) parsec_dtd_task_t *dtd_task = (parsec_dtd_task_t *)this_task; parsec_dtd_task_class_t *dtd_tc = (parsec_dtd_task_class_t*)this_task->task_class; - parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t *) calloc(1, sizeof(parsec_gpu_task_t)); - PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); - gpu_task->release_device_task = free; /* by default free the device task */ + parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t); gpu_task->ec = (parsec_task_t *) this_task; gpu_task->submit = dtd_tc->gpu_func_ptr; gpu_task->task_type = 0; gpu_task->last_data_check_epoch = -1; /* force at least one validation for the task */ gpu_task->pushout = 0; + gpu_task->nb_flows = dtd_tc->super.nb_flows; /* inherit the flows from the task class */ for(int i = 0; i < dtd_tc->super.nb_flows; i++) { parsec_dtd_flow_info_t *flow = FLOW_OF(dtd_task, i); if(flow->op_type & PARSEC_PUSHOUT) gpu_task->pushout |= 1<<i; - gpu_task->flow[i] = dtd_tc->super.in[i]; - gpu_task->flow_nb_elts[i] = this_task->data[i].data_in->original->nb_elts; + gpu_task->flow_info[i].flow = dtd_tc->super.in[i]; + gpu_task->flow_info[i].flow_span = this_task->data[i].data_in->original->span; } parsec_device_module_t *device = this_task->selected_device; diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 2524c65da..80592b58f 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -6809,9 +6809,7 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, " assert(NULL != dev);\n" " assert(PARSEC_DEV_IS_GPU(dev->type));\n" "\n" - " gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t));\n" - " PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);\n" - " gpu_task->release_device_task = free; /* by default free the device task */\n" + " gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t);\n" " gpu_task->ec = (parsec_task_t*)this_task;\n" " gpu_task->submit = &%s_kernel_submit_%s_%s;\n" " gpu_task->task_type = 0;\n" @@ -6820,32 +6818,25 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, /* Set up stage in/out callbacks */
jdf_find_property(body->properties, "stage_in", &stage_in_property); - jdf_find_property(body->properties, "stage_out", &stage_out_property); - - if(stage_in_property == NULL) { - coutput(" gpu_task->stage_in = parsec_default_gpu_stage_in;\n"); - }else{ - coutput(" gpu_task->stage_in = %s;\n", dump_expr((void**)stage_in_property->expr, &info)); - } + coutput(" gpu_task->stage_in = %s;\n", (NULL == stage_in_property) ? "parsec_default_gpu_stage_in" + : dump_expr((void **)stage_in_property->expr, &info)); - if(stage_out_property == NULL) { - coutput(" gpu_task->stage_out = parsec_default_gpu_stage_out;\n"); - }else{ - coutput(" gpu_task->stage_out = %s;\n", dump_expr((void**)stage_out_property->expr, &info)); - } + jdf_find_property(body->properties, "stage_out", &stage_out_property); + coutput(" gpu_task->stage_out = %s;\n", (NULL == stage_out_property) ? "parsec_default_gpu_stage_out" + : dump_expr((void **)stage_out_property->expr, &info)); /* Dump the dataflow */ coutput(" gpu_task->pushout = 0;\n"); for(fl = f->dataflow, di = 0; fl != NULL; fl = fl->next, di++) { - coutput(" gpu_task->flow[%d] = &%s;\n", + coutput(" gpu_task->flow_info[%d].flow = &%s;\n", di, JDF_OBJECT_ONAME( fl )); sprintf(sa->ptr, "%s.dc", fl->varname); jdf_find_property(body->properties, sa->ptr, &desc_property); - if(desc_property == NULL){ - coutput(" gpu_task->flow_dc[%d] = NULL;\n", di); + if(desc_property == NULL) { + coutput(" gpu_task->flow_info[%d].flow_dc = NULL;\n", di); }else{ - coutput(" gpu_task->flow_dc[%d] = (parsec_data_collection_t *)%s;\n", di, + coutput(" gpu_task->flow_info[%d].flow_dc = (parsec_data_collection_t *)%s;\n", di, dump_expr((void**)desc_property->expr, &info)); } @@ -6853,22 +6844,22 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, jdf_find_property(body->properties, sa->ptr, &size_property); if(fl->flow_flags & JDF_FLOW_TYPE_CTL) { - if(size_property != NULL){ + if(size_property != NULL) { fprintf(stderr, "Error: specifying GPU buffer size for CTL flow %s at line %d\n", fl->varname, JDF_OBJECT_LINENO(fl)); exit(-1); } - coutput(" gpu_task->flow_nb_elts[%d] = 0;\n", di); - }else{ + coutput(" gpu_task->flow_info[%d].flow_span = 0;\n", di); + } else { coutput(" // A shortcut to check if the flow exists\n"); coutput(" if (gpu_task->ec->data[%d].data_in != NULL) {\n", di); if(size_property == NULL){ - coutput(" gpu_task->flow_nb_elts[%d] = gpu_task->ec->data[%d].data_in->original->nb_elts;\n", di, di); - }else{ - coutput(" gpu_task->flow_nb_elts[%d] = %s;\n", - di, dump_expr((void**)size_property->expr, &info)); + coutput(" gpu_task->flow_info[%d].flow_span = gpu_task->ec->data[%d].data_in->original->span;\n", di, di); + } else { + coutput(" gpu_task->flow_info[%d].flow_span = %s;\n", + di, dump_expr((void **)size_property->expr, &info)); if( (stage_in_property == NULL) || ( stage_out_property == NULL )){ - coutput(" assert(gpu_task->ec->data[%d].data_in->original->nb_elts <= %s);\n", + coutput(" assert(gpu_task->ec->data[%d].data_in->original->span <= %s);\n", di, dump_expr((void**)size_property->expr, &info)); } @@ -6936,6 +6927,7 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, } } string_arena_free(info.sa); + coutput(" gpu_task->nb_flows = %d; /* injerit the flows from the task_class */\n", di); coutput("\n" " return dev->kernel_scheduler(dev, es, gpu_task);\n" diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 3f253098b..f0c306a24 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -39,22 +39,51 
@@ static int parsec_gpu_profiling_initiated = 0; int parsec_gpu_output_stream = -1; int parsec_gpu_verbosity; +/** + * This is a special function to release standard device tasks instead of calling + * PARSEC_OBJ_RELEASE on them. If we use the PARSEC_OBJ_RELEASE route the memory pointed + * to by the task will be freed. In some cases, we don't want that to happen, but we still + * want to inform the DSL that the device is done with the task. + */ +static void parsec_device_release_gpu_task(parsec_gpu_task_t *gpu_task) +{ + PARSEC_OBJ_RELEASE(gpu_task); +} + +static void parsec_device_task_t_constructor(parsec_gpu_task_t *gpu_task) +{ + gpu_task->nb_flows = 0; + gpu_task->flow_info = NULL; + /* Default release mechanism, can be replaced by the DSL */ + gpu_task->release_device_task = parsec_device_release_gpu_task; +} +PARSEC_OBJ_CLASS_INSTANCE(parsec_gpu_task_t, parsec_list_item_t, + parsec_device_task_t_constructor, NULL); + +static void parsec_device_dsl_task_t_constructor(parsec_gpu_dsl_task_t *gpu_dsl_task) +{ + gpu_dsl_task->super.flow_info = gpu_dsl_task->flows; +} + +PARSEC_OBJ_CLASS_INSTANCE(parsec_gpu_dsl_task_t, parsec_gpu_task_t, + parsec_device_dsl_task_t_constructor, NULL); + static inline int parsec_device_check_space_needed(parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *gpu_task) { - int i; int space_needed = 0; parsec_task_t *this_task = gpu_task->ec; parsec_data_t *original; parsec_data_copy_t *data; const parsec_flow_t *flow; - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + /* would have been this_task->task_class->nb_flows for classical DSL */ + for( uint32_t i = 0; i < gpu_task->nb_flows; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; - flow = gpu_task->flow[i]; + flow = gpu_task->flow_info[i].flow; if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; data = this_task->data[i].data_in; @@ -466,15 +495,13 @@ parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int gpu_device->super.device_index, gpu_device->super.name, __func__, __LINE__); return PARSEC_ERROR; } - parsec_gpu_task_t* gpu_task = NULL; - gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t)); + parsec_gpu_task_t* gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t); gpu_task->task_type = PARSEC_GPU_TASK_TYPE_PREFETCH; - gpu_task->release_device_task = free; /* by default free the device task */ gpu_task->ec = calloc(1, sizeof(parsec_task_t)); PARSEC_OBJ_CONSTRUCT(gpu_task->ec, parsec_task_t); gpu_task->ec->task_class = &parsec_device_data_prefetch_tc; - gpu_task->flow[0] = &parsec_device_data_prefetch_flow; - gpu_task->flow_nb_elts[0] = data->device_copies[ data->owner_device ]->original->nb_elts; + gpu_task->flow_info[0].flow = &parsec_device_data_prefetch_flow; + gpu_task->flow_info[0].flow_span = data->device_copies[ data->owner_device ]->original->span; gpu_task->stage_in = parsec_default_gpu_stage_in; gpu_task->stage_out = parsec_default_gpu_stage_out; PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p [ref_count %d]", @@ -844,8 +871,9 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, parsec_gpu_data_copy_t* temp_loc[MAX_PARAM_COUNT], *gpu_elem, *lru_gpu_elem; parsec_data_t* master, *oldmaster; const parsec_flow_t *flow; - int i, j, data_avail_epoch = 0, copy_readers_update = 0; + int data_avail_epoch = 0, copy_readers_update = 0; parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL; + uint32_t i, j; #if
defined(PARSEC_DEBUG_NOISIER) char task_name[MAX_TASK_STRLEN]; @@ -858,8 +886,8 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, * Parse all the input and output flows of data and ensure all have * corresponding data on the GPU available. */ - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { - flow = gpu_task->flow[i]; + for (i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++) { + flow = gpu_task->flow_info[i].flow; assert( flow && (flow->flow_index == i) ); /* Skip CTL flows only */ @@ -900,12 +928,12 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%d:%s]:%s: Allocate GPU copy %p sz %zu [ref_count %d] for data %p", gpu_device->super.device_index, gpu_device->super.name, task_name, - gpu_elem, gpu_task->flow_nb_elts[i], gpu_elem->super.super.obj_reference_count, master); + gpu_elem, gpu_task->flow_info[i].flow_span, gpu_elem->super.super.obj_reference_count, master); gpu_elem->flags = PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED; malloc_data: copy_readers_update = 0; assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); - gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]); + gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_info[i].flow_span); if( NULL == gpu_elem->device_private ) { #endif @@ -922,7 +950,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%d:%s]:%s:\tRequest space on GPU failed for flow %s index %d/%d for task %s", gpu_device->super.device_index, gpu_device->super.name, task_name, - flow->name, i, this_task->task_class->nb_flows, task_name ); + flow->name, i, gpu_task->nb_flows, task_name ); #endif /* defined(PARSEC_DEBUG_NOISIER) */ for( j = 0; j <= i; j++ ) { /* This flow could be a control flow */ @@ -1110,7 +1138,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling, parsec_gpu_allocate_memory_key, (int64_t)gpu_elem->device_private, gpu_device->super.device_index, - &gpu_task->flow_nb_elts[i], PARSEC_PROFILING_EVENT_COUNTER|PARSEC_PROFILING_EVENT_HAS_INFO); + &gpu_task->flow_info[i].flow_span, PARSEC_PROFILING_EVENT_COUNTER|PARSEC_PROFILING_EVENT_HAS_INFO); } #endif #else @@ -1175,9 +1203,9 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, size_t count; parsec_device_transfer_direction_t dir; - for(int i = 0; i < task->task_class->nb_flows; i++) { + for(uint32_t i = 0; i < gtask->nb_flows /* not task->task_class->nb_flows */; i++) { if( !(flow_mask & (1U << i)) ) continue; - source = gtask->sources[i]; + source = gtask->flow_info[i].source; dest = task->data[i].data_out; src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); @@ -1189,8 +1217,8 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, dir = parsec_device_gpu_transfer_direction_h2d; } - count = (source->original->nb_elts <= dest->original->nb_elts) ? - source->original->nb_elts : dest->original->nb_elts; + count = (source->original->span <= dest->original->span) ? 
+ source->original->span : dest->original->span; ret = dst_dev->memcpy_async( dst_dev, gpu_stream, dest->device_private, source->device_private, @@ -1223,16 +1251,16 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, parsec_task_t *task = gtask->ec; size_t count; parsec_device_transfer_direction_t dir; - int i; - for(i = 0; i < task->task_class->nb_flows; i++){ + + for(uint32_t i = 0; i < gtask->nb_flows /* not task->task_class->nb_flows */; i++){ if(flow_mask & (1U << i)){ source = task->data[i].data_out; dest = source->original->device_copies[0]; dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); - count = (source->original->nb_elts <= dest->original->nb_elts) ? source->original->nb_elts : - dest->original->nb_elts; + count = (source->original->span <= dest->original->span) ? source->original->span : + dest->original->span; if( src_dev->super.type == dst_dev->super.type ) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; @@ -1271,7 +1299,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t *candidate = task_data->data_in; /* best candidate for now */ parsec_data_t* original = candidate->original; parsec_gpu_data_copy_t* gpu_elem = task_data->data_out; - size_t nb_elts = gpu_task->flow_nb_elts[flow->flow_index]; + size_t span = gpu_task->flow_info[flow->flow_index].flow_span; int transfer_from = -1; if( gpu_task->task_type == PARSEC_GPU_TASK_TYPE_PREFETCH ) { @@ -1282,7 +1310,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, parsec_atomic_lock( &original->lock ); - gpu_task->sources[flow->flow_index] = candidate; /* default source for the transfer */ + gpu_task->flow_info[flow->flow_index].source = candidate; /* default source for the transfer */ /** * If the data will be accessed in write mode, remove it from any GPU data management * lists until the task is completed. @@ -1316,7 +1344,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, transfer_from = -1; /* Update the transferred required_data_in size */ - gpu_device->super.required_data_in += original->nb_elts; + gpu_device->super.required_data_in += original->span; if( -1 == transfer_from ) { /* Do not need to be transferred */ gpu_elem->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; @@ -1341,7 +1369,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]:\t\tMove data copy %p [ref_count %d, key %x] of %zu bytes: data copy is already under transfer, ignoring double request", gpu_device->super.device_index, gpu_device->super.name, - gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, nb_elts); + gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, span); parsec_atomic_unlock( &original->lock ); return 1; /* positive returns have special meaning and are used for optimizations */ } @@ -1446,7 +1474,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, "GPU[%d:%s]:\t\tMove %s data copy %p [ref_count %d, key %x] of %zu bytes\t(src dev: %d, v:%d, ptr:%p, copy:%p [ref_count %d, under_transfer: %d, coherency_state: %d] / dst dev: %d, v:%d, ptr:%p)", gpu_device->super.device_index, gpu_device->super.name, PARSEC_DEV_IS_GPU(candidate_dev->super.type) ? 
"D2D": "H2D", - gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, nb_elts, + gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, span, candidate_dev->super.device_index, candidate->version, (void*)candidate->device_private, candidate, candidate->super.super.obj_reference_count, candidate->data_transfer_status, candidate->coherency_state, gpu_device->super.device_index, gpu_elem->version, (void*)gpu_elem->device_private); @@ -1484,7 +1512,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, } if(gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) { parsec_device_gpu_memory_prof_info_t _info; - _info.size = (uint64_t)nb_elts; + _info.size = (uint64_t)span; _info.data_key = gpu_elem->original->key; _info.dc_id = (uint64_t)(gpu_elem->original->dc); parsec_profiling_trace_flags(gpu_stream->profiling, @@ -1495,7 +1523,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, } } #endif - gpu_task->sources[flow->flow_index] = candidate; /* save the candidate for release on transfer completion */ + gpu_task->flow_info[flow->flow_index].source = candidate; /* save the candidate for release on transfer completion */ /* Push data into the GPU from the source device */ int rc = gpu_task->stage_in ? gpu_task->stage_in(gpu_task, (1U << flow->flow_index), gpu_stream): PARSEC_SUCCESS; if(PARSEC_SUCCESS != rc) { @@ -1504,15 +1532,15 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, gpu_device->super.device_index, gpu_device->super.name, rc, __func__, __LINE__, candidate->device_private, candidate_dev->super.device_index, candidate_dev->super.name, gpu_elem->device_private, gpu_device->super.device_index, gpu_device->super.name, - nb_elts, (candidate_dev->super.type != gpu_device->super.type)? "H2D": "D2D"); + span, (candidate_dev->super.type != gpu_device->super.type)? "H2D": "D2D"); parsec_atomic_unlock( &original->lock ); assert(0); return PARSEC_HOOK_RETURN_ERROR; } assert(candidate_dev->super.device_index < gpu_device->super.data_in_array_size); - gpu_device->super.data_in_from_device[candidate_dev->super.device_index] += nb_elts; + gpu_device->super.data_in_from_device[candidate_dev->super.device_index] += span; if( PARSEC_GPU_TASK_TYPE_KERNEL == gpu_task->task_type ) - gpu_device->super.nb_data_faults += nb_elts; + gpu_device->super.nb_data_faults += span; /* We assign the version of the data preemptively (i.e. 
before the task is executing) * For read-only data, the GPU copy will get the same version as the source @@ -1589,15 +1617,13 @@ parsec_device_send_transfercomplete_cmd_to_device(parsec_data_copy_t *copy, parsec_device_module_t *current_dev, parsec_device_module_t *dst_dev) { - parsec_gpu_task_t* gpu_task = NULL; - gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t)); + parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t *)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t); gpu_task->task_type = PARSEC_GPU_TASK_TYPE_D2D_COMPLETE; - gpu_task->release_device_task = free; /* by default free the device task */ gpu_task->ec = calloc(1, sizeof(parsec_task_t)); PARSEC_OBJ_CONSTRUCT(gpu_task->ec, parsec_task_t); gpu_task->ec->task_class = &parsec_device_d2d_complete_tc; - gpu_task->flow[0] = &parsec_device_d2d_complete_flow; - gpu_task->flow_nb_elts[0] = copy->original->nb_elts; + gpu_task->flow_info[0].flow = &parsec_device_d2d_complete_flow; + gpu_task->flow_info[0].flow_span = copy->original->span; gpu_task->stage_in = parsec_default_gpu_stage_in; gpu_task->stage_out = parsec_default_gpu_stage_out; gpu_task->ec->data[0].data_in = copy; /* We need to set not-null in data_in, so that the fake flow is @@ -1627,7 +1653,7 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *gtask = *gpu_task; parsec_task_t *task; - int32_t i; + uint32_t i; #if defined(PARSEC_DEBUG_NOISIER) char task_str[MAX_TASK_STRLEN]; #endif @@ -1644,14 +1670,15 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, "GPU[%d:%s]: parsec_device_callback_complete_push, PUSH of %s", gpu_device->super.device_index, gpu_device->super.name, parsec_task_snprintf(task_str, MAX_TASK_STRLEN, task)); - for( i = 0; i < task->task_class->nb_flows; i++ ) { + for (i = 0; i < gtask->nb_flows /* not task->task_class->nb_flows */; i++) + { /* Make sure data_in is not NULL */ if( NULL == task->data[i].data_in ) continue; /* We also don't push back non-parsec-owned copies */ if(NULL != task->data[i].data_out && 0 == (task->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED)) continue; - flow = gtask->flow[i]; + flow = gtask->flow_info[i].flow; assert( flow ); assert( flow->flow_index == i ); if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; @@ -1672,7 +1699,7 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, } #endif parsec_atomic_unlock(&task->data[i].data_out->original->lock); - parsec_data_copy_t* source = gtask->sources[i]; + parsec_data_copy_t *source = gtask->flow_info[i].source; parsec_device_gpu_module_t *src_device = (parsec_device_gpu_module_t*)parsec_mca_device_get( source->device_index ); if( PARSEC_DEV_IS_GPU(src_device->super.type) ) { @@ -1953,7 +1980,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, { parsec_task_t *this_task = gpu_task->ec; const parsec_flow_t *flow; - int i, ret = 0; + int ret = 0; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; #endif @@ -1997,9 +2024,9 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, return ret; } - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { - flow = gpu_task->flow[i]; + flow = gpu_task->flow_info[i].flow; /* Skip CTL flows */ if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; @@ -2074,11 +2101,11 @@ parsec_device_kernel_exec( parsec_device_gpu_module_t 
*gpu_device, #if defined(PARSEC_DEBUG_PARANOID) const parsec_flow_t *flow; - for( int i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( int i = 0; i < gpu_task->nb_flows /* this_task->task_class->nb_flows */; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; - flow = gpu_task->flow[i]; + flow = gpu_task->flow_info[i].flow; if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; if( 0 == (this_task->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; assert(this_task->data[i].data_out->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER); @@ -2104,15 +2131,15 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, parsec_task_t *this_task = gpu_task->ec; parsec_gpu_data_copy_t *gpu_copy; parsec_data_t *original; - size_t nb_elts; + size_t span; const parsec_flow_t *flow; - int return_code = 0, rc, how_many = 0, i, update_data_epoch = 0; + int return_code = 0, rc, how_many = 0, update_data_epoch = 0; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; #endif if (gpu_task->task_type == PARSEC_GPU_TASK_TYPE_D2HTRANSFER) { - for( i = 0; i < this_task->locals[0].value; i++ ) { + for( uint32_t i = 0; i < (uint32_t)this_task->locals[0].value; i++ ) { gpu_copy = this_task->data[i].data_out; /* If the gpu copy is not owned by parsec, we don't manage it at all */ if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; @@ -2136,13 +2163,13 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, gpu_device->super.device_index, gpu_device->super.name, parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { /* We need to manage all data that has been used as input, even if they were read only */ /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; - flow = gpu_task->flow[i]; + flow = gpu_task->flow_info[i].flow; if( PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags) ) continue; /* control flow */ gpu_copy = this_task->data[i].data_out; @@ -2151,7 +2178,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; original = gpu_copy->original; - nb_elts = gpu_task->flow_nb_elts[i]; + span = gpu_task->flow_info[i].flow_span; assert( this_task->data[i].data_in == NULL || original == this_task->data[i].data_in->original ); @@ -2201,7 +2228,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->super.super.obj_reference_count, flow->name); /* Stage the transfer of the data back to main memory */ - gpu_device->super.required_data_out += nb_elts; + gpu_device->super.required_data_out += span; assert( ((parsec_list_item_t*)gpu_copy)->list_next == (parsec_list_item_t*)gpu_copy ); assert( ((parsec_list_item_t*)gpu_copy)->list_prev == (parsec_list_item_t*)gpu_copy ); @@ -2249,7 +2276,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, parsec_atomic_unlock(&original->lock); goto release_and_return_error; } - gpu_device->super.data_out_to_host += nb_elts; /* TODO: not hardcoded, use datatype size */ + gpu_device->super.data_out_to_host += span; /* TODO: not hardcoded, use datatype size */ how_many++; } else { assert( 0 == gpu_copy->readers ); @@ -2280,7 +2307,6 @@ 
parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, parsec_task_t *this_task = gpu_task->ec; parsec_gpu_data_copy_t *gpu_copy, *cpu_copy; parsec_data_t *original; - int i; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; @@ -2290,7 +2316,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); #endif - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; @@ -2298,7 +2324,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, if(NULL == this_task->data[i].data_out) continue; - if( !(gpu_task->flow[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { + if( !(gpu_task->flow_info[i].flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { /* Warning data_out for read only flows has been overwritten in pop */ continue; } @@ -2383,7 +2409,7 @@ parsec_device_kernel_cleanout( parsec_device_gpu_module_t *gpu_device, parsec_task_t *this_task = gpu_task->ec; parsec_gpu_data_copy_t *gpu_copy, *cpu_copy; parsec_data_t *original; - int i, data_avail_epoch = 0; + int data_avail_epoch = 0; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; @@ -2393,13 +2419,13 @@ parsec_device_kernel_cleanout( parsec_device_gpu_module_t *gpu_device, parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); #endif - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; /* Don't bother if there is no real data (aka. CTL or no output) */ if(NULL == this_task->data[i].data_out) continue; - if( !(gpu_task->flow[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { + if( !(gpu_task->flow_info[i].flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { /* Warning data_out for read only flows has been overwritten in pop */ continue; } @@ -2643,9 +2669,9 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: gpu_task %p freed", gpu_device->super.device_index, gpu_device->super.name, gpu_task); - if (NULL != gpu_task->release_device_task) { - gpu_task->release_device_task(gpu_task); - } + /* Release the GPU task */ + gpu_task->release_device_task(gpu_task); + rc = parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) ); if( 1 == rc ) { /* I was the last one */ #if defined(PARSEC_PROF_TRACE) diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index 453008b07..1084ff7fb 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -81,7 +81,25 @@ typedef int (parsec_stage_out_function_t)(parsec_gpu_task_t *gtask, * and this function allows the device engine to delegate the release of such tasks back into * the DSL. Once this task called, the device task should not be accessed by the device. */ -typedef void (*parsec_release_device_task_function_t)(void*); +typedef void (*parsec_release_device_task_function_t)(parsec_gpu_task_t*); +typedef struct parsec_gpu_flow_info_s { + const parsec_flow_t *flow; /* Some DSLs might not have a task_class, but they still need to provide a flow. */ + size_t flow_span; /* the span of the data on the device.
For contiguous layout this is equal to the + * size of the data, for all the other copies this should be the amount of memory + * needed on the device. + */ + parsec_data_collection_t *flow_dc; /* the data collection from which the data originates. When the data copy is local, the data + * collection can be accessed via the data_t, but for all copies coming from the network there + * is no known data collection. Thus, for such cases the DSL needs to provide a reference to the + * local data collection to be used for all transfers. This data collection is needed to get + * access to the mtype, to get the memory layout of the copy. + */ + /* This is private to the device code and should not be used outside the device driver */ + parsec_data_copy_t *source; /* If the driver decides to acquire the data from a different + * source, it will temporarily store the best candidate here. + */ + +} parsec_gpu_flow_info_t; struct parsec_gpu_task_s { parsec_list_item_t list_item; @@ -102,23 +120,8 @@ struct parsec_gpu_task_s { struct { parsec_task_t *ec; uint64_t last_data_check_epoch; - const parsec_flow_t *flow[MAX_PARAM_COUNT]; /* There is no consistent way to access the flows from the task_class, - * so the DSL need to provide these flows here. - */ - size_t flow_nb_elts[MAX_PARAM_COUNT]; /* for each flow, size of the data to be allocated - * on the GPU. - */ - parsec_data_collection_t *flow_dc[MAX_PARAM_COUNT]; /* for each flow, data collection from which the data - * to be transferred logically belongs to. - * This gives the user the chance to indicate on the JDF - * a data collection to inspect during GPU transfer. - * User may want info from the DC (e.g. mtype), - * & otherwise remote copies don't have any info. - */ - /* These are private and should not be used outside the device driver */ - parsec_data_copy_t *sources[MAX_PARAM_COUNT]; /* If the driver decides to acquire the data from a different - * source, it will temporary store the best candidate here. - */ + uint32_t nb_flows; + parsec_gpu_flow_info_t *flow_info; }; struct { parsec_data_copy_t *copy; @@ -126,6 +129,20 @@ struct parsec_gpu_task_s { }; }; +PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_gpu_task_t); + +/** + * Specialized GPU tasks for the PTG and DTD DSL. The maximum number of flows being MAX_PARAM_COUNT, we + * can make the gpu_flow_info array part of the struct, to allocate the gpu_task as a single, + * contiguous block of memory.
+ */ +typedef struct parsec_gpu_dsl_task_s { + parsec_gpu_task_t super; + parsec_gpu_flow_info_t flows[MAX_PARAM_COUNT]; /* All the flow info necessary for the PTG and DTD DSL */ +} parsec_gpu_dsl_task_t; + +PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_gpu_dsl_task_t); + typedef enum parsec_device_transfer_direction_e { parsec_device_gpu_transfer_direction_h2d, parsec_device_gpu_transfer_direction_d2h, diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c index b0aa314ab..169ae651a 100644 --- a/parsec/mca/device/transfer_gpu.c +++ b/parsec/mca/device/transfer_gpu.c @@ -276,9 +276,7 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, d2h_task->taskpool = NULL; d2h_task->locals[0].value = nb_cleaned; - w2r_task = (parsec_gpu_task_t *)malloc(sizeof(parsec_gpu_task_t)); - PARSEC_OBJ_CONSTRUCT(w2r_task, parsec_list_item_t); - w2r_task->release_device_task = free; /* by default free the device task */ + w2r_task = (parsec_gpu_task_t *)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t); w2r_task->ec = (parsec_task_t*)d2h_task; w2r_task->task_type = PARSEC_GPU_TASK_TYPE_D2HTRANSFER; w2r_task->last_data_check_epoch = gpu_device->data_avail_epoch - 1; @@ -309,7 +307,7 @@ int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_atomic_lock(&gpu_copy->original->lock); gpu_copy->readers--; gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; - gpu_device->super.data_out_to_host += gpu_copy->original->nb_elts; /* TODO: not hardcoded, use datatype size */ + gpu_device->super.data_out_to_host += gpu_copy->original->span; /* TODO: not hardcoded, use datatype size */ assert(gpu_copy->readers >= 0); original = gpu_copy->original; @@ -338,7 +336,7 @@ int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_atomic_unlock(&gpu_copy->original->lock); } parsec_thread_mempool_free(es->context_mempool, task); - free(gpu_task); + PARSEC_OBJ_RELEASE(gpu_task); /* no need to call release_device_task, just release the task */ gpu_device->data_avail_epoch++; return 0; } diff --git a/tests/dsl/ptg/choice/choice_data.c b/tests/dsl/ptg/choice/choice_data.c index f903d1a87..5599d4d89 100644 --- a/tests/dsl/ptg/choice/choice_data.c +++ b/tests/dsl/ptg/choice/choice_data.c @@ -50,7 +50,7 @@ get_or_create_data(my_datatype_t* dat, uint32_t pos) data->owner_device = 0; data->key = pos; - data->nb_elts = 1; + data->span = 1; data->device_copies[0] = data_copy; if( !parsec_atomic_cas_ptr(&dat->data_map[pos], NULL, data) ) { diff --git a/tests/runtime/cuda/nvlink_wrapper.c b/tests/runtime/cuda/nvlink_wrapper.c index abc4b19c9..a641ed053 100644 --- a/tests/runtime/cuda/nvlink_wrapper.c +++ b/tests/runtime/cuda/nvlink_wrapper.c @@ -189,7 +189,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb /* And copy the tile from CPU to GPU */ status = (cudaError_t)cudaMemcpy( gpu_copy->device_private, cpu_copy->device_private, - dta->nb_elts, + dta->span, cudaMemcpyHostToDevice ); PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaMemcpy", status, {return NULL;} ); g++; diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf index 7df99800f..0878746c6 100644 --- a/tests/runtime/cuda/stage_custom.jdf +++ b/tests/runtime/cuda/stage_custom.jdf @@ -37,12 +37,13 @@ stage_stride_in(parsec_gpu_task_t *gtask, parsec_device_gpu_module_t *in_elem_dev; parsec_tiled_matrix_t * dc; int elem_sz; - int i; - for(i = 0; i < task->task_class->nb_flows; i++){ + + assert(gtask->nb_flows == 
task->task_class->nb_flows); + for(uint32_t i = 0; i < gtask->nb_flows; i++){ if(flow_mask & (1U << i)){ copy_in = task->data[i].data_in; copy_out = task->data[i].data_out; - dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i]; + dc = (parsec_tiled_matrix_t*)gtask->flow_info[i].flow_dc; elem_sz = parsec_datadist_getsizeoftype(dc->mtype); in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index); if(in_elem_dev->super.type != PARSEC_DEV_CUDA ){ @@ -62,7 +63,7 @@ stage_stride_in(parsec_gpu_task_t *gtask, }else{ ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, copy_in->device_private, - copy_in->original->nb_elts, + copy_in->original->span, cudaMemcpyDeviceToDevice, cuda_stream->cuda_stream ); PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync", ret, { return PARSEC_ERROR; } ); @@ -89,7 +90,7 @@ stage_stride_out(parsec_gpu_task_t *gtask, if(flow_mask & (1U << i)){ copy_in = task->data[i].data_out; copy_out = copy_in->original->device_copies[0]; - dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i]; + dc = (parsec_tiled_matrix_t*)gtask->flow_info[i].flow_dc; elem_sz = parsec_datadist_getsizeoftype(dc->mtype); /* copy width bytes heigth times, skipping pitch - width bytes every time */ size_t dpitch = dc->llm * elem_sz; From 9d744ae7ef9d3a53c9dad954b4c44c5cc0b40714 Mon Sep 17 00:00:00 2001 From: bosilca Date: Thu, 12 Dec 2024 14:13:01 -0500 Subject: [PATCH 2/4] Apply suggestions from code review Co-authored-by: Aurelien Bouteiller --- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 2 +- parsec/mca/device/device_gpu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 80592b58f..d2f86f419 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -6927,7 +6927,7 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, } } string_arena_free(info.sa); - coutput(" gpu_task->nb_flows = %d; /* injerit the flows from the task_class */\n", di); + coutput(" gpu_task->nb_flows = %d; /* inherit the flows from the task_class */\n", di); coutput("\n" " return dev->kernel_scheduler(dev, es, gpu_task);\n" diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index f0c306a24..42c01b85c 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -2139,7 +2139,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, #endif if (gpu_task->task_type == PARSEC_GPU_TASK_TYPE_D2HTRANSFER) { - for( uint32_t i = 0; i < (uint32_t)this_task->locals[0].value; i++ ) { + for( int i = 0; i < this_task->locals[0].value; i++ ) { gpu_copy = this_task->data[i].data_out; /* If the gpu copy is not owned by parsec, we don't manage it at all */ if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; From c5abdbf3309fbca07f72af4fd4619319ea813cc8 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 2 Jan 2025 14:22:22 -0500 Subject: [PATCH 3/4] gpu_task_t must be zeroed during the constructor --- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 4 ++-- parsec/mca/device/device_gpu.c | 20 +++++++++++++++++--- parsec/mca/device/device_gpu.h | 3 ++- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index d2f86f419..241e6e400 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2009-2023 The 
University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. @@ -6812,7 +6812,7 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, " gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t);\n" " gpu_task->ec = (parsec_task_t*)this_task;\n" " gpu_task->submit = &%s_kernel_submit_%s_%s;\n" - " gpu_task->task_type = 0;\n" + " gpu_task->task_type = PARSEC_GPU_TASK_TYPE_KERNEL;\n" " gpu_task->last_data_check_epoch = -1; /* force at least one validation for the task */\n", dev_lower, jdf_basename, f->fname); /* Set up stage in/out callbacks */ diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 42c01b85c..bb348ca0b 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -1,6 +1,5 @@ /* - * - * Copyright (c) 2021-2022 The University of Tennessee and The University + * Copyright (c) 2021-2025 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. @@ -52,6 +51,21 @@ static void parsec_device_release_gpu_task(parsec_gpu_task_t *gpu_task) static void parsec_device_task_t_constructor(parsec_gpu_task_t *gpu_task) { + gpu_task->task_type = PARSEC_GPU_TASK_TYPE_INVALID; /* needs to be set later */ + gpu_task->pushout = 0; + gpu_task->last_status = 0; + gpu_task->submit = NULL; + gpu_task->complete_stage = NULL; + gpu_task->stage_in = NULL; + gpu_task->stage_out = NULL; + gpu_task->release_device_task = NULL; +#if defined(PARSEC_PROF_TRACE) + gpu_task->prof_key_end = 0; + gpu_task->prof_event_id = 0; + gpu_task->prof_tp_id = 0; +#endif + gpu_task->ec = NULL; + gpu_task->last_data_check_epoch = 0; gpu_task->nb_flows = 0; gpu_task->flow_info = NULL; /* Default release mechanism, can be replaced by the DSL */ @@ -2101,7 +2115,7 @@ parsec_device_kernel_exec( parsec_device_gpu_module_t *gpu_device, #if defined(PARSEC_DEBUG_PARANOID) const parsec_flow_t *flow; - for( int i = 0; i < gpu_task->nb_flows /* this_task->task_class->nb_flows */; i++ ) { + for( uint i = 0; i < gpu_task->nb_flows /* this_task->task_class->nb_flows */; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index 1084ff7fb..167b16c57 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 The University of Tennessee and The University + * Copyright (c) 2021-2025 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
@@ -335,6 +335,7 @@ char *parsec_device_describe_gpu_task( char *tmp, size_t len, parsec_gpu_task_t #define PARSEC_GPU_TASK_TYPE_PREFETCH 0x2000 #define PARSEC_GPU_TASK_TYPE_WARMUP 0x4000 #define PARSEC_GPU_TASK_TYPE_D2D_COMPLETE 0x8000 +#define PARSEC_GPU_TASK_TYPE_INVALID 0xf000 #if defined(PARSEC_PROF_TRACE) #define PARSEC_PROFILE_GPU_TRACK_DATA_IN 0x0001 From de821b84dfd0ee707ed41e54aef17075faa56d73 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Thu, 6 Mar 2025 11:13:55 -0500 Subject: [PATCH 4/4] Initialize last_data_check_epoch to UINT64_MAX during new gpu-task constructor --- parsec/interfaces/dtd/insert_function.c | 1 - parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 3 +-- parsec/mca/device/device_gpu.c | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index 6fec3f3fa..d0c6ae5cb 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -2284,7 +2284,6 @@ static parsec_hook_return_t parsec_dtd_gpu_task_submit(parsec_execution_stream_t gpu_task->ec = (parsec_task_t *) this_task; gpu_task->submit = dtd_tc->gpu_func_ptr; gpu_task->task_type = 0; - gpu_task->last_data_check_epoch = -1; /* force at least one validation for the task */ gpu_task->pushout = 0; gpu_task->nb_flows = dtd_tc->super.nb_flows; /* inherit the flows from the task class */ for(int i = 0; i < dtd_tc->super.nb_flows; i++) { diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 241e6e400..ee41b2ea6 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -6812,8 +6812,7 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, " gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t);\n" " gpu_task->ec = (parsec_task_t*)this_task;\n" " gpu_task->submit = &%s_kernel_submit_%s_%s;\n" - " gpu_task->task_type = PARSEC_GPU_TASK_TYPE_KERNEL;\n" - " gpu_task->last_data_check_epoch = -1; /* force at least one validation for the task */\n", + " gpu_task->task_type = PARSEC_GPU_TASK_TYPE_KERNEL;\n", dev_lower, jdf_basename, f->fname); /* Set up stage in/out callbacks */ diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 4fc3ffbd1..d3127f261 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -65,7 +65,7 @@ static void parsec_device_task_t_constructor(parsec_gpu_task_t *gpu_task) gpu_task->prof_tp_id = 0; #endif gpu_task->ec = NULL; - gpu_task->last_data_check_epoch = 0; + gpu_task->last_data_check_epoch = UINT64_MAX; /* force at least one validation for the task */ gpu_task->nb_flows = 0; gpu_task->flow_info = NULL; /* Default release mechanism, can be replaced by the DSL */
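Note for reviewers: below is a minimal sketch of the usage pattern this series enables for a DSL that keeps ownership of its device tasks (the TTG-style case mentioned in the first commit message). All my_dsl_* names are hypothetical and for illustration only; the parsec_* symbols are the ones introduced or modified by this series, and the sketch assumes the post-series definitions (field-initializing constructor, release_device_task preset to the PARSEC_OBJ_RELEASE-based default).

/* Illustrative sketch only: my_dsl_* is a hypothetical DSL, not PaRSEC code. */
#include "parsec/mca/device/device_gpu.h"

/* A DSL-specific device task derived from parsec_gpu_task_t, mirroring
 * parsec_gpu_dsl_task_t but carrying extra DSL-side state. */
typedef struct my_dsl_device_task_s {
    parsec_gpu_task_t      super;                       /* must come first */
    parsec_gpu_flow_info_t flow_info[MAX_PARAM_COUNT];  /* DSL-owned flow storage */
    volatile int32_t       completed;                   /* DSL-side completion flag */
} my_dsl_device_task_t;

/* Called by the device engine once it is done with the task. Unlike the
 * default, this does not PARSEC_OBJ_RELEASE (and thereby free) the task:
 * the DSL keeps ownership and only records the completion. */
static void my_dsl_release_device_task(parsec_gpu_task_t *gpu_task)
{
    my_dsl_device_task_t *task = (my_dsl_device_task_t *)gpu_task;
    task->completed = 1;
}

static void my_dsl_init_device_task(my_dsl_device_task_t *task,
                                    parsec_task_t *ec, uint32_t nb_flows)
{
    /* Run the parsec_gpu_task_t constructor: it initializes every field to a
     * safe default and installs the default OBJ_RELEASE-based release function. */
    PARSEC_OBJ_CONSTRUCT(&task->super, parsec_gpu_task_t);
    task->super.ec        = ec;
    task->super.task_type = PARSEC_GPU_TASK_TYPE_KERNEL;
    task->super.stage_in  = parsec_default_gpu_stage_in;
    task->super.stage_out = parsec_default_gpu_stage_out;
    task->super.nb_flows  = nb_flows;
    task->super.flow_info = task->flow_info;  /* point the engine at DSL-owned flow info */
    task->super.release_device_task = my_dsl_release_device_task; /* override the default */
    task->completed = 0;
    /* The caller still fills flow_info[i].flow and flow_info[i].flow_span for
     * each flow, exactly as the generated PTG hook and the DTD submit path do. */
}

Since parsec_device_kernel_scheduler() now calls release_device_task unconditionally, a DSL only has to swap this one pointer; PTG and DTD keep the default, whose PARSEC_OBJ_RELEASE frees the task automatically.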