diff --git a/parsec/arena.c b/parsec/arena.c index ca7def080..321c94971 100644 --- a/parsec/arena.c +++ b/parsec/arena.c @@ -223,7 +223,7 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy, assert(0 == (((ptrdiff_t)chunk->data) % arena->alignment)); assert((arena->elem_size + (ptrdiff_t)chunk->data) <= (size + (ptrdiff_t)chunk)); - data->nb_elts = count * arena->elem_size; + data->span = count * arena->elem_size; copy->flags = PARSEC_DATA_FLAG_ARENA | PARSEC_DATA_FLAG_PARSEC_OWNED | diff --git a/parsec/data.c b/parsec/data.c index fa4fe9d2a..a944a0203 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -69,7 +69,7 @@ static void parsec_data_construct(parsec_data_t* obj ) obj->owner_device = -1; obj->preferred_device = -1; obj->key = 0; - obj->nb_elts = 0; + obj->span = 0; for( uint32_t i = 0; i < parsec_nb_devices; obj->device_copies[i] = NULL, i++ ); obj->dc = NULL; @@ -509,7 +509,7 @@ parsec_data_create( parsec_data_t **holder, data->owner_device = 0; data->key = key; data->dc = desc; - data->nb_elts = size; + data->span = size; parsec_data_copy_attach(data, data_copy, 0); if( !parsec_atomic_cas_ptr(holder, NULL, data) ) { @@ -546,7 +546,7 @@ parsec_data_create_with_type( parsec_data_collection_t *desc, clone->owner_device = 0; clone->key = key; clone->dc = desc; - clone->nb_elts = size; + clone->span = size; parsec_data_copy_attach(clone, data_copy, 0); return clone; diff --git a/parsec/data_dist/matrix/broadcast.jdf b/parsec/data_dist/matrix/broadcast.jdf index 6938132d6..114750785 100644 --- a/parsec/data_dist/matrix/broadcast.jdf +++ b/parsec/data_dist/matrix/broadcast.jdf @@ -59,7 +59,7 @@ static parsec_data_t* data_of(parsec_data_collection_t *desc, ...) data->owner_device = 0; data->key = k; data->dc = (parsec_data_collection_t*)desc; - data->nb_elts = 1; + data->span = 1; parsec_data_copy_t* data_copy = (parsec_data_copy_t*)PARSEC_OBJ_NEW(parsec_data_copy_t); parsec_data_copy_attach(data, data_copy, 0); data_copy->device_private = NULL; diff --git a/parsec/data_internal.h b/parsec/data_internal.h index 85cc103ca..f20ada28a 100644 --- a/parsec/data_internal.h +++ b/parsec/data_internal.h @@ -36,7 +36,7 @@ struct parsec_data_s { * which device this data should be modified RW when there * are multiple choices. -1 means no preference. */ struct parsec_data_collection_s* dc; - size_t nb_elts; /* size in bytes of the memory layout */ + size_t span; /* size in bytes of the memory layout */ struct parsec_data_copy_s *device_copies[]; /* this array allocated according to the number of devices * (parsec_nb_devices). It points to the most recent * version of the data. 
diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c index af84dba73..d0c6ae5cb 100644 --- a/parsec/interfaces/dtd/insert_function.c +++ b/parsec/interfaces/dtd/insert_function.c @@ -2280,20 +2280,18 @@ static parsec_hook_return_t parsec_dtd_gpu_task_submit(parsec_execution_stream_t #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT) || defined(PARSEC_HAVE_DEV_LEVEL_ZERO_SUPPORT) parsec_dtd_task_t *dtd_task = (parsec_dtd_task_t *)this_task; parsec_dtd_task_class_t *dtd_tc = (parsec_dtd_task_class_t*)this_task->task_class; - parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t *) calloc(1, sizeof(parsec_gpu_task_t)); - PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t); - gpu_task->release_device_task = free; /* by default free the device task */ + parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t); gpu_task->ec = (parsec_task_t *) this_task; gpu_task->submit = dtd_tc->gpu_func_ptr; gpu_task->task_type = 0; - gpu_task->last_data_check_epoch = -1; /* force at least one validation for the task */ gpu_task->pushout = 0; + gpu_task->nb_flows = dtd_tc->super.nb_flows; /* inherit the flows from the task class */ for(int i = 0; i < dtd_tc->super.nb_flows; i++) { parsec_dtd_flow_info_t *flow = FLOW_OF(dtd_task, i); if(flow->op_type & PARSEC_PUSHOUT) gpu_task->pushout |= 1<<i; - gpu_task->flow[i] = dtd_tc->super.in[i]; - gpu_task->flow_nb_elts[i] = this_task->data[i].data_in->original->nb_elts; + gpu_task->flow_info[i].flow = dtd_tc->super.in[i]; + gpu_task->flow_info[i].flow_span = this_task->data[i].data_in->original->span; } parsec_device_module_t *device = this_task->selected_device; diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 3c81a0158..ee41b2ea6 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -1,5 +1,5 @@ /** - * Copyright (c) 2009-2024 The University of Tennessee and The University + * Copyright (c) 2009-2025 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. @@ -6809,43 +6809,33 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, " assert(NULL != dev);\n" " assert(PARSEC_DEV_IS_GPU(dev->type));\n" "\n" - " gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t));\n" - " PARSEC_OBJ_CONSTRUCT(gpu_task, parsec_list_item_t);\n" - " gpu_task->release_device_task = free; /* by default free the device task */\n" + " gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t);\n" " gpu_task->ec = (parsec_task_t*)this_task;\n" " gpu_task->submit = &%s_kernel_submit_%s_%s;\n" - " gpu_task->task_type = 0;\n" - " gpu_task->last_data_check_epoch = -1; /* force at least one validation for the task */\n", + " gpu_task->task_type = PARSEC_GPU_TASK_TYPE_KERNEL;\n", dev_lower, jdf_basename, f->fname); /* Set up stage in/out callbacks */ jdf_find_property(body->properties, "stage_in", &stage_in_property); - jdf_find_property(body->properties, "stage_out", &stage_out_property); - - if(stage_in_property == NULL) { - coutput(" gpu_task->stage_in = parsec_default_gpu_stage_in;\n"); - }else{ - coutput(" gpu_task->stage_in = %s;\n", dump_expr((void**)stage_in_property->expr, &info)); - } + coutput(" gpu_task->stage_in = %s;\n", (NULL == stage_in_property) ? 
"parsec_default_gpu_stage_in" + : dump_expr((void **)stage_in_property->expr, &info)); - if(stage_out_property == NULL) { - coutput(" gpu_task->stage_out = parsec_default_gpu_stage_out;\n"); - }else{ - coutput(" gpu_task->stage_out = %s;\n", dump_expr((void**)stage_out_property->expr, &info)); - } + jdf_find_property(body->properties, "stage_out", &stage_out_property); + coutput(" gpu_task->stage_out = %s;\n", (NULL == stage_out_property) ? "parsec_default_gpu_stage_out" + : dump_expr((void **)stage_out_property->expr, &info)); /* Dump the dataflow */ coutput(" gpu_task->pushout = 0;\n"); for(fl = f->dataflow, di = 0; fl != NULL; fl = fl->next, di++) { - coutput(" gpu_task->flow[%d] = &%s;\n", + coutput(" gpu_task->flow_info[%d].flow = &%s;\n", di, JDF_OBJECT_ONAME( fl )); sprintf(sa->ptr, "%s.dc", fl->varname); jdf_find_property(body->properties, sa->ptr, &desc_property); - if(desc_property == NULL){ - coutput(" gpu_task->flow_dc[%d] = NULL;\n", di); + if(desc_property == NULL) { + coutput(" gpu_task->flow_info[%d].flow_dc = NULL;\n", di); }else{ - coutput(" gpu_task->flow_dc[%d] = (parsec_data_collection_t *)%s;\n", di, + coutput(" gpu_task->flow_info[%d].flow_dc = (parsec_data_collection_t *)%s;\n", di, dump_expr((void**)desc_property->expr, &info)); } @@ -6853,22 +6843,22 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, jdf_find_property(body->properties, sa->ptr, &size_property); if(fl->flow_flags & JDF_FLOW_TYPE_CTL) { - if(size_property != NULL){ + if(size_property != NULL) { fprintf(stderr, "Error: specifying GPU buffer size for CTL flow %s at line %d\n", fl->varname, JDF_OBJECT_LINENO(fl)); exit(-1); } - coutput(" gpu_task->flow_nb_elts[%d] = 0;\n", di); - }else{ + coutput(" gpu_task->flow_info[%d].flow_span = 0;\n", di); + } else { coutput(" // A shortcut to check if the flow exists\n"); coutput(" if (gpu_task->ec->data[%d].data_in != NULL) {\n", di); if(size_property == NULL){ - coutput(" gpu_task->flow_nb_elts[%d] = gpu_task->ec->data[%d].data_in->original->nb_elts;\n", di, di); - }else{ - coutput(" gpu_task->flow_nb_elts[%d] = %s;\n", - di, dump_expr((void**)size_property->expr, &info)); + coutput(" gpu_task->flow_info[%d].flow_span = gpu_task->ec->data[%d].data_in->original->span;\n", di, di); + } else { + coutput(" gpu_task->flow_info[%d].flow_span = %s;\n", + di, dump_expr((void **)size_property->expr, &info)); if( (stage_in_property == NULL) || ( stage_out_property == NULL )){ - coutput(" assert(gpu_task->ec->data[%d].data_in->original->nb_elts <= %s);\n", + coutput(" assert(gpu_task->ec->data[%d].data_in->original->span <= %s);\n", di, dump_expr((void**)size_property->expr, &info)); } @@ -6936,6 +6926,7 @@ static void jdf_generate_code_hook_gpu(const jdf_t *jdf, } } string_arena_free(info.sa); + coutput(" gpu_task->nb_flows = %d; /* inherit the flows from the task_class */\n", di); coutput("\n" " return dev->kernel_scheduler(dev, es, gpu_task);\n" diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index d57ac904e..dfaabc483 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -1,6 +1,5 @@ /* - * - * Copyright (c) 2021-2024 The University of Tennessee and The University + * Copyright (c) 2021-2025 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. 
@@ -39,22 +38,66 @@ static int parsec_gpu_profiling_initiated = 0; int parsec_gpu_output_stream = -1; int parsec_gpu_verbosity; +/** + * This is a special function to release standard device tasks instead of calling + * PARSEC_OBJ_RELEASE on them directly. If we used the PARSEC_OBJ_RELEASE route, the memory pointed + * to by the task would be freed. In some cases we don't want that to happen, but we still + * want to inform the DSL that the device is done with the task. + */ +static void parsec_device_release_gpu_task(parsec_gpu_task_t *gpu_task) +{ + PARSEC_OBJ_RELEASE(gpu_task); +} + +static void parsec_device_task_t_constructor(parsec_gpu_task_t *gpu_task) +{ + gpu_task->task_type = PARSEC_GPU_TASK_TYPE_INVALID; /* needs to be set later */ + gpu_task->pushout = 0; + gpu_task->last_status = 0; + gpu_task->submit = NULL; + gpu_task->complete_stage = NULL; + gpu_task->stage_in = NULL; + gpu_task->stage_out = NULL; + gpu_task->release_device_task = NULL; +#if defined(PARSEC_PROF_TRACE) + gpu_task->prof_key_end = 0; + gpu_task->prof_event_id = 0; + gpu_task->prof_tp_id = 0; +#endif + gpu_task->ec = NULL; + gpu_task->last_data_check_epoch = UINT64_MAX; /* force at least one validation for the task */ + gpu_task->nb_flows = 0; + gpu_task->flow_info = NULL; + /* Default release mechanism, can be replaced by the DSL */ + gpu_task->release_device_task = parsec_device_release_gpu_task; +} +PARSEC_OBJ_CLASS_INSTANCE(parsec_gpu_task_t, parsec_list_item_t, + parsec_device_task_t_constructor, NULL); + +static void parsec_device_dsl_task_t_constructor(parsec_gpu_dsl_task_t *gpu_dsl_task) +{ + gpu_dsl_task->super.flow_info = gpu_dsl_task->flows; +} + +PARSEC_OBJ_CLASS_INSTANCE(parsec_gpu_dsl_task_t, parsec_gpu_task_t, + parsec_device_dsl_task_t_constructor, NULL); + static inline int parsec_device_check_space_needed(parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *gpu_task) { - int i; int space_needed = 0; parsec_task_t *this_task = gpu_task->ec; parsec_data_t *original; parsec_data_copy_t *data; const parsec_flow_t *flow; - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + /* would have been this_task->task_class->nb_flows for classical DSLs */ + for( uint32_t i = 0; i < gpu_task->nb_flows; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; - flow = gpu_task->flow[i]; + flow = gpu_task->flow_info[i].flow; if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; data = this_task->data[i].data_in; @@ -466,15 +509,13 @@ parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int gpu_device->super.device_index, gpu_device->super.name, __func__, __LINE__); return PARSEC_ERROR; } - parsec_gpu_task_t* gpu_task = NULL; - gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t)); + parsec_gpu_task_t* gpu_task = (parsec_gpu_task_t*)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t); gpu_task->task_type = PARSEC_GPU_TASK_TYPE_PREFETCH; - gpu_task->release_device_task = free; /* by default free the device task */ gpu_task->ec = calloc(1, sizeof(parsec_task_t)); PARSEC_OBJ_CONSTRUCT(gpu_task->ec, parsec_task_t); gpu_task->ec->task_class = &parsec_device_data_prefetch_tc; - gpu_task->flow[0] = &parsec_device_data_prefetch_flow; - gpu_task->flow_nb_elts[0] = data->device_copies[ data->owner_device ]->original->nb_elts; + gpu_task->flow_info[0].flow = &parsec_device_data_prefetch_flow; + gpu_task->flow_info[0].flow_span = data->device_copies[ data->owner_device ]->original->span; gpu_task->stage_in = parsec_default_gpu_stage_in; 
gpu_task->stage_out = parsec_default_gpu_stage_out; PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p [ref_count %d]", @@ -844,8 +885,9 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, parsec_gpu_data_copy_t* temp_loc[MAX_PARAM_COUNT], *gpu_elem, *lru_gpu_elem; parsec_data_t* master, *oldmaster; const parsec_flow_t *flow; - int i, j, data_avail_epoch = 0, copy_readers_update = 0; + int data_avail_epoch = 0, copy_readers_update = 0; parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL; + uint32_t i, j; #if defined(PARSEC_DEBUG_NOISIER) char task_name[MAX_TASK_STRLEN]; @@ -858,8 +900,8 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, * Parse all the input and output flows of data and ensure all have * corresponding data on the GPU available. */ - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { - flow = gpu_task->flow[i]; + for (i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++) { + flow = gpu_task->flow_info[i].flow; assert( flow && (flow->flow_index == i) ); /* Skip CTL flows only */ @@ -920,12 +962,12 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%d:%s]:%s: Allocate GPU copy %p sz %zu [ref_count %d] for data %p", gpu_device->super.device_index, gpu_device->super.name, task_name, - gpu_elem, gpu_task->flow_nb_elts[i], gpu_elem->super.super.obj_reference_count, master); + gpu_elem, gpu_task->flow_info[i].flow_span, gpu_elem->super.super.obj_reference_count, master); gpu_elem->flags = PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED; malloc_data: copy_readers_update = 0; assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); - gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]); + gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_info[i].flow_span); if( NULL == gpu_elem->device_private ) { #endif @@ -942,7 +984,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%d:%s]:%s:\tRequest space on GPU failed for flow %s index %d/%d for task %s", gpu_device->super.device_index, gpu_device->super.name, task_name, - flow->name, i, this_task->task_class->nb_flows, task_name ); + flow->name, i, gpu_task->nb_flows, task_name ); #endif /* defined(PARSEC_DEBUG_NOISIER) */ for( j = 0; j <= i; j++ ) { /* This flow could be a control flow */ @@ -1130,7 +1172,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling, parsec_gpu_allocate_memory_key, (int64_t)gpu_elem->device_private, gpu_device->super.device_index, - &gpu_task->flow_nb_elts[i], PARSEC_PROFILING_EVENT_COUNTER|PARSEC_PROFILING_EVENT_HAS_INFO); + &gpu_task->flow_info[i].flow_span, PARSEC_PROFILING_EVENT_COUNTER|PARSEC_PROFILING_EVENT_HAS_INFO); } #endif #else @@ -1195,9 +1237,9 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, size_t count; parsec_device_transfer_direction_t dir; - for(int i = 0; i < task->task_class->nb_flows; i++) { + for(uint32_t i = 0; i < gtask->nb_flows /* not task->task_class->nb_flows */; i++) { if( !(flow_mask & (1U << i)) ) continue; - source = gtask->sources[i]; + source = gtask->flow_info[i].source; assert(source->device_private != NULL); dest = task->data[i].data_out; src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); @@ -1210,8 +1252,8 @@ 
parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, dir = parsec_device_gpu_transfer_direction_h2d; } - count = (source->original->nb_elts <= dest->original->nb_elts) ? - source->original->nb_elts : dest->original->nb_elts; + count = (source->original->span <= dest->original->span) ? + source->original->span : dest->original->span; ret = dst_dev->memcpy_async( dst_dev, gpu_stream, dest->device_private, source->device_private, @@ -1244,16 +1286,16 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, parsec_task_t *task = gtask->ec; size_t count; parsec_device_transfer_direction_t dir; - int i; - for(i = 0; i < task->task_class->nb_flows; i++){ + + for(uint32_t i = 0; i < gtask->nb_flows /* not task->task_class->nb_flows */; i++){ if(flow_mask & (1U << i)){ source = task->data[i].data_out; dest = source->original->device_copies[0]; dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); - count = (source->original->nb_elts <= dest->original->nb_elts) ? source->original->nb_elts : - dest->original->nb_elts; + count = (source->original->span <= dest->original->span) ? source->original->span : + dest->original->span; if( src_dev->super.type == dst_dev->super.type ) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; @@ -1298,7 +1340,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t *candidate = task_data->data_in; /* best candidate for now */ parsec_data_t* original = candidate->original; parsec_gpu_data_copy_t* gpu_elem = task_data->data_out; - size_t nb_elts = gpu_task->flow_nb_elts[flow->flow_index]; + size_t span = gpu_task->flow_info[flow->flow_index].flow_span; int transfer_from = -1; if( gpu_task->task_type == PARSEC_GPU_TASK_TYPE_PREFETCH ) { @@ -1309,7 +1351,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, parsec_atomic_lock( &original->lock ); - gpu_task->sources[flow->flow_index] = candidate; /* default source for the transfer */ + gpu_task->flow_info[flow->flow_index].source = candidate; /* default source for the transfer */ /** * If the data will be accessed in write mode, remove it from any GPU data management * lists until the task is completed. 
@@ -1343,7 +1385,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, transfer_from = -1; /* Update the transferred required_data_in size */ - gpu_device->super.required_data_in += original->nb_elts; + gpu_device->super.required_data_in += original->span; if( -1 == transfer_from ) { /* Do not need to be transferred */ gpu_elem->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; @@ -1368,7 +1410,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]:\t\tMove data copy %p [ref_count %d, key %x] of %zu bytes: data copy is already under transfer, ignoring double request", gpu_device->super.device_index, gpu_device->super.name, - gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, nb_elts); + gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, span); parsec_atomic_unlock( &original->lock ); return 1; /* positive returns have special meaning and are used for optimizations */ } @@ -1473,7 +1515,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, "GPU[%d:%s]:\t\tMove %s data copy %p [ref_count %d, key %x] of %zu bytes\t(src dev: %d, v:%d, ptr:%p, copy:%p [ref_count %d, under_transfer: %d, coherency_state: %d] / dst dev: %d, v:%d, ptr:%p)", gpu_device->super.device_index, gpu_device->super.name, PARSEC_DEV_IS_GPU(candidate_dev->super.type) ? "D2D": "H2D", - gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, nb_elts, + gpu_elem, gpu_elem->super.super.obj_reference_count, original->key, span, candidate_dev->super.device_index, candidate->version, (void*)candidate->device_private, candidate, candidate->super.super.obj_reference_count, candidate->data_transfer_status, candidate->coherency_state, gpu_device->super.device_index, gpu_elem->version, (void*)gpu_elem->device_private); @@ -1511,7 +1553,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, } if(gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) { parsec_device_gpu_memory_prof_info_t _info; - _info.size = (uint64_t)nb_elts; + _info.size = (uint64_t)span; _info.data_key = gpu_elem->original->key; _info.dc_id = (uint64_t)(gpu_elem->original->dc); parsec_profiling_trace_flags(gpu_stream->profiling, @@ -1522,7 +1564,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, } } #endif - gpu_task->sources[flow->flow_index] = candidate; /* save the candidate for release on transfer completion */ + gpu_task->flow_info[flow->flow_index].source = candidate; /* save the candidate for release on transfer completion */ /* Push data into the GPU from the source device */ int rc = gpu_task->stage_in ? gpu_task->stage_in(gpu_task, (1U << flow->flow_index), gpu_stream): PARSEC_SUCCESS; if(PARSEC_SUCCESS != rc) { @@ -1531,15 +1573,15 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, gpu_device->super.device_index, gpu_device->super.name, rc, __func__, __LINE__, candidate->device_private, candidate_dev->super.device_index, candidate_dev->super.name, gpu_elem->device_private, gpu_device->super.device_index, gpu_device->super.name, - nb_elts, (candidate_dev->super.type != gpu_device->super.type)? "H2D": "D2D"); + span, (candidate_dev->super.type != gpu_device->super.type)? 
"H2D": "D2D"); parsec_atomic_unlock( &original->lock ); assert(0); return PARSEC_HOOK_RETURN_ERROR; } assert(candidate_dev->super.device_index < gpu_device->super.data_in_array_size); - gpu_device->super.data_in_from_device[candidate_dev->super.device_index] += nb_elts; + gpu_device->super.data_in_from_device[candidate_dev->super.device_index] += span; if( PARSEC_GPU_TASK_TYPE_KERNEL == gpu_task->task_type ) - gpu_device->super.nb_data_faults += nb_elts; + gpu_device->super.nb_data_faults += span; /* We assign the version of the data preemptively (i.e. before the task is executing) * For read-only data, the GPU copy will get the same version as the source @@ -1616,15 +1658,13 @@ parsec_device_send_transfercomplete_cmd_to_device(parsec_data_copy_t *copy, parsec_device_module_t *current_dev, parsec_device_module_t *dst_dev) { - parsec_gpu_task_t* gpu_task = NULL; - gpu_task = (parsec_gpu_task_t*)calloc(1, sizeof(parsec_gpu_task_t)); + parsec_gpu_task_t *gpu_task = (parsec_gpu_task_t *)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t); gpu_task->task_type = PARSEC_GPU_TASK_TYPE_D2D_COMPLETE; - gpu_task->release_device_task = free; /* by default free the device task */ gpu_task->ec = calloc(1, sizeof(parsec_task_t)); PARSEC_OBJ_CONSTRUCT(gpu_task->ec, parsec_task_t); gpu_task->ec->task_class = &parsec_device_d2d_complete_tc; - gpu_task->flow[0] = &parsec_device_d2d_complete_flow; - gpu_task->flow_nb_elts[0] = copy->original->nb_elts; + gpu_task->flow_info[0].flow = &parsec_device_d2d_complete_flow; + gpu_task->flow_info[0].flow_span = copy->original->span; gpu_task->stage_in = parsec_default_gpu_stage_in; gpu_task->stage_out = parsec_default_gpu_stage_out; gpu_task->ec->data[0].data_in = copy; /* We need to set not-null in data_in, so that the fake flow is @@ -1654,7 +1694,7 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *gtask = *gpu_task; parsec_task_t *task; - int32_t i; + uint32_t i; #if defined(PARSEC_DEBUG_NOISIER) char task_str[MAX_TASK_STRLEN]; #endif @@ -1671,14 +1711,15 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, "GPU[%d:%s]: parsec_device_callback_complete_push, PUSH of %s", gpu_device->super.device_index, gpu_device->super.name, parsec_task_snprintf(task_str, MAX_TASK_STRLEN, task)); - for( i = 0; i < task->task_class->nb_flows; i++ ) { + for (i = 0; i < gtask->nb_flows /* not task->task_class->nb_flows */; i++) + { /* Make sure data_in is not NULL */ if( NULL == task->data[i].data_in ) continue; /* We also don't push back non-parsec-owned copies */ if(NULL != task->data[i].data_out && 0 == (task->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED)) continue; - flow = gtask->flow[i]; + flow = gtask->flow_info[i].flow; assert( flow ); assert( flow->flow_index == i ); if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; @@ -1690,9 +1731,6 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, gpu_device->super.device_index, flow->flow_flags); - parsec_data_copy_t* source = gtask->sources[i]; - parsec_device_gpu_module_t *src_device = - (parsec_device_gpu_module_t*)parsec_mca_device_get( source->device_index ); if (task->data[i].data_in->flags & PARSEC_DATA_FLAG_EVICTED) { /** * The device copy had been evicted to the host and brought back in. 
@@ -1729,8 +1767,6 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, task->data[i].data_in->flags ^= PARSEC_DATA_FLAG_EVICTED; } - parsec_atomic_unlock(&task->data[i].data_out->original->lock); - #if defined(PARSEC_PROF_TRACE) if(gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_DATA_IN) { PARSEC_PROFILING_TRACE(gpu_stream->profiling, @@ -1740,6 +1776,11 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, NULL); } #endif + + parsec_atomic_unlock(&task->data[i].data_out->original->lock); + parsec_data_copy_t *source = gtask->flow_info[i].source; + parsec_device_gpu_module_t *src_device = + (parsec_device_gpu_module_t*)parsec_mca_device_get( source->device_index ); if( PARSEC_DEV_IS_GPU(src_device->super.type) ) { int om; while(1) { @@ -2018,7 +2059,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, { parsec_task_t *this_task = gpu_task->ec; const parsec_flow_t *flow; - int i, ret = 0; + int ret = 0; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; #endif @@ -2062,9 +2103,9 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, return ret; } - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { - flow = gpu_task->flow[i]; + flow = gpu_task->flow_info[i].flow; /* Skip CTL flows */ if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; @@ -2140,11 +2181,11 @@ parsec_device_kernel_exec( parsec_device_gpu_module_t *gpu_device, #if defined(PARSEC_DEBUG_PARANOID) const parsec_flow_t *flow; - for( int i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; - flow = gpu_task->flow[i]; + flow = gpu_task->flow_info[i].flow; if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; if( 0 == (this_task->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; assert(this_task->data[i].data_out->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER); @@ -2170,15 +2211,15 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, parsec_task_t *this_task = gpu_task->ec; parsec_gpu_data_copy_t *gpu_copy; parsec_data_t *original; - size_t nb_elts; + size_t span; const parsec_flow_t *flow; - int return_code = 0, rc, how_many = 0, i, update_data_epoch = 0; + int return_code = 0, rc, how_many = 0, update_data_epoch = 0; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; #endif if (gpu_task->task_type == PARSEC_GPU_TASK_TYPE_D2HTRANSFER) { - for( i = 0; i < this_task->locals[0].value; i++ ) { + for( int i = 0; i < this_task->locals[0].value; i++ ) { gpu_copy = this_task->data[i].data_out; /* If the gpu copy is not owned by parsec, we don't manage it at all */ if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; @@ -2202,13 +2243,13 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, gpu_device->super.device_index, gpu_device->super.name, parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { /* We need to manage all data that has been used as input, even if they were read only */ /* Make sure data_in is not NULL */ if( NULL == 
this_task->data[i].data_in ) continue; - flow = gpu_task->flow[i]; + flow = gpu_task->flow_info[i].flow; if( PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags) ) continue; /* control flow */ gpu_copy = this_task->data[i].data_out; @@ -2217,7 +2258,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; original = gpu_copy->original; - nb_elts = gpu_task->flow_nb_elts[i]; + span = gpu_task->flow_info[i].flow_span; assert( this_task->data[i].data_in == NULL || original == this_task->data[i].data_in->original ); @@ -2267,7 +2308,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->super.super.obj_reference_count, flow->name); /* Stage the transfer of the data back to main memory */ - gpu_device->super.required_data_out += nb_elts; + gpu_device->super.required_data_out += span; assert( ((parsec_list_item_t*)gpu_copy)->list_next == (parsec_list_item_t*)gpu_copy ); assert( ((parsec_list_item_t*)gpu_copy)->list_prev == (parsec_list_item_t*)gpu_copy ); @@ -2315,7 +2356,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, parsec_atomic_unlock(&original->lock); goto release_and_return_error; } - gpu_device->super.data_out_to_host += nb_elts; /* TODO: not hardcoded, use datatype size */ + gpu_device->super.data_out_to_host += span; /* TODO: not hardcoded, use datatype size */ how_many++; } else { assert( 0 == gpu_copy->readers ); @@ -2346,7 +2387,6 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, parsec_task_t *this_task = gpu_task->ec; parsec_gpu_data_copy_t *gpu_copy, *cpu_copy; parsec_data_t *original; - int i; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; @@ -2356,7 +2396,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); #endif - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; @@ -2364,7 +2404,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, if(NULL == this_task->data[i].data_out) continue; - if( !(gpu_task->flow[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { + if( !(gpu_task->flow_info[i].flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { /* Warning data_out for read only flows has been overwritten in pop */ continue; } @@ -2449,7 +2489,7 @@ parsec_device_kernel_cleanout( parsec_device_gpu_module_t *gpu_device, parsec_task_t *this_task = gpu_task->ec; parsec_gpu_data_copy_t *gpu_copy, *cpu_copy; parsec_data_t *original; - int i, data_avail_epoch = 0; + int data_avail_epoch = 0; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; @@ -2459,13 +2499,13 @@ parsec_device_kernel_cleanout( parsec_device_gpu_module_t *gpu_device, parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); #endif - for( i = 0; i < this_task->task_class->nb_flows; i++ ) { + for( uint32_t i = 0; i < gpu_task->nb_flows /* not this_task->task_class->nb_flows */; i++ ) { /* Make sure data_in is not NULL */ if( NULL == this_task->data[i].data_in ) continue; /* Don't bother if there is no real data (aka. 
CTL or no output) */ if(NULL == this_task->data[i].data_out) continue; - if( !(gpu_task->flow[i]->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { + if( !(gpu_task->flow_info[i].flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { /* Warning data_out for read only flows has been overwritten in pop */ continue; } @@ -2709,9 +2749,9 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module, PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: gpu_task %p freed", gpu_device->super.device_index, gpu_device->super.name, gpu_task); - if (NULL != gpu_task->release_device_task) { - gpu_task->release_device_task(gpu_task); - } + /* Release the GPU task */ + gpu_task->release_device_task(gpu_task); + rc = parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) ); if( 1 == rc ) { /* I was the last one */ #if defined(PARSEC_PROF_TRACE) diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index fa25b87a3..167b16c57 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024 The University of Tennessee and The University + * Copyright (c) 2021-2025 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. @@ -81,7 +81,25 @@ typedef int (parsec_stage_out_function_t)(parsec_gpu_task_t *gtask, * and this function allows the device engine to delegate the release of such tasks back into * the DSL. Once this task called, the device task should not be accessed by the device. */ -typedef void (*parsec_release_device_task_function_t)(void*); +typedef void (*parsec_release_device_task_function_t)(parsec_gpu_task_t*); +typedef struct parsec_gpu_flow_info_s { + const parsec_flow_t *flow; /* Some DSLs might not have a task_class, but they still need to provide a flow. */ + size_t flow_span; /* the span of the data on the device. For a contiguous layout this is equal to the + * size of the data; for all other copies this should be the amount of memory + * needed on the device. + */ + parsec_data_collection_t *flow_dc; /* the data collection from which the data originates. When the data copy is local, the data + * collection can be accessed via the data_t, but for all copies coming from the network there + * is no known data collection. Thus, for such cases the DSL needs to provide a reference to the + * local data collection to be used for all transfers. This data collection is needed to get + * access to the mtype and thus the memory layout of the copy. + */ + /* This is private to the device code and should not be used outside the device driver */ + parsec_data_copy_t *source; /* If the driver decides to acquire the data from a different + * source, it will temporarily store the best candidate here. + */ + +} parsec_gpu_flow_info_t; struct parsec_gpu_task_s { parsec_list_item_t list_item; @@ -102,23 +120,8 @@ struct parsec_gpu_task_s { struct { parsec_task_t *ec; uint64_t last_data_check_epoch; - const parsec_flow_t *flow[MAX_PARAM_COUNT]; /* There is no consistent way to access the flows from the task_class, - * so the DSL need to provide these flows here. - */ - size_t flow_nb_elts[MAX_PARAM_COUNT]; /* for each flow, size of the data to be allocated - * on the GPU. - */ - parsec_data_collection_t *flow_dc[MAX_PARAM_COUNT]; /* for each flow, data collection from which the data - * to be transferred logically belongs to. 
- * This gives the user the chance to indicate on the JDF - * a data collection to inspect during GPU transfer. - * User may want info from the DC (e.g. mtype), - * & otherwise remote copies don't have any info. - */ - /* These are private and should not be used outside the device driver */ - parsec_data_copy_t *sources[MAX_PARAM_COUNT]; /* If the driver decides to acquire the data from a different - * source, it will temporary store the best candidate here. - */ + uint32_t nb_flows; + parsec_gpu_flow_info_t *flow_info; }; struct { parsec_data_copy_t *copy; @@ -126,6 +129,20 @@ struct parsec_gpu_task_s { }; }; +PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_gpu_task_t); + +/** + * Specialized GPU tasks for the PTG and DTD DSLs. Since the maximum number of flows is MAX_PARAM_COUNT, we + * can make the gpu_flow_info array part of the struct and allocate the gpu_task as a single, + * contiguous block of memory. + */ +typedef struct parsec_gpu_dsl_task_s { + parsec_gpu_task_t super; + parsec_gpu_flow_info_t flows[MAX_PARAM_COUNT]; /* All the flow info necessary for the PTG and DTD DSL */ +} parsec_gpu_dsl_task_t; + +PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_gpu_dsl_task_t); + typedef enum parsec_device_transfer_direction_e { parsec_device_gpu_transfer_direction_h2d, parsec_device_gpu_transfer_direction_d2h, @@ -318,6 +335,7 @@ char *parsec_device_describe_gpu_task( char *tmp, size_t len, parsec_gpu_task_t #define PARSEC_GPU_TASK_TYPE_PREFETCH 0x2000 #define PARSEC_GPU_TASK_TYPE_WARMUP 0x4000 #define PARSEC_GPU_TASK_TYPE_D2D_COMPLETE 0x8000 +#define PARSEC_GPU_TASK_TYPE_INVALID 0xf000 #if defined(PARSEC_PROF_TRACE) #define PARSEC_PROFILE_GPU_TRACK_DATA_IN 0x0001 diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c index 50b0d886e..0f45bcdfe 100644 --- a/parsec/mca/device/transfer_gpu.c +++ b/parsec/mca/device/transfer_gpu.c @@ -276,9 +276,7 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device, d2h_task->taskpool = NULL; d2h_task->locals[0].value = nb_cleaned; - w2r_task = (parsec_gpu_task_t *)malloc(sizeof(parsec_gpu_task_t)); - PARSEC_OBJ_CONSTRUCT(w2r_task, parsec_list_item_t); - w2r_task->release_device_task = free; /* by default free the device task */ + w2r_task = (parsec_gpu_task_t *)PARSEC_OBJ_NEW(parsec_gpu_dsl_task_t); w2r_task->ec = (parsec_task_t*)d2h_task; w2r_task->task_type = PARSEC_GPU_TASK_TYPE_D2HTRANSFER; w2r_task->last_data_check_epoch = gpu_device->data_avail_epoch - 1; @@ -309,7 +307,7 @@ int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_atomic_lock(&gpu_copy->original->lock); gpu_copy->readers--; gpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; - gpu_device->super.data_out_to_host += gpu_copy->original->nb_elts; /* TODO: not hardcoded, use datatype size */ + gpu_device->super.data_out_to_host += gpu_copy->original->span; /* TODO: not hardcoded, use datatype size */ assert(gpu_copy->readers >= 0); original = gpu_copy->original; @@ -343,7 +341,7 @@ int parsec_gpu_complete_w2r_task(parsec_device_gpu_module_t *gpu_device, parsec_atomic_unlock(&gpu_copy->original->lock); } parsec_thread_mempool_free(es->context_mempool, task); - free(gpu_task); + PARSEC_OBJ_RELEASE(gpu_task); /* no need to call release_device_task, just release the task */ gpu_device->data_avail_epoch++; return 0; } diff --git a/tests/dsl/ptg/choice/choice_data.c b/tests/dsl/ptg/choice/choice_data.c index a6c7dcd77..70281f3ae 100644 --- a/tests/dsl/ptg/choice/choice_data.c +++ 
b/tests/dsl/ptg/choice/choice_data.c @@ -50,7 +50,7 @@ get_or_create_data(my_datatype_t* dat, uint32_t pos) data->owner_device = 0; data->key = pos; - data->nb_elts = 1; + data->span = 1; data->device_copies[0] = data_copy; if( !parsec_atomic_cas_ptr(&dat->data_map[pos], NULL, data) ) { diff --git a/tests/runtime/cuda/nvlink_wrapper.c b/tests/runtime/cuda/nvlink_wrapper.c index abc4b19c9..a641ed053 100644 --- a/tests/runtime/cuda/nvlink_wrapper.c +++ b/tests/runtime/cuda/nvlink_wrapper.c @@ -189,7 +189,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb /* And copy the tile from CPU to GPU */ status = (cudaError_t)cudaMemcpy( gpu_copy->device_private, cpu_copy->device_private, - dta->nb_elts, + dta->span, cudaMemcpyHostToDevice ); PARSEC_CUDA_CHECK_ERROR( "(nvlink_wrapper) cudaMemcpy", status, {return NULL;} ); g++; diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf index 7df99800f..0878746c6 100644 --- a/tests/runtime/cuda/stage_custom.jdf +++ b/tests/runtime/cuda/stage_custom.jdf @@ -37,12 +37,13 @@ stage_stride_in(parsec_gpu_task_t *gtask, parsec_device_gpu_module_t *in_elem_dev; parsec_tiled_matrix_t * dc; int elem_sz; - int i; - for(i = 0; i < task->task_class->nb_flows; i++){ + + assert(gtask->nb_flows == task->task_class->nb_flows); + for(uint32_t i = 0; i < gtask->nb_flows; i++){ if(flow_mask & (1U << i)){ copy_in = task->data[i].data_in; copy_out = task->data[i].data_out; - dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i]; + dc = (parsec_tiled_matrix_t*)gtask->flow_info[i].flow_dc; elem_sz = parsec_datadist_getsizeoftype(dc->mtype); in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index); if(in_elem_dev->super.type != PARSEC_DEV_CUDA ){ @@ -62,7 +63,7 @@ stage_stride_in(parsec_gpu_task_t *gtask, }else{ ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private, copy_in->device_private, - copy_in->original->nb_elts, + copy_in->original->span, cudaMemcpyDeviceToDevice, cuda_stream->cuda_stream ); PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync", ret, { return PARSEC_ERROR; } ); @@ -89,7 +90,7 @@ stage_stride_out(parsec_gpu_task_t *gtask, if(flow_mask & (1U << i)){ copy_in = task->data[i].data_out; copy_out = copy_in->original->device_copies[0]; - dc = (parsec_tiled_matrix_t*)gtask->flow_dc[i]; + dc = (parsec_tiled_matrix_t*)gtask->flow_info[i].flow_dc; elem_sz = parsec_datadist_getsizeoftype(dc->mtype); /* copy width bytes heigth times, skipping pitch - width bytes every time */ size_t dpitch = dc->llm * elem_sz;