From 1c70089d3b6a43369dfb4fcd57139988ae60785e Mon Sep 17 00:00:00 2001 From: Romain PEREIRA Date: Tue, 10 Jun 2025 20:01:17 +0000 Subject: [PATCH 1/2] Re-implemented replicaset using HWLOC memattr --- include/aml/higher/replicaset.h | 8 + include/aml/higher/replicaset/hwloc.h | 23 +-- src/replicaset/hwloc.c | 256 +++++++++++++------------- 3 files changed, 142 insertions(+), 145 deletions(-) diff --git a/include/aml/higher/replicaset.h b/include/aml/higher/replicaset.h index bc0ccaae..9770bc94 100644 --- a/include/aml/higher/replicaset.h +++ b/include/aml/higher/replicaset.h @@ -31,6 +31,14 @@ struct aml_replicaset_data; // Replicaset required methods. See below struct aml_replicaset_ops; +/** + * Distance kind + */ +enum aml_replicaset_attr_kind_e { + AML_REPLICASET_ATTR_LATENCY, + AML_REPLICASET_ATTR_BANDWIDTH +}; + /** * High level replicaset structure. * See specific implementations for instanciation. diff --git a/include/aml/higher/replicaset/hwloc.h b/include/aml/higher/replicaset/hwloc.h index d5ec1150..c54e57f7 100644 --- a/include/aml/higher/replicaset/hwloc.h +++ b/include/aml/higher/replicaset/hwloc.h @@ -34,22 +34,18 @@ extern "C" { * @{ **/ +typedef struct aml_replicaset_hwloc_memattr_t { + hwloc_obj_t obj; // a numa object + hwloc_uint64_t attr; // attr value +} aml_replicaset_hwloc_memattr_t; + /** * Inner data implementation of replicaset for hwloc backend. */ struct aml_replicaset_hwloc_data { - /** The type of object used as initiator. */ - hwloc_obj_type_t type; - /** Number of initiators */ - unsigned num_ptr; - /** - * Array of pointers to replicas. - * Contains one pointer per initiator of the topology. - * Pointers are ordered by initiator logical index. - * Multiple initiators may have the same pointer, e.g - * two neighbor cores pointing to data on closest NUMA node. - */ - void **ptr; + + /** the numa node of each replica */ + aml_replicaset_hwloc_memattr_t *numas; }; /** Public methods struct for aml_replicaset_hwloc */ @@ -74,8 +70,7 @@ extern struct aml_replicaset_ops aml_replicaset_hwloc_ops; **/ int aml_replicaset_hwloc_create(struct aml_replicaset **out, const size_t size, - const hwloc_obj_type_t initiator_type, - const enum hwloc_distances_kind_e kind); + const enum aml_replicaset_attr_kind_e kind); /** * Destroy hwloc replicaset data. diff --git a/src/replicaset/hwloc.c b/src/replicaset/hwloc.c index d31f51b3..2b7fcbcb 100644 --- a/src/replicaset/hwloc.c +++ b/src/replicaset/hwloc.c @@ -12,138 +12,137 @@ #include "aml.h" -#include "aml/area/hwloc.h" #include "aml/higher/replicaset.h" #include "aml/higher/replicaset/hwloc.h" #include "aml/utils/backend/hwloc.h" -int aml_replicaset_hwloc_alloc(struct aml_replicaset **out, - const hwloc_obj_type_t initiator_type) +int aml_replicaset_hwloc_alloc(struct aml_replicaset **out) { - struct aml_replicaset *replicaset = NULL; - struct aml_replicaset_hwloc_data *data = NULL; - - // Check initiator type. - const unsigned int n_initiator = - hwloc_get_nbobjs_by_type(aml_topology, initiator_type); - hwloc_obj_t initiator = - hwloc_get_obj_by_type(aml_topology, initiator_type, 0); - if (n_initiator == 0) - return -AML_EDOM; - if (initiator == NULL || initiator->cpuset == NULL || - hwloc_bitmap_weight(initiator->cpuset) <= 0) - return -AML_EINVAL; - + // Allocation + zero const unsigned int n_numa = hwloc_get_nbobjs_by_type(aml_topology, HWLOC_OBJ_NUMANODE); - - // Allocation - replicaset = AML_INNER_MALLOC_ARRAY(n_numa + n_initiator, void *, - struct aml_replicaset, - struct aml_replicaset_hwloc_data); - if (replicaset == NULL) + const size_t size = sizeof(struct aml_replicaset) + + sizeof(struct aml_replicaset_hwloc_data) + + n_numa * sizeof(void *) + + n_numa * sizeof(aml_replicaset_hwloc_memattr_t); + unsigned char *mem = (unsigned char *)calloc(1, size); + if (mem == NULL) return -AML_ENOMEM; + struct aml_replicaset *replicaset = (struct aml_replicaset *)(mem + 0); + struct aml_replicaset_hwloc_data *data = + (struct aml_replicaset_hwloc_data + *)(mem + sizeof(struct aml_replicaset)); + // Set ops replicaset->ops = &aml_replicaset_hwloc_ops; - // Set data - replicaset->data = - (struct aml_replicaset_data *)AML_INNER_MALLOC_GET_FIELD( - replicaset, 2, struct aml_replicaset, - struct aml_replicaset_hwloc_data); - data = (struct aml_replicaset_hwloc_data *)replicaset->data; - // Set replica pointers array - replicaset->replica = (void **)AML_INNER_MALLOC_GET_ARRAY( - replicaset, void *, struct aml_replicaset, - struct aml_replicaset_hwloc_data); - for (unsigned i = 0; i < n_numa; i++) - replicaset->replica[i] = NULL; + replicaset->replica = + (void **)(mem + sizeof(struct aml_replicaset) + + sizeof(struct aml_replicaset_hwloc_data)); - // Set initiator pointers array - data->ptr = replicaset->replica + n_numa; - - // Set number of initiators - data->num_ptr = n_initiator; - - // Set number of replicas to 0. Initialization will set - // it to the correct value. - replicaset->n = 0; + // Set data + replicaset->data = (struct aml_replicaset_data *)data; + data->numas = (aml_replicaset_hwloc_memattr_t + *)(mem + sizeof(struct aml_replicaset) + + sizeof(struct aml_replicaset_hwloc_data) + + n_numa * sizeof(void *)); + // save *out = replicaset; + return AML_SUCCESS; } +static int aml_memattr_cmp(const aml_replicaset_hwloc_memattr_t *x, + const aml_replicaset_hwloc_memattr_t *y) +{ + return (x->attr < y->attr ? 1 : x->attr > y->attr ? -1 : 0); +} + int aml_replicaset_hwloc_create(struct aml_replicaset **out, const size_t size, - const hwloc_obj_type_t initiator_type, - const enum hwloc_distances_kind_e kind) + const enum aml_replicaset_attr_kind_e kind) { - int err = -AML_FAILURE; - struct aml_replicaset *replicaset = NULL; - struct aml_replicaset_hwloc_data *data = NULL; - struct aml_area *area = NULL; - struct aml_area_hwloc_preferred_data *area_data = NULL; - const unsigned int n_numa = - hwloc_get_nbobjs_by_type(aml_topology, HWLOC_OBJ_NUMANODE); - hwloc_obj_t targets[n_numa]; + assert(kind == AML_REPLICASET_ATTR_LATENCY || + kind == AML_REPLICASET_ATTR_BANDWIDTH); + + if (out == NULL) + return -AML_FAILURE; - err = aml_replicaset_hwloc_alloc(&replicaset, initiator_type); + // Allocate replicaset + int err = aml_replicaset_hwloc_alloc(out); if (err != AML_SUCCESS) return err; - replicaset->size = size; - data = (struct aml_replicaset_hwloc_data *)replicaset->data; - - // For each initiator allocate replica on preferred area - for (hwloc_obj_t initiator = - hwloc_get_obj_by_type(aml_topology, initiator_type, 0); - initiator != NULL; initiator = initiator->next_cousin) { - - // Get preferred area. - err = aml_area_hwloc_preferred_create(&area, initiator, kind); - if (err != AML_SUCCESS) - goto err_with_replicaset; - area_data = (struct aml_area_hwloc_preferred_data *)area->data; - - // Search if preferred numa node is already a target - for (unsigned i = 0; i < replicaset->n; i++) { - if (targets[i] == area_data->numanodes[0]) { - data->ptr[initiator->logical_index] = - replicaset->replica[i]; - goto next; - } - } + (*out)->size = size; + struct aml_replicaset_hwloc_data *data = + (struct aml_replicaset_hwloc_data *)(*out)->data; - // Preferred numa node is not a target yet. - void *ptr = aml_area_mmap(area, size, NULL); - if (ptr == NULL) { - err = -AML_ENOMEM; - goto err_with_replicas; + // (1) create memattr lists in the form: [(numa node x0, cpuset c0, attr + // a0), ...] + const unsigned int n_numa = + hwloc_get_nbobjs_by_type(aml_topology, HWLOC_OBJ_NUMANODE); + for (unsigned int i = 0; i < n_numa; ++i) { + // get the numa node + hwloc_obj_t numa = hwloc_get_obj_by_type(aml_topology, + HWLOC_OBJ_NUMANODE, i); + + // get the attributie + hwloc_uint64_t value; + hwloc_memattr_id_t attr_id; + const char *attr_name = (kind == AML_REPLICASET_ATTR_LATENCY) ? + "Latency" : + "Bandwidth"; + if (hwloc_memattr_get_by_name(aml_topology, attr_name, + &attr_id) == 0) { + struct hwloc_location initiator = { + .type = HWLOC_LOCATION_TYPE_CPUSET, + .location.cpuset = numa->cpuset}; + unsigned long flags = 0; + if (hwloc_memattr_get_value(aml_topology, attr_id, numa, + &initiator, flags, &value)) + value = 0; + } else { + value = 0; } - replicaset->replica[replicaset->n] = ptr; - data->ptr[initiator->logical_index] = ptr; - targets[replicaset->n] = area_data->numanodes[0]; - replicaset->n++; + data->numas[i].obj = numa; + data->numas[i].attr = value; + } - next: - // Area cleanup - aml_area_hwloc_preferred_destroy(&area); + // (2) sort the list by attr + qsort(data->numas, n_numa, sizeof(aml_replicaset_hwloc_memattr_t), + (int (*)(const void *, const void *))aml_memattr_cmp); + + // (3) iterate through it to find a union of numa node that covers all + // cpus (4) replicate memory on each numa node + hwloc_const_cpuset_t full_cpuset = + hwloc_topology_get_topology_cpuset(aml_topology); + hwloc_bitmap_t cpuset = hwloc_bitmap_alloc(); + for (unsigned int i = 0; i < n_numa; ++i) { + hwloc_bitmap_or(cpuset, cpuset, data->numas[i].obj->cpuset); + (*out)->n += 1; + (*out)->replica[i] = hwloc_alloc_membind( + aml_topology, + size, // size_t size + data->numas[i].obj->nodeset, // hwloc_nodeset_t + HWLOC_MEMBIND_BIND, // force binding strictly to the + // node + HWLOC_MEMBIND_STRICT | + HWLOC_MEMBIND_BYNODESET // fail if can't bind + // exactly + ); + assert((*out)->replica[i]); + + // if all cpus are represented in one replica, stop here + if (hwloc_bitmap_isincluded(full_cpuset, cpuset)) { + break; + } } + hwloc_bitmap_free(cpuset); - // Success - *out = replicaset; return AML_SUCCESS; - - // Failure -err_with_replicas: - for (unsigned i = 0; i < replicaset->n; i++) - munmap(replicaset->replica[i], size); - -err_with_replicaset: - free(replicaset); - return err; } void aml_replicaset_hwloc_destroy(struct aml_replicaset **replicaset) @@ -181,45 +180,40 @@ int aml_replicaset_hwloc_sync(struct aml_replicaset *replicaset, return AML_SUCCESS; } +// a tls for the cpuset, to avoid reallocating on every +// `aml_replicaset_hwloc_local_replica` calls +_Thread_local hwloc_bitmap_t aml_tls_cpuset = NULL; + void *aml_replicaset_hwloc_local_replica(struct aml_replicaset *replicaset) { - int err; - hwloc_obj_t initiator; - struct aml_replicaset_hwloc_data *data = NULL; - - data = (struct aml_replicaset_hwloc_data *)replicaset->data; - - // Get object where this function is called. - err = aml_hwloc_local_initiator(&initiator); - if (err != AML_SUCCESS) + // Sanitary checks + if (replicaset == NULL || replicaset->data == NULL) return NULL; - // If initiator is a child of the replica location, then select a - // parent, at same depth as replica. - while (initiator != NULL && - hwloc_get_nbobjs_by_depth(aml_topology, initiator->depth) > - data->num_ptr) - initiator = initiator->parent; - if (initiator == NULL) - return NULL; + struct aml_replicaset_hwloc_data *data = + (struct aml_replicaset_hwloc_data *)replicaset->data; - // If initiator is bound to a parent of several replica, then - // chose a child replica with a modulo on thread id. - if (hwloc_get_nbobjs_by_depth(aml_topology, initiator->depth) < - data->num_ptr) { - int depth = hwloc_get_type_depth(aml_topology, data->type); - unsigned n = hwloc_get_nbobjs_inside_cpuset_by_depth( - aml_topology, initiator->cpuset, depth); - if (n == 0) - return NULL; - hwloc_obj_t target = hwloc_get_next_obj_inside_cpuset_by_depth( - aml_topology, initiator->cpuset, depth, NULL); - if (target == NULL) + // Allocate and retrieve the cpuset if not allocated already + // If the cpuset changes for the current thread, then the replicate + // won't... but the overhead of `hwloc_get_cpubind` is non-neglectable + if (aml_tls_cpuset == NULL) { + aml_tls_cpuset = hwloc_bitmap_alloc(); + + // Get the cpuset of the current thread + if (hwloc_get_cpubind(aml_topology, aml_tls_cpuset, + HWLOC_CPUBIND_THREAD)) return NULL; - initiator = target; } - return data->ptr[initiator->logical_index]; + // Go through the list of replicate + for (unsigned int i = 0; i < replicaset->n; ++i) { + // Return the first replicate that matches the current cpuset + if (hwloc_bitmap_intersects(data->numas[i].obj->cpuset, + aml_tls_cpuset)) + return replicaset->replica[i]; + } + + return NULL; } struct aml_replicaset_ops aml_replicaset_hwloc_ops = { From 9b1e80902a5c400688807c070165f8a3cdd419dd Mon Sep 17 00:00:00 2001 From: Romain PEREIRA Date: Tue, 17 Jun 2025 21:09:12 +0000 Subject: [PATCH 2/2] Fixed sorting for latency (which must be the opposite of bandwidth, obviously) --- src/replicaset/hwloc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/replicaset/hwloc.c b/src/replicaset/hwloc.c index 2b7fcbcb..a377b818 100644 --- a/src/replicaset/hwloc.c +++ b/src/replicaset/hwloc.c @@ -57,6 +57,12 @@ int aml_replicaset_hwloc_alloc(struct aml_replicaset **out) static int aml_memattr_cmp(const aml_replicaset_hwloc_memattr_t *x, const aml_replicaset_hwloc_memattr_t *y) +{ + return (x->attr < y->attr ? -1 : x->attr > y->attr ? 1 : 0); +} + +static int aml_memattr_cmp_inv(const aml_replicaset_hwloc_memattr_t *x, + const aml_replicaset_hwloc_memattr_t *y) { return (x->attr < y->attr ? 1 : x->attr > y->attr ? -1 : 0); } @@ -112,8 +118,8 @@ int aml_replicaset_hwloc_create(struct aml_replicaset **out, } // (2) sort the list by attr - qsort(data->numas, n_numa, sizeof(aml_replicaset_hwloc_memattr_t), - (int (*)(const void *, const void *))aml_memattr_cmp); + int (*cmpf)(const void *, const void *) = (int (*)(const void *, const void *)) (kind == AML_REPLICASET_ATTR_LATENCY ? aml_memattr_cmp : aml_memattr_cmp_inv); + qsort(data->numas, n_numa, sizeof(aml_replicaset_hwloc_memattr_t), cmpf); // (3) iterate through it to find a union of numa node that covers all // cpus (4) replicate memory on each numa node