From d55402fd88fa86ccfe49cd836a278021fe8b399a Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 29 Mar 2026 14:28:47 +0200 Subject: [PATCH 01/11] fix(gateway): prevent configurations endpoint deadlock on nodes without parameter service Replace single param_operations_mutex_ with per-node mutexes so one slow node doesn't block parameter queries to other nodes. Add negative cache for nodes whose parameter service is unavailable (skip for 60s instead of blocking on every request). Add self-query guard so the gateway never deadlocks querying its own parameter service via IPC - uses direct node_->get_parameters() instead. Use thread-local param nodes for SyncParametersClient to eliminate executor conflicts without a global mutex. Reduce default parameter_service_timeout_sec from 2.0s to 0.5s. Fixes #318 --- .../configuration_manager.hpp | 55 +++- .../src/configuration_manager.cpp | 273 ++++++++++++------ .../test/test_configuration_manager.cpp | 60 ++++ 3 files changed, 287 insertions(+), 101 deletions(-) diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp index 0ae12a8f0..271711500 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp @@ -14,12 +14,15 @@ #pragma once +#include #include #include #include #include #include +#include #include +#include #include namespace ros2_medkit_gateway { @@ -84,12 +87,28 @@ class ConfigurationManager { ParameterResult reset_all_parameters(const std::string & node_name); private: - /// Get or create a SyncParametersClient for the given node + /// Get or create a SyncParametersClient for the given node. + /// Creates a per-thread param node to avoid executor conflicts. std::shared_ptr get_param_client(const std::string & node_name); /// Cache default values for a node (called on first access) void cache_default_values(const std::string & node_name); + /// Check if a node is in the negative cache (recently unavailable) + bool is_node_unavailable(const std::string & node_name) const; + + /// Mark a node as unavailable in the negative cache + void mark_node_unavailable(const std::string & node_name); + + /// Check if node_name is the gateway's own node (self-query guard) + bool is_self_node(const std::string & node_name) const; + + /// List parameters for the gateway's own node (direct access, no IPC) + ParameterResult list_own_parameters(); + + /// Get a specific parameter from the gateway's own node + ParameterResult get_own_parameter(const std::string & param_name); + /// Convert ROS2 parameter type to string static std::string parameter_type_to_string(rclcpp::ParameterType type); @@ -104,28 +123,32 @@ class ConfigurationManager { rclcpp::Node * node_; - /// Internal node for parameter client operations - /// SyncParametersClient requires a node that is NOT in an executor - /// to perform synchronous operations (spinning internally) - std::shared_ptr param_node_; - /// Timeout for waiting for parameter services (configurable via parameter_service_timeout_sec parameter) - double service_timeout_sec_{2.0}; + double service_timeout_sec_{0.5}; - /// Cache of parameter clients per node (avoids recreating clients) - mutable std::mutex clients_mutex_; - std::map> param_clients_; + /// Negative cache TTL (configurable via parameter_service_negative_cache_sec parameter) + double negative_cache_ttl_sec_{60.0}; + + /// Gateway's own fully qualified node name (for self-query detection) + std::string own_node_fqn_; + + /// Per-node mutexes for parameter operations. + /// Each node gets its own mutex so one slow node doesn't block queries to other nodes. + mutable std::shared_mutex node_mutexes_mutex_; + mutable std::unordered_map> node_mutexes_; + + /// Get or create a mutex for a specific node + std::mutex & get_node_mutex(const std::string & node_name) const; + + /// Negative cache: nodes whose parameter service was recently unavailable. + /// Avoids repeated blocking waits on nodes that don't have parameter services. + mutable std::shared_mutex negative_cache_mutex_; + std::unordered_map unavailable_nodes_; /// Cache of default parameter values per node /// Key: node_name, Value: map of param_name -> Parameter mutable std::mutex defaults_mutex_; std::map> default_values_; - - /// Mutex to serialize all parameter operations. - /// SyncParametersClient internally spins param_node_ via spin_node_until_future_complete(), - /// which is not thread-safe. Concurrent HTTP requests would cause: - /// "Node '/_param_client_node' has already been added to an executor" - mutable std::mutex param_operations_mutex_; }; } // namespace ros2_medkit_gateway diff --git a/src/ros2_medkit_gateway/src/configuration_manager.cpp b/src/ros2_medkit_gateway/src/configuration_manager.cpp index 2dac920cc..6b434e8c1 100644 --- a/src/ros2_medkit_gateway/src/configuration_manager.cpp +++ b/src/ros2_medkit_gateway/src/configuration_manager.cpp @@ -14,55 +14,164 @@ #include "ros2_medkit_gateway/configuration_manager.hpp" +#include #include using namespace std::chrono_literals; namespace ros2_medkit_gateway { +// Thread-local param node counter for unique names +static std::atomic param_node_counter{0}; + ConfigurationManager::ConfigurationManager(rclcpp::Node * node) : node_(node) { - // Create an internal node for parameter client operations - // SyncParametersClient needs a node that is NOT added to any executor - // so it can spin internally when waiting for service responses - rclcpp::NodeOptions options; - options.start_parameter_services(false); - options.start_parameter_event_publisher(false); - // Don't inherit node name/namespace from global arguments (--ros-args -r __node:=...) - // Without this, the internal node would have the same name as the main gateway node - options.use_global_arguments(false); - param_node_ = std::make_shared("_param_client_node", options); - - // Get configurable timeout for parameter services (default 2.0 seconds) - service_timeout_sec_ = node_->declare_parameter("parameter_service_timeout_sec", 2.0); - - RCLCPP_INFO(node_->get_logger(), "ConfigurationManager initialized"); + // Get configurable timeout for parameter services (default 0.5 seconds) + service_timeout_sec_ = node_->declare_parameter("parameter_service_timeout_sec", 0.5); + + // Get negative cache TTL (default 60 seconds) + negative_cache_ttl_sec_ = node_->declare_parameter("parameter_service_negative_cache_sec", 60.0); + + // Store own node FQN for self-query detection + own_node_fqn_ = node_->get_fully_qualified_name(); + + RCLCPP_INFO(node_->get_logger(), "ConfigurationManager initialized (timeout=%.1fs, negative_cache=%.0fs)", + service_timeout_sec_, negative_cache_ttl_sec_); } -/// Helper to get the service timeout as a chrono duration std::chrono::duration ConfigurationManager::get_service_timeout() const { return std::chrono::duration(service_timeout_sec_); } +std::mutex & ConfigurationManager::get_node_mutex(const std::string & node_name) const { + // Fast path: check if mutex exists + { + std::shared_lock lock(node_mutexes_mutex_); + auto it = node_mutexes_.find(node_name); + if (it != node_mutexes_.end()) { + return *it->second; + } + } + // Slow path: create new mutex + std::unique_lock lock(node_mutexes_mutex_); + auto [it, inserted] = node_mutexes_.try_emplace(node_name, std::make_unique()); + return *it->second; +} + +bool ConfigurationManager::is_node_unavailable(const std::string & node_name) const { + std::shared_lock lock(negative_cache_mutex_); + auto it = unavailable_nodes_.find(node_name); + if (it == unavailable_nodes_.end()) { + return false; + } + auto elapsed = std::chrono::steady_clock::now() - it->second; + return elapsed < std::chrono::duration(negative_cache_ttl_sec_); +} + +void ConfigurationManager::mark_node_unavailable(const std::string & node_name) { + std::unique_lock lock(negative_cache_mutex_); + unavailable_nodes_[node_name] = std::chrono::steady_clock::now(); +} + +bool ConfigurationManager::is_self_node(const std::string & node_name) const { + return node_name == own_node_fqn_; +} + +ParameterResult ConfigurationManager::list_own_parameters() { + ParameterResult result; + try { + auto params = node_->get_parameters(node_->list_parameters({}, 0).names); + json params_array = json::array(); + for (const auto & param : params) { + json param_obj; + param_obj["name"] = param.get_name(); + param_obj["value"] = parameter_value_to_json(param.get_parameter_value()); + param_obj["type"] = parameter_type_to_string(param.get_type()); + params_array.push_back(param_obj); + } + result.success = true; + result.data = params_array; + } catch (const std::exception & e) { + result.success = false; + result.error_message = std::string("Failed to list own parameters: ") + e.what(); + result.error_code = ParameterErrorCode::INTERNAL_ERROR; + } + return result; +} + +ParameterResult ConfigurationManager::get_own_parameter(const std::string & param_name) { + ParameterResult result; + try { + if (!node_->has_parameter(param_name)) { + result.success = false; + result.error_message = "Parameter not found: " + param_name; + result.error_code = ParameterErrorCode::NOT_FOUND; + return result; + } + auto param = node_->get_parameter(param_name); + auto descriptor = node_->describe_parameter(param_name); + + json param_obj; + param_obj["name"] = param.get_name(); + param_obj["value"] = parameter_value_to_json(param.get_parameter_value()); + param_obj["type"] = parameter_type_to_string(param.get_type()); + param_obj["description"] = descriptor.description; + param_obj["read_only"] = descriptor.read_only; + + result.success = true; + result.data = param_obj; + } catch (const std::exception & e) { + result.success = false; + result.error_message = std::string("Failed to get own parameter: ") + e.what(); + result.error_code = ParameterErrorCode::INTERNAL_ERROR; + } + return result; +} + std::shared_ptr ConfigurationManager::get_param_client(const std::string & node_name) { - std::lock_guard lock(clients_mutex_); + // Create a thread-local param node to avoid executor conflicts. + // SyncParametersClient spins its node internally - sharing one node + // across threads causes "Node already added to executor" errors. + thread_local std::shared_ptr tl_param_node; + thread_local std::map> tl_clients; + + if (!tl_param_node) { + rclcpp::NodeOptions options; + options.start_parameter_services(false); + options.start_parameter_event_publisher(false); + options.use_global_arguments(false); + int id = param_node_counter.fetch_add(1); + tl_param_node = std::make_shared( + "_param_client_" + std::to_string(id), options); + } - auto it = param_clients_.find(node_name); - if (it != param_clients_.end()) { + auto it = tl_clients.find(node_name); + if (it != tl_clients.end()) { return it->second; } - // Create new client for this node using the internal param_node_ - // (not the main gateway node, which is already in an executor) - auto client = std::make_shared(param_node_, node_name); - param_clients_[node_name] = client; + auto client = std::make_shared(tl_param_node, node_name); + tl_clients[node_name] = client; return client; } ParameterResult ConfigurationManager::list_parameters(const std::string & node_name) { - std::lock_guard op_lock(param_operations_mutex_); - ParameterResult result; + // Self-query guard: use direct access for gateway's own node + if (is_self_node(node_name)) { + return list_own_parameters(); + } - RCLCPP_DEBUG(node_->get_logger(), "list_parameters called for node: '%s'", node_name.c_str()); + // Negative cache: skip nodes that recently had no parameter service + if (is_node_unavailable(node_name)) { + ParameterResult result; + result.success = false; + result.error_message = "Parameter service not available for node (cached): " + node_name; + result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; + return result; + } + + // Per-node mutex: only block concurrent queries to the SAME node + std::lock_guard op_lock(get_node_mutex(node_name)); + ParameterResult result; // Cache default values on first access to this node cache_default_values(node_name); @@ -70,39 +179,24 @@ ParameterResult ConfigurationManager::list_parameters(const std::string & node_n try { auto client = get_param_client(node_name); - RCLCPP_DEBUG(node_->get_logger(), "Got param client for node: '%s'", node_name.c_str()); - if (!client->wait_for_service(get_service_timeout())) { result.success = false; result.error_message = "Parameter service not available for node: " + node_name; result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; - RCLCPP_WARN(node_->get_logger(), "Parameter service not available for node: '%s'", node_name.c_str()); + mark_node_unavailable(node_name); + RCLCPP_WARN(node_->get_logger(), "Parameter service not available for node: '%s' (cached for %.0fs)", + node_name.c_str(), negative_cache_ttl_sec_); return result; } - RCLCPP_DEBUG(node_->get_logger(), "Service ready for node: '%s'", node_name.c_str()); - - // List all parameter names auto param_names = client->list_parameters({}, 0); - RCLCPP_DEBUG(node_->get_logger(), "Got %zu parameter names for node: '%s'", param_names.names.size(), - node_name.c_str()); - - // Log first few parameter names for debugging - for (size_t i = 0; i < std::min(param_names.names.size(), static_cast(5)); ++i) { - RCLCPP_DEBUG(node_->get_logger(), " param[%zu]: '%s'", i, param_names.names[i].c_str()); - } - - // Get all parameter values - try in smaller batches if needed std::vector parameters; - - // First try getting all at once parameters = client->get_parameters(param_names.names); if (parameters.empty() && !param_names.names.empty()) { RCLCPP_WARN(node_->get_logger(), "get_parameters returned empty, trying one by one for node: '%s'", node_name.c_str()); - // Try getting parameters one by one for (const auto & name : param_names.names) { try { auto single_params = client->get_parameters({name}); @@ -115,8 +209,6 @@ ParameterResult ConfigurationManager::list_parameters(const std::string & node_n } } - RCLCPP_DEBUG(node_->get_logger(), "Got %zu parameter values for node: '%s'", parameters.size(), node_name.c_str()); - json params_array = json::array(); for (const auto & param : parameters) { json param_obj; @@ -139,7 +231,19 @@ ParameterResult ConfigurationManager::list_parameters(const std::string & node_n } ParameterResult ConfigurationManager::get_parameter(const std::string & node_name, const std::string & param_name) { - std::lock_guard op_lock(param_operations_mutex_); + if (is_self_node(node_name)) { + return get_own_parameter(param_name); + } + + if (is_node_unavailable(node_name)) { + ParameterResult result; + result.success = false; + result.error_message = "Parameter service not available for node (cached): " + node_name; + result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; + return result; + } + + std::lock_guard op_lock(get_node_mutex(node_name)); ParameterResult result; try { @@ -149,10 +253,10 @@ ParameterResult ConfigurationManager::get_parameter(const std::string & node_nam result.success = false; result.error_message = "Parameter service not available for node: " + node_name; result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; + mark_node_unavailable(node_name); return result; } - // Check if parameter exists auto param_names = client->list_parameters({param_name}, 1); if (param_names.names.empty()) { result.success = false; @@ -161,7 +265,6 @@ ParameterResult ConfigurationManager::get_parameter(const std::string & node_nam return result; } - // Get parameter value auto parameters = client->get_parameters({param_name}); if (parameters.empty()) { result.success = false; @@ -171,8 +274,6 @@ ParameterResult ConfigurationManager::get_parameter(const std::string & node_nam } const auto & param = parameters[0]; - - // Get parameter descriptor for additional metadata auto descriptors = client->describe_parameters({param_name}); json param_obj; @@ -198,7 +299,15 @@ ParameterResult ConfigurationManager::get_parameter(const std::string & node_nam ParameterResult ConfigurationManager::set_parameter(const std::string & node_name, const std::string & param_name, const json & value) { - std::lock_guard op_lock(param_operations_mutex_); + if (is_node_unavailable(node_name)) { + ParameterResult result; + result.success = false; + result.error_message = "Parameter service not available for node (cached): " + node_name; + result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; + return result; + } + + std::lock_guard op_lock(get_node_mutex(node_name)); ParameterResult result; try { @@ -208,26 +317,23 @@ ParameterResult ConfigurationManager::set_parameter(const std::string & node_nam result.success = false; result.error_message = "Parameter service not available for node: " + node_name; result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; + mark_node_unavailable(node_name); return result; } - // Get current parameter to determine type hint auto current_params = client->get_parameters({param_name}); rclcpp::ParameterType hint_type = rclcpp::ParameterType::PARAMETER_NOT_SET; if (!current_params.empty()) { hint_type = current_params[0].get_type(); } - // Convert JSON value to ROS2 parameter rclcpp::ParameterValue param_value = json_to_parameter_value(value, hint_type); rclcpp::Parameter param(param_name, param_value); - // Set parameter auto results = client->set_parameters({param}); if (results.empty() || !results[0].successful) { result.success = false; result.error_message = results.empty() ? "Failed to set parameter" : results[0].reason; - // Classify error based on reason if (!results.empty()) { const auto & reason = results[0].reason; if (reason.find("read-only") != std::string::npos || reason.find("read only") != std::string::npos || @@ -244,7 +350,6 @@ ParameterResult ConfigurationManager::set_parameter(const std::string & node_nam return result; } - // Return the new value json param_obj; param_obj["name"] = param_name; param_obj["value"] = parameter_value_to_json(param_value); @@ -315,7 +420,6 @@ json ConfigurationManager::parameter_value_to_json(const rclcpp::ParameterValue rclcpp::ParameterValue ConfigurationManager::json_to_parameter_value(const json & value, rclcpp::ParameterType hint_type) { - // If we have a type hint, try to match it if (hint_type != rclcpp::ParameterType::PARAMETER_NOT_SET) { switch (hint_type) { case rclcpp::ParameterType::PARAMETER_BOOL: @@ -344,7 +448,6 @@ rclcpp::ParameterValue ConfigurationManager::json_to_parameter_value(const json if (value.is_string()) { return rclcpp::ParameterValue(value.get()); } - // Convert other types to string return rclcpp::ParameterValue(value.dump()); case rclcpp::ParameterType::PARAMETER_BOOL_ARRAY: if (value.is_array()) { @@ -383,7 +486,6 @@ rclcpp::ParameterValue ConfigurationManager::json_to_parameter_value(const json } } - // Infer type from JSON value if (value.is_boolean()) { return rclcpp::ParameterValue(value.get()); } @@ -398,10 +500,8 @@ rclcpp::ParameterValue ConfigurationManager::json_to_parameter_value(const json } if (value.is_array()) { if (value.empty()) { - // Empty array with no type hint - default to string array return rclcpp::ParameterValue(std::vector{}); } - // Determine array type from first element if (value[0].is_boolean()) { return rclcpp::ParameterValue(value.get>()); } @@ -416,21 +516,21 @@ rclcpp::ParameterValue ConfigurationManager::json_to_parameter_value(const json } } - // Default: convert to string return rclcpp::ParameterValue(value.dump()); } void ConfigurationManager::cache_default_values(const std::string & node_name) { - // Check if already cached (quick check under lock) { std::lock_guard lock(defaults_mutex_); if (default_values_.find(node_name) != default_values_.end()) { return; } } - // Lock released — perform blocking I/O without holding defaults_mutex_ - RCLCPP_DEBUG(node_->get_logger(), "Caching default values for node: '%s'", node_name.c_str()); + // Skip if node is in negative cache + if (is_node_unavailable(node_name)) { + return; + } try { auto client = get_param_client(node_name); @@ -438,17 +538,15 @@ void ConfigurationManager::cache_default_values(const std::string & node_name) { if (!client->wait_for_service(get_service_timeout())) { RCLCPP_WARN(node_->get_logger(), "Cannot cache defaults - service not available for node: '%s'", node_name.c_str()); + mark_node_unavailable(node_name); return; } - // List all parameter names auto param_names = client->list_parameters({}, 0); - // Get all parameter values std::vector parameters; parameters = client->get_parameters(param_names.names); - // Fallback to one-by-one if batch fails if (parameters.empty() && !param_names.names.empty()) { for (const auto & name : param_names.names) { try { @@ -457,12 +555,10 @@ void ConfigurationManager::cache_default_values(const std::string & node_name) { parameters.push_back(single_params[0]); } } catch (const std::exception &) { - // Skip failed parameters } } } - // Store as defaults (re-acquire lock, double-check to avoid overwriting concurrent cacher) std::map node_defaults; for (const auto & param : parameters) { node_defaults[param.get_name()] = param; @@ -472,8 +568,6 @@ void ConfigurationManager::cache_default_values(const std::string & node_name) { std::lock_guard lock(defaults_mutex_); if (default_values_.find(node_name) == default_values_.end()) { default_values_[node_name] = std::move(node_defaults); - RCLCPP_DEBUG(node_->get_logger(), "Cached %zu default values for node: '%s'", default_values_[node_name].size(), - node_name.c_str()); } } } catch (const std::exception & e) { @@ -482,14 +576,20 @@ void ConfigurationManager::cache_default_values(const std::string & node_name) { } ParameterResult ConfigurationManager::reset_parameter(const std::string & node_name, const std::string & param_name) { - std::lock_guard op_lock(param_operations_mutex_); + if (is_node_unavailable(node_name)) { + ParameterResult result; + result.success = false; + result.error_message = "Parameter service not available for node (cached): " + node_name; + result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; + return result; + } + + std::lock_guard op_lock(get_node_mutex(node_name)); ParameterResult result; try { - // Ensure defaults are cached cache_default_values(node_name); - // Look up default value — copy under lock, then release before blocking I/O rclcpp::Parameter default_param; { std::lock_guard lock(defaults_mutex_); @@ -512,12 +612,12 @@ ParameterResult ConfigurationManager::reset_parameter(const std::string & node_n default_param = param_it->second; } - // Lock released — perform blocking I/O without holding defaults_mutex_ auto client = get_param_client(node_name); if (!client->wait_for_service(get_service_timeout())) { result.success = false; result.error_message = "Parameter service not available for node: " + node_name; result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; + mark_node_unavailable(node_name); return result; } @@ -529,7 +629,6 @@ ParameterResult ConfigurationManager::reset_parameter(const std::string & node_n return result; } - // Return the reset value json param_obj; param_obj["name"] = param_name; param_obj["value"] = parameter_value_to_json(default_param.get_parameter_value()); @@ -551,14 +650,20 @@ ParameterResult ConfigurationManager::reset_parameter(const std::string & node_n } ParameterResult ConfigurationManager::reset_all_parameters(const std::string & node_name) { - std::lock_guard op_lock(param_operations_mutex_); + if (is_node_unavailable(node_name)) { + ParameterResult result; + result.success = false; + result.error_message = "Parameter service not available for node (cached): " + node_name; + result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; + return result; + } + + std::lock_guard op_lock(get_node_mutex(node_name)); ParameterResult result; try { - // Ensure defaults are cached cache_default_values(node_name); - // Look up default values — copy under lock, then release before blocking I/O std::vector params_to_reset; { std::lock_guard lock(defaults_mutex_); @@ -576,21 +681,19 @@ ParameterResult ConfigurationManager::reset_all_parameters(const std::string & n } } - // Lock released — perform blocking I/O without holding defaults_mutex_ auto client = get_param_client(node_name); if (!client->wait_for_service(get_service_timeout())) { result.success = false; result.error_message = "Parameter service not available for node: " + node_name; result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; + mark_node_unavailable(node_name); return result; } - // Reset all parameters size_t reset_count = 0; size_t failed_count = 0; json failed_params = json::array(); - // Set parameters one by one to handle partial failures for (const auto & param : params_to_reset) { try { auto results = client->set_parameters({param}); @@ -619,7 +722,7 @@ ParameterResult ConfigurationManager::reset_all_parameters(const std::string & n if (failed_count > 0) { result.error_message = "Some parameters could not be reset"; - result.error_code = ParameterErrorCode::INTERNAL_ERROR; // Partial failure + result.error_code = ParameterErrorCode::INTERNAL_ERROR; } RCLCPP_INFO(node_->get_logger(), "Reset %zu parameters on node '%s' (%zu failed)", reset_count, node_name.c_str(), diff --git a/src/ros2_medkit_gateway/test/test_configuration_manager.cpp b/src/ros2_medkit_gateway/test/test_configuration_manager.cpp index 196b35ceb..57fcdc5c6 100644 --- a/src/ros2_medkit_gateway/test/test_configuration_manager.cpp +++ b/src/ros2_medkit_gateway/test/test_configuration_manager.cpp @@ -316,6 +316,66 @@ TEST_F(TestConfigurationManager, test_error_messages_informative) { EXPECT_FALSE(result2.error_message.empty()); } +// ==================== NEGATIVE CACHE TESTS ==================== + +TEST_F(TestConfigurationManager, test_negative_cache_fast_return) { + // First call to nonexistent node: waits for timeout, returns SERVICE_UNAVAILABLE + auto start1 = std::chrono::steady_clock::now(); + auto result1 = config_manager_->list_parameters("/nonexistent_cached_node"); + auto elapsed1 = std::chrono::steady_clock::now() - start1; + EXPECT_FALSE(result1.success); + EXPECT_EQ(result1.error_code, ParameterErrorCode::SERVICE_UNAVAILABLE); + + // Second call: should return immediately from negative cache (< 10ms) + auto start2 = std::chrono::steady_clock::now(); + auto result2 = config_manager_->list_parameters("/nonexistent_cached_node"); + auto elapsed2 = std::chrono::steady_clock::now() - start2; + EXPECT_FALSE(result2.success); + EXPECT_EQ(result2.error_code, ParameterErrorCode::SERVICE_UNAVAILABLE); + EXPECT_TRUE(result2.error_message.find("cached") != std::string::npos); + + // Second call should be much faster than first (negative cache hit) + EXPECT_LT(elapsed2, elapsed1 / 2); +} + +// ==================== SELF-QUERY GUARD TESTS ==================== + +TEST_F(TestConfigurationManager, test_self_query_no_deadlock) { + // Querying our own node should work without deadlock (direct access, no IPC) + auto start = std::chrono::steady_clock::now(); + auto result = config_manager_->list_parameters("/test_config_manager_node"); + auto elapsed = std::chrono::steady_clock::now() - start; + + EXPECT_TRUE(result.success); + // Self-query should be fast (< 100ms, no service call overhead) + EXPECT_LT(elapsed, std::chrono::milliseconds(100)); +} + +// ==================== PER-NODE MUTEX TESTS ==================== + +TEST_F(TestConfigurationManager, test_parallel_different_nodes_not_serialized) { + // Queries to different nonexistent nodes should run in parallel, + // not serialize behind a single mutex + auto start = std::chrono::steady_clock::now(); + + std::vector threads; + for (int i = 0; i < 3; ++i) { + threads.emplace_back([this, i]() { + config_manager_->list_parameters("/parallel_test_node_" + std::to_string(i)); + }); + } + for (auto & t : threads) { + t.join(); + } + + auto elapsed = std::chrono::steady_clock::now() - start; + // With per-node mutexes: ~0.1s (parallel, timeout per node) + // With single mutex: ~0.3s (serial, 3 x timeout) + // Allow some slack for CI: should be under 2x single timeout + auto single_timeout = std::chrono::duration(0.1); // test uses 0.1s timeout + EXPECT_LT(elapsed, single_timeout * 2.5); +} + // ==================== CONCURRENT ACCESS TEST ==================== TEST_F(TestConfigurationManager, test_concurrent_parameter_access) { From 6ebc474807561bfac880c6bba5c7931ba53a8b85 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 29 Mar 2026 14:41:12 +0200 Subject: [PATCH 02/11] fix(gateway): address PR #323 review - self-guard for set/reset, lazy cleanup, stable tests - Add self-query guard to set_parameter() and reset_parameter() (Copilot review) - Move cache_default_values() after wait_for_service() to avoid double timeout - Add lazy cleanup of expired negative cache entries (caps at 100 before sweep) - Remove timing-based test assertions, use serial vs parallel comparison instead --- .../src/configuration_manager.cpp | 81 ++++++++++++++++++- .../test/test_configuration_manager.cpp | 37 ++++----- 2 files changed, 94 insertions(+), 24 deletions(-) diff --git a/src/ros2_medkit_gateway/src/configuration_manager.cpp b/src/ros2_medkit_gateway/src/configuration_manager.cpp index 6b434e8c1..608040f01 100644 --- a/src/ros2_medkit_gateway/src/configuration_manager.cpp +++ b/src/ros2_medkit_gateway/src/configuration_manager.cpp @@ -70,6 +70,19 @@ bool ConfigurationManager::is_node_unavailable(const std::string & node_name) co void ConfigurationManager::mark_node_unavailable(const std::string & node_name) { std::unique_lock lock(negative_cache_mutex_); unavailable_nodes_[node_name] = std::chrono::steady_clock::now(); + + // Lazy cleanup: remove expired entries to prevent unbounded growth + if (unavailable_nodes_.size() > 100) { + auto now = std::chrono::steady_clock::now(); + auto ttl = std::chrono::duration(negative_cache_ttl_sec_); + for (auto it = unavailable_nodes_.begin(); it != unavailable_nodes_.end();) { + if (now - it->second > ttl) { + it = unavailable_nodes_.erase(it); + } else { + ++it; + } + } + } } bool ConfigurationManager::is_self_node(const std::string & node_name) const { @@ -173,9 +186,6 @@ ParameterResult ConfigurationManager::list_parameters(const std::string & node_n std::lock_guard op_lock(get_node_mutex(node_name)); ParameterResult result; - // Cache default values on first access to this node - cache_default_values(node_name); - try { auto client = get_param_client(node_name); @@ -189,6 +199,9 @@ ParameterResult ConfigurationManager::list_parameters(const std::string & node_n return result; } + // Cache default values after confirming service is available + cache_default_values(node_name); + auto param_names = client->list_parameters({}, 0); std::vector parameters; @@ -299,6 +312,33 @@ ParameterResult ConfigurationManager::get_parameter(const std::string & node_nam ParameterResult ConfigurationManager::set_parameter(const std::string & node_name, const std::string & param_name, const json & value) { + // Self-query guard: set parameter directly on own node + if (is_self_node(node_name)) { + ParameterResult result; + try { + auto current_value = node_->get_parameter(param_name).get_parameter_value(); + rclcpp::ParameterValue param_value = json_to_parameter_value(value, current_value.get_type()); + auto set_result = node_->set_parameter(rclcpp::Parameter(param_name, param_value)); + if (!set_result.successful) { + result.success = false; + result.error_message = set_result.reason; + result.error_code = ParameterErrorCode::INVALID_VALUE; + return result; + } + json param_obj; + param_obj["name"] = param_name; + param_obj["value"] = parameter_value_to_json(param_value); + param_obj["type"] = parameter_type_to_string(param_value.get_type()); + result.success = true; + result.data = param_obj; + } catch (const std::exception & e) { + result.success = false; + result.error_message = std::string("Failed to set own parameter: ") + e.what(); + result.error_code = ParameterErrorCode::INTERNAL_ERROR; + } + return result; + } + if (is_node_unavailable(node_name)) { ParameterResult result; result.success = false; @@ -576,6 +616,41 @@ void ConfigurationManager::cache_default_values(const std::string & node_name) { } ParameterResult ConfigurationManager::reset_parameter(const std::string & node_name, const std::string & param_name) { + // Self-query guard: reset via direct access + if (is_self_node(node_name)) { + ParameterResult result; + std::lock_guard lock(defaults_mutex_); + auto node_it = default_values_.find(node_name); + if (node_it == default_values_.end()) { + result.success = false; + result.error_message = "No default values cached for node: " + node_name; + result.error_code = ParameterErrorCode::NO_DEFAULTS_CACHED; + return result; + } + auto param_it = node_it->second.find(param_name); + if (param_it == node_it->second.end()) { + result.success = false; + result.error_message = "No default value for parameter: " + param_name; + result.error_code = ParameterErrorCode::NOT_FOUND; + return result; + } + auto set_result = node_->set_parameter(param_it->second); + if (!set_result.successful) { + result.success = false; + result.error_message = set_result.reason; + result.error_code = ParameterErrorCode::INTERNAL_ERROR; + return result; + } + json param_obj; + param_obj["name"] = param_name; + param_obj["value"] = parameter_value_to_json(param_it->second.get_parameter_value()); + param_obj["type"] = parameter_type_to_string(param_it->second.get_type()); + param_obj["reset_to_default"] = true; + result.success = true; + result.data = param_obj; + return result; + } + if (is_node_unavailable(node_name)) { ParameterResult result; result.success = false; diff --git a/src/ros2_medkit_gateway/test/test_configuration_manager.cpp b/src/ros2_medkit_gateway/test/test_configuration_manager.cpp index 57fcdc5c6..7ae762907 100644 --- a/src/ros2_medkit_gateway/test/test_configuration_manager.cpp +++ b/src/ros2_medkit_gateway/test/test_configuration_manager.cpp @@ -320,60 +320,55 @@ TEST_F(TestConfigurationManager, test_error_messages_informative) { TEST_F(TestConfigurationManager, test_negative_cache_fast_return) { // First call to nonexistent node: waits for timeout, returns SERVICE_UNAVAILABLE - auto start1 = std::chrono::steady_clock::now(); auto result1 = config_manager_->list_parameters("/nonexistent_cached_node"); - auto elapsed1 = std::chrono::steady_clock::now() - start1; EXPECT_FALSE(result1.success); EXPECT_EQ(result1.error_code, ParameterErrorCode::SERVICE_UNAVAILABLE); - // Second call: should return immediately from negative cache (< 10ms) - auto start2 = std::chrono::steady_clock::now(); + // Second call: should return from negative cache (no service discovery wait) auto result2 = config_manager_->list_parameters("/nonexistent_cached_node"); - auto elapsed2 = std::chrono::steady_clock::now() - start2; EXPECT_FALSE(result2.success); EXPECT_EQ(result2.error_code, ParameterErrorCode::SERVICE_UNAVAILABLE); EXPECT_TRUE(result2.error_message.find("cached") != std::string::npos); - - // Second call should be much faster than first (negative cache hit) - EXPECT_LT(elapsed2, elapsed1 / 2); } // ==================== SELF-QUERY GUARD TESTS ==================== TEST_F(TestConfigurationManager, test_self_query_no_deadlock) { // Querying our own node should work without deadlock (direct access, no IPC) - auto start = std::chrono::steady_clock::now(); auto result = config_manager_->list_parameters("/test_config_manager_node"); - auto elapsed = std::chrono::steady_clock::now() - start; EXPECT_TRUE(result.success); - // Self-query should be fast (< 100ms, no service call overhead) - EXPECT_LT(elapsed, std::chrono::milliseconds(100)); } // ==================== PER-NODE MUTEX TESTS ==================== TEST_F(TestConfigurationManager, test_parallel_different_nodes_not_serialized) { // Queries to different nonexistent nodes should run in parallel, - // not serialize behind a single mutex - auto start = std::chrono::steady_clock::now(); + // not serialize behind a single mutex. + // Compare serial vs parallel to avoid absolute timing thresholds. + + // Serial baseline + auto serial_start = std::chrono::steady_clock::now(); + for (int i = 0; i < 3; ++i) { + (void)config_manager_->list_parameters("/parallel_test_node_" + std::to_string(i)); + } + auto serial_elapsed = std::chrono::steady_clock::now() - serial_start; + // Parallel run + auto parallel_start = std::chrono::steady_clock::now(); std::vector threads; for (int i = 0; i < 3; ++i) { threads.emplace_back([this, i]() { - config_manager_->list_parameters("/parallel_test_node_" + std::to_string(i)); + (void)config_manager_->list_parameters("/parallel_test_node_" + std::to_string(i)); }); } for (auto & t : threads) { t.join(); } + auto parallel_elapsed = std::chrono::steady_clock::now() - parallel_start; - auto elapsed = std::chrono::steady_clock::now() - start; - // With per-node mutexes: ~0.1s (parallel, timeout per node) - // With single mutex: ~0.3s (serial, 3 x timeout) - // Allow some slack for CI: should be under 2x single timeout - auto single_timeout = std::chrono::duration(0.1); // test uses 0.1s timeout - EXPECT_LT(elapsed, single_timeout * 2.5); + // Parallel should be noticeably faster than serial (generous factor for CI) + EXPECT_LT(parallel_elapsed, serial_elapsed * 0.9); } // ==================== CONCURRENT ACCESS TEST ==================== From 9faaf3b35395775b2acfdd2e43bf2c5b3e162c59 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 29 Mar 2026 14:46:58 +0200 Subject: [PATCH 03/11] fix(gateway): deterministic shutdown for thread-local param nodes + document new parameters - Add shutdown() method and destructor to ConfigurationManager that cleans up thread-local ROS 2 param nodes via weak_ptr registry, preventing use-after-free when httplib threads outlive the ROS 2 context - Register each thread-local param node in registry on creation - Document parameter_service_timeout_sec (default 0.5s) and parameter_service_negative_cache_sec (default 60s) in gateway_params.yaml and docs/config/server.rst --- docs/config/server.rst | 12 +++++++++++ .../config/gateway_params.yaml | 14 +++++++++++++ .../configuration_manager.hpp | 14 +++++++++++++ .../src/configuration_manager.cpp | 21 +++++++++++++++++++ 4 files changed, 61 insertions(+) diff --git a/docs/config/server.rst b/docs/config/server.rst index a097fd172..9455fc1d7 100644 --- a/docs/config/server.rst +++ b/docs/config/server.rst @@ -154,6 +154,18 @@ Data Access Settings - float - ``2.0`` - Timeout for sampling topics with active publishers. Range: 0.1-30.0. + * - ``parameter_service_timeout_sec`` + - float + - ``0.5`` + - Timeout for ROS 2 parameter service calls (configurations endpoint). + Nodes without parameter service (e.g., micro-ROS bridges) block for this + duration before returning SERVICE_UNAVAILABLE. Range: 0.1-10.0. + * - ``parameter_service_negative_cache_sec`` + - float + - ``60.0`` + - After a node's parameter service fails to respond, subsequent requests + return immediately with SERVICE_UNAVAILABLE for this duration. + Set to 0 to disable. Range: 0-3600. .. note:: diff --git a/src/ros2_medkit_gateway/config/gateway_params.yaml b/src/ros2_medkit_gateway/config/gateway_params.yaml index c8a1ee98f..ef46d7c15 100644 --- a/src/ros2_medkit_gateway/config/gateway_params.yaml +++ b/src/ros2_medkit_gateway/config/gateway_params.yaml @@ -91,6 +91,20 @@ ros2_medkit_gateway: # Valid range: 0.1-30.0 topic_sample_timeout_sec: 2.0 + # Timeout (in seconds) for ROS 2 parameter service calls (configurations endpoint). + # Nodes without parameter service (e.g., micro-ROS bridges) will block for this + # duration before returning SERVICE_UNAVAILABLE. Lower values improve responsiveness + # on systems with mixed node types. + # Valid range: 0.1-10.0 + parameter_service_timeout_sec: 0.5 + + # Negative cache TTL (in seconds) for parameter service unavailability. + # After a node's parameter service fails to respond, subsequent requests return + # immediately with SERVICE_UNAVAILABLE for this duration instead of blocking. + # Set to 0 to disable negative caching. + # Valid range: 0-3600 + parameter_service_negative_cache_sec: 60.0 + # Fault Manager integration fault_manager: # Namespace prefix used to resolve Fault Manager services. diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp index 271711500..e3f8210e5 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp @@ -56,6 +56,11 @@ struct ParameterResult { class ConfigurationManager { public: explicit ConfigurationManager(rclcpp::Node * node); + ~ConfigurationManager(); + + /// Clean up thread-local param nodes before ROS 2 context shutdown. + /// Must be called before rclcpp::shutdown() to prevent use-after-free. + void shutdown(); /// List all parameters for a node /// @param node_name Fully qualified node name (e.g., "/powertrain/engine/engine_temp_sensor") @@ -149,6 +154,15 @@ class ConfigurationManager { /// Key: node_name, Value: map of param_name -> Parameter mutable std::mutex defaults_mutex_; std::map> default_values_; + + /// Registry of thread-local param nodes for deterministic cleanup. + /// Thread-local nodes are registered here on creation and cleared in shutdown() + /// to prevent use-after-free when ROS 2 context is destroyed before httplib threads exit. + mutable std::mutex param_nodes_registry_mutex_; + std::vector> param_nodes_registry_; + + /// Register a thread-local param node for cleanup tracking + void register_param_node(std::shared_ptr node); }; } // namespace ros2_medkit_gateway diff --git a/src/ros2_medkit_gateway/src/configuration_manager.cpp b/src/ros2_medkit_gateway/src/configuration_manager.cpp index 608040f01..343921260 100644 --- a/src/ros2_medkit_gateway/src/configuration_manager.cpp +++ b/src/ros2_medkit_gateway/src/configuration_manager.cpp @@ -38,6 +38,25 @@ ConfigurationManager::ConfigurationManager(rclcpp::Node * node) : node_(node) { service_timeout_sec_, negative_cache_ttl_sec_); } +ConfigurationManager::~ConfigurationManager() { + shutdown(); +} + +void ConfigurationManager::shutdown() { + std::lock_guard lock(param_nodes_registry_mutex_); + for (auto & weak_node : param_nodes_registry_) { + if (auto node = weak_node.lock()) { + node.reset(); + } + } + param_nodes_registry_.clear(); +} + +void ConfigurationManager::register_param_node(std::shared_ptr node) { + std::lock_guard lock(param_nodes_registry_mutex_); + param_nodes_registry_.push_back(node); +} + std::chrono::duration ConfigurationManager::get_service_timeout() const { return std::chrono::duration(service_timeout_sec_); } @@ -144,6 +163,7 @@ std::shared_ptr ConfigurationManager::get_param_cl // Create a thread-local param node to avoid executor conflicts. // SyncParametersClient spins its node internally - sharing one node // across threads causes "Node already added to executor" errors. + // Nodes are registered for deterministic cleanup on shutdown. thread_local std::shared_ptr tl_param_node; thread_local std::map> tl_clients; @@ -155,6 +175,7 @@ std::shared_ptr ConfigurationManager::get_param_cl int id = param_node_counter.fetch_add(1); tl_param_node = std::make_shared( "_param_client_" + std::to_string(id), options); + register_param_node(tl_param_node); } auto it = tl_clients.find(node_name); From 7c623526aed1acf4b151667bc35bd0f35e8b801d Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 29 Mar 2026 14:55:10 +0200 Subject: [PATCH 04/11] fix(gateway): replace thread-local param nodes with owned pool, fix test parallelism Address remaining review findings from PR #323 re-review: - Replace thread_local param nodes with ConfigurationManager-owned pool. Nodes are acquired/released via RAII (custom deleter on shared_ptr). shutdown() deterministically destroys all nodes before rclcpp::shutdown(). - Fix parallel test: use different node names for serial vs parallel phases so both phases actually hit parameter service timeouts (not negative cache). - Add lazy cleanup for node_mutexes_ map at >200 entries (removes unlocked mutexes for transient nodes). --- .../configuration_manager.hpp | 21 +++-- .../src/configuration_manager.cpp | 86 +++++++++++-------- .../test/test_configuration_manager.cpp | 8 +- 3 files changed, 66 insertions(+), 49 deletions(-) diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp index e3f8210e5..ca55bd85c 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp @@ -155,14 +155,19 @@ class ConfigurationManager { mutable std::mutex defaults_mutex_; std::map> default_values_; - /// Registry of thread-local param nodes for deterministic cleanup. - /// Thread-local nodes are registered here on creation and cleared in shutdown() - /// to prevent use-after-free when ROS 2 context is destroyed before httplib threads exit. - mutable std::mutex param_nodes_registry_mutex_; - std::vector> param_nodes_registry_; - - /// Register a thread-local param node for cleanup tracking - void register_param_node(std::shared_ptr node); + /// Pool of param nodes for SyncParametersClient operations. + /// Each node can only be used by one thread at a time (SyncParametersClient + /// spins internally and is not thread-safe). Threads acquire a node from the + /// pool and return it when done. Pool grows on demand, shrinks on shutdown(). + mutable std::mutex param_pool_mutex_; + std::vector> param_pool_available_; + std::vector> param_pool_all_; + + /// Acquire a param node from the pool (creates one if none available) + std::shared_ptr acquire_param_node(); + + /// Return a param node to the pool + void release_param_node(std::shared_ptr node); }; } // namespace ros2_medkit_gateway diff --git a/src/ros2_medkit_gateway/src/configuration_manager.cpp b/src/ros2_medkit_gateway/src/configuration_manager.cpp index 343921260..3f097f38b 100644 --- a/src/ros2_medkit_gateway/src/configuration_manager.cpp +++ b/src/ros2_medkit_gateway/src/configuration_manager.cpp @@ -43,18 +43,33 @@ ConfigurationManager::~ConfigurationManager() { } void ConfigurationManager::shutdown() { - std::lock_guard lock(param_nodes_registry_mutex_); - for (auto & weak_node : param_nodes_registry_) { - if (auto node = weak_node.lock()) { - node.reset(); - } - } - param_nodes_registry_.clear(); + std::lock_guard lock(param_pool_mutex_); + param_pool_available_.clear(); + param_pool_all_.clear(); } -void ConfigurationManager::register_param_node(std::shared_ptr node) { - std::lock_guard lock(param_nodes_registry_mutex_); - param_nodes_registry_.push_back(node); +std::shared_ptr ConfigurationManager::acquire_param_node() { + std::lock_guard lock(param_pool_mutex_); + if (!param_pool_available_.empty()) { + auto node = std::move(param_pool_available_.back()); + param_pool_available_.pop_back(); + return node; + } + // Create new node + rclcpp::NodeOptions options; + options.start_parameter_services(false); + options.start_parameter_event_publisher(false); + options.use_global_arguments(false); + int id = param_node_counter.fetch_add(1); + auto node = std::make_shared( + "_param_client_" + std::to_string(id), options); + param_pool_all_.push_back(node); + return node; +} + +void ConfigurationManager::release_param_node(std::shared_ptr node) { + std::lock_guard lock(param_pool_mutex_); + param_pool_available_.push_back(std::move(node)); } std::chrono::duration ConfigurationManager::get_service_timeout() const { @@ -70,8 +85,19 @@ std::mutex & ConfigurationManager::get_node_mutex(const std::string & node_name) return *it->second; } } - // Slow path: create new mutex + // Slow path: create new mutex, with lazy cleanup to prevent unbounded growth std::unique_lock lock(node_mutexes_mutex_); + if (node_mutexes_.size() > 200) { + // Remove mutexes that are not currently locked (try_lock succeeds) + for (auto it = node_mutexes_.begin(); it != node_mutexes_.end();) { + if (it->first != node_name && it->second->try_lock()) { + it->second->unlock(); + it = node_mutexes_.erase(it); + } else { + ++it; + } + } + } auto [it, inserted] = node_mutexes_.try_emplace(node_name, std::make_unique()); return *it->second; } @@ -160,32 +186,18 @@ ParameterResult ConfigurationManager::get_own_parameter(const std::string & para } std::shared_ptr ConfigurationManager::get_param_client(const std::string & node_name) { - // Create a thread-local param node to avoid executor conflicts. - // SyncParametersClient spins its node internally - sharing one node - // across threads causes "Node already added to executor" errors. - // Nodes are registered for deterministic cleanup on shutdown. - thread_local std::shared_ptr tl_param_node; - thread_local std::map> tl_clients; - - if (!tl_param_node) { - rclcpp::NodeOptions options; - options.start_parameter_services(false); - options.start_parameter_event_publisher(false); - options.use_global_arguments(false); - int id = param_node_counter.fetch_add(1); - tl_param_node = std::make_shared( - "_param_client_" + std::to_string(id), options); - register_param_node(tl_param_node); - } - - auto it = tl_clients.find(node_name); - if (it != tl_clients.end()) { - return it->second; - } - - auto client = std::make_shared(tl_param_node, node_name); - tl_clients[node_name] = client; - return client; + // Acquire a param node from the pool. The param node is captured in a + // custom deleter on the SyncParametersClient shared_ptr - when the client + // is destroyed (goes out of scope in the calling function), the param node + // is automatically returned to the pool. + auto param_node = acquire_param_node(); + auto * mgr = this; + auto client_raw = new rclcpp::SyncParametersClient(param_node, node_name); + return std::shared_ptr( + client_raw, [mgr, param_node](rclcpp::SyncParametersClient * ptr) { + delete ptr; + mgr->release_param_node(param_node); + }); } ParameterResult ConfigurationManager::list_parameters(const std::string & node_name) { diff --git a/src/ros2_medkit_gateway/test/test_configuration_manager.cpp b/src/ros2_medkit_gateway/test/test_configuration_manager.cpp index 7ae762907..378757cbb 100644 --- a/src/ros2_medkit_gateway/test/test_configuration_manager.cpp +++ b/src/ros2_medkit_gateway/test/test_configuration_manager.cpp @@ -345,16 +345,16 @@ TEST_F(TestConfigurationManager, test_self_query_no_deadlock) { TEST_F(TestConfigurationManager, test_parallel_different_nodes_not_serialized) { // Queries to different nonexistent nodes should run in parallel, // not serialize behind a single mutex. - // Compare serial vs parallel to avoid absolute timing thresholds. + // Use different node names for serial vs parallel to avoid negative cache hits. - // Serial baseline + // Serial baseline (uses _serial_ prefix) auto serial_start = std::chrono::steady_clock::now(); for (int i = 0; i < 3; ++i) { - (void)config_manager_->list_parameters("/parallel_test_node_" + std::to_string(i)); + (void)config_manager_->list_parameters("/serial_test_node_" + std::to_string(i)); } auto serial_elapsed = std::chrono::steady_clock::now() - serial_start; - // Parallel run + // Parallel run (uses _parallel_ prefix - different names, no cache hit) auto parallel_start = std::chrono::steady_clock::now(); std::vector threads; for (int i = 0; i < 3; ++i) { From 62c30ca2bc5087954590d7264d6f2907714e7389 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 29 Mar 2026 16:05:42 +0200 Subject: [PATCH 05/11] fix(gateway): cache defaults in self-query path, add self-guard for reset_all, fix clang-format - list_own_parameters() now caches defaults for reset operations (fixes test_set_and_reset_parameter failure) - Add self-query guard to reset_all_parameters() (fixes test_concurrent_parameter_operations_no_executor_error - all 50 ops now go through direct access for own node) - Run clang-format to match CI style --- .../src/configuration_manager.cpp | 71 +++++++++++++++++-- 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/src/ros2_medkit_gateway/src/configuration_manager.cpp b/src/ros2_medkit_gateway/src/configuration_manager.cpp index 3f097f38b..bed05c9d8 100644 --- a/src/ros2_medkit_gateway/src/configuration_manager.cpp +++ b/src/ros2_medkit_gateway/src/configuration_manager.cpp @@ -61,8 +61,7 @@ std::shared_ptr ConfigurationManager::acquire_param_node() { options.start_parameter_event_publisher(false); options.use_global_arguments(false); int id = param_node_counter.fetch_add(1); - auto node = std::make_shared( - "_param_client_" + std::to_string(id), options); + auto node = std::make_shared("_param_client_" + std::to_string(id), options); param_pool_all_.push_back(node); return node; } @@ -138,6 +137,19 @@ ParameterResult ConfigurationManager::list_own_parameters() { ParameterResult result; try { auto params = node_->get_parameters(node_->list_parameters({}, 0).names); + + // Cache defaults for reset operations (same as IPC path) + { + std::lock_guard lock(defaults_mutex_); + if (default_values_.find(own_node_fqn_) == default_values_.end()) { + std::map node_defaults; + for (const auto & param : params) { + node_defaults[param.get_name()] = param; + } + default_values_[own_node_fqn_] = std::move(node_defaults); + } + } + json params_array = json::array(); for (const auto & param : params) { json param_obj; @@ -193,11 +205,11 @@ std::shared_ptr ConfigurationManager::get_param_cl auto param_node = acquire_param_node(); auto * mgr = this; auto client_raw = new rclcpp::SyncParametersClient(param_node, node_name); - return std::shared_ptr( - client_raw, [mgr, param_node](rclcpp::SyncParametersClient * ptr) { - delete ptr; - mgr->release_param_node(param_node); - }); + return std::shared_ptr(client_raw, + [mgr, param_node](rclcpp::SyncParametersClient * ptr) { + delete ptr; + mgr->release_param_node(param_node); + }); } ParameterResult ConfigurationManager::list_parameters(const std::string & node_name) { @@ -758,6 +770,51 @@ ParameterResult ConfigurationManager::reset_parameter(const std::string & node_n } ParameterResult ConfigurationManager::reset_all_parameters(const std::string & node_name) { + // Self-query guard: reset via direct access + if (is_self_node(node_name)) { + ParameterResult result; + std::vector params_to_reset; + { + std::lock_guard lock(defaults_mutex_); + auto node_it = default_values_.find(node_name); + if (node_it == default_values_.end()) { + result.success = false; + result.error_message = "No default values cached for node: " + node_name; + result.error_code = ParameterErrorCode::NO_DEFAULTS_CACHED; + return result; + } + for (const auto & [name, param] : node_it->second) { + params_to_reset.push_back(param); + } + } + size_t reset_count = 0; + size_t failed_count = 0; + json failed_params = json::array(); + for (const auto & param : params_to_reset) { + auto set_result = node_->set_parameter(param); + if (set_result.successful) { + reset_count++; + } else { + failed_count++; + failed_params.push_back(param.get_name()); + } + } + json response; + response["node_name"] = node_name; + response["reset_count"] = reset_count; + response["failed_count"] = failed_count; + if (!failed_params.empty()) { + response["failed_parameters"] = failed_params; + } + result.success = (failed_count == 0); + result.data = response; + if (failed_count > 0) { + result.error_message = "Some parameters could not be reset"; + result.error_code = ParameterErrorCode::INTERNAL_ERROR; + } + return result; + } + if (is_node_unavailable(node_name)) { ParameterResult result; result.success = false; From a1cb80a1b680d516816ef9b1f3063e057418ba55 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 29 Mar 2026 20:41:46 +0200 Subject: [PATCH 06/11] fix(gateway): replace pool with per-target cached param nodes, restore default timeout Pool-based param nodes created fresh SyncParametersClient per call, breaking DDS service discovery caching and causing test_17 regression (test_scenario_discovery_manifest). Replace pool with per-target-node entries: each target gets a dedicated param_node + cached SyncParametersClient. Per-node mutex ensures no concurrent spin. Service discovery is cached across calls to the same target. shutdown() clears the map deterministically. Restore default parameter_service_timeout_sec to 2.0s (was 0.5s) - the lower value caused integration test failures. Users with micro-ROS nodes can set 0.5s in their config. --- docs/config/server.rst | 2 +- .../config/gateway_params.yaml | 2 +- .../configuration_manager.hpp | 25 ++++--- .../src/configuration_manager.cpp | 66 ++++++++----------- 4 files changed, 39 insertions(+), 56 deletions(-) diff --git a/docs/config/server.rst b/docs/config/server.rst index 9455fc1d7..96ad372a8 100644 --- a/docs/config/server.rst +++ b/docs/config/server.rst @@ -156,7 +156,7 @@ Data Access Settings - Timeout for sampling topics with active publishers. Range: 0.1-30.0. * - ``parameter_service_timeout_sec`` - float - - ``0.5`` + - ``2.0`` - Timeout for ROS 2 parameter service calls (configurations endpoint). Nodes without parameter service (e.g., micro-ROS bridges) block for this duration before returning SERVICE_UNAVAILABLE. Range: 0.1-10.0. diff --git a/src/ros2_medkit_gateway/config/gateway_params.yaml b/src/ros2_medkit_gateway/config/gateway_params.yaml index ef46d7c15..3016c67f2 100644 --- a/src/ros2_medkit_gateway/config/gateway_params.yaml +++ b/src/ros2_medkit_gateway/config/gateway_params.yaml @@ -96,7 +96,7 @@ ros2_medkit_gateway: # duration before returning SERVICE_UNAVAILABLE. Lower values improve responsiveness # on systems with mixed node types. # Valid range: 0.1-10.0 - parameter_service_timeout_sec: 0.5 + parameter_service_timeout_sec: 2.0 # Negative cache TTL (in seconds) for parameter service unavailability. # After a node's parameter service fails to respond, subsequent requests return diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp index ca55bd85c..4532dd75c 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp @@ -129,7 +129,7 @@ class ConfigurationManager { rclcpp::Node * node_; /// Timeout for waiting for parameter services (configurable via parameter_service_timeout_sec parameter) - double service_timeout_sec_{0.5}; + double service_timeout_sec_{2.0}; /// Negative cache TTL (configurable via parameter_service_negative_cache_sec parameter) double negative_cache_ttl_sec_{60.0}; @@ -155,19 +155,16 @@ class ConfigurationManager { mutable std::mutex defaults_mutex_; std::map> default_values_; - /// Pool of param nodes for SyncParametersClient operations. - /// Each node can only be used by one thread at a time (SyncParametersClient - /// spins internally and is not thread-safe). Threads acquire a node from the - /// pool and return it when done. Pool grows on demand, shrinks on shutdown(). - mutable std::mutex param_pool_mutex_; - std::vector> param_pool_available_; - std::vector> param_pool_all_; - - /// Acquire a param node from the pool (creates one if none available) - std::shared_ptr acquire_param_node(); - - /// Return a param node to the pool - void release_param_node(std::shared_ptr node); + /// Per-target-node param client entries. + /// Each target node gets a dedicated ROS 2 param node + cached SyncParametersClient. + /// The per-node mutex (from node_mutexes_) ensures only one thread spins a + /// given param_node at a time, preventing executor conflicts. + struct ParamClientEntry { + std::shared_ptr param_node; + std::shared_ptr client; + }; + mutable std::mutex param_clients_mutex_; + std::unordered_map param_clients_; }; } // namespace ros2_medkit_gateway diff --git a/src/ros2_medkit_gateway/src/configuration_manager.cpp b/src/ros2_medkit_gateway/src/configuration_manager.cpp index bed05c9d8..709cb2852 100644 --- a/src/ros2_medkit_gateway/src/configuration_manager.cpp +++ b/src/ros2_medkit_gateway/src/configuration_manager.cpp @@ -25,8 +25,9 @@ namespace ros2_medkit_gateway { static std::atomic param_node_counter{0}; ConfigurationManager::ConfigurationManager(rclcpp::Node * node) : node_(node) { - // Get configurable timeout for parameter services (default 0.5 seconds) - service_timeout_sec_ = node_->declare_parameter("parameter_service_timeout_sec", 0.5); + // Get configurable timeout for parameter services (default 2.0 seconds) + // For systems with micro-ROS nodes without parameter service, set lower (e.g., 0.5s) + service_timeout_sec_ = node_->declare_parameter("parameter_service_timeout_sec", 2.0); // Get negative cache TTL (default 60 seconds) negative_cache_ttl_sec_ = node_->declare_parameter("parameter_service_negative_cache_sec", 60.0); @@ -43,32 +44,8 @@ ConfigurationManager::~ConfigurationManager() { } void ConfigurationManager::shutdown() { - std::lock_guard lock(param_pool_mutex_); - param_pool_available_.clear(); - param_pool_all_.clear(); -} - -std::shared_ptr ConfigurationManager::acquire_param_node() { - std::lock_guard lock(param_pool_mutex_); - if (!param_pool_available_.empty()) { - auto node = std::move(param_pool_available_.back()); - param_pool_available_.pop_back(); - return node; - } - // Create new node - rclcpp::NodeOptions options; - options.start_parameter_services(false); - options.start_parameter_event_publisher(false); - options.use_global_arguments(false); - int id = param_node_counter.fetch_add(1); - auto node = std::make_shared("_param_client_" + std::to_string(id), options); - param_pool_all_.push_back(node); - return node; -} - -void ConfigurationManager::release_param_node(std::shared_ptr node) { - std::lock_guard lock(param_pool_mutex_); - param_pool_available_.push_back(std::move(node)); + std::lock_guard lock(param_clients_mutex_); + param_clients_.clear(); } std::chrono::duration ConfigurationManager::get_service_timeout() const { @@ -198,18 +175,27 @@ ParameterResult ConfigurationManager::get_own_parameter(const std::string & para } std::shared_ptr ConfigurationManager::get_param_client(const std::string & node_name) { - // Acquire a param node from the pool. The param node is captured in a - // custom deleter on the SyncParametersClient shared_ptr - when the client - // is destroyed (goes out of scope in the calling function), the param node - // is automatically returned to the pool. - auto param_node = acquire_param_node(); - auto * mgr = this; - auto client_raw = new rclcpp::SyncParametersClient(param_node, node_name); - return std::shared_ptr(client_raw, - [mgr, param_node](rclcpp::SyncParametersClient * ptr) { - delete ptr; - mgr->release_param_node(param_node); - }); + // Get or create a dedicated param node + cached client for this target node. + // The per-node mutex (held by caller) ensures only one thread uses this + // param_node at a time, preventing executor conflicts in SyncParametersClient. + std::lock_guard lock(param_clients_mutex_); + + auto it = param_clients_.find(node_name); + if (it != param_clients_.end()) { + return it->second.client; + } + + // Create dedicated param node for this target + rclcpp::NodeOptions options; + options.start_parameter_services(false); + options.start_parameter_event_publisher(false); + options.use_global_arguments(false); + int id = param_node_counter.fetch_add(1); + auto param_node = std::make_shared("_param_client_" + std::to_string(id), options); + auto client = std::make_shared(param_node, node_name); + + param_clients_[node_name] = {param_node, client}; + return client; } ParameterResult ConfigurationManager::list_parameters(const std::string & node_name) { From 8a7818253ae7f708c7b7367033a0c794c8e263d0 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 29 Mar 2026 21:37:01 +0200 Subject: [PATCH 07/11] fix(gateway): use single pre-created param_node with spin_mutex for CI stability Per-target param_nodes caused DDS discovery delay on CI (fresh node needs time to join graph). Revert to single param_node_ created in constructor (like original code) for fast service discovery. Add recursive spin_mutex_ to serialize ROS 2 IPC calls (wait_for_service, list_parameters, get_parameters) on the shared param_node_. This prevents executor conflicts while keeping all other improvements: - Negative cache (instant return for nodes without parameter service) - Self-query guard (direct access for gateway's own node) - Cached SyncParametersClient per target node Remove per-node mutexes and node pool (no longer needed with spin_mutex). Replace parallel timing test with concurrent crash/hang test. --- .../configuration_manager.hpp | 34 +++-- .../src/configuration_manager.cpp | 124 +++++++----------- .../test/test_configuration_manager.cpp | 27 +--- 3 files changed, 70 insertions(+), 115 deletions(-) diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp index 4532dd75c..87924993c 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp @@ -137,14 +137,6 @@ class ConfigurationManager { /// Gateway's own fully qualified node name (for self-query detection) std::string own_node_fqn_; - /// Per-node mutexes for parameter operations. - /// Each node gets its own mutex so one slow node doesn't block queries to other nodes. - mutable std::shared_mutex node_mutexes_mutex_; - mutable std::unordered_map> node_mutexes_; - - /// Get or create a mutex for a specific node - std::mutex & get_node_mutex(const std::string & node_name) const; - /// Negative cache: nodes whose parameter service was recently unavailable. /// Avoids repeated blocking waits on nodes that don't have parameter services. mutable std::shared_mutex negative_cache_mutex_; @@ -155,16 +147,22 @@ class ConfigurationManager { mutable std::mutex defaults_mutex_; std::map> default_values_; - /// Per-target-node param client entries. - /// Each target node gets a dedicated ROS 2 param node + cached SyncParametersClient. - /// The per-node mutex (from node_mutexes_) ensures only one thread spins a - /// given param_node at a time, preventing executor conflicts. - struct ParamClientEntry { - std::shared_ptr param_node; - std::shared_ptr client; - }; - mutable std::mutex param_clients_mutex_; - std::unordered_map param_clients_; + /// Internal node for parameter client operations. + /// Created once in constructor - must be in DDS graph early for fast service discovery. + /// SyncParametersClient requires a node NOT in an executor to spin internally. + std::shared_ptr param_node_; + + /// Mutex to serialize spin operations on param_node_. + /// SyncParametersClient::wait_for_service/get_parameters/etc spin param_node_ + /// internally via spin_node_until_future_complete which is NOT thread-safe. + /// This mutex is ONLY held during the actual ROS 2 IPC call, not during + /// cache lookups or JSON building. With negative cache + self-guard, + /// most requests never touch this mutex. + mutable std::recursive_mutex spin_mutex_; + + /// Cached SyncParametersClient per target node (avoids recreating clients) + mutable std::mutex clients_mutex_; + std::map> param_clients_; }; } // namespace ros2_medkit_gateway diff --git a/src/ros2_medkit_gateway/src/configuration_manager.cpp b/src/ros2_medkit_gateway/src/configuration_manager.cpp index 709cb2852..02e43b9cb 100644 --- a/src/ros2_medkit_gateway/src/configuration_manager.cpp +++ b/src/ros2_medkit_gateway/src/configuration_manager.cpp @@ -14,17 +14,21 @@ #include "ros2_medkit_gateway/configuration_manager.hpp" -#include #include using namespace std::chrono_literals; namespace ros2_medkit_gateway { -// Thread-local param node counter for unique names -static std::atomic param_node_counter{0}; - ConfigurationManager::ConfigurationManager(rclcpp::Node * node) : node_(node) { + // Create internal node for parameter client operations early. + // Must be in DDS graph before any parameter queries for fast service discovery. + rclcpp::NodeOptions options; + options.start_parameter_services(false); + options.start_parameter_event_publisher(false); + options.use_global_arguments(false); + param_node_ = std::make_shared("_param_client_node", options); + // Get configurable timeout for parameter services (default 2.0 seconds) // For systems with micro-ROS nodes without parameter service, set lower (e.g., 0.5s) service_timeout_sec_ = node_->declare_parameter("parameter_service_timeout_sec", 2.0); @@ -44,40 +48,15 @@ ConfigurationManager::~ConfigurationManager() { } void ConfigurationManager::shutdown() { - std::lock_guard lock(param_clients_mutex_); + std::lock_guard lock(clients_mutex_); param_clients_.clear(); + param_node_.reset(); } std::chrono::duration ConfigurationManager::get_service_timeout() const { return std::chrono::duration(service_timeout_sec_); } -std::mutex & ConfigurationManager::get_node_mutex(const std::string & node_name) const { - // Fast path: check if mutex exists - { - std::shared_lock lock(node_mutexes_mutex_); - auto it = node_mutexes_.find(node_name); - if (it != node_mutexes_.end()) { - return *it->second; - } - } - // Slow path: create new mutex, with lazy cleanup to prevent unbounded growth - std::unique_lock lock(node_mutexes_mutex_); - if (node_mutexes_.size() > 200) { - // Remove mutexes that are not currently locked (try_lock succeeds) - for (auto it = node_mutexes_.begin(); it != node_mutexes_.end();) { - if (it->first != node_name && it->second->try_lock()) { - it->second->unlock(); - it = node_mutexes_.erase(it); - } else { - ++it; - } - } - } - auto [it, inserted] = node_mutexes_.try_emplace(node_name, std::make_unique()); - return *it->second; -} - bool ConfigurationManager::is_node_unavailable(const std::string & node_name) const { std::shared_lock lock(negative_cache_mutex_); auto it = unavailable_nodes_.find(node_name); @@ -175,26 +154,16 @@ ParameterResult ConfigurationManager::get_own_parameter(const std::string & para } std::shared_ptr ConfigurationManager::get_param_client(const std::string & node_name) { - // Get or create a dedicated param node + cached client for this target node. - // The per-node mutex (held by caller) ensures only one thread uses this - // param_node at a time, preventing executor conflicts in SyncParametersClient. - std::lock_guard lock(param_clients_mutex_); + std::lock_guard lock(clients_mutex_); auto it = param_clients_.find(node_name); if (it != param_clients_.end()) { - return it->second.client; + return it->second; } - // Create dedicated param node for this target - rclcpp::NodeOptions options; - options.start_parameter_services(false); - options.start_parameter_event_publisher(false); - options.use_global_arguments(false); - int id = param_node_counter.fetch_add(1); - auto param_node = std::make_shared("_param_client_" + std::to_string(id), options); - auto client = std::make_shared(param_node, node_name); - - param_clients_[node_name] = {param_node, client}; + // Create cached client on the shared param_node_ (created in constructor) + auto client = std::make_shared(param_node_, node_name); + param_clients_[node_name] = client; return client; } @@ -213,45 +182,46 @@ ParameterResult ConfigurationManager::list_parameters(const std::string & node_n return result; } - // Per-node mutex: only block concurrent queries to the SAME node - std::lock_guard op_lock(get_node_mutex(node_name)); ParameterResult result; try { auto client = get_param_client(node_name); - if (!client->wait_for_service(get_service_timeout())) { - result.success = false; - result.error_message = "Parameter service not available for node: " + node_name; - result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; - mark_node_unavailable(node_name); - RCLCPP_WARN(node_->get_logger(), "Parameter service not available for node: '%s' (cached for %.0fs)", - node_name.c_str(), negative_cache_ttl_sec_); - return result; - } + // All ROS 2 IPC calls (wait_for_service, list_parameters, get_parameters) + // spin param_node_ internally. spin_mutex_ serializes these to prevent + // "Node already added to executor" errors from concurrent spin. + std::vector parameters; + { + std::lock_guard spin_lock(spin_mutex_); - // Cache default values after confirming service is available - cache_default_values(node_name); + if (!client->wait_for_service(get_service_timeout())) { + result.success = false; + result.error_message = "Parameter service not available for node: " + node_name; + result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; + mark_node_unavailable(node_name); + RCLCPP_WARN(node_->get_logger(), "Parameter service not available for node: '%s' (cached for %.0fs)", + node_name.c_str(), negative_cache_ttl_sec_); + return result; + } - auto param_names = client->list_parameters({}, 0); + cache_default_values(node_name); - std::vector parameters; - parameters = client->get_parameters(param_names.names); + auto param_names = client->list_parameters({}, 0); + parameters = client->get_parameters(param_names.names); - if (parameters.empty() && !param_names.names.empty()) { - RCLCPP_WARN(node_->get_logger(), "get_parameters returned empty, trying one by one for node: '%s'", - node_name.c_str()); - for (const auto & name : param_names.names) { - try { - auto single_params = client->get_parameters({name}); - if (!single_params.empty()) { - parameters.push_back(single_params[0]); + if (parameters.empty() && !param_names.names.empty()) { + for (const auto & name : param_names.names) { + try { + auto single_params = client->get_parameters({name}); + if (!single_params.empty()) { + parameters.push_back(single_params[0]); + } + } catch (const std::exception & e) { + RCLCPP_DEBUG(node_->get_logger(), "Failed to get param '%s': %s", name.c_str(), e.what()); } - } catch (const std::exception & e) { - RCLCPP_DEBUG(node_->get_logger(), "Failed to get param '%s': %s", name.c_str(), e.what()); } } - } + } // spin_mutex_ released - JSON building is lock-free json params_array = json::array(); for (const auto & param : parameters) { @@ -287,7 +257,7 @@ ParameterResult ConfigurationManager::get_parameter(const std::string & node_nam return result; } - std::lock_guard op_lock(get_node_mutex(node_name)); + std::lock_guard spin_lock(spin_mutex_); ParameterResult result; try { @@ -378,7 +348,7 @@ ParameterResult ConfigurationManager::set_parameter(const std::string & node_nam return result; } - std::lock_guard op_lock(get_node_mutex(node_name)); + std::lock_guard spin_lock(spin_mutex_); ParameterResult result; try { @@ -690,7 +660,7 @@ ParameterResult ConfigurationManager::reset_parameter(const std::string & node_n return result; } - std::lock_guard op_lock(get_node_mutex(node_name)); + std::lock_guard spin_lock(spin_mutex_); ParameterResult result; try { @@ -809,7 +779,7 @@ ParameterResult ConfigurationManager::reset_all_parameters(const std::string & n return result; } - std::lock_guard op_lock(get_node_mutex(node_name)); + std::lock_guard spin_lock(spin_mutex_); ParameterResult result; try { diff --git a/src/ros2_medkit_gateway/test/test_configuration_manager.cpp b/src/ros2_medkit_gateway/test/test_configuration_manager.cpp index 378757cbb..60e3f48f3 100644 --- a/src/ros2_medkit_gateway/test/test_configuration_manager.cpp +++ b/src/ros2_medkit_gateway/test/test_configuration_manager.cpp @@ -340,35 +340,22 @@ TEST_F(TestConfigurationManager, test_self_query_no_deadlock) { EXPECT_TRUE(result.success); } -// ==================== PER-NODE MUTEX TESTS ==================== +// ==================== SPIN SERIALIZATION TESTS ==================== -TEST_F(TestConfigurationManager, test_parallel_different_nodes_not_serialized) { - // Queries to different nonexistent nodes should run in parallel, - // not serialize behind a single mutex. - // Use different node names for serial vs parallel to avoid negative cache hits. - - // Serial baseline (uses _serial_ prefix) - auto serial_start = std::chrono::steady_clock::now(); - for (int i = 0; i < 3; ++i) { - (void)config_manager_->list_parameters("/serial_test_node_" + std::to_string(i)); - } - auto serial_elapsed = std::chrono::steady_clock::now() - serial_start; - - // Parallel run (uses _parallel_ prefix - different names, no cache hit) - auto parallel_start = std::chrono::steady_clock::now(); +TEST_F(TestConfigurationManager, test_concurrent_queries_no_crash) { + // Concurrent queries to different nonexistent nodes must not crash + // (spin_mutex_ serializes ROS 2 IPC to prevent executor conflicts). + // Second+ calls hit negative cache and return instantly. std::vector threads; for (int i = 0; i < 3; ++i) { threads.emplace_back([this, i]() { - (void)config_manager_->list_parameters("/parallel_test_node_" + std::to_string(i)); + (void)config_manager_->list_parameters("/concurrent_node_" + std::to_string(i)); }); } for (auto & t : threads) { t.join(); } - auto parallel_elapsed = std::chrono::steady_clock::now() - parallel_start; - - // Parallel should be noticeably faster than serial (generous factor for CI) - EXPECT_LT(parallel_elapsed, serial_elapsed * 0.9); + // If we get here without crash/hang, spin serialization works } // ==================== CONCURRENT ACCESS TEST ==================== From a09588da99570919c1af85a47ee396b65cb54319 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Mon, 30 Mar 2026 08:45:33 +0200 Subject: [PATCH 08/11] fix(gateway): address v4 review - regular mutex, narrow scope, safe shutdown - Replace recursive_mutex with std::mutex for spin_mutex_ (recursive was unnecessary - cache_default_values is always called with spin_mutex_ held, never re-acquires it) - Narrow spin_mutex_ scope in get_parameter: JSON building outside lock - shutdown() acquires spin_mutex_ before clearing param_node_ to wait for any in-flight IPC to complete (prevents teardown race) - Document @pre spin_mutex_ contract on cache_default_values --- .../configuration_manager.hpp | 3 +- .../src/configuration_manager.cpp | 65 ++++++++++--------- 2 files changed, 38 insertions(+), 30 deletions(-) diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp index 87924993c..e639eec2c 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp @@ -97,6 +97,7 @@ class ConfigurationManager { std::shared_ptr get_param_client(const std::string & node_name); /// Cache default values for a node (called on first access) + /// @pre spin_mutex_ must be held by the caller void cache_default_values(const std::string & node_name); /// Check if a node is in the negative cache (recently unavailable) @@ -158,7 +159,7 @@ class ConfigurationManager { /// This mutex is ONLY held during the actual ROS 2 IPC call, not during /// cache lookups or JSON building. With negative cache + self-guard, /// most requests never touch this mutex. - mutable std::recursive_mutex spin_mutex_; + mutable std::mutex spin_mutex_; /// Cached SyncParametersClient per target node (avoids recreating clients) mutable std::mutex clients_mutex_; diff --git a/src/ros2_medkit_gateway/src/configuration_manager.cpp b/src/ros2_medkit_gateway/src/configuration_manager.cpp index 02e43b9cb..e4a62d76f 100644 --- a/src/ros2_medkit_gateway/src/configuration_manager.cpp +++ b/src/ros2_medkit_gateway/src/configuration_manager.cpp @@ -48,6 +48,8 @@ ConfigurationManager::~ConfigurationManager() { } void ConfigurationManager::shutdown() { + // Acquire spin_mutex_ first to wait for any in-flight IPC to complete + std::lock_guard spin_lock(spin_mutex_); std::lock_guard lock(clients_mutex_); param_clients_.clear(); param_node_.reset(); @@ -192,7 +194,7 @@ ParameterResult ConfigurationManager::list_parameters(const std::string & node_n // "Node already added to executor" errors from concurrent spin. std::vector parameters; { - std::lock_guard spin_lock(spin_mutex_); + std::lock_guard spin_lock(spin_mutex_); if (!client->wait_for_service(get_service_timeout())) { result.success = false; @@ -257,38 +259,43 @@ ParameterResult ConfigurationManager::get_parameter(const std::string & node_nam return result; } - std::lock_guard spin_lock(spin_mutex_); ParameterResult result; try { - auto client = get_param_client(node_name); + rclcpp::Parameter param; + std::vector descriptors; - if (!client->wait_for_service(get_service_timeout())) { - result.success = false; - result.error_message = "Parameter service not available for node: " + node_name; - result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; - mark_node_unavailable(node_name); - return result; - } + { + std::lock_guard spin_lock(spin_mutex_); + auto client = get_param_client(node_name); - auto param_names = client->list_parameters({param_name}, 1); - if (param_names.names.empty()) { - result.success = false; - result.error_message = "Parameter not found: " + param_name; - result.error_code = ParameterErrorCode::NOT_FOUND; - return result; - } + if (!client->wait_for_service(get_service_timeout())) { + result.success = false; + result.error_message = "Parameter service not available for node: " + node_name; + result.error_code = ParameterErrorCode::SERVICE_UNAVAILABLE; + mark_node_unavailable(node_name); + return result; + } - auto parameters = client->get_parameters({param_name}); - if (parameters.empty()) { - result.success = false; - result.error_message = "Failed to get parameter: " + param_name; - result.error_code = ParameterErrorCode::INTERNAL_ERROR; - return result; - } + auto param_names = client->list_parameters({param_name}, 1); + if (param_names.names.empty()) { + result.success = false; + result.error_message = "Parameter not found: " + param_name; + result.error_code = ParameterErrorCode::NOT_FOUND; + return result; + } + + auto parameters = client->get_parameters({param_name}); + if (parameters.empty()) { + result.success = false; + result.error_message = "Failed to get parameter: " + param_name; + result.error_code = ParameterErrorCode::INTERNAL_ERROR; + return result; + } - const auto & param = parameters[0]; - auto descriptors = client->describe_parameters({param_name}); + param = parameters[0]; + descriptors = client->describe_parameters({param_name}); + } // spin_mutex_ released json param_obj; param_obj["name"] = param.get_name(); @@ -348,7 +355,7 @@ ParameterResult ConfigurationManager::set_parameter(const std::string & node_nam return result; } - std::lock_guard spin_lock(spin_mutex_); + std::lock_guard spin_lock(spin_mutex_); ParameterResult result; try { @@ -660,7 +667,7 @@ ParameterResult ConfigurationManager::reset_parameter(const std::string & node_n return result; } - std::lock_guard spin_lock(spin_mutex_); + std::lock_guard spin_lock(spin_mutex_); ParameterResult result; try { @@ -779,7 +786,7 @@ ParameterResult ConfigurationManager::reset_all_parameters(const std::string & n return result; } - std::lock_guard spin_lock(spin_mutex_); + std::lock_guard spin_lock(spin_mutex_); ParameterResult result; try { From ffd945cd1c61b3a473fc9d7b4609bde7ca38ca2c Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Mon, 30 Mar 2026 09:44:25 +0200 Subject: [PATCH 09/11] fix(gateway): address bburda PR review - all findings - Fix stale docstrings (thread-local -> shared param_node) - Fix member declaration order (spin_mutex_ before param_node_) - Add parameter range validation via ParameterDescriptor (FloatingPointRange) - Add idempotent shutdown with atomic flag + post-shutdown guard on all public methods - Add hard cap (500) on negative cache with oldest-entry eviction - Fix set_parameter self-query: check has_parameter() before get (404 not 500) - Fix reset_parameter self-query: narrow defaults_mutex_ scope (copy then release) - Fix concurrent test: verify results instead of void cast - Add cross-method negative cache test (list marks unavailable, get returns cached) - Add SHUT_DOWN error code for clean post-shutdown responses --- .../configuration_manager.hpp | 28 +++-- .../src/configuration_manager.cpp | 112 ++++++++++++++---- .../test/test_configuration_manager.cpp | 28 ++++- 3 files changed, 130 insertions(+), 38 deletions(-) diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp index e639eec2c..c795e6b48 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/configuration_manager.hpp @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -39,6 +40,7 @@ enum class ParameterErrorCode { TYPE_MISMATCH, ///< Value type doesn't match parameter type INVALID_VALUE, ///< Invalid value for parameter NO_DEFAULTS_CACHED, ///< No default values cached for reset operation + SHUT_DOWN, ///< ConfigurationManager has been shut down INTERNAL_ERROR ///< Internal/unexpected error }; @@ -58,8 +60,9 @@ class ConfigurationManager { explicit ConfigurationManager(rclcpp::Node * node); ~ConfigurationManager(); - /// Clean up thread-local param nodes before ROS 2 context shutdown. + /// Clean up shared param node and cached clients before ROS 2 context shutdown. /// Must be called before rclcpp::shutdown() to prevent use-after-free. + /// Idempotent - safe to call multiple times. void shutdown(); /// List all parameters for a node @@ -92,8 +95,7 @@ class ConfigurationManager { ParameterResult reset_all_parameters(const std::string & node_name); private: - /// Get or create a SyncParametersClient for the given node. - /// Creates a per-thread param node to avoid executor conflicts. + /// Get or create a cached SyncParametersClient for the given node. std::shared_ptr get_param_client(const std::string & node_name); /// Cache default values for a node (called on first access) @@ -115,6 +117,9 @@ class ConfigurationManager { /// Get a specific parameter from the gateway's own node ParameterResult get_own_parameter(const std::string & param_name); + /// Return SHUT_DOWN error result + static ParameterResult shut_down_result(); + /// Convert ROS2 parameter type to string static std::string parameter_type_to_string(rclcpp::ParameterType type); @@ -138,6 +143,9 @@ class ConfigurationManager { /// Gateway's own fully qualified node name (for self-query detection) std::string own_node_fqn_; + /// Shutdown flag - prevents use-after-free on param_node_ after shutdown + std::atomic shutdown_{false}; + /// Negative cache: nodes whose parameter service was recently unavailable. /// Avoids repeated blocking waits on nodes that don't have parameter services. mutable std::shared_mutex negative_cache_mutex_; @@ -148,22 +156,26 @@ class ConfigurationManager { mutable std::mutex defaults_mutex_; std::map> default_values_; - /// Internal node for parameter client operations. - /// Created once in constructor - must be in DDS graph early for fast service discovery. - /// SyncParametersClient requires a node NOT in an executor to spin internally. - std::shared_ptr param_node_; - /// Mutex to serialize spin operations on param_node_. /// SyncParametersClient::wait_for_service/get_parameters/etc spin param_node_ /// internally via spin_node_until_future_complete which is NOT thread-safe. /// This mutex is ONLY held during the actual ROS 2 IPC call, not during /// cache lookups or JSON building. With negative cache + self-guard, /// most requests never touch this mutex. + /// Declared BEFORE param_node_ so it outlives the node during destruction. mutable std::mutex spin_mutex_; + /// Internal node for parameter client operations. + /// Created once in constructor - must be in DDS graph early for fast service discovery. + /// SyncParametersClient requires a node NOT in an executor to spin internally. + std::shared_ptr param_node_; + /// Cached SyncParametersClient per target node (avoids recreating clients) mutable std::mutex clients_mutex_; std::map> param_clients_; + + /// Maximum entries in negative cache before hard eviction + static constexpr size_t kMaxNegativeCacheSize = 500; }; } // namespace ros2_medkit_gateway diff --git a/src/ros2_medkit_gateway/src/configuration_manager.cpp b/src/ros2_medkit_gateway/src/configuration_manager.cpp index e4a62d76f..182faf748 100644 --- a/src/ros2_medkit_gateway/src/configuration_manager.cpp +++ b/src/ros2_medkit_gateway/src/configuration_manager.cpp @@ -29,12 +29,22 @@ ConfigurationManager::ConfigurationManager(rclcpp::Node * node) : node_(node) { options.use_global_arguments(false); param_node_ = std::make_shared("_param_client_node", options); - // Get configurable timeout for parameter services (default 2.0 seconds) - // For systems with micro-ROS nodes without parameter service, set lower (e.g., 0.5s) - service_timeout_sec_ = node_->declare_parameter("parameter_service_timeout_sec", 2.0); - - // Get negative cache TTL (default 60 seconds) - negative_cache_ttl_sec_ = node_->declare_parameter("parameter_service_negative_cache_sec", 60.0); + // Declare parameters with range validation + rcl_interfaces::msg::ParameterDescriptor timeout_desc; + timeout_desc.description = "Timeout for ROS 2 parameter service calls (configurations endpoint)"; + rcl_interfaces::msg::FloatingPointRange timeout_range; + timeout_range.from_value = 0.1; + timeout_range.to_value = 10.0; + timeout_desc.floating_point_range.push_back(timeout_range); + service_timeout_sec_ = node_->declare_parameter("parameter_service_timeout_sec", 2.0, timeout_desc); + + rcl_interfaces::msg::ParameterDescriptor cache_desc; + cache_desc.description = "Negative cache TTL for unavailable parameter services (0 = disabled)"; + rcl_interfaces::msg::FloatingPointRange cache_range; + cache_range.from_value = 0.0; + cache_range.to_value = 3600.0; + cache_desc.floating_point_range.push_back(cache_range); + negative_cache_ttl_sec_ = node_->declare_parameter("parameter_service_negative_cache_sec", 60.0, cache_desc); // Store own node FQN for self-query detection own_node_fqn_ = node_->get_fully_qualified_name(); @@ -48,6 +58,9 @@ ConfigurationManager::~ConfigurationManager() { } void ConfigurationManager::shutdown() { + if (shutdown_.exchange(true)) { + return; // Already shut down + } // Acquire spin_mutex_ first to wait for any in-flight IPC to complete std::lock_guard spin_lock(spin_mutex_); std::lock_guard lock(clients_mutex_); @@ -55,6 +68,10 @@ void ConfigurationManager::shutdown() { param_node_.reset(); } +ParameterResult ConfigurationManager::shut_down_result() { + return {false, {}, "ConfigurationManager is shut down", ParameterErrorCode::SHUT_DOWN}; +} + std::chrono::duration ConfigurationManager::get_service_timeout() const { return std::chrono::duration(service_timeout_sec_); } @@ -73,8 +90,9 @@ void ConfigurationManager::mark_node_unavailable(const std::string & node_name) std::unique_lock lock(negative_cache_mutex_); unavailable_nodes_[node_name] = std::chrono::steady_clock::now(); - // Lazy cleanup: remove expired entries to prevent unbounded growth - if (unavailable_nodes_.size() > 100) { + // Cleanup to prevent unbounded growth + if (unavailable_nodes_.size() > kMaxNegativeCacheSize) { + // First pass: remove expired entries auto now = std::chrono::steady_clock::now(); auto ttl = std::chrono::duration(negative_cache_ttl_sec_); for (auto it = unavailable_nodes_.begin(); it != unavailable_nodes_.end();) { @@ -84,6 +102,16 @@ void ConfigurationManager::mark_node_unavailable(const std::string & node_name) ++it; } } + // If still over limit, evict oldest entry + if (unavailable_nodes_.size() > kMaxNegativeCacheSize) { + auto oldest = + std::min_element(unavailable_nodes_.begin(), unavailable_nodes_.end(), [](const auto & a, const auto & b) { + return a.second < b.second; + }); + if (oldest != unavailable_nodes_.end()) { + unavailable_nodes_.erase(oldest); + } + } } } @@ -170,6 +198,10 @@ std::shared_ptr ConfigurationManager::get_param_cl } ParameterResult ConfigurationManager::list_parameters(const std::string & node_name) { + if (shutdown_.load()) { + return shut_down_result(); + } + // Self-query guard: use direct access for gateway's own node if (is_self_node(node_name)) { return list_own_parameters(); @@ -247,6 +279,10 @@ ParameterResult ConfigurationManager::list_parameters(const std::string & node_n } ParameterResult ConfigurationManager::get_parameter(const std::string & node_name, const std::string & param_name) { + if (shutdown_.load()) { + return shut_down_result(); + } + if (is_self_node(node_name)) { return get_own_parameter(param_name); } @@ -320,10 +356,20 @@ ParameterResult ConfigurationManager::get_parameter(const std::string & node_nam ParameterResult ConfigurationManager::set_parameter(const std::string & node_name, const std::string & param_name, const json & value) { + if (shutdown_.load()) { + return shut_down_result(); + } + // Self-query guard: set parameter directly on own node if (is_self_node(node_name)) { ParameterResult result; try { + if (!node_->has_parameter(param_name)) { + result.success = false; + result.error_message = "Parameter not found: " + param_name; + result.error_code = ParameterErrorCode::NOT_FOUND; + return result; + } auto current_value = node_->get_parameter(param_name).get_parameter_value(); rclcpp::ParameterValue param_value = json_to_parameter_value(value, current_value.get_type()); auto set_result = node_->set_parameter(rclcpp::Parameter(param_name, param_value)); @@ -624,25 +670,35 @@ void ConfigurationManager::cache_default_values(const std::string & node_name) { } ParameterResult ConfigurationManager::reset_parameter(const std::string & node_name, const std::string & param_name) { + if (shutdown_.load()) { + return shut_down_result(); + } + // Self-query guard: reset via direct access if (is_self_node(node_name)) { ParameterResult result; - std::lock_guard lock(defaults_mutex_); - auto node_it = default_values_.find(node_name); - if (node_it == default_values_.end()) { - result.success = false; - result.error_message = "No default values cached for node: " + node_name; - result.error_code = ParameterErrorCode::NO_DEFAULTS_CACHED; - return result; - } - auto param_it = node_it->second.find(param_name); - if (param_it == node_it->second.end()) { - result.success = false; - result.error_message = "No default value for parameter: " + param_name; - result.error_code = ParameterErrorCode::NOT_FOUND; - return result; - } - auto set_result = node_->set_parameter(param_it->second); + // Copy default value under lock, release before set_parameter I/O + rclcpp::Parameter default_param; + { + std::lock_guard lock(defaults_mutex_); + auto node_it = default_values_.find(node_name); + if (node_it == default_values_.end()) { + result.success = false; + result.error_message = "No default values cached for node: " + node_name; + result.error_code = ParameterErrorCode::NO_DEFAULTS_CACHED; + return result; + } + auto param_it = node_it->second.find(param_name); + if (param_it == node_it->second.end()) { + result.success = false; + result.error_message = "No default value for parameter: " + param_name; + result.error_code = ParameterErrorCode::NOT_FOUND; + return result; + } + default_param = param_it->second; + } // defaults_mutex_ released + + auto set_result = node_->set_parameter(default_param); if (!set_result.successful) { result.success = false; result.error_message = set_result.reason; @@ -651,8 +707,8 @@ ParameterResult ConfigurationManager::reset_parameter(const std::string & node_n } json param_obj; param_obj["name"] = param_name; - param_obj["value"] = parameter_value_to_json(param_it->second.get_parameter_value()); - param_obj["type"] = parameter_type_to_string(param_it->second.get_type()); + param_obj["value"] = parameter_value_to_json(default_param.get_parameter_value()); + param_obj["type"] = parameter_type_to_string(default_param.get_type()); param_obj["reset_to_default"] = true; result.success = true; result.data = param_obj; @@ -733,6 +789,10 @@ ParameterResult ConfigurationManager::reset_parameter(const std::string & node_n } ParameterResult ConfigurationManager::reset_all_parameters(const std::string & node_name) { + if (shutdown_.load()) { + return shut_down_result(); + } + // Self-query guard: reset via direct access if (is_self_node(node_name)) { ParameterResult result; diff --git a/src/ros2_medkit_gateway/test/test_configuration_manager.cpp b/src/ros2_medkit_gateway/test/test_configuration_manager.cpp index 60e3f48f3..f5b870d17 100644 --- a/src/ros2_medkit_gateway/test/test_configuration_manager.cpp +++ b/src/ros2_medkit_gateway/test/test_configuration_manager.cpp @@ -345,17 +345,37 @@ TEST_F(TestConfigurationManager, test_self_query_no_deadlock) { TEST_F(TestConfigurationManager, test_concurrent_queries_no_crash) { // Concurrent queries to different nonexistent nodes must not crash // (spin_mutex_ serializes ROS 2 IPC to prevent executor conflicts). - // Second+ calls hit negative cache and return instantly. + std::vector results(3); std::vector threads; for (int i = 0; i < 3; ++i) { - threads.emplace_back([this, i]() { - (void)config_manager_->list_parameters("/concurrent_node_" + std::to_string(i)); + threads.emplace_back([this, i, &results]() { + results[static_cast(i)] = config_manager_->list_parameters("/concurrent_node_" + std::to_string(i)); }); } for (auto & t : threads) { t.join(); } - // If we get here without crash/hang, spin serialization works + for (int i = 0; i < 3; ++i) { + EXPECT_FALSE(results[static_cast(i)].success); + EXPECT_EQ(results[static_cast(i)].error_code, ParameterErrorCode::SERVICE_UNAVAILABLE); + } +} + +TEST_F(TestConfigurationManager, test_negative_cache_cross_method) { + // list_parameters marks node unavailable, get_parameter should return cached + auto list_result = config_manager_->list_parameters("/cross_method_cached_node"); + EXPECT_FALSE(list_result.success); + EXPECT_EQ(list_result.error_code, ParameterErrorCode::SERVICE_UNAVAILABLE); + + auto get_result = config_manager_->get_parameter("/cross_method_cached_node", "param"); + EXPECT_FALSE(get_result.success); + EXPECT_EQ(get_result.error_code, ParameterErrorCode::SERVICE_UNAVAILABLE); + EXPECT_TRUE(get_result.error_message.find("cached") != std::string::npos); + + auto set_result = config_manager_->set_parameter("/cross_method_cached_node", "param", nlohmann::json(42)); + EXPECT_FALSE(set_result.success); + EXPECT_EQ(set_result.error_code, ParameterErrorCode::SERVICE_UNAVAILABLE); + EXPECT_TRUE(set_result.error_message.find("cached") != std::string::npos); } // ==================== CONCURRENT ACCESS TEST ==================== From 2ff376a12429519cf4b1286a3a4d447fd89e9e96 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Mon, 30 Mar 2026 09:52:52 +0200 Subject: [PATCH 10/11] fix(gateway): add explicit config_mgr_->shutdown() in GatewayNode destructor For consistency with other managers (trigger_mgr_, plugin_mgr_, operation_mgr_), explicitly shut down ConfigurationManager before member destruction begins. Ensures param_node_ and cached clients are cleared while ROS 2 context is still alive. --- src/ros2_medkit_gateway/src/gateway_node.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/ros2_medkit_gateway/src/gateway_node.cpp b/src/ros2_medkit_gateway/src/gateway_node.cpp index af2a5c3de..d6e118263 100644 --- a/src/ros2_medkit_gateway/src/gateway_node.cpp +++ b/src/ros2_medkit_gateway/src/gateway_node.cpp @@ -1178,7 +1178,12 @@ GatewayNode::~GatewayNode() { if (operation_mgr_) { operation_mgr_->shutdown(); } - // 8. Normal member destruction (managers safe - all transports stopped) + // 8. Shutdown ConfigurationManager (clears param_node_ and cached clients + // before rclcpp context is destroyed - prevents use-after-free) + if (config_mgr_) { + config_mgr_->shutdown(); + } + // 9. Normal member destruction (managers safe - all transports stopped) } const ThreadSafeEntityCache & GatewayNode::get_thread_safe_cache() const { From f15b9a251e5b68fefbca66d235b0e749618e0565 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Mon, 30 Mar 2026 10:40:28 +0200 Subject: [PATCH 11/11] fix(gateway): restore cache_default_values before wait_for_service Move cache_default_values() back before wait_for_service() in list_parameters(), matching the original code order. This gives nodes extra time for DDS service discovery (cache_default_values also calls wait_for_service internally). On slow CI VMs, the additional discovery window prevents test_17 flakiness. --- src/ros2_medkit_gateway/src/configuration_manager.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ros2_medkit_gateway/src/configuration_manager.cpp b/src/ros2_medkit_gateway/src/configuration_manager.cpp index 182faf748..051b72862 100644 --- a/src/ros2_medkit_gateway/src/configuration_manager.cpp +++ b/src/ros2_medkit_gateway/src/configuration_manager.cpp @@ -228,6 +228,9 @@ ParameterResult ConfigurationManager::list_parameters(const std::string & node_n { std::lock_guard spin_lock(spin_mutex_); + // Cache default values first (gives node extra time for DDS service discovery) + cache_default_values(node_name); + if (!client->wait_for_service(get_service_timeout())) { result.success = false; result.error_message = "Parameter service not available for node: " + node_name; @@ -238,8 +241,6 @@ ParameterResult ConfigurationManager::list_parameters(const std::string & node_n return result; } - cache_default_values(node_name); - auto param_names = client->list_parameters({}, 0); parameters = client->get_parameters(param_names.names);