diff --git a/src/plugins/hetero/src/subgraph_collector.cpp b/src/plugins/hetero/src/subgraph_collector.cpp index 65fb9e323d6d78..23218c2d30540b 100644 --- a/src/plugins/hetero/src/subgraph_collector.cpp +++ b/src/plugins/hetero/src/subgraph_collector.cpp @@ -6,10 +6,12 @@ #include #include +#include #include #include #include #include +#include #if defined(_MSC_VER) # include @@ -68,8 +70,7 @@ ov::hetero::SubgraphCollector::SubgraphCollector(const std::shared_ptr node_to_index; node_to_index.reserve(nodes_count); std::vector ordered_inputs(nodes_count); + std::vector> output_consumer_counts(nodes_count); for (size_t i = 0; i < nodes_count; ++i) { node_to_index.emplace(_ordered_ops[i].get(), i); ordered_inputs[i] = _ordered_ops[i]->inputs(); + const auto outputs = _ordered_ops[i]->outputs(); + auto& consumer_counts = output_consumer_counts[i]; + consumer_counts.reserve(outputs.size()); + for (const auto& output : outputs) { + consumer_counts.push_back(output.get_target_inputs().size()); + } } auto get_index_by_node = [&node_to_index](const ov::Node* node) { @@ -160,15 +174,31 @@ void ov::hetero::SubgraphCollector::split_cyclic_dependencies() { return true; return false; }; + auto bit_all_of = [&](const Bits& a, const auto& pred) { + for (size_t i = 0; i < a.size(); ++i) { + uint64_t bits = a[i]; + while (bits) { + const size_t b = (i << 6) + ctz64(bits); + bits &= bits - 1; + if (!pred(b)) { + return false; + } + } + } + return true; + }; + + // Subgraph-ID state is shared across the per-node loop, the SCC loop, and the return value. + SubgraphIdsMap subgraph_ids; + std::vector subgraph_id_by_index(nodes_count); // Split cyclic dependencies. 
for (size_t prev_subgraphs = 0, cyclic_split_step = 0; prev_subgraphs != _subgraph_inputs.size(); ++cyclic_split_step) { OPENVINO_ASSERT(cyclic_split_step < _ordered_ops.size(), "Cannot resolve cycles during submodels split!"); prev_subgraphs = _subgraph_inputs.size(); - auto subgraph_ids = collect_subgraphs_ids(); + subgraph_ids = collect_subgraphs_ids(); - std::vector subgraph_id_by_index(nodes_count); for (const auto& node : _ordered_ops) { const auto index = get_index_by_node(node.get()); subgraph_id_by_index[index] = subgraph_ids.at(node); @@ -297,7 +327,21 @@ void ov::hetero::SubgraphCollector::split_cyclic_dependencies() { const auto& src_cyc_dep = node_subgraph_cyclic_input_dependencies[input_source_idx]; const auto& src_sg_dep = node_subgraph_input_dependencies[input_source_idx]; if (!bit_intersects(cyc_dep, src_cyc_dep) && bit_intersects(cyclic_inputs_dependencies, src_sg_dep)) { - _subgraph_inputs.insert(input); + const auto source_output = input.get_source_output(); + const bool single_consumer_graph_input_leaf = + output_consumer_counts[input_source_idx][source_output.get_index()] == 1 && + !is_graph_input_node(source_output.get_node()) && !bit_any(src_cyc_dep) && + bit_all_of(src_sg_dep, [&](size_t b) { + const auto& traced_input = bit_to_input[b]; + if (is_graph_input_node(traced_input.get_node())) { + return true; + } + const auto* traced_producer = traced_input.get_source_output().get_node(); + return is_graph_input_node(traced_producer); + }); + if (!single_consumer_graph_input_leaf) { + _subgraph_inputs.insert(input); + } } } }; @@ -305,6 +349,348 @@ void ov::hetero::SubgraphCollector::split_cyclic_dependencies() { promote_boundaries_for_node(node_idx); } } + + // === Subgraph-level SCC fallback. 
=========================================================== + // The per-node heuristic above only detects cycles whose re-entry point sits on a node whose + // own cyc_dep bitset is non-empty (same-sg data flows back through a foreign sg into that + // node's inputs). Two classes of subgraph-DAG cycles fall outside its scope, and both are + // first-class cases this fallback exists to handle -- neither is exceptional: + // + // (a) Multi-hop subgraph-DAG cycles (sg_A -> sg_B -> sg_C -> sg_D -> sg_A) where the + // producer and re-entry consumer are several subgraphs apart and no single node sees + // its own sg on the cycle. + // (b) Shared-graph-input cycles, where a Constant (or other graph input) fans out to + // multiple consumers that Union-Find fuses into a single subgraph, and that fused + // subgraph then both feeds and is fed by the same neighbor subgraph. The + // cut edge here is an input of the foreign-sg node, not of the same-sg node whose + // cyc_dep is non-empty, so Phase 4b cannot promote it by construction. + // + // Both arise from Union-Find merging structurally independent regions via shared inputs. + // The ov::Model itself is a DAG; the cycle is purely an artifact of subgraph fusion that + // run()'s topological sort cannot resolve. + // + // Break the cycle by identifying non-trivial SCCs in the subgraph DAG and, per iteration, + // isolating one node out of some SCC-member Union-Find component by promoting all of its + // same-sg input edges to boundary (see isolate_one_scc_node for the rationale and the + // convergence argument). The loop is bounded by the total number of node-input edges; in + // practice it converges in ~#SCC iterations. + + // Helper 1: build the subgraph DAG from cross-subgraph edges already recorded in + // _subgraph_inputs. Parallel edges between the same pair of subgraphs are de-duplicated; + // self-edges (producer_sg == owner_sg) are filtered so single-subgraph SCCs cannot arise. 
+ using SgAdj = std::unordered_map>; + auto build_subgraph_adjacency = + [&](const std::vector& sg_id_by_index) -> std::pair> { + SgAdj adj; + std::unordered_set all_sgs; + for (size_t i = 0; i < nodes_count; ++i) { + all_sgs.insert(sg_id_by_index[i]); + } + for (const auto& inp : _subgraph_inputs) { + if (is_graph_input_node(inp.get_node())) + continue; + const auto owner_sg = sg_id_by_index[get_index_by_node(inp.get_node())]; + const auto producer_sg = sg_id_by_index[get_index_by_node(inp.get_source_output().get_node())]; + if (owner_sg == producer_sg) + continue; + adj[producer_sg].insert(owner_sg); + } + return {std::move(adj), std::move(all_sgs)}; + }; + + // Helper 2: return the set of subgraphs that belong to any non-trivial SCC of `adj`, using + // iterative Tarjan. An exact SCC algorithm is required here: a two-peel (forward + reverse + // Kahn) approximation also flags acyclic bridges between two disjoint cycles (e.g. X in + // A<->B -> X -> C<->D survives both peels), which would either waste a promotion on an + // acyclic subgraph or trip the "no internal edge" assert below when the bridge subgraph has + // no same-sg edge. The loop is iterative to avoid recursion depth issues on large partitions. 
+ auto find_non_trivial_scc_members = + [](const SgAdj& adj, const std::unordered_set& all_sgs) -> std::unordered_set { + std::unordered_set scc_members; + std::unordered_map index_of; + std::unordered_map lowlink; + std::unordered_set on_stack; + std::vector tarjan_stack; + int next_index = 0; + struct Frame { + SubgraphId v; + std::vector neighbors; + size_t next_neighbor; + }; + std::vector call_stack; + auto neighbors_of = [&adj](SubgraphId v) { + std::vector out; + const auto it = adj.find(v); + if (it != adj.end()) + out.assign(it->second.begin(), it->second.end()); + return out; + }; + auto open_node = [&](SubgraphId v) { + index_of[v] = next_index; + lowlink[v] = next_index; + ++next_index; + tarjan_stack.push_back(v); + on_stack.insert(v); + call_stack.push_back({v, neighbors_of(v), 0}); + }; + for (auto start : all_sgs) { + if (index_of.count(start)) + continue; + open_node(start); + while (!call_stack.empty()) { + auto& frame = call_stack.back(); + if (frame.next_neighbor < frame.neighbors.size()) { + const auto w = frame.neighbors[frame.next_neighbor++]; + if (!index_of.count(w)) { + open_node(w); + } else if (on_stack.count(w)) { + lowlink[frame.v] = std::min(lowlink[frame.v], index_of[w]); + } + } else { + const auto v = frame.v; + if (lowlink[v] == index_of[v]) { + std::vector comp; + while (true) { + const auto w = tarjan_stack.back(); + tarjan_stack.pop_back(); + on_stack.erase(w); + comp.push_back(w); + if (w == v) + break; + } + // Only non-trivial SCCs (size > 1) represent real cycles in the subgraph + // DAG; singletons are reported by Tarjan even for nodes with no cycle and + // must be excluded. Self-loops were filtered out by build_subgraph_adjacency. 
+ if (comp.size() > 1) { + for (auto m : comp) + scc_members.insert(m); + } + } + const auto finished = frame.v; + call_stack.pop_back(); + if (!call_stack.empty()) { + lowlink[call_stack.back().v] = std::min(lowlink[call_stack.back().v], lowlink[finished]); + } + } + } + } + return scc_members; + }; + + // Helper 3: isolate one Union-Find node from its SCC member by promoting ALL its + // same-subgraph non-boundary input edges into _subgraph_inputs. Returns the number of + // edges promoted (1 .. node's input arity). + // + // Rationale (why this works and the simpler alternatives don't): + // * Promoting a single same-sg input edge per iteration diverges: the chosen node still + // re-merges into the SCC via its OTHER same-sg inputs in the next collect_subgraphs_ids + // round, and "first-input-wins" union-find keeps it in the same component. Observed on + // yolo26s-seg: SCC member count grew 4 -> 26 across iterations. + // * Promoting only edges at entry/exit points of SCC members misses the common + // "shared-Constant fuses regions" case: S = {c_shared, a, b, c, ...} where c_shared is + // a Constant unioning multiple consumers. c_shared has no same-sg consumers in OTHER + // SCC members (its consumers are all in S), so it is neither an entry nor an exit, and + // the only same-sg input that would break the cycle — (a <- c_shared) — is skipped. + // * Dissolving a whole SCC-member subgraph at once explodes the partition. On + // yolo26s-seg the GPU mainland S has 428 nodes / 449 internal edges; full dissolution + // produces ~450 subgraphs and breaks downstream compile_model. + // + // The "isolate one node" cut is the minimum needed: by promoting all of n's same-sg + // inputs, n becomes a Union-Find root on the next round, severed from every upstream node + // in S (including shared-Constant connectors). Each iteration thus strictly reduces the + // size of some SCC member by 1 (n moves to its own singleton component). 
+ // + // Convergence: + // * In any non-trivial SCC (size > 1) of the subgraph DAG, at least one member is not a + // Union-Find singleton: if ALL members were singletons, the SCC-DAG cycle + // sg_X1 -> ... -> sg_Xk -> sg_X1 would unfold into a node-level cycle + // x1 -> ... -> xk -> x1 in the original ov::Model, which is a DAG. + // * A non-singleton Union-Find component of size m is held together by at least m-1 + // non-boundary input edges, so at least one node in it has a same-sg input. + // * Each iteration isolates one such node, strictly reducing the total non-singleton + // mass of SCC members. The loop therefore terminates in at most nodes_count iterations + // and well within the total_node_inputs edge budget. + // + // Target selection: among all candidate nodes (in any SCC member with >= 1 same-sg input), + // prefer cuts at actual SCC re-entry nodes and shared connectors. Falling back to the node + // with the fewest same-sg inputs is still valid for convergence, but doing so too early may + // peel ordinary linear compute nodes out of the main device region and create tiny + // Parameter->op->Result submodels. Those are especially expensive for GPU compilation. 
+ auto isolate_one_scc_node = [&](const std::vector& sg_id_by_index, + const std::unordered_set& scc_members) -> size_t { + struct CandidateRank { + size_t lacks_scc_boundary_input = 1; + size_t lacks_shared_same_sg_source = 1; + size_t has_trivial_leaf_input = 1; + size_t is_linear_compute_node = 1; + size_t same_sg_inputs = 0; + size_t node_idx = 0; + }; + + auto is_better_rank = [](const CandidateRank& lhs, const CandidateRank& rhs) { + if (lhs.lacks_scc_boundary_input != rhs.lacks_scc_boundary_input) + return lhs.lacks_scc_boundary_input < rhs.lacks_scc_boundary_input; + if (lhs.lacks_shared_same_sg_source != rhs.lacks_shared_same_sg_source) + return lhs.lacks_shared_same_sg_source < rhs.lacks_shared_same_sg_source; + if (lhs.has_trivial_leaf_input != rhs.has_trivial_leaf_input) + return lhs.has_trivial_leaf_input < rhs.has_trivial_leaf_input; + if (lhs.is_linear_compute_node != rhs.is_linear_compute_node) + return lhs.is_linear_compute_node < rhs.is_linear_compute_node; + if (lhs.same_sg_inputs != rhs.same_sg_inputs) + return lhs.same_sg_inputs < rhs.same_sg_inputs; + return lhs.node_idx < rhs.node_idx; + }; + + auto count_non_result_consumers = [](const std::shared_ptr& node) { + size_t non_result_consumers = 0; + for (const auto& output : node->outputs()) { + for (const auto& target_input : output.get_target_inputs()) { + if (!ov::op::util::is_output(target_input.get_node())) { + ++non_result_consumers; + } + } + } + return non_result_consumers; + }; + std::vector non_result_consumer_counts(nodes_count, static_cast(-1)); + auto count_non_result_consumers_by_index = [&](size_t node_idx) { + auto& cached = non_result_consumer_counts[node_idx]; + if (cached == static_cast(-1)) { + cached = count_non_result_consumers(_ordered_ops[node_idx]); + } + return cached; + }; + + bool have_target = false; + size_t target_idx = 0; + CandidateRank target_rank; + auto is_graph_input_leaf_source = [&](size_t node_idx) { + const auto& node = _ordered_ops[node_idx]; + if 
(is_graph_input_node(node.get())) + return false; + + if (count_non_result_consumers_by_index(node_idx) != 1) + return false; + + for (const auto& input : ordered_inputs[node_idx]) { + if (!is_graph_input_node(input.get_source_output().get_node())) + return false; + } + return true; + }; + for (size_t i = 0; i < nodes_count; ++i) { + const auto my_sg = sg_id_by_index[i]; + if (!scc_members.count(my_sg)) + continue; + size_t same_sg_inputs = 0; + bool has_scc_boundary_input = false; + bool has_shared_same_sg_source = false; + bool has_trivial_leaf_input = false; + for (const auto& input : ordered_inputs[i]) { + if (_subgraph_inputs.count(input)) { + if (!is_graph_input_node(input.get_node())) { + const auto src_idx = get_index_by_node(input.get_source_output().get_node()); + const auto producer_sg = sg_id_by_index[src_idx]; + has_scc_boundary_input = + has_scc_boundary_input || (producer_sg != my_sg && scc_members.count(producer_sg)); + } + continue; + } + const auto src_idx = get_index_by_node(input.get_source_output().get_node()); + if (sg_id_by_index[src_idx] != my_sg) + continue; + ++same_sg_inputs; + has_shared_same_sg_source = + has_shared_same_sg_source || count_non_result_consumers_by_index(src_idx) > 1; + has_trivial_leaf_input = has_trivial_leaf_input || is_graph_input_leaf_source(src_idx); + } + if (same_sg_inputs == 0) + continue; + + const CandidateRank candidate_rank{ + has_scc_boundary_input ? 0UL : 1UL, + has_shared_same_sg_source ? 0UL : 1UL, + has_trivial_leaf_input ? 1UL : 0UL, + (same_sg_inputs == 1 && count_non_result_consumers_by_index(i) <= 1) ? 
1UL : 0UL, + same_sg_inputs, + i}; + const bool better_target = !have_target || is_better_rank(candidate_rank, target_rank); + if (better_target) { + have_target = true; + target_idx = i; + target_rank = candidate_rank; + } + } + OPENVINO_ASSERT(have_target, + "Subgraph SCC fallback found a cyclic subgraph DAG but every node in " + "every SCC member is a Union-Find singleton; that would require a " + "node-level cycle in the original ov::Model, which is impossible on a DAG."); + + size_t promoted = 0; + const auto target_sg = sg_id_by_index[target_idx]; + for (const auto& input : ordered_inputs[target_idx]) { + if (_subgraph_inputs.count(input)) + continue; + const auto src_idx = get_index_by_node(input.get_source_output().get_node()); + if (sg_id_by_index[src_idx] != target_sg) + continue; + _subgraph_inputs.insert(input); + ++promoted; + } + return promoted; + }; + + size_t total_node_inputs = 0; + for (size_t i = 0; i < nodes_count; ++i) { + total_node_inputs += ordered_inputs[i].size(); + } + // subgraph_ids / subgraph_id_by_index reach this point already valid w.r.t. the current + // _subgraph_inputs: the per-node loop exits only when its last iteration adds no boundaries, + // so the ids it computed at the top of that final iteration are still in sync. Recompute + // only after the SCC step actually modifies _subgraph_inputs. 
+ bool ids_valid = true; + for (size_t scc_step = 0;; ++scc_step) { + OPENVINO_ASSERT(scc_step < total_node_inputs + 1, + "Subgraph SCC fallback did not converge: exceeded node-input edge budget"); + if (!ids_valid) { + subgraph_ids = collect_subgraphs_ids(); + for (size_t i = 0; i < nodes_count; ++i) { + subgraph_id_by_index[i] = subgraph_ids.at(_ordered_ops[i]); + } + ids_valid = true; + } + const size_t inputs_before_step = _subgraph_inputs.size(); + + const auto sg_graph = build_subgraph_adjacency(subgraph_id_by_index); + const auto& sg_adj = sg_graph.first; + const auto& all_sgs = sg_graph.second; + const auto scc_members = find_non_trivial_scc_members(sg_adj, all_sgs); + if (scc_members.empty()) { + break; // subgraph DAG is acyclic, fix-point reached. + } + + // Isolate one Union-Find node from any SCC member by promoting ALL its same-sg input + // edges. See isolate_one_scc_node for why a single-edge cut diverges, why entry/exit + // cuts miss shared-Constant SCCs, and the convergence argument (the candidate always + // exists because singleton-only SCCs are impossible on a DAG). + const size_t promoted = isolate_one_scc_node(subgraph_id_by_index, scc_members); + OPENVINO_ASSERT(promoted > 0, + "Subgraph SCC fallback found a cyclic subgraph DAG but the chosen node " + "had no same-subgraph inputs to promote; helper invariant violated."); + // Defensive: each iteration must grow _subgraph_inputs strictly. If insert() ever found + // all promoted edges already present (logic bug), surface it here instead of looping + // silently until the edge budget runs out. + OPENVINO_ASSERT(_subgraph_inputs.size() > inputs_before_step, + "Subgraph SCC fallback promoted edges but _subgraph_inputs did not grow"); + ids_valid = false; // _subgraph_inputs grew; next iteration must rebuild ids. + } + + // Edge case: if init() produced no _subgraph_inputs at all, the per-node loop never ran and + // subgraph_ids is empty. Materialize the final mapping in that case. 
+ if (subgraph_ids.empty()) { + subgraph_ids = collect_subgraphs_ids(); + } + return subgraph_ids; } ov::hetero::SubgraphCollector::SubgraphIdsMap ov::hetero::SubgraphCollector::collect_subgraphs_ids() { diff --git a/src/plugins/hetero/src/subgraph_collector.hpp b/src/plugins/hetero/src/subgraph_collector.hpp index bfdc6dcf5c0bc5..428e87e1f700ba 100644 --- a/src/plugins/hetero/src/subgraph_collector.hpp +++ b/src/plugins/hetero/src/subgraph_collector.hpp @@ -55,7 +55,9 @@ class SubgraphCollector { private: void init(); bool is_graph_input_node(const ov::Node* node) const; - void split_cyclic_dependencies(); + // Splits cyclic subgraph dependencies and returns the final SubgraphIdsMap valid + // w.r.t. the resulting _subgraph_inputs, so the caller does not need to recompute it. + SubgraphIdsMap split_cyclic_dependencies(); void split_subgraphs_by_parameter_results(); SubgraphIdsMap collect_subgraphs_ids(); std::unordered_map collect_subgraphs(); diff --git a/src/plugins/hetero/tests/unit/subgraph_collector.cpp b/src/plugins/hetero/tests/unit/subgraph_collector.cpp index 55eb3c88ab3237..e75b27a80a9a1d 100644 --- a/src/plugins/hetero/tests/unit/subgraph_collector.cpp +++ b/src/plugins/hetero/tests/unit/subgraph_collector.cpp @@ -7,6 +7,7 @@ #include #include +#include #include "common_test_utils/graph_comparator.hpp" #include "common_test_utils/test_assertions.hpp" @@ -487,6 +488,162 @@ std::shared_ptr create_shared_const_cross_device_fanout_model() { return std::make_shared(ov::ResultVector{res}, ov::ParameterVector{param}); } +// Multi-hop subgraph-level SCC. Two regions of M0 get fused into one Union-Find subgraph via a +// shared Constant (c_top: feeds X1 in region 1 and X2 in region 2). The resulting M0 subgraph +// then participates in a 4-subgraph cycle that no single node can detect with the per-node +// heuristic (the producer and re-entry consumer of the cycle are different nodes far apart in +// topology). Only the subgraph-DAG SCC fallback can break it. 
+// +// Topology (M0 = MOCK.0, M1 = MOCK.1): +// +// in1(M0) ─┐ ┌─ X1(M0,+c_top) ─ res_x1 +// ├─ A1(M0) ─ B1(M1) ─ C1(M0) ─ D1(M1) ┘ +// c_top(M0) ──────────────┐ +// in2(M0) ─┐ │ +// ├─ A2(M0) ─ B2(M1) ┘ ┌─ X2(M0,+c_top) ─ res_x2 +// │ +// ├─ A2(M0) ─────── C2(M1) ─ D2(M0) ───┘ +// +// Initial Union-Find groups (M0 only): {in1,A1,C1}, {in2,A2,D2,X2,c_top,X1}. The shared c_top +// merges X1 (region 1) and X2 (region 2) into the same M0 subgraph, call it M0_big. +// Cross-subgraph data edges then form: M0_big -> M1 (A1->B1, A2->B2, A2->C2), +// M1 -> M0_big (D1->X1-via-c_top-region, D2 already inside M0_big). After the per-node fix-point +// loop, the subgraph DAG still contains M0_big -> M1 -> M0_big -> M1 -> M0_big, but no single +// node in M0_big has a producer-in-my-sg cyclic dependency (X1's producers are D1 in M1 and +// c_top which is a graph input). SCC fallback must split M0_big into multiple subgraphs. +std::shared_ptr create_multi_hop_scc_cycle_model() { + auto in1 = std::make_shared(ov::element::f32, ov::PartialShape{4}); + in1->set_friendly_name("in1"); + auto in2 = std::make_shared(ov::element::f32, ov::PartialShape{4}); + in2->set_friendly_name("in2"); + auto c_top = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{4}, {1.0f, 1.0f, 1.0f, 1.0f}); + c_top->set_friendly_name("c_top"); + auto a1 = std::make_shared(in1); + a1->set_friendly_name("A1"); + auto b1 = std::make_shared(a1); + b1->set_friendly_name("B1"); + auto c1 = std::make_shared(b1); + c1->set_friendly_name("C1"); + auto d1 = std::make_shared(c1); + d1->set_friendly_name("D1"); + auto x1 = std::make_shared(d1, c_top); + x1->set_friendly_name("X1"); + auto a2 = std::make_shared(in2); + a2->set_friendly_name("A2"); + auto b2 = std::make_shared(a2); + b2->set_friendly_name("B2"); + auto c2 = std::make_shared(b2); + c2->set_friendly_name("C2"); + auto d2 = std::make_shared(c2); + d2->set_friendly_name("D2"); + auto x2 = std::make_shared(d2, c_top); + x2->set_friendly_name("X2"); + auto 
res_x1 = std::make_shared(x1); + res_x1->set_friendly_name("res_x1"); + auto res_x2 = std::make_shared(x2); + res_x2->set_friendly_name("res_x2"); + return std::make_shared(ov::ResultVector{res_x1, res_x2}, ov::ParameterVector{in1, in2}); +} + +// Bridge-between-cycles topology. Two independent 2-subgraph SCCs sit on the left and right; +// a multi-node bridge subgraph X on a third device sits between them, with one incoming edge +// from the left SCC and one outgoing edge to the right SCC. The bridge is acyclic in the +// subgraph DAG (it lies on a single path between the two cycles, not in any cycle itself). +// +// Subgraph DAG after initial partitioning: +// +// A_L(M0) ↔ B_L(M1) ──► X(M2) ──► A_R(M0) ↔ B_R(M1) +// +// Each 2-cycle is formed without per-node cyclic inputs (the round-trip goes through nodes in +// different subgraphs), so split_cyclic_dependencies()'s per-node fix-point loop cannot break +// them; only the subgraph-DAG SCC fallback can. This is the structural ingredient that exposes +// the difference between an exact SCC algorithm and a forward+reverse Kahn approximation: the +// approximation marks X as cyclic (it survives both peels because every subgraph has both +// incoming and outgoing edges), and the promotion loop would eventually split the bridge by +// promoting x_bridge1 → x_bridge2. An exact SCC algorithm classifies X as a singleton SCC and +// never touches its internal edges, preserving the bridge as a single subgraph. +// +// The test below asserts the latter: x_bridge1 and x_bridge2 must end up in the same subgraph +// after run() converges. +std::shared_ptr create_bridge_between_cycles_model() { + // Left cycle: c_LA fuses {in_L, a_L1, a_L2} into A_L (M0); c_LB fuses {b_L1, b_L2} into B_L (M1). 
+ auto in_L = std::make_shared(ov::element::f32, ov::PartialShape{4}); + in_L->set_friendly_name("in_L"); + auto c_LA = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{4}, {1.0f, 1.0f, 1.0f, 1.0f}); + c_LA->set_friendly_name("c_LA"); + auto c_LB = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{4}, {2.0f, 2.0f, 2.0f, 2.0f}); + c_LB->set_friendly_name("c_LB"); + auto a_L1 = std::make_shared(in_L, c_LA); + a_L1->set_friendly_name("a_L1"); + auto b_L1 = std::make_shared(a_L1, c_LB); // A_L → B_L edge + b_L1->set_friendly_name("b_L1"); + auto b_L2 = std::make_shared(b_L1, c_LB); + b_L2->set_friendly_name("b_L2"); + auto a_L2 = std::make_shared(b_L2, c_LA); // B_L → A_L edge + a_L2->set_friendly_name("a_L2"); + // Bridge: two M2 nodes connected by an internal same-sg edge (x_bridge1 → x_bridge2). This + // internal edge is what the buggy two-peel would wrongly promote. + auto x_bridge1 = std::make_shared(a_L2); // A_L → X edge + x_bridge1->set_friendly_name("x_bridge1"); + auto x_bridge2 = std::make_shared(x_bridge1); + x_bridge2->set_friendly_name("x_bridge2"); + // Right cycle: mirror of left, fed from the bridge tail. 
+ auto c_RA = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{4}, {3.0f, 3.0f, 3.0f, 3.0f}); + c_RA->set_friendly_name("c_RA"); + auto c_RB = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{4}, {4.0f, 4.0f, 4.0f, 4.0f}); + c_RB->set_friendly_name("c_RB"); + auto a_R1 = std::make_shared(x_bridge2, c_RA); // X → A_R edge + a_R1->set_friendly_name("a_R1"); + auto b_R1 = std::make_shared(a_R1, c_RB); // A_R → B_R edge + b_R1->set_friendly_name("b_R1"); + auto b_R2 = std::make_shared(b_R1, c_RB); + b_R2->set_friendly_name("b_R2"); + auto a_R2 = std::make_shared(b_R2, c_RA); // B_R → A_R edge + a_R2->set_friendly_name("a_R2"); + auto res = std::make_shared(a_R2); + res->set_friendly_name("res"); + return std::make_shared(ov::ResultVector{res}, ov::ParameterVector{in_L}); +} + +// Subgraph-DAG SCC where the only same-subgraph promotable edges have a Constant producer. +// A shared M0 Constant `c_shared` is consumed by three M0 nodes (A, C, E) which are interleaved +// with three independent M1 nodes (B, D, F). The interleaving forms two 2-cycles in the +// subgraph DAG (M0_big <-> sg_B and M0_big <-> sg_D), both incident to the fused M0_big +// subgraph; every M0_big internal edge whose other endpoint is not foreign-sg ends up being a +// Constant -> consumer edge (c_shared -> A, c_shared -> C, c_shared -> E). This is the exact +// shape of the failure reproduced on yolo26s-seg: the SCC fallback finds a cyclic subgraph, +// but every candidate same-sg edge has a graph-input producer. The earlier implementation +// filtered those out and tripped the "no internal edge to promote" assert. 
+// +// Topology (M0 = MOCK.0, M1 = MOCK.1): +// +// in(M0) --> A(M0,+c_shared) --> B(M1) --> C(M0,+c_shared) --> D(M1) --> E(M0,+c_shared) --> F(M1) --> res +// ^ (M1) ^ (M1) ^ (M1) +// | | | +// +--- c_shared(M0) ----------------+---------------------------------+ +// +std::shared_ptr create_shared_const_scc_only_const_promotable_model() { + auto in_node = std::make_shared(ov::element::f32, ov::PartialShape{4}); + in_node->set_friendly_name("in"); + auto c_shared = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{4}, {1.0f, 1.0f, 1.0f, 1.0f}); + c_shared->set_friendly_name("c_shared"); + auto A = std::make_shared(in_node, c_shared); + A->set_friendly_name("A"); + auto B = std::make_shared(A); + B->set_friendly_name("B"); + auto C = std::make_shared(B, c_shared); + C->set_friendly_name("C"); + auto D = std::make_shared(C); + D->set_friendly_name("D"); + auto E = std::make_shared(D, c_shared); + E->set_friendly_name("E"); + auto F = std::make_shared(E); + F->set_friendly_name("F"); + auto res = std::make_shared(F); + res->set_friendly_name("res"); + return std::make_shared(ov::ResultVector{res}, ov::ParameterVector{in_node}); +} + // Stateful model: param → read_value → add(+c1) → {result, assign(sink)}. // Single-device by design — exercises Subgraph::_sinks wire-through and // create_submodel_from_collected_subgraph()'s sink-preserving construction without @@ -867,7 +1024,10 @@ struct SubgraphCollectorTestParam { ModelFactory create_model; // factory to build the model under test std::map affinity_map; // node_name → device; empty = broadcast default std::string default_affinity; // used when affinity_map is empty - size_t expected_subgraph_count; // number of subgraphs from run() + // Expected number of subgraphs from run(). std::nullopt explicitly opts out of the count + // check; use only when the partition shape is an implementation detail but convergence and + // merge round-trip must still hold (e.g. SCC fallback tests). 
+ std::optional expected_subgraph_count; // --- optional checks (a default-constructed/empty/false value disables the check) --- std::vector expected_affinities = {}; // sorted affinity list per subgraph std::map expected_ids = {}; // node_name → expected subgraph ID @@ -877,10 +1037,11 @@ struct SubgraphCollectorTestParam { bool verify_merge_roundtrip = false; // merge submodels back and check size == 1 bool verify_merge_compare = false; // compare_functions(original, merged) // Per-resulting-subgraph structural counts. Empty vector = check disabled. When non-empty, - // size MUST equal expected_subgraph_count; each entry is the expected count in the subgraph - // at the same index. Intended primarily as direct evidence of Constant duplication after a - // promoted boundary (see shared_const_*_cycle cases), without requiring a full reference - // submodel via expected_submodel_factories. + // size MUST equal the actual runtime subgraph count (`subgraphs.size()`); each entry is the + // expected count in the subgraph at the same index. This remains valid even when + // expected_subgraph_count is std::nullopt. Intended primarily as direct evidence of Constant + // duplication after a promoted boundary (see shared_const_*_cycle cases), without requiring + // a full reference submodel via expected_submodel_factories. 
std::vector expected_constants_per_submodel = {}; std::vector expected_parameters_per_submodel = {}; std::vector expected_results_per_submodel = {}; @@ -934,7 +1095,9 @@ TEST_P(SubgraphCollectorParamTest, split_by_affinity) { const auto& [subgraphs, mapping] = collector.run(); - ASSERT_EQ(param.expected_subgraph_count, subgraphs.size()); + if (param.expected_subgraph_count.has_value()) { + ASSERT_EQ(*param.expected_subgraph_count, subgraphs.size()); + } std::map actual_to_expected_subgraph_ids; std::vector expected_to_actual_subgraph_ids; @@ -1636,9 +1799,144 @@ INSTANTIATE_TEST_SUITE_P( /*expected_parameters_per_submodel*/ {2, 1, 2, 1}, /*expected_results_per_submodel*/ {3, 3, 1, 1}, {std::set{"A", "X"}, std::set{"B", "B2"}, std::set{"C"}, std::set{"F"}}, + }, + // --- Multi-hop subgraph-level SCC. Two independent M0 regions get fused through a shared + // Constant (c_top), and the fused M0 subgraph then participates in a cycle that no single + // node can detect (the producer and re-entry consumer are far apart). The per-node + // heuristic in split_cyclic_dependencies() converges without breaking it; only the + // subgraph-DAG SCC fallback can. This case is the minimal synthesis of the + // 4-subgraph cycle observed on yolo26s-seg HETERO:GPU,CPU. The exact partition the SCC + // fallback produces depends on the order it discovers cyclic subgraphs; this test only + // asserts that compile-time topo sort succeeds (i.e., the assertion "Cannot sort + // subgraphs!" does NOT fire) by requiring run() to complete and merge round-trip back to + // the original. 
+ SubgraphCollectorTestParam{ + "multi_hop_subgraph_scc_cycle", + create_multi_hop_scc_cycle_model, + {{"in1", "MOCK.0"}, {"in2", "MOCK.0"}, {"c_top", "MOCK.0"}, + {"A1", "MOCK.0"}, {"B1", "MOCK.1"}, {"C1", "MOCK.0"}, {"D1", "MOCK.1"}, {"X1", "MOCK.0"}, + {"A2", "MOCK.0"}, {"B2", "MOCK.1"}, {"C2", "MOCK.1"}, {"D2", "MOCK.0"}, {"X2", "MOCK.0"}, + {"res_x1", "MOCK.0"}, {"res_x2", "MOCK.0"}}, + "", + // expected_subgraph_count = std::nullopt: the SCC fallback's promotion ordering is an + // implementation detail; the contract under test is "run() does not assert Cannot sort + // subgraphs!" and "merge round-trip succeeds". + std::nullopt, + {}, + {}, + {}, + {}, + 0, + true, + true, } ), [](const testing::TestParamInfo& info) { return info.param.test_name; }); // clang-format on + +// Regression test for the SCC fallback's bridge-between-cycles handling. See the comment on +// create_bridge_between_cycles_model() for the topology and why an exact SCC algorithm is +// required here. The contract under test: an acyclic bridge subgraph lying between two +// disjoint cycles in the subgraph DAG must NOT be split by the SCC fallback. The two M2 ops +// (x_bridge1, x_bridge2) belong to one bridge subgraph in the initial partition; after run() +// converges they must still share a subgraph. A regression that swaps the exact SCC algorithm +// back to a two-peel (forward + reverse Kahn) over-approximation would mark the bridge as +// cyclic and eventually promote x_bridge1 → x_bridge2 in the inner loop, splitting the bridge +// into two singletons — which this test then catches. 
+TEST(SubgraphCollectorBridgeBetweenCyclesTest, bridge_subgraph_not_split) {
+    auto model = create_bridge_between_cycles_model();
+    // Friendly name -> affinity string for the ops of the model; the loop below asserts that no op is missing.
+    const std::map<std::string, std::string> affinity_by_name = {
+        {"in_L", "MOCK.0"},
+        {"c_LA", "MOCK.0"},
+        {"c_LB", "MOCK.1"},
+        {"a_L1", "MOCK.0"},
+        {"b_L1", "MOCK.1"},
+        {"b_L2", "MOCK.1"},
+        {"a_L2", "MOCK.0"},
+        {"x_bridge1", "MOCK.2"},
+        {"x_bridge2", "MOCK.2"},
+        {"c_RA", "MOCK.0"},
+        {"c_RB", "MOCK.1"},
+        {"a_R1", "MOCK.0"},
+        {"b_R1", "MOCK.1"},
+        {"b_R2", "MOCK.1"},
+        {"a_R2", "MOCK.0"},
+        {"res", "MOCK.0"},
+    };
+    // Translate the name-keyed table into the node-keyed map the collector consumes. A node without an
+    // entry aborts the test immediately instead of silently running with a partial affinity map.
+    SubgraphCollector::AffinitiesMap affinities;
+    for (const auto& node : model->get_ordered_ops()) {
+        const auto it = affinity_by_name.find(node->get_friendly_name());
+        ASSERT_TRUE(it != affinity_by_name.end()) << "Missing affinity for node '" << node->get_friendly_name() << "'";
+        affinities[node] = it->second;
+    }
+
+    SubgraphCollector collector(model, affinities);
+    const auto result = collector.run();
+    const auto& subgraphs = result.first;
+    // Locate which subgraph each bridge node ended up in by scanning each subgraph's submodel.
+    // Returns the index of the first subgraph containing an op with that friendly name, or nullopt.
+    auto find_subgraph_containing = [&subgraphs](const std::string& node_name) -> std::optional<size_t> {
+        for (size_t i = 0; i < subgraphs.size(); ++i) {
+            const auto submodel = create_submodel_from_collected_subgraph(subgraphs[i]);
+            for (const auto& op : submodel->get_ordered_ops()) {
+                if (op->get_friendly_name() == node_name)
+                    return i;
+            }
+        }
+        return std::nullopt;
+    };
+    const auto idx_x1 = find_subgraph_containing("x_bridge1");
+    const auto idx_x2 = find_subgraph_containing("x_bridge2");
+    // ASSERT (not EXPECT) on presence: the optionals are dereferenced right below.
+    ASSERT_TRUE(idx_x1.has_value()) << "x_bridge1 was not found in any resulting subgraph";
+    ASSERT_TRUE(idx_x2.has_value()) << "x_bridge2 was not found in any resulting subgraph";
+    EXPECT_EQ(*idx_x1, *idx_x2) << "Bridge subgraph was split: x_bridge1 ended up in subgraph " << *idx_x1
+                                << " but x_bridge2 ended up in subgraph " << *idx_x2
+                                << ". This indicates the SCC fallback wrongly classified the acyclic bridge as cyclic"
+                                << " and promoted its internal edge.";
+}
+
+// Regression test for the SCC fallback when every promotable same-subgraph edge has a
+// Constant producer. See create_shared_const_scc_only_const_promotable_model() for the
+// topology. The earlier implementation skipped any candidate edge whose source was a graph
+// input (Constant/Parameter), so when an SCC consisted entirely of nodes whose only same-sg
+// inputs came from a shared Constant, find_promotable_internal_edge() returned nullopt and
+// the SCC fallback fired "no internal edge to promote". This is the exact failure mode
+// reproduced on yolo26s-seg with HETERO:GPU,CPU. The contract under test: run() converges
+// (no assert), and merge round-trip succeeds.
+TEST(SubgraphCollectorSharedConstSccTest, scc_with_only_constant_sourced_edges_converges) {
+    auto model = create_shared_const_scc_only_const_promotable_model();
+    // Clone taken before the collector runs; it is the reference for the round-trip comparison below.
+    auto model_ref = model->clone();
+    const std::map<std::string, std::string> affinity_by_name = {
+        {"in", "MOCK.0"},
+        {"c_shared", "MOCK.0"},
+        {"A", "MOCK.0"},
+        {"B", "MOCK.1"},
+        {"C", "MOCK.0"},
+        {"D", "MOCK.1"},
+        {"E", "MOCK.0"},
+        {"F", "MOCK.1"},
+        {"res", "MOCK.1"},
+    };
+    // Translate the name-keyed table into the node-keyed map the collector consumes; a node without an
+    // entry aborts the test.
+    SubgraphCollector::AffinitiesMap affinities;
+    for (const auto& node : model->get_ordered_ops()) {
+        const auto it = affinity_by_name.find(node->get_friendly_name());
+        ASSERT_TRUE(it != affinity_by_name.end()) << "Missing affinity for node '" << node->get_friendly_name() << "'";
+        affinities[node] = it->second;
+    }
+
+    SubgraphCollector collector(model, affinities);
+    // Must not assert "no internal edge to promote".
+    const auto& [subgraphs, mapping] = collector.run();
+    ASSERT_FALSE(subgraphs.empty());
+
+    // Merge round-trip: gluing the submodels back together must reproduce the original model.
+    std::vector<std::shared_ptr<ov::Model>> submodels;
+    submodels.reserve(subgraphs.size());
+    for (const auto& sg : subgraphs)
+        submodels.push_back(create_submodel_from_collected_subgraph(sg));
+    OV_ASSERT_NO_THROW(ov::hetero::merge_submodels(submodels, mapping._submodels_input_to_prev_output));
+    // The test expects the merge to leave exactly one model in the vector.
+    ASSERT_EQ(1u, submodels.size());
+    const auto cmp_result = compare_functions(model_ref, submodels[0]);
+    EXPECT_TRUE(cmp_result.first) << cmp_result.second;
+}