verilog-to-routing · AlexandreSinger · Aug 16, 2025 · Sep 28, 2025 · Dec 19, 2025 · Dec 19, 2025
diff --git a/vpr/src/analytical_place/analytical_solver.cpp b/vpr/src/analytical_place/analytical_solver.cpp
diff --git a/vpr/src/analytical_place/analytical_solver.h b/vpr/src/analytical_place/analytical_solver.h
@@ -10,7 +10,9 @@
 #include <memory>
 #include "ap_flow_enums.h"
 #include "ap_netlist.h"
+#include "chan_cost_handler.h"
 #include "device_grid.h"
+#include "globals.h"
 #include "place_delay_model.h"
 #include "vtr_strong_id.h"
 #include "vtr_vector.h"
@@ -545,7 +547,11 @@ class B2BSolver : public AnalyticalSolver {
                            ap_timing_tradeoff,
                            log_verbosity)
         , pre_cluster_timing_manager_(pre_cluster_timing_manager)
-        , place_delay_model_(place_delay_model) {}
+        , place_delay_model_(place_delay_model)
+        , chan_cost_handler_(g_vpr_ctx.device().rr_chanx_width,
+                             g_vpr_ctx.device().rr_chany_width,
+                             g_vpr_ctx.device().rr_graph,
+                             device_grid) {}
 
     /**
      * @brief Perform an iteration of the B2B solver, storing the result into
@@ -699,15 +705,41 @@ class B2BSolver : public AnalyticalSolver {
     void update_linear_system_with_anchors(unsigned iteration);
 
     /**
-     * @brief Store the x and y solutions in Eigen's vectors into the partial
-     *        placement object.
-     *
-     * Note: The x_soln and y_soln may be modified if it is found that the
-     *       solution is impossible (i.e. has negative positions).
+     * @brief Solves the linear system of equations using the connectivity
+     *        matrix (A), the constant vector (b), and a guess for the solution.
      */
-    void store_solution_into_placement(Eigen::VectorXd& x_soln,
-                                       Eigen::VectorXd& y_soln,
-                                       PartialPlacement& p_placement);
+    Eigen::VectorXd solve_linear_system(Eigen::SparseMatrix<double>& A,
+                                        Eigen::VectorXd& b,
+                                        Eigen::VectorXd& guess);
+
+    /**
+     * @brief Store the solutions from the linear system into the partial
+     *        placement object for the given dimension.
+     *
+     * Note: The dim_soln may be modified if it is found that the solution is
+     *       impossible (e.g. has negative positions).
+     *
+     *  @param dim_soln
+     *      The solution of the linear system for a given dimension.
+     *  @param block_dim_locs
+     *      The block locations in the partial placement for the dimension.
+     *  @param dim_max_pos
+     *      The maximum position allowed for the dimension. For example, for the
+     *      x-dimension, this would be the width of the device. This is used to
+     *      ensure that the positions do not go off device.
+     */
+    void store_solution_into_placement(Eigen::VectorXd& dim_soln,
+                                       vtr::vector<APBlockId, double>& block_dim_locs,
+                                       double dim_max_pos);
+
+    /**
+     * @brief Does the FPGA that the AP flow is currently targeting have more
+     *        than one die. Having multiple dies would imply that the solver
+     *        needs to add another dimension to solve for.
+     */
+    inline bool is_multi_die() const {
+        return device_grid_num_layers_ > 1;
+    }
 
     // The following are variables used to store the system of equations to be
     // solved in the x and y dimensions. The equations are of the form:
@@ -720,22 +752,30 @@ class B2BSolver : public AnalyticalSolver {
     Eigen::SparseMatrix<double> A_sparse_x;
     /// @brief The coefficient / connectivity matrix for the y dimension.
     Eigen::SparseMatrix<double> A_sparse_y;
+    /// @brief The coefficient / connectivity matrix for the z dimension (layer dimension).
+    Eigen::SparseMatrix<double> A_sparse_z;
     /// @brief The constant vector in the x dimension.
     Eigen::VectorXd b_x;
     /// @brief The constant vector in the y dimension.
     Eigen::VectorXd b_y;
+    /// @brief The constant vector in the z dimension (layer dimension).
+    Eigen::VectorXd b_z;
 
     // The following is the solution of the previous iteration of this solver.
     // They are updated at the end of solve() and are used as the starting point
     // for the next call to solve.
     vtr::vector<APBlockId, double> block_x_locs_solved;
     vtr::vector<APBlockId, double> block_y_locs_solved;
+    // NOTE: For speed, this vector is unused if a device has only one die.
+    vtr::vector<APBlockId, double> block_z_locs_solved;
 
     // The following are the legalized solution coming into the analytical solver
     // (other than the first iteration). These are stored to be used as anchor
     // blocks during the solver.
     vtr::vector<APBlockId, double> block_x_locs_legalized;
     vtr::vector<APBlockId, double> block_y_locs_legalized;
+    // NOTE: For speed, this vector is unused if a device has only one die.
+    vtr::vector<APBlockId, double> block_z_locs_legalized;
 
     /// @brief The total number of CG iterations that this solver has performed
     ///        so far. This can be a useful metric for the amount of work the
@@ -760,6 +800,9 @@ class B2BSolver : public AnalyticalSolver {
     /// @brief The place delay model used for calculating the delay between
     ///        two tiles on the FPGA. Used for computing the timing terms.
     std::shared_ptr<PlaceDelayModel> place_delay_model_;
+
+    /// @brief Manager class for getting the cost factors in the x, y, and z dimensions.
+    ChanCostHandler chan_cost_handler_;
 };
 
 #endif // EIGEN_INSTALLED
diff --git a/vpr/src/place/chan_cost_handler.cpp b/vpr/src/place/chan_cost_handler.cpp
@@ -0,0 +1,91 @@
+/**
+ * @file
+ * @author  Alex Singer
+ * @date    December 2025
+ * @brief   The definition of the channel cost handler class.
+ */
+
+#include "chan_cost_handler.h"
+#include <cmath>
+#include "rr_graph_view.h"
+
+ChanCostHandler::ChanCostHandler(const std::vector<int>& rr_chanx_width,
+                                 const std::vector<int>& rr_chany_width,
+                                 const RRGraphView& rr_graph,
+                                 const DeviceGrid& grid) {
+
+    // These arrays contain accumulative channel width between channel zero and
+    // the channel specified by the given index. The accumulated channel width
+    // is inclusive, meaning that it includes both channel zero and channel `idx`.
+    // To compute the total channel width between channels 'low' and 'high', use the
+    // following formula:
+    //      acc_chan?_width_[high] - acc_chan?_width_[low - 1]
+    // This returns the total number of tracks between channels 'low' and 'high',
+    // including tracks in these channels.
+    acc_chanx_width_ = vtr::PrefixSum1D<int>(grid.height(), [&](size_t y) noexcept {
+        int chan_x_width = rr_chanx_width[y];
+
+        // If the number of tracks in a channel is zero, two consecutive elements take the same
+        // value. This can lead to a division by zero in get_chanxy_cost_fac_(). To avoid this
+        // potential issue, we assume that the channel width is at least 1.
+        if (chan_x_width == 0) {
+            return 1;
+        }
+
+        return chan_x_width;
+    });
+
+    acc_chany_width_ = vtr::PrefixSum1D<int>(grid.width(), [&](size_t x) noexcept {
+        int chan_y_width = rr_chany_width[x];
+
+        // to avoid a division by zero
+        if (chan_y_width == 0) {
+            return 1;
+        }
+
+        return chan_y_width;
+    });
+
+    // If this is a multi-layer (3D) architecture, compute Z-channel cost term.
+    if (grid.get_num_layers() > 1) {
+        vtr::NdMatrix<float, 2> tile_num_inter_die_conn({grid.width(), grid.height()}, 0.);
+
+        /*
+         * Step 1: iterate over the rr-graph, recording how many edges go between layers at each (x,y) location
+         * in the device. We count all these edges, regardless of which layers they connect. Then we divide by
+         * the number of layers - 1 to get the average cross-layer edge count per (x,y) location -- this mirrors
+         * what we do for the horizontal and vertical channels where we assume the channel width doesn't change
+         * along the length of the channel. It lets us be more memory-efficient for 3D devices, and could be revisited
+         * if someday we have architectures with widely varying connectivity between different layers in a stack.
+         */
+
+        /* To calculate the accumulative number of inter-die connections we first need to get the number of
+         * inter-die connection per location. To be able to work for the cases that RR Graph is read instead
+         * of being made from the architecture file, we calculate this number by iterating over the RR graph. Once
+         * tile_num_inter_die_conn is populated, we can start populating acc_tile_num_inter_die_conn_.
+         */
+
+        for (const RRNodeId node : rr_graph.nodes()) {
+            if (rr_graph.node_type(node) == e_rr_type::CHANZ) {
+                int x = rr_graph.node_xlow(node);
+                int y = rr_graph.node_ylow(node);
+                VTR_ASSERT_SAFE(x == rr_graph.node_xhigh(node) && y == rr_graph.node_yhigh(node));
+                tile_num_inter_die_conn[x][y]++;
+            }
+        }
+
+        int num_layers = grid.get_num_layers();
+        for (size_t x = 0; x < grid.width(); x++) {
+            for (size_t y = 0; y < grid.height(); y++) {
+                tile_num_inter_die_conn[x][y] /= (num_layers - 1);
+            }
+        }
+
+        // Step 2: Calculate prefix sum of the inter-die connectivity up to and including the channel at (x, y).
+        acc_tile_num_inter_die_conn_ = vtr::PrefixSum2D<int>(grid.width(),
+                                                             grid.height(),
+                                                             [&](size_t x, size_t y) {
+                                                                 return static_cast<int>(std::round(tile_num_inter_die_conn[x][y]));
+                                                             });
+    }
+}
diff --git a/vpr/src/place/chan_cost_handler.h b/vpr/src/place/chan_cost_handler.h
@@ -0,0 +1,124 @@
+#pragma once
+/**
+ * @file
+ * @author  Alex Singer
+ * @date    December 2025
+ * @brief   Declaration of the channel cost handler class.
+ */
+
+#include "vpr_types.h"
+#include "vtr_prefix_sum.h"
+
+// Forward declarations.
+class DeviceGrid;
+class RRGraphView;
+
+/**
+ * @brief Manager class for computing the cost factors for channels in different
+ *        dimensions.
+ */
+class ChanCostHandler {
+  public:
+    ChanCostHandler() = delete;
+
+    /**
+     * @brief Constructor for the ChanCostHanlder class.
+     *
+     * This will pre-compute prefix sum data structures which will make getting
+     * the x, y, and z chan cost factors more efficient.
+     *
+     * @param rr_chanx_width
+     *      The horizontal channel width distribution across the device grid.
+     * @param rr_chany_width
+     *      The vertical channel width distribution across the device grid.
+     * @param rr_graph
+     *      The Routing Resource Graph of the device.
+     * @param grid
+     *      The device grid.
+     */
+    ChanCostHandler(const std::vector<int>& rr_chanx_width,
+                    const std::vector<int>& rr_chany_width,
+                    const RRGraphView& rr_graph,
+                    const DeviceGrid& grid);
+
+    /**
+     * @brief Computes the inverse of average channel width for horizontal
+     *        channels within a bounding box.
+     *
+     * @tparam BBT This can be either t_bb or t_2D_bb.
+     * @param bb The bounding box for which the inverse of average channel width
+     *           within the bounding box is computed.
+     * @return The inverse of average channel width for horizontal channels.
+     */
+    template<typename BBT>
+    inline double get_chanx_cost_fac(const BBT& bb) const {
+        int total_chanx_width = acc_chanx_width_.get_sum(bb.ymin, bb.ymax);
+        double inverse_average_chanx_width = (bb.ymax - bb.ymin + 1.0) / total_chanx_width;
+        return inverse_average_chanx_width;
+    }
+
+    /**
+     * @brief Computes the inverse of average channel width for vertical
+     *        channels within a bounding box.
+     *
+     * @tparam BBT This can be either t_bb or t_2D_bb.
+     * @param bb The bounding box for which the inverse of average channel width
+     *           within the bounding box is computed.
+     * @return The inverse of average channel width for vertical channels.
+     */
+    template<typename BBT>
+    inline double get_chany_cost_fac(const BBT& bb) const {
+        int total_chany_width = acc_chany_width_.get_sum(bb.xmin, bb.xmax);
+        double inverse_average_chany_width = (bb.xmax - bb.xmin + 1.0) / total_chany_width;
+        return inverse_average_chany_width;
+    }
+
+    /**
+     * @brief Calculate the chanz cost factor based on the inverse of the
+     *        average number of inter-die connections in the given bounding box.
+     *
+     * This cost factor increases the placement cost for blocks that require
+     * inter-layer connections in areas with, on average, fewer inter-die
+     * connections. If inter-die connections are evenly distributed across
+     * tiles, the cost factor will be the same for all bounding boxes, but it
+     * will still weight z-directed vs. x- and y-directed connections appropriately.
+     *
+     * @param bb Bounding box of the net which chanz cost factor is to be calculated
+     * @return ChanZ cost factor
+     */
+    inline double get_chanz_cost_fac(const t_bb& bb) const {
+        int num_inter_dir_conn = acc_tile_num_inter_die_conn_.get_sum(bb.xmin,
+                                                                      bb.ymin,
+                                                                      bb.xmax,
+                                                                      bb.ymax);
+
+        if (num_inter_dir_conn == 0)
+            return 1.0;
+
+        int bb_num_tiles = (bb.xmax - bb.xmin + 1) * (bb.ymax - bb.ymin + 1);
+        return static_cast<double>(bb_num_tiles) / num_inter_dir_conn;
+    }
+
+  private:
+    /**
+     * @brief Matrices below are used to precompute the inverse of the average
+     * number of tracks per channel between [subhigh] and [sublow].  Access
+     * them as chan?_place_cost_fac(subhigh, sublow).  They are used to
+     * speed up the computation of the cost function that takes the length
+     * of the net bounding box in each dimension, divided by the average
+     * number of tracks in that direction; for other cost functions they
+     * will never be used.
+     */
+    vtr::PrefixSum1D<int> acc_chanx_width_; // [0..grid_width-1]
+    vtr::PrefixSum1D<int> acc_chany_width_; // [0..grid_height-1]
+
+    /**
+     * @brief The matrix below is used to calculate a chanz_place_cost_fac based on the average channel width in 
+     * the cross-die-layer direction over a 2D (x,y) region. We don't assume the inter-die connectivity is the same at all (x,y) locations, so we
+     * can't compute the full chanz_place_cost_fac for all possible (xlow,ylow)(xhigh,yhigh) without a 4D array, which would
+     * be too big: O(n^2) in circuit size. Instead we compute a prefix sum that stores the number of inter-die connections per layer from
+     * (x=0,y=0) to (x,y). Given this, we can compute the average number of inter-die connections over a (xlow,ylow) to (xhigh,yhigh) 
+     * region in O(1) (by adding and subtracting 4 entries)
+     */
+    vtr::PrefixSum2D<int> acc_tile_num_inter_die_conn_; // [0..grid_width-1][0..grid_height-1]
+};
diff --git a/vpr/src/place/initial_placement.cpp b/vpr/src/place/initial_placement.cpp
@@ -653,21 +653,23 @@ static t_flat_pl_loc find_centroid_loc_from_flat_placement(const t_pl_macro& pl_
         // and save the closest of all regions.
         t_flat_pl_loc best_projected_pos = centroid;
         float best_distance = std::numeric_limits<float>::max();
-        VTR_ASSERT_MSG(centroid.layer == 0,
-                       "3D FPGAs not supported for this part of the code yet");
         for (const Region& region : head_pr.get_regions()) {
             const vtr::Rect<int>& rect = region.get_rect();
             // Note: We add 0.999 here since the partition region is in grid
             //       space, so it treats tile positions as having size 0x0 when
             //       they really are 1x1.
             float proj_x = std::clamp<float>(centroid.x, rect.xmin(), rect.xmax() + 0.999);
             float proj_y = std::clamp<float>(centroid.y, rect.ymin(), rect.ymax() + 0.999);
+            float proj_layer = std::clamp<float>(centroid.layer, region.get_layer_range().first,
+                                                 region.get_layer_range().second + 0.999);
             float dx = std::abs(proj_x - centroid.x);
             float dy = std::abs(proj_y - centroid.y);
-            float dist = dx + dy;
+            float dlayer = std::abs(proj_layer - centroid.layer);
+            float dist = dx + dy + dlayer;
             if (dist < best_distance) {
                 best_projected_pos.x = proj_x;
                 best_projected_pos.y = proj_y;
+                best_projected_pos.layer = proj_layer;
                 best_distance = dist;
             }
         }