Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
246 changes: 191 additions & 55 deletions vpr/src/analytical_place/analytical_solver.cpp

Large diffs are not rendered by default.

61 changes: 52 additions & 9 deletions vpr/src/analytical_place/analytical_solver.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
#include <memory>
#include "ap_flow_enums.h"
#include "ap_netlist.h"
#include "chan_cost_handler.h"
#include "device_grid.h"
#include "globals.h"
#include "place_delay_model.h"
#include "vtr_strong_id.h"
#include "vtr_vector.h"
Expand Down Expand Up @@ -545,7 +547,11 @@ class B2BSolver : public AnalyticalSolver {
ap_timing_tradeoff,
log_verbosity)
, pre_cluster_timing_manager_(pre_cluster_timing_manager)
, place_delay_model_(place_delay_model) {}
, place_delay_model_(place_delay_model)
, chan_cost_handler_(g_vpr_ctx.device().rr_chanx_width,
g_vpr_ctx.device().rr_chany_width,
g_vpr_ctx.device().rr_graph,
device_grid) {}

/**
* @brief Perform an iteration of the B2B solver, storing the result into
Expand Down Expand Up @@ -699,15 +705,41 @@ class B2BSolver : public AnalyticalSolver {
void update_linear_system_with_anchors(unsigned iteration);

/**
* @brief Store the x and y solutions in Eigen's vectors into the partial
* placement object.
*
* Note: The x_soln and y_soln may be modified if it is found that the
* solution is impossible (i.e. has negative positions).
* @brief Solves the linear system of equations using the connectivity
* matrix (A), the constant vector (b), and a guess for the solution.
*/
void store_solution_into_placement(Eigen::VectorXd& x_soln,
Eigen::VectorXd& y_soln,
PartialPlacement& p_placement);
Eigen::VectorXd solve_linear_system(Eigen::SparseMatrix<double>& A,
Eigen::VectorXd& b,
Eigen::VectorXd& guess);

/**
* @brief Store the solutions from the linear system into the partial
* placement object for the given dimension.
*
* Note: The dim_soln may be modified if it is found that the solution is
* impossible (e.g. has negative positions).
*
* @param dim_soln
* The solution of the linear system for a given dimension.
* @param block_dim_locs
* The block locations in the partial placement for the dimension.
* @param dim_max_pos
* The maximum position allowed for the dimension. For example, for the
* x-dimension, this would be the width of the device. This is used to
* ensure that the positions do not go off device.
*/
void store_solution_into_placement(Eigen::VectorXd& dim_soln,
vtr::vector<APBlockId, double>& block_dim_locs,
double dim_max_pos);

/**
* @brief Does the FPGA that the AP flow is currently targeting have more
* than one die. Having multiple dies would imply that the solver
* needs to add another dimension to solve for.
*/
inline bool is_multi_die() const {
return device_grid_num_layers_ > 1;
}

// The following are variables used to store the system of equations to be
// solved in the x and y dimensions. The equations are of the form:
Expand All @@ -720,22 +752,30 @@ class B2BSolver : public AnalyticalSolver {
Eigen::SparseMatrix<double> A_sparse_x;
/// @brief The coefficient / connectivity matrix for the y dimension.
Eigen::SparseMatrix<double> A_sparse_y;
/// @brief The coefficient / connectivity matrix for the z dimension (layer dimension).
Eigen::SparseMatrix<double> A_sparse_z;
/// @brief The constant vector in the x dimension.
Eigen::VectorXd b_x;
/// @brief The constant vector in the y dimension.
Eigen::VectorXd b_y;
/// @brief The constant vector in the z dimension (layer dimension).
Eigen::VectorXd b_z;

// The following is the solution of the previous iteration of this solver.
// They are updated at the end of solve() and are used as the starting point
// for the next call to solve.
vtr::vector<APBlockId, double> block_x_locs_solved;
vtr::vector<APBlockId, double> block_y_locs_solved;
// NOTE: For speed, this vector is unused if a device has only one die.
vtr::vector<APBlockId, double> block_z_locs_solved;

// The following are the legalized solution coming into the analytical solver
// (other than the first iteration). These are stored to be used as anchor
// blocks during the solver.
vtr::vector<APBlockId, double> block_x_locs_legalized;
vtr::vector<APBlockId, double> block_y_locs_legalized;
// NOTE: For speed, this vector is unused if a device has only one die.
vtr::vector<APBlockId, double> block_z_locs_legalized;

/// @brief The total number of CG iterations that this solver has performed
/// so far. This can be a useful metric for the amount of work the
Expand All @@ -760,6 +800,9 @@ class B2BSolver : public AnalyticalSolver {
/// @brief The place delay model used for calculating the delay between
/// two tiles on the FPGA. Used for computing the timing terms.
std::shared_ptr<PlaceDelayModel> place_delay_model_;

/// @brief Manager class for getting the cost factors in the x, y, and z dimensions.
ChanCostHandler chan_cost_handler_;
};

#endif // EIGEN_INSTALLED
91 changes: 91 additions & 0 deletions vpr/src/place/chan_cost_handler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/**
* @file
* @author Alex Singer
* @date December 2025
* @brief The definition of the channel cost handler class.
*/

#include "chan_cost_handler.h"
#include <cmath>
#include "rr_graph_view.h"

ChanCostHandler::ChanCostHandler(const std::vector<int>& rr_chanx_width,
const std::vector<int>& rr_chany_width,
const RRGraphView& rr_graph,
const DeviceGrid& grid) {

// These arrays contain accumulative channel width between channel zero and
// the channel specified by the given index. The accumulated channel width
// is inclusive, meaning that it includes both channel zero and channel `idx`.
// To compute the total channel width between channels 'low' and 'high', use the
// following formula:
// acc_chan?_width_[high] - acc_chan?_width_[low - 1]
// This returns the total number of tracks between channels 'low' and 'high',
// including tracks in these channels.
acc_chanx_width_ = vtr::PrefixSum1D<int>(grid.height(), [&](size_t y) noexcept {
int chan_x_width = rr_chanx_width[y];

// If the number of tracks in a channel is zero, two consecutive elements take the same
// value. This can lead to a division by zero in get_chanxy_cost_fac_(). To avoid this
// potential issue, we assume that the channel width is at least 1.
if (chan_x_width == 0) {
return 1;
}

return chan_x_width;
});

acc_chany_width_ = vtr::PrefixSum1D<int>(grid.width(), [&](size_t x) noexcept {
int chan_y_width = rr_chany_width[x];

// to avoid a division by zero
if (chan_y_width == 0) {
return 1;
}

return chan_y_width;
});

// If this is a multi-layer (3D) architecture, compute Z-channel cost term.
if (grid.get_num_layers() > 1) {
vtr::NdMatrix<float, 2> tile_num_inter_die_conn({grid.width(), grid.height()}, 0.);

/*
* Step 1: iterate over the rr-graph, recording how many edges go between layers at each (x,y) location
* in the device. We count all these edges, regardless of which layers they connect. Then we divide by
* the number of layers - 1 to get the average cross-layer edge count per (x,y) location -- this mirrors
* what we do for the horizontal and vertical channels where we assume the channel width doesn't change
* along the length of the channel. It lets us be more memory-efficient for 3D devices, and could be revisited
* if someday we have architectures with widely varying connectivity between different layers in a stack.
*/

/* To calculate the accumulative number of inter-die connections we first need to get the number of
* inter-die connection per location. To be able to work for the cases that RR Graph is read instead
* of being made from the architecture file, we calculate this number by iterating over the RR graph. Once
* tile_num_inter_die_conn is populated, we can start populating acc_tile_num_inter_die_conn_.
*/

for (const RRNodeId node : rr_graph.nodes()) {
if (rr_graph.node_type(node) == e_rr_type::CHANZ) {
int x = rr_graph.node_xlow(node);
int y = rr_graph.node_ylow(node);
VTR_ASSERT_SAFE(x == rr_graph.node_xhigh(node) && y == rr_graph.node_yhigh(node));
tile_num_inter_die_conn[x][y]++;
}
}

int num_layers = grid.get_num_layers();
for (size_t x = 0; x < grid.width(); x++) {
for (size_t y = 0; y < grid.height(); y++) {
tile_num_inter_die_conn[x][y] /= (num_layers - 1);
}
}

// Step 2: Calculate prefix sum of the inter-die connectivity up to and including the channel at (x, y).
acc_tile_num_inter_die_conn_ = vtr::PrefixSum2D<int>(grid.width(),
grid.height(),
[&](size_t x, size_t y) {
return static_cast<int>(std::round(tile_num_inter_die_conn[x][y]));
});
}
}
124 changes: 124 additions & 0 deletions vpr/src/place/chan_cost_handler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#pragma once
/**
* @file
* @author Alex Singer
* @date December 2025
* @brief Declaration of the channel cost handler class.
*/

#include "vpr_types.h"
#include "vtr_prefix_sum.h"

// Forward declarations.
class DeviceGrid;
class RRGraphView;

/**
* @brief Manager class for computing the cost factors for channels in different
* dimensions.
*/
class ChanCostHandler {
public:
ChanCostHandler() = delete;

/**
* @brief Constructor for the ChanCostHanlder class.
*
* This will pre-compute prefix sum data structures which will make getting
* the x, y, and z chan cost factors more efficient.
*
* @param rr_chanx_width
* The horizontal channel width distribution across the device grid.
* @param rr_chany_width
* The vertical channel width distribution across the device grid.
* @param rr_graph
* The Routing Resource Graph of the device.
* @param grid
* The device grid.
*/
ChanCostHandler(const std::vector<int>& rr_chanx_width,
const std::vector<int>& rr_chany_width,
const RRGraphView& rr_graph,
const DeviceGrid& grid);

/**
* @brief Computes the inverse of average channel width for horizontal
* channels within a bounding box.
*
* @tparam BBT This can be either t_bb or t_2D_bb.
* @param bb The bounding box for which the inverse of average channel width
* within the bounding box is computed.
* @return The inverse of average channel width for horizontal channels.
*/
template<typename BBT>
inline double get_chanx_cost_fac(const BBT& bb) const {
int total_chanx_width = acc_chanx_width_.get_sum(bb.ymin, bb.ymax);
double inverse_average_chanx_width = (bb.ymax - bb.ymin + 1.0) / total_chanx_width;
return inverse_average_chanx_width;
}

/**
* @brief Computes the inverse of average channel width for vertical
* channels within a bounding box.
*
* @tparam BBT This can be either t_bb or t_2D_bb.
* @param bb The bounding box for which the inverse of average channel width
* within the bounding box is computed.
* @return The inverse of average channel width for vertical channels.
*/
template<typename BBT>
inline double get_chany_cost_fac(const BBT& bb) const {
int total_chany_width = acc_chany_width_.get_sum(bb.xmin, bb.xmax);
double inverse_average_chany_width = (bb.xmax - bb.xmin + 1.0) / total_chany_width;
return inverse_average_chany_width;
}

/**
* @brief Calculate the chanz cost factor based on the inverse of the
* average number of inter-die connections in the given bounding box.
*
* This cost factor increases the placement cost for blocks that require
* inter-layer connections in areas with, on average, fewer inter-die
* connections. If inter-die connections are evenly distributed across
* tiles, the cost factor will be the same for all bounding boxes, but it
* will still weight z-directed vs. x- and y-directed connections appropriately.
*
* @param bb Bounding box of the net which chanz cost factor is to be calculated
* @return ChanZ cost factor
*/
inline double get_chanz_cost_fac(const t_bb& bb) const {
int num_inter_dir_conn = acc_tile_num_inter_die_conn_.get_sum(bb.xmin,
bb.ymin,
bb.xmax,
bb.ymax);

if (num_inter_dir_conn == 0)
return 1.0;

int bb_num_tiles = (bb.xmax - bb.xmin + 1) * (bb.ymax - bb.ymin + 1);
return static_cast<double>(bb_num_tiles) / num_inter_dir_conn;
}

private:
/**
* @brief Matrices below are used to precompute the inverse of the average
* number of tracks per channel between [subhigh] and [sublow]. Access
* them as chan?_place_cost_fac(subhigh, sublow). They are used to
* speed up the computation of the cost function that takes the length
* of the net bounding box in each dimension, divided by the average
* number of tracks in that direction; for other cost functions they
* will never be used.
*/
vtr::PrefixSum1D<int> acc_chanx_width_; // [0..grid_width-1]
vtr::PrefixSum1D<int> acc_chany_width_; // [0..grid_height-1]

/**
* @brief The matrix below is used to calculate a chanz_place_cost_fac based on the average channel width in
* the cross-die-layer direction over a 2D (x,y) region. We don't assume the inter-die connectivity is the same at all (x,y) locations, so we
* can't compute the full chanz_place_cost_fac for all possible (xlow,ylow)(xhigh,yhigh) without a 4D array, which would
* be too big: O(n^2) in circuit size. Instead we compute a prefix sum that stores the number of inter-die connections per layer from
* (x=0,y=0) to (x,y). Given this, we can compute the average number of inter-die connections over a (xlow,ylow) to (xhigh,yhigh)
* region in O(1) (by adding and subtracting 4 entries)
*/
vtr::PrefixSum2D<int> acc_tile_num_inter_die_conn_; // [0..grid_width-1][0..grid_height-1]
};
8 changes: 5 additions & 3 deletions vpr/src/place/initial_placement.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -653,21 +653,23 @@ static t_flat_pl_loc find_centroid_loc_from_flat_placement(const t_pl_macro& pl_
// and save the closest of all regions.
t_flat_pl_loc best_projected_pos = centroid;
float best_distance = std::numeric_limits<float>::max();
VTR_ASSERT_MSG(centroid.layer == 0,
"3D FPGAs not supported for this part of the code yet");
for (const Region& region : head_pr.get_regions()) {
const vtr::Rect<int>& rect = region.get_rect();
// Note: We add 0.999 here since the partition region is in grid
// space, so it treats tile positions as having size 0x0 when
// they really are 1x1.
float proj_x = std::clamp<float>(centroid.x, rect.xmin(), rect.xmax() + 0.999);
float proj_y = std::clamp<float>(centroid.y, rect.ymin(), rect.ymax() + 0.999);
float proj_layer = std::clamp<float>(centroid.layer, region.get_layer_range().first,
region.get_layer_range().second + 0.999);
float dx = std::abs(proj_x - centroid.x);
float dy = std::abs(proj_y - centroid.y);
float dist = dx + dy;
float dlayer = std::abs(proj_layer - centroid.layer);
float dist = dx + dy + dlayer;
if (dist < best_distance) {
best_projected_pos.x = proj_x;
best_projected_pos.y = proj_y;
best_projected_pos.layer = proj_layer;
best_distance = dist;
}
}
Expand Down
Loading