vpr/src/place/timing_place_lookup.cpp - third_party/vtr-verilog-to-routing - Git at Google

 #include <cstdio>
 #include <cstring>
 #include <cmath>
 #include <time.h>
 #include <limits>

 #include "vtr_assert.h"
 #include "vtr_ndmatrix.h"
 #include "vtr_log.h"
 #include "vtr_util.h"
 #include "vtr_math.h"
 #include "vtr_memory.h"
 #include "vtr_time.h"
 #include "vtr_geometry.h"

 #include "arch_util.h"

 #include "vpr_types.h"
 #include "globals.h"
 #include "place_and_route.h"
 #include "route_common.h"
 #include "route_tree_timing.h"
 #include "route_timing.h"
 #include "route_export.h"
 #include "rr_graph.h"
 #include "timing_place_lookup.h"
 #include "read_xml_arch_file.h"
 #include "echo_files.h"
 #include "atom_netlist.h"
 #include "rr_graph2.h"
 #include "place_util.h"
 // all functions in profiling:: namespace, which are only activated if PROFILE is defined
 #include "route_profiling.h"
 #include "router_delay_profiling.h"
 #include "place_delay_model.h"

 /*To compute delay between blocks we calculate the delay between */
 /*different nodes in the FPGA.  From this procedure we generate
  * a lookup table which tells us the delay between different locations in*/
 /*the FPGA */

 /*the delta arrays are used to contain the best case routing delay */
 /*between different locations on the FPGA. */

 //#define VERBOSE

 constexpr float UNINITIALIZED_DELTA = -1;                                  //Indicates the delta delay value has not been calculated
 constexpr float EMPTY_DELTA = -2;                                          //Indicates delta delay from/to an EMPTY block
 constexpr float IMPOSSIBLE_DELTA = std::numeric_limits<float>::infinity(); //Indicates there is no valid delta delay

 struct t_profile_loc {
     t_profile_loc(int x, int y, std::vector<vtr::Point<int>> delta_values)
         : root(x, y)
         , deltas(delta_values) {}

     vtr::Point<int> root;
     std::vector<vtr::Point<int>> deltas;
 };

 struct t_profile_info {
     std::vector<t_profile_loc> locations;

     int max_delta_x;
     int max_delta_y;
 };

 /*** Function Prototypes *****/
 static t_chan_width setup_chan_width(const t_router_opts& router_opts,
                                      t_chan_width_dist chan_width_dist);

 static float route_connection_delay(
     const RouterDelayProfiler& route_profiler,
     int source_x_loc,
     int source_y_loc,
     int sink_x_loc,
     int sink_y_loc,
     const t_router_opts& router_opts,
     bool measure_directconnect);

 static void generic_compute_matrix(
     const RouterDelayProfiler& route_profiler,
     vtr::Matrix<std::vector<float>>& matrix,
     int source_x,
     int source_y,
     int start_x,
     int start_y,
     int end_x,
     int end_y,
     const t_router_opts& router_opts,
     bool measure_directconnect);

 static vtr::Matrix<float> compute_delta_delays(
     const RouterDelayProfiler& route_profiler,
     const t_placer_opts& palcer_opts,
     const t_router_opts& router_opts,
     bool measure_directconnect,
     size_t longest_length);

 float delay_reduce(std::vector<float>& delays, e_reducer reducer);

 static vtr::Matrix<float> compute_delta_delay_model(
     const RouterDelayProfiler& route_profiler,
     const t_placer_opts& placer_opts,
     const t_router_opts& router_opts,
     bool measure_directconnect,
     int longest_length);

 static bool find_direct_connect_sample_locations(const t_direct_inf* direct,
                                                  t_physical_tile_type_ptr from_type,
                                                  int from_pin,
                                                  int from_pin_class,
                                                  t_physical_tile_type_ptr to_type,
                                                  int to_pin,
                                                  int to_pin_class,
                                                  int* src_rr,
                                                  int* sink_rr);

 static bool verify_delta_delays(const vtr::Matrix<float>& delta_delays);

 static int get_longest_segment_length(std::vector<t_segment_inf>& segment_inf);

 static void fix_empty_coordinates(vtr::Matrix<float>& delta_delays);
 static void fix_uninitialized_coordinates(vtr::Matrix<float>& delta_delays);

 static float find_neightboring_average(vtr::Matrix<float>& matrix, int x, int y, int max_distance);

 /******* Globally Accessible Functions **********/

 std::unique_ptr<PlaceDelayModel> compute_place_delay_model(const t_placer_opts& placer_opts,
                                                            const t_router_opts& router_opts,
                                                            t_det_routing_arch* det_routing_arch,
                                                            std::vector<t_segment_inf>& segment_inf,
                                                            t_chan_width_dist chan_width_dist,
                                                            const t_direct_inf* directs,
                                                            const int num_directs) {
     vtr::ScopedStartFinishTimer timer("Computing placement delta delay look-up");

     init_placement_context();

     t_chan_width chan_width = setup_chan_width(router_opts, chan_width_dist);

     alloc_routing_structs(chan_width, router_opts, det_routing_arch, segment_inf,
                           directs, num_directs);

     const RouterLookahead* router_lookahead = get_cached_router_lookahead(
         router_opts.lookahead_type,
         router_opts.write_router_lookahead,
         router_opts.read_router_lookahead,
         segment_inf);
     RouterDelayProfiler route_profiler(router_lookahead);

     int longest_length = get_longest_segment_length(segment_inf);

     /*now setup and compute the actual arrays */
     std::unique_ptr<PlaceDelayModel> place_delay_model;
     if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA) {
         place_delay_model = std::make_unique<DeltaDelayModel>();
     } else if (placer_opts.delay_model_type == PlaceDelayModelType::DELTA_OVERRIDE) {
         place_delay_model = std::make_unique<OverrideDelayModel>();
     } else {
         VTR_ASSERT_MSG(false, "Invalid placer delay model");
     }

     if (placer_opts.read_placement_delay_lookup.empty()) {
         place_delay_model->compute(route_profiler, placer_opts, router_opts, longest_length);
     } else {
         place_delay_model->read(placer_opts.read_placement_delay_lookup);
     }

     if (!placer_opts.write_placement_delay_lookup.empty()) {
         place_delay_model->write(placer_opts.write_placement_delay_lookup);
     }

     /*free all data structures that are no longer needed */
     free_routing_structs();

     return place_delay_model;
 }

 void DeltaDelayModel::compute(
     const RouterDelayProfiler& route_profiler,
     const t_placer_opts& placer_opts,
     const t_router_opts& router_opts,
     int longest_length) {
     router_opts_ = router_opts;
     delays_ = compute_delta_delay_model(
         route_profiler,
         placer_opts, router_opts, /*measure_directconnect=*/true,
         longest_length);
 }

 void OverrideDelayModel::compute(
     const RouterDelayProfiler& route_profiler,
     const t_placer_opts& placer_opts,
     const t_router_opts& router_opts,
     int longest_length) {
     router_opts_ = router_opts;
     auto delays = compute_delta_delay_model(
         route_profiler,
         placer_opts, router_opts, /*measure_directconnect=*/false,
         longest_length);

     base_delay_model_ = std::make_unique<DeltaDelayModel>(delays, router_opts);

     compute_override_delay_model(route_profiler);
 }

 /******* File Accessible Functions **********/

 std::vector<int> get_best_classes(enum e_pin_type pintype, t_physical_tile_type_ptr type) {
     /*
      * This function tries to identify the best pin classes to hook up
      * for delay calculation.  The assumption is that we should pick
      * the pin class with the largest number of pins. This makes
      * sense, since it ensures we pick commonly used pins, and
      * removes order dependence on how the pins are specified
      * in the architecture (except in the case were the two largest pin classes
      * of a particular pintype have the same number of pins, in which case the
      * first pin class is used).
      */

     std::vector<int> best_classes;

     //Collect all classes of matching type which do not have all their pins ignored
     for (int i = 0; i < type->num_class; i++) {
         if (type->class_inf[i].type == pintype) {
             bool all_ignored = true;
             for (int ipin = 0; ipin < type->class_inf[i].num_pins; ++ipin) {
                 int pin = type->class_inf[i].pinlist[ipin];
                 if (!type->is_ignored_pin[pin]) {
                     all_ignored = false;
                     break;
                 }
             }
             if (!all_ignored) {
                 best_classes.push_back(i);
             }
         }
     }

     //Sort classe so largest pin class is first
     auto cmp_class = [&](int lhs, int rhs) {
         return type->class_inf[lhs].num_pins > type->class_inf[rhs].num_pins;
     };

     std::stable_sort(best_classes.begin(), best_classes.end(), cmp_class);

     return best_classes;
 }

 static int get_longest_segment_length(std::vector<t_segment_inf>& segment_inf) {
     int length;

     length = 0;
     for (size_t i = 0; i < segment_inf.size(); i++) {
         if (segment_inf[i].length > length)
             length = segment_inf[i].length;
     }
     return (length);
 }

 static t_chan_width setup_chan_width(const t_router_opts& router_opts,
                                      t_chan_width_dist chan_width_dist) {
     /*we give plenty of tracks, this increases routability for the */
     /*lookup table generation */

     int width_fac;

     if (router_opts.fixed_channel_width == NO_FIXED_CHANNEL_WIDTH) {
         auto& device_ctx = g_vpr_ctx.device();

         auto type = physical_tile_type(find_most_common_block_type(device_ctx.grid));

         width_fac = 4 * type->num_pins;
         /*this is 2x the value that binary search starts */
         /*this should be enough to allow most pins to   */
         /*connect to tracks in the architecture */
     } else {
         width_fac = router_opts.fixed_channel_width;
     }

     return init_chan(width_fac, chan_width_dist);
 }

 static float route_connection_delay(
     const RouterDelayProfiler& route_profiler,
     int source_x,
     int source_y,
     int sink_x,
     int sink_y,
     const t_router_opts& router_opts,
     bool measure_directconnect) {
     //Routes between the source and sink locations and calculates the delay

     float net_delay_value = IMPOSSIBLE_DELTA; /*set to known value for debug purposes */

     auto& device_ctx = g_vpr_ctx.device();

     bool successfully_routed = false;

     //Get the rr nodes to route between
     auto best_driver_ptcs = get_best_classes(DRIVER, device_ctx.grid[source_x][source_y].type);
     auto best_sink_ptcs = get_best_classes(RECEIVER, device_ctx.grid[sink_x][sink_y].type);

     for (int driver_ptc : best_driver_ptcs) {
         VTR_ASSERT(driver_ptc != OPEN);

         int source_rr_node = get_rr_node_index(device_ctx.rr_node_indices, source_x, source_y, SOURCE, driver_ptc);

         VTR_ASSERT(source_rr_node != OPEN);

         for (int sink_ptc : best_sink_ptcs) {
             VTR_ASSERT(sink_ptc != OPEN);

             int sink_rr_node = get_rr_node_index(device_ctx.rr_node_indices, sink_x, sink_y, SINK, sink_ptc);

             VTR_ASSERT(sink_rr_node != OPEN);

             if (!measure_directconnect && directconnect_exists(source_rr_node, sink_rr_node)) {
                 //Skip if we shouldn't measure direct connects and a direct connect exists
                 continue;
             }

             {
                 successfully_routed = route_profiler.calculate_delay(
                     source_rr_node, sink_rr_node,
                     router_opts,
                     &net_delay_value);
             }

             if (successfully_routed) break;
         }
         if (successfully_routed) break;
     }

     if (!successfully_routed) {
         VTR_LOG_WARN("Unable to route between blocks at (%d,%d) and (%d,%d) to characterize delay (setting to %g)\n",
                      source_x, source_y, sink_x, sink_y, net_delay_value);
     }

     return (net_delay_value);
 }

 static void generic_compute_matrix(
     const RouterDelayProfiler& route_profiler,
     vtr::Matrix<std::vector<float>>& matrix,
     int source_x,
     int source_y,
     int start_x,
     int start_y,
     int end_x,
     int end_y,
     const t_router_opts& router_opts,
     bool measure_directconnect) {
     int delta_x, delta_y;
     int sink_x, sink_y;

     auto& device_ctx = g_vpr_ctx.device();

     for (sink_x = start_x; sink_x <= end_x; sink_x++) {
         for (sink_y = start_y; sink_y <= end_y; sink_y++) {
             delta_x = abs(sink_x - source_x);
             delta_y = abs(sink_y - source_y);

             t_physical_tile_type_ptr src_type = device_ctx.grid[source_x][source_y].type;
             t_physical_tile_type_ptr sink_type = device_ctx.grid[sink_x][sink_y].type;

             bool src_or_target_empty = (src_type == device_ctx.EMPTY_TYPE
                                         || sink_type == device_ctx.EMPTY_TYPE);

             if (src_or_target_empty) {
                 if (matrix[delta_x][delta_y].empty()) {
                     //Only set empty target if we don't already have a valid delta delay
                     matrix[delta_x][delta_y].push_back(EMPTY_DELTA);
 #ifdef VERBOSE
                     VTR_LOG("Computed delay: %12s delta: %d,%d (src: %d,%d sink: %d,%d)\n",
                             "EMPTY",
                             delta_x, delta_y,
                             source_x, source_y,
                             sink_x, sink_y);
 #endif
                 }
             } else {
                 //Valid start/end

                 float delay = route_connection_delay(route_profiler, source_x, source_y, sink_x, sink_y, router_opts, measure_directconnect);

 #ifdef VERBOSE
                 VTR_LOG("Computed delay: %12g delta: %d,%d (src: %d,%d sink: %d,%d)\n",
                         delay,
                         delta_x, delta_y,
                         source_x, source_y,
                         sink_x, sink_y);
 #endif
                 if (matrix[delta_x][delta_y].size() == 1 && matrix[delta_x][delta_y][0] == EMPTY_DELTA) {
                     //Overwrite empty delta
                     matrix[delta_x][delta_y][0] = delay;
                 } else {
                     //Collect delta
                     matrix[delta_x][delta_y].push_back(delay);
                 }
             }
         }
     }
 }

 static vtr::Matrix<float> compute_delta_delays(
     const RouterDelayProfiler& route_profiler,
     const t_placer_opts& placer_opts,
     const t_router_opts& router_opts,
     bool measure_directconnect,
     size_t longest_length) {
     //To avoid edge effects we place the source at least 'longest_length' away
     //from the device edge
     //and route from there for all possible delta values < dimension

     auto& device_ctx = g_vpr_ctx.device();
     auto& grid = device_ctx.grid;

     vtr::Matrix<std::vector<float>> sampled_delta_delays({grid.width(), grid.height()});

     size_t mid_x = vtr::nint(grid.width() / 2);
     size_t mid_y = vtr::nint(grid.height() / 2);

     size_t low_x = std::min(longest_length, mid_x);
     size_t low_y = std::min(longest_length, mid_y);
     size_t high_x = std::max(grid.width() - longest_length, mid_x);
     size_t high_y = std::max(grid.height() - longest_length, mid_y);

     //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     //   +                 |                       |               +
     //   +        A        |           B           |       C       +
     //   +                 |                       |               +
     //   +-----------------\-----------------------.---------------+
     //   +                 |                       |               +
     //   +                 |                       |               +
     //   +                 |                       |               +
     //   +                 |                       |               +
     //   +        D        |           E           |       F       +
     //   +                 |                       |               +
     //   +                 |                       |               +
     //   +                 |                       |               +
     //   +                 |                       |               +
     //   +-----------------*-----------------------/---------------+
     //   +                 |                       |               +
     //   +        G        |           H           |       I       +
     //   +                 |                       |               +
     //   +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     //
     //   * = (low_x, low_y)
     //   . = (high_x, high_y)
     //   / = (high_x, low_y)
     //   \ = (low_x, high_y)
     //   + = device edge

     //Find the lowest y location on the left edge with a non-empty block
     size_t y = 0;
     size_t x = 0;
     t_physical_tile_type_ptr src_type = nullptr;
     for (x = 0; x < grid.width(); ++x) {
         for (y = 0; y < grid.height(); ++y) {
             auto type = grid[x][y].type;

             if (type != device_ctx.EMPTY_TYPE) {
                 src_type = type;
                 break;
             }
         }
         if (src_type) {
             break;
         }
     }
     VTR_ASSERT(src_type != nullptr);

 #ifdef VERBOSE
     VTR_LOG("Computing from lower left edge (%d,%d):\n", x, y);
 #endif
     generic_compute_matrix(route_profiler, sampled_delta_delays,
                            x, y,
                            x, y,
                            grid.width() - 1, grid.height() - 1,
                            router_opts,
                            measure_directconnect);

     //Find the lowest x location on the bottom edge with a non-empty block
     src_type = nullptr;
     for (y = 0; y < grid.height(); ++y) {
         for (x = 0; x < grid.width(); ++x) {
             auto type = grid[x][y].type;

             if (type != device_ctx.EMPTY_TYPE) {
                 src_type = type;
                 break;
             }
         }
         if (src_type) {
             break;
         }
     }
     VTR_ASSERT(src_type != nullptr);
 #ifdef VERBOSE
     VTR_LOG("Computing from left bottom edge (%d,%d):\n", x, y);
 #endif
     generic_compute_matrix(route_profiler, sampled_delta_delays,
                            x, y,
                            x, y,
                            grid.width() - 1, grid.height() - 1,
                            router_opts,
                            measure_directconnect);

     //Since the other delta delay values may have suffered from edge effects,
     //we recalculate deltas within regions B, C, E, F
 #ifdef VERBOSE
     VTR_LOG("Computing from low/low:\n");
 #endif
     generic_compute_matrix(route_profiler, sampled_delta_delays,
                            low_x, low_y,
                            low_x, low_y,
                            grid.width() - 1, grid.height() - 1,
                            router_opts,
                            measure_directconnect);

     //Since the other delta delay values may have suffered from edge effects,
     //we recalculate deltas within regions D, E, G, H
 #ifdef VERBOSE
     VTR_LOG("Computing from high/high:\n");
 #endif
     generic_compute_matrix(route_profiler, sampled_delta_delays,
                            high_x, high_y,
                            0, 0,
                            high_x, high_y,
                            router_opts,
                            measure_directconnect);

     //Since the other delta delay values may have suffered from edge effects,
     //we recalculate deltas within regions A, B, D, E
 #ifdef VERBOSE
     VTR_LOG("Computing from high/low:\n");
 #endif
     generic_compute_matrix(route_profiler, sampled_delta_delays,
                            high_x, low_y,
                            0, low_y,
                            high_x, grid.height() - 1,
                            router_opts,
                            measure_directconnect);

     //Since the other delta delay values may have suffered from edge effects,
     //we recalculate deltas within regions E, F, H, I
 #ifdef VERBOSE
     VTR_LOG("Computing from low/high:\n");
 #endif
     generic_compute_matrix(route_profiler, sampled_delta_delays,
                            low_x, high_y,
                            low_x, 0,
                            grid.width() - 1, high_y,
                            router_opts,
                            measure_directconnect);

     vtr::Matrix<float> delta_delays({grid.width(), grid.height()});
     for (size_t dx = 0; dx < sampled_delta_delays.dim_size(0); ++dx) {
         for (size_t dy = 0; dy < sampled_delta_delays.dim_size(1); ++dy) {
             delta_delays[dx][dy] = delay_reduce(sampled_delta_delays[dx][dy], placer_opts.delay_model_reducer);
         }
     }

     return delta_delays;
 }

 float delay_reduce(std::vector<float>& delays, e_reducer reducer) {
     if (delays.size() == 0) {
         return IMPOSSIBLE_DELTA;
     } else if (delays.size() == 1) {
         return delays[0];
     }

     VTR_ASSERT(delays.size() > 1);

     float delay;

     if (reducer == e_reducer::MIN) {
         auto itr = std::min_element(delays.begin(), delays.end());
         delay = *itr;
     } else if (reducer == e_reducer::MAX) {
         auto itr = std::max_element(delays.begin(), delays.end());
         delay = *itr;
     } else if (reducer == e_reducer::MEDIAN) {
         std::sort(delays.begin(), delays.end());
         delay = vtr::median(delays.begin(), delays.end());
     } else if (reducer == e_reducer::ARITHMEAN) {
         delay = vtr::arithmean(delays.begin(), delays.end());
     } else if (reducer == e_reducer::GEOMEAN) {
         delay = vtr::geomean(delays.begin(), delays.end());
     } else {
         VPR_FATAL_ERROR(VPR_ERROR_PLACE, "Unrecognized delta delay reducer");
     }

     return delay;
 }

 /* We return the average placement estimated delay for a routing spanning (x,y).
  * We start with an averaging distance of 1 (i.e. from (x-1,y-1) to (x+1,y+1))
  * and look for legal delay values to average; if some are found we return the
  * average and if none are found we increase the distance to average over.
  *
  * If no legal values are found to average over with a range of max_distance,
  * we return IMPOSSIBLE_DELTA.
  */
 static float find_neightboring_average(
     vtr::Matrix<float>& matrix,
     int x,
     int y,
     int max_distance) {
     float sum = 0;
     int counter = 0;
     int endx = matrix.end_index(0);
     int endy = matrix.end_index(1);

     int delx, dely;

     for (int distance = 1; distance <= max_distance; ++distance) {
         for (delx = x - distance; delx <= x + distance; delx++) {
             for (dely = y - distance; dely <= y + distance; dely++) {
                 // Check distance constraint
                 if (abs(delx - x) + abs(dely - y) > distance) {
                     continue;
                 }

                 //check out of bounds
                 if (delx < 0 || dely < 0 || delx >= endx || dely >= endy || (delx == x && dely == y)) {
                     continue;
                 }

                 if (matrix[delx][dely] == EMPTY_DELTA || matrix[delx][dely] == IMPOSSIBLE_DELTA) {
                     continue;
                 }
                 counter++;
                 sum += matrix[delx][dely];
             }
         }
         if (counter != 0) {
             return sum / (float)counter;
         }
     }

     return IMPOSSIBLE_DELTA;
 }

 static void fix_empty_coordinates(vtr::Matrix<float>& delta_delays) {
     // Set any empty delta's to the average of it's neighbours
     //
     // Empty coordinates may occur if the sampling location happens to not have
     // a connection at that location.  However a more through sampling likely
     // would return a result, so we fill in the empty holes with a small
     // neighbour average.
     constexpr int kMaxAverageDistance = 2;
     for (size_t delta_x = 0; delta_x < delta_delays.dim_size(0); ++delta_x) {
         for (size_t delta_y = 0; delta_y < delta_delays.dim_size(1); ++delta_y) {
             if (delta_delays[delta_x][delta_y] == EMPTY_DELTA) {
                 delta_delays[delta_x][delta_y] = find_neightboring_average(delta_delays, delta_x, delta_y, kMaxAverageDistance);
             }
         }
     }
 }

 static void fix_uninitialized_coordinates(vtr::Matrix<float>& delta_delays) {
     // Set any empty delta's to the average of it's neighbours
     for (size_t delta_x = 0; delta_x < delta_delays.dim_size(0); ++delta_x) {
         for (size_t delta_y = 0; delta_y < delta_delays.dim_size(1); ++delta_y) {
             if (delta_delays[delta_x][delta_y] == UNINITIALIZED_DELTA) {
                 delta_delays[delta_x][delta_y] = IMPOSSIBLE_DELTA;
             }
         }
     }
 }

 static void fill_impossible_coordinates(vtr::Matrix<float>& delta_delays) {
     // Set any impossible delta's to the average of it's neighbours
     //
     // Impossible coordinates may occur if an IPIN cannot be reached from the
     // sampling OPIN.  This might occur if the IPIN or OPIN used for sampling
     // is specialized, and therefore cannot be reached via the by the pins
     // sampled.  Leaving this value in the delay matrix will result in invalid
     // slacks if the delay matrix uses this value.
     //
     // A max average distance of 5 is used to provide increased effort in
     // filling these gaps.  It is more important to have a poor predication,
     // than a invalid value and causing a slack assertion.
     constexpr int kMaxAverageDistance = 5;
     for (size_t delta_x = 0; delta_x < delta_delays.dim_size(0); ++delta_x) {
         for (size_t delta_y = 0; delta_y < delta_delays.dim_size(1); ++delta_y) {
             if (delta_delays[delta_x][delta_y] == IMPOSSIBLE_DELTA) {
                 delta_delays[delta_x][delta_y] = find_neightboring_average(
                     delta_delays, delta_x, delta_y, kMaxAverageDistance);
             }
         }
     }
 }

 static vtr::Matrix<float> compute_delta_delay_model(
     const RouterDelayProfiler& route_profiler,
     const t_placer_opts& placer_opts,
     const t_router_opts& router_opts,
     bool measure_directconnect,
     int longest_length) {
     vtr::ScopedStartFinishTimer timer("Computing delta delays");
     vtr::Matrix<float> delta_delays = compute_delta_delays(route_profiler,
                                                            placer_opts, router_opts, measure_directconnect, longest_length);

     fix_uninitialized_coordinates(delta_delays);

     fix_empty_coordinates(delta_delays);

     fill_impossible_coordinates(delta_delays);

     verify_delta_delays(delta_delays);

     return delta_delays;
 }

 //Finds a src_rr and sink_rr appropriate for measuring the delay of the current direct specification
 static bool find_direct_connect_sample_locations(const t_direct_inf* direct,
                                                  t_physical_tile_type_ptr from_type,
                                                  int from_pin,
                                                  int from_pin_class,
                                                  t_physical_tile_type_ptr to_type,
                                                  int to_pin,
                                                  int to_pin_class,
                                                  int* src_rr,
                                                  int* sink_rr) {
     VTR_ASSERT(from_type != nullptr);
     VTR_ASSERT(to_type != nullptr);

     auto& device_ctx = g_vpr_ctx.device();
     auto& grid = device_ctx.grid;

     //Search the grid for an instance of from/to blocks which satisfy this direct connect offsets,
     //and which has the appropriate pins
     int from_x = 0, from_y = 0, from_z = 0;
     int to_x = 0, to_y = 0, to_z = 0;
     bool found = false;
     for (from_x = 0; from_x < (int)grid.width(); ++from_x) {
         to_x = from_x + direct->x_offset;
         if (to_x < 0 || to_x >= (int)grid.width()) continue;

         for (from_y = 0; from_y < (int)grid.height(); ++from_y) {
             if (grid[from_x][from_y].type != from_type) continue;

             //Check that the from pin exists at this from location
             //(with multi-width/height blocks pins may not exist at all locations)
             bool from_pin_found = false;
             if (direct->from_side != NUM_SIDES) {
                 int from_pin_rr = get_rr_node_index(device_ctx.rr_node_indices, from_x, from_y, OPIN, from_pin, direct->from_side);
                 from_pin_found = (from_pin_rr != OPEN);
             } else {
                 std::vector<int> from_pin_rrs = get_rr_node_indices(device_ctx.rr_node_indices, from_x, from_y, OPIN, from_pin);
                 from_pin_found = !from_pin_rrs.empty();
             }
             if (!from_pin_found) continue;

             to_y = from_y + direct->y_offset;

             if (to_y < 0 || to_y >= (int)grid.height()) continue;
             if (grid[to_x][to_y].type != to_type) continue;

             //Check that the from pin exists at this from location
             //(with multi-width/height blocks pins may not exist at all locations)
             bool to_pin_found = false;
             if (direct->to_side != NUM_SIDES) {
                 int to_pin_rr = get_rr_node_index(device_ctx.rr_node_indices, to_x, to_y, IPIN, to_pin, direct->to_side);
                 to_pin_found = (to_pin_rr != OPEN);
             } else {
                 std::vector<int> to_pin_rrs = get_rr_node_indices(device_ctx.rr_node_indices, to_x, to_y, IPIN, to_pin);
                 to_pin_found = !to_pin_rrs.empty();
             }
             if (!to_pin_found) continue;

             for (from_z = 0; from_z < from_type->capacity; ++from_z) {
                 to_z = from_z + direct->z_offset;

                 if (to_z < 0 || to_z >= to_type->capacity) continue;

                 found = true;
                 break;
             }
             if (found) break;
         }
         if (found) break;
     }

     if (!found) {
         return false;
     }

     //Now have a legal instance of this direct connect
     VTR_ASSERT(grid[from_x][from_y].type == from_type);
     VTR_ASSERT(from_z < from_type->capacity);

     VTR_ASSERT(grid[to_x][to_y].type == to_type);
     VTR_ASSERT(to_z < to_type->capacity);

     VTR_ASSERT(from_x + direct->x_offset == to_x);
     VTR_ASSERT(from_y + direct->y_offset == to_y);
     VTR_ASSERT(from_z + direct->z_offset == to_z);

     //
     //Find a source/sink RR node associated with the pins of the direct
     //

     auto source_rr_nodes = get_rr_node_indices(device_ctx.rr_node_indices, from_x, from_y, SOURCE, from_pin_class);
     VTR_ASSERT(source_rr_nodes.size() > 0);

     auto sink_rr_nodes = get_rr_node_indices(device_ctx.rr_node_indices, to_x, to_y, SINK, to_pin_class);
     VTR_ASSERT(sink_rr_nodes.size() > 0);

     *src_rr = source_rr_nodes[0];
     *sink_rr = sink_rr_nodes[0];

     return true;
 }

 static bool verify_delta_delays(const vtr::Matrix<float>& delta_delays) {
     auto& device_ctx = g_vpr_ctx.device();
     auto& grid = device_ctx.grid;

     for (size_t x = 0; x < grid.width(); ++x) {
         for (size_t y = 0; y < grid.height(); ++y) {
             float delta_delay = delta_delays[x][y];

             if (delta_delay < 0.) {
                 VPR_ERROR(VPR_ERROR_PLACE,
                           "Found invaild negative delay %g for delta (%d,%d)",
                           delta_delay, x, y);
             }
         }
     }

     return true;
 }

 void OverrideDelayModel::compute_override_delay_model(const RouterDelayProfiler& route_profiler) {
     t_router_opts router_opts2 = router_opts_;
     router_opts2.astar_fac = 0.;

     //Look at all the direct connections that exist, and add overrides to delay model
     auto& device_ctx = g_vpr_ctx.device();
     for (int idirect = 0; idirect < device_ctx.arch->num_directs; ++idirect) {
         const t_direct_inf* direct = &device_ctx.arch->Directs[idirect];

         InstPort from_port = parse_inst_port(direct->from_pin);
         InstPort to_port = parse_inst_port(direct->to_pin);

         t_physical_tile_type_ptr from_type = find_block_type_by_name(from_port.instance_name(), device_ctx.physical_tile_types);
         t_physical_tile_type_ptr to_type = find_block_type_by_name(to_port.instance_name(), device_ctx.physical_tile_types);

         int num_conns = from_port.port_high_index() - from_port.port_low_index() + 1;
         VTR_ASSERT_MSG(num_conns == to_port.port_high_index() - to_port.port_low_index() + 1, "Directs must have the same size to/from");

         //We now walk through all the connections associated with the current direct specification, measure
         //their delay and specify that value as an override in the delay model.
         //
         //Note that we need to check every connection in the direct to cover the case where the pins are not
         //equivalent.
         //
         //However, if the from/to ports are equivalent we could end up sampling the same RR SOURCE/SINK
         //paths multiple times (wasting CPU time) -- we avoid this by recording the sampled paths in
         //sampled_rr_pairs and skipping them if they occur multiple times.
         int missing_instances = 0;
         int missing_paths = 0;
         std::set<std::pair<int, int>> sampled_rr_pairs;
         for (int iconn = 0; iconn < num_conns; ++iconn) {
             //Find the associated pins
             int from_pin = find_pin(logical_block_type(from_type), from_port.port_name(), from_port.port_low_index() + iconn);
             int to_pin = find_pin(logical_block_type(to_type), to_port.port_name(), to_port.port_low_index() + iconn);

             VTR_ASSERT(from_pin != OPEN);
             VTR_ASSERT(to_pin != OPEN);

             int from_pin_class = find_pin_class(logical_block_type(from_type), from_port.port_name(), from_port.port_low_index() + iconn, DRIVER);
             VTR_ASSERT(from_pin_class != OPEN);

             int to_pin_class = find_pin_class(logical_block_type(to_type), to_port.port_name(), to_port.port_low_index() + iconn, RECEIVER);
             VTR_ASSERT(to_pin_class != OPEN);

             int src_rr = OPEN;
             int sink_rr = OPEN;
             bool found_sample_points = find_direct_connect_sample_locations(direct, from_type, from_pin, from_pin_class, to_type, to_pin, to_pin_class, &src_rr, &sink_rr);

             if (!found_sample_points) {
                 ++missing_instances;
                 continue;
             }

             //If some of the source/sink ports are logically equivalent we may have already
             //sampled the associated source/sink pair and don't need to do so again
             if (sampled_rr_pairs.count({src_rr, sink_rr})) continue;

             VTR_ASSERT(src_rr != OPEN);
             VTR_ASSERT(sink_rr != OPEN);

             float direct_connect_delay = std::numeric_limits<float>::quiet_NaN();
             bool found_routing_path = route_profiler.calculate_delay(src_rr, sink_rr, router_opts2, &direct_connect_delay);

             if (found_routing_path) {
                 set_delay_override(from_type->index, from_pin_class, to_type->index, to_pin_class, direct->x_offset, direct->y_offset, direct_connect_delay);
             } else {
                 ++missing_paths;
             }

             //Record that we've sampled this pair of source and sink nodes
             sampled_rr_pairs.insert({src_rr, sink_rr});
         }

         VTR_LOGV_WARN(missing_instances > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no instances of this direct found)\n", missing_instances, direct->name);
         VTR_LOGV_WARN(missing_paths > 0, "Found no delta delay for %d bits of inter-block direct connect '%s' (no routing path found)\n", missing_paths, direct->name);
     }
 }

 bool directconnect_exists(int src_rr_node, int sink_rr_node) {
     //Returns true if there is a directconnect between the two RR nodes
     //
     //This is checked by looking for a SOURCE -> OPIN -> IPIN -> SINK path
     //which starts at src_rr_node and ends at sink_rr_node
     auto& device_ctx = g_vpr_ctx.device();
     auto& rr_nodes = device_ctx.rr_nodes;

     VTR_ASSERT(rr_nodes[src_rr_node].type() == SOURCE && rr_nodes[sink_rr_node].type() == SINK);

     //TODO: This is a constant depth search, but still may be too slow
     for (t_edge_size i_src_edge = 0; i_src_edge < rr_nodes[src_rr_node].num_edges(); ++i_src_edge) {
         int opin_rr_node = rr_nodes[src_rr_node].edge_sink_node(i_src_edge);

         if (rr_nodes[opin_rr_node].type() != OPIN) continue;

         for (t_edge_size i_opin_edge = 0; i_opin_edge < rr_nodes[opin_rr_node].num_edges(); ++i_opin_edge) {
             int ipin_rr_node = rr_nodes[opin_rr_node].edge_sink_node(i_opin_edge);

             if (rr_nodes[ipin_rr_node].type() != IPIN) continue;

             for (t_edge_size i_ipin_edge = 0; i_ipin_edge < rr_nodes[ipin_rr_node].num_edges(); ++i_ipin_edge) {
                 if (sink_rr_node == rr_nodes[ipin_rr_node].edge_sink_node(i_ipin_edge)) {
                     return true;
                 }
             }
         }
     }
     return false;
 }