| #include <cmath> |
| |
| #include "vtr_assert.h" |
| #include "vtr_log.h" |
| #include "vtr_math.h" |
| #include "vtr_memory.h" |
| |
| #include "vpr_types.h" |
| #include "vpr_error.h" |
| |
| #include "globals.h" |
| #include "rr_graph.h" |
| #include "rr_graph_util.h" |
| #include "rr_graph_area.h" |
| |
/* Select which transistor area equation to use. As shown in Chiasson and Betz's FPL 2013 paper
 * (Should FPGAs Abandon the Pass Gate?), the traditional transistor area model
 * significantly overpredicts area at smaller process nodes. Their improved area models
 * were derived from TSMC's 65nm layout rules and scaled down to 22nm. */
| enum e_trans_area_eq { AREA_ORIGINAL, |
| AREA_IMPROVED_NMOS_ONLY, /* only NMOS transistors taken into account */ |
| AREA_IMPROVED_MIXED /* both NMOS and PMOS. extra spacing required for N-wells */ |
| }; |
| static const e_trans_area_eq trans_area_eq = AREA_IMPROVED_NMOS_ONLY; |
| |
| /************************ Subroutines local to this module *******************/ |
| |
| static void count_bidir_routing_transistors(int num_switch, int wire_to_ipin_switch, float R_minW_nmos, float R_minW_pmos, const float trans_sram_bit); |
| |
| static void count_unidir_routing_transistors(std::vector<t_segment_inf>& segment_inf, |
| int wire_to_ipin_switch, |
| float R_minW_nmos, |
| float R_minW_pmos, |
| const float trans_sram_bit); |
| |
| static float get_cblock_trans(int* num_inputs_to_cblock, int wire_to_ipin_switch, int max_inputs_to_cblock, float trans_sram_bit); |
| |
| static float* alloc_and_load_unsharable_switch_trans(int num_switch, |
| float trans_sram_bit, |
| float R_minW_nmos); |
| |
| static float* alloc_and_load_sharable_switch_trans(int num_switch, |
| float R_minW_nmos, |
| float R_minW_pmos); |
| |
| static float trans_per_mux(int num_inputs, float trans_sram_bit, float pass_trans_area); |
| |
| static float trans_per_R(float Rtrans, float R_minW_trans); |
| |
| /*************************** Subroutine definitions **************************/ |
| |
| void count_routing_transistors(enum e_directionality directionality, |
| int num_switch, |
| int wire_to_ipin_switch, |
| std::vector<t_segment_inf>& segment_inf, |
| float R_minW_nmos, |
| float R_minW_pmos) { |
| /* Counts how many transistors are needed to implement the FPGA routing * |
| * resources. Call this only when an rr_graph exists. It does not count * |
| * the transistors used in logic blocks, but it counts the transistors in * |
| * the input connection block multiplexers and in the output pin drivers and * |
| * pass transistors. NB: this routine assumes pass transistors always * |
| * generate two edges (one forward, one backward) between two nodes. * |
| * Physically, this is what happens -- make sure your rr_graph does it. * |
| * * |
     * I assume a minimum width transistor takes 1 unit of area. A double-width *
     * transistor takes twice the diffusion width but the same spacing, so I    *
     * assume it takes 1.5x the area of a minimum-width transistor. */
| |
| /* Area per SRAM cell (in minimum-width transistor areas) */ |
| const float trans_sram_bit = 4.; |
| |
| if (directionality == BI_DIRECTIONAL) { |
| count_bidir_routing_transistors(num_switch, wire_to_ipin_switch, R_minW_nmos, R_minW_pmos, trans_sram_bit); |
| } else { |
| VTR_ASSERT(directionality == UNI_DIRECTIONAL); |
| count_unidir_routing_transistors(segment_inf, wire_to_ipin_switch, R_minW_nmos, R_minW_pmos, trans_sram_bit); |
| } |
| } |
| |
| void count_bidir_routing_transistors(int num_switch, int wire_to_ipin_switch, float R_minW_nmos, float R_minW_pmos, const float trans_sram_bit) { |
| /* Tri-state buffers are designed as a buffer followed by a pass transistor. * |
     * I make Rbuffer = Rpass_transistor = 1/2 Rtri-state_buffer.                *
| * I make the pull-up and pull-down sides of the buffer the same strength -- * |
| * i.e. I make the p transistor R_minW_pmos / R_minW_nmos wider than the n * |
| * transistor. * |
| * * |
| * I generate two area numbers in this routine: ntrans_sharing and * |
| * ntrans_no_sharing. ntrans_sharing exactly reflects what the timing * |
     * analyzer, etc. works with -- each switch is a completely self-contained  *
     * pass transistor or tri-state buffer. In the case of tri-state buffers    *
     * this is rather pessimistic. The inverter chain part of the buffer (as    *
| * opposed to the pass transistor + SRAM output part) can be shared by * |
| * several switches in the same location. Obviously all the switches from * |
| * an OPIN can share one buffer. Also, CHANX and CHANY switches at the same * |
| * spot (i,j) on a single segment can share a buffer. For a more realistic * |
| * area number I assume all buffered switches from a node that are at the * |
| * *same (i,j) location* can share one buffer. Only the lowest resistance * |
| * (largest) buffer is implemented. In practice, you might want to build * |
| * something that is 1.5x or 2x the largest buffer, so this may be a bit * |
| * optimistic (but I still think it's pretty reasonable). */ |
| auto& device_ctx = g_vpr_ctx.device(); |
| |
    int* num_inputs_to_cblock; /* [0..device_ctx.rr_nodes.size()-1], but all entries not *
                                * corresponding to IPINs will be 0.                      */
| |
| bool* cblock_counted; /* [0..max(device_ctx.grid.width(),device_ctx.grid.height())] -- 0th element unused. */ |
| float* shared_buffer_trans; /* [0..max(device_ctx.grid.width(),device_ctx.grid.height())] */ |
| float *unsharable_switch_trans, *sharable_switch_trans; /* [0..num_switch-1] */ |
| |
| t_rr_type from_rr_type, to_rr_type; |
| int iedge, num_edges, maxlen; |
| int iswitch, i, j, iseg, max_inputs_to_cblock; |
| float input_cblock_trans, shared_opin_buffer_trans; |
| |
    /* The two variables below are accumulators that add up all the             *
     * transistors in the routing. Make them doubles so that they don't stop    *
| * incrementing once adding a switch makes a change of less than 1 part in * |
| * 10^7 to the total. If this still isn't good enough (adding 1 part in * |
| * 10^15 will still be thrown away), compute the transistor count in * |
| * "chunks", by adding up inodes 1 to 1000, 1001 to 2000 and then summing * |
| * the partial sums together. */ |
| |
| double ntrans_sharing, ntrans_no_sharing; |
| |
    /* Buffer from the routing to the ipin cblock inputs. Assume minimum size   *
     * n-transistors, and p-transistors sized to make the pull-up R = pull-down R */
| |
| float trans_track_to_cblock_buf; |
| |
| ntrans_sharing = 0.; |
| ntrans_no_sharing = 0.; |
| max_inputs_to_cblock = 0; |
| |
| /* Assume the buffer below is 4x minimum drive strength (enough to * |
| * drive a fanout of up to 16 pretty nicely) -- should cover a reasonable * |
| * wiring C plus the fanout. */ |
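    /* (With Rbuf = R_minW_nmos / 4, trans_per_buf() sizes this as a two-stage  *
     * inverter chain: a 1x stage followed by a 4x stage.)                      */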
| |
| if (INCLUDE_TRACK_BUFFERS) { |
| trans_track_to_cblock_buf = trans_per_buf(R_minW_nmos / 4., R_minW_nmos, |
| R_minW_pmos); |
| } else { |
| trans_track_to_cblock_buf = 0; |
| } |
| |
| num_inputs_to_cblock = (int*)vtr::calloc(device_ctx.rr_nodes.size(), sizeof(int)); |
| |
| maxlen = std::max(device_ctx.grid.width(), device_ctx.grid.height()); |
| cblock_counted = (bool*)vtr::calloc(maxlen, sizeof(bool)); |
| shared_buffer_trans = (float*)vtr::calloc(maxlen, sizeof(float)); |
| |
| unsharable_switch_trans = alloc_and_load_unsharable_switch_trans(num_switch, |
| trans_sram_bit, R_minW_nmos); |
| |
| sharable_switch_trans = alloc_and_load_sharable_switch_trans(num_switch, |
| R_minW_nmos, R_minW_pmos); |
| |
| for (size_t from_node = 0; from_node < device_ctx.rr_nodes.size(); from_node++) { |
| from_rr_type = device_ctx.rr_nodes[from_node].type(); |
| |
| switch (from_rr_type) { |
| case CHANX: |
| case CHANY: |
| num_edges = device_ctx.rr_nodes[from_node].num_edges(); |
| |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| size_t to_node = device_ctx.rr_nodes[from_node].edge_sink_node(iedge); |
| to_rr_type = device_ctx.rr_nodes[to_node].type(); |
| |
| /* Ignore any uninitialized rr_graph nodes */ |
| if ((device_ctx.rr_nodes[to_node].type() == SOURCE) |
| && (device_ctx.rr_nodes[to_node].xlow() == 0) && (device_ctx.rr_nodes[to_node].ylow() == 0) |
| && (device_ctx.rr_nodes[to_node].xhigh() == 0) && (device_ctx.rr_nodes[to_node].yhigh() == 0)) { |
| continue; |
| } |
| |
| switch (to_rr_type) { |
| case CHANX: |
| case CHANY: |
| iswitch = device_ctx.rr_nodes[from_node].edge_switch(iedge); |
| |
| if (device_ctx.rr_switch_inf[iswitch].buffered()) { |
| iseg = seg_index_of_sblock(from_node, to_node); |
| shared_buffer_trans[iseg] = std::max(shared_buffer_trans[iseg], |
| sharable_switch_trans[iswitch]); |
| |
| ntrans_no_sharing += unsharable_switch_trans[iswitch] |
| + sharable_switch_trans[iswitch]; |
| ntrans_sharing += unsharable_switch_trans[iswitch]; |
| } else if (from_node < to_node) { |
| /* Pass transistor shared by two edges -- only count once. * |
| * Also, no part of a pass transistor is sharable. */ |
| |
| ntrans_no_sharing += unsharable_switch_trans[iswitch]; |
| ntrans_sharing += unsharable_switch_trans[iswitch]; |
| } |
| break; |
| |
| case IPIN: |
| num_inputs_to_cblock[to_node]++; |
| max_inputs_to_cblock = std::max(max_inputs_to_cblock, |
| num_inputs_to_cblock[to_node]); |
| |
| iseg = seg_index_of_cblock(from_rr_type, to_node); |
| |
| if (cblock_counted[iseg] == false) { |
| cblock_counted[iseg] = true; |
| ntrans_sharing += trans_track_to_cblock_buf; |
| ntrans_no_sharing += trans_track_to_cblock_buf; |
| } |
| break; |
| |
| default: |
| VPR_ERROR(VPR_ERROR_ROUTE, |
| "in count_routing_transistors:\n" |
| "\tUnexpected connection from node %d (type %s) to node %d (type %s).\n", |
| from_node, rr_node_typename[from_rr_type], to_node, rr_node_typename[to_rr_type]); |
| break; |
| |
| } /* End switch on to_rr_type. */ |
| |
| } /* End for each edge. */ |
| |
| /* Now add in the shared buffer transistors, and reset some flags. */ |
| |
| if (from_rr_type == CHANX) { |
| for (i = device_ctx.rr_nodes[from_node].xlow() - 1; |
| i <= device_ctx.rr_nodes[from_node].xhigh(); i++) { |
| ntrans_sharing += shared_buffer_trans[i]; |
| shared_buffer_trans[i] = 0.; |
| } |
| |
| for (i = device_ctx.rr_nodes[from_node].xlow(); i <= device_ctx.rr_nodes[from_node].xhigh(); |
| i++) |
| cblock_counted[i] = false; |
| |
| } else { /* CHANY */ |
| for (j = device_ctx.rr_nodes[from_node].ylow() - 1; |
| j <= device_ctx.rr_nodes[from_node].yhigh(); j++) { |
| ntrans_sharing += shared_buffer_trans[j]; |
| shared_buffer_trans[j] = 0.; |
| } |
| |
| for (j = device_ctx.rr_nodes[from_node].ylow(); j <= device_ctx.rr_nodes[from_node].yhigh(); |
| j++) |
| cblock_counted[j] = false; |
| } |
| break; |
| |
| case OPIN: |
| num_edges = device_ctx.rr_nodes[from_node].num_edges(); |
| shared_opin_buffer_trans = 0.; |
| |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| iswitch = device_ctx.rr_nodes[from_node].edge_switch(iedge); |
| ntrans_no_sharing += unsharable_switch_trans[iswitch] |
| + sharable_switch_trans[iswitch]; |
| ntrans_sharing += unsharable_switch_trans[iswitch]; |
| |
| shared_opin_buffer_trans = std::max(shared_opin_buffer_trans, |
| sharable_switch_trans[iswitch]); |
| } |
| |
| ntrans_sharing += shared_opin_buffer_trans; |
| break; |
| |
| default: |
| break; |
| |
| } /* End switch on from_rr_type */ |
| } /* End for all nodes */ |
| |
| free(cblock_counted); |
| free(shared_buffer_trans); |
| free(unsharable_switch_trans); |
| free(sharable_switch_trans); |
| |
| /* Now add in the input connection block transistors. */ |
| |
| input_cblock_trans = get_cblock_trans(num_inputs_to_cblock, wire_to_ipin_switch, |
| max_inputs_to_cblock, trans_sram_bit); |
| |
| free(num_inputs_to_cblock); |
| |
| ntrans_sharing += input_cblock_trans; |
| ntrans_no_sharing += input_cblock_trans; |
| |
| VTR_LOG("\n"); |
| VTR_LOG("Routing area (in minimum width transistor areas)...\n"); |
| VTR_LOG("\tAssuming no buffer sharing (pessimistic). Total: %#g, per logic tile: %#g\n", |
| ntrans_no_sharing, ntrans_no_sharing / (float)(device_ctx.grid.width() * device_ctx.grid.height())); |
| VTR_LOG("\tAssuming buffer sharing (slightly optimistic). Total: %#g, per logic tile: %#g\n", |
| ntrans_sharing, ntrans_sharing / (float)(device_ctx.grid.width() * device_ctx.grid.height())); |
| VTR_LOG("\n"); |
| } |
| |
| void count_unidir_routing_transistors(std::vector<t_segment_inf>& /*segment_inf*/, |
| int wire_to_ipin_switch, |
| float R_minW_nmos, |
| float R_minW_pmos, |
| const float trans_sram_bit) { |
| auto& device_ctx = g_vpr_ctx.device(); |
| |
| bool* cblock_counted; /* [0..max(device_ctx.grid.width(),device_ctx.grid.height())] -- 0th element unused. */ |
    int* num_inputs_to_cblock; /* [0..device_ctx.rr_nodes.size()-1], but all entries not *
                                * corresponding to IPINs will be 0.                      */
| |
| t_rr_type from_rr_type, to_rr_type; |
| int i, j, iseg, to_node, iedge, num_edges, maxlen; |
| int max_inputs_to_cblock; |
| float input_cblock_trans; |
| |
| /* August 2014: |
| * In a unidirectional architecture all the fanin to a wire segment comes from |
| * a single mux. We should count this mux only once as we look at the outgoing |
| * switches of all rr nodes. Thus we keep track of which muxes we have already |
| * counted via the variable below. */ |
| bool* chan_node_switch_done; |
| chan_node_switch_done = (bool*)vtr::calloc(device_ctx.rr_nodes.size(), sizeof(bool)); |
| |
    /* The variable below is an accumulator that will add up all the            *
     * transistors in the routing. Make it a double so that it doesn't stop     *
| * incrementing once adding a switch makes a change of less than 1 part in * |
| * 10^7 to the total. If this still isn't good enough (adding 1 part in * |
| * 10^15 will still be thrown away), compute the transistor count in * |
| * "chunks", by adding up inodes 1 to 1000, 1001 to 2000 and then summing * |
| * the partial sums together. */ |
| |
| double ntrans; |
| |
    /* Buffer from the routing to the ipin cblock inputs. Assume minimum size   *
     * n-transistors, and p-transistors sized to make the pull-up R = pull-down R */
| |
| float trans_track_to_cblock_buf; |
| |
| max_inputs_to_cblock = 0; |
| |
| /* Assume the buffer below is 4x minimum drive strength (enough to * |
| * drive a fanout of up to 16 pretty nicely) -- should cover a reasonable * |
| * wiring C plus the fanout. */ |
| |
| if (INCLUDE_TRACK_BUFFERS) { |
| trans_track_to_cblock_buf = trans_per_buf(R_minW_nmos / 4., R_minW_nmos, |
| R_minW_pmos); |
| } else { |
| trans_track_to_cblock_buf = 0; |
| } |
| |
| num_inputs_to_cblock = (int*)vtr::calloc(device_ctx.rr_nodes.size(), sizeof(int)); |
| maxlen = std::max(device_ctx.grid.width(), device_ctx.grid.height()); |
| cblock_counted = (bool*)vtr::calloc(maxlen, sizeof(bool)); |
| |
| ntrans = 0; |
| for (size_t from_node = 0; from_node < device_ctx.rr_nodes.size(); from_node++) { |
| from_rr_type = device_ctx.rr_nodes[from_node].type(); |
| |
| switch (from_rr_type) { |
| case CHANX: |
| case CHANY: |
| num_edges = device_ctx.rr_nodes[from_node].num_edges(); |
| |
| /* Increment number of inputs per cblock if IPIN */ |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| to_node = device_ctx.rr_nodes[from_node].edge_sink_node(iedge); |
| to_rr_type = device_ctx.rr_nodes[to_node].type(); |
| |
| /* Ignore any uninitialized rr_graph nodes */ |
| if ((device_ctx.rr_nodes[to_node].type() == SOURCE) |
| && (device_ctx.rr_nodes[to_node].xlow() == 0) && (device_ctx.rr_nodes[to_node].ylow() == 0) |
| && (device_ctx.rr_nodes[to_node].xhigh() == 0) && (device_ctx.rr_nodes[to_node].yhigh() == 0)) { |
| continue; |
| } |
| |
| switch (to_rr_type) { |
| case CHANX: |
| case CHANY: |
| if (!chan_node_switch_done[to_node]) { |
| int switch_index = device_ctx.rr_nodes[from_node].edge_switch(iedge); |
| auto switch_type = device_ctx.rr_switch_inf[switch_index].type(); |
| |
| int fan_in = device_ctx.rr_nodes[to_node].fan_in(); |
| |
| if (device_ctx.rr_switch_inf[switch_index].type() == SwitchType::MUX) { |
                            /* In a unidirectional architecture, each wire segment begins with a multiplexer followed by a driver */
| /* Each multiplexer contains all the fan-in to that routing node */ |
| /* Add up area of multiplexer */ |
| ntrans += trans_per_mux(fan_in, trans_sram_bit, |
| device_ctx.rr_switch_inf[switch_index].mux_trans_size); |
| |
| /* Add up area of buffer */ |
| /* The buffer size should already have been auto-sized (if required) when |
| * the rr switches were created from the arch switches */ |
| ntrans += device_ctx.rr_switch_inf[switch_index].buf_size; |
| } else if (switch_type == SwitchType::SHORT) { |
                            ntrans += 0.; //Electrical shorts contribute no transistor area
| } else if (switch_type == SwitchType::BUFFER) { |
| if (fan_in != 1) { |
| std::string msg = vtr::string_fmt( |
| "Uni-directional RR node driven by non-configurable " |
| "BUFFER has fan in %d (expected 1)\n", |
| fan_in); |
| msg += " " + describe_rr_node(to_node); |
| VPR_FATAL_ERROR(VPR_ERROR_OTHER, msg.c_str()); |
| } |
| |
| //This is a non-configurable buffer, so there are no mux transistors, |
| //only the buffer area |
| ntrans += device_ctx.rr_switch_inf[switch_index].buf_size; |
| } else { |
| VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Unexpected switch type %d while calculating area of uni-directional routing", switch_type); |
| } |
| chan_node_switch_done[to_node] = true; |
| } |
| |
| break; |
| |
| case IPIN: |
| num_inputs_to_cblock[to_node]++; |
| max_inputs_to_cblock = std::max(max_inputs_to_cblock, |
| num_inputs_to_cblock[to_node]); |
| iseg = seg_index_of_cblock(from_rr_type, to_node); |
| |
| if (cblock_counted[iseg] == false) { |
| cblock_counted[iseg] = true; |
| ntrans += trans_track_to_cblock_buf; |
| } |
| break; |
| |
| default: |
| VPR_ERROR(VPR_ERROR_ROUTE, |
| "in count_routing_transistors:\n" |
| "\tUnexpected connection from node %d (type %d) to node %d (type %d).\n", |
| from_node, from_rr_type, to_node, to_rr_type); |
| break; |
| |
| } /* End switch on to_rr_type. */ |
| |
| } /* End for each edge. */ |
| |
| /* Reset some flags */ |
| if (from_rr_type == CHANX) { |
| for (i = device_ctx.rr_nodes[from_node].xlow(); i <= device_ctx.rr_nodes[from_node].xhigh(); i++) |
| cblock_counted[i] = false; |
| |
| } else { /* CHANY */ |
| for (j = device_ctx.rr_nodes[from_node].ylow(); j <= device_ctx.rr_nodes[from_node].yhigh(); |
| j++) |
| cblock_counted[j] = false; |
| } |
| break; |
| case OPIN: |
| break; |
| |
| default: |
| break; |
| |
| } /* End switch on from_rr_type */ |
| } /* End for all nodes */ |
| |
| /* Now add in the input connection block transistors. */ |
| |
| input_cblock_trans = get_cblock_trans(num_inputs_to_cblock, wire_to_ipin_switch, |
| max_inputs_to_cblock, trans_sram_bit); |
| |
| free(cblock_counted); |
| free(num_inputs_to_cblock); |
| free(chan_node_switch_done); |
| |
| ntrans += input_cblock_trans; |
| |
| VTR_LOG("\n"); |
| VTR_LOG("Routing area (in minimum width transistor areas)...\n"); |
| VTR_LOG("\tTotal routing area: %#g, per logic tile: %#g\n", ntrans, ntrans / (float)(device_ctx.grid.width() * device_ctx.grid.height())); |
| } |
| |
| static float get_cblock_trans(int* num_inputs_to_cblock, int wire_to_ipin_switch, int max_inputs_to_cblock, float trans_sram_bit) { |
| /* Computes the transistors in the input connection block multiplexers and * |
| * the buffers from connection block outputs to the logic block input pins. * |
| * For speed, I precompute the number of transistors in the multiplexers of * |
| * interest. */ |
| |
| float* trans_per_cblock; /* [0..max_inputs_to_cblock] */ |
| float trans_count; |
| int num_inputs; |
| |
| auto& device_ctx = g_vpr_ctx.device(); |
| |
| trans_per_cblock = (float*)vtr::malloc((max_inputs_to_cblock + 1) * sizeof(float)); |
| |
| trans_per_cblock[0] = 0.; /* i.e., not an IPIN or no inputs */ |
| |
| /* With one or more inputs, add the mux and output buffer. I add the output * |
| * buffer even when the number of inputs = 1 (i.e. no mux) because I assume * |
| * I need the drivability just for metal capacitance. */ |
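    /* Illustrative example (with assumed, not architecture-specific, values):  *
     * if trans_sram_bit = 4, mux_trans_size = 1 and buf_size = 10, a 4-input   *
     * cblock mux costs trans_per_mux(4, 4, 1) = 4*1 + 4*4 = 20 minimum-width   *
     * areas, plus 10 for the output buffer, i.e. 30 per connected IPIN.        */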
| |
| for (int i = 1; i <= max_inputs_to_cblock; i++) { |
| trans_per_cblock[i] = trans_per_mux(i, trans_sram_bit, |
| device_ctx.rr_switch_inf[wire_to_ipin_switch].mux_trans_size); |
| trans_per_cblock[i] += device_ctx.rr_switch_inf[wire_to_ipin_switch].buf_size; |
| } |
| |
| trans_count = 0.; |
| |
| for (size_t i = 0; i < device_ctx.rr_nodes.size(); i++) { |
| num_inputs = num_inputs_to_cblock[i]; |
| trans_count += trans_per_cblock[num_inputs]; |
| } |
| |
| free(trans_per_cblock); |
| return (trans_count); |
| } |
| |
| static float* |
| alloc_and_load_unsharable_switch_trans(int num_switch, float trans_sram_bit, float R_minW_nmos) { |
| /* Loads up an array that says how many transistors are needed to implement * |
| * the unsharable portion of each switch type. The SRAM bit of a switch and * |
| * the pass transistor (forming either the entire switch or the output part * |
| * of a tri-state buffer) are both unsharable. */ |
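    /* Illustrative example (assumed values): a buffered, configurable switch   *
     * with R = R_minW_nmos gets Rpass = R / 2., i.e. a pass transistor of      *
     * twice minimum drive strength, plus one trans_sram_bit for its SRAM cell. */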
| |
| float *unsharable_switch_trans, Rpass; |
| int i; |
| |
| auto& device_ctx = g_vpr_ctx.device(); |
| |
| unsharable_switch_trans = (float*)vtr::malloc(num_switch * sizeof(float)); |
| |
| for (i = 0; i < num_switch; i++) { |
| if (device_ctx.rr_switch_inf[i].type() == SwitchType::SHORT) { |
| //Electrical shorts do not use any transistors |
| unsharable_switch_trans[i] = 0.; |
| } else { |
| if (!device_ctx.rr_switch_inf[i].buffered()) { |
| Rpass = device_ctx.rr_switch_inf[i].R; |
| } else { /* Buffer. Set Rpass = Rbuf = 1/2 Rtotal. */ |
| Rpass = device_ctx.rr_switch_inf[i].R / 2.; |
| } |
| |
| unsharable_switch_trans[i] = trans_per_R(Rpass, R_minW_nmos); |
| |
| if (device_ctx.rr_switch_inf[i].configurable()) { |
| //Configurable switches use SRAM |
| unsharable_switch_trans[i] += trans_sram_bit; |
| } |
| } |
| } |
| |
| return (unsharable_switch_trans); |
| } |
| |
| static float* |
| alloc_and_load_sharable_switch_trans(int num_switch, |
| float R_minW_nmos, |
| float R_minW_pmos) { |
    /* Loads up an array that says how many transistors are needed to implement *
| * the sharable portion of each switch type. The SRAM bit of a switch and * |
| * the pass transistor (forming either the entire switch or the output part * |
| * of a tri-state buffer) are both unsharable. Only the buffer part of a * |
| * buffer switch is sharable. */ |
| |
| float *sharable_switch_trans, Rbuf; |
| int i; |
| |
| auto& device_ctx = g_vpr_ctx.device(); |
| |
| sharable_switch_trans = (float*)vtr::malloc(num_switch * sizeof(float)); |
| |
| for (i = 0; i < num_switch; i++) { |
| if (!device_ctx.rr_switch_inf[i].buffered()) { |
| sharable_switch_trans[i] = 0.; |
| } else { /* Buffer. Set Rbuf = Rpass = 1/2 Rtotal. */ |
| Rbuf = device_ctx.rr_switch_inf[i].R / 2.; |
| sharable_switch_trans[i] = trans_per_buf(Rbuf, R_minW_nmos, |
| R_minW_pmos); |
| } |
| } |
| |
| return (sharable_switch_trans); |
| } |
| |
| float trans_per_buf(float Rbuf, float R_minW_nmos, float R_minW_pmos) { |
| /* Returns the number of minimum width transistor area equivalents needed to * |
| * implement this buffer. Assumes a stage ratio of 4, and equal strength * |
| * pull-up and pull-down paths. */ |
| |
| int num_stage, istage; |
| float trans_count, stage_ratio, Rstage; |
| |
| if (Rbuf > 0.6 * R_minW_nmos || Rbuf <= 0.) { /* Use a single-stage buffer */ |
| trans_count = trans_per_R(Rbuf, R_minW_nmos) |
| + trans_per_R(Rbuf, R_minW_pmos); |
| } else { /* Use a multi-stage buffer */ |
| |
| /* Target stage ratio = 4. 1 minimum width buffer, then num_stage bigger * |
| * ones. */ |
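        /* Illustrative example (assumed values): if R_minW_nmos = 16 * Rbuf,    *
         * num_stage = nint(log10(16) / log10(4)) = 2 and stage_ratio = 4, so    *
         * the chain below is a 1x, a 4x and a 16x stage, each charged for both  *
         * its NMOS and PMOS transistors via trans_per_R().                      */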
| |
| num_stage = vtr::nint(log10(R_minW_nmos / Rbuf) / log10(4.)); |
| num_stage = std::max(num_stage, 1); |
| stage_ratio = pow((float)(R_minW_nmos / Rbuf), (float)(1. / (float)num_stage)); |
| |
| Rstage = R_minW_nmos; |
| trans_count = 0.; |
| |
| for (istage = 0; istage <= num_stage; istage++) { |
| trans_count += trans_per_R(Rstage, R_minW_nmos) |
| + trans_per_R(Rstage, R_minW_pmos); |
| Rstage /= stage_ratio; |
| } |
| } |
| |
| return (trans_count); |
| } |
| |
| static float trans_per_mux(int num_inputs, float trans_sram_bit, float pass_trans_area) { |
| /* Returns the number of transistors needed to build a pass transistor mux. * |
| * DOES NOT include input buffers or any output buffer. * |
     * The multiplexer topology is chosen based on the number of inputs:        *
     * multiplexers with 4 or fewer inputs use a single level; larger ones use  *
     * two levels.                                                              */
| float ntrans, sram_trans, pass_trans; |
| int num_second_stage_trans; |
| |
| if (num_inputs <= 1) { |
| return (0); |
| } else if (num_inputs == 2) { |
| pass_trans = 2 * pass_trans_area; |
| sram_trans = 1 * trans_sram_bit; |
| } else if (num_inputs <= 4) { |
| /* One-hot encoding */ |
| pass_trans = num_inputs * pass_trans_area; |
| sram_trans = num_inputs * trans_sram_bit; |
| } else { |
        /* This is a large multiplexer, so design it as a two-level multiplexer.  *
         * The + 0.00001 makes sure exact square roots don't get rounded down to  *
         * the next lower integer.                                                */
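        /* Illustrative example (assumed values): for num_inputs = 16 the second *
         * stage uses floor(sqrt(16)) = 4 pass transistors, so                   *
         * pass_trans = (16 + 4) * pass_trans_area and                           *
         * sram_trans = (ceil(16 / 4) + 4) * trans_sram_bit = 8 * trans_sram_bit. */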
| num_second_stage_trans = (int)floor((float)sqrt((float)num_inputs) + 0.00001); |
| pass_trans = (num_inputs + num_second_stage_trans) * pass_trans_area; |
| sram_trans = (ceil((float)num_inputs / num_second_stage_trans - 0.00001) |
| + num_second_stage_trans) |
| * trans_sram_bit; |
| if (num_second_stage_trans == 2) { |
            /* Can use a single bit instead of a two-bit one-hot encoding for the second stage */
            /* This eliminates one SRAM bit counted above */
| sram_trans -= 1 * trans_sram_bit; |
| } |
| } |
| |
| ntrans = pass_trans + sram_trans; |
| |
| return (ntrans); |
| } |
| |
| static float trans_per_R(float Rtrans, float R_minW_trans) { |
| /* Returns the number of minimum width transistor area equivalents needed * |
| * to make a transistor with Rtrans, given that the resistance of a minimum * |
| * width transistor of this type is R_minW_trans. */ |
| |
| float trans_area; |
| |
| if (Rtrans <= 0.) { |
| /* Assume resistances are nonsense -- use min. width */ |
| VTR_LOG_WARN("Sized nonsensical R=%g transistor to minimum width\n", Rtrans); |
| return (1.); |
| } |
| |
| if (Rtrans >= R_minW_trans) { |
| return (1.); |
| } |
| |
| /* Old area model (developed with 0.35um process rules) */ |
| /* Area = minimum width area (1) + 0.5 for each additional unit of width. * |
| * The 50% factor takes into account the "overlapping" that occurs in * |
| * horizontally-paralleled transistors, and the need for only one spacing, * |
| * not two (i.e. two min W transistors need two spaces; a 2W transistor * |
| * needs only 1). */ |
| |
| /* New area model (developed with 65nm process rules) */ |
| /* These more advanced process rules change how much area we need to add * |
| * for each additional unit of width vs. the old area model. */ |
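    /* Worked example (numbers follow directly from the equations below): for   *
     * Rtrans = R_minW_trans / 2, drive_strength = 2, so the original model     *
     * gives 0.5 * 2 + 0.5 = 1.5 areas, while the improved NMOS-only model      *
     * gives 0.447 + 0.128 * 2 + 0.391 * sqrt(2) ~= 1.26 areas.                 */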
| |
| float drive_strength = R_minW_trans / Rtrans; |
| if (trans_area_eq == AREA_ORIGINAL) { |
| /* Old transistor area estimation equation */ |
| trans_area = 0.5 * drive_strength + 0.5; |
| } else if (trans_area_eq == AREA_IMPROVED_NMOS_ONLY) { |
| /* New transistor area estimation equation. Here only NMOS transistors |
| * are taken into account */ |
| trans_area = 0.447 + 0.128 * drive_strength + 0.391 * sqrt(drive_strength); |
| } else if (trans_area_eq == AREA_IMPROVED_MIXED) { |
| /* New transistor area estimation equation. Here both NMOS and PMOS |
| * transistors are taken into account (extra spacing needed for N-wells) */ |
| trans_area = 0.518 + 0.127 * drive_strength + 0.428 * sqrt(drive_strength); |
| } else { |
| VPR_FATAL_ERROR(VPR_ERROR_ROUTE, "Unrecognized transistor area model: %d\n", (int)trans_area_eq); |
| } |
| |
| return (trans_area); |
| } |