#include <cmath>
#include "vtr_assert.h"
#include "vtr_log.h"
#include "vtr_math.h"
#include "vtr_memory.h"
#include "vpr_types.h"
#include "vpr_error.h"
#include "globals.h"
#include "rr_graph.h"
#include "rr_graph_util.h"
#include "rr_graph_area.h"
/* Select which transistor area equation to use. As found in Chiasson and Betz's FPL 2013 paper
 * (Should FPGAs Abandon the Pass Gate?), the traditional transistor area model
 * significantly overpredicts area at smaller process nodes. Their improved area models
 * were derived from TSMC's 65nm layout rules and scaled down to 22nm. */
enum e_trans_area_eq { AREA_ORIGINAL,
AREA_IMPROVED_NMOS_ONLY, /* only NMOS transistors taken into account */
AREA_IMPROVED_MIXED /* both NMOS and PMOS; extra spacing required for N-wells */
};
static const e_trans_area_eq trans_area_eq = AREA_IMPROVED_NMOS_ONLY;
/************************ Subroutines local to this module *******************/
static void count_bidir_routing_transistors(int num_switch, int wire_to_ipin_switch, float R_minW_nmos, float R_minW_pmos, const float trans_sram_bit);
static void count_unidir_routing_transistors(std::vector<t_segment_inf>& segment_inf,
int wire_to_ipin_switch,
float R_minW_nmos,
float R_minW_pmos,
const float trans_sram_bit);
static float get_cblock_trans(int* num_inputs_to_cblock, int wire_to_ipin_switch, int max_inputs_to_cblock, float trans_sram_bit);
static float* alloc_and_load_unsharable_switch_trans(int num_switch,
float trans_sram_bit,
float R_minW_nmos);
static float* alloc_and_load_sharable_switch_trans(int num_switch,
float R_minW_nmos,
float R_minW_pmos);
static float trans_per_mux(int num_inputs, float trans_sram_bit, float pass_trans_area);
static float trans_per_R(float Rtrans, float R_minW_trans);
/*************************** Subroutine definitions **************************/
void count_routing_transistors(enum e_directionality directionality,
int num_switch,
int wire_to_ipin_switch,
std::vector<t_segment_inf>& segment_inf,
float R_minW_nmos,
float R_minW_pmos) {
/* Counts how many transistors are needed to implement the FPGA routing *
* resources. Call this only when an rr_graph exists. It does not count *
* the transistors used in logic blocks, but it counts the transistors in *
* the input connection block multiplexers and in the output pin drivers and *
* pass transistors. NB: this routine assumes pass transistors always *
* generate two edges (one forward, one backward) between two nodes. *
* Physically, this is what happens -- make sure your rr_graph does it. *
* *
* I assume a minimum width transistor takes 1 unit of area. A double-width *
* transistor takes twice the diffusion width, but the same spacing, so *
* I assume it takes 1.5x the area of a minimum-width transistor. */
/* Area per SRAM cell (in minimum-width transistor areas) */
const float trans_sram_bit = 4.;
if (directionality == BI_DIRECTIONAL) {
count_bidir_routing_transistors(num_switch, wire_to_ipin_switch, R_minW_nmos, R_minW_pmos, trans_sram_bit);
} else {
VTR_ASSERT(directionality == UNI_DIRECTIONAL);
count_unidir_routing_transistors(segment_inf, wire_to_ipin_switch, R_minW_nmos, R_minW_pmos, trans_sram_bit);
}
}
void count_bidir_routing_transistors(int num_switch, int wire_to_ipin_switch, float R_minW_nmos, float R_minW_pmos, const float trans_sram_bit) {
/* Tri-state buffers are designed as a buffer followed by a pass transistor. *
* I make Rbuffer = Rpass_transistor = 1/2 Rtri-state_buffer. *
* I make the pull-up and pull-down sides of the buffer the same strength -- *
* i.e. I make the p transistor R_minW_pmos / R_minW_nmos wider than the n *
* transistor. *
* *
* I generate two area numbers in this routine: ntrans_sharing and *
* ntrans_no_sharing. ntrans_sharing exactly reflects what the timing *
* analyzer, etc. works with -- each switch is a completely self contained *
* pass transistor or tri-state buffer. In the case of tri-state buffers *
* this is rather pessimistic. The inverter chain part of the buffer (as *
* opposed to the pass transistor + SRAM output part) can be shared by *
* several switches in the same location. Obviously all the switches from *
* an OPIN can share one buffer. Also, CHANX and CHANY switches at the same *
* spot (i,j) on a single segment can share a buffer. For a more realistic *
* area number I assume all buffered switches from a node that are at the *
* *same (i,j) location* can share one buffer. Only the lowest resistance *
* (largest) buffer is implemented. In practice, you might want to build *
* something that is 1.5x or 2x the largest buffer, so this may be a bit *
* optimistic (but I still think it's pretty reasonable). */
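/* Illustrative split (values assumed for clarity; it mirrors the helper routines below):
 * a buffered, configurable switch with total resistance R contributes
 *   unsharable part: trans_per_R(R / 2, R_minW_nmos) + trans_sram_bit
 *   sharable part:   trans_per_buf(R / 2, R_minW_nmos, R_minW_pmos)
 * ntrans_no_sharing counts both parts for every switch; ntrans_sharing counts the
 * unsharable part for every switch plus only the largest sharable buffer at each
 * (i,j) location. */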
auto& device_ctx = g_vpr_ctx.device();
int* num_inputs_to_cblock; /* [0..device_ctx.rr_nodes.size()-1], but all entries not */
/* corresponding to IPINs will be 0. */
bool* cblock_counted; /* [0..max(device_ctx.grid.width(),device_ctx.grid.height())] -- 0th element unused. */
float* shared_buffer_trans; /* [0..max(device_ctx.grid.width(),device_ctx.grid.height())] */
float *unsharable_switch_trans, *sharable_switch_trans; /* [0..num_switch-1] */
t_rr_type from_rr_type, to_rr_type;
int iedge, num_edges, maxlen;
int iswitch, i, j, iseg, max_inputs_to_cblock;
float input_cblock_trans, shared_opin_buffer_trans;
/* The two variables below are accumulators that add up all the *
 * transistors in the routing. Make them doubles so that they don't stop *
* incrementing once adding a switch makes a change of less than 1 part in *
* 10^7 to the total. If this still isn't good enough (adding 1 part in *
* 10^15 will still be thrown away), compute the transistor count in *
* "chunks", by adding up inodes 1 to 1000, 1001 to 2000 and then summing *
* the partial sums together. */
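/* A minimal sketch of that chunked summation (not used here; num_nodes and
 * trans_of_node() are hypothetical names for the node count and the per-node
 * transistor count):
 *
 *   double total = 0.;
 *   for (size_t start = 0; start < num_nodes; start += 1000) {
 *       double partial = 0.;
 *       for (size_t i = start; i < start + 1000 && i < num_nodes; ++i)
 *           partial += trans_of_node(i);
 *       total += partial;
 *   }
 */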
double ntrans_sharing, ntrans_no_sharing;
/* Buffer from the routing to the ipin cblock inputs. Assume minimum size n *
* transistors, and p transistors sized to make the pull-up R = pull-down R. */
float trans_track_to_cblock_buf;
ntrans_sharing = 0.;
ntrans_no_sharing = 0.;
max_inputs_to_cblock = 0;
/* Assume the buffer below is 4x minimum drive strength (enough to *
* drive a fanout of up to 16 pretty nicely) -- should cover a reasonable *
* wiring C plus the fanout. */
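/* With Rbuf = R_minW_nmos / 4, trans_per_buf() below builds this as a two-inverter
 * chain (stage ratio 4): a minimum-width first stage driving a 4x output stage. */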
if (INCLUDE_TRACK_BUFFERS) {
trans_track_to_cblock_buf = trans_per_buf(R_minW_nmos / 4., R_minW_nmos,
R_minW_pmos);
} else {
trans_track_to_cblock_buf = 0;
}
num_inputs_to_cblock = (int*)vtr::calloc(device_ctx.rr_nodes.size(), sizeof(int));
maxlen = std::max(device_ctx.grid.width(), device_ctx.grid.height());
cblock_counted = (bool*)vtr::calloc(maxlen, sizeof(bool));
shared_buffer_trans = (float*)vtr::calloc(maxlen, sizeof(float));
unsharable_switch_trans = alloc_and_load_unsharable_switch_trans(num_switch,
trans_sram_bit, R_minW_nmos);
sharable_switch_trans = alloc_and_load_sharable_switch_trans(num_switch,
R_minW_nmos, R_minW_pmos);
for (size_t from_node = 0; from_node < device_ctx.rr_nodes.size(); from_node++) {
from_rr_type = device_ctx.rr_nodes[from_node].type();
switch (from_rr_type) {
case CHANX:
case CHANY:
num_edges = device_ctx.rr_nodes[from_node].num_edges();
for (iedge = 0; iedge < num_edges; iedge++) {
size_t to_node = device_ctx.rr_nodes[from_node].edge_sink_node(iedge);
to_rr_type = device_ctx.rr_nodes[to_node].type();
/* Ignore any uninitialized rr_graph nodes */
if ((device_ctx.rr_nodes[to_node].type() == SOURCE)
&& (device_ctx.rr_nodes[to_node].xlow() == 0) && (device_ctx.rr_nodes[to_node].ylow() == 0)
&& (device_ctx.rr_nodes[to_node].xhigh() == 0) && (device_ctx.rr_nodes[to_node].yhigh() == 0)) {
continue;
}
switch (to_rr_type) {
case CHANX:
case CHANY:
iswitch = device_ctx.rr_nodes[from_node].edge_switch(iedge);
if (device_ctx.rr_switch_inf[iswitch].buffered()) {
iseg = seg_index_of_sblock(from_node, to_node);
shared_buffer_trans[iseg] = std::max(shared_buffer_trans[iseg],
sharable_switch_trans[iswitch]);
ntrans_no_sharing += unsharable_switch_trans[iswitch]
+ sharable_switch_trans[iswitch];
ntrans_sharing += unsharable_switch_trans[iswitch];
} else if (from_node < to_node) {
/* Pass transistor shared by two edges -- only count once. *
* Also, no part of a pass transistor is sharable. */
ntrans_no_sharing += unsharable_switch_trans[iswitch];
ntrans_sharing += unsharable_switch_trans[iswitch];
}
break;
case IPIN:
num_inputs_to_cblock[to_node]++;
max_inputs_to_cblock = std::max(max_inputs_to_cblock,
num_inputs_to_cblock[to_node]);
iseg = seg_index_of_cblock(from_rr_type, to_node);
if (cblock_counted[iseg] == false) {
cblock_counted[iseg] = true;
ntrans_sharing += trans_track_to_cblock_buf;
ntrans_no_sharing += trans_track_to_cblock_buf;
}
break;
default:
VPR_ERROR(VPR_ERROR_ROUTE,
"in count_routing_transistors:\n"
"\tUnexpected connection from node %d (type %s) to node %d (type %s).\n",
from_node, rr_node_typename[from_rr_type], to_node, rr_node_typename[to_rr_type]);
break;
} /* End switch on to_rr_type. */
} /* End for each edge. */
/* Now add in the shared buffer transistors, and reset some flags. */
if (from_rr_type == CHANX) {
for (i = device_ctx.rr_nodes[from_node].xlow() - 1;
i <= device_ctx.rr_nodes[from_node].xhigh(); i++) {
ntrans_sharing += shared_buffer_trans[i];
shared_buffer_trans[i] = 0.;
}
for (i = device_ctx.rr_nodes[from_node].xlow(); i <= device_ctx.rr_nodes[from_node].xhigh();
i++)
cblock_counted[i] = false;
} else { /* CHANY */
for (j = device_ctx.rr_nodes[from_node].ylow() - 1;
j <= device_ctx.rr_nodes[from_node].yhigh(); j++) {
ntrans_sharing += shared_buffer_trans[j];
shared_buffer_trans[j] = 0.;
}
for (j = device_ctx.rr_nodes[from_node].ylow(); j <= device_ctx.rr_nodes[from_node].yhigh();
j++)
cblock_counted[j] = false;
}
break;
case OPIN:
num_edges = device_ctx.rr_nodes[from_node].num_edges();
shared_opin_buffer_trans = 0.;
for (iedge = 0; iedge < num_edges; iedge++) {
iswitch = device_ctx.rr_nodes[from_node].edge_switch(iedge);
ntrans_no_sharing += unsharable_switch_trans[iswitch]
+ sharable_switch_trans[iswitch];
ntrans_sharing += unsharable_switch_trans[iswitch];
shared_opin_buffer_trans = std::max(shared_opin_buffer_trans,
sharable_switch_trans[iswitch]);
}
ntrans_sharing += shared_opin_buffer_trans;
break;
default:
break;
} /* End switch on from_rr_type */
} /* End for all nodes */
free(cblock_counted);
free(shared_buffer_trans);
free(unsharable_switch_trans);
free(sharable_switch_trans);
/* Now add in the input connection block transistors. */
input_cblock_trans = get_cblock_trans(num_inputs_to_cblock, wire_to_ipin_switch,
max_inputs_to_cblock, trans_sram_bit);
free(num_inputs_to_cblock);
ntrans_sharing += input_cblock_trans;
ntrans_no_sharing += input_cblock_trans;
VTR_LOG("\n");
VTR_LOG("Routing area (in minimum width transistor areas)...\n");
VTR_LOG("\tAssuming no buffer sharing (pessimistic). Total: %#g, per logic tile: %#g\n",
ntrans_no_sharing, ntrans_no_sharing / (float)(device_ctx.grid.width() * device_ctx.grid.height()));
VTR_LOG("\tAssuming buffer sharing (slightly optimistic). Total: %#g, per logic tile: %#g\n",
ntrans_sharing, ntrans_sharing / (float)(device_ctx.grid.width() * device_ctx.grid.height()));
VTR_LOG("\n");
}
void count_unidir_routing_transistors(std::vector<t_segment_inf>& /*segment_inf*/,
int wire_to_ipin_switch,
float R_minW_nmos,
float R_minW_pmos,
const float trans_sram_bit) {
auto& device_ctx = g_vpr_ctx.device();
bool* cblock_counted; /* [0..max(device_ctx.grid.width(),device_ctx.grid.height())] -- 0th element unused. */
int* num_inputs_to_cblock; /* [0..device_ctx.rr_nodes.size()-1], but all entries not */
/* corresponding to IPINs will be 0. */
t_rr_type from_rr_type, to_rr_type;
int i, j, iseg, to_node, iedge, num_edges, maxlen;
int max_inputs_to_cblock;
float input_cblock_trans;
/* August 2014:
* In a unidirectional architecture all the fanin to a wire segment comes from
* a single mux. We should count this mux only once as we look at the outgoing
* switches of all rr nodes. Thus we keep track of which muxes we have already
* counted via the variable below. */
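/* Illustrative accounting (it mirrors the switch on to_rr_type below): a CHANX/CHANY
 * node with fan-in F driven through a MUX-type switch adds
 *   trans_per_mux(F, trans_sram_bit, mux_trans_size) + buf_size
 * exactly once, the first time one of its incoming edges is visited. */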
bool* chan_node_switch_done;
chan_node_switch_done = (bool*)vtr::calloc(device_ctx.rr_nodes.size(), sizeof(bool));
/* The variable below is an accumulator variable that will add up all the *
* transistors in the routing. Make it a double so that it doesn't stop *
* incrementing once adding a switch makes a change of less than 1 part in *
* 10^7 to the total. If this still isn't good enough (adding 1 part in *
* 10^15 will still be thrown away), compute the transistor count in *
* "chunks", by adding up inodes 1 to 1000, 1001 to 2000 and then summing *
* the partial sums together. */
double ntrans;
/* Buffer from the routing to the ipin cblock inputs. Assume minimum size n *
* transistors, and ptransistors sized to make the pull-up R = pull-down R */
float trans_track_to_cblock_buf;
max_inputs_to_cblock = 0;
/* Assume the buffer below is 4x minimum drive strength (enough to *
* drive a fanout of up to 16 pretty nicely) -- should cover a reasonable *
* wiring C plus the fanout. */
if (INCLUDE_TRACK_BUFFERS) {
trans_track_to_cblock_buf = trans_per_buf(R_minW_nmos / 4., R_minW_nmos,
R_minW_pmos);
} else {
trans_track_to_cblock_buf = 0;
}
num_inputs_to_cblock = (int*)vtr::calloc(device_ctx.rr_nodes.size(), sizeof(int));
maxlen = std::max(device_ctx.grid.width(), device_ctx.grid.height());
cblock_counted = (bool*)vtr::calloc(maxlen, sizeof(bool));
ntrans = 0;
for (size_t from_node = 0; from_node < device_ctx.rr_nodes.size(); from_node++) {
from_rr_type = device_ctx.rr_nodes[from_node].type();
switch (from_rr_type) {
case CHANX:
case CHANY:
num_edges = device_ctx.rr_nodes[from_node].num_edges();
/* Increment number of inputs per cblock if IPIN */
for (iedge = 0; iedge < num_edges; iedge++) {
to_node = device_ctx.rr_nodes[from_node].edge_sink_node(iedge);
to_rr_type = device_ctx.rr_nodes[to_node].type();
/* Ignore any uninitialized rr_graph nodes */
if ((device_ctx.rr_nodes[to_node].type() == SOURCE)
&& (device_ctx.rr_nodes[to_node].xlow() == 0) && (device_ctx.rr_nodes[to_node].ylow() == 0)
&& (device_ctx.rr_nodes[to_node].xhigh() == 0) && (device_ctx.rr_nodes[to_node].yhigh() == 0)) {
continue;
}
switch (to_rr_type) {
case CHANX:
case CHANY:
if (!chan_node_switch_done[to_node]) {
int switch_index = device_ctx.rr_nodes[from_node].edge_switch(iedge);
auto switch_type = device_ctx.rr_switch_inf[switch_index].type();
int fan_in = device_ctx.rr_nodes[to_node].fan_in();
if (device_ctx.rr_switch_inf[switch_index].type() == SwitchType::MUX) {
/* In a unidirectional architecture, each wire segment begins with a multiplexer followed by a driver */
/* Each multiplexer contains all the fan-in to that routing node */
/* Add up area of multiplexer */
ntrans += trans_per_mux(fan_in, trans_sram_bit,
device_ctx.rr_switch_inf[switch_index].mux_trans_size);
/* Add up area of buffer */
/* The buffer size should already have been auto-sized (if required) when
* the rr switches were created from the arch switches */
ntrans += device_ctx.rr_switch_inf[switch_index].buf_size;
} else if (switch_type == SwitchType::SHORT) {
ntrans += 0.; //Electrical shorts contribute no transistor area
} else if (switch_type == SwitchType::BUFFER) {
if (fan_in != 1) {
std::string msg = vtr::string_fmt(
"Uni-directional RR node driven by non-configurable "
"BUFFER has fan in %d (expected 1)\n",
fan_in);
msg += " " + describe_rr_node(to_node);
VPR_FATAL_ERROR(VPR_ERROR_OTHER, msg.c_str());
}
//This is a non-configurable buffer, so there are no mux transistors,
//only the buffer area
ntrans += device_ctx.rr_switch_inf[switch_index].buf_size;
} else {
VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Unexpected switch type %d while calculating area of uni-directional routing", switch_type);
}
chan_node_switch_done[to_node] = true;
}
break;
case IPIN:
num_inputs_to_cblock[to_node]++;
max_inputs_to_cblock = std::max(max_inputs_to_cblock,
num_inputs_to_cblock[to_node]);
iseg = seg_index_of_cblock(from_rr_type, to_node);
if (cblock_counted[iseg] == false) {
cblock_counted[iseg] = true;
ntrans += trans_track_to_cblock_buf;
}
break;
default:
VPR_ERROR(VPR_ERROR_ROUTE,
"in count_routing_transistors:\n"
"\tUnexpected connection from node %d (type %d) to node %d (type %d).\n",
from_node, from_rr_type, to_node, to_rr_type);
break;
} /* End switch on to_rr_type. */
} /* End for each edge. */
/* Reset some flags */
if (from_rr_type == CHANX) {
for (i = device_ctx.rr_nodes[from_node].xlow(); i <= device_ctx.rr_nodes[from_node].xhigh(); i++)
cblock_counted[i] = false;
} else { /* CHANY */
for (j = device_ctx.rr_nodes[from_node].ylow(); j <= device_ctx.rr_nodes[from_node].yhigh();
j++)
cblock_counted[j] = false;
}
break;
case OPIN:
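/* An OPIN's fanout is already captured by the fan_in of each destination
 * CHANX/CHANY mux counted above, so nothing extra is added here. */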
break;
default:
break;
} /* End switch on from_rr_type */
} /* End for all nodes */
/* Now add in the input connection block transistors. */
input_cblock_trans = get_cblock_trans(num_inputs_to_cblock, wire_to_ipin_switch,
max_inputs_to_cblock, trans_sram_bit);
free(cblock_counted);
free(num_inputs_to_cblock);
free(chan_node_switch_done);
ntrans += input_cblock_trans;
VTR_LOG("\n");
VTR_LOG("Routing area (in minimum width transistor areas)...\n");
VTR_LOG("\tTotal routing area: %#g, per logic tile: %#g\n", ntrans, ntrans / (float)(device_ctx.grid.width() * device_ctx.grid.height()));
}
static float get_cblock_trans(int* num_inputs_to_cblock, int wire_to_ipin_switch, int max_inputs_to_cblock, float trans_sram_bit) {
/* Computes the transistors in the input connection block multiplexers and *
* the buffers from connection block outputs to the logic block input pins. *
* For speed, I precompute the number of transistors in the multiplexers of *
* interest. */
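/* Illustrative example (assumed values): an IPIN reachable from 12 tracks contributes
 * trans_per_cblock[12] = trans_per_mux(12, trans_sram_bit, mux_trans_size) + buf_size;
 * nodes that are not IPINs contribute trans_per_cblock[0] = 0. */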
float* trans_per_cblock; /* [0..max_inputs_to_cblock] */
float trans_count;
int num_inputs;
auto& device_ctx = g_vpr_ctx.device();
trans_per_cblock = (float*)vtr::malloc((max_inputs_to_cblock + 1) * sizeof(float));
trans_per_cblock[0] = 0.; /* i.e., not an IPIN or no inputs */
/* With one or more inputs, add the mux and output buffer. I add the output *
* buffer even when the number of inputs = 1 (i.e. no mux) because I assume *
* I need the drivability just for metal capacitance. */
for (int i = 1; i <= max_inputs_to_cblock; i++) {
trans_per_cblock[i] = trans_per_mux(i, trans_sram_bit,
device_ctx.rr_switch_inf[wire_to_ipin_switch].mux_trans_size);
trans_per_cblock[i] += device_ctx.rr_switch_inf[wire_to_ipin_switch].buf_size;
}
trans_count = 0.;
for (size_t i = 0; i < device_ctx.rr_nodes.size(); i++) {
num_inputs = num_inputs_to_cblock[i];
trans_count += trans_per_cblock[num_inputs];
}
free(trans_per_cblock);
return (trans_count);
}
static float*
alloc_and_load_unsharable_switch_trans(int num_switch, float trans_sram_bit, float R_minW_nmos) {
/* Loads up an array that says how many transistors are needed to implement *
* the unsharable portion of each switch type. The SRAM bit of a switch and *
* the pass transistor (forming either the entire switch or the output part *
* of a tri-state buffer) are both unsharable. */
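/* Worked example (assumed values, using the default trans_sram_bit = 4.): a buffered,
 * configurable switch with R = 2 * R_minW_nmos gets Rpass = R_minW_nmos, so
 * trans_per_R() returns 1. and the entry is 1. + 4. = 5. minimum-width areas. */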
float *unsharable_switch_trans, Rpass;
int i;
auto& device_ctx = g_vpr_ctx.device();
unsharable_switch_trans = (float*)vtr::malloc(num_switch * sizeof(float));
for (i = 0; i < num_switch; i++) {
if (device_ctx.rr_switch_inf[i].type() == SwitchType::SHORT) {
//Electrical shorts do not use any transistors
unsharable_switch_trans[i] = 0.;
} else {
if (!device_ctx.rr_switch_inf[i].buffered()) {
Rpass = device_ctx.rr_switch_inf[i].R;
} else { /* Buffer. Set Rpass = Rbuf = 1/2 Rtotal. */
Rpass = device_ctx.rr_switch_inf[i].R / 2.;
}
unsharable_switch_trans[i] = trans_per_R(Rpass, R_minW_nmos);
if (device_ctx.rr_switch_inf[i].configurable()) {
//Configurable switches use SRAM
unsharable_switch_trans[i] += trans_sram_bit;
}
}
}
return (unsharable_switch_trans);
}
static float*
alloc_and_load_sharable_switch_trans(int num_switch,
float R_minW_nmos,
float R_minW_pmos) {
/* Loads up an array that says how many transistors are needed to implement *
* the sharable portion of each switch type. The SRAM bit of a switch and *
* the pass transistor (forming either the entire switch or the output part *
* of a tri-state buffer) are both unsharable. Only the buffer part of a *
* buffer switch is sharable. */
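/* Continuing the example above (assumed values): the same buffered switch with
 * R = 2 * R_minW_nmos has Rbuf = R_minW_nmos, so its sharable part is
 * trans_per_buf(R_minW_nmos, R_minW_nmos, R_minW_pmos), a single-stage buffer
 * since Rbuf > 0.6 * R_minW_nmos. */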
float *sharable_switch_trans, Rbuf;
int i;
auto& device_ctx = g_vpr_ctx.device();
sharable_switch_trans = (float*)vtr::malloc(num_switch * sizeof(float));
for (i = 0; i < num_switch; i++) {
if (!device_ctx.rr_switch_inf[i].buffered()) {
sharable_switch_trans[i] = 0.;
} else { /* Buffer. Set Rbuf = Rpass = 1/2 Rtotal. */
Rbuf = device_ctx.rr_switch_inf[i].R / 2.;
sharable_switch_trans[i] = trans_per_buf(Rbuf, R_minW_nmos,
R_minW_pmos);
}
}
return (sharable_switch_trans);
}
float trans_per_buf(float Rbuf, float R_minW_nmos, float R_minW_pmos) {
/* Returns the number of minimum width transistor area equivalents needed to *
* implement this buffer. Assumes a stage ratio of 4, and equal strength *
* pull-up and pull-down paths. */
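/* Worked example (assumed values): Rbuf = R_minW_nmos / 16 gives
 * num_stage = nint(log10(16) / log10(4)) = 2 and stage_ratio = 4, so the loop below
 * sizes three inverters with Rstage = R_minW_nmos, R_minW_nmos / 4 and R_minW_nmos / 16. */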
int num_stage, istage;
float trans_count, stage_ratio, Rstage;
if (Rbuf > 0.6 * R_minW_nmos || Rbuf <= 0.) { /* Use a single-stage buffer */
trans_count = trans_per_R(Rbuf, R_minW_nmos)
+ trans_per_R(Rbuf, R_minW_pmos);
} else { /* Use a multi-stage buffer */
/* Target stage ratio = 4. 1 minimum width buffer, then num_stage bigger *
* ones. */
num_stage = vtr::nint(log10(R_minW_nmos / Rbuf) / log10(4.));
num_stage = std::max(num_stage, 1);
stage_ratio = pow((float)(R_minW_nmos / Rbuf), (float)(1. / (float)num_stage));
Rstage = R_minW_nmos;
trans_count = 0.;
for (istage = 0; istage <= num_stage; istage++) {
trans_count += trans_per_R(Rstage, R_minW_nmos)
+ trans_per_R(Rstage, R_minW_pmos);
Rstage /= stage_ratio;
}
}
return (trans_count);
}
static float trans_per_mux(int num_inputs, float trans_sram_bit, float pass_trans_area) {
/* Returns the number of transistors needed to build a pass transistor mux. *
* DOES NOT include input buffers or any output buffer. *
* Attempts to select a smart multiplexer structure depending on the number *
* of inputs: for multiplexers with 4 or fewer inputs a single level is *
* used; larger multiplexers use two levels. */
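/* Worked example (assumed values): num_inputs = 16 gives num_second_stage_trans = 4,
 * pass_trans = (16 + 4) * pass_trans_area and
 * sram_trans = (ceil(16 / 4) + 4) * trans_sram_bit = 8 * trans_sram_bit. */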
float ntrans, sram_trans, pass_trans;
int num_second_stage_trans;
if (num_inputs <= 1) {
return (0);
} else if (num_inputs == 2) {
pass_trans = 2 * pass_trans_area;
sram_trans = 1 * trans_sram_bit;
} else if (num_inputs <= 4) {
/* One-hot encoding */
pass_trans = num_inputs * pass_trans_area;
sram_trans = num_inputs * trans_sram_bit;
} else {
/* This is a large multiplexer, so design it as a two-level multiplexer. *
 * The + 0.00001 ensures that exact square roots (e.g. sqrt(16) = 4) are *
 * not rounded down to the next lower integer. */
num_second_stage_trans = (int)floor((float)sqrt((float)num_inputs) + 0.00001);
pass_trans = (num_inputs + num_second_stage_trans) * pass_trans_area;
sram_trans = (ceil((float)num_inputs / num_second_stage_trans - 0.00001)
+ num_second_stage_trans)
* trans_sram_bit;
if (num_second_stage_trans == 2) {
/* Can use one bit instead of a two-bit one-hot encoding for the second stage */
/* Eliminates one sram bit counted earlier */
sram_trans -= 1 * trans_sram_bit;
}
}
ntrans = pass_trans + sram_trans;
return (ntrans);
}
static float trans_per_R(float Rtrans, float R_minW_trans) {
/* Returns the number of minimum width transistor area equivalents needed *
* to make a transistor with Rtrans, given that the resistance of a minimum *
* width transistor of this type is R_minW_trans. */
float trans_area;
if (Rtrans <= 0.) {
/* Assume resistances are nonsense -- use min. width */
VTR_LOG_WARN("Sized nonsensical R=%g transistor to minimum width\n", Rtrans);
return (1.);
}
if (Rtrans >= R_minW_trans) {
return (1.);
}
/* Old area model (developed with 0.35um process rules) */
/* Area = minimum width area (1) + 0.5 for each additional unit of width. *
* The 50% factor takes into account the "overlapping" that occurs in *
* horizontally-paralleled transistors, and the need for only one spacing, *
* not two (i.e. two min W transistors need two spaces; a 2W transistor *
* needs only 1). */
/* New area model (developed with 65nm process rules) */
/* These more advanced process rules change how much area we need to add *
* for each additional unit of width vs. the old area model. */
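/* Worked example (assumed drive_strength = R_minW_trans / Rtrans = 4):
 *   AREA_ORIGINAL:           0.5 * 4 + 0.5                       = 2.500
 *   AREA_IMPROVED_NMOS_ONLY: 0.447 + 0.128 * 4 + 0.391 * sqrt(4) = 1.741
 *   AREA_IMPROVED_MIXED:     0.518 + 0.127 * 4 + 0.428 * sqrt(4) = 1.882 */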
float drive_strength = R_minW_trans / Rtrans;
if (trans_area_eq == AREA_ORIGINAL) {
/* Old transistor area estimation equation */
trans_area = 0.5 * drive_strength + 0.5;
} else if (trans_area_eq == AREA_IMPROVED_NMOS_ONLY) {
/* New transistor area estimation equation. Here only NMOS transistors
* are taken into account */
trans_area = 0.447 + 0.128 * drive_strength + 0.391 * sqrt(drive_strength);
} else if (trans_area_eq == AREA_IMPROVED_MIXED) {
/* New transistor area estimation equation. Here both NMOS and PMOS
* transistors are taken into account (extra spacing needed for N-wells) */
trans_area = 0.518 + 0.127 * drive_strength + 0.428 * sqrt(drive_strength);
} else {
VPR_FATAL_ERROR(VPR_ERROR_ROUTE, "Unrecognized transistor area model: %d\n", (int)trans_area_eq);
}
return (trans_area);
}