| <!-- |
| This is the architecture file for the Extra Carry Chain Architecture proposed in [1]. |
| Delays for routing and logic blocks come from COFFE runs for a 20 nm technology node. |
| Delays for DSP blocks and BRAMs come from Arria 10 (22 nm) delays. |
| |
| This architecture is Stratix-10 like architecture with a modified arithmetic mode |
| |
| This architecture has 10 ALMs per cluster, where each ALM is a 6-LUT fracturable into |
| two 5-LUTs. The ALM has 8 inputs and 4 optionally registered outputs. The two 5-LUTs should |
| share at least two inputs. Each two ALM outputs are logically equivalent, which means any |
| output signal that can reach ALM.out[0] can reach ALM.out[1] and the same thing for |
| ALM.out[2] and ALM.out[3]. The ALMs in this architecture have an arithmetic mode |
| where each 5-LUT is fractured into two 4-LUTs. This results in a total of four 4-LUTs per ALM. |
| This architecture has two carry chains per ALM (four adders), where the output of the first carry chain feeds |
| one of the inputs of the second carry chain. This structure allows this architecture to implement |
| adder trees and 3:1 addition reduction operations more efficiently. |
| |
| The LAB has 60 inputs and 40 outputs. Two outputs of each ALM are fed to the right and |
| left LAB using direct links and are also fed back to the LAB as feedback connections sharing |
| the 60 input ports with the signals coming from the routing channels. |
| |
| The architecture also has a 20Kb memory that has true and simple dual port modes. In simple |
| dual port mode the memory can be configured in the following modes: 512x40, 1024x20 and 2048x10, |
| while in true dual port mode it can be configured as: 1024x20 and 2048x10. |
| |
| In addition, the architecture has a 27x27 DSP block that can be fractured into two 18x19 DSPs. |
| |
| |
| [1] M. Eldafrawy, A. Boutros, S. Yazdanshenas, and V. Betz, "FPGA Logic Block Architectures for efficient |
| multiplication and addition to enhance machine learning performance," in Transactions on Reconfigurable |
| Technology and Systems (TRETS), 2019 |
| |
| --> |
| <architecture> |
| <!-- |
| ODIN II specific config begins |
| Describes the types of user-specified netlist blocks (in blif, this corresponds to |
| ".model [type_of_block]") that this architecture supports. |
| |
| Note: Basic LUTs, I/Os, and flip-flops are not included here as there are |
| already special structures in blif (.names, .input, .output, and .latch) |
| that describe them. |
| --> |
| <models> |
| <!-- Combinational multiplier: both operands "a" and "b" have a combinational path to "out" --> |
| <model name="multiply"> |
| <input_ports> |
| <port name="a" combinational_sink_ports="out"/> |
| <port name="b" combinational_sink_ports="out"/> |
| </input_ports> |
| <output_ports> |
| <port name="out"/> |
| </output_ports> |
| </model> |
| |
| <!-- Single-port RAM: every non-clock port is declared sequential with respect to "clk" --> |
| <model name="single_port_ram"> |
| <input_ports> |
| <port name="we" clock="clk"/> <!-- control --> |
| <port name="addr" clock="clk"/> <!-- address lines --> |
| <port name="data" clock="clk"/> <!-- data lines can be broken down into smaller bit widths minimum size 1 --> |
| <port name="clk" is_clock="1"/> <!-- memories are often clocked --> |
| </input_ports> |
| <output_ports> |
| <port name="out" clock="clk"/> <!-- output can be broken down into smaller bit widths minimum size 1 --> |
| </output_ports> |
| </model> |
| |
| <!-- Dual-port RAM: two sets of we/addr/data/out ports (suffix 1 and 2), all sequential with respect to the single "clk" --> |
| <model name="dual_port_ram"> |
| <input_ports> |
| <port name="we1" clock="clk"/> <!-- write enable --> |
| <port name="we2" clock="clk"/> <!-- write enable --> |
| <port name="addr1" clock="clk"/> <!-- address lines --> |
| <port name="addr2" clock="clk"/> <!-- address lines --> |
| <port name="data1" clock="clk"/> <!-- data lines can be broken down into smaller bit widths minimum size 1 --> |
| <port name="data2" clock="clk"/> <!-- data lines can be broken down into smaller bit widths minimum size 1 --> |
| <port name="clk" is_clock="1"/> <!-- memories are often clocked --> |
| </input_ports> |
| <output_ports> |
| <port name="out1" clock="clk"/> <!-- output can be broken down into smaller bit widths minimum size 1 --> |
| <port name="out2" clock="clk"/> <!-- output can be broken down into smaller bit widths minimum size 1 --> |
| </output_ports> |
| </model> |
| |
| <!-- One-bit full adder: "a", "b" and "cin" each have a combinational path to both "sumout" and "cout" --> |
| <model name="adder"> |
| <input_ports> |
| <port name="a" combinational_sink_ports="sumout cout"/> |
| <port name="b" combinational_sink_ports="sumout cout"/> |
| <port name="cin" combinational_sink_ports="sumout cout"/> |
| </input_ports> |
| <output_ports> |
| <port name="cout"/> |
| <port name="sumout"/> |
| </output_ports> |
| </model> |
| </models> <!-- ODIN II specific config ends --> |
| |
| <layout> <!-- Physical descriptions begin --> |
| <!-- The grid is sized automatically with a 1:1 aspect ratio. Where several rules cover the |
| same location, the rule with the higher priority is the one applied: perimeter/corners (100/101) |
| override the hard block columns (20), which override their EMPTY fallbacks (19), which override |
| the clb fill (10). --> |
| <auto_layout aspect_ratio="1.0"> |
| <!--Perimeter of 'io' blocks with 'EMPTY' blocks at corners--> |
| <perimeter type="io" priority="100"/> |
| <corners type="EMPTY" priority="101"/> |
| <!--Fill with 'clb'--> |
| <fill type="clb" priority="10"/> |
| <!--Column of 'mult_27' with 'EMPTY' blocks wherever a 'mult_27' does not fit. Vertical offset by 1 for perimeter. |
| Columns start at x=6 and repeat every 8 columns.--> |
| <col type="mult_27" startx="6" starty="1" repeatx="8" priority="20"/> |
| <col type="EMPTY" startx="6" repeatx="8" starty="1" priority="19"/> |
| <!--Column of 'memory' with 'EMPTY' blocks wherever a 'memory' does not fit. Vertical offset by 1 for perimeter. |
| Columns start at x=2 and repeat every 8 columns, so they never coincide with the 'mult_27' columns.--> |
| <col type="memory" startx="2" starty="1" repeatx="8" priority="20"/> |
| <col type="EMPTY" startx="2" repeatx="8" starty="1" priority="19"/> |
| </auto_layout> |
| </layout> |
| |
| <device> |
| <!-- Device-wide electrical and area parameters --> |
| <sizing R_minW_nmos="13090" R_minW_pmos="19086.83"/> |
| <area grid_logic_tile_area="25241.08"/> |
| <!-- Routing channels have the same (uniform) width in both the x and y directions --> |
| <chan_width_distr> |
| <x distr="uniform" peak="1.000000"/> |
| <y distr="uniform" peak="1.000000"/> |
| </chan_width_distr> |
| <!-- Wilton switch block topology with Fs = 3 --> |
| <switch_block type="wilton" fs="3"/> |
| <!-- Connection block input pins use the switch named "ipin_cblock" from the switchlist --> |
| <connection_block input_switch_name="ipin_cblock"/> |
| </device> |
| |
| <switchlist> |
| <!-- Both switches are muxes with R, Cin and Cout set to zero, so their delay is given entirely by Tdel. |
| Switch "0" is the routing mux referenced by the wire segment; "ipin_cblock" is the connection block input mux. --> |
| <switch type="mux" name="0" R="0.0" Cin="0.0" Cout="0.0" Tdel="235.2e-12" mux_trans_size="2.173" buf_size="36.6"/> |
| <switch type="mux" name="ipin_cblock" R="0.0" Cout="0.0" Cin="0.0" Tdel="146e-12" mux_trans_size="1.508" buf_size="11.525"/> |
| </switchlist> |
| |
| <segmentlist> |
| <!-- Single wire type: unidirectional length-4 segments driven by switch "0", with zero metal R and C. |
| The sb pattern has length + 1 = 5 entries and the cb pattern has length = 4 entries; all are 1, |
| so switch blocks and connection blocks are present at every position along the wire. --> |
| <segment freq="1.000000" length="4" type="unidir" Rmetal="0.0" Cmetal="0.0"> |
| <mux name="0"/> |
| <sb type="pattern">1 1 1 1 1</sb> |
| <cb type="pattern">1 1 1 1</cb> |
| </segment> |
| </segmentlist> |
| |
| <directlist> |
| <!-- Dedicated carry links, one per carry chain: cout[i] of a clb drives cin[i] of the clb at y_offset -1 --> |
| <direct name="adder_carry1" from_pin="clb.cout[0:0]" to_pin="clb.cin[0:0]" x_offset="0" y_offset="-1" z_offset="0"/> |
| <direct name="adder_carry2" from_pin="clb.cout[1:1]" to_pin="clb.cin[1:1]" x_offset="0" y_offset="-1" z_offset="0"/> |
| |
| <!-- Direct connect to left and right LAB. |
| Outputs O[4:0], O[24:20], O[9:5] and O[29:25] drive pins [9:5] of I1, I2, I3 and I4 of the clb at x_offset +1; |
| outputs O[14:10], O[34:30], O[19:15] and O[39:35] drive pins [14:10] of I1, I2, I3 and I4 of the clb at x_offset -1. |
| Pins [4:0] of each input group are not the target of any direct link. --> |
| <direct name="direct_right_1" from_pin="clb.O[4:0]" to_pin="clb.I1[9:5]" x_offset="1" y_offset="0" z_offset="0"/> |
| <direct name="direct_right_2" from_pin="clb.O[24:20]" to_pin="clb.I2[9:5]" x_offset="1" y_offset="0" z_offset="0"/> |
| <direct name="direct_right_3" from_pin="clb.O[9:5]" to_pin="clb.I3[9:5]" x_offset="1" y_offset="0" z_offset="0"/> |
| <direct name="direct_right_4" from_pin="clb.O[29:25]" to_pin="clb.I4[9:5]" x_offset="1" y_offset="0" z_offset="0"/> |
| |
| <direct name="direct_left_1" from_pin="clb.O[14:10]" to_pin="clb.I1[14:10]" x_offset="-1" y_offset="0" z_offset="0"/> |
| <direct name="direct_left_2" from_pin="clb.O[34:30]" to_pin="clb.I2[14:10]" x_offset="-1" y_offset="0" z_offset="0"/> |
| <direct name="direct_left_3" from_pin="clb.O[19:15]" to_pin="clb.I3[14:10]" x_offset="-1" y_offset="0" z_offset="0"/> |
| <direct name="direct_left_4" from_pin="clb.O[39:35]" to_pin="clb.I4[14:10]" x_offset="-1" y_offset="0" z_offset="0"/> |
| </directlist> |
| |
| <complexblocklist> |
| |
| <!-- Define I/O pads begin --> |
| <!-- Capacity is a unique property of I/Os, it is the maximum number of I/Os that can be placed at the same (X,Y) location on the FPGA --> |
| <!-- Not sure of the area of an I/O (varies widely), and it's not relevant to the design of the FPGA core, so we're setting it to 0. --> |
| <pb_type name="io" capacity="8" area="0"> |
| <!-- Port names follow the pad they serve: "outpad" is the input of this block (it feeds the .output pad) |
| and "inpad" is the output of this block (it is driven by the .input pad). --> |
| <input name="outpad" num_pins="1"/> |
| <output name="inpad" num_pins="1"/> |
| <clock name="clock" num_pins="1"/> |
| |
| <!-- IOs can operate as either inputs or outputs. |
| Delays below come from Ian Kuon. They are small, so they should be interpreted as |
| the delays to and from registers in the I/O (generally I/Os are registered |
| today, and that is when you timing-analyze them). |
| --> |
| <mode name="inpad"> |
| <pb_type name="inpad" blif_model=".input" num_pb="1"> |
| <output name="inpad" num_pins="1"/> |
| </pb_type> |
| <interconnect> |
| <direct name="inpad" input="inpad.inpad" output="io.inpad"> |
| <delay_constant max="4.243e-11" in_port="inpad.inpad" out_port="io.inpad"/> |
| </direct> |
| </interconnect> |
| </mode> |
| <mode name="outpad"> |
| <pb_type name="outpad" blif_model=".output" num_pb="1"> |
| <input name="outpad" num_pins="1"/> |
| </pb_type> |
| <interconnect> |
| <direct name="outpad" input="io.outpad" output="outpad.outpad"> |
| <delay_constant max="1.394e-11" in_port="io.outpad" out_port="outpad.outpad"/> |
| </direct> |
| </interconnect> |
| </mode> |
| |
| <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin drives 10% of the tracks in a channel --> |
| <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/> |
| |
| <!-- IOs go on the periphery of the FPGA, for consistency, |
| make it physically equivalent on all sides so that only one definition of I/Os is needed. |
| If I do not make a physically equivalent definition, then I need to define 4 different I/Os, one for each side of the FPGA |
| --> |
| <pinlocations pattern="custom"> |
| <loc side="left">io.outpad io.inpad io.clock</loc> |
| <loc side="top">io.outpad io.inpad io.clock</loc> |
| <loc side="right">io.outpad io.inpad io.clock</loc> |
| <loc side="bottom">io.outpad io.inpad io.clock</loc> |
| </pinlocations> |
| |
| <!-- Power estimation is ignored for the I/O pads --> |
| <power method="ignore"/> |
| </pb_type> |
| <!-- Define I/O pads ends --> |
| |
| <!-- Define general purpose logic block (CLB) begin --> |
| <pb_type name="clb"> |
| <input name="I1" num_pins="15" equivalent="full"/> |
| <input name="I2" num_pins="15" equivalent="full"/> |
| <input name="I3" num_pins="15" equivalent="full"/> |
| <input name="I4" num_pins="15" equivalent="full"/> |
| <input name="cin" num_pins="2"/> |
| <output name="O" num_pins="40" equivalent="none"/> |
| <output name="cout" num_pins="2"/> |
| <clock name="clk" num_pins="1"/> |
| <pb_type name="lab" num_pb="1"> |
| <input name="I1" num_pins="15"/> |
| <input name="I2" num_pins="15"/> |
| <input name="I3" num_pins="15"/> |
| <input name="I4" num_pins="15"/> |
| <input name="cin" num_pins="2"/> |
| <output name="O" num_pins="40"/> |
| <output name="cout" num_pins="2"/> |
| <clock name="clk" num_pins="1"/> |
| <!-- Describe fracturable logic element. |
| Each fracturable logic element has a 6-LUT that can alternatively operate as two 5-LUTs with shared inputs. |
| The outputs of the fracturable logic element can be optionally registered |
| --> |
| <pb_type name="fle" num_pb="10"> |
| <input name="in" num_pins="8"/> |
| <input name="cin" num_pins="2"/> |
| <output name="out" num_pins="4"/> |
| <output name="cout" num_pins="2"/> |
| <clock name="clk" num_pins="1"/> |
| <!-- |
| The ALM inputs are as follows: |
| A -> fle.in[0] |
| B -> fle.in[1] |
| C -> fle.in[2] |
| D -> fle.in[3] |
| E -> fle.in[4] |
| F -> fle.in[5] |
| G -> fle.in[6] |
| H -> fle.in[7] |
| --> |
| <mode name="n2_lut5"> |
| <pb_type name="ble5" num_pb="2"> |
| <input name="in" num_pins="5"/> |
| <input name="cin" num_pins="2"/> |
| <output name="out" num_pins="2"/> |
| <output name="cout" num_pins="2"/> |
| <clock name="clk" num_pins="1"/> |
| <mode name="blut5"> |
| <pb_type name="flut5" num_pb="1"> |
| <input name="in" num_pins="5"/> |
| <output name="out" num_pins="2"/> |
| <clock name="clk" num_pins="1"/> |
| <!-- Regular LUT mode --> |
| <pb_type name="lut5" blif_model=".names" num_pb="1" class="lut"> |
| <input name="in" num_pins="5" port_class="lut_in"/> |
| <output name="out" num_pins="1" port_class="lut_out"/> |
| <!-- LUT timing using delay matrix --> |
| <!-- These are the physical delay inputs on an Extra CC LUT but because VPR cannot do LUT rebalancing, |
| we instead take the average of these numbers to get more stable results |
| note that those are the same delays for inputs A - E as the ones used for the 6-LUT, however, we have |
| subtracted the delay of the last mux stage to get the delay of inputs A - E till the 5-LUT output |
| 208.91e-12 |
| 207.4e-12 |
| 143.94e-12 |
| 136.69e-12 |
| 77.06e-12 |
| (average: 154.8e-12, the value used for every input in the matrix below) |
| --> |
| <delay_matrix type="max" in_port="lut5.in" out_port="lut5.out"> |
| 154.8e-12 |
| 154.8e-12 |
| 154.8e-12 |
| 154.8e-12 |
| 154.8e-12 |
| </delay_matrix> |
| </pb_type> |
| <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop"> |
| <input name="D" num_pins="1" port_class="D"/> |
| <output name="Q" num_pins="1" port_class="Q"/> |
| <clock name="clk" num_pins="1" port_class="clock"/> |
| <T_setup value="18.91e-12" port="ff.D" clock="clk"/> |
| <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/> |
| </pb_type> |
| <interconnect> |
| <direct name="lut5_in" input="flut5.in" output="lut5.in"/> |
| <direct name="reg_in" input="flut5.in[0]" output="ff[0].D"/> |
| <direct name="lut5_ff" input="lut5.out" output="ff[1].D"> |
| <delay_constant max="16.45e-12" in_port="lut5.out" out_port="ff[1].D"/> |
| <pack_pattern name="ble5" in_port="lut5.out" out_port="ff[1].D"/> |
| </direct> |
| <complete name="clock" input="flut5.clk" output="ff.clk"/> |
| <complete name="out_mux" input="ff.Q lut5.out" output="flut5.out"> |
| <delay_constant max="39.78e-12" in_port="lut5.out" out_port="flut5.out"/> |
| <delay_constant max="39.78e-12" in_port="ff.Q" out_port="flut5.out"/> |
| </complete> |
| </interconnect> |
| </pb_type> |
| <interconnect> |
| <direct name="direct1" input="ble5.in" output="flut5.in"/> |
| <direct name="direct2" input="ble5.clk" output="flut5.clk"/> |
| <direct name="direct3" input="flut5.out" output="ble5.out"/> |
| </interconnect> |
| </mode> |
| <mode name="arithmetic_1chain"> |
| <pb_type name="arithmetic" num_pb="1"> |
| <input name="in" num_pins="5"/> |
| <input name="cin" num_pins="2"/> |
| <output name="out" num_pins="2"/> |
| <output name="cout" num_pins="2"/> |
| <clock name="clk" num_pins="1"/> |
| <!-- Special dual-LUT mode that drives adder only --> |
| <pb_type name="lut4" blif_model=".names" num_pb="2" class="lut"> |
| <input name="in" num_pins="4" port_class="lut_in"/> |
| <output name="out" num_pins="1" port_class="lut_out"/> |
| <!-- LUT timing using delay matrix --> |
| <!-- These are the physical delay inputs on an Extra CC LUT but because VPR cannot do LUT rebalancing, |
| we instead take the average of these numbers to get more stable results |
| note that those are the same delays for inputs A - D as the ones used for the 6-LUT, however, we have |
| subtracted the delay of the later mux stages to get the delay of inputs A - D till the 4-LUT output |
| 165.14e-12 |
| 163.63e-12 |
| 100.17e-12 |
| 92.92e-12 |
| NOTE(review): the four values above average to 130.47e-12, while the matrix below uses 131.72e-12; |
| confirm whether the difference is intentional for this mode. |
| --> |
| <delay_matrix type="max" in_port="lut4.in" out_port="lut4.out"> |
| 131.72e-12 |
| 131.72e-12 |
| 131.72e-12 |
| 131.72e-12 |
| </delay_matrix> |
| </pb_type> |
| <pb_type name="adder" blif_model=".subckt adder" num_pb="2"> |
| <input name="a" num_pins="1"/> |
| <input name="b" num_pins="1"/> |
| <input name="cin" num_pins="1"/> |
| <output name="cout" num_pins="1"/> |
| <output name="sumout" num_pins="1"/> |
| <delay_constant max="71e-12" in_port="adder.a" out_port="adder.sumout"/> |
| <delay_constant max="71e-12" in_port="adder.b" out_port="adder.sumout"/> |
| <delay_constant max="35.06e-12" in_port="adder.cin" out_port="adder.sumout"/> |
| <delay_constant max="49.79e-12" in_port="adder.a" out_port="adder.cout"/> |
| <delay_constant max="49.79e-12" in_port="adder.b" out_port="adder.cout"/> |
| <delay_constant max="25.61e-12" in_port="adder.cin" out_port="adder.cout"/> |
| </pb_type> |
| <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop"> |
| <input name="D" num_pins="1" port_class="D"/> |
| <output name="Q" num_pins="1" port_class="Q"/> |
| <clock name="clk" num_pins="1" port_class="clock"/> |
| <T_setup value="18.91e-12" port="ff.D" clock="clk"/> |
| <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/> |
| </pb_type> |
| <interconnect> |
| <complete name="clock" input="arithmetic.clk" output="ff.clk"/> |
| <direct name="lut4_in1" input="arithmetic.in[3:0]" output="lut4[0].in"/> |
| <direct name="lut4_in2" input="arithmetic.in[3:0]" output="lut4[1].in"/> |
| <direct name="lut_to_add1" input="lut4[0:0].out" output="adder[0].a"> |
| <pack_pattern name="lut_chain" in_port="lut4[0:0].out" out_port="adder[0].a"/> |
| </direct> |
| <direct name="lut_to_add2" input="lut4[1:1].out" output="adder[0].b"> |
| <pack_pattern name="lut_chain" in_port="lut4[1:1].out" out_port="adder[0].b"/> |
| </direct> |
| <direct name="add_to_ff1" input="adder.sumout" output="ff.D"> |
| <delay_constant max="16.45e-12" in_port="adder.sumout" out_port="ff.D"/> |
| <!--pack_pattern name="chain" in_port="adder[1].sumout" out_port="ff.D"/--> |
| </direct> |
| <direct name="carry_in1" input="arithmetic.cin[0]" output="adder[0].cin"> |
| <pack_pattern name="chain" in_port="arithmetic.cin[0]" out_port="adder[0].cin"/> |
| <pack_pattern name="lut_chain" in_port="arithmetic.cin[0]" out_port="adder[0].cin"/> |
| </direct> |
| <direct name="carry_out1" input="adder[0].cout" output="arithmetic.cout[0]"> |
| <pack_pattern name="chain" in_port="adder[0].cout" out_port="arithmetic.cout[0]"/> |
| <pack_pattern name="lut_chain" in_port="adder[0].cout" out_port="arithmetic.cout[0]"/> |
| </direct> |
| <direct name="carry_in2" input="arithmetic.cin[1]" output="adder[1].cin"> |
| <pack_pattern name="chain" in_port="arithmetic.cin[1]" out_port="adder[1].cin"/> |
| <pack_pattern name="lut_chain" in_port="arithmetic.cin[1]" out_port="adder[1].cin"/> |
| </direct> |
| <direct name="carry_out2" input="adder[1].cout" output="arithmetic.cout[1]"> |
| <pack_pattern name="chain" in_port="adder[1].cout" out_port="arithmetic.cout[1]"/> |
| <pack_pattern name="lut_chain" in_port="adder[1].cout" out_port="arithmetic.cout[1]"/> |
| </direct> |
| <!-- the output of this connection should be adder[1].a only, however, a complete cross bar |
| is used since the packer is not aware that the adder inputs are logically equivalent --> |
| <!--complete name="input_to_add" input="arithmetic.in[4]" output="adder[1].a adder[1].b"/--> |
| <!-- the output of this connection should be adder[1].b only, however, a complete cross bar |
| is used since the packer is not aware that the adder inputs are logically equivalent --> |
| <complete name="add2_input" input="arithmetic.in[4] adder[0].sumout arithmetic.in[0]" output="adder[1].a adder[1].b"> |
| <pack_pattern name="chain" in_port="adder[0].sumout" out_port="adder[1].b"/> |
| <pack_pattern name="lut_chain" in_port="adder[0].sumout" out_port="adder[1].b"/> |
| </complete> |
| <complete name="sumout" input="ff.Q adder.sumout" output="arithmetic.out"> |
| <delay_constant max="39.78e-12" in_port="adder.sumout" out_port="arithmetic.out"/> |
| <delay_constant max="39.78e-12" in_port="ff.Q" out_port="arithmetic.out"/> |
| </complete> |
| </interconnect> |
| </pb_type> |
| <interconnect> |
| <direct name="direct1" input="ble5.in" output="arithmetic.in"/> |
| <direct name="carry_in" input="ble5.cin" output="arithmetic.cin"> |
| <pack_pattern name="chain" in_port="ble5.cin" out_port="arithmetic.cin"/> |
| <pack_pattern name="lut_chain" in_port="ble5.cin" out_port="arithmetic.cin"/> |
| </direct> |
| <direct name="carry_out" input="arithmetic.cout" output="ble5.cout"> |
| <pack_pattern name="chain" in_port="arithmetic.cout" out_port="ble5.cout"/> |
| <pack_pattern name="lut_chain" in_port="arithmetic.cout" out_port="ble5.cout"/> |
| </direct> |
| <direct name="direct2" input="ble5.clk" output="arithmetic.clk"/> |
| <direct name="direct3" input="arithmetic.out" output="ble5.out"/> |
| </interconnect> |
| </mode> |
| <mode name="arithmetic_2chains"> |
| <pb_type name="arithmetic" num_pb="1"> |
| <input name="in" num_pins="5"/> |
| <input name="cin" num_pins="2"/> |
| <output name="out" num_pins="2"/> |
| <output name="cout" num_pins="2"/> |
| <clock name="clk" num_pins="1"/> |
| <!-- Special dual-LUT mode that drives adder only --> |
| <pb_type name="lut4" blif_model=".names" num_pb="2" class="lut"> |
| <input name="in" num_pins="4" port_class="lut_in"/> |
| <output name="out" num_pins="1" port_class="lut_out"/> |
| <!-- LUT timing using delay matrix --> |
| <!-- These are the physical delay inputs on a Stratix 10 LUT but because VPR cannot do LUT rebalancing, |
| we instead take the average of these numbers to get more stable results |
| note that those are the same delays for inputs A - D as the ones used for the 6-LUT, however, we have |
| subtracted the delay of the later mux stages to get the delay of inputs A - D till the 4-LUT output |
| 165.14e-12 |
| 163.63e-12 |
| 100.17e-12 |
| 92.92e-12 |
| (average: 130.47e-12, the value used for every input in the matrix below) |
| --> |
| <delay_matrix type="max" in_port="lut4.in" out_port="lut4.out"> |
| 130.47e-12 |
| 130.47e-12 |
| 130.47e-12 |
| 130.47e-12 |
| </delay_matrix> |
| </pb_type> |
| <pb_type name="adder" blif_model=".subckt adder" num_pb="2"> |
| <input name="a" num_pins="1"/> |
| <input name="b" num_pins="1"/> |
| <input name="cin" num_pins="1"/> |
| <output name="cout" num_pins="1"/> |
| <output name="sumout" num_pins="1"/> |
| <delay_constant max="71.95e-12" in_port="adder.a" out_port="adder.sumout"/> |
| <delay_constant max="71.95e-12" in_port="adder.b" out_port="adder.sumout"/> |
| <delay_constant max="37.55e-12" in_port="adder.cin" out_port="adder.sumout"/> |
| <delay_constant max="49.31e-12" in_port="adder.a" out_port="adder.cout"/> |
| <delay_constant max="49.31e-12" in_port="adder.b" out_port="adder.cout"/> |
| <delay_constant max="25.61e-12" in_port="adder.cin" out_port="adder.cout"/> |
| </pb_type> |
| <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop"> |
| <input name="D" num_pins="1" port_class="D"/> |
| <output name="Q" num_pins="1" port_class="Q"/> |
| <clock name="clk" num_pins="1" port_class="clock"/> |
| <T_setup value="18.91e-12" port="ff.D" clock="clk"/> |
| <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/> |
| </pb_type> |
| <interconnect> |
| <complete name="clock" input="arithmetic.clk" output="ff.clk"/> |
| <direct name="lut4_in1" input="arithmetic.in[3:0]" output="lut4[0].in"/> |
| <direct name="lut4_in2" input="arithmetic.in[3:0]" output="lut4[1].in"/> |
| <direct name="lut_to_add1" input="lut4[0:0].out" output="adder[0].a"> |
| <pack_pattern name="simple_lut_chain" in_port="lut4[0:0].out" out_port="adder[0].a"/> |
| </direct> |
| <direct name="lut_to_add2" input="lut4[1:1].out" output="adder[0].b"> |
| <pack_pattern name="simple_lut_chain" in_port="lut4[1:1].out" out_port="adder[0].b"/> |
| </direct> |
| <direct name="add_to_ff1" input="adder[0].sumout" output="ff[0].D"> |
| <delay_constant max="16.45e-12" in_port="adder[0].sumout" out_port="ff[0].D"/> |
| <!--pack_pattern name="simple_chain" in_port="adder[0].sumout" out_port="ff[0].D"/--> |
| </direct> |
| <direct name="add_to_ff2" input="adder[1].sumout" output="ff[1].D"> |
| <delay_constant max="16.45e-12" in_port="adder[1].sumout" out_port="ff[1].D"/> |
| <!--pack_pattern name="simple_chain" in_port="adder[1].sumout" out_port="ff[1].D"/--> |
| </direct> |
| <direct name="carry_in1" input="arithmetic.cin[0]" output="adder[0].cin"> |
| <pack_pattern name="simple_chain" in_port="arithmetic.cin[0]" out_port="adder[0].cin"/> |
| <pack_pattern name="simple_lut_chain" in_port="arithmetic.cin[0]" out_port="adder[0].cin"/> |
| </direct> |
| <direct name="carry_out1" input="adder[0].cout" output="arithmetic.cout[0]"> |
| <pack_pattern name="simple_chain" in_port="adder[0].cout" out_port="arithmetic.cout[0]"/> |
| <pack_pattern name="simple_lut_chain" in_port="adder[0].cout" out_port="arithmetic.cout[0]"/> |
| </direct> |
| <direct name="carry_in2" input="arithmetic.cin[1]" output="adder[1].cin"> |
| <pack_pattern name="simple_chain" in_port="arithmetic.cin[1]" out_port="adder[1].cin"/> |
| </direct> |
| <direct name="carry_out2" input="adder[1].cout" output="arithmetic.cout[1]"> |
| <pack_pattern name="simple_chain" in_port="adder[1].cout" out_port="arithmetic.cout[1]"/> |
| </direct> |
| <!-- the output of this connection should be adder[1].a only, however, a complete cross bar |
| is used since the packer is not aware that the adder inputs are logically equivalent --> |
| <!--complete name="input_to_add" input="arithmetic.in[4]" output="adder[1].a adder[1].b"/--> |
| <!-- the output of this connection should be adder[1].b only, however, a complete cross bar |
| is used since the packer is not aware that the adder inputs are logically equivalent --> |
| <complete name="add2_input" input="lut4[0].out arithmetic.in[4]" output="adder[1].a adder[1].b"/> |
| <complete name="sumout" input="ff.Q adder.sumout" output="arithmetic.out"> |
| <delay_constant max="39.78e-12" in_port="adder.sumout" out_port="arithmetic.out"/> |
| <delay_constant max="39.78e-12" in_port="ff.Q" out_port="arithmetic.out"/> |
| </complete> |
| </interconnect> |
| </pb_type> |
| <interconnect> |
| <direct name="direct1" input="ble5.in" output="arithmetic.in"/> |
| <direct name="carry_in" input="ble5.cin" output="arithmetic.cin"> |
| <pack_pattern name="simple_chain" in_port="ble5.cin" out_port="arithmetic.cin"/> |
| <pack_pattern name="simple_lut_chain" in_port="ble5.cin[0]" out_port="arithmetic.cin[0]"/> |
| </direct> |
| <direct name="carry_out" input="arithmetic.cout" output="ble5.cout"> |
| <pack_pattern name="simple_chain" in_port="arithmetic.cout" out_port="ble5.cout"/> |
| <pack_pattern name="simple_lut_chain" in_port="arithmetic.cout[0]" out_port="ble5.cout[0]"/> |
| </direct> |
| <direct name="direct2" input="ble5.clk" output="arithmetic.clk"/> |
| <direct name="direct3" input="arithmetic.out" output="ble5.out"/> |
| </interconnect> |
| </mode> |
| </pb_type> |
| <interconnect> |
| <!-- Connects the fle ports to the two ble5 instances of the n2_lut5 mode --> |
| <!-- Shared inputs between the two 5-LUTs: fle.in[0] and fle.in[1] reach both ble5 instances, on swapped pin positions --> |
| <complete name="lut5_reg1" input="fle.in[0]" output="ble5[0].in[0] ble5[1].in[1]"/> |
| <complete name="lut5_reg2" input="fle.in[1]" output="ble5[0].in[1] ble5[1].in[0]"/> |
| |
| <!-- Rest of the 5-LUT inputs: fle.in[4:2] are private to ble5[0], fle.in[7:5] are private to ble5[1] --> |
| <direct name="lut5_inputs_1" input="fle.in[4:2]" output="ble5[0].in[4:2]"/> |
| <direct name="lut5_inputs_22" input="fle.in[7:5]" output="ble5[1].in[4:2]"/> |
| |
| <direct name="lut5_outputs_1" input="ble5[0].out" output="fle.out[1:0]"/> |
| <direct name="lut5_outputs_2" input="ble5[1].out" output="fle.out[3:2]"/> |
| |
| <!-- Carry path through the ALM: fle.cin -> ble5[0] -> ble5[1] -> fle.cout. |
| Each hop carries all four carry pack patterns; "simple_lut_chain" is annotated on pin 0 only. --> |
| <direct name="carry_in" input="fle.cin" output="ble5[0].cin"> |
| <pack_pattern name="chain" in_port="fle.cin" out_port="ble5[0].cin"/> |
| <pack_pattern name="lut_chain" in_port="fle.cin" out_port="ble5[0].cin"/> |
| <pack_pattern name="simple_chain" in_port="fle.cin" out_port="ble5[0].cin"/> |
| <pack_pattern name="simple_lut_chain" in_port="fle.cin[0]" out_port="ble5[0].cin[0]"/> |
| </direct> |
| <direct name="carry_out" input="ble5[1].cout" output="fle.cout"> |
| <pack_pattern name="chain" in_port="ble5[1].cout" out_port="fle.cout"/> |
| <pack_pattern name="lut_chain" in_port="ble5[1].cout" out_port="fle.cout"/> |
| <pack_pattern name="simple_chain" in_port="ble5[1].cout" out_port="fle.cout"/> |
| <pack_pattern name="simple_lut_chain" in_port="ble5[1].cout[0]" out_port="fle.cout[0]"/> |
| </direct> |
| <!-- The pack_pattern ports must name the two ends of this connection (ble5[0].cout and ble5[1].cin) |
| so that the chain patterns continue across the link between the two ble5 instances. --> |
| <direct name="carry_link" input="ble5[0].cout" output="ble5[1].cin"> |
| <pack_pattern name="chain" in_port="ble5[0].cout" out_port="ble5[1].cin"/> |
| <pack_pattern name="lut_chain" in_port="ble5[0].cout" out_port="ble5[1].cin"/> |
| <pack_pattern name="simple_chain" in_port="ble5[0].cout" out_port="ble5[1].cin"/> |
| <pack_pattern name="simple_lut_chain" in_port="ble5[0].cout[0]" out_port="ble5[1].cin[0]"/> |
| </direct> |
| <complete name="clock" input="fle.clk" output="ble5[1:0].clk"/> |
| </interconnect> |
| </mode> <!-- n2_lut5 --> |
| <mode name="n1_lut6"> |
| <pb_type name="ble6" num_pb="1"> |
| <input name="in" num_pins="6"/> |
| <output name="out" num_pins="4"/> |
| <clock name="clk" num_pins="1"/> |
| <pb_type name="lut6" blif_model=".names" num_pb="1" class="lut"> |
| <input name="in" num_pins="6" port_class="lut_in"/> |
| <output name="out" num_pins="1" port_class="lut_out"/> |
| <!-- LUT timing using delay matrix --> |
| <!-- These are the physical delay inputs on an Extra CC LUT but because VPR cannot do LUT rebalancing, |
| we instead take the average of these numbers to get more stable results |
| 254.02e-12 |
| 252.51e-12 |
| 189.05e-12 |
| 181.8e-12 |
| 122.17e-12 |
| 84.19e-12 |
| --> |
| <delay_matrix type="max" in_port="lut6.in" out_port="lut6.out"> |
| 180.6e-12 |
| 180.6e-12 |
| 180.6e-12 |
| 180.6e-12 |
| 180.6e-12 |
| 180.6e-12 |
| </delay_matrix> |
| </pb_type> |
| <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop"> |
| <input name="D" num_pins="1" port_class="D"/> |
| <output name="Q" num_pins="1" port_class="Q"/> |
| <clock name="clk" num_pins="1" port_class="clock"/> |
| <T_setup value="18.91e-12" port="ff.D" clock="clk"/> |
| <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/> |
| </pb_type> |
| <interconnect> |
| <direct name="lut6_inputs" input="ble6.in" output="lut6.in"/> |
| <direct name="lut6_ff" input="lut6.out" output="ff[1].D"> |
| <delay_constant max="16.45e-12" in_port="lut6.out" out_port="ff[1].D"/> |
| <pack_pattern name="ble6" in_port="lut6.out" out_port="ff[1].D"/> |
| </direct> |
| <complete name="clock" input="ble6.clk" output="ff.clk"/> |
| <direct name="input_to_ff" input="ble6.in[0]" output="ff[0].D"/> |
| <complete name="mux1" input="ff[0].Q lut6.out" output="ble6.out[1:0]"> |
| <delay_constant max="39.78e-12" in_port="lut6.out" out_port="ble6.out[1:0]"/> |
| <delay_constant max="39.78e-12" in_port="ff[0].Q" out_port="ble6.out[1:0]"/> |
| </complete> |
| <complete name="mux2" input="ff[1].Q lut6.out" output="ble6.out[3:2]"> |
| <delay_constant max="39.78e-12" in_port="lut6.out" out_port="ble6.out[3:2]"/> |
| <delay_constant max="39.78e-12" in_port="ff[1].Q" out_port="ble6.out[3:2]"/> |
| </complete> |
| </interconnect> |
| </pb_type> |
| <interconnect> |
| <!-- ble6 takes inputs A, B, C, D, E, & F; where A - E are fle.in[4:0] and F is fle.in[7]. |
| fle.in[5] and fle.in[6] are not connected in this mode. --> |
| <direct name="lut6_inputs1" input="fle.in[4:0]" output="ble6.in[4:0]"/> |
| <direct name="lut6_inputs2" input="fle.in[7]" output="ble6.in[5]"/> |
| <direct name="direct2" input="ble6.out" output="fle.out"/> |
| <direct name="direct4" input="fle.clk" output="ble6.clk"/> |
| </interconnect> |
| </mode> <!-- n1_lut6 --> |
| </pb_type> |
| <interconnect> |
| <!-- We use a 50% depop crossbar built using small full xbars to get sets of logically equivalent pins at inputs of CLB |
| The delays below come from Stratix IV. the delay through a connection block |
| input mux + the crossbar in Stratix IV is 167 ps. We already have a 72 ps |
| delay on the connection block input mux (modeled by Ian Kuon), so the remaining |
| delay within the crossbar is 95 ps. |
| The delays of cluster feedbacks in Stratix IV is 100 ps, when driven by a LUT. |
| Since all our LUT outputs go to a BLE output, and have a delay of |
| 25 ps to do so, we subtract 25 ps from the 100 ps delay of a feedback |
| to get the part that should be marked on the crossbar. --> |
| |
| <!-- 50% sparsely populated local routing --> |
| <complete name="lutA" input="lab.I4 lab.I3" output="fle[9:0].in[0:0]"> |
| <delay_constant max="72.73e-12" in_port="lab.I4" out_port="fle.in[0:0]"/> |
| <delay_constant max="72.73e-12" in_port="lab.I3" out_port="fle.in[0:0]"/> |
| </complete> |
| <complete name="lutB" input="lab.I3 lab.I2" output="fle[9:0].in[1:1]"> |
| <delay_constant max="72.73e-12" in_port="lab.I3" out_port="fle.in[1:1]"/> |
| <delay_constant max="72.73e-12" in_port="lab.I2" out_port="fle.in[1:1]"/> |
| </complete> |
| <complete name="lutC" input="lab.I2 lab.I1" output="fle[9:0].in[2:2]"> |
| <delay_constant max="72.73e-12" in_port="lab.I2" out_port="fle.in[2:2]"/> |
| <delay_constant max="72.73e-12" in_port="lab.I1" out_port="fle.in[2:2]"/> |
| </complete> |
| <complete name="lutD" input="lab.I4 lab.I2" output="fle[9:0].in[3:3]"> |
| <delay_constant max="72.73e-12" in_port="lab.I4" out_port="fle.in[3:3]"/> |
| <delay_constant max="72.73e-12" in_port="lab.I2" out_port="fle.in[3:3]"/> |
| </complete> |
| <complete name="lutE" input="lab.I3 lab.I1" output="fle[9:0].in[4:4]"> |
| <delay_constant max="72.73e-12" in_port="lab.I3" out_port="fle.in[4:4]"/> |
| <delay_constant max="72.73e-12" in_port="lab.I1" out_port="fle.in[4:4]"/> |
| </complete> |
| <complete name="lutF" input="lab.I4 lab.I1" output="fle[9:0].in[5:5]"> |
| <delay_constant max="72.73e-12" in_port="lab.I4" out_port="fle.in[5:5]"/> |
| <delay_constant max="72.73e-12" in_port="lab.I1" out_port="fle.in[5:5]"/> |
| </complete> |
| <complete name="lutG" input="lab.I4 lab.I3" output="fle[9:0].in[6:6]"> |
| <delay_constant max="72.73e-12" in_port="lab.I4" out_port="fle.in[6:6]"/> |
| <delay_constant max="72.73e-12" in_port="lab.I3" out_port="fle.in[6:6]"/> |
| </complete> |
| <complete name="lutH" input="lab.I3 lab.I2" output="fle[9:0].in[7:7]"> |
| <delay_constant max="72.73e-12" in_port="lab.I3" out_port="fle.in[7:7]"/> |
| <delay_constant max="72.73e-12" in_port="lab.I2" out_port="fle.in[7:7]"/> |
| </complete> |
| |
| <complete name="clks" input="lab.clk" output="fle[9:0].clk"/> |
| |
| <!-- This way of specifying direct connections to clb outputs is important because this architecture uses automatic spreading of opins. |
| By grouping the output pins in this fashion, if a logic block is completely filled by 6-LUTs, |
| then the outputs those 6-LUTs take get evenly distributed across all four sides of the CLB instead of clumped on two sides (which is what happens with a more |
| naive specification). |
| --> |
| <direct name="labouts1" input="fle[9:0].out[0]" output="lab.O[9:0]"/> |
| <direct name="labouts2" input="fle[9:0].out[1]" output="lab.O[19:10]"/> |
| <direct name="labouts3" input="fle[9:0].out[2]" output="lab.O[29:20]"/> |
| <direct name="labouts4" input="fle[9:0].out[3]" output="lab.O[39:30]"/> |
| |
| <!-- Carry chain links --> |
| <direct name="carry_in" input="lab.cin" output="fle[0:0].cin"> |
| <!-- Put all inter-block carry chain delay on this one edge --> |
| <delay_constant max="18.69e-12" in_port="lab.cin[0]" out_port="fle[0:0].cin[0]"/> |
| <delay_constant max="18.85e-12" in_port="lab.cin[1]" out_port="fle[0:0].cin[1]"/> |
| <pack_pattern name="chain" in_port="lab.cin" out_port="fle[0:0].cin"/> |
| <pack_pattern name="lut_chain" in_port="lab.cin" out_port="fle[0:0].cin"/> |
| <pack_pattern name="simple_chain" in_port="lab.cin" out_port="fle[0:0].cin"/> |
| <pack_pattern name="simple_lut_chain" in_port="lab.cin[0]" out_port="fle[0:0].cin[0]"/> |
| </direct> |
| <direct name="carry_out" input="fle[9:9].cout" output="lab.cout"> |
| <pack_pattern name="chain" in_port="fle[9:9].cout" out_port="lab.cout"/> |
| <pack_pattern name="lut_chain" in_port="fle[9:9].cout" out_port="lab.cout"/> |
| <pack_pattern name="simple_chain" in_port="fle[9:9].cout" out_port="lab.cout"/> |
| <pack_pattern name="simple_lut_chain" in_port="fle[9:9].cout[0]" out_port="lab.cout[0]"/> |
| </direct> |
| <direct name="carry_link" input="fle[8:0].cout" output="fle[9:1].cin"> |
| <pack_pattern name="chain" in_port="fle[8:0].cout" out_port="fle[9:1].cin"/> |
| <pack_pattern name="lut_chain" in_port="fle[8:0].cout" out_port="fle[9:1].cin"/> |
| <pack_pattern name="simple_chain" in_port="fle[8:0].cout" out_port="fle[9:1].cin"/> |
| <pack_pattern name="simple_lut_chain" in_port="fle[8:0].cout[0]" out_port="fle[9:1].cin[0]"/> |
| </direct> |
| </interconnect> |
| </pb_type> |
| <interconnect> |
| <direct name="carry_in1" input="clb.cin[0]" output="lab.cin[0]"> |
| <pack_pattern name="chain" in_port="clb.cin[0]" out_port="lab.cin[0]"/> |
| <pack_pattern name="lut_chain" in_port="clb.cin[0]" out_port="lab.cin[0]"/> |
| <pack_pattern name="simple_chain" in_port="clb.cin[0]" out_port="lab.cin[0]"/> |
| <pack_pattern name="simple_lut_chain" in_port="clb.cin[0]" out_port="lab.cin[0]"/> |
| </direct> |
| <direct name="carry_out1" input="lab.cout[0]" output="clb.cout[0]"> |
| <pack_pattern name="chain" in_port="lab.cout[0]" out_port="clb.cout[0]"/> |
| <pack_pattern name="lut_chain" in_port="lab.cout[0]" out_port="clb.cout[0]"/> |
| <pack_pattern name="simple_chain" in_port="lab.cout[0]" out_port="clb.cout[0]"/> |
| <pack_pattern name="simple_lut_chain" in_port="lab.cout[0]" out_port="clb.cout[0]"/> |
| </direct> |
| <direct name="carry_in2" input="clb.cin[1]" output="lab.cin[1]"> |
| <pack_pattern name="chain" in_port="clb.cin[1]" out_port="lab.cin[1]"/> |
| <pack_pattern name="lut_chain" in_port="clb.cin[1]" out_port="lab.cin[1]"/> |
| <pack_pattern name="simple_chain" in_port="clb.cin[1]" out_port="lab.cin[1]"/> |
| </direct> |
| <direct name="carry_out2" input="lab.cout[1]" output="clb.cout[1]"> |
| <pack_pattern name="chain" in_port="lab.cout[1]" out_port="clb.cout[1]"/> |
| <pack_pattern name="lut_chain" in_port="lab.cout[1]" out_port="clb.cout[1]"/> |
| <pack_pattern name="simple_chain" in_port="lab.cout[1]" out_port="clb.cout[1]"/> |
| </direct> |
| <direct name="clock" input="clb.clk" output="lab.clk"/> |
| |
| <complete name="Input_feedback_I1" input="lab.O[4:0]" output="lab.I1"/> |
| <complete name="Input_feedback_I2" input="lab.O[24:20]" output="lab.I2"/> |
| <complete name="Input_feedback_I3" input="lab.O[9:5]" output="lab.I3"/> |
| <complete name="Input_feedback_I4" input="lab.O[29:25]" output="lab.I4"/> |
| |
| <direct name="Input_I1" input="clb.I1" output="lab.I1"/> |
| <direct name="Input_I2" input="clb.I2" output="lab.I2"/> |
| <direct name="Input_I3" input="clb.I3" output="lab.I3"/> |
| <direct name="Input_I4" input="clb.I4" output="lab.I4"/> |
| |
| <direct name="output" input="lab.O" output="clb.O"/> |
| </interconnect> |
| <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"> |
| <fc_override port_name="cin" fc_type="frac" fc_val="0"/> |
| <fc_override port_name="cout" fc_type="frac" fc_val="0"/> |
| </fc> |
| <pinlocations pattern="spread"/> |
| </pb_type> |
| <!-- Define general purpose logic block (CLB) ends --> |
| |
| <!-- Define fracturable multiplier begin --> |
| <pb_type name="mult_27" height="2"> |
| <input name="datain" num_pins="74"/> |
| <output name="dataout" num_pins="74"/> |
| |
| <mode name="two_mult_18x19"> <!-- Fractured mode: mult_27 is used as two separate 18x19 multipliers (num_pb="2" below) --> |
| <pb_type name="two_mult_18x19" num_pb="2"> |
| <input name="a" num_pins="18"/> |
| <input name="b" num_pins="19"/> |
| <output name="out" num_pins="37"/> |
| <pb_type name="mult_18x19" blif_model=".subckt multiply" num_pb="1"> <!-- Primitive that netlist "multiply" subcircuits map onto --> |
| <input name="a" num_pins="18"/> |
| <input name="b" num_pins="19"/> |
| <output name="out" num_pins="37"/> |
| <!-- Using the numbers from Arria 10 which is a 22nm technology, an 18x19 multiplier |
| can operate at 548 MHz which maps to a delay of 1.825e-9 --> |
| <delay_constant max="1.825e-9" in_port="mult_18x19.a" out_port="mult_18x19.out"/> |
| <delay_constant max="1.825e-9" in_port="mult_18x19.b" out_port="mult_18x19.out"/> |
| </pb_type> |
| <interconnect> <!-- Wrapper ports pass straight through to the primitive; no delay is annotated on these links --> |
| <direct name="a2a" input="two_mult_18x19.a" output="mult_18x19.a"> |
| </direct> |
| <direct name="b2b" input="two_mult_18x19.b" output="mult_18x19.b"> |
| </direct> |
| <direct name="out2out" input="mult_18x19.out" output="two_mult_18x19.out"> |
| </direct> |
| </interconnect> |
| <power method="pin-toggle"> <!-- Power is modeled per toggle of input ports a and b; static power is set to zero --> |
| <port name="a" energy_per_toggle="1.09e-12"/> |
| <port name="b" energy_per_toggle="1.09e-12"/> |
| <static_power power_per_instance="0.0"/> |
| </power> |
| </pb_type> |
| <interconnect> |
| <!-- Stratix IV input delay of 207ps is conservative for this architecture because this architecture does not have an input crossbar in the multiplier. |
| Subtract 72.5 ps delay, which is already in the connection block input mux, leading |
| to a 134 ps delay. |
| The 74 datain pins are sliced as follows: [17:0] to multiplier 0 operand a, [36:18] to multiplier 0 operand b, |
| [54:37] to multiplier 1 operand a, and [73:55] to multiplier 1 operand b. |
| --> |
| <direct name="datain2a1" input="mult_27.datain[17:0]" output="two_mult_18x19[0].a"> |
| <delay_constant max="134e-12" in_port="mult_27.datain[17:0]" out_port="two_mult_18x19[0].a"/> |
| </direct> |
| <direct name="datain2b1" input="mult_27.datain[36:18]" output="two_mult_18x19[0].b"> |
| <delay_constant max="134e-12" in_port="mult_27.datain[36:18]" out_port="two_mult_18x19[0].b"/> |
| </direct> |
| <direct name="datain2a2" input="mult_27.datain[54:37]" output="two_mult_18x19[1].a"> |
| <delay_constant max="134e-12" in_port="mult_27.datain[54:37]" out_port="two_mult_18x19[1].a"/> |
| </direct> |
| <direct name="datain2b2" input="mult_27.datain[73:55]" output="two_mult_18x19[1].b"> |
| <delay_constant max="134e-12" in_port="mult_27.datain[73:55]" out_port="two_mult_18x19[1].b"/> |
| </direct> |
| <direct name="out2dataout" input="two_mult_18x19[1:0].out" output="mult_27.dataout"> <!-- Both 37-pin outputs together drive the full dataout bus (2 x 37 = 74 pins) --> |
| <delay_constant max="1.09e-9" in_port="two_mult_18x19[1:0].out" out_port="mult_27.dataout"/> <!-- NOTE(review): the mult_27x27 mode uses 1.93e-9 on its output link; confirm the difference is intended --> |
| </direct> |
| </interconnect> |
| </mode> <!-- two_mult_18x19 --> |
| |
| <mode name="mult_27x27"> <!-- Unfractured mode: mult_27 is used as a single 27x27 multiplier --> |
| <pb_type name="one_mult_27x27" num_pb="1"> |
| <input name="a" num_pins="27"/> |
| <input name="b" num_pins="27"/> |
| <output name="out" num_pins="54"/> |
| |
| <pb_type name="mult_27x27" blif_model=".subckt multiply" num_pb="1"> <!-- Primitive that netlist "multiply" subcircuits map onto --> |
| <input name="a" num_pins="27"/> |
| <input name="b" num_pins="27"/> |
| <output name="out" num_pins="54"/> |
| <!-- Using the numbers from Arria 10 which is a 22nm technology, a 27x27 multiplier |
| can operate at 541 MHz which maps to a delay of 1.848e-9 --> |
| <delay_constant max="1.848e-9" in_port="mult_27x27.a" out_port="mult_27x27.out"/> |
| <delay_constant max="1.848e-9" in_port="mult_27x27.b" out_port="mult_27x27.out"/> |
| </pb_type> |
| <interconnect> <!-- Wrapper ports pass straight through to the primitive; no delay is annotated on these links --> |
| <direct name="a2a" input="one_mult_27x27.a" output="mult_27x27.a"> |
| </direct> |
| <direct name="b2b" input="one_mult_27x27.b" output="mult_27x27.b"> |
| </direct> |
| <direct name="out2out" input="mult_27x27.out" output="one_mult_27x27.out"> |
| </direct> |
| </interconnect> |
| <power method="pin-toggle"> <!-- Power is modeled per toggle of input ports a and b; static power is set to zero --> |
| <port name="a" energy_per_toggle="2.13e-12"/> |
| <port name="b" energy_per_toggle="2.13e-12"/> |
| <static_power power_per_instance="0.0"/> |
| </power> |
| </pb_type> |
| <interconnect> |
| <!-- Stratix IV input delay of 207ps is conservative for this architecture because this architecture does not have an input crossbar in the multiplier. |
| Subtract 72.5 ps delay, which is already in the connection block input mux, leading |
| to a 134 ps delay. |
| Only datain[53:0] and dataout[53:0] are connected in this mode; datain[73:54] and dataout[73:54] are not used. |
| --> |
| <direct name="datain2a" input="mult_27.datain[26:0]" output="one_mult_27x27.a"> |
| <delay_constant max="134e-12" in_port="mult_27.datain[26:0]" out_port="one_mult_27x27.a"/> |
| </direct> |
| <direct name="datain2b" input="mult_27.datain[53:27]" output="one_mult_27x27.b"> |
| <delay_constant max="134e-12" in_port="mult_27.datain[53:27]" out_port="one_mult_27x27.b"/> |
| </direct> |
| <direct name="out2dataout" input="one_mult_27x27.out" output="mult_27.dataout[53:0]"> |
| <delay_constant max="1.93e-9" in_port="one_mult_27x27.out" out_port="mult_27.dataout[53:0]"/> |
| </direct> |
| </interconnect> |
| |
| </mode> <!-- mult_27x27 --> |
| |
| <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/> |
| <pinlocations pattern="spread"/> |
| |
| <!-- Place this multiplier block every 8 columns from (and including) the sixth column --> |
| <power method="sum-of-children"/> |
| </pb_type> |
| <!-- Define fracturable multiplier end --> |
| |
| <!-- Define fracturable memory begin --> |
| <pb_type name="memory" height="4"> |
| <input name="addr1" num_pins="11"/> |
| <input name="addr2" num_pins="11"/> |
| <input name="data" num_pins="40"/> |
| <input name="we1" num_pins="1"/> |
| <input name="we2" num_pins="1"/> |
| <output name="out" num_pins="40"/> |
| <clock name="clk" num_pins="1"/> |
| |
| <!-- Specify single port mode first --> |
| <mode name="mem_512x40_sp"> <!-- Single-port RAM: 9 address pins (512 words), 40 data bits --> |
| <pb_type name="mem_512x40_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1"> |
| <input name="addr" num_pins="9" port_class="address"/> |
| <input name="data" num_pins="40" port_class="data_in"/> |
| <input name="we" num_pins="1" port_class="write_en"/> |
| <output name="out" num_pins="40" port_class="data_out"/> |
| <clock name="clk" num_pins="1" port_class="clock"/> |
| <!-- Every input has a setup constraint against clk, and the output has a clock-to-Q delay from clk --> |
| <T_setup value="509e-12" port="mem_512x40_sp.addr" clock="clk"/> |
| <T_setup value="509e-12" port="mem_512x40_sp.data" clock="clk"/> |
| <T_setup value="509e-12" port="mem_512x40_sp.we" clock="clk"/> |
| <T_clock_to_Q max="1.234e-9" port="mem_512x40_sp.out" clock="clk"/> |
| <power method="pin-toggle"> <!-- Power is modeled per clk toggle; static power is set to zero --> |
| <port name="clk" energy_per_toggle="9.0e-12"/> |
| <static_power power_per_instance="0.0"/> |
| </power> |
| </pb_type> |
| <interconnect> <!-- Uses memory.addr1[8:0], the full data and out buses, and we1; memory.addr2 and memory.we2 are not connected in this mode --> |
| <direct name="address1" input="memory.addr1[8:0]" output="mem_512x40_sp.addr"> |
| <delay_constant max="132e-12" in_port="memory.addr1[8:0]" out_port="mem_512x40_sp.addr"/> |
| </direct> |
| <direct name="data1" input="memory.data" output="mem_512x40_sp.data"> |
| <delay_constant max="132e-12" in_port="memory.data" out_port="mem_512x40_sp.data"/> |
| </direct> |
| <direct name="writeen1" input="memory.we1" output="mem_512x40_sp.we"> |
| <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_512x40_sp.we"/> |
| </direct> |
| <direct name="dataout1" input="mem_512x40_sp.out" output="memory.out"> |
| <delay_constant max="40e-12" in_port="mem_512x40_sp.out" out_port="memory.out"/> |
| </direct> |
| <direct name="clk" input="memory.clk" output="mem_512x40_sp.clk"> |
| </direct> |
| </interconnect> |
| </mode> <!-- mem_512x40_sp --> |
| |
| <mode name="mem_1024x20_sp"> <!-- Single-port RAM: 10 address pins (1024 words), 20 data bits --> |
| <pb_type name="mem_1024x20_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1"> |
| <input name="addr" num_pins="10" port_class="address"/> |
| <input name="data" num_pins="20" port_class="data_in"/> |
| <input name="we" num_pins="1" port_class="write_en"/> |
| <output name="out" num_pins="20" port_class="data_out"/> |
| <clock name="clk" num_pins="1" port_class="clock"/> |
| <!-- Every input has a setup constraint against clk, and the output has a clock-to-Q delay from clk --> |
| <T_setup value="509e-12" port="mem_1024x20_sp.addr" clock="clk"/> |
| <T_setup value="509e-12" port="mem_1024x20_sp.data" clock="clk"/> |
| <T_setup value="509e-12" port="mem_1024x20_sp.we" clock="clk"/> |
| <T_clock_to_Q max="1.234e-9" port="mem_1024x20_sp.out" clock="clk"/> |
| <power method="pin-toggle"> <!-- Power is modeled per clk toggle; static power is set to zero --> |
| <port name="clk" energy_per_toggle="9.0e-12"/> |
| <static_power power_per_instance="0.0"/> |
| </power> |
| </pb_type> |
| <interconnect> <!-- Uses memory.addr1[9:0], data[19:0], out[19:0] and we1; the remaining memory pins are not connected in this mode --> |
| <direct name="address1" input="memory.addr1[9:0]" output="mem_1024x20_sp.addr"> |
| <delay_constant max="132e-12" in_port="memory.addr1[9:0]" out_port="mem_1024x20_sp.addr"/> |
| </direct> |
| <direct name="data1" input="memory.data[19:0]" output="mem_1024x20_sp.data"> |
| <delay_constant max="132e-12" in_port="memory.data[19:0]" out_port="mem_1024x20_sp.data"/> |
| </direct> |
| <direct name="writeen1" input="memory.we1" output="mem_1024x20_sp.we"> |
| <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_1024x20_sp.we"/> |
| </direct> |
| <direct name="dataout1" input="mem_1024x20_sp.out" output="memory.out[19:0]"> |
| <delay_constant max="40e-12" in_port="mem_1024x20_sp.out" out_port="memory.out[19:0]"/> |
| </direct> |
| <direct name="clk" input="memory.clk" output="mem_1024x20_sp.clk"> |
| </direct> |
| </interconnect> |
| </mode> <!-- mem_1024x20_sp --> |
| |
| <mode name="mem_2048x10_sp"> <!-- Single-port RAM: 11 address pins (2048 words), 10 data bits --> |
| <pb_type name="mem_2048x10_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1"> |
| <input name="addr" num_pins="11" port_class="address"/> |
| <input name="data" num_pins="10" port_class="data_in"/> |
| <input name="we" num_pins="1" port_class="write_en"/> |
| <output name="out" num_pins="10" port_class="data_out"/> |
| <clock name="clk" num_pins="1" port_class="clock"/> |
| <!-- Every input has a setup constraint against clk, and the output has a clock-to-Q delay from clk --> |
| <T_setup value="509e-12" port="mem_2048x10_sp.addr" clock="clk"/> |
| <T_setup value="509e-12" port="mem_2048x10_sp.data" clock="clk"/> |
| <T_setup value="509e-12" port="mem_2048x10_sp.we" clock="clk"/> |
| <T_clock_to_Q max="1.234e-9" port="mem_2048x10_sp.out" clock="clk"/> |
| <power method="pin-toggle"> <!-- Power is modeled per clk toggle; static power is set to zero --> |
| <port name="clk" energy_per_toggle="9.0e-12"/> |
| <static_power power_per_instance="0.0"/> |
| </power> |
| </pb_type> |
| <interconnect> <!-- Uses memory.addr1[10:0], data[9:0], out[9:0] and we1; the remaining memory pins are not connected in this mode --> |
| <direct name="address1" input="memory.addr1[10:0]" output="mem_2048x10_sp.addr"> |
| <delay_constant max="132e-12" in_port="memory.addr1[10:0]" out_port="mem_2048x10_sp.addr"/> |
| </direct> |
| <direct name="data1" input="memory.data[9:0]" output="mem_2048x10_sp.data"> |
| <delay_constant max="132e-12" in_port="memory.data[9:0]" out_port="mem_2048x10_sp.data"/> |
| </direct> |
| <direct name="writeen1" input="memory.we1" output="mem_2048x10_sp.we"> |
| <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_2048x10_sp.we"/> |
| </direct> |
| <direct name="dataout1" input="mem_2048x10_sp.out" output="memory.out[9:0]"> |
| <delay_constant max="40e-12" in_port="mem_2048x10_sp.out" out_port="memory.out[9:0]"/> |
| </direct> |
| <direct name="clk" input="memory.clk" output="mem_2048x10_sp.clk"> |
| </direct> |
| </interconnect> |
| </mode> <!-- mem_2048x10_sp --> |
| |
| <!-- Specify true dual port mode next --> |
| <mode name="mem_1024x20_dp"> <!-- Dual-port RAM: each port has 10 address pins (1024 words) and 20 data bits --> |
| <pb_type name="mem_1024x20_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1"> |
| <input name="addr1" num_pins="10" port_class="address1"/> |
| <input name="addr2" num_pins="10" port_class="address2"/> |
| <input name="data1" num_pins="20" port_class="data_in1"/> |
| <input name="data2" num_pins="20" port_class="data_in2"/> |
| <input name="we1" num_pins="1" port_class="write_en1"/> |
| <input name="we2" num_pins="1" port_class="write_en2"/> |
| <output name="out1" num_pins="20" port_class="data_out1"/> |
| <output name="out2" num_pins="20" port_class="data_out2"/> |
| <clock name="clk" num_pins="1" port_class="clock"/> |
| <!-- Both ports share the single clk: every input has a setup constraint against it and both outputs have a clock-to-Q delay from it --> |
| <T_setup value="509e-12" port="mem_1024x20_dp.addr1" clock="clk"/> |
| <T_setup value="509e-12" port="mem_1024x20_dp.data1" clock="clk"/> |
| <T_setup value="509e-12" port="mem_1024x20_dp.we1" clock="clk"/> |
| <T_setup value="509e-12" port="mem_1024x20_dp.addr2" clock="clk"/> |
| <T_setup value="509e-12" port="mem_1024x20_dp.data2" clock="clk"/> |
| <T_setup value="509e-12" port="mem_1024x20_dp.we2" clock="clk"/> |
| <T_clock_to_Q max="1.234e-9" port="mem_1024x20_dp.out1" clock="clk"/> |
| <T_clock_to_Q max="1.234e-9" port="mem_1024x20_dp.out2" clock="clk"/> |
| <power method="pin-toggle"> <!-- Power is modeled per clk toggle; static power is set to zero --> |
| <port name="clk" energy_per_toggle="17.9e-12"/> |
| <static_power power_per_instance="0.0"/> |
| </power> |
| </pb_type> |
| <interconnect> <!-- The 40-pin memory.data and memory.out buses are split in half: [19:0] serve port 1 and [39:20] serve port 2 --> |
| <direct name="address1" input="memory.addr1[9:0]" output="mem_1024x20_dp.addr1"> |
| <delay_constant max="132e-12" in_port="memory.addr1[9:0]" out_port="mem_1024x20_dp.addr1"/> |
| </direct> |
| <direct name="address2" input="memory.addr2[9:0]" output="mem_1024x20_dp.addr2"> |
| <delay_constant max="132e-12" in_port="memory.addr2[9:0]" out_port="mem_1024x20_dp.addr2"/> |
| </direct> |
| <direct name="data1" input="memory.data[19:0]" output="mem_1024x20_dp.data1"> |
| <delay_constant max="132e-12" in_port="memory.data[19:0]" out_port="mem_1024x20_dp.data1"/> |
| </direct> |
| <direct name="data2" input="memory.data[39:20]" output="mem_1024x20_dp.data2"> |
| <delay_constant max="132e-12" in_port="memory.data[39:20]" out_port="mem_1024x20_dp.data2"/> |
| </direct> |
| <direct name="writeen1" input="memory.we1" output="mem_1024x20_dp.we1"> |
| <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_1024x20_dp.we1"/> |
| </direct> |
| <direct name="writeen2" input="memory.we2" output="mem_1024x20_dp.we2"> |
| <delay_constant max="132e-12" in_port="memory.we2" out_port="mem_1024x20_dp.we2"/> |
| </direct> |
| <direct name="dataout1" input="mem_1024x20_dp.out1" output="memory.out[19:0]"> |
| <delay_constant max="40e-12" in_port="mem_1024x20_dp.out1" out_port="memory.out[19:0]"/> |
| </direct> |
| <direct name="dataout2" input="mem_1024x20_dp.out2" output="memory.out[39:20]"> |
| <delay_constant max="40e-12" in_port="mem_1024x20_dp.out2" out_port="memory.out[39:20]"/> |
| </direct> |
| <direct name="clk" input="memory.clk" output="mem_1024x20_dp.clk"> |
| </direct> |
| </interconnect> |
| </mode> <!-- mem_1024x20_dp --> |
| |
| <mode name="mem_2048x10_dp"> <!-- Dual-port RAM: each port has 11 address pins (2048 words) and 10 data bits --> |
| <pb_type name="mem_2048x10_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1"> |
| <input name="addr1" num_pins="11" port_class="address1"/> |
| <input name="addr2" num_pins="11" port_class="address2"/> |
| <input name="data1" num_pins="10" port_class="data_in1"/> |
| <input name="data2" num_pins="10" port_class="data_in2"/> |
| <input name="we1" num_pins="1" port_class="write_en1"/> |
| <input name="we2" num_pins="1" port_class="write_en2"/> |
| <output name="out1" num_pins="10" port_class="data_out1"/> |
| <output name="out2" num_pins="10" port_class="data_out2"/> |
| <clock name="clk" num_pins="1" port_class="clock"/> |
| <!-- Both ports share the single clk: every input has a setup constraint against it and both outputs have a clock-to-Q delay from it --> |
| <T_setup value="509e-12" port="mem_2048x10_dp.addr1" clock="clk"/> |
| <T_setup value="509e-12" port="mem_2048x10_dp.data1" clock="clk"/> |
| <T_setup value="509e-12" port="mem_2048x10_dp.we1" clock="clk"/> |
| <T_setup value="509e-12" port="mem_2048x10_dp.addr2" clock="clk"/> |
| <T_setup value="509e-12" port="mem_2048x10_dp.data2" clock="clk"/> |
| <T_setup value="509e-12" port="mem_2048x10_dp.we2" clock="clk"/> |
| <T_clock_to_Q max="1.234e-9" port="mem_2048x10_dp.out1" clock="clk"/> |
| <T_clock_to_Q max="1.234e-9" port="mem_2048x10_dp.out2" clock="clk"/> |
| <power method="pin-toggle"> <!-- Power is modeled per clk toggle; static power is set to zero --> |
| <port name="clk" energy_per_toggle="17.9e-12"/> |
| <static_power power_per_instance="0.0"/> |
| </power> |
| </pb_type> |
| <interconnect> <!-- Port 1 uses memory.data[9:0] and out[9:0]; port 2 uses data[19:10] and out[19:10]; data[39:20] and out[39:20] are not connected in this mode --> |
| <direct name="address1" input="memory.addr1[10:0]" output="mem_2048x10_dp.addr1"> |
| <delay_constant max="132e-12" in_port="memory.addr1[10:0]" out_port="mem_2048x10_dp.addr1"/> |
| </direct> |
| <direct name="address2" input="memory.addr2[10:0]" output="mem_2048x10_dp.addr2"> |
| <delay_constant max="132e-12" in_port="memory.addr2[10:0]" out_port="mem_2048x10_dp.addr2"/> |
| </direct> |
| <direct name="data1" input="memory.data[9:0]" output="mem_2048x10_dp.data1"> |
| <delay_constant max="132e-12" in_port="memory.data[9:0]" out_port="mem_2048x10_dp.data1"/> |
| </direct> |
| <direct name="data2" input="memory.data[19:10]" output="mem_2048x10_dp.data2"> |
| <delay_constant max="132e-12" in_port="memory.data[19:10]" out_port="mem_2048x10_dp.data2"/> |
| </direct> |
| <direct name="writeen1" input="memory.we1" output="mem_2048x10_dp.we1"> |
| <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_2048x10_dp.we1"/> |
| </direct> |
| <direct name="writeen2" input="memory.we2" output="mem_2048x10_dp.we2"> |
| <delay_constant max="132e-12" in_port="memory.we2" out_port="mem_2048x10_dp.we2"/> |
| </direct> |
| <direct name="dataout1" input="mem_2048x10_dp.out1" output="memory.out[9:0]"> |
| <delay_constant max="40e-12" in_port="mem_2048x10_dp.out1" out_port="memory.out[9:0]"/> |
| </direct> |
| <direct name="dataout2" input="mem_2048x10_dp.out2" output="memory.out[19:10]"> |
| <delay_constant max="40e-12" in_port="mem_2048x10_dp.out2" out_port="memory.out[19:10]"/> |
| </direct> |
| <direct name="clk" input="memory.clk" output="mem_2048x10_dp.clk"> |
| </direct> |
| </interconnect> |
| </mode> <!-- mem_2048x10_dp --> |
| |
| <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin drives 10% of the tracks in a channel --> |
| <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/> |
| <pinlocations pattern="spread"/> |
| |
| <!-- Place this memory block every 8 columns from (and including) the second column --> |
| <power method="sum-of-children"/> |
| </pb_type> |
| <!-- Define fracturable memory end --> |
| |
| |
| </complexblocklist> |
| |
| <power> <!-- Architecture-wide power-model parameters. NOTE(review): units are not stated in this file; check the VTR power estimation documentation --> |
| <local_interconnect C_wire="2.5e-10"/> <!-- presumably the wire capacitance assumed for interconnect inside blocks; same value as the clock C_wire --> |
| <mux_transistor_size mux_transistor_size="3"/> |
| <FF_size FF_size="4"/> |
| <LUT_transistor_size LUT_transistor_size="4"/> |
| </power> |
| |
| <clocks> <!-- Clock network parameters; presumably consumed by the power model (TODO confirm) --> |
| <clock buffer_size="auto" C_wire="2.5e-10"/> <!-- buffer size is left to the tool ("auto"); C_wire matches the local_interconnect value --> |
| </clocks> |
| </architecture> |