arch: Added an architecture file for the Extra Carry Chain architecture
file
diff --git a/vtr_flow/arch/timing/extra_carry_chain_arch.xml b/vtr_flow/arch/timing/extra_carry_chain_arch.xml
new file mode 100644
index 0000000..1e02194
--- /dev/null
+++ b/vtr_flow/arch/timing/extra_carry_chain_arch.xml
@@ -0,0 +1,1097 @@
+<!--
+    This is the architecture file for the Extra Carry Chain Architecture proposed in [1].
+    Delays for routing and logic blocks come from COFFE runs for a 20 nm technology node.
+    Delays for DSP blocks and BRAMs come from Arria 10 (22 nm) delays.
+
+    This architecture is a Stratix-10-like architecture with a modified arithmetic mode.
+
+    This architecture has 10 ALMs per cluster, where each ALM is a 6-LUT fracturable into
+    two 5-LUTs. The ALM has 8 inputs and 4 optionally registered outputs. The two 5-LUTs should
+    share at least two inputs. Each two ALM outputs are logically equivalent, which means any
+    output signal that can reach ALM.out[0] can reach ALM.out[1] and the same thing for
+    ALM.out[2] and ALM.out[3]. The ALMs in this architecture have an arithmetic mode
+    where each 5-LUT is fractured into two 4-LUTs. This results in a total of four 4-LUTs per ALM.
+    This architecture has two carry chains per ALM (four adders), where the output of the first carry chain feeds
+    one of the inputs of the second carry chain. This structure allows this architecture to implement
+    adder trees and 3:1 addition reduction operations more efficiently.
+
+    The LAB has 60 inputs and 40 outputs. Two outputs of each ALM are fed to the right and
+    left LAB using direct links and are also fed back to the LAB as feedback connections sharing
+    the 60 input ports with the signals coming from the routing channels.
+
+    The architecture also has a 20Kb memory that has true and simple dual port modes. In simple
+    dual port mode the memory can be configured in the following modes: 512x40, 1024x20 and 2048x10,
+    while in true dual port mode it can be configured as: 1024x20 and 2048x10.
+
+    In addition, the architecture has a 27x27 DSP block that can be fractured into two 18x19 DSPs.
+
+
+    [1] M. Eldafrawy, A. Boutros, S. Yazdanshenas, and V. Betz, "FPGA Logic Block Architectures for efficient
+        multiplication and addition to enhance machine learning performance," in Transactions on Reconfigurable
+        Technology and Systems (TRETS), 2019
+
+-->
+<architecture>
+    <!--
+         ODIN II specific config begins
+         Describes the types of user-specified netlist blocks (in blif, this corresponds to
+         ".model [type_of_block]") that this architecture supports.
+
+         Note: Basic LUTs, I/Os, and flip-flops are not included here as there are
+         already special structures in blif (.names, .input, .output, and .latch)
+         that describe them.
+    -->
+    <models>
+      <model name="multiply"> <!-- purely combinational: both operands propagate to "out" -->
+        <input_ports>
+          <port name="a" combinational_sink_ports="out"/>
+          <port name="b" combinational_sink_ports="out"/>
+        </input_ports>
+        <output_ports>
+          <port name="out"/>
+        </output_ports>
+      </model>
+
+      <model name="single_port_ram"> <!-- every port is sequential with respect to "clk" -->
+        <input_ports>
+          <port name="we" clock="clk"/>    <!-- control -->
+          <port name="addr" clock="clk"/>  <!-- address lines -->
+          <port name="data" clock="clk"/>  <!-- data lines; may be split into narrower widths, minimum size 1 -->
+          <port name="clk" is_clock="1"/>  <!-- memory clock -->
+        </input_ports>
+        <output_ports>
+          <port name="out" clock="clk"/>   <!-- read data; may be split into narrower widths, minimum size 1 -->
+        </output_ports>
+      </model>
+
+      <model name="dual_port_ram"> <!-- two ports sharing a single clock -->
+        <input_ports>
+          <port name="we1" clock="clk"/>    <!-- write enable, port 1 -->
+          <port name="we2" clock="clk"/>    <!-- write enable, port 2 -->
+          <port name="addr1" clock="clk"/>  <!-- address lines, port 1 -->
+          <port name="addr2" clock="clk"/>  <!-- address lines, port 2 -->
+          <port name="data1" clock="clk"/>  <!-- data lines, port 1; may be split into narrower widths, minimum size 1 -->
+          <port name="data2" clock="clk"/>  <!-- data lines, port 2; may be split into narrower widths, minimum size 1 -->
+          <port name="clk" is_clock="1"/>   <!-- memory clock -->
+        </input_ports>
+        <output_ports>
+          <port name="out1" clock="clk"/>   <!-- read data, port 1; may be split into narrower widths, minimum size 1 -->
+          <port name="out2" clock="clk"/>   <!-- read data, port 2; may be split into narrower widths, minimum size 1 -->
+        </output_ports>
+      </model>
+
+      <model name="adder"> <!-- combinational: every input reaches both "sumout" and "cout" -->
+        <input_ports>
+          <port name="a" combinational_sink_ports="sumout cout"/>
+          <port name="b" combinational_sink_ports="sumout cout"/>
+          <port name="cin" combinational_sink_ports="sumout cout"/>  <!-- carry in -->
+        </input_ports>
+        <output_ports>
+          <port name="cout"/>    <!-- carry out -->
+          <port name="sumout"/>  <!-- sum bit -->
+        </output_ports>
+      </model>
+    </models> <!-- End of the ODIN II specific config -->
+
+    <layout> <!-- Physical description of the device grid begins -->
+      <auto_layout aspect_ratio="1.0">
+          <!-- Ring of 'io' blocks on the perimeter; the corners are left 'EMPTY' (higher priority) -->
+          <perimeter type="io" priority="100"/>
+          <corners type="EMPTY" priority="101"/>
+          <!-- 'clb' is the lowest-priority rule, so it fills whatever the rules below do not claim -->
+          <fill type="clb" priority="10"/>
+          <!-- 'mult_27' columns from x=6, repeating every 8 columns; 'EMPTY' where a 'mult_27' does not fit. starty=1 skips the perimeter row. -->
+          <col type="mult_27" startx="6" repeatx="8" starty="1" priority="20"/>
+          <col type="EMPTY" startx="6" repeatx="8" starty="1" priority="19"/>
+          <!-- 'memory' columns from x=2, repeating every 8 columns; 'EMPTY' where a 'memory' does not fit. starty=1 skips the perimeter row. -->
+          <col type="memory" startx="2" repeatx="8" starty="1" priority="20"/>
+          <col type="EMPTY" startx="2" repeatx="8" starty="1" priority="19"/>
+      </auto_layout>
+    </layout>
+
+    <device> <!-- Device-wide sizing, area and routing architecture parameters -->
+      <sizing R_minW_nmos="13090" R_minW_pmos="19086.83"/>
+      <area grid_logic_tile_area="25241.08"/>
+      <chan_width_distr> <!-- same uniform distribution for x and y channels -->
+        <x distr="uniform" peak="1.000000"/>
+        <y distr="uniform" peak="1.000000"/>
+      </chan_width_distr>
+      <switch_block type="wilton" fs="3"/>
+      <connection_block input_switch_name="ipin_cblock"/> <!-- names a switch declared in the switchlist -->
+    </device>
+
+    <switchlist> <!-- both switches list their attributes in the same order -->
+      <switch name="0" type="mux" R="0.0" Cin="0.0" Cout="0.0" Tdel="235.2e-12" mux_trans_size="2.173" buf_size="36.6"/> <!-- mux referenced by the routing segment -->
+      <switch name="ipin_cblock" type="mux" R="0.0" Cin="0.0" Cout="0.0" Tdel="146e-12" mux_trans_size="1.508" buf_size="11.525"/> <!-- connection block input switch -->
+    </switchlist>
+
+    <segmentlist> <!-- a single wire type makes up all of the routing -->
+      <segment type="unidir" length="4" freq="1.000000" Rmetal="0.0" Cmetal="0.0">
+        <mux name="0"/> <!-- switch "0" from the switchlist -->
+        <sb type="pattern">1 1 1 1 1</sb> <!-- length + 1 entries, all populated -->
+        <cb type="pattern">1 1 1 1</cb> <!-- length entries, all populated -->
+      </segment>
+    </segmentlist>
+
+    <directlist>
+      <direct name="adder_carry1" x_offset="0" y_offset="-1" z_offset="0" from_pin="clb.cout[0:0]" to_pin="clb.cin[0:0]"/> <!-- carry chain 0 -->
+      <direct name="adder_carry2" x_offset="0" y_offset="-1" z_offset="0" from_pin="clb.cout[1:1]" to_pin="clb.cin[1:1]"/> <!-- carry chain 1 -->
+
+      <!-- Direct links from LAB outputs to the neighbouring LAB: x_offset=1 (right_*) and x_offset=-1 (left_*) -->
+      <direct name="direct_right_1" x_offset="1" y_offset="0" z_offset="0" from_pin="clb.O[4:0]" to_pin="clb.I1[9:5]"/>
+      <direct name="direct_right_2" x_offset="1" y_offset="0" z_offset="0" from_pin="clb.O[24:20]" to_pin="clb.I2[9:5]"/>
+      <direct name="direct_right_3" x_offset="1" y_offset="0" z_offset="0" from_pin="clb.O[9:5]" to_pin="clb.I3[9:5]"/>
+      <direct name="direct_right_4" x_offset="1" y_offset="0" z_offset="0" from_pin="clb.O[29:25]" to_pin="clb.I4[9:5]"/>
+
+      <direct name="direct_left_1" x_offset="-1" y_offset="0" z_offset="0" from_pin="clb.O[14:10]" to_pin="clb.I1[14:10]"/>
+      <direct name="direct_left_2" x_offset="-1" y_offset="0" z_offset="0" from_pin="clb.O[34:30]" to_pin="clb.I2[14:10]"/>
+      <direct name="direct_left_3" x_offset="-1" y_offset="0" z_offset="0" from_pin="clb.O[19:15]" to_pin="clb.I3[14:10]"/>
+      <direct name="direct_left_4" x_offset="-1" y_offset="0" z_offset="0" from_pin="clb.O[39:35]" to_pin="clb.I4[14:10]"/>
+    </directlist>
+
+    <complexblocklist>
+
+      <!-- Define I/O pads begin -->
+      <!-- Capacity is a unique property of I/Os, it is the maximum number of I/Os that can be placed at the same (X,Y) location on the FPGA -->
+	    <!-- Not sure of the area of an I/O (varies widely), and it's not relevant to the design of the FPGA core, so we're setting it to 0. -->
+      <pb_type name="io" capacity="8" area="0">
+        <input name="outpad" num_pins="1"/>
+        <output name="inpad" num_pins="1"/>
+        <clock name="clock" num_pins="1"/>
+
+	<!-- IOs can operate as either inputs or outputs (one mode per direction below).
+	     Delays below come from Ian Kuon. They are small, so they should be interpreted as
+	     the delays to and from registers in the I/O (generally I/Os are registered
+	     today, and that is when you timing analyze them).
+	     -->
+        <mode name="inpad">
+          <pb_type name="inpad" blif_model=".input" num_pb="1">
+            <output name="inpad" num_pins="1"/>
+          </pb_type>
+          <interconnect>
+            <direct name="inpad" input="inpad.inpad" output="io.inpad">
+            <delay_constant max="4.243e-11" in_port="inpad.inpad" out_port="io.inpad"/>
+            </direct>
+          </interconnect>
+        </mode>
+        <mode name="outpad">
+          <pb_type name="outpad" blif_model=".output" num_pb="1">
+            <input name="outpad" num_pins="1"/>
+          </pb_type>
+          <interconnect>
+            <direct name="outpad" input="io.outpad" output="outpad.outpad">
+            <delay_constant max="1.394e-11" in_port="io.outpad" out_port="outpad.outpad"/>
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin drives 10% of the tracks in a channel -->
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+
+        <!-- IOs go on the periphery of the FPGA. For consistency, the same three pins are exposed on
+          all four sides, making the block physically equivalent on every side so that only one
+          definition of I/Os is needed. Otherwise 4 different I/Os would have to be defined, one for each side of the FPGA.
+        -->
+        <pinlocations pattern="custom">
+          <loc side="left">io.outpad io.inpad io.clock</loc>
+          <loc side="top">io.outpad io.inpad io.clock</loc>
+          <loc side="right">io.outpad io.inpad io.clock</loc>
+          <loc side="bottom">io.outpad io.inpad io.clock</loc>
+        </pinlocations>
+
+        <!-- Power estimation method for the I/O block is set to "ignore" -->
+        <power method="ignore"/>
+      </pb_type>
+      <!-- Define I/O pads ends -->
+
+      <!-- Define general purpose logic block (CLB) begin -->
+      <pb_type name="clb">
+        <input name="I1" num_pins="15" equivalent="full"/>
+		<input name="I2" num_pins="15" equivalent="full"/>
+		<input name="I3" num_pins="15" equivalent="full"/>
+		<input name="I4" num_pins="15" equivalent="full"/>
+        <input name="cin" num_pins="2"/>
+        <output name="O" num_pins="40" equivalent="none"/>
+        <output name="cout" num_pins="2"/>
+        <clock name="clk" num_pins="1"/>
+        <pb_type name="lab" num_pb="1">
+            <input name="I1" num_pins="15"/>
+            <input name="I2" num_pins="15"/>
+            <input name="I3" num_pins="15"/>
+            <input name="I4" num_pins="15"/>
+            <input name="cin" num_pins="2"/>
+            <output name="O" num_pins="40"/>
+            <output name="cout" num_pins="2"/>
+            <clock name="clk" num_pins="1"/>
+            <!-- Describe fracturable logic element.
+                 Each fracturable logic element has a 6-LUT that can alternatively operate as two 5-LUTs with shared inputs.
+                 The outputs of the fracturable logic element can be optionally registered
+            -->
+            <pb_type name="fle" num_pb="10">
+              <input name="in" num_pins="8"/>
+              <input name="cin" num_pins="2"/>
+              <output name="out" num_pins="4"/>
+              <output name="cout" num_pins="2"/>
+              <clock name="clk" num_pins="1"/>
+              <!--
+                    The ALM inputs are as follows:
+                            A -> fle.in[0]
+                            B -> fle.in[1]
+                            C -> fle.in[2]
+                            D -> fle.in[3]
+                            E -> fle.in[4]
+                            F -> fle.in[5]
+                            G -> fle.in[6]
+                            H -> fle.in[7]
+              -->
+              <mode name="n2_lut5">
+                <pb_type name="ble5" num_pb="2">
+                  <input name="in" num_pins="5"/>
+                  <input name="cin" num_pins="2"/>
+                  <output name="out" num_pins="2"/>
+                  <output name="cout" num_pins="2"/>
+                  <clock name="clk" num_pins="1"/>
+                  <mode name="blut5"> <!-- plain 5-LUT mode: only ble5.in, ble5.clk and ble5.out are connected; the carry ports are not used here -->
+                    <pb_type name="flut5" num_pb="1">
+                      <input name="in" num_pins="5"/>
+                      <output name="out" num_pins="2"/>
+                      <clock name="clk" num_pins="1"/>
+                      <!-- Regular LUT mode -->
+                      <pb_type name="lut5" blif_model=".names" num_pb="1" class="lut">
+                        <input name="in" num_pins="5" port_class="lut_in"/>
+                        <output name="out" num_pins="1" port_class="lut_out"/>
+                        <!-- LUT timing using delay matrix -->
+                          <!-- The physical per-input delays (A - E) of the Extra CC LUT are listed below. Because VPR cannot
+                             do LUT rebalancing, one averaged delay is applied to every input to get more stable results.
+                             These are the same delays as the ones used for the 6-LUT with the delay of the last mux stage
+                             subtracted. NOTE(review): the values below average to 152.8e-12, the matrix uses 154.8e-12; confirm.
+                             208.91e-12
+                             207.4e-12
+                             143.94e-12
+                             126.69e-12
+                             77.06e-12
+                          -->
+                        <delay_matrix type="max" in_port="lut5.in" out_port="lut5.out">
+                            154.8e-12
+                            154.8e-12
+                            154.8e-12
+                            154.8e-12
+                            154.8e-12
+                        </delay_matrix>
+                      </pb_type>
+                      <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop"> <!-- ff[0] is fed from flut5.in[0], ff[1] from lut5.out -->
+                        <input name="D" num_pins="1" port_class="D"/>
+                        <output name="Q" num_pins="1" port_class="Q"/>
+                        <clock name="clk" num_pins="1" port_class="clock"/>
+                        <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                        <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                      </pb_type>
+                      <interconnect>
+                        <direct name="lut5_in" input="flut5.in" output="lut5.in"/>
+                        <direct name="reg_in" input="flut5.in[0]" output="ff[0].D"/> <!-- registers an input directly, bypassing the LUT -->
+                        <direct name="lut5_ff" input="lut5.out" output="ff[1].D">
+                          <delay_constant max="16.45e-12" in_port="lut5.out" out_port="ff[1].D"/>
+                          <pack_pattern name="ble5" in_port="lut5.out" out_port="ff[1].D"/>
+                        </direct>
+                        <complete name="clock" input="flut5.clk" output="ff.clk"/>
+                        <complete name="out_mux" input="ff.Q lut5.out" output="flut5.out"> <!-- either flip-flop or the LUT can drive each output -->
+                          <delay_constant max="39.78e-12" in_port="lut5.out" out_port="flut5.out"/>
+                          <delay_constant max="39.78e-12" in_port="ff.Q" out_port="flut5.out"/>
+                        </complete>
+                      </interconnect>
+                    </pb_type>
+                    <interconnect>
+                      <direct name="direct1" input="ble5.in" output="flut5.in"/>
+                      <direct name="direct2" input="ble5.clk" output="flut5.clk"/>
+                      <direct name="direct3" input="flut5.out" output="ble5.out"/>
+                    </interconnect>
+                  </mode>
+                  <mode name="arithmetic_1chain"> <!-- adder[0] adds the two lut4 outputs; its sumout is one of the crossbar sources of adder[1] -->
+                    <pb_type name="arithmetic" num_pb="1">
+                      <input name="in" num_pins="5"/>
+                      <input name="cin" num_pins="2"/>
+                      <output name="out" num_pins="2"/>
+                      <output name="cout" num_pins="2"/>
+                      <clock name="clk" num_pins="1"/>
+                      <!-- Special dual-LUT mode that drives adder only (lut4[0] to adder[0].a, lut4[1] to adder[0].b) -->
+                      <pb_type name="lut4" blif_model=".names" num_pb="2" class="lut">
+                        <input name="in" num_pins="4" port_class="lut_in"/>
+                        <output name="out" num_pins="1" port_class="lut_out"/>
+                        <!-- LUT timing using delay matrix -->
+                        <!-- The physical per-input delays of the Extra CC LUT are listed below. Because VPR cannot do LUT
+                           rebalancing, one averaged delay is applied to every input to get more stable results. The original
+                           note describes them as the 6-LUT delays of inputs A - E minus the last mux stage, up to the 5-LUT
+                           output. NOTE(review): this is a 4-LUT, and the values average to 130.47e-12, not 131.72e-12; confirm.
+                            165.14e-12
+                            163.63e-12
+                            100.17e-12
+                            92.92e-12
+                          -->
+                        <delay_matrix type="max" in_port="lut4.in" out_port="lut4.out">
+                            131.72e-12
+                            131.72e-12
+                            131.72e-12
+                            131.72e-12
+                        </delay_matrix>
+                      </pb_type>
+                      <pb_type name="adder" blif_model=".subckt adder" num_pb="2">
+                        <input name="a" num_pins="1"/>
+                        <input name="b" num_pins="1"/>
+                        <input name="cin" num_pins="1"/>
+                        <output name="cout" num_pins="1"/>
+                        <output name="sumout" num_pins="1"/>
+                        <delay_constant max="71e-12" in_port="adder.a" out_port="adder.sumout"/>
+                        <delay_constant max="71e-12" in_port="adder.b" out_port="adder.sumout"/>
+                        <delay_constant max="35.06e-12" in_port="adder.cin" out_port="adder.sumout"/>
+                        <delay_constant max="49.79e-12" in_port="adder.a" out_port="adder.cout"/>
+                        <delay_constant max="49.79e-12" in_port="adder.b" out_port="adder.cout"/>
+                        <delay_constant max="25.61e-12" in_port="adder.cin" out_port="adder.cout"/>
+                      </pb_type>
+                      <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop">
+                        <input name="D" num_pins="1" port_class="D"/>
+                        <output name="Q" num_pins="1" port_class="Q"/>
+                        <clock name="clk" num_pins="1" port_class="clock"/>
+                        <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                        <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                      </pb_type>
+                      <interconnect>
+                        <complete name="clock" input="arithmetic.clk" output="ff.clk"/>
+                        <direct name="lut4_in1" input="arithmetic.in[3:0]" output="lut4[0].in"/> <!-- both 4-LUTs share arithmetic.in[3:0] -->
+                        <direct name="lut4_in2" input="arithmetic.in[3:0]" output="lut4[1].in"/>
+                        <direct name="lut_to_add1" input="lut4[0:0].out" output="adder[0].a">
+                            <pack_pattern name="lut_chain" in_port="lut4[0:0].out" out_port="adder[0].a"/>
+                        </direct>
+                        <direct name="lut_to_add2" input="lut4[1:1].out" output="adder[0].b">
+                            <pack_pattern name="lut_chain" in_port="lut4[1:1].out" out_port="adder[0].b"/>
+                        </direct>
+                        <direct name="add_to_ff1" input="adder.sumout" output="ff.D">
+                            <delay_constant max="16.45e-12" in_port="adder.sumout" out_port="ff.D"/>
+                            <!--pack_pattern name="chain" in_port="adder[1].sumout" out_port="ff.D"/-->
+                        </direct>
+                        <direct name="carry_in1" input="arithmetic.cin[0]" output="adder[0].cin">
+                          <pack_pattern name="chain" in_port="arithmetic.cin[0]" out_port="adder[0].cin"/>
+                          <pack_pattern name="lut_chain" in_port="arithmetic.cin[0]" out_port="adder[0].cin"/>
+                        </direct>
+                        <direct name="carry_out1" input="adder[0].cout" output="arithmetic.cout[0]">
+                          <pack_pattern name="chain" in_port="adder[0].cout" out_port="arithmetic.cout[0]"/>
+                          <pack_pattern name="lut_chain" in_port="adder[0].cout" out_port="arithmetic.cout[0]"/>
+                        </direct>
+                        <direct name="carry_in2" input="arithmetic.cin[1]" output="adder[1].cin">
+                          <pack_pattern name="chain" in_port="arithmetic.cin[1]" out_port="adder[1].cin"/>
+                          <pack_pattern name="lut_chain" in_port="arithmetic.cin[1]" out_port="adder[1].cin"/>
+                        </direct>
+                        <direct name="carry_out2" input="adder[1].cout" output="arithmetic.cout[1]">
+                          <pack_pattern name="chain" in_port="adder[1].cout" out_port="arithmetic.cout[1]"/>
+                          <pack_pattern name="lut_chain" in_port="adder[1].cout" out_port="arithmetic.cout[1]"/>
+                        </direct>
+                        <!-- the output of this connection should be adder[1].a only, however, a complete cross bar
+                             is used since the packer is not aware that the adder inputs are logically equivalent -->
+                        <!--complete name="input_to_add" input="arithmetic.in[4]" output="adder[1].a adder[1].b"/-->
+                        <!-- the output of this connection should be adder[1].b only, however, a complete cross bar
+                             is used since the packer is not aware that the adder inputs are logically equivalent -->
+                        <complete name="add2_input" input="arithmetic.in[4] adder[0].sumout arithmetic.in[0]" output="adder[1].a adder[1].b">
+                            <pack_pattern name="chain" in_port="adder[0].sumout" out_port="adder[1].b"/>
+                            <pack_pattern name="lut_chain" in_port="adder[0].sumout" out_port="adder[1].b"/>
+                        </complete>
+                        <complete name="sumout" input="ff.Q adder.sumout" output="arithmetic.out"> <!-- registered or combinational sum on each output -->
+                            <delay_constant max="39.78e-12" in_port="adder.sumout" out_port="arithmetic.out"/>
+                            <delay_constant max="39.78e-12" in_port="ff.Q" out_port="arithmetic.out"/>
+                        </complete>
+                      </interconnect>
+                    </pb_type>
+                    <interconnect>
+                      <direct name="direct1" input="ble5.in" output="arithmetic.in"/>
+                      <direct name="carry_in" input="ble5.cin" output="arithmetic.cin">
+                        <pack_pattern name="chain" in_port="ble5.cin" out_port="arithmetic.cin"/>
+                        <pack_pattern name="lut_chain" in_port="ble5.cin" out_port="arithmetic.cin"/>
+                      </direct>
+                      <direct name="carry_out" input="arithmetic.cout" output="ble5.cout">
+                        <pack_pattern name="chain" in_port="arithmetic.cout" out_port="ble5.cout"/>
+                        <pack_pattern name="lut_chain" in_port="arithmetic.cout" out_port="ble5.cout"/>
+                      </direct>
+                      <direct name="direct2" input="ble5.clk" output="arithmetic.clk"/>
+                      <direct name="direct3" input="arithmetic.out" output="ble5.out"/>
+                    </interconnect>
+                  </mode>
+                  <mode name="arithmetic_2chains">
+                    <pb_type name="arithmetic" num_pb="1">
+                      <input name="in" num_pins="5"/>
+                      <input name="cin" num_pins="2"/>
+                      <output name="out" num_pins="2"/>
+                      <output name="cout" num_pins="2"/>
+                      <clock name="clk" num_pins="1"/>
+                      <!-- Special dual-LUT mode that drives adder only -->
+                      <pb_type name="lut4" blif_model=".names" num_pb="2" class="lut">
+                        <input name="in" num_pins="4" port_class="lut_in"/>
+                        <output name="out" num_pins="1" port_class="lut_out"/>
+                        <!-- LUT timing using delay matrix -->
+                        <!-- These are the physical delay inputs on a Stratix 10 LUT but because VPR cannot do LUT rebalancing,
+                           we instead take the average of these numbers to get more stable results
+                           note that those are the same delays for inputs A - E as the ones used for the 6-LUT, however, we have
+                           subtracted the delay of the last mux stage to get the delay of inputs A - E till the 5-LUT output
+                           165.14e-12
+                           163.63e-12
+                           100.17e-12
+                           92.92e-12
+                          -->
+                        <delay_matrix type="max" in_port="lut4.in" out_port="lut4.out">
+                            130.47e-12
+                            130.47e-12
+                            130.47e-12
+                            130.47e-12
+                        </delay_matrix>
+                      </pb_type>
+                      <pb_type name="adder" blif_model=".subckt adder" num_pb="2">
+                        <input name="a" num_pins="1"/>
+                        <input name="b" num_pins="1"/>
+                        <input name="cin" num_pins="1"/>
+                        <output name="cout" num_pins="1"/>
+                        <output name="sumout" num_pins="1"/>
+                        <delay_constant max="71.95e-12" in_port="adder.a" out_port="adder.sumout"/>
+                        <delay_constant max="71.95e-12" in_port="adder.b" out_port="adder.sumout"/>
+                        <delay_constant max="37.55e-12" in_port="adder.cin" out_port="adder.sumout"/>
+                        <delay_constant max="49.31e-12" in_port="adder.a" out_port="adder.cout"/>
+                        <delay_constant max="49.31e-12" in_port="adder.b" out_port="adder.cout"/>
+                        <delay_constant max="25.61e-12" in_port="adder.cin" out_port="adder.cout"/>
+                      </pb_type>
+                      <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop">
+                        <input name="D" num_pins="1" port_class="D"/>
+                        <output name="Q" num_pins="1" port_class="Q"/>
+                        <clock name="clk" num_pins="1" port_class="clock"/>
+                        <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                        <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                      </pb_type>
+                      <interconnect>
+                        <complete name="clock" input="arithmetic.clk" output="ff.clk"/>
+                        <direct name="lut4_in1" input="arithmetic.in[3:0]" output="lut4[0].in"/>
+                        <direct name="lut4_in2" input="arithmetic.in[3:0]" output="lut4[1].in"/>
+                        <direct name="lut_to_add1" input="lut4[0:0].out" output="adder[0].a">
+                            <pack_pattern name="simple_lut_chain" in_port="lut4[0:0].out" out_port="adder[0].a"/>
+                        </direct>
+                        <direct name="lut_to_add2" input="lut4[1:1].out" output="adder[0].b">
+                            <pack_pattern name="simple_lut_chain" in_port="lut4[1:1].out" out_port="adder[0].b"/>
+                        </direct>
+                        <direct name="add_to_ff1" input="adder[0].sumout" output="ff[0].D">
+                            <delay_constant max="16.45e-12" in_port="adder[0].sumout" out_port="ff[0].D"/>
+                            <!--pack_pattern name="simple_chain" in_port="adder[0].sumout" out_port="ff[0].D"/-->
+                        </direct>
+                        <direct name="add_to_ff2" input="adder[1].sumout" output="ff[1].D">
+                            <delay_constant max="16.45e-12" in_port="adder[1].sumout" out_port="ff[1].D"/>
+                            <!--pack_pattern name="simple_chain" in_port="adder[1].sumout" out_port="ff[1].D"/-->
+                        </direct>
+                        <direct name="carry_in1" input="arithmetic.cin[0]" output="adder[0].cin">
+                          <pack_pattern name="simple_chain" in_port="arithmetic.cin[0]" out_port="adder[0].cin"/>
+                          <pack_pattern name="simple_lut_chain" in_port="arithmetic.cin[0]" out_port="adder[0].cin"/>
+                        </direct>
+                        <direct name="carry_out1" input="adder[0].cout" output="arithmetic.cout[0]">
+                          <pack_pattern name="simple_chain" in_port="adder[0].cout" out_port="arithmetic.cout[0]"/>
+                          <pack_pattern name="simple_lut_chain" in_port="adder[0].cout" out_port="arithmetic.cout[0]"/>
+                        </direct>
+                        <direct name="carry_in2" input="arithmetic.cin[1]" output="adder[1].cin">
+                          <pack_pattern name="simple_chain" in_port="arithmetic.cin[1]" out_port="adder[1].cin"/>
+                        </direct>
+                        <direct name="carry_out2" input="adder[1].cout" output="arithmetic.cout[1]">
+                          <pack_pattern name="simple_chain" in_port="adder[1].cout" out_port="arithmetic.cout[1]"/>
+                        </direct>
+                        <!-- the output of this connection should be adder[1].a only, however, a complete cross bar
+                             is used since the packer is not aware that the adder inputs are logically equivalent -->
+                        <!--complete name="input_to_add" input="arithmetic.in[4]" output="adder[1].a adder[1].b"/-->
+                        <!-- the output of this connection should be adder[1].b only, however, a complete cross bar
+                             is used since the packer is not aware that the adder inputs are logically equivalent -->
+                        <complete name="add2_input" input="lut4[0].out arithmetic.in[4]" output="adder[1].a adder[1].b"/>
+                        <complete name="sumout" input="ff.Q adder.sumout" output="arithmetic.out">
+                            <delay_constant max="39.78e-12" in_port="adder.sumout" out_port="arithmetic.out"/>
+                            <delay_constant max="39.78e-12" in_port="ff.Q" out_port="arithmetic.out"/>
+                        </complete>
+                      </interconnect>
+                    </pb_type>
+                    <interconnect> <!-- ble5 to arithmetic wiring: data inputs, clock and outputs pass straight through; the carry edges also carry packer tags -->
+                      <direct name="direct1" input="ble5.in" output="arithmetic.in"/>
+                      <direct name="carry_in" input="ble5.cin" output="arithmetic.cin"> <!-- simple_chain tags the whole cin bus, simple_lut_chain only bit 0 -->
+                        <pack_pattern name="simple_chain" in_port="ble5.cin" out_port="arithmetic.cin"/>
+                        <pack_pattern name="simple_lut_chain" in_port="ble5.cin[0]" out_port="arithmetic.cin[0]"/>
+                      </direct>
+                      <direct name="carry_out" input="arithmetic.cout" output="ble5.cout"> <!-- mirror of carry_in for the outgoing carries -->
+                        <pack_pattern name="simple_chain" in_port="arithmetic.cout" out_port="ble5.cout"/>
+                        <pack_pattern name="simple_lut_chain" in_port="arithmetic.cout[0]" out_port="ble5.cout[0]"/>
+                      </direct>
+                      <direct name="direct2" input="ble5.clk" output="arithmetic.clk"/>
+                      <direct name="direct3" input="arithmetic.out" output="ble5.out"/>
+                    </interconnect>
+                  </mode>
+                </pb_type>
+                <interconnect> <!-- n2_lut5 wiring: fle.in[1:0] are shared by both ble5s, fle.in[4:2] and fle.in[7:5] are private, and the carry runs fle.cin, ble5[0], ble5[1], fle.cout -->
+                  <!-- Shared inputs between the two 5-LUTs (the two shared pins land on swapped positions of ble5[1]) -->
+                  <complete name="lut5_reg1" input="fle.in[0]" output="ble5[0].in[0] ble5[1].in[1]"/>
+                  <complete name="lut5_reg2" input="fle.in[1]" output="ble5[0].in[1] ble5[1].in[0]"/>
+
+                  <!-- Rest of the 5-LUT inputs -->
+                  <direct name="lut5_inputs_1" input="fle.in[4:2]" output="ble5[0].in[4:2]"/>
+                  <direct name="lut5_inputs_22" input="fle.in[7:5]" output="ble5[1].in[4:2]"/>
+
+                  <direct name="lut5_outputs_1" input="ble5[0].out" output="fle.out[1:0]"/>
+                  <direct name="lut5_outputs_2" input="ble5[1].out" output="fle.out[3:2]"/>
+
+                  <direct name="carry_in" input="fle.cin" output="ble5[0].cin"> <!-- the simple_lut_chain pattern is applied to carry bit 0 only -->
+                    <pack_pattern name="chain" in_port="fle.cin" out_port="ble5[0].cin"/>
+                    <pack_pattern name="lut_chain" in_port="fle.cin" out_port="ble5[0].cin"/>
+                    <pack_pattern name="simple_chain" in_port="fle.cin" out_port="ble5[0].cin"/>
+                    <pack_pattern name="simple_lut_chain" in_port="fle.cin[0]" out_port="ble5[0].cin[0]"/>
+                  </direct>
+                  <direct name="carry_out" input="ble5[1].cout" output="fle.cout">
+                    <pack_pattern name="chain" in_port="ble5[1].cout" out_port="fle.cout"/>
+                    <pack_pattern name="lut_chain" in_port="ble5[1].cout" out_port="fle.cout"/>
+                    <pack_pattern name="simple_chain" in_port="ble5[1].cout" out_port="fle.cout"/>
+                    <pack_pattern name="simple_lut_chain" in_port="ble5[1].cout[0]" out_port="fle.cout[0]"/>
+                  </direct>
+                  <direct name="carry_link" input="ble5[0].cout" output="ble5[1].cin"> <!-- each pack_pattern names this edge's own endpoints: ble5[0].cout into ble5[1].cin -->
+                    <pack_pattern name="chain" in_port="ble5[0].cout" out_port="ble5[1].cin"/>
+                    <pack_pattern name="lut_chain" in_port="ble5[0].cout" out_port="ble5[1].cin"/>
+                    <pack_pattern name="simple_chain" in_port="ble5[0].cout" out_port="ble5[1].cin"/>
+                    <pack_pattern name="simple_lut_chain" in_port="ble5[0].cout[0]" out_port="ble5[1].cin[0]"/>
+                  </direct>
+                  <complete name="clock" input="fle.clk" output="ble5[1:0].clk"/>
+                </interconnect>
+              </mode> <!-- n2_lut5 -->
+              <mode name="n1_lut6"> <!-- fle used as one 6-input LUT (ble6) with two flip-flops and four outputs -->
+                <pb_type name="ble6" num_pb="1">
+                  <input name="in" num_pins="6"/>
+                  <output name="out" num_pins="4"/>
+                  <clock name="clk" num_pins="1"/>
+                  <pb_type name="lut6" blif_model=".names" num_pb="1" class="lut">
+                    <input name="in" num_pins="6" port_class="lut_in"/>
+                    <output name="out" num_pins="1" port_class="lut_out"/>
+                    <!-- LUT timing using delay matrix -->
+                    <!-- These are the physical per-input delays of an Extra CC LUT, but because VPR cannot do LUT rebalancing,
+                           we instead use the average of these numbers (about 180.6 ps) on every input to get more stable results
+                           254.02e-12
+                           252.51e-12
+                           189.05e-12
+                           181.8e-12
+                           122.17e-12
+                           84.19e-12
+                      -->
+                    <delay_matrix type="max" in_port="lut6.in" out_port="lut6.out">
+                        180.6e-12
+                        180.6e-12
+                        180.6e-12
+                        180.6e-12
+                        180.6e-12
+                        180.6e-12
+                    </delay_matrix>
+                  </pb_type>
+                  <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop">
+                    <input name="D" num_pins="1" port_class="D"/>
+                    <output name="Q" num_pins="1" port_class="Q"/>
+                    <clock name="clk" num_pins="1" port_class="clock"/>
+                    <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                    <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                  </pb_type>
+                  <interconnect>
+                    <direct name="lut6_inputs" input="ble6.in" output="lut6.in"/>
+                    <direct name="lut6_ff" input="lut6.out" output="ff[1].D"> <!-- ff[1] registers the LUT output; this edge is tagged with the ble6 pack pattern -->
+                      <delay_constant max="16.45e-12" in_port="lut6.out" out_port="ff[1].D"/>
+                      <pack_pattern name="ble6" in_port="lut6.out" out_port="ff[1].D"/>
+                    </direct>
+                    <complete name="clock" input="ble6.clk" output="ff.clk"/>
+                    <direct name="input_to_ff" input="ble6.in[0]" output="ff[0].D"/> <!-- ff[0] is fed straight from ble6.in[0], bypassing the LUT -->
+                    <complete name="mux1" input="ff[0].Q lut6.out" output="ble6.out[1:0]"> <!-- out[1:0]: either ff[0].Q or the unregistered LUT output -->
+                      <delay_constant max="39.78e-12" in_port="lut6.out" out_port="ble6.out[1:0]"/>
+                      <delay_constant max="39.78e-12" in_port="ff[0].Q" out_port="ble6.out[1:0]"/>
+                    </complete>
+                    <complete name="mux2" input="ff[1].Q lut6.out" output="ble6.out[3:2]"> <!-- out[3:2]: either ff[1].Q (registered LUT output) or the unregistered LUT output -->
+                      <delay_constant max="39.78e-12" in_port="lut6.out" out_port="ble6.out[3:2]"/>
+                      <delay_constant max="39.78e-12" in_port="ff[1].Q" out_port="ble6.out[3:2]"/>
+                    </complete>
+                  </interconnect>
+                </pb_type>
+                <interconnect>
+                  <!-- ble6 takes inputs A, B, C, D, E, & F; where A to E are fle.in[4:0] and F is fle.in[7]; fle.in[6:5] are not connected in this mode -->
+                  <direct name="lut6_inputs1" input="fle.in[4:0]" output="ble6.in[4:0]"/>
+                  <direct name="lut6_inputs2" input="fle.in[7]" output="ble6.in[5]"/>
+                  <direct name="direct2" input="ble6.out" output="fle.out"/>
+                  <direct name="direct4" input="fle.clk" output="ble6.clk"/>
+                </interconnect>
+              </mode> <!-- n1_lut6 -->
+            </pb_type>
+            <interconnect>
+              <!-- We use a 50% depopulated crossbar built from small full crossbars to get sets of logically equivalent pins at the inputs of the CLB.
+               NOTE(review): the derivation below is inherited from the Stratix IV based architectures; the crossbar delay_constants in this
+               block are 72.73 ps (the file header attributes logic delays to COFFE runs), so confirm whether this text still applies.
+               The delay through a connection block input mux + the crossbar in Stratix IV is 167 ps. We already have a 72 ps
+               delay on the connection block input mux (modeled by Ian Kuon), so the remaining delay within the crossbar is 95 ps.
+               The delay of cluster feedbacks in Stratix IV is 100 ps, when driven by a LUT.
+               Since all our LUT outputs go to a BLE output, and have a delay of
+               25 ps to do so, we subtract 25 ps from the 100 ps delay of a feedback
+               to get the part that should be marked on the crossbar. -->
+
+              <!-- 50% sparsely populated local routing: every fle input pin is reachable from two of the four lab input groups (I1 to I4) -->
+              <complete name="lutA" input="lab.I4 lab.I3" output="fle[9:0].in[0:0]">
+                <delay_constant max="72.73e-12" in_port="lab.I4" out_port="fle.in[0:0]"/>
+                <delay_constant max="72.73e-12" in_port="lab.I3" out_port="fle.in[0:0]"/>
+              </complete>
+              <complete name="lutB" input="lab.I3 lab.I2" output="fle[9:0].in[1:1]">
+                <delay_constant max="72.73e-12" in_port="lab.I3" out_port="fle.in[1:1]"/>
+                <delay_constant max="72.73e-12" in_port="lab.I2" out_port="fle.in[1:1]"/>
+              </complete>
+              <complete name="lutC" input="lab.I2 lab.I1" output="fle[9:0].in[2:2]">
+                <delay_constant max="72.73e-12" in_port="lab.I2" out_port="fle.in[2:2]"/>
+                <delay_constant max="72.73e-12" in_port="lab.I1" out_port="fle.in[2:2]"/>
+              </complete>
+              <complete name="lutD" input="lab.I4 lab.I2" output="fle[9:0].in[3:3]">
+                <delay_constant max="72.73e-12" in_port="lab.I4" out_port="fle.in[3:3]"/>
+                <delay_constant max="72.73e-12" in_port="lab.I2" out_port="fle.in[3:3]"/>
+              </complete>
+              <complete name="lutE" input="lab.I3 lab.I1" output="fle[9:0].in[4:4]">
+                <delay_constant max="72.73e-12" in_port="lab.I3" out_port="fle.in[4:4]"/>
+                <delay_constant max="72.73e-12" in_port="lab.I1" out_port="fle.in[4:4]"/>
+              </complete>
+              <complete name="lutF" input="lab.I4 lab.I1" output="fle[9:0].in[5:5]">
+                <delay_constant max="72.73e-12" in_port="lab.I4" out_port="fle.in[5:5]"/>
+                <delay_constant max="72.73e-12" in_port="lab.I1" out_port="fle.in[5:5]"/>
+              </complete>
+              <complete name="lutG" input="lab.I4 lab.I3" output="fle[9:0].in[6:6]">
+                <delay_constant max="72.73e-12" in_port="lab.I4" out_port="fle.in[6:6]"/>
+                <delay_constant max="72.73e-12" in_port="lab.I3" out_port="fle.in[6:6]"/>
+              </complete>
+              <complete name="lutH" input="lab.I3 lab.I2" output="fle[9:0].in[7:7]">
+                <delay_constant max="72.73e-12" in_port="lab.I3" out_port="fle.in[7:7]"/>
+                <delay_constant max="72.73e-12" in_port="lab.I2" out_port="fle.in[7:7]"/>
+              </complete>
+
+              <complete name="clks" input="lab.clk" output="fle[9:0].clk"/>
+
+              <!-- This way of specifying direct connections to clb outputs is important because this architecture uses automatic spreading of opins.
+                     By grouping the output pins in this fashion (output k of all ten fles forms one group of ten lab.O pins), if a logic block is completely filled by 6-LUTs,
+                     then the outputs those 6-LUTs take get evenly distributed across all four sides of the CLB instead of clumped on two sides (which is what happens with a more
+                     naive specification).
+              -->
+              <direct name="labouts1" input="fle[9:0].out[0]" output="lab.O[9:0]"/>
+              <direct name="labouts2" input="fle[9:0].out[1]" output="lab.O[19:10]"/>
+              <direct name="labouts3" input="fle[9:0].out[2]" output="lab.O[29:20]"/>
+              <direct name="labouts4" input="fle[9:0].out[3]" output="lab.O[39:30]"/>
+
+              <!-- Carry chain links: lab.cin enters fle[0], each fle's cout feeds the next fle's cin, and fle[9] drives lab.cout -->
+              <direct name="carry_in" input="lab.cin" output="fle[0:0].cin">
+                <!-- Put all inter-block carry chain delay on this one edge -->
+                <delay_constant max="18.69e-12" in_port="lab.cin[0]" out_port="fle[0:0].cin[0]"/>
+                <delay_constant max="18.85e-12" in_port="lab.cin[1]" out_port="fle[0:0].cin[1]"/>
+                <pack_pattern name="chain" in_port="lab.cin" out_port="fle[0:0].cin"/>
+                <pack_pattern name="lut_chain" in_port="lab.cin" out_port="fle[0:0].cin"/>
+                <pack_pattern name="simple_chain" in_port="lab.cin" out_port="fle[0:0].cin"/>
+                <pack_pattern name="simple_lut_chain" in_port="lab.cin[0]" out_port="fle[0:0].cin[0]"/>
+              </direct>
+              <direct name="carry_out" input="fle[9:9].cout" output="lab.cout">
+                <pack_pattern name="chain" in_port="fle[9:9].cout" out_port="lab.cout"/>
+                <pack_pattern name="lut_chain" in_port="fle[9:9].cout" out_port="lab.cout"/>
+                <pack_pattern name="simple_chain" in_port="fle[9:9].cout" out_port="lab.cout"/>
+                <pack_pattern name="simple_lut_chain" in_port="fle[9:9].cout[0]" out_port="lab.cout[0]"/>
+              </direct>
+              <direct name="carry_link" input="fle[8:0].cout" output="fle[9:1].cin"> <!-- cout of fle[i] feeds cin of fle[i+1] for i = 0 to 8 -->
+                <pack_pattern name="chain" in_port="fle[8:0].cout" out_port="fle[9:1].cin"/>
+                <pack_pattern name="lut_chain" in_port="fle[8:0].cout" out_port="fle[9:1].cin"/>
+                <pack_pattern name="simple_chain" in_port="fle[8:0].cout" out_port="fle[9:1].cin"/>
+                <pack_pattern name="simple_lut_chain" in_port="fle[8:0].cout[0]" out_port="fle[9:1].cin[0]"/>
+              </direct>
+            </interconnect>
+        </pb_type>
+        <interconnect> <!-- clb to lab wiring: carries, clock, inputs and outputs pass straight through; 20 of the lab outputs are additionally fed back to the lab inputs -->
+            <direct name="carry_in1" input="clb.cin[0]" output="lab.cin[0]">
+                <pack_pattern name="chain" in_port="clb.cin[0]" out_port="lab.cin[0]"/>
+                <pack_pattern name="lut_chain" in_port="clb.cin[0]" out_port="lab.cin[0]"/>
+                <pack_pattern name="simple_chain" in_port="clb.cin[0]" out_port="lab.cin[0]"/>
+                <pack_pattern name="simple_lut_chain" in_port="clb.cin[0]" out_port="lab.cin[0]"/>
+            </direct>
+            <direct name="carry_out1" input="lab.cout[0]" output="clb.cout[0]">
+                <pack_pattern name="chain" in_port="lab.cout[0]" out_port="clb.cout[0]"/>
+                <pack_pattern name="lut_chain" in_port="lab.cout[0]" out_port="clb.cout[0]"/>
+                <pack_pattern name="simple_chain" in_port="lab.cout[0]" out_port="clb.cout[0]"/>
+                <pack_pattern name="simple_lut_chain" in_port="lab.cout[0]" out_port="clb.cout[0]"/>
+            </direct>
+            <direct name="carry_in2" input="clb.cin[1]" output="lab.cin[1]"> <!-- carry bit 1 has no simple_lut_chain tag; that pattern is only applied to bit 0 -->
+                <pack_pattern name="chain" in_port="clb.cin[1]" out_port="lab.cin[1]"/>
+                <pack_pattern name="lut_chain" in_port="clb.cin[1]" out_port="lab.cin[1]"/>
+                <pack_pattern name="simple_chain" in_port="clb.cin[1]" out_port="lab.cin[1]"/>
+            </direct>
+            <direct name="carry_out2" input="lab.cout[1]" output="clb.cout[1]">
+                <pack_pattern name="chain" in_port="lab.cout[1]" out_port="clb.cout[1]"/>
+                <pack_pattern name="lut_chain" in_port="lab.cout[1]" out_port="clb.cout[1]"/>
+                <pack_pattern name="simple_chain" in_port="lab.cout[1]" out_port="clb.cout[1]"/>
+            </direct>
+            <direct name="clock" input="clb.clk" output="lab.clk"/>
+
+            <complete name="Input_feedback_I1" input="lab.O[4:0]" output="lab.I1"/> <!-- feedback: lab.O[9:0] and lab.O[29:20] are out[0] and out[2] of the ten fles, so two outputs per fle return to the lab inputs -->
+            <complete name="Input_feedback_I2" input="lab.O[24:20]" output="lab.I2"/>
+            <complete name="Input_feedback_I3" input="lab.O[9:5]" output="lab.I3"/>
+            <complete name="Input_feedback_I4" input="lab.O[29:25]" output="lab.I4"/>
+
+            <direct name="Input_I1" input="clb.I1" output="lab.I1"/> <!-- the external clb inputs drive the same lab.I1 to lab.I4 ports as the feedback crossbars above -->
+            <direct name="Input_I2" input="clb.I2" output="lab.I2"/>
+            <direct name="Input_I3" input="clb.I3" output="lab.I3"/>
+            <direct name="Input_I4" input="clb.I4" output="lab.I4"/>
+
+            <direct name="output" input="lab.O" output="clb.O"/>
+        </interconnect>
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"> <!-- default Fc: 0.15 for inputs, 0.10 for outputs; the overrides set Fc to 0 for cin and cout (presumably because carries use dedicated links; confirm) -->
+            <fc_override port_name="cin" fc_type="frac" fc_val="0"/>
+            <fc_override port_name="cout" fc_type="frac" fc_val="0"/>
+        </fc>
+        <pinlocations pattern="spread"/>
+      </pb_type>
+       <!-- Define general purpose logic block (CLB) ends -->
+
+       <!-- Define fracturable multiplier begin -->
+       <pb_type name="mult_27" height="2"> <!-- Fracturable multiplier block: either two independent 18x19 multipliers or one 27x27 multiplier -->
+         <input name="datain" num_pins="74"/>
+         <output name="dataout" num_pins="74"/>
+
+         <mode name="two_mult_18x19"> <!-- 2 x (18 + 19) = 74 inputs and 2 x 37 = 74 outputs, so every block pin is used in this mode -->
+           <pb_type name="two_mult_18x19" num_pb="2">
+             <input name="a" num_pins="18"/>
+             <input name="b" num_pins="19"/>
+             <output name="out" num_pins="37"/>
+             <pb_type name="mult_18x19" blif_model=".subckt multiply" num_pb="1">
+               <input name="a" num_pins="18"/>
+               <input name="b" num_pins="19"/>
+               <output name="out" num_pins="37"/>
+               <!-- Using the numbers from Arria 10 which is a 22nm technology, an 18x19 multiplier
+                    can operate at 548 MHz which maps to a delay of 1.825e-9 -->
+                 <delay_constant max="1.825e-9" in_port="mult_18x19.a" out_port="mult_18x19.out"/>
+                 <delay_constant max="1.825e-9" in_port="mult_18x19.b" out_port="mult_18x19.out"/>
+             </pb_type>
+             <interconnect>
+               <direct name="a2a" input="two_mult_18x19.a" output="mult_18x19.a">
+               </direct>
+               <direct name="b2b" input="two_mult_18x19.b" output="mult_18x19.b">
+               </direct>
+               <direct name="out2out" input="mult_18x19.out" output="two_mult_18x19.out">
+               </direct>
+             </interconnect>
+               <power method="pin-toggle">
+                 <port name="a" energy_per_toggle="1.09e-12"/>
+                 <port name="b" energy_per_toggle="1.09e-12"/>
+                 <static_power power_per_instance="0.0"/>
+               </power>
+           </pb_type>
+           <interconnect>
+             <!-- Stratix IV input delay of 207ps is conservative for this architecture because this architecture does not have an input crossbar in the multiplier.
+              Subtract 72.5 ps delay, which is already in the connection block input mux, leading
+              to a 134 ps delay. -->
+             <direct name="datain2a1" input="mult_27.datain[17:0]" output="two_mult_18x19[0].a"> <!-- datain[36:0] feeds multiplier 0 (a then b), datain[73:37] feeds multiplier 1 -->
+               <delay_constant max="134e-12" in_port="mult_27.datain[17:0]" out_port="two_mult_18x19[0].a"/>
+             </direct>
+             <direct name="datain2b1" input="mult_27.datain[36:18]" output="two_mult_18x19[0].b">
+               <delay_constant max="134e-12" in_port="mult_27.datain[36:18]" out_port="two_mult_18x19[0].b"/>
+             </direct>
+             <direct name="datain2a2" input="mult_27.datain[54:37]" output="two_mult_18x19[1].a">
+               <delay_constant max="134e-12" in_port="mult_27.datain[54:37]" out_port="two_mult_18x19[1].a"/>
+             </direct>
+             <direct name="datain2b2" input="mult_27.datain[73:55]" output="two_mult_18x19[1].b">
+               <delay_constant max="134e-12" in_port="mult_27.datain[73:55]" out_port="two_mult_18x19[1].b"/>
+             </direct>
+             <direct name="out2dataout" input="two_mult_18x19[1:0].out" output="mult_27.dataout">
+               <delay_constant max="1.09e-9" in_port="two_mult_18x19[1:0].out" out_port="mult_27.dataout"/>
+             </direct>
+           </interconnect>
+         </mode>
+
+         <mode name="mult_27x27"> <!-- uses datain[53:0] and dataout[53:0] only; pins [73:54] are not connected in this mode -->
+           <pb_type name="one_mult_27x27" num_pb="1">
+             <input name="a" num_pins="27"/>
+             <input name="b" num_pins="27"/>
+             <output name="out" num_pins="54"/>
+
+             <pb_type name="mult_27x27" blif_model=".subckt multiply" num_pb="1">
+               <input name="a" num_pins="27"/>
+               <input name="b" num_pins="27"/>
+               <output name="out" num_pins="54"/>
+               <!-- Using the numbers from Arria 10 which is a 22nm technology, a 27x27 multiplier
+                    can operate at 541 MHz which maps to a delay of 1.848e-9 -->
+               <delay_constant max="1.848e-9" in_port="mult_27x27.a" out_port="mult_27x27.out"/>
+               <delay_constant max="1.848e-9" in_port="mult_27x27.b" out_port="mult_27x27.out"/>
+             </pb_type>
+             <interconnect>
+               <direct name="a2a" input="one_mult_27x27.a" output="mult_27x27.a">
+               </direct>
+               <direct name="b2b" input="one_mult_27x27.b" output="mult_27x27.b">
+               </direct>
+               <direct name="out2out" input="mult_27x27.out" output="one_mult_27x27.out">
+               </direct>
+             </interconnect>
+             <power method="pin-toggle">
+               <port name="a" energy_per_toggle="2.13e-12"/>
+               <port name="b" energy_per_toggle="2.13e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <!-- Stratix IV input delay of 207ps is conservative for this architecture because this architecture does not have an input crossbar in the multiplier.
+              Subtract 72.5 ps delay, which is already in the connection block input mux, leading
+              to a 134 ps delay.
+                 -->
+             <direct name="datain2a" input="mult_27.datain[26:0]" output="one_mult_27x27.a">
+               <delay_constant max="134e-12" in_port="mult_27.datain[26:0]" out_port="one_mult_27x27.a"/>
+             </direct>
+             <direct name="datain2b" input="mult_27.datain[53:27]" output="one_mult_27x27.b">
+               <delay_constant max="134e-12" in_port="mult_27.datain[53:27]" out_port="one_mult_27x27.b"/>
+             </direct>
+             <direct name="out2dataout" input="one_mult_27x27.out" output="mult_27.dataout[53:0]">
+               <delay_constant max="1.93e-9" in_port="one_mult_27x27.out" out_port="mult_27.dataout[53:0]"/>
+             </direct>
+           </interconnect>
+
+         </mode>
+
+         <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+         <pinlocations pattern="spread"/>
+
+         <!-- Place this multiplier block every 8 columns from (and including) the sixth column. NOTE(review): the column placement is not specified inside this pb_type; confirm against the layout section -->
+         <power method="sum-of-children"/>
+       </pb_type>
+       <!-- Define fracturable multiplier end -->
+
+       <!-- Define fracturable memory begin -->
+       <pb_type name="memory" height="4">
+         <input name="addr1" num_pins="11"/>
+         <input name="addr2" num_pins="11"/>
+         <input name="data" num_pins="40"/>
+         <input name="we1" num_pins="1"/>
+         <input name="we2" num_pins="1"/>
+         <output name="out" num_pins="40"/>
+         <clock name="clk" num_pins="1"/>
+
+         <!-- Specify single port mode first -->
+         <mode name="mem_512x40_sp"> <!-- Single-port 512 x 40: uses addr1[8:0], all 40 data and out bits, and we1; addr2 and we2 are not connected in this mode -->
+           <pb_type name="mem_512x40_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+             <input name="addr" num_pins="9" port_class="address"/>
+             <input name="data" num_pins="40" port_class="data_in"/>
+             <input name="we" num_pins="1" port_class="write_en"/>
+             <output name="out" num_pins="40" port_class="data_out"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <T_setup value="509e-12" port="mem_512x40_sp.addr" clock="clk"/>
+             <T_setup value="509e-12" port="mem_512x40_sp.data" clock="clk"/>
+             <T_setup value="509e-12" port="mem_512x40_sp.we" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_512x40_sp.out" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="9.0e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect> <!-- 132 ps on every input path, 40 ps on the output path, no delay annotated on the clock -->
+             <direct name="address1" input="memory.addr1[8:0]" output="mem_512x40_sp.addr">
+               <delay_constant max="132e-12" in_port="memory.addr1[8:0]" out_port="mem_512x40_sp.addr"/>
+             </direct>
+             <direct name="data1" input="memory.data" output="mem_512x40_sp.data">
+               <delay_constant max="132e-12" in_port="memory.data" out_port="mem_512x40_sp.data"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_512x40_sp.we">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_512x40_sp.we"/>
+             </direct>
+             <direct name="dataout1" input="mem_512x40_sp.out" output="memory.out">
+               <delay_constant max="40e-12" in_port="mem_512x40_sp.out" out_port="memory.out"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_512x40_sp.clk">
+             </direct>
+           </interconnect>
+         </mode>
+
+         <mode name="mem_1024x20_sp"> <!-- Single-port 1024 x 20: uses addr1[9:0], data[19:0], out[19:0] and we1; the remaining memory pins are not connected in this mode -->
+           <pb_type name="mem_1024x20_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+             <input name="addr" num_pins="10" port_class="address"/>
+             <input name="data" num_pins="20" port_class="data_in"/>
+             <input name="we" num_pins="1" port_class="write_en"/>
+             <output name="out" num_pins="20" port_class="data_out"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <T_setup value="509e-12" port="mem_1024x20_sp.addr" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_sp.data" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_sp.we" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_1024x20_sp.out" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="9.0e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect> <!-- 132 ps on every input path, 40 ps on the output path, no delay annotated on the clock -->
+             <direct name="address1" input="memory.addr1[9:0]" output="mem_1024x20_sp.addr">
+               <delay_constant max="132e-12" in_port="memory.addr1[9:0]" out_port="mem_1024x20_sp.addr"/>
+             </direct>
+             <direct name="data1" input="memory.data[19:0]" output="mem_1024x20_sp.data">
+               <delay_constant max="132e-12" in_port="memory.data[19:0]" out_port="mem_1024x20_sp.data"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_1024x20_sp.we">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_1024x20_sp.we"/>
+             </direct>
+             <direct name="dataout1" input="mem_1024x20_sp.out" output="memory.out[19:0]">
+               <delay_constant max="40e-12" in_port="mem_1024x20_sp.out" out_port="memory.out[19:0]"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_1024x20_sp.clk">
+             </direct>
+           </interconnect>
+         </mode>
+
+         <mode name="mem_2048x10_sp"> <!-- Single-port 2048 x 10: uses addr1[10:0], data[9:0], out[9:0] and we1; the remaining memory pins are not connected in this mode -->
+           <pb_type name="mem_2048x10_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+             <input name="addr" num_pins="11" port_class="address"/>
+             <input name="data" num_pins="10" port_class="data_in"/>
+             <input name="we" num_pins="1" port_class="write_en"/>
+             <output name="out" num_pins="10" port_class="data_out"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <T_setup value="509e-12" port="mem_2048x10_sp.addr" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_sp.data" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_sp.we" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_2048x10_sp.out" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="9.0e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect> <!-- 132 ps on every input path, 40 ps on the output path, no delay annotated on the clock -->
+             <direct name="address1" input="memory.addr1[10:0]" output="mem_2048x10_sp.addr">
+               <delay_constant max="132e-12" in_port="memory.addr1[10:0]" out_port="mem_2048x10_sp.addr"/>
+             </direct>
+             <direct name="data1" input="memory.data[9:0]" output="mem_2048x10_sp.data">
+               <delay_constant max="132e-12" in_port="memory.data[9:0]" out_port="mem_2048x10_sp.data"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_2048x10_sp.we">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_2048x10_sp.we"/>
+             </direct>
+             <direct name="dataout1" input="mem_2048x10_sp.out" output="memory.out[9:0]">
+               <delay_constant max="40e-12" in_port="mem_2048x10_sp.out" out_port="memory.out[9:0]"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_2048x10_sp.clk">
+             </direct>
+           </interconnect>
+         </mode>
+
+         <!-- Specify true dual port mode next -->
+         <mode name="mem_1024x20_dp"> <!-- Dual-port 1024 x 20: port 1 uses addr1[9:0], data[19:0], we1, out[19:0]; port 2 uses addr2[9:0], data[39:20], we2, out[39:20]; both ports share clk -->
+           <pb_type name="mem_1024x20_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+             <input name="addr1" num_pins="10" port_class="address1"/>
+             <input name="addr2" num_pins="10" port_class="address2"/>
+             <input name="data1" num_pins="20" port_class="data_in1"/>
+             <input name="data2" num_pins="20" port_class="data_in2"/>
+             <input name="we1" num_pins="1" port_class="write_en1"/>
+             <input name="we2" num_pins="1" port_class="write_en2"/>
+             <output name="out1" num_pins="20" port_class="data_out1"/>
+             <output name="out2" num_pins="20" port_class="data_out2"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.addr1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.data1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.we1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.addr2" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.data2" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.we2" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_1024x20_dp.out1" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_1024x20_dp.out2" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="17.9e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect> <!-- 132 ps on every input path, 40 ps on each output path, no delay annotated on the clock -->
+             <direct name="address1" input="memory.addr1[9:0]" output="mem_1024x20_dp.addr1">
+               <delay_constant max="132e-12" in_port="memory.addr1[9:0]" out_port="mem_1024x20_dp.addr1"/>
+             </direct>
+             <direct name="address2" input="memory.addr2[9:0]" output="mem_1024x20_dp.addr2">
+               <delay_constant max="132e-12" in_port="memory.addr2[9:0]" out_port="mem_1024x20_dp.addr2"/>
+             </direct>
+             <direct name="data1" input="memory.data[19:0]" output="mem_1024x20_dp.data1">
+               <delay_constant max="132e-12" in_port="memory.data[19:0]" out_port="mem_1024x20_dp.data1"/>
+             </direct>
+             <direct name="data2" input="memory.data[39:20]" output="mem_1024x20_dp.data2">
+               <delay_constant max="132e-12" in_port="memory.data[39:20]" out_port="mem_1024x20_dp.data2"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_1024x20_dp.we1">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_1024x20_dp.we1"/>
+             </direct>
+             <direct name="writeen2" input="memory.we2" output="mem_1024x20_dp.we2">
+               <delay_constant max="132e-12" in_port="memory.we2" out_port="mem_1024x20_dp.we2"/>
+             </direct>
+             <direct name="dataout1" input="mem_1024x20_dp.out1" output="memory.out[19:0]">
+               <delay_constant max="40e-12" in_port="mem_1024x20_dp.out1" out_port="memory.out[19:0]"/>
+             </direct>
+             <direct name="dataout2" input="mem_1024x20_dp.out2" output="memory.out[39:20]">
+               <delay_constant max="40e-12" in_port="mem_1024x20_dp.out2" out_port="memory.out[39:20]"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_1024x20_dp.clk">
+             </direct>
+           </interconnect>
+         </mode>
+
+         <!-- True dual port memory mode, 2048 words x 10 bits: each of the two ports has an 11-bit
+              address, a 10-bit data input, a write enable and a 10-bit data output. Both ports
+              share a single clock. -->
+         <mode name="mem_2048x10_dp">
+           <pb_type name="mem_2048x10_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+             <!-- 11 address pins per port: 2^11 = 2048 addressable words -->
+             <input name="addr1" num_pins="11" port_class="address1"/>
+             <input name="addr2" num_pins="11" port_class="address2"/>
+             <!-- 10 data pins per port: word width of this mode -->
+             <input name="data1" num_pins="10" port_class="data_in1"/>
+             <input name="data2" num_pins="10" port_class="data_in2"/>
+             <input name="we1" num_pins="1" port_class="write_en1"/>
+             <input name="we2" num_pins="1" port_class="write_en2"/>
+             <output name="out1" num_pins="10" port_class="data_out1"/>
+             <output name="out2" num_pins="10" port_class="data_out2"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <!-- Every input (address, data, write enable of both ports) has the same setup value
+                  relative to clk. NOTE(review): values are presumably in seconds (509e-12 = 509 ps);
+                  confirm against the VTR architecture reference. -->
+             <T_setup value="509e-12" port="mem_2048x10_dp.addr1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.data1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.we1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.addr2" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.data2" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.we2" clock="clk"/>
+             <!-- Both data outputs have the same maximum clock-to-Q value -->
+             <T_clock_to_Q max="1.234e-9" port="mem_2048x10_dp.out1" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_2048x10_dp.out2" clock="clk"/>
+             <!-- Power is modelled per toggle of the clk pin only; static power is set to zero -->
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="17.9e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <!-- Direct connections between the parent "memory" block ports and this primitive.
+                Only the low-order bits of the parent buses are used in this mode. All input
+                connections carry a 132e-12 delay, the output connections carry 40e-12. -->
+           <interconnect>
+             <direct name="address1" input="memory.addr1[10:0]" output="mem_2048x10_dp.addr1">
+               <delay_constant max="132e-12" in_port="memory.addr1[10:0]" out_port="mem_2048x10_dp.addr1"/>
+             </direct>
+             <direct name="address2" input="memory.addr2[10:0]" output="mem_2048x10_dp.addr2">
+               <delay_constant max="132e-12" in_port="memory.addr2[10:0]" out_port="mem_2048x10_dp.addr2"/>
+             </direct>
+             <!-- The two write-data ports share the parent's single data bus:
+                  port 1 takes bits [9:0], port 2 takes bits [19:10] -->
+             <direct name="data1" input="memory.data[9:0]" output="mem_2048x10_dp.data1">
+               <delay_constant max="132e-12" in_port="memory.data[9:0]" out_port="mem_2048x10_dp.data1"/>
+             </direct>
+             <direct name="data2" input="memory.data[19:10]" output="mem_2048x10_dp.data2">
+               <delay_constant max="132e-12" in_port="memory.data[19:10]" out_port="mem_2048x10_dp.data2"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_2048x10_dp.we1">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_2048x10_dp.we1"/>
+             </direct>
+             <direct name="writeen2" input="memory.we2" output="mem_2048x10_dp.we2">
+               <delay_constant max="132e-12" in_port="memory.we2" out_port="mem_2048x10_dp.we2"/>
+             </direct>
+             <!-- The two read-data ports share the parent's single out bus with the same bit split
+                  as the write data: port 1 drives bits [9:0], port 2 drives bits [19:10] -->
+             <direct name="dataout1" input="mem_2048x10_dp.out1" output="memory.out[9:0]">
+               <delay_constant max="40e-12" in_port="mem_2048x10_dp.out1" out_port="memory.out[9:0]"/>
+             </direct>
+             <direct name="dataout2" input="mem_2048x10_dp.out2" output="memory.out[19:10]">
+               <delay_constant max="40e-12" in_port="mem_2048x10_dp.out2" out_port="memory.out[19:10]"/>
+             </direct>
+             <!-- The clock connection has no delay annotation -->
+             <direct name="clk" input="memory.clk" output="mem_2048x10_dp.clk">
+             </direct>
+           </interconnect>
+         </mode>
+
+         <!-- Every input pin can be driven by 15% of the tracks in a channel; every output pin can drive 10% of the tracks in a channel -->
+         <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+         <!-- NOTE(review): pattern="spread" presumably distributes the pins around the sides of the block; confirm in the VTR pinlocations documentation -->
+         <pinlocations pattern="spread"/>
+
+         <!-- NOTE(review): the comment originally placed here read "Place this memory block every 8 columns
+              from (and including) the second column", but no placement element follows it. The column
+              placement is presumably specified in the layout section of this file; confirm there. -->
+         <!-- Power for this block is obtained by summing the power of its children -->
+         <power method="sum-of-children"/>
+       </pb_type>
+       <!-- Define fracturable memory end -->
+
+
+    </complexblocklist>
+
+    <!-- Architecture-wide parameters for power estimation.
+         NOTE(review): the meanings given below follow the VTR power estimation documentation,
+         not anything visible in this file; confirm units and semantics there. -->
+    <power>
+      <!-- Wire capacitance assumed for local interconnect (same value as the clock C_wire below) -->
+      <local_interconnect C_wire="2.5e-10"/>
+      <!-- Presumably sizes expressed relative to a minimum-size transistor; TODO confirm -->
+      <mux_transistor_size mux_transistor_size="3"/>
+      <FF_size FF_size="4"/>
+      <LUT_transistor_size LUT_transistor_size="4"/>
+    </power>
+
+    <!-- Clock network description: a single clock entry whose buffer size is left to the tool
+         ("auto") and whose wire capacitance is 2.5e-10.
+         NOTE(review): presumably consumed by the power estimator; confirm in the VTR documentation. -->
+    <clocks>
+      <clock buffer_size="auto" C_wire="2.5e-10"/>
+    </clocks>
+</architecture>