arch: Added an architecture file for the 4-bit Adder Double Chain
architecture
diff --git a/vtr_flow/arch/timing/4bit_adder_double_chain_arch.xml b/vtr_flow/arch/timing/4bit_adder_double_chain_arch.xml
new file mode 100644
index 0000000..896aa63
--- /dev/null
+++ b/vtr_flow/arch/timing/4bit_adder_double_chain_arch.xml
@@ -0,0 +1,968 @@
+<!--
+    This is the architecture file for the 4-bit Adder Double Chain Architecture proposed in [1].
+    Delays for routing and logic blocks come from COFFE runs for a 22 nm technology node.
+    Delays for DSP blocks and BRAMs come from Arria 10 (20 nm) delays.
+
+    This is a Stratix-10-like architecture with a modified arithmetic mode.
+
+    This architecture has 10 ALMs per cluster, where each ALM is a 6-LUT fracturable into 
+    two 5-LUTs. The ALM has 8 inputs and 4 optionally registered outputs. The two 5-LUTs must
+    share at least two inputs. The ALM outputs are logically equivalent in pairs, which means any
+    output signal that can reach ALM.out[0] can also reach ALM.out[1], and likewise for
+    ALM.out[2] and ALM.out[3]. This architecture has an arithmetic mode where each 5-LUT
+    is fractured into two 4-LUTs and each 4-LUT into two 3-LUTs. This results in a total of
+    eight 3-LUTs per ALM and four bits of addition. This architecture has two separate carry chains
+    with separate start (Cin[0] and Cin[1]) and end (Cout[0] and Cout[1]) points. The adders of each
+    group of 5 ALMs are connected together by a carry chain: chain 1 connects ALM[0] : ALM[4], while
+    chain 2 connects ALM[5] : ALM[9].
+
+    The LAB has 60 inputs and 40 outputs. Two outputs of each ALM are fed to the right and
+    left LAB using direct links and are also fed back to the LAB as feedback connections sharing
+    the 60 input ports with the signals coming from the routing channels.
+
+    The architecture also has a 20Kb memory that has true and simple dual port modes. In simple
+    dual port mode the memory can be configured in the following modes: 512x40, 1024x20 and 2048x10,
+    while in true dual port mode it can be configured as: 1024x20 and 2048x10.
+
+    In addition, the architecture has a 27x27 DSP block that can be fractured into two 18x19 DSPs.
+
+
+    [1] M. Eldafrawy, A. Boutros, S. Yazdanshenas, and V. Betz, "FPGA Logic Block Architectures for efficient
+        multiplication and addition to enhance machine learning performance," in Transactions on Reconfigurable
+        Technology and Systems (TRETS), 2019
+
+-->
+<architecture>
+    <!-- 
+         ODIN II specific config begins 
+         Describes the types of user-specified netlist blocks (in blif, this corresponds to 
+         ".model [type_of_block]") that this architecture supports.
+
+         Note: Basic LUTs, I/Os, and flip-flops are not included here as there are 
+         already special structures in blif (.names, .input, .output, and .latch) 
+         that describe them.
+    -->
+    <models>
+      <model name="multiply"> <!-- purely combinational: a and b both drive out -->
+        <input_ports>
+          <port name="a" combinational_sink_ports="out"/>
+          <port name="b" combinational_sink_ports="out"/>
+        </input_ports>
+        <output_ports>
+          <port name="out"/>
+        </output_ports>
+      </model>
+
+      <model name="single_port_ram"> <!-- every non-clock port is timed against clk -->
+        <input_ports>
+          <port name="we" clock="clk"/>    <!-- control signal -->
+          <port name="addr" clock="clk"/>  <!-- address lines -->
+          <port name="data" clock="clk"/>  <!-- data lines; may be split into narrower bit widths, minimum size 1 -->
+          <port name="clk" is_clock="1"/>  <!-- clock of the memory -->
+        </input_ports>
+        <output_ports>
+          <port name="out" clock="clk"/>   <!-- output; may be split into narrower bit widths, minimum size 1 -->
+        </output_ports>
+      </model>
+
+      <model name="dual_port_ram"> <!-- two sets of we/addr/data/out ports (suffix 1 and 2) timed against one clk -->
+        <input_ports>
+          <port name="we1" clock="clk"/>    <!-- write enable, port 1 -->
+          <port name="we2" clock="clk"/>    <!-- write enable, port 2 -->
+          <port name="addr1" clock="clk"/>  <!-- address lines, port 1 -->
+          <port name="addr2" clock="clk"/>  <!-- address lines, port 2 -->
+          <port name="data1" clock="clk"/>  <!-- data lines, port 1; may be split into narrower bit widths, minimum size 10 -->
+          <port name="data2" clock="clk"/>  <!-- data lines, port 2; may be split into narrower bit widths, minimum size 10 -->
+          <port name="clk" is_clock="1"/>   <!-- clock of the memory -->
+        </input_ports>
+        <output_ports>
+          <port name="out1" clock="clk"/>   <!-- output, port 1; may be split into narrower bit widths, minimum size 10 -->
+          <port name="out2" clock="clk"/>   <!-- output, port 2; may be split into narrower bit widths, minimum size 10 -->
+        </output_ports>
+      </model>
+
+      <model name="adder"> <!-- one adder bit: a, b and cin each reach both sumout and cout combinationally -->
+        <input_ports>
+          <port name="a" combinational_sink_ports="sumout cout"/>
+          <port name="b" combinational_sink_ports="sumout cout"/>
+          <port name="cin" combinational_sink_ports="sumout cout"/>
+        </input_ports>
+        <output_ports>
+          <port name="cout"/>
+          <port name="sumout"/>
+        </output_ports>
+      </model>
+    </models> <!-- ODIN II specific config ends -->
+
+    <layout> <!-- Physical descriptions begin -->
+      <auto_layout aspect_ratio="1.0">
+        <!-- Ring of 'io' blocks on the perimeter, with 'EMPTY' blocks at the corners -->
+        <perimeter type="io" priority="100"/>
+        <corners type="EMPTY" priority="101"/>
+        <!-- Fill the grid with 'clb' (lowest priority of all rules here) -->
+        <fill type="clb" priority="10"/>
+        <!-- Columns of 'mult_27', with 'EMPTY' blocks wherever a 'mult_27' does not fit. starty="1" offsets the column past the perimeter. -->
+        <col type="mult_27" startx="6" repeatx="8" starty="1" priority="20"/>
+        <col type="EMPTY" startx="6" repeatx="8" starty="1" priority="19"/>
+        <!-- Columns of 'memory', with 'EMPTY' blocks wherever a 'memory' does not fit. starty="1" offsets the column past the perimeter. -->
+        <col type="memory" startx="2" repeatx="8" starty="1" priority="20"/>
+        <col type="EMPTY" startx="2" repeatx="8" starty="1" priority="19"/>
+      </auto_layout>
+    </layout>
+
+    <device>
+      <!-- The sizing and area values below were generated using COFFE at 22nm technology -->
+      <sizing R_minW_nmos="13090" R_minW_pmos="19086.83"/>
+      <area grid_logic_tile_area="25201.9"/>
+      <chan_width_distr>
+        <x distr="uniform" peak="1.000000"/>
+        <y distr="uniform" peak="1.000000"/>
+      </chan_width_distr>
+      <switch_block type="wilton" fs="3"/>
+      <connection_block input_switch_name="ipin_cblock"/>
+    </device>
+
+    <switchlist>
+      <!-- Values generated using COFFE at 22nm technology; R, Cin and Cout are all 0.0, leaving Tdel as the only non-zero timing parameter -->
+      <switch name="0" type="mux" R="0.0" Cin="0.0" Cout="0.0" Tdel="237e-12" mux_trans_size="2.173" buf_size="34.22"/>
+      <switch name="ipin_cblock" type="mux" R="0.0" Cin="0.0" Cout="0.0" Tdel="146.4e-12" mux_trans_size="1.508" buf_size="12.286"/>
+    </switchlist>
+
+    <segmentlist>
+      <segment freq="1.000000" length="4" type="unidir" Rmetal="0.0" Cmetal="0.0"> <!-- the only wire type: unidirectional, length 4 -->
+        <mux name="0"/> <!-- refers to the switch named "0" in the switchlist -->
+        <sb type="pattern">1 1 1 1 1</sb>
+        <cb type="pattern">1 1 1 1</cb>
+      </segment>
+    </segmentlist>
+
+    <directlist>
+      <!-- Direct links connecting the two carry chains of one LAB to the LAB below it (y_offset="-1") -->
+      <direct name="adder_carry1" from_pin="clb.cout[0:0]" to_pin="clb.cin[0:0]" x_offset="0" y_offset="-1" z_offset="0"/>
+      <direct name="adder_carry2" from_pin="clb.cout[1:1]" to_pin="clb.cin[1:1]" x_offset="0" y_offset="-1" z_offset="0"/>
+      
+      <!-- Direct connect to the LAB on the right (x_offset="1"); lands on pins [9:5] of I1..I4 -->
+      <direct name="direct_right_1" from_pin="clb.O[4:0]" to_pin="clb.I1[9:5]" x_offset="1" y_offset="0" z_offset="0"/>
+      <direct name="direct_right_2" from_pin="clb.O[24:20]" to_pin="clb.I2[9:5]" x_offset="1" y_offset="0" z_offset="0"/>
+      <direct name="direct_right_3" from_pin="clb.O[9:5]" to_pin="clb.I3[9:5]" x_offset="1" y_offset="0" z_offset="0"/>
+      <direct name="direct_right_4" from_pin="clb.O[29:25]" to_pin="clb.I4[9:5]" x_offset="1" y_offset="0" z_offset="0"/>
+      
+      <!-- Direct connect to the LAB on the left (x_offset="-1"); lands on pins [14:10] of I1..I4 -->
+      <direct name="direct_left_1" from_pin="clb.O[14:10]" to_pin="clb.I1[14:10]" x_offset="-1" y_offset="0" z_offset="0"/>
+      <direct name="direct_left_2" from_pin="clb.O[34:30]" to_pin="clb.I2[14:10]" x_offset="-1" y_offset="0" z_offset="0"/>
+      <direct name="direct_left_3" from_pin="clb.O[19:15]" to_pin="clb.I3[14:10]" x_offset="-1" y_offset="0" z_offset="0"/>
+      <direct name="direct_left_4" from_pin="clb.O[39:35]" to_pin="clb.I4[14:10]" x_offset="-1" y_offset="0" z_offset="0"/>
+    </directlist>
+
+    <complexblocklist>
+
+      <!-- Define I/O pads begin -->
+      <!-- Capacity is a unique property of I/Os, it is the maximum number of I/Os that can be placed at the same (X,Y) location on the FPGA -->
+      <!-- Not sure of the area of an I/O (varies widely), and it's not relevant to the design of the FPGA core, so we're setting it to 0. -->
+      <pb_type name="io" capacity="8" area="0">
+        <input name="outpad" num_pins="1"/>
+        <output name="inpad" num_pins="1"/>
+        <clock name="clock" num_pins="1"/>
+
+        <!-- IOs can operate as either inputs or outputs (one mode for each below).
+             Delays below come from Ian Kuon. They are small, so they should be interpreted as
+             the delays to and from registers in the I/O (generally I/Os are registered
+             today, and that is when you timing analyze them).
+        -->
+        <mode name="inpad">
+          <pb_type name="inpad" blif_model=".input" num_pb="1">
+            <output name="inpad" num_pins="1"/>
+          </pb_type>
+          <interconnect>
+            <direct name="inpad" input="inpad.inpad" output="io.inpad">
+              <delay_constant max="4.243e-11" in_port="inpad.inpad" out_port="io.inpad"/>
+            </direct>
+          </interconnect>
+        </mode>
+        <mode name="outpad">
+          <pb_type name="outpad" blif_model=".output" num_pb="1">
+            <input name="outpad" num_pins="1"/>
+          </pb_type>
+          <interconnect>
+            <direct name="outpad" input="io.outpad" output="outpad.outpad">
+              <delay_constant max="1.394e-11" in_port="io.outpad" out_port="outpad.outpad"/>
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin drives 10% of the tracks in a channel -->
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+
+        <!-- IOs go on the periphery of the FPGA. For consistency, the pins are made physically
+          equivalent on all sides so that only one definition of I/Os is needed.
+          Without a physically equivalent definition, 4 different I/Os would have to be defined, one for each side of the FPGA.
+        -->
+        <pinlocations pattern="custom">
+          <loc side="left">io.outpad io.inpad io.clock</loc>
+          <loc side="top">io.outpad io.inpad io.clock</loc>
+          <loc side="right">io.outpad io.inpad io.clock</loc>
+          <loc side="bottom">io.outpad io.inpad io.clock</loc>
+        </pinlocations>
+
+        <!-- I/O pads are left out of power estimation (method="ignore") -->
+        <power method="ignore"/>
+      </pb_type>
+      <!-- Define I/O pads ends -->
+
+      <!-- Define general purpose logic block (CLB) begin -->
+      <pb_type name="clb">
+        <input name="I1" num_pins="15" equivalent="full"/>
+		<input name="I2" num_pins="15" equivalent="full"/>
+		<input name="I3" num_pins="15" equivalent="full"/>
+		<input name="I4" num_pins="15" equivalent="full"/>
+        <input name="cin" num_pins="2"/>
+        <output name="O" num_pins="40" equivalent="none"/>
+        <output name="cout" num_pins="2"/>
+        <clock name="clk" num_pins="1"/>
+        <pb_type name="lab" num_pb="1">
+          <input name="I1" num_pins="15"/>
+          <input name="I2" num_pins="15"/>
+          <input name="I3" num_pins="15"/>
+          <input name="I4" num_pins="15"/>
+          <input name="cin" num_pins="2"/>
+          <output name="O" num_pins="40"/>
+          <output name="cout" num_pins="2"/>
+          <clock name="clk" num_pins="1"/>
+          <!-- Describe fracturable logic element.  
+               Each fracturable logic element has a 6-LUT that can alternatively operate as two 5-LUTs with two shared inputs. 
+               The outputs of the fracturable logic element can be optionally registered
+          -->
+          <pb_type name="fle" num_pb="10">
+            <input name="in" num_pins="8"/>
+            <input name="cin" num_pins="1"/>
+            <output name="out" num_pins="4"/>
+            <output name="cout" num_pins="1"/>
+            <clock name="clk" num_pins="1"/>
+            <!--
+                  The ALM inputs map to the fle input pins as follows:
+                          A -> fle.in[0]
+                          B -> fle.in[1]
+                          C -> fle.in[2]
+                          D -> fle.in[3]
+                          E -> fle.in[4]
+                          F -> fle.in[5]
+                          G -> fle.in[6]
+                          H -> fle.in[7]
+            -->
+            <mode name="n2_lut5">
+              <pb_type name="ble5" num_pb="2">
+                <input name="in" num_pins="5"/>
+                <input name="cin" num_pins="1"/>
+                <output name="out" num_pins="2"/>
+                <output name="cout" num_pins="1"/>
+                <clock name="clk" num_pins="1"/> 
+                <mode name="blut5">
+                  <pb_type name="flut5" num_pb="1">
+                    <input name="in" num_pins="5"/>
+                    <output name="out" num_pins="2"/>
+                    <clock name="clk" num_pins="1"/>
+                    <!-- Regular LUT mode: one 5-LUT plus two flip-flops -->
+                    <pb_type name="lut5" blif_model=".names" num_pb="1" class="lut">
+                      <input name="in" num_pins="5" port_class="lut_in"/>
+                      <output name="out" num_pins="1" port_class="lut_out"/>
+                      <!-- LUT timing is given as a delay matrix, one entry per input -->
+                      <!-- Physical per-input delays of the 4bit Adder architecture are listed below. Because VPR cannot do
+                           LUT rebalancing, the matrix uses their average (163.2e-12) on every input for more stable results.
+                           These are the 6-LUT delays of inputs A - E minus the delay of the last mux stage, i.e. the delay
+                           from inputs A - E to the 5-LUT output:
+                           219.86e-12
+                           216.03e-12
+                           197.86e-12
+                           110.77e-12
+                           71.48e-12
+                      -->
+                      <delay_matrix type="max" in_port="lut5.in" out_port="lut5.out">
+                        163.2e-12
+                        163.2e-12
+                        163.2e-12
+                        163.2e-12
+                        163.2e-12
+                      </delay_matrix>
+                    </pb_type>
+                    <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop">
+                      <input name="D" num_pins="1" port_class="D"/>
+                      <output name="Q" num_pins="1" port_class="Q"/>
+                      <clock name="clk" num_pins="1" port_class="clock"/>
+                      <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                      <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                    </pb_type>
+                    <interconnect>
+                      <direct name="lut5_in" input="flut5.in" output="lut5.in"/>
+                      <direct name="reg_in" input="flut5.in[0]" output="ff[0].D"/> <!-- ff[0] takes input 0 directly, without going through the LUT -->
+                      <direct name="lut5_ff" input="lut5.out" output="ff[1].D"> <!-- ff[1] takes the LUT output -->
+                        <pack_pattern name="ble5" in_port="lut5.out" out_port="ff[1].D"/>
+                        <delay_constant max="18.1e-12" in_port="lut5.out" out_port="ff[1].D"/>
+                      </direct>
+                      <complete name="clock" input="flut5.clk" output="ff.clk"/>
+                      <complete name="out_mux" input="ff.Q lut5.out" output="flut5.out"> <!-- each output selects among the ff outputs and the LUT output -->
+                        <delay_constant max="43.87e-12" in_port="lut5.out" out_port="flut5.out"/>
+                        <delay_constant max="43.87e-12" in_port="ff.Q" out_port="flut5.out"/>
+                      </complete>
+                    </interconnect>
+                  </pb_type>
+                  <interconnect> <!-- ble5.cin and ble5.cout are not connected in this mode -->
+                    <direct name="direct1" input="ble5.in" output="flut5.in"/>
+                    <direct name="direct2" input="ble5.clk" output="flut5.clk"/>
+                    <direct name="direct3" input="flut5.out" output="ble5.out"/>
+                  </interconnect>
+                </mode>
+                <!-- Arithmetic mode: the 5-LUT becomes two lut4 slices, each holding two 3-LUTs that drive one adder bit -->
+                <mode name="arithmetic">
+                  <pb_type name="arithmetic" num_pb="1">
+                    <input name="in" num_pins="5"/>
+                    <input name="cin" num_pins="1"/>
+                    <output name="out" num_pins="2"/>
+                    <output name="cout" num_pins="1"/>
+                    <clock name="clk" num_pins="1"/>
+                    <pb_type name="lut4" num_pb="2">
+                      <input name="in" num_pins="4"/>
+                      <input name="cin" num_pins="1"/>
+                      <output name="out" num_pins="1"/>
+                      <output name="cout" num_pins="1"/>
+                      <clock name="clk" num_pins="1"/>
+                      <pb_type name="lut3" blif_model=".names" num_pb="2" class="lut">
+                        <input name="in" num_pins="3" port_class="lut_in"/>
+                        <output name="out" num_pins="1" port_class="lut_out"/>
+                        <!-- LUT timing is given as a delay matrix, one entry per input -->
+                        <!-- Physical per-input delays of the 4bit Adder architecture are listed below. Because VPR cannot do
+                             LUT rebalancing, the matrix uses their average on every input for more stable results. These are
+                             the delays from inputs A - C to the 3-LUT output:
+                             138.45e-12
+                             134.62e-12
+                             116.45e-12
+                        -->
+                        <!-- average of the three delays above: 129.84e-12 -->
+                        <delay_matrix type="max" in_port="lut3.in" out_port="lut3.out">
+                          129.84e-12
+                          129.84e-12
+                          129.84e-12
+                        </delay_matrix>
+                      </pb_type>
+                      <pb_type name="adder" blif_model=".subckt adder" num_pb="1">
+                        <input name="a" num_pins="1"/>
+                        <input name="b" num_pins="1"/>
+                        <input name="cin" num_pins="1"/>
+                        <output name="cout" num_pins="1"/>
+                        <output name="sumout" num_pins="1"/>
+                        <delay_constant max="65.36e-12" in_port="adder.a" out_port="adder.sumout"/>
+                        <delay_constant max="65.36e-12" in_port="adder.b" out_port="adder.sumout"/>
+                        <delay_constant max="36.93e-12" in_port="adder.cin" out_port="adder.sumout"/>
+                        <delay_constant max="44.01e-12" in_port="adder.a" out_port="adder.cout"/>
+                        <delay_constant max="44.01e-12" in_port="adder.b" out_port="adder.cout"/>
+                        <delay_constant max="23.18e-12" in_port="adder.cin" out_port="adder.cout"/>
+                      </pb_type>
+                      <pb_type name="ff" blif_model=".latch" num_pb="1" class="flipflop">
+                        <input name="D" num_pins="1" port_class="D"/>
+                        <output name="Q" num_pins="1" port_class="Q"/>
+                        <clock name="clk" num_pins="1" port_class="clock"/>
+                        <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                        <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                      </pb_type>
+                      <interconnect> <!-- wiring inside one lut4 slice -->
+                        <direct name="clock" input="lut4.clk" output="ff.clk"/>
+                        <direct name="lut_in1" input="lut4.in[1:0]" output="lut3[0:0].in[1:0]"/>
+                        <mux name="input_mux1" input="lut4.in[2:2] lut4.in[3:3]" output="lut3[0:0].in[2:2]">
+                          <delay_constant max="5.262e-12" in_port="lut4.in[2:2]" out_port="lut3[0:0].in[2:2]"/>
+                          <delay_constant max="5.262e-12" in_port="lut4.in[3:3]" out_port="lut3[0:0].in[2:2]"/>
+                        </mux>
+                        <direct name="lut_in2" input="lut4.in[1:0]" output="lut3[1:1].in[1:0]"/>
+                        <mux name="input_mux2" input="lut4.in[2:2] lut4.in[3:3]" output="lut3[1:1].in[2:2]">
+                          <delay_constant max="5.262e-12" in_port="lut4.in[2:2]" out_port="lut3[1:1].in[2:2]"/>
+                          <delay_constant max="5.262e-12" in_port="lut4.in[3:3]" out_port="lut3[1:1].in[2:2]"/>
+                        </mux>
+                        <direct name="lut_to_add1" input="lut3[0:0].out" output="adder.a"/>
+                        <direct name="lut_to_add2" input="lut3[1:1].out" output="adder.b"/>
+                        <direct name="add_to_ff" input="adder.sumout" output="ff.D">
+                          <delay_constant max="18.1e-12" in_port="adder.sumout" out_port="ff.D"/>
+                          <!--pack_pattern name="chain" in_port="adder.sumout" out_port="ff.D"/-->
+                        </direct>
+                        <direct name="carry_in" input="lut4.cin" output="adder.cin">
+                          <pack_pattern name="chain" in_port="lut4.cin" out_port="adder.cin"/>
+                        </direct>
+                        <direct name="carry_out" input="adder.cout" output="lut4.cout">
+                          <pack_pattern name="chain" in_port="adder.cout" out_port="lut4.cout"/>
+                        </direct>
+                        <mux name="sumout" input="ff.Q adder.sumout" output="lut4.out">
+                          <delay_constant max="43.87e-12" in_port="adder.sumout" out_port="lut4.out"/>
+                          <delay_constant max="43.87e-12" in_port="ff.Q" out_port="lut4.out"/>
+                        </mux>
+                      </interconnect>
+                    </pb_type>
+                    <interconnect> <!-- wiring between the arithmetic ports and the two lut4 slices -->
+                      <direct name="direct1" input="arithmetic.in[3:0]" output="lut4[0:0].in[3:0]"/>
+                      <direct name="direct2" input="arithmetic.in[2:0]" output="lut4[1:1].in[2:0]"/>
+                      <direct name="direct22" input="arithmetic.in[4:4]" output="lut4[1:1].in[3:3]"/>
+                      <direct name="carry_in" input="arithmetic.cin" output="lut4[0:0].cin">
+                        <pack_pattern name="chain" in_port="arithmetic.cin" out_port="lut4[0:0].cin"/>
+                      </direct>
+                      <direct name="carry_link" input="lut4[0:0].cout" output="lut4[1:1].cin">
+                        <pack_pattern name="chain" in_port="lut4[0:0].cout" out_port="lut4[1:1].cin"/>
+                      </direct>
+                      <direct name="carry_out" input="lut4[1:1].cout" output="arithmetic.cout">
+                        <pack_pattern name="chain" in_port="lut4[1:1].cout" out_port="arithmetic.cout"/>
+                      </direct>
+                      <complete name="complete1" input="arithmetic.clk" output="lut4[1:0].clk"/>
+                      <direct name="direct4" input="lut4[0:0].out" output="arithmetic.out[0:0]"/>
+                      <direct name="direct5" input="lut4[1:1].out" output="arithmetic.out[1:1]"/>
+                    </interconnect>
+                  </pb_type>
+                  <interconnect> <!-- ble5 interconnect in arithmetic mode -->
+                    <direct name="direct1" input="ble5.in" output="arithmetic.in"/>
+                    <direct name="carry_in" input="ble5.cin" output="arithmetic.cin">
+                      <pack_pattern name="chain" in_port="ble5.cin" out_port="arithmetic.cin"/>
+                    </direct>
+                    <direct name="carry_out" input="arithmetic.cout" output="ble5.cout">
+                      <pack_pattern name="chain" in_port="arithmetic.cout" out_port="ble5.cout"/>
+                    </direct>
+                    <direct name="direct2" input="ble5.clk" output="arithmetic.clk"/>
+                    <direct name="direct3" input="arithmetic.out" output="ble5.out"/>
+                  </interconnect>
+                </mode>
+              </pb_type>
+              <interconnect>
+                <!-- fle.in[0] and fle.in[1] are shared by the two 5-LUTs (they land on swapped pin positions of ble5[1]) -->
+                <complete name="lut5_reg1" input="fle.in[0]" output="ble5[0].in[0] ble5[1].in[1]"/>
+                <complete name="lut5_reg2" input="fle.in[1]" output="ble5[0].in[1] ble5[1].in[0]"/>
+
+                <!-- Rest of the 5-LUT inputs: fle.in[4:2] go to ble5[0], fle.in[7:5] go to ble5[1] -->
+                <direct name="lut5_inputs_1" input="fle.in[4:2]" output="ble5[0].in[4:2]"/>
+                <direct name="lut5_inputs_22" input="fle.in[7:5]" output="ble5[1].in[4:2]"/>
+
+                <direct name="lut5_outputs_1" input="ble5[0].out" output="fle.out[1:0]"/>
+                <direct name="lut5_outputs_2" input="ble5[1].out" output="fle.out[3:2]"/>
+                <!-- Carry path: fle.cin -> ble5[0] -> ble5[1] -> fle.cout; every hop carries the "chain" pack pattern -->
+                <direct name="carry_in" input="fle.cin" output="ble5[0].cin">
+                  <pack_pattern name="chain" in_port="fle.cin" out_port="ble5[0].cin"/>
+                </direct>
+                <direct name="carry_out" input="ble5[1].cout" output="fle.cout">
+                  <pack_pattern name="chain" in_port="ble5[1].cout" out_port="fle.cout"/>
+                </direct>
+                <direct name="carry_link" input="ble5[0].cout" output="ble5[1].cin">
+                  <pack_pattern name="chain" in_port="ble5[0].cout" out_port="ble5[1].cin"/> <!-- ports match the enclosing direct -->
+                </direct>
+                <complete name="clock" input="fle.clk" output="ble5[1:0].clk"/>
+              </interconnect>
+            </mode> <!-- n2_lut5 -->
+            <mode name="n1_lut6">
+              <pb_type name="ble6" num_pb="1">
+                <input name="in" num_pins="6"/>
+                <output name="out" num_pins="4"/>
+                <clock name="clk" num_pins="1"/>
+                <pb_type name="lut6" blif_model=".names" num_pb="1" class="lut">
+                  <input name="in" num_pins="6" port_class="lut_in"/>
+                  <output name="out" num_pins="1" port_class="lut_out"/>
+                  <!-- LUT timing is given as a delay matrix, one entry per input -->
+                  <!-- Physical per-input delays of the 4bit Adder architecture LUT are listed below. Because VPR cannot do
+                       LUT rebalancing, the matrix uses their average (186.12e-12) on every input for more stable results:
+                       264.27e-12
+                       260.44e-12
+                       242.27e-12
+                       155.18e-12
+                       115.89e-12
+                       78.67e-12
+                  -->
+                  <delay_matrix type="max" in_port="lut6.in" out_port="lut6.out">
+                    186.12e-12
+                    186.12e-12
+                    186.12e-12
+                    186.12e-12
+                    186.12e-12
+                    186.12e-12
+                  </delay_matrix>
+                </pb_type>
+                <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop">
+                  <input name="D" num_pins="1" port_class="D"/>
+                  <output name="Q" num_pins="1" port_class="Q"/>
+                  <clock name="clk" num_pins="1" port_class="clock"/>
+                  <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                  <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                </pb_type>
+                <interconnect>
+                  <direct name="lut6_inputs" input="ble6.in" output="lut6.in"/>
+                  <direct name="lut6_ff" input="lut6.out" output="ff[1].D"> <!-- ff[1] takes the LUT output -->
+                    <delay_constant max="18.1e-12" in_port="lut6.out" out_port="ff[1].D"/>
+                    <pack_pattern name="ble6" in_port="lut6.out" out_port="ff[1].D"/>
+                  </direct>
+                  <complete name="clock" input="ble6.clk" output="ff.clk"/>
+                  <direct name="input_to_ff" input="ble6.in[0]" output="ff[0].D"/> <!-- ff[0] takes input 0 directly, without going through the LUT -->
+                  <complete name="mux1" input="ff[0].Q lut6.out" output="ble6.out[1:0]">
+                    <delay_constant max="43.87e-12" in_port="lut6.out" out_port="ble6.out[1:0]"/>
+                    <delay_constant max="43.87e-12" in_port="ff[0].Q" out_port="ble6.out[1:0]"/>
+                  </complete>
+                  <complete name="mux2" input="ff[1].Q lut6.out" output="ble6.out[3:2]">
+                    <delay_constant max="43.87e-12" in_port="lut6.out" out_port="ble6.out[3:2]"/>
+                    <delay_constant max="43.87e-12" in_port="ff[1].Q" out_port="ble6.out[3:2]"/>
+                  </complete>
+                </interconnect>
+              </pb_type>
+              <interconnect>
+                <!-- ble6 takes six inputs: fle.in[4:0] (A - E) plus fle.in[7] as the sixth LUT input; fle.in[5] and fle.in[6] are unused in this mode -->
+                <direct name="lut6_inputs1" input="fle.in[4:0]" output="ble6.in[4:0]"/>
+                <direct name="lut6_inputs2" input="fle.in[7]" output="ble6.in[5]"/>
+                <direct name="direct2" input="ble6.out" output="fle.out"/>
+                <direct name="direct4" input="fle.clk" output="ble6.clk"/>
+              </interconnect>
+            </mode> <!-- n1_lut6 -->
+          </pb_type>
+          <interconnect>
+            <!-- Local input crossbar of the LAB. We use a 50% depopulated crossbar built from small
+                 full crossbars to get sets of logically equivalent pins at the inputs of the CLB.
+                 Input pin k of all ten fles (fle[9:0].in[k]) is driven by two of the four LAB input
+                 ports, and every path through the crossbar is annotated with the same 75.11 ps delay:
+                   in[0]: I4, I3    in[1]: I3, I2    in[2]: I2, I1    in[3]: I4, I2
+                   in[4]: I3, I1    in[5]: I4, I1    in[6]: I4, I3    in[7]: I3, I2 -->
+            <complete name="lutA" input="lab.I4 lab.I3" output="fle[9:0].in[0:0]">
+              <delay_constant max="75.11e-12" in_port="lab.I4" out_port="fle.in[0:0]"/>
+              <delay_constant max="75.11e-12" in_port="lab.I3" out_port="fle.in[0:0]"/>
+            </complete>
+            <complete name="lutB" input="lab.I3 lab.I2" output="fle[9:0].in[1:1]">
+              <delay_constant max="75.11e-12" in_port="lab.I3" out_port="fle.in[1:1]"/>
+              <delay_constant max="75.11e-12" in_port="lab.I2" out_port="fle.in[1:1]"/>
+            </complete>
+            <complete name="lutC" input="lab.I2 lab.I1" output="fle[9:0].in[2:2]">
+              <delay_constant max="75.11e-12" in_port="lab.I2" out_port="fle.in[2:2]"/>
+              <delay_constant max="75.11e-12" in_port="lab.I1" out_port="fle.in[2:2]"/>
+            </complete>
+            <complete name="lutD" input="lab.I4 lab.I2" output="fle[9:0].in[3:3]">
+              <delay_constant max="75.11e-12" in_port="lab.I4" out_port="fle.in[3:3]"/>
+              <delay_constant max="75.11e-12" in_port="lab.I2" out_port="fle.in[3:3]"/>
+            </complete>
+            <complete name="lutE" input="lab.I3 lab.I1" output="fle[9:0].in[4:4]">
+              <delay_constant max="75.11e-12" in_port="lab.I3" out_port="fle.in[4:4]"/>
+              <delay_constant max="75.11e-12" in_port="lab.I1" out_port="fle.in[4:4]"/>
+            </complete>
+            <complete name="lutF" input="lab.I4 lab.I1" output="fle[9:0].in[5:5]">
+              <delay_constant max="75.11e-12" in_port="lab.I4" out_port="fle.in[5:5]"/>
+              <delay_constant max="75.11e-12" in_port="lab.I1" out_port="fle.in[5:5]"/>
+            </complete>
+            <complete name="lutG" input="lab.I4 lab.I3" output="fle[9:0].in[6:6]">
+              <delay_constant max="75.11e-12" in_port="lab.I4" out_port="fle.in[6:6]"/>
+              <delay_constant max="75.11e-12" in_port="lab.I3" out_port="fle.in[6:6]"/>
+            </complete>
+            <complete name="lutH" input="lab.I3 lab.I2" output="fle[9:0].in[7:7]">
+              <delay_constant max="75.11e-12" in_port="lab.I3" out_port="fle.in[7:7]"/>
+              <delay_constant max="75.11e-12" in_port="lab.I2" out_port="fle.in[7:7]"/>
+            </complete>
+
+            <!-- The LAB clock fans out to the clk pin of every fle; no delay is annotated here -->
+            <complete name="clks" input="lab.clk" output="fle[9:0].clk"/>
+
+            <!-- This way of specifying direct connections to the clb outputs is important because this
+                 architecture uses automatic spreading of opins. By grouping the output pins in this
+                 fashion (output k of all ten fles goes to lab.O[10k+9:10k]), if a logic block is
+                 completely filled by 6-LUTs, then the outputs those 6-LUTs take get evenly distributed
+                 across all four sides of the CLB instead of clumped on two sides (which is what happens
+                 with a more naive specification).
+            -->
+            <direct name="labouts11" input="fle[9:0].out[0]" output="lab.O[9:0]"/>
+            <direct name="labouts12" input="fle[9:0].out[1]" output="lab.O[19:10]"/>
+            <direct name="labouts13" input="fle[9:0].out[2]" output="lab.O[29:20]"/>
+            <direct name="labouts14" input="fle[9:0].out[3]" output="lab.O[39:30]"/>
+
+            <!-- Carry chain links. The LAB holds two independent five-fle carry chains, both tagged
+                 with the pack pattern "chain":
+                   chain 1: lab.cin[0], fle[0] through fle[4], lab.cout[0]
+                   chain 2: lab.cin[1], fle[5] through fle[9], lab.cout[1]
+                 Only the two carry-in edges have a delay annotation. -->
+            <direct name="carry_in1" input="lab.cin[0:0]" output="fle[0:0].cin">
+              <!-- Put all inter-block carry chain delay of chain 1 on this one edge -->
+              <delay_constant max="17.8e-12" in_port="lab.cin[0:0]" out_port="fle[0:0].cin"/>
+              <pack_pattern name="chain" in_port="lab.cin[0:0]" out_port="fle[0:0].cin"/>
+            </direct>
+            <direct name="carry_in2" input="lab.cin[1:1]" output="fle[5:5].cin">
+              <!-- Put all inter-block carry chain delay of chain 2 on this one edge -->
+              <delay_constant max="18.47e-12" in_port="lab.cin[1:1]" out_port="fle[5:5].cin"/>
+              <pack_pattern name="chain" in_port="lab.cin[1:1]" out_port="fle[5:5].cin"/>
+            </direct>
+            <!-- Chain exits: the last fle of each chain drives the matching lab carry-out pin -->
+            <direct name="carry_out1" input="fle[4:4].cout" output="lab.cout[0:0]">
+              <pack_pattern name="chain" in_port="fle[4:4].cout" out_port="lab.cout[0:0]"/>
+            </direct>
+            <direct name="carry_out2" input="fle[9:9].cout" output="lab.cout[1:1]">
+              <pack_pattern name="chain" in_port="fle[9:9].cout" out_port="lab.cout[1:1]"/>
+            </direct>
+            <!-- Intra-LAB links: cout of fle[i] feeds cin of fle[i+1] within each group of five -->
+            <direct name="carry_link1" input="fle[3:0].cout" output="fle[4:1].cin">
+              <pack_pattern name="chain" in_port="fle[3:0].cout" out_port="fle[4:1].cin"/>
+            </direct>
+            <direct name="carry_link2" input="fle[8:5].cout" output="fle[9:6].cin">
+              <pack_pattern name="chain" in_port="fle[8:5].cout" out_port="fle[9:6].cin"/>
+            </direct>
+          </interconnect>
+        </pb_type>
+        <interconnect>
+
+          <!-- Each carry chain enters and leaves the lab through its own pair of direct connections
+               to the clb carry pins (index 0 for the first chain, index 1 for the second); no delay
+               is annotated on these edges -->
+          <direct name="carry_in1" input="clb.cin[0:0]" output="lab.cin[0:0]"/>
+          <direct name="carry_out1" input="lab.cout[0:0]" output="clb.cout[0:0]"/>
+          <direct name="carry_in2" input="clb.cin[1:1]" output="lab.cin[1:1]"/>
+          <direct name="carry_out2" input="lab.cout[1:1]" output="clb.cout[1:1]"/>
+
+          <!-- The clb clock is passed straight to the lab -->
+          <direct name="clock" input="clb.clk" output="lab.clk"/>
+
+          <!-- Local feedback signals from two outputs per ALM sharing the LAB inputs with the
+               routing channel inputs: each lab input port is driven through a complete crossbar by
+               the matching clb input port and by five lab outputs. The fed-back pins are lab.O[9:0]
+               and lab.O[29:20], which the lab output mapping assigns to out[0] and out[2] of the
+               fles. -->
+          <complete name="Input_feedback_I1" input="clb.I1 lab.O[4:0]" output="lab.I1"/>
+          <complete name="Input_feedback_I2" input="clb.I2 lab.O[24:20]" output="lab.I2"/>
+          <complete name="Input_feedback_I3" input="clb.I3 lab.O[9:5]" output="lab.I3"/>
+          <complete name="Input_feedback_I4" input="clb.I4 lab.O[29:25]" output="lab.I4"/>
+
+          <!-- The whole lab output bus is passed straight to the clb outputs -->
+          <direct name="output" input="lab.O" output="clb.O"/>
+        </interconnect>
+        <!-- Every input pin connects to 15% of the tracks in a channel and every output pin to 10%.
+             The carry pins (cin/cout) are overridden to an Fc of 0, i.e. no connection to the
+             routing tracks; presumably they are reached only through dedicated inter-block carry
+             links defined elsewhere in the file (confirm). -->
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10">
+            <fc_override port_name="cin" fc_type="frac" fc_val="0"/>
+            <fc_override port_name="cout" fc_type="frac" fc_val="0"/>
+        </fc>
+        <pinlocations pattern="spread"/>
+      </pb_type>
+       <!-- Define general purpose logic block (CLB) ends -->
+
+       <!-- Define fracturable multiplier begin -->
+       <!-- This multiplier can operate as a 27x27 multiplier that can fracture into two 18x19 multipliers.
+            The delays of this multiplier are from Arria 10, which is a 22nm chip. -->
+       <pb_type name="mult_27" height="2">
+         <!-- Top-level DSP ports: 74 data inputs and 74 data outputs. Which pin ranges are used
+              depends on the mode; see the interconnect of each mode below. -->
+         <input name="datain" num_pins="74"/>
+         <output name="dataout" num_pins="74"/>
+
+         <!-- Mode 1: two independent 18x19 multipliers. Each instance takes 18 + 19 inputs and
+              produces 37 outputs, so the two instances together use all 74 datain/dataout pins. -->
+         <mode name="two_mult_18x19">
+           <pb_type name="two_mult_18x19" num_pb="2">
+             <input name="a" num_pins="18"/>
+             <input name="b" num_pins="19"/>
+             <output name="out" num_pins="37"/>
+             <pb_type name="mult_18x19" blif_model=".subckt multiply" num_pb="1">
+               <input name="a" num_pins="18"/>
+               <input name="b" num_pins="19"/>
+               <output name="out" num_pins="37"/>
+               <!-- Using the numbers from Arria 10 which is a 22nm technology, an 18x19 multiplier
+                    can operate at 548 MHz which maps to a delay of 1.825e-9 -->
+               <delay_constant max="1.825e-9" in_port="mult_18x19.a" out_port="mult_18x19.out"/>
+               <delay_constant max="1.825e-9" in_port="mult_18x19.b" out_port="mult_18x19.out"/>
+             </pb_type>
+             <interconnect>
+               <!-- Wrapper ports map one-to-one onto the multiplier primitive; no delay annotated -->
+               <direct name="a2a" input="two_mult_18x19.a" output="mult_18x19.a"/>
+               <direct name="b2b" input="two_mult_18x19.b" output="mult_18x19.b"/>
+               <direct name="out2out" input="mult_18x19.out" output="two_mult_18x19.out"/>
+             </interconnect>
+             <!-- Power is modelled per toggle of the a/b input pins; static power is set to 0 -->
+             <power method="pin-toggle">
+               <port name="a" energy_per_toggle="1.09e-12"/>
+               <port name="b" energy_per_toggle="1.09e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <!-- Stratix IV input delay of 207ps is conservative for this architecture because this
+                  architecture does not have an input crossbar in the multiplier.
+                  Subtract 72.5 ps delay, which is already in the connection block input mux, leading
+                  to a 134 ps delay.
+                  Pin mapping: datain[17:0] and datain[36:18] are operands a and b of multiplier 0,
+                  datain[54:37] and datain[73:55] are operands a and b of multiplier 1.
+             -->
+             <direct name="datain2a1" input="mult_27.datain[17:0]" output="two_mult_18x19[0].a">
+               <delay_constant max="134e-12" in_port="mult_27.datain[17:0]" out_port="two_mult_18x19[0].a"/>
+             </direct>
+             <direct name="datain2b1" input="mult_27.datain[36:18]" output="two_mult_18x19[0].b">
+               <delay_constant max="134e-12" in_port="mult_27.datain[36:18]" out_port="two_mult_18x19[0].b"/>
+             </direct>
+             <direct name="datain2a2" input="mult_27.datain[54:37]" output="two_mult_18x19[1].a">
+               <delay_constant max="134e-12" in_port="mult_27.datain[54:37]" out_port="two_mult_18x19[1].a"/>
+             </direct>
+             <direct name="datain2b2" input="mult_27.datain[73:55]" output="two_mult_18x19[1].b">
+               <delay_constant max="134e-12" in_port="mult_27.datain[73:55]" out_port="two_mult_18x19[1].b"/>
+             </direct>
+             <!-- Both 37-bit products together fill the 74-bit dataout bus -->
+             <direct name="out2dataout" input="two_mult_18x19[1:0].out" output="mult_27.dataout">
+               <delay_constant max="1.09e-9" in_port="two_mult_18x19[1:0].out" out_port="mult_27.dataout"/>
+             </direct>
+           </interconnect>
+         </mode>
+
+         <!-- Mode 2: a single 27x27 multiplier. It uses datain[53:0] for its operands and
+              dataout[53:0] for its product; the remaining top-level pins are not connected. -->
+         <mode name="mult_27x27">
+           <pb_type name="one_mult_27x27" num_pb="1">
+             <input name="a" num_pins="27"/>
+             <input name="b" num_pins="27"/>
+             <output name="out" num_pins="54"/>
+
+             <pb_type name="mult_27x27" blif_model=".subckt multiply" num_pb="1">
+               <input name="a" num_pins="27"/>
+               <input name="b" num_pins="27"/>
+               <output name="out" num_pins="54"/>
+               <!-- Using the numbers from Arria 10 which is a 22nm technology, a 27x27 multiplier
+                    can operate at 541 MHz which maps to a delay of 1.848e-9 -->
+               <delay_constant max="1.848e-9" in_port="mult_27x27.a" out_port="mult_27x27.out"/>
+               <delay_constant max="1.848e-9" in_port="mult_27x27.b" out_port="mult_27x27.out"/>
+             </pb_type>
+             <interconnect>
+               <!-- Wrapper ports map one-to-one onto the multiplier primitive; no delay annotated -->
+               <direct name="a2a" input="one_mult_27x27.a" output="mult_27x27.a"/>
+               <direct name="b2b" input="one_mult_27x27.b" output="mult_27x27.b"/>
+               <direct name="out2out" input="mult_27x27.out" output="one_mult_27x27.out"/>
+             </interconnect>
+             <!-- Power is modelled per toggle of the a/b input pins; static power is set to 0 -->
+             <power method="pin-toggle">
+               <port name="a" energy_per_toggle="2.13e-12"/>
+               <port name="b" energy_per_toggle="2.13e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <!-- Stratix IV input delay of 207ps is conservative for this architecture because this
+                  architecture does not have an input crossbar in the multiplier.
+                  Subtract 72.5 ps delay, which is already in the connection block input mux, leading
+                  to a 134 ps delay.
+                  Pin mapping: datain[26:0] is operand a and datain[53:27] is operand b.
+             -->
+             <direct name="datain2a" input="mult_27.datain[26:0]" output="one_mult_27x27.a">
+               <delay_constant max="134e-12" in_port="mult_27.datain[26:0]" out_port="one_mult_27x27.a"/>
+             </direct>
+             <direct name="datain2b" input="mult_27.datain[53:27]" output="one_mult_27x27.b">
+               <delay_constant max="134e-12" in_port="mult_27.datain[53:27]" out_port="one_mult_27x27.b"/>
+             </direct>
+             <direct name="out2dataout" input="one_mult_27x27.out" output="mult_27.dataout[53:0]">
+               <delay_constant max="1.93e-9" in_port="one_mult_27x27.out" out_port="mult_27.dataout[53:0]"/>
+             </direct>
+           </interconnect>
+
+         </mode>
+
+         <!-- Every input pin connects to 15% of the tracks in a channel, every output pin to 10% -->
+         <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+         <pinlocations pattern="spread"/>
+
+         <!-- Place this multiplier block every 8 columns from (and including) the sixth column -->
+         <!-- Block power is the sum of the power of the child pb_types -->
+         <power method="sum-of-children"/>
+       </pb_type>
+       <!-- Define fracturable multiplier end -->
+
+       <!-- Define fracturable memory begin -->
+       <!-- The architecture also has a 20Kb memory that has true and simple dual port modes. In simple
+            dual port mode the memory can be configured in the following modes: 512x40, 1024x20 and 2048x10
+            While in true dual port mode it can be configured as: 1024x20 and 2048x10. -->
+       <pb_type name="memory" height="4">
+         <!-- Block-level ports shared by every mode: two 11-bit address ports, a 40-bit write data
+              bus, one write enable per port, a 40-bit read data bus and a single clock. Each mode
+              below connects only the pin ranges that its depth/width configuration needs.
+              All modes use the same timing numbers: 509 ps setup on every input, 1.234 ns
+              clock-to-Q on every output, 132 ps on input connections and 40 ps on output
+              connections. The clock connection carries no delay annotation. -->
+         <input name="addr1" num_pins="11"/>
+         <input name="addr2" num_pins="11"/>
+         <input name="data" num_pins="40"/>
+         <input name="we1" num_pins="1"/>
+         <input name="we2" num_pins="1"/>
+         <output name="out" num_pins="40"/>
+         <clock name="clk" num_pins="1"/>
+
+         <!-- Specify single port mode first -->
+         <!-- 512 words x 40 bits: uses addr1[8:0], the full data bus and the full out bus -->
+         <mode name="mem_512x40_sp">
+           <pb_type name="mem_512x40_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+             <input name="addr" num_pins="9" port_class="address"/>
+             <input name="data" num_pins="40" port_class="data_in"/>
+             <input name="we" num_pins="1" port_class="write_en"/>
+             <output name="out" num_pins="40" port_class="data_out"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <T_setup value="509e-12" port="mem_512x40_sp.addr" clock="clk"/>
+             <T_setup value="509e-12" port="mem_512x40_sp.data" clock="clk"/>
+             <T_setup value="509e-12" port="mem_512x40_sp.we" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_512x40_sp.out" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="9.0e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <direct name="address1" input="memory.addr1[8:0]" output="mem_512x40_sp.addr">
+               <delay_constant max="132e-12" in_port="memory.addr1[8:0]" out_port="mem_512x40_sp.addr"/>
+             </direct>
+             <direct name="data1" input="memory.data" output="mem_512x40_sp.data">
+               <delay_constant max="132e-12" in_port="memory.data" out_port="mem_512x40_sp.data"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_512x40_sp.we">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_512x40_sp.we"/>
+             </direct>
+             <direct name="dataout1" input="mem_512x40_sp.out" output="memory.out">
+               <delay_constant max="40e-12" in_port="mem_512x40_sp.out" out_port="memory.out"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_512x40_sp.clk"/>
+           </interconnect>
+         </mode>
+
+         <!-- 1024 words x 20 bits: uses addr1[9:0], data[19:0] and out[19:0] -->
+         <mode name="mem_1024x20_sp">
+           <pb_type name="mem_1024x20_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+             <input name="addr" num_pins="10" port_class="address"/>
+             <input name="data" num_pins="20" port_class="data_in"/>
+             <input name="we" num_pins="1" port_class="write_en"/>
+             <output name="out" num_pins="20" port_class="data_out"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <T_setup value="509e-12" port="mem_1024x20_sp.addr" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_sp.data" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_sp.we" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_1024x20_sp.out" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="9.0e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <direct name="address1" input="memory.addr1[9:0]" output="mem_1024x20_sp.addr">
+               <delay_constant max="132e-12" in_port="memory.addr1[9:0]" out_port="mem_1024x20_sp.addr"/>
+             </direct>
+             <direct name="data1" input="memory.data[19:0]" output="mem_1024x20_sp.data">
+               <delay_constant max="132e-12" in_port="memory.data[19:0]" out_port="mem_1024x20_sp.data"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_1024x20_sp.we">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_1024x20_sp.we"/>
+             </direct>
+             <direct name="dataout1" input="mem_1024x20_sp.out" output="memory.out[19:0]">
+               <delay_constant max="40e-12" in_port="mem_1024x20_sp.out" out_port="memory.out[19:0]"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_1024x20_sp.clk"/>
+           </interconnect>
+         </mode>
+
+         <!-- 2048 words x 10 bits: uses addr1[10:0], data[9:0] and out[9:0] -->
+         <mode name="mem_2048x10_sp">
+           <pb_type name="mem_2048x10_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+             <input name="addr" num_pins="11" port_class="address"/>
+             <input name="data" num_pins="10" port_class="data_in"/>
+             <input name="we" num_pins="1" port_class="write_en"/>
+             <output name="out" num_pins="10" port_class="data_out"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <T_setup value="509e-12" port="mem_2048x10_sp.addr" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_sp.data" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_sp.we" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_2048x10_sp.out" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="9.0e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <direct name="address1" input="memory.addr1[10:0]" output="mem_2048x10_sp.addr">
+               <delay_constant max="132e-12" in_port="memory.addr1[10:0]" out_port="mem_2048x10_sp.addr"/>
+             </direct>
+             <direct name="data1" input="memory.data[9:0]" output="mem_2048x10_sp.data">
+               <delay_constant max="132e-12" in_port="memory.data[9:0]" out_port="mem_2048x10_sp.data"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_2048x10_sp.we">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_2048x10_sp.we"/>
+             </direct>
+             <direct name="dataout1" input="mem_2048x10_sp.out" output="memory.out[9:0]">
+               <delay_constant max="40e-12" in_port="mem_2048x10_sp.out" out_port="memory.out[9:0]"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_2048x10_sp.clk"/>
+           </interconnect>
+         </mode>
+
+         <!-- Specify true dual port mode next -->
+         <!-- 1024 words x 20 bits, two ports: port 1 uses addr1[9:0], data[19:0], we1 and out[19:0];
+              port 2 uses addr2[9:0], data[39:20], we2 and out[39:20] -->
+         <mode name="mem_1024x20_dp">
+           <pb_type name="mem_1024x20_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+             <input name="addr1" num_pins="10" port_class="address1"/>
+             <input name="addr2" num_pins="10" port_class="address2"/>
+             <input name="data1" num_pins="20" port_class="data_in1"/>
+             <input name="data2" num_pins="20" port_class="data_in2"/>
+             <input name="we1" num_pins="1" port_class="write_en1"/>
+             <input name="we2" num_pins="1" port_class="write_en2"/>
+             <output name="out1" num_pins="20" port_class="data_out1"/>
+             <output name="out2" num_pins="20" port_class="data_out2"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.addr1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.data1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.we1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.addr2" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.data2" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.we2" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_1024x20_dp.out1" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_1024x20_dp.out2" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="17.9e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <direct name="address1" input="memory.addr1[9:0]" output="mem_1024x20_dp.addr1">
+               <delay_constant max="132e-12" in_port="memory.addr1[9:0]" out_port="mem_1024x20_dp.addr1"/>
+             </direct>
+             <direct name="address2" input="memory.addr2[9:0]" output="mem_1024x20_dp.addr2">
+               <delay_constant max="132e-12" in_port="memory.addr2[9:0]" out_port="mem_1024x20_dp.addr2"/>
+             </direct>
+             <direct name="data1" input="memory.data[19:0]" output="mem_1024x20_dp.data1">
+               <delay_constant max="132e-12" in_port="memory.data[19:0]" out_port="mem_1024x20_dp.data1"/>
+             </direct>
+             <direct name="data2" input="memory.data[39:20]" output="mem_1024x20_dp.data2">
+               <delay_constant max="132e-12" in_port="memory.data[39:20]" out_port="mem_1024x20_dp.data2"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_1024x20_dp.we1">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_1024x20_dp.we1"/>
+             </direct>
+             <direct name="writeen2" input="memory.we2" output="mem_1024x20_dp.we2">
+               <delay_constant max="132e-12" in_port="memory.we2" out_port="mem_1024x20_dp.we2"/>
+             </direct>
+             <direct name="dataout1" input="mem_1024x20_dp.out1" output="memory.out[19:0]">
+               <delay_constant max="40e-12" in_port="mem_1024x20_dp.out1" out_port="memory.out[19:0]"/>
+             </direct>
+             <direct name="dataout2" input="mem_1024x20_dp.out2" output="memory.out[39:20]">
+               <delay_constant max="40e-12" in_port="mem_1024x20_dp.out2" out_port="memory.out[39:20]"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_1024x20_dp.clk"/>
+           </interconnect>
+         </mode>
+
+         <!-- 2048 words x 10 bits, two ports: port 1 uses addr1[10:0], data[9:0], we1 and out[9:0];
+              port 2 uses addr2[10:0], data[19:10], we2 and out[19:10] -->
+         <mode name="mem_2048x10_dp">
+           <pb_type name="mem_2048x10_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+             <input name="addr1" num_pins="11" port_class="address1"/>
+             <input name="addr2" num_pins="11" port_class="address2"/>
+             <input name="data1" num_pins="10" port_class="data_in1"/>
+             <input name="data2" num_pins="10" port_class="data_in2"/>
+             <input name="we1" num_pins="1" port_class="write_en1"/>
+             <input name="we2" num_pins="1" port_class="write_en2"/>
+             <output name="out1" num_pins="10" port_class="data_out1"/>
+             <output name="out2" num_pins="10" port_class="data_out2"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.addr1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.data1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.we1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.addr2" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.data2" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.we2" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_2048x10_dp.out1" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_2048x10_dp.out2" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="17.9e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <direct name="address1" input="memory.addr1[10:0]" output="mem_2048x10_dp.addr1">
+               <delay_constant max="132e-12" in_port="memory.addr1[10:0]" out_port="mem_2048x10_dp.addr1"/>
+             </direct>
+             <direct name="address2" input="memory.addr2[10:0]" output="mem_2048x10_dp.addr2">
+               <delay_constant max="132e-12" in_port="memory.addr2[10:0]" out_port="mem_2048x10_dp.addr2"/>
+             </direct>
+             <direct name="data1" input="memory.data[9:0]" output="mem_2048x10_dp.data1">
+               <delay_constant max="132e-12" in_port="memory.data[9:0]" out_port="mem_2048x10_dp.data1"/>
+             </direct>
+             <direct name="data2" input="memory.data[19:10]" output="mem_2048x10_dp.data2">
+               <delay_constant max="132e-12" in_port="memory.data[19:10]" out_port="mem_2048x10_dp.data2"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_2048x10_dp.we1">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_2048x10_dp.we1"/>
+             </direct>
+             <direct name="writeen2" input="memory.we2" output="mem_2048x10_dp.we2">
+               <delay_constant max="132e-12" in_port="memory.we2" out_port="mem_2048x10_dp.we2"/>
+             </direct>
+             <direct name="dataout1" input="mem_2048x10_dp.out1" output="memory.out[9:0]">
+               <delay_constant max="40e-12" in_port="mem_2048x10_dp.out1" out_port="memory.out[9:0]"/>
+             </direct>
+             <direct name="dataout2" input="mem_2048x10_dp.out2" output="memory.out[19:10]">
+               <delay_constant max="40e-12" in_port="mem_2048x10_dp.out2" out_port="memory.out[19:10]"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_2048x10_dp.clk"/>
+           </interconnect>
+         </mode>
+
+         <!-- Every input pin connects to 15% of the tracks in a channel, every output pin connects to 10% of the tracks in a channel -->
+         <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+         <pinlocations pattern="spread"/>
+
+         <!-- Place this memory block every 8 columns from (and including) the second column -->
+         <!-- Block power is the sum of the power of the child pb_types -->
+         <power method="sum-of-children"/>
+       </pb_type>
+       <!-- Define fracturable memory end -->
+
+
+    </complexblocklist>
+
+    <!-- Architecture-wide parameters for power estimation: capacitance of local (intra-block)
+         interconnect wire and the transistor sizes assumed for multiplexers, flip-flops and LUTs.
+         NOTE(review): per the VTR power documentation C_wire should be in Farads per meter and the
+         sizes in multiples of a minimum-width transistor; confirm against the VTR version in use. -->
+    <power>
+      <local_interconnect C_wire="2.5e-10"/>
+      <mux_transistor_size mux_transistor_size="3"/>
+      <FF_size FF_size="4"/>
+      <LUT_transistor_size LUT_transistor_size="4"/>
+    </power>
+
+    <!-- Clock network description: a single clock entry with automatically sized buffers and the
+         same wire capacitance value as the local interconnect above.
+         NOTE(review): presumably consumed only by the power estimator; confirm. -->
+    <clocks>
+      <clock buffer_size="auto" C_wire="2.5e-10"/>
+    </clocks>
+</architecture>