arch: Added an architecture file for the 9-bit Shadow Multiplier
architecture
diff --git a/vtr_flow/arch/timing/shadow_multiplier_9bit.xml b/vtr_flow/arch/timing/shadow_multiplier_9bit.xml
new file mode 100644
index 0000000..4191b28
--- /dev/null
+++ b/vtr_flow/arch/timing/shadow_multiplier_9bit.xml
@@ -0,0 +1,1020 @@
+<!-- 
+    This is the architecture file for the 9-bit Shadow Multiplier Architecture discussed in [1].
+    Delays for routing and logic blocks come from COFFE runs for a 20 nm technology node.
+    Delays for DSP blocks and BRAMs come from Arria 10 (22 nm) delays.
+
+    This architecture has 10 ALMs per cluster, where each ALM is a 6-LUT fracturable into
+    two 5-LUTs. The ALM has 8 inputs and 4 optionally registered outputs. The two 5-LUTs should
+    share at least two inputs. Each two ALM outputs are logically equivalent, which means any
+    output signal that can reach ALM.out[0] can reach ALM.out[1] and the same thing for
+    ALM.out[2] and ALM.out[3]. The ALMs in this architecture have an arithmetic mode
+    where each 5-LUT is fractured into two 4-LUTs, resulting in a total of four 4-LUTs and two
+    bits of addition per ALM. This architecture has a single carry chain that spans the 10 ALMs
+    in the LAB. In addition, the architecture has a 9-bit multiplier per LAB that shares its inputs
+    and outputs with four and a half ALMs within the LAB. Therefore, either the multiplier or the ALMs
+    could be used at a time.
+
+    The LAB has 60 inputs and 40 outputs. Two outputs of each ALM are fed to the right and
+    left LAB using direct links and are also fed back to the LAB as feedback connections sharing
+    the 60 input ports with the signals coming from the routing channels.
+
+    The architecture also has a 20Kb memory that has true and simple dual port modes. In simple
+    dual port mode the memory can be configured in the following modes: 512x40, 1024x20 and 2048x10,
+    while in true dual port mode it can be configured as: 1024x20 and 2048x10.
+
+    In addition, the architecture has a 27x27 DSP block that can be fractured into two 18x19 DSPs.
+
+
+    [1] M. Eldafrawy, A. Boutros, S. Yazdanshenas, and V. Betz, "FPGA Logic Block Architectures for efficient
+        multiplication and addition to enhance machine learning performance," in Transactions on Reconfigurable
+        Technology and Systems (TRETS), 2019
+
+-->
+<architecture>
+    <!-- 
+         ODIN II specific config begins 
+         Describes the types of user-specified netlist blocks (in blif, this corresponds to 
+         ".model [type_of_block]") that this architecture supports.
+
+         Note: Basic LUTs, I/Os, and flip-flops are not included here as there are 
+         already special structures in blif (.names, .input, .output, and .latch) 
+         that describe them.
+    -->
+    <models>
+      <model name="multiply">
+        <input_ports>
+          <port name="a" combinational_sink_ports="out"/>
+          <port name="b" combinational_sink_ports="out"/>
+        </input_ports>
+        <output_ports>
+          <port name="out"/>
+        </output_ports>
+      </model>
+
+      <model name="single_port_ram">
+        <input_ports>
+          <port name="we" clock="clk"/>    <!-- write enable -->
+          <port name="addr" clock="clk"/>  <!-- address lines -->
+          <port name="data" clock="clk"/>  <!-- data lines; may be split into narrower widths, minimum size 1 -->
+          <port name="clk" is_clock="1"/>  <!-- clock referenced by every other port of this model -->
+        </input_ports>
+        <output_ports>
+          <port name="out" clock="clk"/>   <!-- output; may be split into narrower widths, minimum size 1 -->
+        </output_ports>
+      </model>
+
+      <model name="dual_port_ram">
+        <input_ports>
+          <port name="we1" clock="clk"/>    <!-- write enable, port 1 -->
+          <port name="we2" clock="clk"/>    <!-- write enable, port 2 -->
+          <port name="addr1" clock="clk"/>  <!-- address lines, port 1 -->
+          <port name="addr2" clock="clk"/>  <!-- address lines, port 2 -->
+          <port name="data1" clock="clk"/>  <!-- data lines, port 1; may be split into narrower widths, minimum size 1 -->
+          <port name="data2" clock="clk"/>  <!-- data lines, port 2; may be split into narrower widths, minimum size 1 -->
+          <port name="clk" is_clock="1"/>   <!-- clock referenced by every other port of this model -->
+        </input_ports>
+        <output_ports>
+          <port name="out1" clock="clk"/>   <!-- output, port 1; may be split into narrower widths, minimum size 1 -->
+          <port name="out2" clock="clk"/>   <!-- output, port 2; may be split into narrower widths, minimum size 1 -->
+        </output_ports>
+      </model>
+
+      <model name="adder">
+        <input_ports>
+          <port name="a" combinational_sink_ports="sumout cout"/>
+          <port name="b" combinational_sink_ports="sumout cout"/>
+          <port name="cin" combinational_sink_ports="sumout cout"/>
+        </input_ports>
+        <output_ports>
+          <port name="cout"/>
+          <port name="sumout"/>
+        </output_ports>
+      </model>
+    </models> <!-- ODIN II specific config ends -->
+
+    <layout> <!-- Physical descriptions begin -->
+      <auto_layout aspect_ratio="1.0">
+        <!-- Ring of 'io' blocks around the device, with 'EMPTY' blocks at the corners -->
+        <perimeter type="io" priority="100"/>
+        <corners type="EMPTY" priority="101"/>
+        <!-- Everything else defaults to 'clb' -->
+        <fill type="clb" priority="10"/>
+        <!-- 'mult_27' columns, with 'EMPTY' blocks wherever a 'mult_27' does not fit. starty="1" offsets past the perimeter. -->
+        <col type="mult_27" startx="6" starty="1" repeatx="8" priority="20"/>
+        <col type="EMPTY" startx="6" starty="1" repeatx="8" priority="19"/>
+        <!-- 'memory' columns, with 'EMPTY' blocks wherever a 'memory' does not fit. starty="1" offsets past the perimeter. -->
+        <col type="memory" startx="2" starty="1" repeatx="8" priority="20"/>
+        <col type="EMPTY" startx="2" starty="1" repeatx="8" priority="19"/>
+      </auto_layout>
+    </layout>
+
+    <device>
+      <sizing R_minW_nmos="13090" R_minW_pmos="19086.83"/>
+      <area grid_logic_tile_area="23840.5292411"/>
+      <chan_width_distr>
+        <x distr="uniform" peak="1.000000"/>
+        <y distr="uniform" peak="1.000000"/>
+      </chan_width_distr>
+      <switch_block type="wilton" fs="3"/>  <!-- Wilton switch block, fs = 3 -->
+      <connection_block input_switch_name="ipin_cblock"/>  <!-- names a switch from the switchlist -->
+    </device>
+
+    <switchlist> <!-- "0" is the mux used by the wire segment; "ipin_cblock" is the connection block input switch -->
+      <switch name="0" type="mux" R="0.0" Cin="0.0" Cout="0.0" Tdel="247.7e-12" mux_trans_size="2.173" buf_size="38.1053"/>
+      <switch name="ipin_cblock" type="mux" R="0.0" Cin="0.0" Cout="0.0" Tdel="144.7e-12" mux_trans_size="1.6265" buf_size="13.488"/>
+    </switchlist>
+
+    <segmentlist>
+      <segment type="unidir" length="4" freq="1.000000" Rmetal="0.0" Cmetal="0.0">
+        <mux name="0"/>
+        <sb type="pattern">1 1 1 1 1</sb>  <!-- five entries for a length-4 wire, all set to 1 -->
+        <cb type="pattern">1 1 1 1</cb>    <!-- four entries for a length-4 wire, all set to 1 -->
+      </segment>
+    </segmentlist>
+
+    <directlist>
+      <direct name="adder_carry" from_pin="clb.cout" to_pin="clb.cin" x_offset="0" y_offset="-1" z_offset="0"/>
+      <!-- The carry link above joins clb.cout to clb.cin of the clb at y_offset = -1 -->
+      <!-- Direct connect to the right LAB (x_offset = 1): groups of five clb.O pins land on I1..I4[9:5] -->
+      <direct name="direct_right_1" from_pin="clb.O[4:0]" to_pin="clb.I1[9:5]" x_offset="1" y_offset="0" z_offset="0"/>
+      <direct name="direct_right_2" from_pin="clb.O[24:20]" to_pin="clb.I2[9:5]" x_offset="1" y_offset="0" z_offset="0"/>
+      <direct name="direct_right_3" from_pin="clb.O[9:5]" to_pin="clb.I3[9:5]" x_offset="1" y_offset="0" z_offset="0"/>
+      <direct name="direct_right_4" from_pin="clb.O[29:25]" to_pin="clb.I4[9:5]" x_offset="1" y_offset="0" z_offset="0"/>
+      <!-- Direct connect to the left LAB (x_offset = -1): groups of five clb.O pins land on I1..I4[14:10] -->
+      <direct name="direct_left_1" from_pin="clb.O[14:10]" to_pin="clb.I1[14:10]" x_offset="-1" y_offset="0" z_offset="0"/>
+      <direct name="direct_left_2" from_pin="clb.O[34:30]" to_pin="clb.I2[14:10]" x_offset="-1" y_offset="0" z_offset="0"/>
+      <direct name="direct_left_3" from_pin="clb.O[19:15]" to_pin="clb.I3[14:10]" x_offset="-1" y_offset="0" z_offset="0"/>
+      <direct name="direct_left_4" from_pin="clb.O[39:35]" to_pin="clb.I4[14:10]" x_offset="-1" y_offset="0" z_offset="0"/>
+    </directlist>
+
+    <complexblocklist>
+
+      <!-- Define I/O pads begin -->
+      <!-- Capacity is a unique property of I/Os, it is the maximum number of I/Os that can be placed at the same (X,Y) location on the FPGA -->
+	    <!-- Not sure of the area of an I/O (varies widely), and it's not relevant to the design of the FPGA core, so we're setting it to 0. -->
+      <pb_type name="io" capacity="8" area="0">
+        <input name="outpad" num_pins="1"/>
+        <output name="inpad" num_pins="1"/>
+        <clock name="clock" num_pins="1"/>
+
+        <!-- IOs can operate as either inputs or outputs.
+             Delays below come from Ian Kuon. They are small, so they should be interpreted as
+             the delays to and from registers in the I/O (generally I/Os are registered
+             today, and that is the point at which they are timing analyzed).
+             -->
+        <mode name="inpad">
+          <pb_type name="inpad" blif_model=".input" num_pb="1">
+            <output name="inpad" num_pins="1"/>
+          </pb_type>
+          <interconnect>
+            <direct name="inpad" input="inpad.inpad" output="io.inpad">
+            <delay_constant max="4.243e-11" in_port="inpad.inpad" out_port="io.inpad"/>
+            </direct>
+          </interconnect>
+        </mode>
+        <mode name="outpad">
+          <pb_type name="outpad" blif_model=".output" num_pb="1">
+            <input name="outpad" num_pins="1"/>
+          </pb_type>
+          <interconnect>
+            <direct name="outpad" input="io.outpad" output="outpad.outpad">
+            <delay_constant max="1.394e-11" in_port="io.outpad" out_port="outpad.outpad"/>
+            </direct>
+          </interconnect>
+        </mode>
+
+        <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin drives 10% of the tracks in a channel -->
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+
+        <!-- IOs go on the periphery of the FPGA. For consistency, all pins are made available
+          on all four sides so that only one definition of I/Os is needed.
+          Without a physically equivalent definition, 4 different I/O types would be required, one for each side of the FPGA.
+        -->
+        <pinlocations pattern="custom">
+          <loc side="left">io.outpad io.inpad io.clock</loc>
+          <loc side="top">io.outpad io.inpad io.clock</loc>
+          <loc side="right">io.outpad io.inpad io.clock</loc>
+          <loc side="bottom">io.outpad io.inpad io.clock</loc>
+        </pinlocations>
+
+        <!-- I/O blocks are left out of power estimation -->
+        <power method="ignore"/>
+      </pb_type>
+      <!-- Define I/O pads ends -->
+
+      <!-- Define general purpose logic block (CLB) begin -->
+      <pb_type name="clb">
+        <input name="I1" num_pins="15" equivalent="full"/>
+		<input name="I2" num_pins="15" equivalent="full"/>
+		<input name="I3" num_pins="15" equivalent="full"/>
+		<input name="I4" num_pins="15" equivalent="full"/>
+        <input name="cin" num_pins="1"/>
+        <output name="O" num_pins="40" equivalent="none"/>
+        <output name="cout" num_pins="1"/>
+        <clock name="clk" num_pins="1"/>
+        <pb_type name="lab" num_pb="1">
+            <input name="I1" num_pins="15"/>
+            <input name="I2" num_pins="15"/>
+            <input name="I3" num_pins="15"/>
+            <input name="I4" num_pins="15"/>
+            <input name="cin" num_pins="1"/>
+            <output name="O" num_pins="40"/>
+            <output name="cout" num_pins="1"/>
+            <clock name="clk" num_pins="1"/>
+            <!-- Describe fracturable logic element.  
+                 Each fracturable logic element has a 6-LUT that can alternatively operate as two 5-LUTs with shared inputs. 
+                 The outputs of the fracturable logic element can be optionally registered
+            -->
+            <pb_type name="fle" num_pb="10">
+              <input name="in" num_pins="8"/>
+              <input name="cin" num_pins="1"/>
+              <output name="out" num_pins="4"/>
+              <output name="cout" num_pins="1"/>
+              <clock name="clk" num_pins="1"/>
+              <!-- 
+                    The ALM inputs are as follows:
+                            A -> fle.in[0]
+                            B -> fle.in[1]
+                            C -> fle.in[2]
+                            D -> fle.in[3]
+                            E -> fle.in[4]
+                            F -> fle.in[5]
+                            G -> fle.in[6]
+                            H -> fle.in[7]
+              -->
+              <mode name="n2_lut5">
+                <pb_type name="ble5" num_pb="2">
+                  <input name="in" num_pins="5"/>
+                  <input name="cin" num_pins="1"/>
+                  <output name="out" num_pins="2"/>
+                  <output name="cout" num_pins="1"/>
+                  <clock name="clk" num_pins="1"/>
+                  <mode name="blut5">
+                    <pb_type name="flut5" num_pb="1">
+                      <input name="in" num_pins="5"/>
+                      <output name="out" num_pins="2"/>
+                      <clock name="clk" num_pins="1"/>
+                      <!-- Regular LUT mode: one 5-LUT plus two flip-flops -->
+                      <pb_type name="lut5" blif_model=".names" num_pb="1" class="lut">
+                        <input name="in" num_pins="5" port_class="lut_in"/>
+                        <output name="out" num_pins="1" port_class="lut_out"/>
+                        <!-- LUT timing using delay matrix -->
+                          <!-- These are derived from the physical input delays of a Stratix 10 LUT; because VPR cannot do
+                             LUT rebalancing, we use the average of those numbers for every input to get more stable results.
+                             Note that these are the same delays for inputs A - E as the ones used for the 6-LUT, minus the
+                             delay of the last mux stage, which gives the delay from inputs A - E to the 5-LUT output.
+                          -->
+                        <delay_matrix type="max" in_port="lut5.in" out_port="lut5.out">
+                            154.5e-12
+                            154.5e-12
+                            154.5e-12
+                            154.5e-12
+                            154.5e-12
+                        </delay_matrix>
+                      </pb_type>
+                      <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop">
+                        <input name="D" num_pins="1" port_class="D"/>
+                        <output name="Q" num_pins="1" port_class="Q"/>
+                        <clock name="clk" num_pins="1" port_class="clock"/>
+                        <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                        <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                      </pb_type>
+                      <interconnect>
+                        <direct name="lut5_in" input="flut5.in" output="lut5.in"/>
+                        <direct name="reg_in" input="flut5.in[0]" output="ff[0].D"/> <!-- ff[0] is fed straight from input 0, not from the LUT -->
+                        <direct name="lut5_ff" input="lut5.out" output="ff[1].D"> <!-- ff[1] registers the LUT output -->
+                          <delay_constant max="18.62e-12" in_port="lut5.out" out_port="ff[1].D"/>
+                          <pack_pattern name="ble5" in_port="lut5.out" out_port="ff[1].D"/>
+                        </direct>
+                        <complete name="clock" input="flut5.clk" output="ff.clk"/>
+                        <complete name="out_mux" input="ff.Q lut5.out" output="flut5.out">
+                          <delay_constant max="39.62e-12" in_port="lut5.out" out_port="flut5.out"/>
+                          <delay_constant max="39.62e-12" in_port="ff.Q" out_port="flut5.out"/>
+                        </complete>
+                      </interconnect>
+                    </pb_type>
+                    <interconnect>
+                      <direct name="direct1" input="ble5.in" output="flut5.in"/>
+                      <direct name="direct2" input="ble5.clk" output="flut5.clk"/>
+                      <direct name="direct3" input="flut5.out" output="ble5.out"/>                 
+                    </interconnect>
+                  </mode>
+                  <mode name="arithmetic">
+                    <pb_type name="arithmetic" num_pb="1">
+                      <input name="in" num_pins="4"/>
+                      <input name="cin" num_pins="1"/>
+                      <output name="out" num_pins="2"/>
+                      <output name="cout" num_pins="1"/>
+                      <clock name="clk" num_pins="1"/>
+                      <!-- Special dual-LUT mode: two 4-LUTs that drive the adder only -->
+                      <pb_type name="lut4" blif_model=".names" num_pb="2" class="lut">
+                        <input name="in" num_pins="4" port_class="lut_in"/>
+                        <output name="out" num_pins="1" port_class="lut_out"/>
+                        <!-- LUT timing using delay matrix -->
+                        <!-- These are derived from the physical input delays of a Stratix 10 LUT; because VPR cannot do
+                           LUT rebalancing, we use the average of those numbers for every input to get more stable results.
+                           NOTE(review): the note previously here was copied from the 5-LUT and spoke of the delay of
+                           inputs A - E up to the 5-LUT output; confirm how the 4-LUT value below was derived.
+                          -->
+                        <delay_matrix type="max" in_port="lut4.in" out_port="lut4.out">
+                            133.8e-12
+                            133.8e-12
+                            133.8e-12
+                            133.8e-12
+                        </delay_matrix>
+                      </pb_type>    
+                      <pb_type name="adder" blif_model=".subckt adder" num_pb="1">
+                        <input name="a" num_pins="1"/>
+                        <input name="b" num_pins="1"/>
+                        <input name="cin" num_pins="1"/>                      
+                        <output name="cout" num_pins="1"/>
+                        <output name="sumout" num_pins="1"/>
+                        <delay_constant max="64.74e-12" in_port="adder.a" out_port="adder.sumout"/>
+                        <delay_constant max="64.74e-12" in_port="adder.b" out_port="adder.sumout"/>
+                        <delay_constant max="35.78e-12" in_port="adder.cin" out_port="adder.sumout"/>
+                        <delay_constant max="45.5e-12" in_port="adder.a" out_port="adder.cout"/>
+                        <delay_constant max="45.5e-12" in_port="adder.b" out_port="adder.cout"/>
+                        <delay_constant max="24.37e-12" in_port="adder.cin" out_port="adder.cout"/>
+                      </pb_type>
+                      <pb_type name="ff" blif_model=".latch" num_pb="1" class="flipflop">
+                        <input name="D" num_pins="1" port_class="D"/>
+                        <output name="Q" num_pins="1" port_class="Q"/>
+                        <clock name="clk" num_pins="1" port_class="clock"/>
+                        <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                        <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                      </pb_type>
+                      <interconnect>
+                        <direct name="clock" input="arithmetic.clk" output="ff.clk"/>
+                        <direct name="lut4_in1" input="arithmetic.in" output="lut4[0].in"/> <!-- both 4-LUTs see the same four inputs -->
+                        <direct name="lut4_in2" input="arithmetic.in" output="lut4[1].in"/>
+                        <direct name="lut_to_add1" input="lut4[0:0].out" output="adder.a">
+                            <pack_pattern name="lut_chain" in_port="lut4[0:0].out" out_port="adder.a"/>
+                        </direct>
+                        <direct name="lut_to_add2" input="lut4[1:1].out" output="adder.b">
+                            <pack_pattern name="lut_chain" in_port="lut4[1:1].out" out_port="adder.b"/>
+                        </direct>
+                        <direct name="add_to_ff" input="adder.sumout" output="ff.D">
+                          <delay_constant max="17.62e-12" in_port="adder.sumout" out_port="ff.D"/>
+                          <!--pack_pattern name="chain" in_port="adder.sumout" out_port="ff.D"/-->
+                        </direct>
+                        <direct name="carry_in" input="arithmetic.cin" output="adder.cin">
+                          <pack_pattern name="chain" in_port="arithmetic.cin" out_port="adder.cin"/>
+                          <pack_pattern name="lut_chain" in_port="arithmetic.cin" out_port="adder.cin"/>
+                        </direct>
+                        <direct name="carry_out" input="adder.cout" output="arithmetic.cout">
+                          <pack_pattern name="chain" in_port="adder.cout" out_port="arithmetic.cout"/>
+                          <pack_pattern name="lut_chain" in_port="adder.cout" out_port="arithmetic.cout"/>
+                        </direct>
+                        <complete name="sumout" input="ff.Q adder.sumout" output="arithmetic.out">
+                            <delay_constant max="39.62e-12" in_port="adder.sumout" out_port="arithmetic.out"/>
+                            <delay_constant max="39.62e-12" in_port="ff.Q" out_port="arithmetic.out"/>
+                        </complete>
+                      </interconnect>
+                    </pb_type>
+                    <interconnect>
+                      <direct name="direct1" input="ble5.in[3:0]" output="arithmetic.in"/> <!-- ble5.in[4] is not connected in this mode -->
+                      <direct name="carry_in" input="ble5.cin" output="arithmetic.cin">
+                        <pack_pattern name="chain" in_port="ble5.cin" out_port="arithmetic.cin"/>
+                        <pack_pattern name="lut_chain" in_port="ble5.cin" out_port="arithmetic.cin"/>
+                      </direct>
+                      <direct name="carry_out" input="arithmetic.cout" output="ble5.cout">
+                        <pack_pattern name="chain" in_port="arithmetic.cout" out_port="ble5.cout"/>
+                        <pack_pattern name="lut_chain" in_port="arithmetic.cout" out_port="ble5.cout"/>
+                      </direct>
+                      <direct name="direct2" input="ble5.clk" output="arithmetic.clk"/>
+                      <direct name="direct3" input="arithmetic.out" output="ble5.out"/>
+                    </interconnect>
+                  </mode>
+                </pb_type>
+                <interconnect>
+                  <!-- Shared inputs between the two 5-LUTs: fle.in[0] and fle.in[1] reach both ble5s, on swapped pins -->
+                  <complete name="lut5_reg1" input="fle.in[0]" output="ble5[0].in[0] ble5[1].in[1]"/>
+                  <complete name="lut5_reg2" input="fle.in[1]" output="ble5[0].in[1] ble5[1].in[0]"/>
+
+                  <!-- Rest of the 5-LUT inputs: fle.in[4:2] go to ble5[0], fle.in[7:5] go to ble5[1] -->
+                  <direct name="lut5_inputs_1" input="fle.in[4:2]" output="ble5[0].in[4:2]"/>
+                  <direct name="lut5_inputs_22" input="fle.in[7:5]" output="ble5[1].in[4:2]"/>
+                  <!-- Each ble5 drives its own pair of fle outputs -->
+                  <direct name="lut5_outputs_1" input="ble5[0].out" output="fle.out[1:0]"/>
+                  <direct name="lut5_outputs_2" input="ble5[1].out" output="fle.out[3:2]"/>
+                  <!-- Carry path: fle.cin -> ble5[0] -> ble5[1] -> fle.cout. Each pack_pattern names the same two pins as its enclosing direct. -->
+                  <direct name="carry_in" input="fle.cin" output="ble5[0].cin">
+                    <pack_pattern name="chain" in_port="fle.cin" out_port="ble5[0].cin"/>
+                    <pack_pattern name="lut_chain" in_port="fle.cin" out_port="ble5[0].cin"/>
+                  </direct>
+                  <direct name="carry_out" input="ble5[1].cout" output="fle.cout">
+                    <pack_pattern name="chain" in_port="ble5[1].cout" out_port="fle.cout"/>
+                    <pack_pattern name="lut_chain" in_port="ble5[1].cout" out_port="fle.cout"/>
+                  </direct>
+                  <direct name="carry_link" input="ble5[0].cout" output="ble5[1].cin">
+                    <pack_pattern name="chain" in_port="ble5[0].cout" out_port="ble5[1].cin"/>
+                    <pack_pattern name="lut_chain" in_port="ble5[0].cout" out_port="ble5[1].cin"/>
+                  </direct>
+                  <complete name="clock" input="fle.clk" output="ble5[1:0].clk"/>
+                </interconnect>
+              </mode> <!-- n2_lut5 -->
+              <mode name="n1_lut6">
+                <pb_type name="ble6" num_pb="1">
+                  <input name="in" num_pins="6"/>
+                  <output name="out" num_pins="4"/>
+                  <clock name="clk" num_pins="1"/> 
+                  <pb_type name="lut6" blif_model=".names" num_pb="1" class="lut">
+                    <input name="in" num_pins="6" port_class="lut_in"/>
+                    <output name="out" num_pins="1" port_class="lut_out"/>
+                    <!-- LUT timing using delay matrix -->
+                    <!-- These are derived from the physical input delays of a Stratix 10 LUT; because VPR cannot do
+                           LUT rebalancing, we use the average of those numbers for every input to get more stable results
+                      -->
+                    <delay_matrix type="max" in_port="lut6.in" out_port="lut6.out">
+                        180.9e-12
+                        180.9e-12
+                        180.9e-12
+                        180.9e-12
+                        180.9e-12
+                        180.9e-12
+                    </delay_matrix>
+                  </pb_type>
+                  <pb_type name="ff" blif_model=".latch" num_pb="2" class="flipflop">
+                    <input name="D" num_pins="1" port_class="D"/>
+                    <output name="Q" num_pins="1" port_class="Q"/>
+                    <clock name="clk" num_pins="1" port_class="clock"/>
+                    <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                    <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                  </pb_type>
+                  <interconnect>
+                    <direct name="lut6_inputs" input="ble6.in" output="lut6.in"/>
+                    <direct name="lut6_ff" input="lut6.out" output="ff[1].D">
+                      <delay_constant max="17.62e-12" in_port="lut6.out" out_port="ff[1].D"/>
+                      <pack_pattern name="ble6" in_port="lut6.out" out_port="ff[1].D"/>
+                    </direct>
+                    <complete name="clock" input="ble6.clk" output="ff.clk"/>
+                    <direct name="input_to_ff" input="ble6.in[0]" output="ff[0].D"/> <!-- ff[0] is fed straight from input 0, not from the LUT -->
+                    <mux name="mux1" input="ff[0].Q lut6.out" output="ble6.out[0]">
+                      <delay_constant max="39.62e-12" in_port="lut6.out" out_port="ble6.out[0]"/>
+                      <delay_constant max="39.62e-12" in_port="ff[0].Q" out_port="ble6.out[0]"/>
+                    </mux>
+                    <!-- This mux has the same inputs as mux1 but drives ble6.out[1] -->
+                    <mux name="mux2" input="ff[0].Q lut6.out" output="ble6.out[1]">
+                      <delay_constant max="39.62e-12" in_port="lut6.out" out_port="ble6.out[1]"/>
+                      <delay_constant max="39.62e-12" in_port="ff[0].Q" out_port="ble6.out[1]"/>
+                    </mux>
+                    <mux name="mux3" input="ff[1].Q lut6.out" output="ble6.out[2]">
+                      <delay_constant max="39.62e-12" in_port="lut6.out" out_port="ble6.out[2]"/>
+                      <delay_constant max="39.62e-12" in_port="ff[1].Q" out_port="ble6.out[2]"/>
+                    </mux>
+                    <!-- This mux has the same inputs as mux3 but drives ble6.out[3] -->
+                    <mux name="mux4" input="ff[1].Q lut6.out" output="ble6.out[3]">
+                      <delay_constant max="39.62e-12" in_port="lut6.out" out_port="ble6.out[3]"/>
+                      <delay_constant max="39.62e-12" in_port="ff[1].Q" out_port="ble6.out[3]"/>
+                    </mux>
+                  </interconnect>
+                </pb_type>
+                <interconnect>
+                  <!-- ble6 takes fle.in[4:0] as its inputs 0 to 4 and fle.in[7] as input 5; fle.in[6:5] are unused in this mode -->
+                  <direct name="lut6_inputs1" input="fle.in[4:0]" output="ble6.in[4:0]"/>
+                  <direct name="lut6_inputs2" input="fle.in[7]" output="ble6.in[5]"/>
+                  <direct name="direct2" input="ble6.out" output="fle.out"/>
+                  <direct name="direct4" input="fle.clk" output="ble6.clk"/>
+                </interconnect>
+              </mode> <!-- n1_lut6 -->
+            </pb_type>
+           <pb_type name="mult_9" num_pb="1"> <!-- 9x9 multiplier with 18 flip-flops on its output -->
+             <input name="datain" num_pins="18"/>
+             <output name="dataout" num_pins="18"/>
+             <clock name="clk" num_pins="1"/>
+                 <pb_type name="mult_9x9" blif_model=".subckt multiply" num_pb="1">
+                   <input name="a" num_pins="9"/>
+                   <input name="b" num_pins="9"/>
+                   <output name="out" num_pins="18"/>
+                     <delay_constant max="1.0e-9" in_port="mult_9x9.a" out_port="mult_9x9.out"/>
+                     <delay_constant max="1.0e-9" in_port="mult_9x9.b" out_port="mult_9x9.out"/>
+                 </pb_type>
+                  <pb_type name="ff" blif_model=".latch" num_pb="18" class="flipflop">
+                    <input name="D" num_pins="1" port_class="D"/>
+                    <output name="Q" num_pins="1" port_class="Q"/>
+                    <clock name="clk" num_pins="1" port_class="clock"/>
+                    <T_setup value="18.91e-12" port="ff.D" clock="clk"/>
+                    <T_clock_to_Q max="60.32e-12" port="ff.Q" clock="clk"/>
+                  </pb_type>
+                 <interconnect>
+                     <direct name="a2a" input="mult_9.datain[8:0]" output="mult_9x9.a"/> <!-- lower 9 datain bits are operand a -->
+                     <direct name="b2b" input="mult_9.datain[17:9]" output="mult_9x9.b"/> <!-- upper 9 datain bits are operand b -->
+                     <direct name="outtoff" input="mult_9x9.out" output="ff.D"/>
+                     <direct name="outff" input="ff.Q" output="mult_9.dataout"/> <!-- registered path to dataout -->
+                     <direct name="out" input="mult_9x9.out" output="mult_9.dataout"/> <!-- NOTE(review): second driver of dataout; elsewhere in this file such a choice is a mux/complete - confirm VPR accepts two directs here -->
+                     <complete name="clock" input="mult_9.clk" output="ff.clk"/>
+                 </interconnect>
+           </pb_type>
+            <interconnect>
+              <!-- We use a 50% depop crossbar built using small full xbars to get sets of logically equivalent pins at inputs of CLB 
+               The delays below come from Stratix IV. the delay through a connection block
+               input mux + the crossbar in Stratix IV is 167 ps. We already have a 72 ps 
+               delay on the connection block input mux (modeled by Ian Kuon), so the remaining
+               delay within the crossbar is 95 ps. 
+               The delay of cluster feedbacks in Stratix IV is 100 ps when driven by a LUT.
+               Since all our LUT outputs go to a BLE output, and have a delay of
+               25 ps to do so, we subtract 25 ps from the 100 ps delay of a feedback
+               to get the part that should be marked on the crossbar.	 -->
+
+              <!-- Local input crossbar, 50% populated: each fle input pin index is fed by a full
+                   crossbar from two of the four LAB input groups, with 71.43 ps on every path -->
+              <complete name="lutA" input="lab.I4 lab.I3" output="fle[9:0].in[0:0]">
+                <delay_constant in_port="lab.I4" out_port="fle.in[0:0]" max="71.43e-12"/>
+                <delay_constant in_port="lab.I3" out_port="fle.in[0:0]" max="71.43e-12"/>
+              </complete>
+              <complete name="lutB" input="lab.I3 lab.I2" output="fle[9:0].in[1:1]">
+                <delay_constant in_port="lab.I3" out_port="fle.in[1:1]" max="71.43e-12"/>
+                <delay_constant in_port="lab.I2" out_port="fle.in[1:1]" max="71.43e-12"/>
+              </complete>
+              <complete name="lutC" input="lab.I2 lab.I1" output="fle[9:0].in[2:2]">
+                <delay_constant in_port="lab.I2" out_port="fle.in[2:2]" max="71.43e-12"/>
+                <delay_constant in_port="lab.I1" out_port="fle.in[2:2]" max="71.43e-12"/>
+              </complete>
+              <complete name="lutD" input="lab.I4 lab.I2" output="fle[9:0].in[3:3]">
+                <delay_constant in_port="lab.I4" out_port="fle.in[3:3]" max="71.43e-12"/>
+                <delay_constant in_port="lab.I2" out_port="fle.in[3:3]" max="71.43e-12"/>
+              </complete>
+              <complete name="lutE" input="lab.I3 lab.I1" output="fle[9:0].in[4:4]">
+                <delay_constant in_port="lab.I3" out_port="fle.in[4:4]" max="71.43e-12"/>
+                <delay_constant in_port="lab.I1" out_port="fle.in[4:4]" max="71.43e-12"/>
+              </complete>
+              <complete name="lutF" input="lab.I4 lab.I1" output="fle[9:0].in[5:5]">
+                <delay_constant in_port="lab.I4" out_port="fle.in[5:5]" max="71.43e-12"/>
+                <delay_constant in_port="lab.I1" out_port="fle.in[5:5]" max="71.43e-12"/>
+              </complete>
+              <complete name="lutG" input="lab.I4 lab.I3" output="fle[9:0].in[6:6]">
+                <delay_constant in_port="lab.I4" out_port="fle.in[6:6]" max="71.43e-12"/>
+                <delay_constant in_port="lab.I3" out_port="fle.in[6:6]" max="71.43e-12"/>
+              </complete>
+              <complete name="lutH" input="lab.I3 lab.I2" output="fle[9:0].in[7:7]">
+                <delay_constant in_port="lab.I3" out_port="fle.in[7:7]" max="71.43e-12"/>
+                <delay_constant in_port="lab.I2" out_port="fle.in[7:7]" max="71.43e-12"/>
+              </complete>
+
+              <!-- The single LAB clock reaches every ALM and the 9-bit multiplier -->
+              <complete name="clks" input="lab.clk" output="fle[9:0].clk"/>
+              <direct name="clkmult" input="lab.clk" output="mult_9.clk"/>
+
+              <!-- This way of specifying direct connection to clb outputs is important because this architecture uses automatic spreading of opins.
+                     By grouping the output pins in this fashion, if a logic block is completely filled by 6-LUTs,
+                     then the outputs those 6-LUTs take get evenly distributed across all four sides of the CLB instead of clumped on two sides (which is what happens with a more
+                     naive specification).
+              -->
+              <!-- LAB outputs driven only by ALMs: fle[9:5] for out[0]/out[1] and fle[9:4] for
+                   out[2]/out[3]. No delay is annotated on these links. -->
+              <direct name="labouts1" input="fle[9:5].out[0]" output="lab.O[9:5]"/>
+              <direct name="labouts2" input="fle[9:5].out[1]" output="lab.O[19:15]"/>
+              <direct name="labouts3" input="fle[9:4].out[2]" output="lab.O[29:24]"/>
+              <direct name="labouts4" input="fle[9:4].out[3]" output="lab.O[39:34]"/>
+
+
+              <!-- LAB outputs that are also driven by the 9-bit multiplier (the mult_out* links in this
+                   interconnect target the same lab.O pins): fle[4:0] for out[0]/out[1] and fle[3:0] for
+                   out[2]/out[3]. The ALM-side links carry a 10 ps delay. -->
+              <direct name="labouts11" input="fle[4:0].out[0]" output="lab.O[4:0]">
+                  <delay_constant max="10e-12" in_port="fle[4:0].out[0]" out_port="lab.O[4:0]"/>
+              </direct>
+              <direct name="labouts21" input="fle[4:0].out[1]" output="lab.O[14:10]">
+                  <delay_constant max="10e-12" in_port="fle[4:0].out[1]" out_port="lab.O[14:10]"/>
+              </direct>
+              <direct name="labouts31" input="fle[3:0].out[2]" output="lab.O[23:20]">
+                  <delay_constant max="10e-12" in_port="fle[3:0].out[2]" out_port="lab.O[23:20]"/>
+              </direct>
+              <direct name="labouts41" input="fle[3:0].out[3]" output="lab.O[33:30]">
+                  <delay_constant max="10e-12" in_port="fle[3:0].out[3]" out_port="lab.O[33:30]"/>
+              </direct>
+
+              <!-- Carry chain links -->
+              <!-- Path: lab.cin -> fle[0] -> ... -> fle[9] -> lab.cout. Every hop is tagged with both the
+                   "chain" and "lut_chain" pack patterns; only the carry_in hop has a delay annotated. -->
+              <direct name="carry_in" input="lab.cin" output="fle[0:0].cin">
+                <!-- Put all inter-block carry chain delay on this one edge -->
+                <delay_constant max="18.39e-12" in_port="lab.cin" out_port="fle[0:0].cin"/>
+                <pack_pattern name="chain" in_port="lab.cin" out_port="fle[0:0].cin"/>
+                <pack_pattern name="lut_chain" in_port="lab.cin" out_port="fle[0:0].cin"/>
+              </direct>
+              <!-- Last ALM's carry-out leaves the LAB -->
+              <direct name="carry_out" input="fle[9:9].cout" output="lab.cout">
+                <pack_pattern name="chain" in_port="fle[9:9].cout" out_port="lab.cout"/>
+                <pack_pattern name="lut_chain" in_port="fle[9:9].cout" out_port="lab.cout"/>
+              </direct>
+              <!-- ALM-to-ALM links inside the LAB: cout of fle[k] feeds cin of fle[k+1] for k = 0..8 -->
+              <direct name="carry_link" input="fle[8:0].cout" output="fle[9:1].cin">
+                <pack_pattern name="chain" in_port="fle[8:0].cout" out_port="fle[9:1].cin"/>
+                <pack_pattern name="lut_chain" in_port="fle[8:0].cout" out_port="fle[9:1].cin"/>
+              </direct>
+
+              <!-- 9-bit multiplier hookup.
+                   Inputs: a full crossbar from all four LAB input groups to mult_9.datain (no delay annotated).
+                   Outputs: dataout[i] drives lab.O[10*(i mod 4) + (i div 4)], i.e. the product bits are dealt
+                   round-robin over the O[4:0], O[14:10], O[23:20] and O[33:30] groups, which are the same
+                   pins driven by the labouts11/21/31/41 links. Each output link carries a 40 ps delay. -->
+              <complete name="mult_in" input="lab.I1 lab.I2 lab.I3 lab.I4" output="mult_9.datain"/>
+              <direct name="mult_out0" input="mult_9.dataout[0]" output="lab.O[0]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[0]" out_port="lab.O[0]"/>
+              </direct>
+              <direct name="mult_out1" input="mult_9.dataout[1]" output="lab.O[10]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[1]" out_port="lab.O[10]"/>
+              </direct>
+              <direct name="mult_out2" input="mult_9.dataout[2]" output="lab.O[20]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[2]" out_port="lab.O[20]"/>
+              </direct>
+              <direct name="mult_out3" input="mult_9.dataout[3]" output="lab.O[30]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[3]" out_port="lab.O[30]"/>
+              </direct>
+              <direct name="mult_out4" input="mult_9.dataout[4]" output="lab.O[1]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[4]" out_port="lab.O[1]"/>
+              </direct>
+              <direct name="mult_out5" input="mult_9.dataout[5]" output="lab.O[11]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[5]" out_port="lab.O[11]"/>
+              </direct>
+              <direct name="mult_out6" input="mult_9.dataout[6]" output="lab.O[21]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[6]" out_port="lab.O[21]"/>
+              </direct>
+              <direct name="mult_out7" input="mult_9.dataout[7]" output="lab.O[31]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[7]" out_port="lab.O[31]"/>
+              </direct>
+              <direct name="mult_out8" input="mult_9.dataout[8]" output="lab.O[2]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[8]" out_port="lab.O[2]"/>
+              </direct>
+              <direct name="mult_out9" input="mult_9.dataout[9]" output="lab.O[12]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[9]" out_port="lab.O[12]"/>
+              </direct>
+              <direct name="mult_out10" input="mult_9.dataout[10]" output="lab.O[22]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[10]" out_port="lab.O[22]"/>
+              </direct>
+              <direct name="mult_out11" input="mult_9.dataout[11]" output="lab.O[32]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[11]" out_port="lab.O[32]"/>
+              </direct>
+              <direct name="mult_out12" input="mult_9.dataout[12]" output="lab.O[3]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[12]" out_port="lab.O[3]"/>
+              </direct>
+              <direct name="mult_out13" input="mult_9.dataout[13]" output="lab.O[13]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[13]" out_port="lab.O[13]"/>
+              </direct>
+              <direct name="mult_out14" input="mult_9.dataout[14]" output="lab.O[23]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[14]" out_port="lab.O[23]"/>
+              </direct>
+              <direct name="mult_out15" input="mult_9.dataout[15]" output="lab.O[33]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[15]" out_port="lab.O[33]"/>
+              </direct>
+              <direct name="mult_out16" input="mult_9.dataout[16]" output="lab.O[4]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[16]" out_port="lab.O[4]"/>
+              </direct>
+              <direct name="mult_out17" input="mult_9.dataout[17]" output="lab.O[14]">
+                <delay_constant max="40e-12" in_port="mult_9.dataout[17]" out_port="lab.O[14]"/>
+              </direct>
+            </interconnect>
+        </pb_type>
+        <interconnect>
+            <!-- Carry and clock pins pass straight through between the clb ports and the lab -->
+            <direct name="carry_in" input="clb.cin" output="lab.cin"/>
+            <direct name="carry_out" input="lab.cout" output="clb.cout"/>
+            <direct name="clock" input="clb.clk" output="lab.clk"/>
+
+            <!-- Local feedback: a 5-pin slice of lab.O is fully connected to each lab input group
+                 (O[4:0] -> I1, O[24:20] -> I2, O[9:5] -> I3, O[29:25] -> I4) -->
+            <complete name="Input_feedback_I1" input="lab.O[4:0]" output="lab.I1"/>
+            <complete name="Input_feedback_I2" input="lab.O[24:20]" output="lab.I2"/>
+            <complete name="Input_feedback_I3" input="lab.O[9:5]" output="lab.I3"/>
+            <complete name="Input_feedback_I4" input="lab.O[29:25]" output="lab.I4"/>
+
+            <!-- External inputs: each clb input group maps one-to-one onto the matching lab input group -->
+            <direct name="Input_I1" input="clb.I1" output="lab.I1"/>
+            <direct name="Input_I2" input="clb.I2" output="lab.I2"/>
+            <direct name="Input_I3" input="clb.I3" output="lab.I3"/>
+            <direct name="Input_I4" input="clb.I4" output="lab.I4"/>
+
+            <!-- All lab outputs are exposed as clb outputs -->
+            <direct name="output" input="lab.O" output="clb.O"/>
+        </interconnect>
+        <!-- Default Fc for this block: fraction 0.15 for input pins and 0.10 for output pins.
+             The carry pins cin/cout are overridden to Fc = 0, i.e. no connections to the channel tracks. -->
+        <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10">
+            <fc_override port_name="cin" fc_type="frac" fc_val="0"/>
+            <fc_override port_name="cout" fc_type="frac" fc_val="0"/>
+        </fc>
+        <pinlocations pattern="spread"/>
+      </pb_type>
+       <!-- Define general purpose logic block (CLB) ends -->
+
+       <!-- Define fracturable multiplier begin -->
+       <pb_type name="mult_27" height="2">
+         <!-- 74 data pins each way: enough for two 18x19 multiplies (2 x 37 inputs, 2 x 37 outputs) -->
+         <input name="datain" num_pins="74"/>
+         <output name="dataout" num_pins="74"/>
+
+         <!-- Mode 1: two independent 18x19 multipliers -->
+         <mode name="two_mult_18x19">
+           <pb_type name="two_mult_18x19" num_pb="2">
+             <input name="a" num_pins="18"/>
+             <input name="b" num_pins="19"/>
+             <output name="out" num_pins="37"/>
+             <!-- Multiplier primitive with its combinational input-to-output delay -->
+             <pb_type name="mult_18x19" blif_model=".subckt multiply" num_pb="1">
+               <input name="a" num_pins="18"/>
+               <input name="b" num_pins="19"/>
+               <output name="out" num_pins="37"/>
+               <delay_constant max="1.825e-9" in_port="mult_18x19.a" out_port="mult_18x19.out"/>
+               <delay_constant max="1.825e-9" in_port="mult_18x19.b" out_port="mult_18x19.out"/>
+             </pb_type>
+             <interconnect>
+               <direct name="a2a" input="two_mult_18x19.a" output="mult_18x19.a"/>
+               <direct name="b2b" input="two_mult_18x19.b" output="mult_18x19.b"/>
+               <direct name="out2out" input="mult_18x19.out" output="two_mult_18x19.out"/>
+             </interconnect>
+             <power method="pin-toggle">
+               <port name="a" energy_per_toggle="1.09e-12"/>
+               <port name="b" energy_per_toggle="1.09e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <!-- datain is split as a0 = [17:0], b0 = [36:18], a1 = [54:37], b1 = [73:55] -->
+             <direct name="datain2a1" input="mult_27.datain[17:0]" output="two_mult_18x19[0].a">
+               <delay_constant max="134e-12" in_port="mult_27.datain[17:0]" out_port="two_mult_18x19[0].a"/>
+             </direct>
+             <direct name="datain2b1" input="mult_27.datain[36:18]" output="two_mult_18x19[0].b">
+               <delay_constant max="134e-12" in_port="mult_27.datain[36:18]" out_port="two_mult_18x19[0].b"/>
+             </direct>
+             <direct name="datain2a2" input="mult_27.datain[54:37]" output="two_mult_18x19[1].a">
+               <delay_constant max="134e-12" in_port="mult_27.datain[54:37]" out_port="two_mult_18x19[1].a"/>
+             </direct>
+             <direct name="datain2b2" input="mult_27.datain[73:55]" output="two_mult_18x19[1].b">
+               <delay_constant max="134e-12" in_port="mult_27.datain[73:55]" out_port="two_mult_18x19[1].b"/>
+             </direct>
+             <!-- Both 37-bit products together fill the 74-bit dataout bus -->
+             <direct name="out2dataout" input="two_mult_18x19[1:0].out" output="mult_27.dataout">
+               <delay_constant max="1.09e-9" in_port="two_mult_18x19[1:0].out" out_port="mult_27.dataout"/>
+             </direct>
+           </interconnect>
+         </mode>
+
+         <!-- Mode 2: one 27x27 multiplier using datain[53:0] and dataout[53:0] -->
+         <mode name="mult_27x27">
+           <pb_type name="one_mult_27x27" num_pb="1">
+             <input name="a" num_pins="27"/>
+             <input name="b" num_pins="27"/>
+             <output name="out" num_pins="54"/>
+
+             <!-- Multiplier primitive with its combinational input-to-output delay -->
+             <pb_type name="mult_27x27" blif_model=".subckt multiply" num_pb="1">
+               <input name="a" num_pins="27"/>
+               <input name="b" num_pins="27"/>
+               <output name="out" num_pins="54"/>
+               <delay_constant max="1.848e-9" in_port="mult_27x27.a" out_port="mult_27x27.out"/>
+               <delay_constant max="1.848e-9" in_port="mult_27x27.b" out_port="mult_27x27.out"/>
+             </pb_type>
+             <interconnect>
+               <direct name="a2a" input="one_mult_27x27.a" output="mult_27x27.a"/>
+               <direct name="b2b" input="one_mult_27x27.b" output="mult_27x27.b"/>
+               <direct name="out2out" input="mult_27x27.out" output="one_mult_27x27.out"/>
+             </interconnect>
+             <power method="pin-toggle">
+               <port name="a" energy_per_toggle="2.13e-12"/>
+               <port name="b" energy_per_toggle="2.13e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <direct name="datain2a" input="mult_27.datain[26:0]" output="one_mult_27x27.a">
+               <delay_constant max="134e-12" in_port="mult_27.datain[26:0]" out_port="one_mult_27x27.a"/>
+             </direct>
+             <direct name="datain2b" input="mult_27.datain[53:27]" output="one_mult_27x27.b">
+               <delay_constant max="134e-12" in_port="mult_27.datain[53:27]" out_port="one_mult_27x27.b"/>
+             </direct>
+             <direct name="out2dataout" input="one_mult_27x27.out" output="mult_27.dataout[53:0]">
+               <delay_constant max="1.93e-9" in_port="one_mult_27x27.out" out_port="mult_27.dataout[53:0]"/>
+             </direct>
+           </interconnect>
+
+         </mode>
+
+         <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+         <pinlocations pattern="spread"/>
+
+         <power method="sum-of-children"/>
+       </pb_type>
+       <!-- Define fracturable multiplier end -->
+
+       <!-- Define fracturable memory begin -->
+       <pb_type name="memory" height="4">
+         <input name="addr1" num_pins="11"/>
+         <input name="addr2" num_pins="11"/>
+         <input name="data" num_pins="40"/>
+         <input name="we1" num_pins="1"/>
+         <input name="we2" num_pins="1"/>
+         <output name="out" num_pins="40"/>
+         <clock name="clk" num_pins="1"/>
+
+         <!-- Specify single port mode first -->
+         <mode name="mem_512x40_sp">
+           <!-- 512 words x 40 bits, single port: 9 address bits and the full 40-bit data path -->
+           <pb_type name="mem_512x40_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+             <input name="addr" num_pins="9" port_class="address"/>
+             <input name="data" num_pins="40" port_class="data_in"/>
+             <input name="we" num_pins="1" port_class="write_en"/>
+             <output name="out" num_pins="40" port_class="data_out"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <!-- All inputs are sampled on clk; the output is launched from clk -->
+             <T_setup value="509e-12" port="mem_512x40_sp.addr" clock="clk"/>
+             <T_setup value="509e-12" port="mem_512x40_sp.data" clock="clk"/>
+             <T_setup value="509e-12" port="mem_512x40_sp.we" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_512x40_sp.out" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="9.0e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <!-- Only the port-1 address and write-enable pins of the memory block are used in this mode -->
+             <direct name="address1" input="memory.addr1[8:0]" output="mem_512x40_sp.addr">
+               <delay_constant max="132e-12" in_port="memory.addr1[8:0]" out_port="mem_512x40_sp.addr"/>
+             </direct>
+             <direct name="data1" input="memory.data" output="mem_512x40_sp.data">
+               <delay_constant max="132e-12" in_port="memory.data" out_port="mem_512x40_sp.data"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_512x40_sp.we">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_512x40_sp.we"/>
+             </direct>
+             <direct name="dataout1" input="mem_512x40_sp.out" output="memory.out">
+               <delay_constant max="40e-12" in_port="mem_512x40_sp.out" out_port="memory.out"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_512x40_sp.clk"/>
+           </interconnect>
+         </mode>
+
+         <mode name="mem_1024x20_sp">
+           <!-- 1024 words x 20 bits, single port: 10 address bits, lower 20 data/out pins -->
+           <pb_type name="mem_1024x20_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+             <input name="addr" num_pins="10" port_class="address"/>
+             <input name="data" num_pins="20" port_class="data_in"/>
+             <input name="we" num_pins="1" port_class="write_en"/>
+             <output name="out" num_pins="20" port_class="data_out"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <!-- All inputs are sampled on clk; the output is launched from clk -->
+             <T_setup value="509e-12" port="mem_1024x20_sp.addr" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_sp.data" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_sp.we" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_1024x20_sp.out" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="9.0e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <!-- Only the port-1 address and write-enable pins of the memory block are used in this mode -->
+             <direct name="address1" input="memory.addr1[9:0]" output="mem_1024x20_sp.addr">
+               <delay_constant max="132e-12" in_port="memory.addr1[9:0]" out_port="mem_1024x20_sp.addr"/>
+             </direct>
+             <direct name="data1" input="memory.data[19:0]" output="mem_1024x20_sp.data">
+               <delay_constant max="132e-12" in_port="memory.data[19:0]" out_port="mem_1024x20_sp.data"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_1024x20_sp.we">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_1024x20_sp.we"/>
+             </direct>
+             <direct name="dataout1" input="mem_1024x20_sp.out" output="memory.out[19:0]">
+               <delay_constant max="40e-12" in_port="mem_1024x20_sp.out" out_port="memory.out[19:0]"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_1024x20_sp.clk"/>
+           </interconnect>
+         </mode>
+
+         <mode name="mem_2048x10_sp">
+           <!-- 2048 words x 10 bits, single port: all 11 address bits, lower 10 data/out pins -->
+           <pb_type name="mem_2048x10_sp" blif_model=".subckt single_port_ram" class="memory" num_pb="1">
+             <input name="addr" num_pins="11" port_class="address"/>
+             <input name="data" num_pins="10" port_class="data_in"/>
+             <input name="we" num_pins="1" port_class="write_en"/>
+             <output name="out" num_pins="10" port_class="data_out"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <!-- All inputs are sampled on clk; the output is launched from clk -->
+             <T_setup value="509e-12" port="mem_2048x10_sp.addr" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_sp.data" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_sp.we" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_2048x10_sp.out" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="9.0e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <!-- Only the port-1 address and write-enable pins of the memory block are used in this mode -->
+             <direct name="address1" input="memory.addr1[10:0]" output="mem_2048x10_sp.addr">
+               <delay_constant max="132e-12" in_port="memory.addr1[10:0]" out_port="mem_2048x10_sp.addr"/>
+             </direct>
+             <direct name="data1" input="memory.data[9:0]" output="mem_2048x10_sp.data">
+               <delay_constant max="132e-12" in_port="memory.data[9:0]" out_port="mem_2048x10_sp.data"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_2048x10_sp.we">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_2048x10_sp.we"/>
+             </direct>
+             <direct name="dataout1" input="mem_2048x10_sp.out" output="memory.out[9:0]">
+               <delay_constant max="40e-12" in_port="mem_2048x10_sp.out" out_port="memory.out[9:0]"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_2048x10_sp.clk"/>
+           </interconnect>
+         </mode>
+
+         <!-- Specify true dual port mode next -->
+         <mode name="mem_1024x20_dp">
+           <!-- 1024 words x 20 bits, dual port: two 10-bit addresses, two 20-bit data/out halves -->
+           <pb_type name="mem_1024x20_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+             <input name="addr1" num_pins="10" port_class="address1"/>
+             <input name="addr2" num_pins="10" port_class="address2"/>
+             <input name="data1" num_pins="20" port_class="data_in1"/>
+             <input name="data2" num_pins="20" port_class="data_in2"/>
+             <input name="we1" num_pins="1" port_class="write_en1"/>
+             <input name="we2" num_pins="1" port_class="write_en2"/>
+             <output name="out1" num_pins="20" port_class="data_out1"/>
+             <output name="out2" num_pins="20" port_class="data_out2"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <!-- Both ports share one clock and use identical setup / clock-to-Q values -->
+             <T_setup value="509e-12" port="mem_1024x20_dp.addr1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.data1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.we1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.addr2" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.data2" clock="clk"/>
+             <T_setup value="509e-12" port="mem_1024x20_dp.we2" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_1024x20_dp.out1" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_1024x20_dp.out2" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="17.9e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <!-- Port 1 uses memory.data[19:0] / memory.out[19:0]; port 2 uses [39:20] of each bus -->
+             <direct name="address1" input="memory.addr1[9:0]" output="mem_1024x20_dp.addr1">
+               <delay_constant max="132e-12" in_port="memory.addr1[9:0]" out_port="mem_1024x20_dp.addr1"/>
+             </direct>
+             <direct name="address2" input="memory.addr2[9:0]" output="mem_1024x20_dp.addr2">
+               <delay_constant max="132e-12" in_port="memory.addr2[9:0]" out_port="mem_1024x20_dp.addr2"/>
+             </direct>
+             <direct name="data1" input="memory.data[19:0]" output="mem_1024x20_dp.data1">
+               <delay_constant max="132e-12" in_port="memory.data[19:0]" out_port="mem_1024x20_dp.data1"/>
+             </direct>
+             <direct name="data2" input="memory.data[39:20]" output="mem_1024x20_dp.data2">
+               <delay_constant max="132e-12" in_port="memory.data[39:20]" out_port="mem_1024x20_dp.data2"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_1024x20_dp.we1">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_1024x20_dp.we1"/>
+             </direct>
+             <direct name="writeen2" input="memory.we2" output="mem_1024x20_dp.we2">
+               <delay_constant max="132e-12" in_port="memory.we2" out_port="mem_1024x20_dp.we2"/>
+             </direct>
+             <direct name="dataout1" input="mem_1024x20_dp.out1" output="memory.out[19:0]">
+               <delay_constant max="40e-12" in_port="mem_1024x20_dp.out1" out_port="memory.out[19:0]"/>
+             </direct>
+             <direct name="dataout2" input="mem_1024x20_dp.out2" output="memory.out[39:20]">
+               <delay_constant max="40e-12" in_port="mem_1024x20_dp.out2" out_port="memory.out[39:20]"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_1024x20_dp.clk"/>
+           </interconnect>
+         </mode>
+
+         <mode name="mem_2048x10_dp">
+           <!-- 2048 words x 10 bits, dual port: two 11-bit addresses, two 10-bit data/out slices -->
+           <pb_type name="mem_2048x10_dp" blif_model=".subckt dual_port_ram" class="memory" num_pb="1">
+             <input name="addr1" num_pins="11" port_class="address1"/>
+             <input name="addr2" num_pins="11" port_class="address2"/>
+             <input name="data1" num_pins="10" port_class="data_in1"/>
+             <input name="data2" num_pins="10" port_class="data_in2"/>
+             <input name="we1" num_pins="1" port_class="write_en1"/>
+             <input name="we2" num_pins="1" port_class="write_en2"/>
+             <output name="out1" num_pins="10" port_class="data_out1"/>
+             <output name="out2" num_pins="10" port_class="data_out2"/>
+             <clock name="clk" num_pins="1" port_class="clock"/>
+             <!-- Both ports share one clock and use identical setup / clock-to-Q values -->
+             <T_setup value="509e-12" port="mem_2048x10_dp.addr1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.data1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.we1" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.addr2" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.data2" clock="clk"/>
+             <T_setup value="509e-12" port="mem_2048x10_dp.we2" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_2048x10_dp.out1" clock="clk"/>
+             <T_clock_to_Q max="1.234e-9" port="mem_2048x10_dp.out2" clock="clk"/>
+             <power method="pin-toggle">
+               <port name="clk" energy_per_toggle="17.9e-12"/>
+               <static_power power_per_instance="0.0"/>
+             </power>
+           </pb_type>
+           <interconnect>
+             <!-- Port 1 uses memory.data[9:0] / memory.out[9:0]; port 2 uses [19:10] of each bus -->
+             <direct name="address1" input="memory.addr1[10:0]" output="mem_2048x10_dp.addr1">
+               <delay_constant max="132e-12" in_port="memory.addr1[10:0]" out_port="mem_2048x10_dp.addr1"/>
+             </direct>
+             <direct name="address2" input="memory.addr2[10:0]" output="mem_2048x10_dp.addr2">
+               <delay_constant max="132e-12" in_port="memory.addr2[10:0]" out_port="mem_2048x10_dp.addr2"/>
+             </direct>
+             <direct name="data1" input="memory.data[9:0]" output="mem_2048x10_dp.data1">
+               <delay_constant max="132e-12" in_port="memory.data[9:0]" out_port="mem_2048x10_dp.data1"/>
+             </direct>
+             <direct name="data2" input="memory.data[19:10]" output="mem_2048x10_dp.data2">
+               <delay_constant max="132e-12" in_port="memory.data[19:10]" out_port="mem_2048x10_dp.data2"/>
+             </direct>
+             <direct name="writeen1" input="memory.we1" output="mem_2048x10_dp.we1">
+               <delay_constant max="132e-12" in_port="memory.we1" out_port="mem_2048x10_dp.we1"/>
+             </direct>
+             <direct name="writeen2" input="memory.we2" output="mem_2048x10_dp.we2">
+               <delay_constant max="132e-12" in_port="memory.we2" out_port="mem_2048x10_dp.we2"/>
+             </direct>
+             <direct name="dataout1" input="mem_2048x10_dp.out1" output="memory.out[9:0]">
+               <delay_constant max="40e-12" in_port="mem_2048x10_dp.out1" out_port="memory.out[9:0]"/>
+             </direct>
+             <direct name="dataout2" input="mem_2048x10_dp.out2" output="memory.out[19:10]">
+               <delay_constant max="40e-12" in_port="mem_2048x10_dp.out2" out_port="memory.out[19:10]"/>
+             </direct>
+             <direct name="clk" input="memory.clk" output="mem_2048x10_dp.clk"/>
+           </interconnect>
+         </mode>
+
+         <!-- Every input pin is driven by 15% of the tracks in a channel, every output pin is driven by 10% of the tracks in a channel -->
+         <fc in_type="frac" in_val="0.15" out_type="frac" out_val="0.10"/>
+         <pinlocations pattern="spread"/>
+
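+         <!-- NOTE(review): this comment used to read "Place this memory block every 8 columns from (and including) the second column", but no placement is specified inside this pb_type; confirm the actual column spacing against the grid layout section. -->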
+         <power method="sum-of-children"/>
+       </pb_type>
+       <!-- Define fracturable memory end -->
+
+
+    </complexblocklist>
+
+    <power>
+      <!-- Architecture-wide parameters for power estimation; see the VTR power documentation for units -->
+      <!-- Wire capacitance assumed for local (intra-block) interconnect -->
+      <local_interconnect C_wire="2.5e-10"/>
+      <!-- Transistor/cell sizing assumptions for multiplexers, flip-flops and LUTs
+           (presumably relative to a minimum-size device; confirm) -->
+      <mux_transistor_size mux_transistor_size="3"/>
+      <FF_size FF_size="4"/>
+      <LUT_transistor_size LUT_transistor_size="4"/> 
+    </power>
+
+    <clocks>
+      <!-- Clock network description: buffer size is left to "auto" and the clock wire capacitance is
+           given by C_wire (presumably consumed by power estimation; confirm) -->
+      <clock buffer_size="auto" C_wire="2.5e-10"/>
+    </clocks>
+</architecture>