odin: Added a function that splits a multiplier into AND gates
implementing partial products and a balanced addition tree for reducing the
partial products and producing the multiplication output.
diff --git a/ODIN_II/SRC/multipliers.cpp b/ODIN_II/SRC/multipliers.cpp
index 242afc0..4b05282 100644
--- a/ODIN_II/SRC/multipliers.cpp
+++ b/ODIN_II/SRC/multipliers.cpp
@@ -53,6 +53,16 @@
 void split_multiplier_a(nnode_t *node, int a0, int a1, int b);
 void split_multiplier_b(nnode_t *node, int a, int b1, int b0);
 void pad_multiplier(nnode_t *node, netlist_t *netlist);
+void split_soft_multiplier(nnode_t *node, netlist_t *netlist);
+
+// data structure representing a row of bits an adder tree
+struct AdderTreeRow {
+    // the shift of this row from the least significant bit of the multiplier output
+    int shift = 0;
+    // array representing the bits in the row, each bit is a node
+    // pointer and the index of this bit in this node output array.
+    std::vector<std::pair<nnode_t*, int>> bits;
+};
 
 /*---------------------------------------------------------------------------
  * (function: instantiate_simple_soft_multiplier )
@@ -652,7 +662,7 @@
  * (function: init_multiplier_adder)
  *
  * This function is used to initialize an adder that is within
- *	a split multiplier.
+ * a split multiplier or a multiplier addition tree.
  *-----------------------------------------------------------------------*/
 void init_multiplier_adder(nnode_t *node, nnode_t *parent, int a, int b) {
 
@@ -1090,8 +1100,6 @@
  *	fit into a basic hard multiplier block that exists on the FPGA.
  *	If the proper option is set, then it will be expanded as well
  *	to just use a fixed size hard multiplier.
- * Note: This function will ignore multipliers that are smaller than
- * the minimum hard multiplier size
  *-----------------------------------------------------------------------*/
 void iterate_multipliers(netlist_t *netlist)
 {
@@ -1171,11 +1179,226 @@
 			{
 				record_mult_distribution(node);
 			}
-		}
+		} else if (hard_adders) {
+            if (configuration.fixed_hard_multiplier != 0) {
+                split_soft_multiplier(node, netlist);
+            }
+        }
 	}
 	return;
 }
 
+/*---------------------------------------------------------------------------
+ * (function: split_soft_multiplier)
+ *
+ * This function splits the input multiplier (node) into partial products (AND gates) and
+ * adders, as shown below. The partial products starts with "I", and all the partial products
+ * generated are added together by implementing a balanced adder tree to produce the final product
+ * Sample 4x4 multiplier to help understand logic:
+ *
+ * 	    				A3 	A2	A1	A0
+ *	    				B3 	B2 	B1 	B0
+ *	   	-------------------------------
+ *	    				I03	I02 I01 I00
+ *	   	+         	I13	I12	I11	I10
+ *	    		I23	I22	I21	I20             Level 0
+ *  	+	I23	I22	I21 I20
+ *      -------------------------------
+ *  		    C4	C3	C2	C1  C0
+ * 	+	D4  D3	D2	D1  D0	I20             Level 1
+ *  	-------------------------------
+ *  	E5	E4  E3  E2	E1	E0	C0  I00     Level 2
+ *
+ *-------------------------------------------------------------------------*/
+void split_soft_multiplier(nnode_t *node, netlist_t *netlist) {
+
+	oassert(node->num_output_pins > 0);
+	oassert(node->num_input_pins > 0);
+	oassert(node->num_input_port_sizes == 2);
+	oassert(node->num_output_port_sizes == 1);
+
+	int multiplier_width = node->input_port_sizes[0];
+	int multiplicand_width = node->input_port_sizes[1];
+
+    // ODIN II doesn't work with multiplicand sizes of 1 since it assumes that the
+    // output of the multiplier is still the sum of the operands sizes. However, it
+    // should only be equal to the long operand since its an AND operation in this case.
+    // If this is fixed, this assert statement should be removed and the code will work properly
+    oassert(multiplicand_width > 1);
+
+    // number of adders in a balanced tree of the partial product rows
+    const int add_levels = std::ceil(std::log((double)multiplicand_width)/std::log(2.));
+
+    // data structure holding the rows of output pins to be added in each addition stage
+    // as well as the shift of each row from the position of the first output
+    std::vector<std::vector<AdderTreeRow>> addition_stages(add_levels+1);
+    // 2-D array of adders, indexed by the level of the adder in the tree and the adder id within the level
+    std::vector<std::vector<nnode_t *>> adders(add_levels);
+    // array holding the adder width at each level in the adder tree
+    std::vector<std::vector<int>> adder_widths(add_levels);
+
+    // 2-D array of partial products. [0..multiplicand_width][0..multiplier_width]
+    std::vector<std::vector<nnode_t *>> partial_products(multiplicand_width);
+
+    addition_stages[0].resize(multiplicand_width);
+	// initialize all the AND gates needed for the partial products
+	for (int i = 0; i < multiplicand_width; i++) {
+        std::vector<std::pair<nnode_t*, int>> pp_bits(multiplier_width);
+        // resize the ith row of the partial products
+		partial_products[i].resize(multiplier_width);
+		for (int j = 0; j < multiplier_width; j++) {
+			// create each one of the partial products
+			partial_products[i][j] = make_1port_logic_gate(LOGICAL_AND, 2, node, node->traverse_visited);
+            pp_bits[j] = {partial_products[i][j], 0};
+		}
+        // add the partial product rows the addition stages data structure
+        addition_stages[0][i] = {i, pp_bits};
+	}
+
+	// generate the connections to the AND gates that generates the partial products of the multiplication
+    for (int i = 0; i < multiplicand_width; i++) {
+        for (int j = 0; j < multiplier_width; j++) {
+			// hookup the multiplier bits to the AND gates
+			if (i == 0) {
+				// when connecting the input to an AND gate for the first time, remap the input
+				remap_pin_to_new_node(node->input_pins[j], partial_products[i][j], 1);
+			} else {
+				// this input was remapped before, copy from the AND gate input instead
+				add_input_pin_to_node(partial_products[i][j], copy_input_npin(partial_products[0][j]->input_pins[1]), 1);
+			}
+			// hookup the input multiplicand bits the AND gates
+			if (j == 0) {
+				// when connecting the input to an AND gate for the first time, remap the input
+				remap_pin_to_new_node(node->input_pins[i+node->input_port_sizes[0]], partial_products[i][j], 0);
+			} else {
+				// this input was remapped before, copy from the AND gate input instead
+				add_input_pin_to_node(partial_products[i][j], copy_input_npin(partial_products[i][0]->input_pins[0]), 0);
+			}
+		}
+	}
+
+    // iterate over all the levels of addition
+    for (size_t level = 0; level < adders.size(); level++) {
+        // the number of rows in the next stage is the ceiling of number of rows in this stage divided by 2
+        addition_stages[level+1].resize(std::ceil(addition_stages[level].size()/2.));
+        // the number of adders in this stage is the integer division of the number of rows in this stage
+        adder_widths[level].resize(addition_stages[level].size()/2);
+        adders[level].resize(addition_stages[level].size()/2);
+
+        // iterate over every two rows
+        for (size_t row = 0; row < addition_stages[level].size() - 1; row +=2) {
+            auto& first_row = addition_stages[level][row];
+            auto& second_row = addition_stages[level][row+1];
+            auto shift_difference = second_row.shift - first_row.shift;
+            auto add_id = row/2;
+
+            // get the widths of the adder, by finding the larger operand size
+            adder_widths[level][add_id] = std::max<int>(first_row.bits.size() - shift_difference, second_row.bits.size());
+            // first level of addition has a carry out that needs to be generated, so increase adder size by 1
+            if (level == 0) adder_widths[level][add_id]++;
+            // add one bit for carry out if that last bit of the addition is fed by both levels
+            // (was found to be the only case were a carry out will be needed in this multiplier adder tree)
+            if (first_row.bits.size() - shift_difference == second_row.bits.size()) adder_widths[level][add_id]++;
+
+            // initialize this adder
+            adders[level][add_id] = allocate_nnode();
+            init_multiplier_adder(adders[level][add_id], node, adder_widths[level][add_id], adder_widths[level][add_id]);
+            adders[level][add_id]->name = node_name(adders[level][add_id], node->name);
+
+            // initialize the output of this adder in the next stage
+            addition_stages[level+1][add_id].shift = first_row.shift;
+            addition_stages[level+1][add_id].bits.resize(shift_difference + adder_widths[level][add_id]);
+            // copy the bits that weren't fed to adders in the previous stage
+            for (int i = 0; i < shift_difference; i++) {
+                addition_stages[level+1][add_id].bits[i] = first_row.bits[i];
+            }
+            // copy adder output bits to their row in next stage
+            for (int i = 0; i < adder_widths[level][add_id]; i++) {
+                addition_stages[level+1][add_id].bits[i + shift_difference] = {adders[level][add_id], i};
+            }
+
+            // connect the bits in the rows to the adder inputs.
+            for (int bit = 0; bit < adder_widths[level][add_id]; bit++) {
+               // input port a of the adder
+               if (bit < first_row.bits.size() - shift_difference) {
+                  auto bit_a = first_row.bits[bit + shift_difference];
+                  connect_nodes(bit_a.first, bit_a.second, adders[level][add_id], bit);
+               } else {
+                  // connect additional inputs to gnd
+                  add_input_pin_to_node(adders[level][add_id], get_zero_pin(netlist), bit);
+               }
+               // input port b of the adder
+               if (bit < second_row.bits.size()) {
+                  connect_nodes(second_row.bits[bit].first, second_row.bits[bit].second, adders[level][add_id], bit + adder_widths[level][add_id]);
+               } else {
+                  // connect additional inputs to gnd
+                  add_input_pin_to_node(adders[level][add_id], get_zero_pin(netlist), bit + adder_widths[level][add_id]);
+               }
+            }
+        }
+
+        // if this level have odd number of rows copy the last row to the next level to be added later
+        if (addition_stages[level].size() % 2 == 1) {
+            addition_stages[level+1].back() = addition_stages[level].back();
+        }
+    }
+
+    // the size of the last stage of the adder tree should match the output size of the multiplier
+    oassert(addition_stages[add_levels][0].bits.size() == node->num_output_pins);
+
+    // Remap the outputs of the multiplier
+    for (size_t i = 0; i < addition_stages[add_levels][0].bits.size(); i++) {
+        auto output_bit = addition_stages[add_levels][0].bits[i];
+        remap_pin_to_new_node(node->output_pins[i], output_bit.first, output_bit.second);
+    }
+
+    // check that all connections and input/output remapping is done right
+    // meaning all the inputs and outputs of the multiplier that is splitted are nullptrs
+    // and all inputs and outputs of the AND gates and adders are not nullptrs
+
+    // check that all the inputs/outputs of the multiplier are remapped
+    for (int i = 0; i < node->num_input_pins; i++) {
+        oassert(!node->input_pins[i]);
+    }
+    for (int i = 0; i < node->num_output_pins; i++) {
+        oassert(!node->output_pins[i]);
+    }
+
+    // check that all the partial product gates have nets connected to their inputs/outputs
+    for (size_t ilevel = 0; ilevel < partial_products.size(); ilevel++) {
+        for (size_t depth = 0; depth < partial_products[ilevel].size(); depth++) {
+            for (int i = 0; i < partial_products[ilevel][depth]->num_input_pins; i++) {
+                oassert(partial_products[ilevel][depth]->input_pins[i]);
+            }
+            for (int i = 0; i < partial_products[ilevel][depth]->num_output_pins; i++) {
+                oassert(partial_products[ilevel][depth]->output_pins[i]);
+            }
+        }
+    }
+
+    // check that all adders have nets connected to their inputs/outputs
+    for (size_t ilevel = 0; ilevel < adders.size(); ilevel++) {
+        for (size_t iadd = 0; iadd < adders[ilevel].size(); iadd++) {
+            for (int i = 0; i < adders[ilevel][iadd]->num_input_pins; i++) {
+                oassert(adders[ilevel][iadd]->input_pins[i]);
+            }
+            for (int i = 0; i < adders[ilevel][iadd]->num_output_pins; i++) {
+                oassert(adders[ilevel][iadd]->output_pins[i]);
+            }
+        }
+    }
+
+	// Probably more to do here in freeing the old node!
+	vtr::free(node->name);
+	vtr::free(node->input_port_sizes);
+	vtr::free(node->output_port_sizes);
+
+	// Free arrays NOT the pins since relocated!
+	vtr::free(node->input_pins);
+	vtr::free(node->output_pins);
+	vtr::free(node);
+}
+
 /*-------------------------------------------------------------------------
  * (function: clean_multipliers)
  *
diff --git a/ODIN_II/SRC/partial_map.cpp b/ODIN_II/SRC/partial_map.cpp
index d6fdd41..9b54719 100644
--- a/ODIN_II/SRC/partial_map.cpp
+++ b/ODIN_II/SRC/partial_map.cpp
@@ -258,7 +258,7 @@
             int mult_size = std::max<int>(node->input_port_sizes[0], node->input_port_sizes[1]);
 			if (hard_multipliers && mult_size >= min_mult) {
                 instantiate_hard_multiplier(node, traverse_number, netlist);
-            else {
+            } else if (!hard_adders) {
 				instantiate_simple_soft_multiplier(node, traverse_number, netlist);
 			}
 			break;