vpr/src/pack/pack.cpp - third_party/vtr-verilog-to-routing - Git at Google

 #include <cstdio>
 #include <cstring>
 #include <unordered_set>
 #include <unordered_map>
 #include <fstream>
 #include <stdlib.h>
 #include <sstream>

 #include "vtr_assert.h"
 #include "vtr_log.h"
 #include "vtr_math.h"

 #include "vpr_error.h"
 #include "vpr_types.h"

 #include "read_xml_arch_file.h"
 #include "globals.h"
 #include "atom_netlist.h"
 #include "prepack.h"
 #include "pack_types.h"
 #include "pack.h"
 #include "read_blif.h"
 #include "cluster.h"
 #include "SetupGrid.h"

 /* #define DUMP_PB_GRAPH 1 */
 /* #define DUMP_BLIF_INPUT 1 */

 /* Forward declarations of the helpers defined further down in this file. */

 /* Collects the nets attached to block clock pins; global_clocks only gates a warning. */
 static std::unordered_set<AtomNetId> alloc_and_load_is_clock(bool global_clocks);
 /* Builds a device grid for the given block counts and reports whether every block type fits. */
 static bool try_size_device_grid(const t_arch& arch, const std::map<t_logical_block_type_ptr, size_t>& num_type_instances, float target_device_utilization, std::string device_layout_name);

 /* Parsing / pretty-printing of the external pin utilization targets. */
 static t_ext_pin_util_targets parse_target_external_pin_util(std::vector<std::string> specs);
 static std::string target_external_pin_util_to_string(const t_ext_pin_util_targets& ext_pin_utils);

 /* Parsing / pretty-printing of the packer high fanout thresholds. */
 static t_pack_high_fanout_thresholds parse_high_fanout_thresholds(std::vector<std::string> specs);
 static std::string high_fanout_thresholds_to_string(const t_pack_high_fanout_thresholds& hf_thresholds);

 /**
  * @brief Packs the atom netlist into clusters and checks that the result fits on a device.
  *
  * Steps, in order:
  *   1. Count the architecture models and collect the clock nets.
  *   2. Prepack: build the pack patterns and the molecules derived from them.
  *   3. Cluster (do_clustering) and size a device grid for the result. If the
  *      first attempt does not fit, the netlist is re-clustered once, with
  *      unrelated clustering and/or block type utilization balancing switched
  *      on when the corresponding option is AUTO. A second failure is fatal.
  *   4. Free the pack patterns and the molecule list.
  *
  * @param packer_opts        Packer options. Mutated: inter_cluster_net_delay is
  *                           overwritten with interc_delay when
  *                           auto_compute_inter_cluster_net_delay is set.
  * @param analysis_opts      Forwarded to do_clustering().
  * @param arch               Forwarded to do_clustering() and to the device sizing step.
  * @param user_models        Linked list of user models (only counted here).
  * @param library_models     Linked list of library models (only counted here).
  * @param interc_delay       Inter-cluster delay used when auto-compute is enabled.
  * @param lb_type_rr_graphs  Forwarded to do_clustering().
  * @return Always true; a netlist that cannot be fitted is reported via VPR_FATAL_ERROR.
  */
 bool try_pack(t_packer_opts* packer_opts,
               const t_analysis_opts* analysis_opts,
               const t_arch* arch,
               const t_model* user_models,
               const t_model* library_models,
               float interc_delay,
               std::vector<t_lb_type_rr_node>* lb_type_rr_graphs) {
     VTR_LOG("Begin packing '%s'.\n", packer_opts->blif_file_name.c_str());

     /* Determine the number of models in the architecture (user + library lists) */
     int num_models = 0;
     for (const t_model* model = user_models; model != nullptr; model = model->next) {
         ++num_models;
     }
     for (const t_model* model = library_models; model != nullptr; model = model->next) {
         ++num_models;
     }

     std::unordered_set<AtomNetId> is_clock = alloc_and_load_is_clock(packer_opts->global_clocks);

     /* Report the size of the netlist that is about to be packed */
     auto& atom_ctx = g_vpr_ctx.atom();

     size_t num_p_inputs = 0;
     size_t num_p_outputs = 0;
     for (auto blk_id : atom_ctx.nlist.blocks()) {
         auto type = atom_ctx.nlist.block_type(blk_id);
         if (type == AtomBlockType::INPAD) {
             ++num_p_inputs;
         } else if (type == AtomBlockType::OUTPAD) {
             ++num_p_outputs;
         }
     }

     VTR_LOG("\n");
     VTR_LOG("After removing unused inputs...\n");
     VTR_LOG("\ttotal blocks: %zu, total nets: %zu, total inputs: %zu, total outputs: %zu\n",
             atom_ctx.nlist.blocks().size(), atom_ctx.nlist.nets().size(), num_p_inputs, num_p_outputs);

     /* Prepacking: both lookup tables below are handed to
      * alloc_and_load_pack_molecules() and later to do_clustering(). */
     std::multimap<AtomBlockId, t_pack_molecule*> atom_molecules;                     //The molecules associated with each atom block
     std::unordered_map<AtomBlockId, t_pb_graph_node*> expected_lowest_cost_pb_gnode; //A pb graph node associated with each atom block

     VTR_LOG("Begin prepacking.\n");
     int num_packing_patterns = 0;
     t_pack_patterns* list_of_packing_patterns = alloc_and_load_pack_patterns(&num_packing_patterns);
     t_pack_molecule* list_of_pack_molecules = alloc_and_load_pack_molecules(list_of_packing_patterns,
                                                                             atom_molecules,
                                                                             expected_lowest_cost_pb_gnode,
                                                                             num_packing_patterns);
     VTR_LOG("Finish prepacking.\n");

     if (packer_opts->auto_compute_inter_cluster_net_delay) {
         packer_opts->inter_cluster_net_delay = interc_delay;
         VTR_LOG("Using inter-cluster delay: %g\n", packer_opts->inter_cluster_net_delay);
     }

     t_ext_pin_util_targets target_external_pin_util = parse_target_external_pin_util(packer_opts->target_external_pin_util);
     t_pack_high_fanout_thresholds high_fanout_thresholds = parse_high_fanout_thresholds(packer_opts->high_fanout_threshold);

     VTR_LOG("Packing with pin utilization targets: %s\n", target_external_pin_util_to_string(target_external_pin_util).c_str());
     VTR_LOG("Packing with high fanout thresholds: %s\n", high_fanout_thresholds_to_string(high_fanout_thresholds).c_str());

     //Both options start enabled only when explicitly ON; OFF and AUTO start disabled
     //(AUTO may be switched on below if the first packing attempt does not fit).
     bool allow_unrelated_clustering = (packer_opts->allow_unrelated_clustering == e_unrelated_clustering::ON);
     bool balance_block_type_util = (packer_opts->balance_block_type_utilization == e_balance_block_type_util::ON);

     int pack_iteration = 1;

     while (true) {
         //Cluster the netlist
         auto num_type_instances = do_clustering(
             *packer_opts,
             *analysis_opts,
             arch, list_of_pack_molecules, num_models,
             is_clock,
             atom_molecules,
             expected_lowest_cost_pb_gnode,
             allow_unrelated_clustering,
             balance_block_type_util,
             lb_type_rr_graphs,
             target_external_pin_util,
             high_fanout_thresholds);

         //Try to size/find a device
         bool fits_on_device = try_size_device_grid(*arch, num_type_instances, packer_opts->target_device_utilization, packer_opts->device_layout);

         if (fits_on_device) {
             break; //Done
         } else if (pack_iteration == 1) {
             //1st pack attempt was unsuccessful (i.e. not dense enough).
             //
             //Turn on whichever density-increasing options we have control of (AUTO).
             //NOTE(review): the re-pack below is attempted even when neither option
             //is AUTO, i.e. with unchanged settings.
             if (packer_opts->allow_unrelated_clustering == e_unrelated_clustering::AUTO) {
                 VTR_ASSERT(allow_unrelated_clustering == false);
                 allow_unrelated_clustering = true;
             }
             if (packer_opts->balance_block_type_utilization == e_balance_block_type_util::AUTO) {
                 VTR_ASSERT(balance_block_type_util == false);
                 balance_block_type_util = true;
             }
             VTR_LOG("Packing failed to fit on device. Re-packing with: unrelated_logic_clustering=%s balance_block_type_util=%s\n",
                     (allow_unrelated_clustering ? "true" : "false"),
                     (balance_block_type_util ? "true" : "false"));
         } else {
             //Unable to pack densely enough: Give Up
             //
             //No suitable device found; list what was required next to what the
             //device context's grid provides for each block type.
             std::string resource_reqs;
             std::string resource_avail;
             auto& grid = g_vpr_ctx.device().grid;
             for (const auto& type_count : num_type_instances) {
                 //Both strings grow in lock-step, so one emptiness check covers both separators
                 if (!resource_reqs.empty()) {
                     resource_reqs += ", ";
                     resource_avail += ", ";
                 }

                 resource_reqs += std::string(type_count.first->name) + ": " + std::to_string(type_count.second);
                 resource_avail += std::string(type_count.first->name) + ": " + std::to_string(grid.num_instances(physical_tile_type(type_count.first)));
             }

             VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Failed to find device which satisfies resource requirements required: %s (available %s)", resource_reqs.c_str(), resource_avail.c_str());
         }

         //Reset clustering for re-packing
         g_vpr_ctx.mutable_clustering().clb_nlist = ClusteredNetlist();
         for (auto blk : g_vpr_ctx.atom().nlist.blocks()) {
             g_vpr_ctx.mutable_atom().lookup.set_atom_clb(blk, ClusterBlockId::INVALID());
             g_vpr_ctx.mutable_atom().lookup.set_atom_pb(blk, nullptr);
         }
         for (auto net : g_vpr_ctx.atom().nlist.nets()) {
             g_vpr_ctx.mutable_atom().lookup.set_atom_clb_net(net, ClusterNetId::INVALID());
         }

         ++pack_iteration;
     }

     /* Free the pack patterns */
     free_list_of_pack_patterns(list_of_packing_patterns, num_packing_patterns);

     /* Free the linked list of pack molecules */
     while (list_of_pack_molecules != nullptr) {
         t_pack_molecule* next_molecule = list_of_pack_molecules->next;
         delete list_of_pack_molecules;
         list_of_pack_molecules = next_molecule;
     }

     VTR_LOG("\n");
     VTR_LOG("Netlist conversion complete.\n");
     VTR_LOG("\n");

     return true;
 }

 /**
  * @brief Fetches delay, resistance and output capacitance of an architecture switch.
  *
  * Used to calculate inter-cluster net delay. The intrinsic delay may depend on
  * the fan-in of the switch: if the switch's delay map from the architecture
  * file has multiple (#inputs, delay) entries, the delay at 'switch_fanin' is
  * interpolated/extrapolated.
  *
  * @param switch_index  Index of the switch in the device context's arch_switch_inf.
  * @param switch_fanin  Fan-in at which the intrinsic delay is evaluated.
  * @param Tdel_switch   [out] Intrinsic (unloaded) delay of the switch.
  * @param R_switch      [out] Resistance of the switch.
  * @param Cout_switch   [out] Output capacitance of the switch.
  * @return Total delay through the loaded switch.
  */
 float get_arch_switch_info(short switch_index, int switch_fanin, float& Tdel_switch, float& R_switch, float& Cout_switch) {
     //Look the switch up once and read all three quantities from the same entry
     auto& arch_switch = g_vpr_ctx.device().arch_switch_inf[switch_index];

     Tdel_switch = arch_switch.Tdel(switch_fanin);
     R_switch = arch_switch.R;
     Cout_switch = arch_switch.Cout;

     //Loaded delay = intrinsic (unloaded) delay + R * Cout
     return Tdel_switch + R_switch * Cout_switch;
 }

 std::unordered_set<AtomNetId> alloc_and_load_is_clock(bool global_clocks) {
     /* Looks through all the atom blocks to find and mark all the clocks, by setting
      * the corresponding entry by adding the clock to is_clock.
      * global_clocks is used
      * only for an error check.                                                */

     int num_clocks = 0;
     std::unordered_set<AtomNetId> is_clock;

     /* Want to identify all the clock nets.  */
     auto& atom_ctx = g_vpr_ctx.atom();

     for (auto blk_id : atom_ctx.nlist.blocks()) {
         for (auto pin_id : atom_ctx.nlist.block_clock_pins(blk_id)) {
             auto net_id = atom_ctx.nlist.pin_net(pin_id);
             if (!is_clock.count(net_id)) {
                 is_clock.insert(net_id);
                 num_clocks++;
             }
         }
     }

     /* If we have multiple clocks and we're supposed to declare them global, *
      * print a warning message, since it looks like this circuit may have    *
      * locally generated clocks.                                             */

     if (num_clocks > 1 && global_clocks) {
         VTR_LOG_WARN(
             "All %d clocks will be treated as global.\n", num_clocks);
     }

     return (is_clock);
 }

 /**
  * @brief Builds a device grid for the clustered netlist and reports whether it fits.
  *
  * Logs the size of the grid that was built, the overall device utilization
  * and the utilization of every logical block type present in
  * num_type_instances.
  *
  * @param arch                       Architecture; its grid_layouts are passed to create_device_grid().
  * @param num_type_instances         Number of clusters required per logical block type.
  * @param target_device_utilization  Passed to create_device_grid() and echoed in the log.
  * @param device_layout_name         Layout name passed to create_device_grid().
  * @return false if any block type has a utilization above 1.0, true otherwise.
  */
 static bool try_size_device_grid(const t_arch& arch, const std::map<t_logical_block_type_ptr, size_t>& num_type_instances, float target_device_utilization, std::string device_layout_name) {
     //Read-only access is sufficient: nothing in the device context is modified here
     const auto& device_ctx = g_vpr_ctx.device();

     //Build the device
     auto grid = create_device_grid(device_layout_name, arch.grid_layouts, num_type_instances, target_device_utilization);

     /*
      *Report on the device
      */
     VTR_LOG("FPGA sized to %zu x %zu (%s)\n", grid.width(), grid.height(), grid.name().c_str());

     float device_utilization = calculate_device_utilization(grid, num_type_instances);
     VTR_LOG("Device Utilization: %.2f (target %.2f)\n", device_utilization, target_device_utilization);

     bool fits_on_device = true;
     for (const auto& type : device_ctx.logical_block_types) {
         auto physical_type = physical_tile_type(&type);
         auto itr = num_type_instances.find(&type);
         if (itr == num_type_instances.end()) continue; //No clusters of this type

         //NOTE(review): availability is read from the device context's grid, not
         //from the 'grid' built above -- confirm the two are meant to agree.
         const auto num_available = device_ctx.grid.num_instances(physical_type);

         float num_instances = itr->second;
         float util = 0.;
         if (num_available != 0) {
             util = num_instances / num_available;
         }

         if (util > 1.) {
             fits_on_device = false;
         }
         VTR_LOG("\tBlock Utilization: %.2f Type: %s\n", util, type.name);
     }
     VTR_LOG("\n");

     return fits_on_device;
 }

 /**
  * @brief Converts the external pin utilization option strings into targets.
  *
  * Accepted forms:
  *   - the single spec "auto": targets are inferred (see below);
  *   - otherwise each spec is "[<block_type>:]<input_util>[,<output_util>]".
  *     A spec without a block type sets the default for all block types. An
  *     omitted output utilization stays at 1.0. Values must lie in [0.0, 1.0].
  *
  * Raises a VPR_ERROR_PACK fatal error on a malformed spec, an out-of-range
  * value, more than one default, or more than one spec for a block type.
  *
  * @param specs  Option values as given by the user.
  * @return The parsed (or inferred) pin utilization targets.
  */
 static t_ext_pin_util_targets parse_target_external_pin_util(std::vector<std::string> specs) {
     t_ext_pin_util_targets targets(1., 1.);

     if (specs.size() == 1 && specs[0] == "auto") {
         //No user-specified pin utilizations, infer them automatically.
         //
         //We set a pin utilization target based on the block type, with
         //the logic block having a lower utilization target and other blocks
         //(e.g. hard blocks) having no limit.

         auto& device_ctx = g_vpr_ctx.device();
         auto& grid = device_ctx.grid;
         t_logical_block_type_ptr logic_block_type = infer_logic_block_type(grid);

         //Allowing 100% pin utilization of the logic block type can harm
         //routability, since it may allow a few (typically outlier) clusters to
         //use a very large number of pins -- causing routability issues. These
         //clusters can cause failed routings where only a handful of routing
         //resource nodes remain overused (and do not resolve). These can be
         //avoided by putting a (soft) limit on the number of input pins which
         //can be used, effectively clipping off the most egregious outliers.
         //
         //Experiments show that limiting input utilization produces better quality
         //than limiting output utilization (limiting input utilization implicitly
         //also limits output utilization).
         //
         //For relatively high pin utilizations (e.g. > 70%) this has little-to-no
         //impact on the number of clusters required. As a result we set a default
         //input pin utilization target which is high, but less than 100%.
         if (logic_block_type != nullptr) {
             constexpr float LOGIC_BLOCK_TYPE_AUTO_INPUT_UTIL = 0.8;
             constexpr float LOGIC_BLOCK_TYPE_AUTO_OUTPUT_UTIL = 1.0;

             t_ext_pin_util logic_block_ext_pin_util(LOGIC_BLOCK_TYPE_AUTO_INPUT_UTIL, LOGIC_BLOCK_TYPE_AUTO_OUTPUT_UTIL);

             targets.set_block_pin_util(logic_block_type->name, logic_block_ext_pin_util);
         } else {
             VTR_LOG_WARN("Unable to identify logic block type to apply default pin utilization targets to; this may result in denser packing than desired\n");
         }

     } else {
         //Process user specified overrides
         //
         //Error text is always passed as an argument to a fixed format string:
         //the specs come from the user and may contain '%'.

         bool default_set = false;
         std::set<std::string> seen_block_types;

         for (const auto& spec : specs) {
             t_ext_pin_util target_ext_pin_util(1., 1.);

             //Split off the optional '<block_type>:' prefix
             auto block_values = vtr::split(spec, ":");
             std::string block_type;
             std::string values;
             if (block_values.size() == 2) {
                 block_type = block_values[0];
                 values = block_values[1];
             } else if (block_values.size() == 1) {
                 values = block_values[0];
             } else {
                 VPR_FATAL_ERROR(VPR_ERROR_PACK,
                                 "Invalid block pin utilization specification '%s' (expected at most one ':' between block name and values)",
                                 spec.c_str());
             }

             //One value sets the input utilization only, two set input and output
             auto elements = vtr::split(values, ",");
             if (elements.size() == 1) {
                 target_ext_pin_util.input_pin_util = vtr::atof(elements[0]);
             } else if (elements.size() == 2) {
                 target_ext_pin_util.input_pin_util = vtr::atof(elements[0]);
                 target_ext_pin_util.output_pin_util = vtr::atof(elements[1]);
             } else {
                 VPR_FATAL_ERROR(VPR_ERROR_PACK,
                                 "Invalid conversion from '%s' to external pin util (expected either a single float value, or two float values separated by a comma)",
                                 spec.c_str());
             }

             //Written as a negated in-range test so that NaN is rejected as well
             if (!(target_ext_pin_util.input_pin_util >= 0. && target_ext_pin_util.input_pin_util <= 1.)) {
                 VPR_FATAL_ERROR(VPR_ERROR_PACK,
                                 "Out of range target input pin utilization '%g' (expected within range [0.0, 1.0])",
                                 target_ext_pin_util.input_pin_util);
             }
             if (!(target_ext_pin_util.output_pin_util >= 0. && target_ext_pin_util.output_pin_util <= 1.)) {
                 VPR_FATAL_ERROR(VPR_ERROR_PACK,
                                 "Out of range target output pin utilization '%g' (expected within range [0.0, 1.0])",
                                 target_ext_pin_util.output_pin_util);
             }

             if (block_type.empty()) {
                 //Default value
                 if (default_set) {
                     VPR_FATAL_ERROR(VPR_ERROR_PACK, "Only one default pin utilization should be specified");
                 }
                 targets.set_default_pin_util(target_ext_pin_util);
                 default_set = true;
             } else {
                 if (seen_block_types.count(block_type)) {
                     VPR_FATAL_ERROR(VPR_ERROR_PACK,
                                     "Only one pin utilization should be specified for block type '%s'",
                                     block_type.c_str());
                 }

                 targets.set_block_pin_util(block_type, target_ext_pin_util);
                 seen_block_types.insert(block_type);
             }
         }
     }

     return targets;
 }

 /**
  * @brief Formats the pin utilization targets as "<type>:<input>,<output>" entries.
  *
  * One entry is produced for every non-empty physical tile type of the device,
  * in device order, separated by single spaces.
  *
  * @param ext_pin_utils  Targets to print.
  * @return The formatted string (empty if there are no non-empty tile types).
  */
 static std::string target_external_pin_util_to_string(const t_ext_pin_util_targets& ext_pin_utils) {
     std::stringstream ss;

     auto& device_ctx = g_vpr_ctx.device();

     //The separator is written *before* every entry except the first one that is
     //actually printed, so skipped (empty) types never leave a stray space behind.
     bool first_entry = true;
     for (const auto& tile_type : device_ctx.physical_tile_types) {
         if (is_empty_type(&tile_type)) continue;

         if (!first_entry) {
             ss << " ";
         }
         first_entry = false;

         auto blk_name = tile_type.name;
         auto pin_util = ext_pin_utils.get_pin_util(blk_name);

         ss << blk_name << ":";
         ss << pin_util.input_pin_util << ',' << pin_util.output_pin_util;
     }

     return ss.str();
 }

 /**
  * @brief Converts the high fanout threshold option strings into thresholds.
  *
  * Accepted forms:
  *   - the single spec "auto": the logic block type (if it can be inferred)
  *     gets a lower threshold than the default of 128;
  *   - otherwise each spec is "[<block_type>:]<threshold>". A spec without a
  *     block type sets the default threshold.
  *
  * Raises a VPR_ERROR_PACK fatal error on a malformed spec, more than one
  * default, or more than one spec for a block type.
  *
  * @param specs  Option values as given by the user.
  * @return The parsed (or inferred) thresholds.
  */
 static t_pack_high_fanout_thresholds parse_high_fanout_thresholds(std::vector<std::string> specs) {
     t_pack_high_fanout_thresholds high_fanout_thresholds(128);

     if (specs.size() == 1 && specs[0] == "auto") {
         //No user-specified high fanout thresholds, infer them automatically.
         //
         //We set the high fanout threshold based on the block type, with
         //the logic block having a lower threshold than other blocks.
         //(Since logic blocks are the ones which tend to be too densely
         //clustered.)

         auto& device_ctx = g_vpr_ctx.device();
         auto& grid = device_ctx.grid;
         t_logical_block_type_ptr logic_block_type = infer_logic_block_type(grid);

         if (logic_block_type != nullptr) {
             //Thresholds are integral (cf. the int parsed from user specs below)
             constexpr int LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD = 32;

             high_fanout_thresholds.set(logic_block_type->name, LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD);
         } else {
             VTR_LOG_WARN("Unable to identify logic block type to apply default packer high fanout thresholds; this may result in denser packing than desired\n");
         }
     } else {
         //Process user specified overrides
         //
         //Error text is always passed as an argument to a fixed format string:
         //the specs come from the user and may contain '%'.

         bool default_set = false;
         std::set<std::string> seen_block_types;

         for (const auto& spec : specs) {
             //Split off the optional '<block_type>:' prefix
             auto block_values = vtr::split(spec, ":");
             std::string block_type;
             std::string value;
             if (block_values.size() == 1) {
                 value = block_values[0];
             } else if (block_values.size() == 2) {
                 block_type = block_values[0];
                 value = block_values[1];
             } else {
                 VPR_FATAL_ERROR(VPR_ERROR_PACK,
                                 "Invalid block high fanout threshold specification '%s' (expected at most one ':' between block name and value)",
                                 spec.c_str());
             }

             int threshold = vtr::atoi(value);

             if (block_type.empty()) {
                 //Default value
                 if (default_set) {
                     VPR_FATAL_ERROR(VPR_ERROR_PACK, "Only one default high fanout threshold should be specified");
                 }
                 high_fanout_thresholds.set_default(threshold);
                 default_set = true;
             } else {
                 if (seen_block_types.count(block_type)) {
                     VPR_FATAL_ERROR(VPR_ERROR_PACK,
                                     "Only one high fanout threshold should be specified for block type '%s'",
                                     block_type.c_str());
                 }

                 high_fanout_thresholds.set(block_type, threshold);
                 seen_block_types.insert(block_type);
             }
         }
     }

     return high_fanout_thresholds;
 }

 /**
  * @brief Formats the high fanout thresholds as "<type>:<threshold>" entries.
  *
  * One entry is produced for every non-empty physical tile type of the device,
  * in device order, separated by single spaces.
  *
  * @param hf_thresholds  Thresholds to print.
  * @return The formatted string (empty if there are no non-empty tile types).
  */
 static std::string high_fanout_thresholds_to_string(const t_pack_high_fanout_thresholds& hf_thresholds) {
     std::stringstream ss;

     auto& device_ctx = g_vpr_ctx.device();

     //The separator is written *before* every entry except the first one that is
     //actually printed, so skipped (empty) types never leave a stray space behind.
     bool first_entry = true;
     for (const auto& tile_type : device_ctx.physical_tile_types) {
         if (is_empty_type(&tile_type)) continue;

         if (!first_entry) {
             ss << " ";
         }
         first_entry = false;

         auto blk_name = tile_type.name;
         auto threshold = hf_thresholds.get_threshold(blk_name);

         ss << blk_name << ":";
         ss << threshold;
     }

     return ss.str();
 }
	#include <cstdio>
	#include <cstring>
	#include <unordered_set>
	#include <unordered_map>
	#include <fstream>
	#include <stdlib.h>
	#include <sstream>

	#include "vtr_assert.h"
	#include "vtr_log.h"
	#include "vtr_math.h"

	#include "vpr_error.h"
	#include "vpr_types.h"

	#include "read_xml_arch_file.h"
	#include "globals.h"
	#include "atom_netlist.h"
	#include "prepack.h"
	#include "pack_types.h"
	#include "pack.h"
	#include "read_blif.h"
	#include "cluster.h"
	#include "SetupGrid.h"

	/* #define DUMP_PB_GRAPH 1 */
	/* #define DUMP_BLIF_INPUT 1 */

	static std::unordered_set<AtomNetId> alloc_and_load_is_clock(bool global_clocks);
	static bool try_size_device_grid(const t_arch& arch, const std::map<t_logical_block_type_ptr, size_t>& num_type_instances, float target_device_utilization, std::string device_layout_name);

	static t_ext_pin_util_targets parse_target_external_pin_util(std::vector<std::string> specs);
	static std::string target_external_pin_util_to_string(const t_ext_pin_util_targets& ext_pin_utils);

	static t_pack_high_fanout_thresholds parse_high_fanout_thresholds(std::vector<std::string> specs);
	static std::string high_fanout_thresholds_to_string(const t_pack_high_fanout_thresholds& hf_thresholds);

	/**
	 * @brief Packs the atom netlist into clusters and checks that the result fits on a device.
	 *
	 * Steps, in order:
	 *   1. Count the architecture models and collect the clock nets.
	 *   2. Prepack: build the pack patterns and the molecules derived from them.
	 *   3. Cluster (do_clustering) and size a device grid for the result. If the
	 *      first attempt does not fit, the netlist is re-clustered once, with
	 *      unrelated clustering and/or block type utilization balancing switched
	 *      on when the corresponding option is AUTO. A second failure is fatal.
	 *   4. Free the pack patterns and the molecule list.
	 *
	 * @param packer_opts        Packer options. Mutated: inter_cluster_net_delay is
	 *                           overwritten with interc_delay when
	 *                           auto_compute_inter_cluster_net_delay is set.
	 * @param analysis_opts      Forwarded to do_clustering().
	 * @param arch               Forwarded to do_clustering() and to the device sizing step.
	 * @param user_models        Linked list of user models (only counted here).
	 * @param library_models     Linked list of library models (only counted here).
	 * @param interc_delay       Inter-cluster delay used when auto-compute is enabled.
	 * @param lb_type_rr_graphs  Forwarded to do_clustering().
	 * @return Always true; a netlist that cannot be fitted is reported via VPR_FATAL_ERROR.
	 */
	bool try_pack(t_packer_opts* packer_opts,
	              const t_analysis_opts* analysis_opts,
	              const t_arch* arch,
	              const t_model* user_models,
	              const t_model* library_models,
	              float interc_delay,
	              std::vector<t_lb_type_rr_node>* lb_type_rr_graphs) {
	    VTR_LOG("Begin packing '%s'.\n", packer_opts->blif_file_name.c_str());

	    /* Determine the number of models in the architecture (user + library lists) */
	    int num_models = 0;
	    for (const t_model* model = user_models; model != nullptr; model = model->next) {
	        ++num_models;
	    }
	    for (const t_model* model = library_models; model != nullptr; model = model->next) {
	        ++num_models;
	    }

	    std::unordered_set<AtomNetId> is_clock = alloc_and_load_is_clock(packer_opts->global_clocks);

	    /* Report the size of the netlist that is about to be packed */
	    auto& atom_ctx = g_vpr_ctx.atom();

	    size_t num_p_inputs = 0;
	    size_t num_p_outputs = 0;
	    for (auto blk_id : atom_ctx.nlist.blocks()) {
	        auto type = atom_ctx.nlist.block_type(blk_id);
	        if (type == AtomBlockType::INPAD) {
	            ++num_p_inputs;
	        } else if (type == AtomBlockType::OUTPAD) {
	            ++num_p_outputs;
	        }
	    }

	    VTR_LOG("\n");
	    VTR_LOG("After removing unused inputs...\n");
	    VTR_LOG("\ttotal blocks: %zu, total nets: %zu, total inputs: %zu, total outputs: %zu\n",
	            atom_ctx.nlist.blocks().size(), atom_ctx.nlist.nets().size(), num_p_inputs, num_p_outputs);

	    /* Prepacking: both lookup tables below are handed to
	     * alloc_and_load_pack_molecules() and later to do_clustering(). */
	    std::multimap<AtomBlockId, t_pack_molecule*> atom_molecules;                     //The molecules associated with each atom block
	    std::unordered_map<AtomBlockId, t_pb_graph_node*> expected_lowest_cost_pb_gnode; //A pb graph node associated with each atom block

	    VTR_LOG("Begin prepacking.\n");
	    int num_packing_patterns = 0;
	    t_pack_patterns* list_of_packing_patterns = alloc_and_load_pack_patterns(&num_packing_patterns);
	    //Head of a linked list of molecules (a pointer, walked via ->next and deleted below)
	    t_pack_molecule* list_of_pack_molecules = alloc_and_load_pack_molecules(list_of_packing_patterns,
	                                                                            atom_molecules,
	                                                                            expected_lowest_cost_pb_gnode,
	                                                                            num_packing_patterns);
	    VTR_LOG("Finish prepacking.\n");

	    if (packer_opts->auto_compute_inter_cluster_net_delay) {
	        packer_opts->inter_cluster_net_delay = interc_delay;
	        VTR_LOG("Using inter-cluster delay: %g\n", packer_opts->inter_cluster_net_delay);
	    }

	    t_ext_pin_util_targets target_external_pin_util = parse_target_external_pin_util(packer_opts->target_external_pin_util);
	    t_pack_high_fanout_thresholds high_fanout_thresholds = parse_high_fanout_thresholds(packer_opts->high_fanout_threshold);

	    VTR_LOG("Packing with pin utilization targets: %s\n", target_external_pin_util_to_string(target_external_pin_util).c_str());
	    VTR_LOG("Packing with high fanout thresholds: %s\n", high_fanout_thresholds_to_string(high_fanout_thresholds).c_str());

	    //Both options start enabled only when explicitly ON; OFF and AUTO start disabled
	    //(AUTO may be switched on below if the first packing attempt does not fit).
	    bool allow_unrelated_clustering = (packer_opts->allow_unrelated_clustering == e_unrelated_clustering::ON);
	    bool balance_block_type_util = (packer_opts->balance_block_type_utilization == e_balance_block_type_util::ON);

	    int pack_iteration = 1;

	    while (true) {
	        //Cluster the netlist
	        auto num_type_instances = do_clustering(
	            *packer_opts,
	            *analysis_opts,
	            arch, list_of_pack_molecules, num_models,
	            is_clock,
	            atom_molecules,
	            expected_lowest_cost_pb_gnode,
	            allow_unrelated_clustering,
	            balance_block_type_util,
	            lb_type_rr_graphs,
	            target_external_pin_util,
	            high_fanout_thresholds);

	        //Try to size/find a device
	        bool fits_on_device = try_size_device_grid(*arch, num_type_instances, packer_opts->target_device_utilization, packer_opts->device_layout);

	        if (fits_on_device) {
	            break; //Done
	        } else if (pack_iteration == 1) {
	            //1st pack attempt was unsuccessful (i.e. not dense enough).
	            //
	            //Turn on whichever density-increasing options we have control of (AUTO).
	            //NOTE(review): the re-pack below is attempted even when neither option
	            //is AUTO, i.e. with unchanged settings.
	            if (packer_opts->allow_unrelated_clustering == e_unrelated_clustering::AUTO) {
	                VTR_ASSERT(allow_unrelated_clustering == false);
	                allow_unrelated_clustering = true;
	            }
	            if (packer_opts->balance_block_type_utilization == e_balance_block_type_util::AUTO) {
	                VTR_ASSERT(balance_block_type_util == false);
	                balance_block_type_util = true;
	            }
	            VTR_LOG("Packing failed to fit on device. Re-packing with: unrelated_logic_clustering=%s balance_block_type_util=%s\n",
	                    (allow_unrelated_clustering ? "true" : "false"),
	                    (balance_block_type_util ? "true" : "false"));
	        } else {
	            //Unable to pack densely enough: Give Up
	            //
	            //No suitable device found; list what was required next to what the
	            //device context's grid provides for each block type.
	            std::string resource_reqs;
	            std::string resource_avail;
	            auto& grid = g_vpr_ctx.device().grid;
	            for (const auto& type_count : num_type_instances) {
	                //Both strings grow in lock-step, so one emptiness check covers both separators
	                if (!resource_reqs.empty()) {
	                    resource_reqs += ", ";
	                    resource_avail += ", ";
	                }

	                resource_reqs += std::string(type_count.first->name) + ": " + std::to_string(type_count.second);
	                resource_avail += std::string(type_count.first->name) + ": " + std::to_string(grid.num_instances(physical_tile_type(type_count.first)));
	            }

	            VPR_FATAL_ERROR(VPR_ERROR_OTHER, "Failed to find device which satisfies resource requirements required: %s (available %s)", resource_reqs.c_str(), resource_avail.c_str());
	        }

	        //Reset clustering for re-packing
	        g_vpr_ctx.mutable_clustering().clb_nlist = ClusteredNetlist();
	        for (auto blk : g_vpr_ctx.atom().nlist.blocks()) {
	            g_vpr_ctx.mutable_atom().lookup.set_atom_clb(blk, ClusterBlockId::INVALID());
	            g_vpr_ctx.mutable_atom().lookup.set_atom_pb(blk, nullptr);
	        }
	        for (auto net : g_vpr_ctx.atom().nlist.nets()) {
	            g_vpr_ctx.mutable_atom().lookup.set_atom_clb_net(net, ClusterNetId::INVALID());
	        }

	        ++pack_iteration;
	    }

	    /* Free the pack patterns */
	    free_list_of_pack_patterns(list_of_packing_patterns, num_packing_patterns);

	    /* Free the linked list of pack molecules */
	    while (list_of_pack_molecules != nullptr) {
	        t_pack_molecule* next_molecule = list_of_pack_molecules->next;
	        delete list_of_pack_molecules;
	        list_of_pack_molecules = next_molecule;
	    }

	    VTR_LOG("\n");
	    VTR_LOG("Netlist conversion complete.\n");
	    VTR_LOG("\n");

	    return true;
	}

	/**
	 * @brief Fetches delay, resistance and output capacitance of an architecture switch.
	 *
	 * Used to calculate inter-cluster net delay. The intrinsic delay may depend on
	 * the fan-in of the switch: if the switch's delay map from the architecture
	 * file has multiple (#inputs, delay) entries, the delay at 'switch_fanin' is
	 * interpolated/extrapolated.
	 *
	 * @param switch_index  Index of the switch in the device context's arch_switch_inf.
	 * @param switch_fanin  Fan-in at which the intrinsic delay is evaluated.
	 * @param Tdel_switch   [out] Intrinsic (unloaded) delay of the switch.
	 * @param R_switch      [out] Resistance of the switch.
	 * @param Cout_switch   [out] Output capacitance of the switch.
	 * @return Total delay through the loaded switch.
	 */
	float get_arch_switch_info(short switch_index, int switch_fanin, float& Tdel_switch, float& R_switch, float& Cout_switch) {
	    //Look the switch up once and read all three quantities from the same entry
	    auto& arch_switch = g_vpr_ctx.device().arch_switch_inf[switch_index];

	    Tdel_switch = arch_switch.Tdel(switch_fanin);
	    R_switch = arch_switch.R;
	    Cout_switch = arch_switch.Cout;

	    //Loaded delay = intrinsic (unloaded) delay + R * Cout
	    return Tdel_switch + R_switch * Cout_switch;
	}

	std::unordered_set<AtomNetId> alloc_and_load_is_clock(bool global_clocks) {
	/* Looks through all the atom blocks to find and mark all the clocks, by setting
	* the corresponding entry by adding the clock to is_clock.
	* global_clocks is used
	* only for an error check. */

	int num_clocks = 0;
	std::unordered_set<AtomNetId> is_clock;

	/* Want to identify all the clock nets. */
	auto& atom_ctx = g_vpr_ctx.atom();

	for (auto blk_id : atom_ctx.nlist.blocks()) {
	for (auto pin_id : atom_ctx.nlist.block_clock_pins(blk_id)) {
	auto net_id = atom_ctx.nlist.pin_net(pin_id);
	if (!is_clock.count(net_id)) {
	is_clock.insert(net_id);
	num_clocks++;
	}
	}
	}

	/* If we have multiple clocks and we're supposed to declare them global, *
	* print a warning message, since it looks like this circuit may have *
	* locally generated clocks. */

	if (num_clocks > 1 && global_clocks) {
	VTR_LOG_WARN(
	"All %d clocks will be treated as global.\n", num_clocks);
	}

	return (is_clock);
	}

	/**
	 * @brief Builds a device grid for the clustered netlist and reports whether it fits.
	 *
	 * Logs the size of the grid that was built, the overall device utilization
	 * and the utilization of every logical block type present in
	 * num_type_instances.
	 *
	 * @param arch                       Architecture; its grid_layouts are passed to create_device_grid().
	 * @param num_type_instances         Number of clusters required per logical block type.
	 * @param target_device_utilization  Passed to create_device_grid() and echoed in the log.
	 * @param device_layout_name         Layout name passed to create_device_grid().
	 * @return false if any block type has a utilization above 1.0, true otherwise.
	 */
	static bool try_size_device_grid(const t_arch& arch, const std::map<t_logical_block_type_ptr, size_t>& num_type_instances, float target_device_utilization, std::string device_layout_name) {
	    //Read-only access is sufficient: nothing in the device context is modified here
	    const auto& device_ctx = g_vpr_ctx.device();

	    //Build the device
	    auto grid = create_device_grid(device_layout_name, arch.grid_layouts, num_type_instances, target_device_utilization);

	    /*
	     *Report on the device
	     */
	    VTR_LOG("FPGA sized to %zu x %zu (%s)\n", grid.width(), grid.height(), grid.name().c_str());

	    float device_utilization = calculate_device_utilization(grid, num_type_instances);
	    VTR_LOG("Device Utilization: %.2f (target %.2f)\n", device_utilization, target_device_utilization);

	    bool fits_on_device = true;
	    for (const auto& type : device_ctx.logical_block_types) {
	        auto physical_type = physical_tile_type(&type);
	        auto itr = num_type_instances.find(&type);
	        if (itr == num_type_instances.end()) continue; //No clusters of this type

	        //NOTE(review): availability is read from the device context's grid, not
	        //from the 'grid' built above -- confirm the two are meant to agree.
	        const auto num_available = device_ctx.grid.num_instances(physical_type);

	        float num_instances = itr->second;
	        float util = 0.;
	        if (num_available != 0) {
	            util = num_instances / num_available;
	        }

	        if (util > 1.) {
	            fits_on_device = false;
	        }
	        VTR_LOG("\tBlock Utilization: %.2f Type: %s\n", util, type.name);
	    }
	    VTR_LOG("\n");

	    return fits_on_device;
	}

	/*
	 * Builds the packer's external pin utilization targets from the user-supplied
	 * specification strings.
	 *
	 * specs is either the single entry "auto", or a list of entries of the form
	 *     [<block_type>:]<input_util>[,<output_util>]
	 * An entry without a block type sets the default target; an entry with a block
	 * type sets the target for that block type only. A utilization which is not
	 * given stays at 1.0.
	 *
	 * With "auto", the inferred logic block type (if one is found) gets an input
	 * pin utilization target of 0.8 and an output target of 1.0; everything else
	 * keeps the 1.0/1.0 default.
	 *
	 * A fatal VPR_ERROR_PACK error is raised for a malformed entry, a utilization
	 * outside [0.0, 1.0], a repeated default, or a repeated block type.
	 */
	static t_ext_pin_util_targets parse_target_external_pin_util(std::vector<std::string> specs) {
	    t_ext_pin_util_targets targets(1., 1.);

	    if (specs.size() == 1 && specs[0] == "auto") {
	        //No user-specified pin utilizations, infer them automatically.
	        //
	        //We set a pin utilization target based on the block type, with
	        //the logic block having a lower utilization target and other blocks
	        //(e.g. hard blocks) having no limit.

	        auto& device_ctx = g_vpr_ctx.device();
	        auto& grid = device_ctx.grid;
	        t_logical_block_type_ptr logic_block_type = infer_logic_block_type(grid);

	        //Allowing 100% pin utilization of the logic block type can harm
	        //routability, since it may allow a few (typically outlier) clusters to
	        //use a very large number of pins -- causing routability issues. These
	        //clusters can cause failed routings where only a handful of routing
	        //resource nodes remain overused (and do not resolve). These can be
	        //avoided by putting a (soft) limit on the number of input pins which
	        //can be used, effectively clipping off the most egregious outliers.
	        //
	        //Experiments show that limiting input utilization produces better quality
	        //than limiting output utilization (limiting input utilization implicitly
	        //also limits output utilization).
	        //
	        //For relatively high pin utilizations (e.g. > 70%) this has little-to-no
	        //impact on the number of clusters required. As a result we set a default
	        //input pin utilization target which is high, but less than 100%.
	        if (logic_block_type != nullptr) {
	            constexpr float LOGIC_BLOCK_TYPE_AUTO_INPUT_UTIL = 0.8;
	            constexpr float LOGIC_BLOCK_TYPE_AUTO_OUTPUT_UTIL = 1.0;

	            t_ext_pin_util logic_block_ext_pin_util(LOGIC_BLOCK_TYPE_AUTO_INPUT_UTIL, LOGIC_BLOCK_TYPE_AUTO_OUTPUT_UTIL);

	            targets.set_block_pin_util(logic_block_type->name, logic_block_ext_pin_util);
	        } else {
	            VTR_LOG_WARN("Unable to identify logic block type to apply default pin utilization targets to; this may result in denser packing than desired\n");
	        }

	    } else {
	        //Process user specified overrides

	        bool default_set = false;
	        std::set<std::string> seen_block_types;

	        for (const auto& spec : specs) {
	            t_ext_pin_util target_ext_pin_util(1., 1.);

	            //Separate the optional block type from the utilization value(s)
	            auto block_values = vtr::split(spec, ":");
	            std::string block_type;
	            std::string values;
	            if (block_values.size() == 2) {
	                block_type = block_values[0];
	                values = block_values[1];
	            } else if (block_values.size() == 1) {
	                values = block_values[0];
	            } else {
	                std::stringstream msg;
	                msg << "Invalid block pin utilization specification '" << spec << "' (expected at most one ':' between block name and values)";
	                //The message contains user-provided text, so it must never be
	                //used as the format string itself.
	                VPR_FATAL_ERROR(VPR_ERROR_PACK, "%s", msg.str().c_str());
	            }

	            //A single value sets only the input utilization; two values set input then output
	            auto elements = vtr::split(values, ",");
	            if (elements.size() == 1) {
	                target_ext_pin_util.input_pin_util = vtr::atof(elements[0]);
	            } else if (elements.size() == 2) {
	                target_ext_pin_util.input_pin_util = vtr::atof(elements[0]);
	                target_ext_pin_util.output_pin_util = vtr::atof(elements[1]);
	            } else {
	                std::stringstream msg;
	                msg << "Invalid conversion from '" << spec << "' to external pin util (expected either a single float value, or two float values separated by a comma)";
	                VPR_FATAL_ERROR(VPR_ERROR_PACK, "%s", msg.str().c_str());
	            }

	            if (target_ext_pin_util.input_pin_util < 0. || target_ext_pin_util.input_pin_util > 1.) {
	                std::stringstream msg;
	                msg << "Out of range target input pin utilization '" << target_ext_pin_util.input_pin_util << "' (expected within range [0.0, 1.0])";
	                VPR_FATAL_ERROR(VPR_ERROR_PACK, "%s", msg.str().c_str());
	            }
	            if (target_ext_pin_util.output_pin_util < 0. || target_ext_pin_util.output_pin_util > 1.) {
	                std::stringstream msg;
	                msg << "Out of range target output pin utilization '" << target_ext_pin_util.output_pin_util << "' (expected within range [0.0, 1.0])";
	                VPR_FATAL_ERROR(VPR_ERROR_PACK, "%s", msg.str().c_str());
	            }

	            if (block_type.empty()) {
	                //Default value
	                if (default_set) {
	                    VPR_FATAL_ERROR(VPR_ERROR_PACK, "Only one default pin utilization should be specified");
	                }
	                targets.set_default_pin_util(target_ext_pin_util);
	                default_set = true;
	            } else {
	                if (seen_block_types.count(block_type)) {
	                    std::stringstream msg;
	                    msg << "Only one pin utilization should be specified for block type '" << block_type << "'";
	                    VPR_FATAL_ERROR(VPR_ERROR_PACK, "%s", msg.str().c_str());
	                }

	                targets.set_block_pin_util(block_type, target_ext_pin_util);
	                seen_block_types.insert(block_type);
	            }
	        }
	    }

	    return targets;
	}

	/*
	 * Formats the external pin utilization targets for every non-empty physical
	 * tile type as "<name>:<input_util>,<output_util>", with the entries separated
	 * by single spaces.
	 *
	 * The separator is written before every entry except the first one emitted, so
	 * skipped (empty) tile types never leave a leading or trailing space behind.
	 */
	static std::string target_external_pin_util_to_string(const t_ext_pin_util_targets& ext_pin_utils) {
	    std::stringstream ss;

	    auto& device_ctx = g_vpr_ctx.device();

	    bool first_entry = true;
	    for (const auto& tile_type : device_ctx.physical_tile_types) {
	        if (is_empty_type(&tile_type)) continue;

	        if (!first_entry) {
	            ss << " ";
	        }
	        first_entry = false;

	        const auto& blk_name = tile_type.name;
	        auto pin_util = ext_pin_utils.get_pin_util(blk_name);

	        ss << blk_name << ":";
	        ss << pin_util.input_pin_util << ',' << pin_util.output_pin_util;
	    }

	    return ss.str();
	}

	/*
	 * Builds the packer's high fanout thresholds from the user-supplied
	 * specification strings.
	 *
	 * specs is either the single entry "auto", or a list of entries of the form
	 *     [<block_type>:]<threshold>
	 * An entry without a block type sets the default threshold; an entry with a
	 * block type sets the threshold for that block type only. The default is 128
	 * unless overridden.
	 *
	 * With "auto", the inferred logic block type (if one is found) gets a
	 * threshold of 32 and everything else keeps the default.
	 *
	 * A fatal VPR_ERROR_PACK error is raised for an entry with more than one ':',
	 * a repeated default, or a repeated block type.
	 */
	static t_pack_high_fanout_thresholds parse_high_fanout_thresholds(std::vector<std::string> specs) {
	    t_pack_high_fanout_thresholds high_fanout_thresholds(128);

	    if (specs.size() == 1 && specs[0] == "auto") {
	        //No user-specified high fanout thresholds, infer them automatically.
	        //
	        //We set the high fanout threshold based on the block type, with
	        //the logic block having a lower threshold than other blocks.
	        //(Since logic blocks are the ones which tend to be too densely
	        //clustered.)

	        auto& device_ctx = g_vpr_ctx.device();
	        auto& grid = device_ctx.grid;
	        t_logical_block_type_ptr logic_block_type = infer_logic_block_type(grid);

	        if (logic_block_type != nullptr) {
	            //Thresholds are integral (see the default above and the user-specified path below)
	            constexpr int LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD = 32;

	            high_fanout_thresholds.set(logic_block_type->name, LOGIC_BLOCK_TYPE_HIGH_FANOUT_THRESHOLD);
	        } else {
	            VTR_LOG_WARN("Unable to identify logic block type to apply default packer high fanout thresholds; this may result in denser packing than desired\n");
	        }
	    } else {
	        //Process user specified overrides

	        bool default_set = false;
	        std::set<std::string> seen_block_types;

	        for (const auto& spec : specs) {
	            //Separate the optional block type from the threshold value
	            auto block_values = vtr::split(spec, ":");
	            std::string block_type;
	            std::string value;
	            if (block_values.size() == 1) {
	                value = block_values[0];
	            } else if (block_values.size() == 2) {
	                block_type = block_values[0];
	                value = block_values[1];
	            } else {
	                std::stringstream msg;
	                msg << "Invalid block high fanout threshold specification '" << spec << "' (expected at most one ':' between block name and value)";
	                //The message contains user-provided text, so it must never be
	                //used as the format string itself.
	                VPR_FATAL_ERROR(VPR_ERROR_PACK, "%s", msg.str().c_str());
	            }

	            int threshold = vtr::atoi(value);

	            if (block_type.empty()) {
	                //Default value
	                if (default_set) {
	                    VPR_FATAL_ERROR(VPR_ERROR_PACK, "Only one default high fanout threshold should be specified");
	                }
	                high_fanout_thresholds.set_default(threshold);
	                default_set = true;
	            } else {
	                if (seen_block_types.count(block_type)) {
	                    std::stringstream msg;
	                    msg << "Only one high fanout threshold should be specified for block type '" << block_type << "'";
	                    VPR_FATAL_ERROR(VPR_ERROR_PACK, "%s", msg.str().c_str());
	                }

	                high_fanout_thresholds.set(block_type, threshold);
	                seen_block_types.insert(block_type);
	            }
	        }
	    }

	    return high_fanout_thresholds;
	}

	/*
	 * Formats the high fanout threshold of every non-empty physical tile type as
	 * "<name>:<threshold>", with the entries separated by single spaces.
	 *
	 * The separator is written before every entry except the first one emitted, so
	 * skipped (empty) tile types never leave a leading or trailing space behind.
	 */
	static std::string high_fanout_thresholds_to_string(const t_pack_high_fanout_thresholds& hf_thresholds) {
	    std::stringstream ss;

	    auto& device_ctx = g_vpr_ctx.device();

	    bool first_entry = true;
	    for (const auto& tile_type : device_ctx.physical_tile_types) {
	        if (is_empty_type(&tile_type)) continue;

	        if (!first_entry) {
	            ss << " ";
	        }
	        first_entry = false;

	        const auto& blk_name = tile_type.name;
	        auto threshold = hf_thresholds.get_threshold(blk_name);

	        ss << blk_name << ":";
	        ss << threshold;
	    }

	    return ss.str();
	}