| /* | |
| * Main clustering algorithm | |
| * Author(s): Vaughn Betz (first revision - VPack), Alexander Marquardt (second revision - T-VPack), Jason Luu (third revision - AAPack) | |
| * June 8, 2011 | |
| */ | |
| /* | |
| * The clusterer uses several key data structures: | |
| * | |
| * t_pb_type (and related types): | |
| * Represent the architecture as described in the architecture file. | |
| * | |
| * t_pb_graph_node (and related types): | |
| * Represents a flattened version of the architecture with t_pb_types | |
| * expanded (according to num_pb) into unique t_pb_graph_node instances, | |
| * and the routing connectivity converted to a graph of t_pb_graph_pin (nodes) | |
| * and t_pb_graph_edge. | |
| * | |
| * t_pb: | |
| * Represents a clustered instance of a t_pb_graph_node containing netlist primitives | |
| * | |
 * t_pb_type and t_pb_graph_node (and related types) describe the targeted FPGA architecture, while t_pb represents
| * the actual clustering of the user netlist. | |
| * | |
| * For example: | |
| * Consider an architecture where CLBs contain 4 BLEs, and each BLE is a LUT + FF pair. | |
| * We wish to map a netlist of 400 LUTs and 400 FFs. | |
| * | |
| * A BLE corresponds to one t_pb_type (which has num_pb = 4). | |
| * | |
| * Each of the 4 BLE positions corresponds to a t_pb_graph_node (each of which references the BLE t_pb_type). | |
| * | |
| * The output of clustering is 400 t_pb of type BLE which represent the clustered user netlist. | |
| * Each of the 400 t_pb will reference one of the 4 BLE-type t_pb_graph_nodes. | |
| */ | |
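/* Illustrative sketch (hedged; not part of the build): how a clustered t_pb
 * hierarchy like the one above is typically walked down to its primitives.
 * It assumes only the t_pb/t_pb_type fields already used in this file
 * (pb_graph_node, mode, child_pbs, num_modes); count_primitives_rec is a
 * hypothetical helper, mirroring the recursion in free_pb_stats_recursive(). */
#if 0
static void count_primitives_rec(const t_pb* pb, int* count) {
    const t_pb_type* type = pb->pb_graph_node->pb_type;
    if (type->num_modes == 0) { /* leaf pb: a primitive such as a LUT or FF */
        ++(*count);
        return;
    }
    /* internal pb: recurse into the children of the selected mode */
    const t_mode* mode = &type->modes[pb->mode];
    for (int i = 0; i < mode->num_pb_type_children; i++) {
        for (int j = 0; j < mode->pb_type_children[i].num_pb; j++) {
            if (pb->child_pbs && pb->child_pbs[i]) {
                count_primitives_rec(&pb->child_pbs[i][j], count);
            }
        }
    }
}
#endif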
| #include <cstdio> | |
| #include <cstdlib> | |
| #include <cstring> | |
| #include <ctime> | |
| #include <map> | |
| #include <algorithm> | |
| #include <fstream> | |
| using namespace std; | |
| #include "vtr_assert.h" | |
| #include "vtr_log.h" | |
| #include "vtr_math.h" | |
| #include "vtr_memory.h" | |
| #include "vpr_types.h" | |
| #include "vpr_error.h" | |
| #include "globals.h" | |
| #include "atom_netlist.h" | |
| #include "pack_types.h" | |
| #include "cluster.h" | |
| #include "output_clustering.h" | |
| #include "SetupGrid.h" | |
| #include "read_xml_arch_file.h" | |
| #include "path_delay2.h" | |
| #include "path_delay.h" | |
| #include "vpr_utils.h" | |
| #include "cluster_placement.h" | |
| #include "echo_files.h" | |
| #include "cluster_router.h" | |
| #include "lb_type_rr_graph.h" | |
| #include "timing_info.h" | |
| #include "PreClusterDelayCalculator.h" | |
| #include "tatum/echo_writer.hpp" | |
| #include "tatum/report/graphviz_dot_writer.hpp" | |
| #include "read_sdc.h" | |
| #define AAPACK_MAX_FEASIBLE_BLOCK_ARRAY_SIZE 30 /* This value is used to determine the max size of the priority queue for candidates that pass the early filter legality test but not the more detailed routing test */ | |
#define AAPACK_MAX_NET_SINKS_IGNORE 256 /* The packer looks at all sinks of a net when deciding on the next candidate block to pack. For high-fanout nets this is too costly at runtime for marginal benefit, so such nets are ignored */
| #define AAPACK_MAX_HIGH_FANOUT_EXPLORE 10 /* For high-fanout nets that are ignored, consider a maximum of this many sinks, must be less than AAPACK_MAX_FEASIBLE_BLOCK_ARRAY_SIZE */ | |
| #define AAPACK_MAX_TRANSITIVE_FANOUT_EXPLORE 4 /* When investigating transitive fanout connections in packing, this is the highest fanout net that will be explored */ | |
#define AAPACK_MAX_TRANSITIVE_EXPLORE 4 /* When investigating transitive fanout connections in packing, consider a maximum of this many molecules, must be less than AAPACK_MAX_FEASIBLE_BLOCK_ARRAY_SIZE */
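/* Example of how these thresholds interact: a net with 300 sinks exceeds
 * AAPACK_MAX_NET_SINKS_IGNORE (256), so instead of scoring all 300 sinks the
 * packer considers at most AAPACK_MAX_HIGH_FANOUT_EXPLORE (10) of them when
 * proposing candidate blocks. */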
| #define SCALE_NUM_PATHS 1e-2 /*this value is used as a multiplier to assign a * | |
| *slightly higher criticality value to nets that * | |
| *affect a large number of critical paths versus * | |
| *nets that do not have such a broad effect. * | |
| *Note that this multiplier is intentionally very * | |
| *small compared to the total criticality because * | |
| *we want to make sure that atom net criticality is * | |
| *primarily determined by slacks, with this acting * | |
| *only as a tie-breaker between otherwise equal nets*/ | |
| #define SCALE_DISTANCE_VAL 1e-4 /*this value is used as a multiplier to assign a * | |
| *slightly higher criticality value to nets that * | |
| *are otherwise equal but one is farther * | |
| *from sources (the farther one being made slightly * | |
| *more critical) */ | |
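/* Hedged sketch of how these two multipliers combine with slack-based
 * criticality (the exact formula lives in the criticality computation, not
 * shown in this section):
 *   crit = slack_crit
 *        + SCALE_NUM_PATHS    * num_critical_paths_through_net
 *        + SCALE_DISTANCE_VAL * distance_from_source
 * Since both scale factors are tiny, slack_crit dominates and the scaled
 * terms act only as tie-breakers between otherwise equal nets. */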
| enum e_gain_update { | |
| GAIN, NO_GAIN | |
| }; | |
| enum e_feasibility { | |
| FEASIBLE, INFEASIBLE | |
| }; | |
| enum e_gain_type { | |
| HILL_CLIMBING, NOT_HILL_CLIMBING | |
| }; | |
| enum e_removal_policy { | |
| REMOVE_CLUSTERED, LEAVE_CLUSTERED | |
| }; | |
| /* TODO: REMOVE_CLUSTERED no longer used, remove */ | |
| enum e_net_relation_to_clustered_block { | |
| INPUT, OUTPUT | |
| }; | |
| enum e_detailed_routing_stages { | |
| E_DETAILED_ROUTE_AT_END_ONLY = 0, E_DETAILED_ROUTE_FOR_EACH_ATOM, E_DETAILED_ROUTE_END | |
| }; | |
/* Linked list structure. Stores one molecule id (molecule_id). */
| struct t_molecule_link { | |
| PackMoleculeId molecule_id; | |
| t_molecule_link *next; | |
| }; | |
/* Keeps a linked list of the unclustered molecules to speed up looking for    *
 * unclustered molecules with a certain number of *external* inputs.           *
 * [0..max_molecule_ext_inputs]. unclustered_list_head[i] points to the head   *
 * of the list of molecules with i inputs to be hooked up via external         *
 * interconnect.                                                               */
| static t_molecule_link *unclustered_list_head; | |
static int unclustered_list_head_size;
| static t_molecule_link *memory_pool; /*Declared here so I can free easily.*/ | |
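/* Illustrative layout (hypothetical molecule ids) for a design whose
 * molecules need 0..3 external inputs:
 *   unclustered_list_head[0] -> m7 -> m2
 *   unclustered_list_head[1] -> (empty)
 *   unclustered_list_head[3] -> m1
 * get_molecule_by_num_ext_inputs() with ext_inps == 3 walks only that
 * bucket; the link nodes themselves come from memory_pool. */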
| /* Does the atom block that drives the output of this atom net also appear as a * | |
| * receiver (input) pin of the atom net? If so, then by how much? | |
| * | |
| * This is used in the gain routines to avoid double counting the connections from * | |
| * the current cluster to other blocks (hence yielding better clusterings). * | |
| * The only time an atom block should connect to the same atom net * | |
| * twice is when one connection is an output and the other is an input, * | |
| * so this should take care of all multiple connections. */ | |
| static std::unordered_map<AtomNetId,int> net_output_feeds_driving_block_input; | |
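/* Example: a flip-flop used as a counter bit drives net N from its Q output
 * and also loads N on its D input; net_output_feeds_driving_block_input[N]
 * is then 1, and the gain code discounts that self-connection once. */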
| /* Timing information for blocks */ | |
| static std::vector<AtomBlockId> critindexarray; | |
static std::unordered_map<AtomBlockId,float> block_criticality;
| /* Score different seeds for blocks */ | |
| static std::vector<AtomBlockId> seed_blend_index_array; | |
| static std::unordered_map<AtomBlockId,float> seed_blend_gain; | |
| /*****************************************/ | |
| /*local functions*/ | |
| /*****************************************/ | |
| static void check_clocks(const std::unordered_set<AtomNetId>& is_clock); | |
| #if 0 | |
| static void check_for_duplicate_inputs (); | |
| #endif | |
| static bool is_atom_blk_in_pb(const AtomBlockId blk_id, const t_pb *pb); | |
| static void add_molecule_to_pb_stats_candidates(const PackMolecules& molecules, const PackMoleculeId molecule_id, const MoleculeStats& molecule_stats, | |
| std::map<AtomBlockId, float> &gain, t_pb *pb, int max_queue_size); | |
| static void alloc_and_init_clustering(const PackMolecules& molecules, | |
| const MoleculeStats& molecule_stats, | |
| std::vector<t_cluster_placement_stats>& cluster_placement_stats, | |
| vtr::vector<MoleculeBlockId,t_pb_graph_node*>& primitives_list); | |
| static void free_pb_stats_recursive(t_pb *pb); | |
| static void try_update_lookahead_pins_used(t_pb *cur_pb); | |
| static void reset_lookahead_pins_used(t_pb *cur_pb); | |
| static void compute_and_mark_lookahead_pins_used(const AtomBlockId blk_id); | |
| static void compute_and_mark_lookahead_pins_used_for_pin( | |
| const t_pb_graph_pin *pb_graph_pin, const t_pb *primitive_pb, const AtomNetId net_id); | |
| static void commit_lookahead_pins_used(t_pb *cur_pb); | |
| static bool check_lookahead_pins_used(t_pb *cur_pb); | |
| static bool primitive_feasible(const AtomBlockId blk_id, t_pb *cur_pb); | |
| static bool primitive_memory_sibling_feasible(const AtomBlockId blk_id, const t_pb_type *cur_pb_type, const AtomBlockId sibling_memory_blk); | |
| static PackMoleculeId get_molecule_by_num_ext_inputs( | |
| const PackMolecules& molecules, | |
| const MoleculeStats& molecule_stats, | |
| const int ext_inps, const enum e_removal_policy remove_flag, | |
| t_cluster_placement_stats& cluster_placement_stats); | |
| static PackMoleculeId get_free_molecule_with_most_ext_inputs_for_cluster( | |
| const PackMolecules& molecules, | |
| const MoleculeStats& molecule_stats, | |
| t_pb *cur_pb, | |
| t_cluster_placement_stats& cluster_placement_stats); | |
| static PackMoleculeId get_seed_logical_molecule_with_most_ext_inputs(const MoleculeStats& molecule_stats); | |
| static enum e_block_pack_status try_pack_molecule( | |
| t_cluster_placement_stats& cluster_placement_stats, | |
| const PackMolecules& molecules, | |
| const PackMoleculeId molecule_id, | |
| MoleculeStats& molecule_stats, | |
| vtr::vector<MoleculeBlockId,t_pb_graph_node*>& primitives_list, | |
| t_pb * pb, const int max_models, const int max_cluster_size, | |
| const ClusterBlockId clb_index, const int detailed_routing_stage, t_lb_router_data *router_data, | |
| bool debug_clustering, | |
| bool enable_pin_feasibility_filter); | |
| static enum e_block_pack_status try_place_atom_block_rec( | |
| const t_pb_graph_node *pb_graph_node, const AtomBlockId blk_id, | |
| t_pb *cb, t_pb **parent, const int max_models, | |
| const int max_cluster_size, const ClusterBlockId clb_index, | |
| const t_cluster_placement_stats& cluster_placement_stats, | |
| const bool is_root_of_chain, const t_pb_graph_pin *chain_root_pin, t_lb_router_data *router_data); | |
| static void revert_place_atom_block(const AtomBlockId blk_id, t_lb_router_data *router_data, | |
| const PackMolecules& molecules, | |
| MoleculeStats& atom_molecules); | |
| static void update_connection_gain_values(const AtomNetId net_id, const AtomBlockId clustered_blk_id, | |
| t_pb * cur_pb, | |
| enum e_net_relation_to_clustered_block net_relation_to_clustered_block); | |
| static void update_timing_gain_values(const AtomNetId net_id, | |
| t_pb* cur_pb, | |
| enum e_net_relation_to_clustered_block net_relation_to_clustered_block, | |
| const SetupTimingInfo& timing_info, | |
| const std::unordered_set<AtomNetId>& is_global); | |
| static void mark_and_update_partial_gain(const AtomNetId inet, enum e_gain_update gain_flag, | |
| const AtomBlockId clustered_blk_id, | |
| bool timing_driven, | |
| bool connection_driven, | |
| enum e_net_relation_to_clustered_block net_relation_to_clustered_block, | |
| const SetupTimingInfo& timing_info, | |
| const std::unordered_set<AtomNetId>& is_global); | |
| static void update_total_gain(float alpha, float beta, bool timing_driven, | |
| bool connection_driven, t_pb *pb); | |
| static void update_cluster_stats( | |
| const PackMolecules& molecules, | |
| const PackMoleculeId molecule_id, | |
| const ClusterBlockId clb_index, | |
| const std::unordered_set<AtomNetId>& is_clock, | |
| const std::unordered_set<AtomNetId>& is_global, | |
| const bool global_clocks, | |
| const float alpha, const float beta, const bool timing_driven, | |
| const bool connection_driven, | |
| const SetupTimingInfo& timing_info); | |
| static void start_new_cluster( | |
| std::vector<t_cluster_placement_stats>& cluster_placement_stats, | |
| vtr::vector<MoleculeBlockId,t_pb_graph_node*>& primitives_list, | |
| const PackMolecules& molecules, | |
| const PackMoleculeId molecule_id, | |
| MoleculeStats& molecule_stats, | |
| ClusterBlockId clb_index, | |
| std::map<t_type_ptr, size_t>& num_used_type_instances, | |
| const float target_device_utilization, | |
| const int num_models, const int max_cluster_size, | |
| const t_arch* arch, | |
| std::string device_layout_name, | |
| const vector<t_lb_type_rr_graph>& lb_type_rr_graphs, | |
| const std::vector<t_lb_type_rr_graph_info>& lb_type_rr_graph_infos, | |
| t_lb_router_data **router_data, | |
| const int detailed_routing_stage, | |
| ClusteredNetlist *clb_nlist, | |
| const std::map<const t_model*,std::vector<t_type_ptr>>& primitive_candidate_block_types, | |
| bool debug_clustering, | |
| bool enable_pin_feasibility_filter); | |
| static PackMoleculeId get_highest_gain_molecule( | |
| t_pb *cur_pb, | |
| const PackMolecules& molecules, | |
| const MoleculeStats& molecule_stats, | |
| const enum e_gain_type gain_mode, | |
| t_cluster_placement_stats& cluster_placement_stats, | |
| vtr::vector_map<ClusterBlockId,std::vector<AtomNetId>> &clb_inter_blk_nets, | |
| const ClusterBlockId cluster_index); | |
| static bool exists_free_primitives_for_molecule(const PackMolecule& molecule, t_cluster_placement_stats& cluster_placement_stats); | |
| static PackMoleculeId get_molecule_for_cluster( | |
| t_pb *cur_pb, | |
| const PackMolecules& molecules, | |
| const MoleculeStats& molecule_stats, | |
| const bool allow_unrelated_clustering, | |
| int *num_unrelated_clustering_attempts, | |
| t_cluster_placement_stats& cluster_placement_stats, | |
| vtr::vector_map<ClusterBlockId,std::vector<AtomNetId>> &clb_inter_blk_nets, | |
| ClusterBlockId cluster_index); | |
| static void check_clustering(); | |
| static void check_cluster_atom_blocks(t_pb *pb, std::unordered_set<AtomBlockId>& blocks_checked); | |
| static PackMoleculeId get_highest_gain_seed_molecule(int * seedindex, const PackMolecules& molecules, const MoleculeStats& molecule_stats, bool getblend); | |
| static float get_molecule_gain(const PackMolecules& molecules, const PackMoleculeId molecule_id, const MoleculeStats& molecule_stats, std::map<AtomBlockId, float> &blk_gain); | |
| static int compare_molecule_gain(const void *a, const void *b); | |
| int net_sinks_reachable_in_cluster(const t_pb_graph_pin* driver_pb_gpin, const int depth, const AtomNetId net_id); | |
| static void print_block_criticalities(const char * fname); | |
| static void load_transitive_fanout_candidates(ClusterBlockId cluster_index, | |
| const PackMolecules& molecules, | |
| const MoleculeStats& molecule_stats, | |
| t_pb_stats *pb_stats, | |
| vtr::vector_map<ClusterBlockId,std::vector<AtomNetId>> &clb_inter_blk_nets); | |
| static std::map<const t_model*,std::vector<t_type_ptr>> identify_primitive_candidate_block_types(); | |
| static void revalid_molecules(const t_pb* pb, const PackMolecules& molecules, MoleculeStats& molecule_stats); | |
| MoleculeStats build_molecule_stats(const PackMolecules& molecules, const AtomNetlist& netlist); | |
| int count_molecule_blocks(const PackMolecule& molecule); | |
| int count_molecule_ext_inputs(const PackMolecule& molecule, const AtomNetlist& netlist); | |
| static std::vector<t_type_ptr> find_candidate_types_for_molecule(const PackMolecule& molecule, | |
| const std::map<const t_model*,std::vector<t_type_ptr>>& primitive_candidate_block_types); | |
| /*****************************************/ | |
| /*globally accessible function*/ | |
| void do_clustering(const t_arch *arch, | |
| int num_models, bool global_clocks, | |
| const std::unordered_set<AtomNetId>& is_clock, | |
| const PackMolecules& molecules, | |
| const std::unordered_map<AtomBlockId,t_pb_graph_node*>& expected_lowest_cost_pb_gnode, | |
| bool hill_climbing_flag, const char *out_fname, bool timing_driven, | |
| enum e_cluster_seed cluster_seed_type, float alpha, float beta, | |
| float inter_cluster_net_delay, | |
| const float target_device_utilization, | |
| bool allow_unrelated_clustering, | |
| bool connection_driven, | |
| enum e_packer_algorithm packer_algorithm, | |
| const vector<t_lb_type_rr_graph>& lb_type_rr_graphs, | |
| const std::vector<t_lb_type_rr_graph_info>& lb_type_rr_graph_infos, | |
| std::string device_layout_name, | |
| bool debug_clustering, | |
| bool enable_pin_feasibility_filter | |
| #ifdef USE_HMETIS | |
| , vtr::vector_map<AtomBlockId, int>& partitions | |
| #endif | |
| #ifdef ENABLE_CLASSIC_VPR_STA | |
| , t_timing_inf timing_inf | |
| #endif | |
| ) { | |
| /* Does the actual work of clustering multiple netlist blocks * | |
| * into clusters. */ | |
/* Algorithm employed
1. Find a type that can legally hold the block and create a cluster with pb info
2. Populate the started cluster
3. Repeat steps 1-2 until no more blocks need to be clustered
(see the pseudocode sketch below)
*/
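/* Hedged pseudocode summary of the loop below:
 *   while ((istart = pick_seed_molecule())) {
 *       start_new_cluster(istart);
 *       while ((m = get_molecule_for_cluster(...))) try_pack_molecule(m);
 *       legalize with try_intra_lb_route(); if the end-of-cluster route
 *       fails, repack the same seed with detailed routing checked after
 *       every atom (the detailed_routing_stage loop).
 *   }
 */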
| /**************************************************************** | |
| * Initialization | |
| *****************************************************************/ | |
| VTR_ASSERT(packer_algorithm == PACK_GREEDY); | |
| int i, blocks_since_last_analysis, num_clb, | |
| num_blocks_hill_added, max_cluster_size, cur_cluster_size, | |
| max_pb_depth, cur_pb_depth, num_unrelated_clustering_attempts, | |
| seedindex, savedseedindex /* index of next most timing critical block */, | |
| detailed_routing_stage, *hill_climbing_inputs_avail; | |
| std::map<t_type_ptr,size_t> num_used_type_instances; | |
| bool early_exit, is_cluster_legal; | |
| enum e_block_pack_status block_pack_status; | |
| vtr::vector<MoleculeBlockId,t_pb_graph_node*> primitives_list; | |
| t_lb_router_data *router_data = nullptr; | |
| PackMoleculeId istart, next_molecule, prev_molecule; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| auto& device_ctx = g_vpr_ctx.mutable_device(); | |
| auto& cluster_ctx = g_vpr_ctx.mutable_clustering(); | |
| std::vector<t_cluster_placement_stats> cluster_placement_stats; | |
| vtr::vector_map<ClusterBlockId, std::vector<t_intra_lb_net>*> intra_lb_routing; | |
| std::shared_ptr<PreClusterDelayCalculator> clustering_delay_calc; | |
| std::shared_ptr<SetupTimingInfo> timing_info; | |
| num_clb = 0; | |
| seedindex = 0; | |
| /* TODO: This is memory inefficient, fix if causes problems */ | |
| /* Store stats on nets used by packed block, useful for determining transitively connected blocks | |
(e.g. [A1, A2, ..]->[B1, B2, ..]->C implies cluster [A1, A2, ...] and C have a weak link) */
| vtr::vector_map<ClusterBlockId, std::vector<AtomNetId>> clb_inter_blk_nets(atom_ctx.nlist.blocks().size()); | |
| /* determine bound on cluster size and primitive input size */ | |
| max_cluster_size = 0; | |
| max_pb_depth = 0; | |
| for (i = 0; i < device_ctx.num_block_types; i++) { | |
| if (device_ctx.EMPTY_TYPE == &device_ctx.block_types[i]) | |
| continue; | |
| cur_cluster_size = get_max_primitives_in_pb_type( | |
| device_ctx.block_types[i].pb_type); | |
| cur_pb_depth = get_max_depth_of_pb_type(device_ctx.block_types[i].pb_type); | |
| if (cur_cluster_size > max_cluster_size) { | |
| max_cluster_size = cur_cluster_size; | |
| } | |
| if (cur_pb_depth > max_pb_depth) { | |
| max_pb_depth = cur_pb_depth; | |
| } | |
| } | |
| MoleculeStats molecule_stats = build_molecule_stats(molecules, atom_ctx.nlist); | |
| if (hill_climbing_flag) { | |
| hill_climbing_inputs_avail = (int *) vtr::calloc(max_cluster_size + 1, | |
| sizeof(int)); | |
| } else { | |
| hill_climbing_inputs_avail = nullptr; /* if used, die hard */ | |
| } | |
| check_clocks(is_clock); | |
| #if 0 | |
| check_for_duplicate_inputs (); | |
| #endif | |
| alloc_and_init_clustering(molecules, molecule_stats, | |
| cluster_placement_stats, primitives_list); | |
| auto primitive_candidate_block_types = identify_primitive_candidate_block_types(); | |
| blocks_since_last_analysis = 0; | |
| early_exit = false; | |
| num_blocks_hill_added = 0; | |
| VTR_ASSERT(max_cluster_size < MAX_SHORT); | |
| /* Limit maximum number of elements for each cluster */ | |
| if (timing_driven) { | |
| /* | |
| * Initialize the timing analyzer | |
| */ | |
| clustering_delay_calc = std::make_shared<PreClusterDelayCalculator>(atom_ctx.nlist, atom_ctx.lookup, inter_cluster_net_delay, expected_lowest_cost_pb_gnode); | |
| timing_info = make_setup_timing_info(clustering_delay_calc); | |
| //Calculate the initial timing | |
| timing_info->update(); | |
| if(isEchoFileEnabled(E_ECHO_PRE_PACKING_TIMING_GRAPH)) { | |
| auto& timing_ctx = g_vpr_ctx.timing(); | |
| tatum::write_echo(getEchoFileName(E_ECHO_PRE_PACKING_TIMING_GRAPH), | |
| *timing_ctx.graph, *timing_ctx.constraints, *clustering_delay_calc, timing_info->analyzer()); | |
| } | |
| #ifdef ENABLE_CLASSIC_VPR_STA | |
| t_slack* slacks = alloc_and_load_pre_packing_timing_graph(inter_cluster_net_delay, timing_inf, expected_lowest_cost_pb_gnode); | |
| do_timing_analysis(slacks, timing_inf, true, true); | |
| std::string fname = std::string("classic.") + getEchoFileName(E_ECHO_PRE_PACKING_TIMING_GRAPH); | |
| print_timing_graph(fname.c_str()); | |
| auto cpds = timing_info->critical_paths(); | |
| auto critical_path = timing_info->least_slack_critical_path(); | |
| float cpd_diff_ns = std::abs(get_critical_path_delay() - 1e9*critical_path.delay()); | |
| if(cpd_diff_ns > 0.01) { | |
| print_classic_cpds(); | |
| print_tatum_cpds(timing_info->critical_paths()); | |
| compare_tatum_classic_constraints(); | |
| vpr_throw(VPR_ERROR_TIMING, __FILE__, __LINE__, "Classic VPR and Tatum critical paths do not match (%g and %g respectively)", get_critical_path_delay(), 1e9*critical_path.delay()); | |
| } | |
| free_timing_graph(slacks); | |
| #endif | |
| for (auto blk_id : atom_ctx.nlist.blocks()) { | |
| critindexarray.push_back(blk_id); | |
| seed_blend_index_array.push_back(blk_id); | |
| } | |
| //Calculate criticality of each block | |
| for(AtomBlockId blk : atom_ctx.nlist.blocks()) { | |
| for(AtomPinId in_pin : atom_ctx.nlist.block_input_pins(blk)) { | |
| //Max criticality over incoming nets | |
| float crit = timing_info->setup_pin_criticality(in_pin); | |
| block_criticality[blk] = std::max(block_criticality[blk], crit); | |
| } | |
| } | |
| for(auto blk_id : atom_ctx.nlist.blocks()) { | |
| /* Score seed gain of each block as a weighted sum of timing criticality, | |
| * number of tightly coupled blocks connected to it, and number of external inputs */ | |
| float seed_blend_fac = 0.5; | |
| float max_blend_gain = 0; | |
| auto molecule_rng = molecules.atom_molecules.equal_range(blk_id); | |
| for(const auto& kv : vtr::make_range(molecule_rng.first, molecule_rng.second)) { | |
| PackMoleculeId molecule_id = kv.second; | |
| int inputs_of_molecule = molecule_stats.num_ext_inputs(molecule_id); | |
| int blocks_of_molecule = molecule_stats.num_blocks(molecule_id); | |
int max_molecule_inputs = molecule_stats.max_num_ext_inputs();
VTR_ASSERT(max_molecule_inputs > 0);
float blend_gain = 0;
blend_gain += seed_blend_fac * block_criticality[blk_id];
/* cast to float: integer division would truncate the input ratio to 0 */
blend_gain += (1 - seed_blend_fac) * (float(inputs_of_molecule) / max_molecule_inputs);
| blend_gain *= (1 + 0.2 * (blocks_of_molecule - 1)); | |
| if(blend_gain > max_blend_gain) { | |
| max_blend_gain = blend_gain; | |
| } | |
| } | |
| seed_blend_gain[blk_id] = max_blend_gain; | |
| } | |
| //Sort in decreasing order (i.e. most critical at index 0) | |
| std::sort(critindexarray.begin(), critindexarray.end(), | |
| [](const AtomBlockId lhs, const AtomBlockId rhs) { | |
| return block_criticality[lhs] > block_criticality[rhs]; | |
| } | |
| ); | |
| //Sort in decreasing order (i.e. highest gain at index 0) | |
| std::sort(seed_blend_index_array.begin(), seed_blend_index_array.end(), | |
| [](const AtomBlockId lhs, const AtomBlockId rhs) { | |
| return seed_blend_gain[lhs] > seed_blend_gain[rhs]; | |
| } | |
| ); | |
| if (getEchoEnabled() && isEchoFileEnabled(E_ECHO_CLUSTERING_BLOCK_CRITICALITIES)) { | |
| print_block_criticalities(getEchoFileName(E_ECHO_CLUSTERING_BLOCK_CRITICALITIES)); | |
| } | |
| if (cluster_seed_type == VPACK_BLEND) { | |
| istart = get_highest_gain_seed_molecule(&seedindex, molecules, molecule_stats, true); | |
| } else if (cluster_seed_type == VPACK_TIMING) { | |
| istart = get_highest_gain_seed_molecule(&seedindex, molecules, molecule_stats, false); | |
| } else {/*max input seed*/ | |
| istart = get_seed_logical_molecule_with_most_ext_inputs(molecule_stats); | |
| } | |
| } else /*cluster seed is max input (since there is no timing information)*/ { | |
| istart = get_seed_logical_molecule_with_most_ext_inputs(molecule_stats); | |
| } | |
| /**************************************************************** | |
| * Clustering | |
| *****************************************************************/ | |
| while (istart) { | |
| is_cluster_legal = false; | |
| savedseedindex = seedindex; | |
| for (detailed_routing_stage = (int)E_DETAILED_ROUTE_AT_END_ONLY; !is_cluster_legal && detailed_routing_stage != (int)E_DETAILED_ROUTE_END; detailed_routing_stage++) { | |
| ClusterBlockId clb_index(num_clb); | |
| /* start a new cluster and reset all stats */ | |
| vtr::printf("Starting new cluster with molecule %zu\n", size_t(istart)); | |
| start_new_cluster(cluster_placement_stats, primitives_list, | |
| molecules, istart, molecule_stats, clb_index, | |
| num_used_type_instances, | |
| target_device_utilization, | |
| num_models, max_cluster_size, | |
| arch, device_layout_name, | |
| lb_type_rr_graphs, | |
| lb_type_rr_graph_infos, | |
| &router_data, | |
| detailed_routing_stage, &cluster_ctx.clb_nlist, | |
| primitive_candidate_block_types, | |
| debug_clustering, | |
| enable_pin_feasibility_filter); | |
| vtr::printf_info("Complex block %d: %s, type: %s ", | |
| num_clb, cluster_ctx.clb_nlist.block_name(clb_index).c_str(), cluster_ctx.clb_nlist.block_type(clb_index)->name); | |
| if (debug_clustering) { | |
| vtr::printf("\n"); | |
| } else { | |
| vtr::printf("."); //Progress dot for seed-block | |
| } | |
| fflush(stdout); | |
| update_cluster_stats(molecules, istart, clb_index, | |
| is_clock, //Set of clock nets | |
| is_clock, //Set of global nets (currently all clocks) | |
| global_clocks, alpha, | |
| beta, timing_driven, connection_driven, | |
| *timing_info); | |
| num_clb++; | |
| if (timing_driven && !early_exit) { | |
| blocks_since_last_analysis++; | |
/* it doesn't make sense to do a timing analysis here since only one *
 * atom block has been clustered; it would not change anything       */
| } | |
| t_cluster_placement_stats& cur_cluster_placement_stats = cluster_placement_stats[cluster_ctx.clb_nlist.block_type(clb_index)->index]; | |
| num_unrelated_clustering_attempts = 0; | |
| next_molecule = get_molecule_for_cluster( | |
| cluster_ctx.clb_nlist.block_pb(clb_index), | |
| molecules, | |
| molecule_stats, | |
| allow_unrelated_clustering, | |
| &num_unrelated_clustering_attempts, | |
| cur_cluster_placement_stats, | |
| clb_inter_blk_nets, | |
| clb_index); | |
| prev_molecule = istart; | |
| while (next_molecule && prev_molecule != next_molecule) { | |
| vtr::printf("Trying to pack new molecule %zu\n", size_t(next_molecule)); | |
| block_pack_status = try_pack_molecule( | |
| cur_cluster_placement_stats, | |
| molecules, | |
| next_molecule, | |
| molecule_stats, | |
| primitives_list, cluster_ctx.clb_nlist.block_pb(clb_index), num_models, | |
| max_cluster_size, clb_index, detailed_routing_stage, router_data, | |
| debug_clustering, | |
| enable_pin_feasibility_filter); | |
| prev_molecule = next_molecule; | |
| auto blk_id = molecules.pack_molecules[next_molecule].root_block_atom(); | |
| VTR_ASSERT(blk_id); | |
| std::string blk_name = atom_ctx.nlist.block_name(blk_id); | |
| const t_model* blk_model = atom_ctx.nlist.block_model(blk_id); | |
| if (block_pack_status != BLK_PASSED) { | |
| if (next_molecule) { | |
| if (block_pack_status == BLK_FAILED_ROUTE) { | |
| if (debug_clustering) { | |
| vtr::printf_direct("\tNO_ROUTE: %s type %s/n", | |
| blk_name.c_str(), | |
| blk_model->name); | |
| fflush(stdout); | |
| } | |
| } else { | |
| if (debug_clustering) { | |
| vtr::printf_direct("\tFAILED_CHECK: %s type %s check %d\n", | |
| blk_name.c_str(), | |
| blk_model->name, | |
| block_pack_status); | |
| fflush(stdout); | |
| } | |
| } | |
| } | |
| next_molecule = get_molecule_for_cluster( | |
| cluster_ctx.clb_nlist.block_pb(clb_index), | |
| molecules, | |
| molecule_stats, | |
| allow_unrelated_clustering, | |
| &num_unrelated_clustering_attempts, | |
| cur_cluster_placement_stats, | |
| clb_inter_blk_nets, | |
| clb_index); | |
| continue; | |
| } else { | |
| /* Continue packing by filling smallest cluster */ | |
| if (debug_clustering) { | |
| vtr::printf_direct("\tPASSED: %s type %s\n", | |
| blk_name.c_str(), | |
| blk_model->name); | |
| } else { | |
| vtr::printf("."); | |
| } | |
| fflush(stdout); | |
| } | |
| update_cluster_stats(molecules, next_molecule, clb_index, | |
| is_clock, //Set of all clocks | |
| is_clock, //Set of all global signals (currently clocks) | |
| global_clocks, alpha, beta, timing_driven, | |
| connection_driven, | |
| *timing_info); | |
| num_unrelated_clustering_attempts = 0; | |
| if (timing_driven && !early_exit) { | |
| blocks_since_last_analysis++; /* historically, timing slacks were recomputed after X number of blocks were packed, but this doesn't significantly alter results so I (jluu) did not port the code */ | |
| } | |
| next_molecule = get_molecule_for_cluster( | |
| cluster_ctx.clb_nlist.block_pb(clb_index), | |
| molecules, | |
| molecule_stats, | |
| allow_unrelated_clustering, | |
| &num_unrelated_clustering_attempts, | |
| cur_cluster_placement_stats, | |
| clb_inter_blk_nets, | |
| clb_index); | |
| } | |
| if (!debug_clustering) { | |
| vtr::printf("\n"); | |
| } | |
| if (detailed_routing_stage == (int)E_DETAILED_ROUTE_AT_END_ONLY) { | |
| is_cluster_legal = try_intra_lb_route(router_data, debug_clustering); | |
| if (is_cluster_legal == true) { | |
| if (debug_clustering) { | |
| vtr::printf_info("Passed route at end.\n"); | |
| } | |
| } else { | |
| vtr::printf_info("Failed route at end, repack cluster trying detailed routing at each stage.\n"); | |
| } | |
| } else { | |
| is_cluster_legal = true; | |
| } | |
| if (is_cluster_legal == true) { | |
| intra_lb_routing.push_back(router_data->saved_lb_nets); | |
| VTR_ASSERT((int)intra_lb_routing.size() == num_clb); | |
| router_data->saved_lb_nets = nullptr; | |
| if (timing_driven) { | |
| if (num_blocks_hill_added > 0 && !early_exit) { | |
| blocks_since_last_analysis += num_blocks_hill_added; | |
| } | |
| if (cluster_seed_type == VPACK_BLEND) { | |
| istart = get_highest_gain_seed_molecule(&seedindex, molecules, molecule_stats, true); | |
| } else if (cluster_seed_type == VPACK_TIMING) { | |
| istart = get_highest_gain_seed_molecule(&seedindex, molecules, molecule_stats, false); | |
| } else { /*max input seed*/ | |
| istart = get_seed_logical_molecule_with_most_ext_inputs(molecule_stats); | |
| } | |
| } else { | |
| /*cluster seed is max input (since there is no timing information)*/ | |
| istart = get_seed_logical_molecule_with_most_ext_inputs(molecule_stats); | |
| } | |
| /* store info that will be used later in packing from pb_stats and free the rest */ | |
| t_pb_stats *pb_stats = cluster_ctx.clb_nlist.block_pb(clb_index)->pb_stats; | |
| for(const AtomNetId mnet_id : pb_stats->marked_nets) { | |
| int external_terminals = atom_ctx.nlist.net_pins(mnet_id).size() - pb_stats->num_pins_of_net_in_pb[mnet_id]; | |
/* Check that the number of external terminals of the net is within the fanout limit and that at least one external terminal exists */
| if(external_terminals < AAPACK_MAX_TRANSITIVE_FANOUT_EXPLORE && external_terminals > 0) { | |
| clb_inter_blk_nets[clb_index].push_back(mnet_id); | |
| } | |
| } | |
| free_pb_stats_recursive(cluster_ctx.clb_nlist.block_pb(clb_index)); | |
| } else { | |
| /* Free up data structures and requeue used molecules */ | |
| num_used_type_instances[cluster_ctx.clb_nlist.block_type(clb_index)]--; | |
| revalid_molecules(cluster_ctx.clb_nlist.block_pb(clb_index), molecules, molecule_stats); | |
| cluster_ctx.clb_nlist.remove_block(clb_index); | |
| cluster_ctx.clb_nlist.compress(); | |
| num_clb--; | |
| seedindex = savedseedindex; | |
| } | |
| free_router_data(router_data); | |
| router_data = nullptr; | |
| } | |
| } | |
| /**************************************************************** | |
| * Free Data Structures | |
| *****************************************************************/ | |
| VTR_ASSERT(num_clb == (int)cluster_ctx.clb_nlist.blocks().size()); | |
| check_clustering(); | |
| output_clustering(intra_lb_routing, global_clocks, is_clock, arch->architecture_id, out_fname, false); | |
| VTR_ASSERT(cluster_ctx.clb_nlist.blocks().size() == intra_lb_routing.size()); | |
| for (auto blk_id : cluster_ctx.clb_nlist.blocks()) | |
| free_intra_lb_nets(intra_lb_routing[blk_id]); | |
| intra_lb_routing.clear(); | |
| if (hill_climbing_flag) | |
| free(hill_climbing_inputs_avail); | |
| for (auto blk_id : cluster_ctx.clb_nlist.blocks()) | |
| cluster_ctx.clb_nlist.remove_block(blk_id); | |
| cluster_ctx.clb_nlist = ClusteredNetlist(); | |
| free(unclustered_list_head); | |
| free(memory_pool); | |
| if (timing_driven) { | |
| block_criticality.clear(); | |
| critindexarray.clear(); | |
| seed_blend_gain.clear(); | |
| seed_blend_index_array.clear(); | |
| } | |
| } | |
| /*****************************************/ | |
| static void check_clocks(const std::unordered_set<AtomNetId>& is_clock) { | |
| /* Checks that nets used as clock inputs to latches are never also used * | |
| * as VPACK_LUT inputs. It's electrically questionable, and more importantly * | |
| * would break the clustering code. */ | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| for(auto blk_id : atom_ctx.nlist.blocks()) { | |
| if (atom_ctx.nlist.block_type(blk_id) != AtomBlockType::OUTPAD) { | |
| for(auto pin_id : atom_ctx.nlist.block_input_pins(blk_id)) { | |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); | |
| if (is_clock.count(net_id)) { | |
| vpr_throw(VPR_ERROR_PACK, __FILE__, __LINE__, | |
| "Error in check_clocks.\n" | |
| "Net %s is a clock, but also connects to a logic block input on atom block %s.\n" | |
| "This would break the current clustering implementation and is electrically " | |
| "questionable, so clustering has been aborted.\n", | |
| atom_ctx.nlist.net_name(net_id).c_str(), atom_ctx.nlist.block_name(blk_id).c_str()); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| /* Determine if atom block is in pb */ | |
| static bool is_atom_blk_in_pb(const AtomBlockId blk_id, const t_pb *pb) { | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| const t_pb* cur_pb = atom_ctx.lookup.atom_pb(blk_id); | |
| while (cur_pb) { | |
| if (cur_pb == pb) { | |
| return true; | |
| } | |
| cur_pb = cur_pb->parent_pb; | |
| } | |
| return false; | |
| } | |
| /* Add blk to list of feasible blocks sorted according to gain */ | |
| static void add_molecule_to_pb_stats_candidates(const PackMolecules& molecules, const PackMoleculeId molecule_id, const MoleculeStats& molecule_stats, | |
| std::map<AtomBlockId, float> &gain, t_pb *pb, int max_queue_size) { | |
| int i, j; | |
| for (i = 0; i < pb->pb_stats->num_feasible_blocks; i++) { | |
| if (pb->pb_stats->feasible_blocks[i] == molecule_id) { | |
| return; /* already in queue, do nothing */ | |
| } | |
| } | |
| if (pb->pb_stats->num_feasible_blocks >= max_queue_size - 1) { | |
| /* maximum size for array, remove smallest gain element and sort */ | |
| if (get_molecule_gain(molecules, molecule_id, molecule_stats, gain) > get_molecule_gain(molecules, pb->pb_stats->feasible_blocks[0], molecule_stats, gain)) { | |
| /* single loop insertion sort */ | |
| for (j = 0; j < pb->pb_stats->num_feasible_blocks - 1; j++) { | |
| if (get_molecule_gain(molecules, molecule_id, molecule_stats, gain) <= get_molecule_gain(molecules, pb->pb_stats->feasible_blocks[j + 1], molecule_stats, gain)) { | |
| pb->pb_stats->feasible_blocks[j] = molecule_id; | |
| break; | |
| } else { | |
| pb->pb_stats->feasible_blocks[j] = pb->pb_stats->feasible_blocks[j + 1]; | |
| } | |
| } | |
| if (j == pb->pb_stats->num_feasible_blocks - 1) { | |
| pb->pb_stats->feasible_blocks[j] = molecule_id; | |
| } | |
| } | |
| } else { | |
| /* Expand array and single loop insertion sort */ | |
| for (j = pb->pb_stats->num_feasible_blocks - 1; j >= 0; j--) { | |
| if (get_molecule_gain(molecules, pb->pb_stats->feasible_blocks[j], molecule_stats, gain) > get_molecule_gain(molecules, molecule_id, molecule_stats, gain)) { | |
| pb->pb_stats->feasible_blocks[j + 1] = pb->pb_stats->feasible_blocks[j]; | |
| } else { | |
| pb->pb_stats->feasible_blocks[j + 1] = molecule_id; | |
| break; | |
| } | |
| } | |
| if (j < 0) { | |
| pb->pb_stats->feasible_blocks[0] = molecule_id; | |
| } | |
| pb->pb_stats->num_feasible_blocks++; | |
| } | |
| } | |
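/* Design note: feasible_blocks is kept sorted in ascending gain order, with
 * the lowest-gain candidate at index 0 so it is the one displaced when the
 * bounded array (max_queue_size, from AAPACK_MAX_FEASIBLE_BLOCK_ARRAY_SIZE)
 * fills up, leaving the highest-gain candidates at the top of the array. */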
| /*****************************************/ | |
| static void alloc_and_init_clustering(const PackMolecules& molecules, | |
| const MoleculeStats& molecule_stats, | |
| std::vector<t_cluster_placement_stats>& cluster_placement_stats, | |
| vtr::vector<MoleculeBlockId,t_pb_graph_node*>& primitives_list) { | |
| /* Allocates the main data structures used for clustering and properly * | |
| * initializes them. */ | |
| //Sort into ascending gain order (i.e. lowest gain at index 0) | |
| //When this is inserted into the list below it is reversed, so the final | |
| //ordering is descending | |
| auto cmp_gain_ascending = [&](const PackMoleculeId lhs, const PackMoleculeId rhs) { | |
| return molecule_stats.base_gain(lhs) < molecule_stats.base_gain(rhs); | |
| }; | |
| std::vector<PackMoleculeId> molecule_ids_by_desc_gain = molecules.pack_molecule_ids; | |
| std::sort(molecule_ids_by_desc_gain.begin(), molecule_ids_by_desc_gain.end(), cmp_gain_ascending); | |
| /* alloc and load list of molecules to pack */ | |
| unclustered_list_head_size = molecule_stats.max_num_ext_inputs() + 1; | |
| unclustered_list_head = (t_molecule_link *) vtr::calloc(unclustered_list_head_size , sizeof(t_molecule_link)); | |
| for (int i = 0; i < unclustered_list_head_size; i++) { | |
| unclustered_list_head[i].next = nullptr; | |
| } | |
| memory_pool = (t_molecule_link *) vtr::malloc(molecules.pack_molecules.size() * sizeof(t_molecule_link)); | |
| t_molecule_link* next_ptr = memory_pool; | |
| for (PackMoleculeId molecule_id : molecule_ids_by_desc_gain) { | |
| int ext_inps = molecule_stats.num_ext_inputs(molecule_id); | |
| next_ptr->molecule_id = molecule_id; | |
| next_ptr->next = unclustered_list_head[ext_inps].next; | |
| unclustered_list_head[ext_inps].next = next_ptr; | |
| next_ptr++; | |
| } | |
| for (int i = 0; i < unclustered_list_head_size; ++i) { | |
| t_molecule_link* ptr = unclustered_list_head[i].next; | |
| vtr::printf("Free Inputs: %d\n", i); | |
| while (ptr) { | |
| vtr::printf(" Molecule %d\n", size_t(ptr->molecule_id)); | |
| ptr = ptr->next; | |
| } | |
| } | |
| /* load net info */ | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| for(AtomNetId net : atom_ctx.nlist.nets()) { | |
| AtomPinId driver_pin = atom_ctx.nlist.net_driver(net); | |
| if (!driver_pin) { | |
| vtr::printf_warning(__FILE__, __LINE__, "Net '%s' has no driver\n", atom_ctx.nlist.net_name(net).c_str()); | |
| continue; | |
| } | |
| AtomBlockId driver_block = atom_ctx.nlist.pin_block(driver_pin); | |
| for(AtomPinId sink_pin : atom_ctx.nlist.net_sinks(net)) { | |
| AtomBlockId sink_block = atom_ctx.nlist.pin_block(sink_pin); | |
| if(driver_block == sink_block) { | |
| net_output_feeds_driving_block_input[net]++; | |
| } | |
| } | |
| } | |
| /* alloc and load cluster placement info */ | |
| cluster_placement_stats = alloc_and_load_cluster_placement_stats(); | |
| /* alloc array that will store primitives that a molecule gets placed to, | |
primitive_list is referenced by index, for example an atom block at index 2 of a molecule matches the primitive at index 2 in primitive_list;
| this array must be the size of the biggest molecule | |
| */ | |
| primitives_list.assign(molecule_stats.max_num_blocks(), nullptr); | |
| } | |
| /*****************************************/ | |
| static void free_pb_stats_recursive(t_pb *pb) { | |
| int i, j; | |
| /* Releases all the memory used by clustering data structures. */ | |
| if (pb) { | |
| if (pb->pb_graph_node != nullptr) { | |
| if (pb->pb_graph_node->pb_type->num_modes != 0) { | |
| for (i = 0; i < pb->pb_graph_node->pb_type->modes[pb->mode].num_pb_type_children; i++) { | |
| for (j = 0; j < pb->pb_graph_node->pb_type->modes[pb->mode].pb_type_children[i].num_pb; j++) { | |
| if (pb->child_pbs && pb->child_pbs[i]) { | |
| free_pb_stats_recursive(&pb->child_pbs[i][j]); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| free_pb_stats(pb); | |
| } | |
| } | |
| static bool primitive_feasible(const AtomBlockId blk_id, t_pb *cur_pb) { | |
| const t_pb_type *cur_pb_type = cur_pb->pb_graph_node->pb_type; | |
| VTR_ASSERT(cur_pb_type->num_modes == 0); /* primitive */ | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| AtomBlockId cur_pb_blk_id = atom_ctx.lookup.pb_atom(cur_pb); | |
| if (cur_pb_blk_id && cur_pb_blk_id != blk_id) { | |
| /* This pb already has a different logical block */ | |
| return false; | |
| } | |
| if (cur_pb_type->class_type == MEMORY_CLASS) { | |
| /* Memory class has additional feasibility requirements: | |
| * - all siblings must share all nets, including open nets, with the exception of data nets */ | |
| /* find sibling if one exists */ | |
| AtomBlockId sibling_memory_blk_id = find_memory_sibling(cur_pb); | |
| if (sibling_memory_blk_id) { | |
| //There is a sibling, see if the current block is feasible with it | |
| bool sibling_feasible = primitive_memory_sibling_feasible(blk_id, cur_pb_type, sibling_memory_blk_id); | |
| if(!sibling_feasible) { | |
| return false; | |
| } | |
| } | |
| } | |
| //Generic feasibility check | |
| return primitive_type_feasible(blk_id, cur_pb_type); | |
| } | |
| static bool primitive_memory_sibling_feasible(const AtomBlockId blk_id, const t_pb_type *cur_pb_type, const AtomBlockId sibling_blk_id) { | |
| /* Check that the two atom blocks blk_id and sibling_blk_id (which should both be memory slices) | |
* are feasible, in the sense that they have precisely the same net connections (with the
| * exception of nets in data port classes). | |
| * | |
| * Note that this routine does not check pin feasibility against the cur_pb_type; so | |
| * primitive_type_feasible() should also be called on blk_id before concluding it is feasible. | |
| */ | |
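/* Example: two 1-bit slices packed into one RAM primitive are feasible
 * siblings only if pins such as their address, write-enable, and clock pins
 * connect to identical nets; pins whose port_class starts with "data"
 * (e.g. data_in, data_out) are allowed to differ. */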
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| VTR_ASSERT(cur_pb_type->class_type == MEMORY_CLASS); | |
| //First, identify the 'data' ports by looking at the cur_pb_type | |
| std::unordered_set<t_model_ports*> data_ports; | |
| for (int iport = 0; iport < cur_pb_type->num_ports; ++iport) { | |
| const char* port_class = cur_pb_type->ports[iport].port_class; | |
| if(port_class && strstr(port_class, "data") == port_class) { | |
| //The port_class starts with "data", so it is a data port | |
| //Record the port | |
| data_ports.insert(cur_pb_type->ports[iport].model_port); | |
| } | |
| } | |
| //Now verify that all nets (except those connected to data ports) are equivalent | |
| //between blk_id and sibling_blk_id | |
| //Since the atom netlist stores only in-use ports, we iterate over the model to ensure | |
| //all ports are compared | |
| const t_model* model = cur_pb_type->model; | |
| for(t_model_ports* port : {model->inputs, model->outputs}) { | |
| for(; port; port = port->next) { | |
| if(data_ports.count(port)) { | |
| //Don't check data ports | |
| continue; | |
| } | |
| //Note: VPR doesn't support multi-driven nets, so all outputs | |
| //should be data ports, otherwise the siblings will both be | |
| //driving the output net | |
| //Get the ports from each primitive | |
| auto blk_port_id = atom_ctx.nlist.find_atom_port(blk_id, port); | |
| auto sib_port_id = atom_ctx.nlist.find_atom_port(sibling_blk_id, port); | |
| //Check that all nets (including unconnected nets) match | |
| for(int ipin = 0; ipin < port->size; ++ipin) { | |
| //The nets are initialized as invalid (i.e. disconnected) | |
| AtomNetId blk_net_id; | |
| AtomNetId sib_net_id; | |
| //We can get the actual net provided the port exists | |
| // | |
| //Note that if the port did not exist, the net is left | |
//as invalid/disconnected
| if(blk_port_id) { | |
| blk_net_id = atom_ctx.nlist.port_net(blk_port_id, ipin); | |
| } | |
| if(sib_port_id) { | |
| sib_net_id = atom_ctx.nlist.port_net(sib_port_id, ipin); | |
| } | |
| //The sibling and block must have the same (possibly disconnected) | |
| //net on this pin | |
| if(blk_net_id != sib_net_id) { | |
| //Nets do not match, not feasible | |
| return false; | |
| } | |
| } | |
| } | |
| } | |
| return true; | |
| } | |
| /*****************************************/ | |
| static PackMoleculeId get_molecule_by_num_ext_inputs( | |
| const PackMolecules& molecules, | |
| const MoleculeStats& molecule_stats, | |
| const int ext_inps, const enum e_removal_policy remove_flag, | |
| t_cluster_placement_stats& cluster_placement_stats) { | |
/* This routine returns a molecule which has not been clustered, has no       *
 * connection to the current cluster, satisfies the cluster clock             *
 * constraints, is a valid subblock inside the cluster, does not exceed the   *
 * cluster subblock units available, and has ext_inps external inputs. If     *
 * there is no such molecule it returns PackMoleculeId::INVALID(). remove_flag *
 * controls whether or not molecules that have already been clustered are     *
 * removed from the unclustered_list data structures.                         */
| t_molecule_link* prev_ptr = &unclustered_list_head[ext_inps]; | |
| t_molecule_link* ptr = unclustered_list_head[ext_inps].next; | |
| while (ptr != nullptr) { | |
| /* TODO: Get better candidate atom block in future, eg. return most timing critical or some other smarter metric */ | |
| PackMoleculeId molecule_id = ptr->molecule_id; | |
| if (molecule_stats.valid(molecule_id)) { | |
| const PackMolecule& molecule = molecules.pack_molecules[molecule_id]; | |
| if (exists_free_primitives_for_molecule(molecule, cluster_placement_stats)) { | |
| return molecule_id; | |
| } | |
| prev_ptr = ptr; | |
| } else if (remove_flag == REMOVE_CLUSTERED) { | |
VTR_ASSERT(0); /* this doesn't work right now with the 2-pass packing for each complex block */
| prev_ptr->next = ptr->next; | |
| } | |
| ptr = ptr->next; | |
| } | |
| return PackMoleculeId::INVALID(); | |
| } | |
| /*****************************************/ | |
| static PackMoleculeId get_free_molecule_with_most_ext_inputs_for_cluster( | |
| const PackMolecules& molecules, | |
| const MoleculeStats& molecule_stats, | |
| t_pb *cur_pb, | |
| t_cluster_placement_stats& cluster_placement_stats) { | |
/* This routine is used to find new molecules for clustering when there are no  *
 * feasible molecules with any attraction to the current cluster (i.e. it finds *
 * molecules which are unconnected from the current cluster). It returns        *
 * the molecule with the largest number of used inputs that satisfies the       *
 * clocking and number-of-inputs constraints. If no suitable molecule is        *
 * found, the routine returns PackMoleculeId::INVALID().                        *
 * TODO: Analyze in more detail whether this function is useful; also, it       *
 * should probably not include clocks in the input count.                       *
 */
| int ext_inps; | |
| int i, j; | |
| int inputs_avail = 0; | |
| for (i = 0; i < cur_pb->pb_graph_node->num_input_pin_class; i++) { | |
| for (j = 0; j < cur_pb->pb_graph_node->input_pin_class_size[i]; j++) { | |
| if (cur_pb->pb_stats->input_pins_used[i][j]) | |
| inputs_avail++; | |
| } | |
| } | |
| if (inputs_avail >= unclustered_list_head_size) { | |
| inputs_avail = unclustered_list_head_size - 1; | |
| } | |
| PackMoleculeId molecule; | |
| for (ext_inps = inputs_avail; ext_inps >= 0; ext_inps--) { | |
| molecule = get_molecule_by_num_ext_inputs(molecules, molecule_stats, | |
| ext_inps, LEAVE_CLUSTERED, cluster_placement_stats); | |
| if (molecule) { | |
| break; | |
| } | |
| } | |
| return molecule; | |
| } | |
| /*****************************************/ | |
| static PackMoleculeId get_seed_logical_molecule_with_most_ext_inputs(const MoleculeStats& molecule_stats) { | |
/* This routine is used to find the first seed molecule for the clustering. It returns *
 * the molecule with the largest number of used inputs that satisfies the             *
 * clocking and number-of-inputs constraints. If no suitable molecule is              *
 * found, the routine returns PackMoleculeId::INVALID().                              */
| for (int ext_inps = molecule_stats.max_num_ext_inputs(); ext_inps >= 0; ext_inps--) { | |
| t_molecule_link* ptr = unclustered_list_head[ext_inps].next; | |
| while (ptr != nullptr) { | |
| if (molecule_stats.valid(ptr->molecule_id)) { | |
| return ptr->molecule_id; | |
| } | |
| ptr = ptr->next; | |
| } | |
| } | |
| return PackMoleculeId::INVALID(); | |
| } | |
| /*****************************************/ | |
| /*****************************************/ | |
| static void alloc_and_load_pb_stats(t_pb *pb) { | |
| /* Call this routine when starting to fill up a new cluster. It resets * | |
| * the gain vector, etc. */ | |
| int i; | |
| pb->pb_stats = new t_pb_stats; | |
/* Marked-block bookkeeping is for speed. If nets are reasonably low-fanout, *
 * only a relatively small number of blocks will be marked, and updating     *
 * only those atom block structures will be fastest. If almost all blocks    *
 * have been touched it should be faster to just run through them all        *
 * in order (less addressing and better cache locality).                     */
| pb->pb_stats->input_pins_used = std::vector<std::vector<AtomNetId>>(pb->pb_graph_node->num_input_pin_class); | |
| pb->pb_stats->output_pins_used = std::vector<std::vector<AtomNetId>>(pb->pb_graph_node->num_output_pin_class); | |
| pb->pb_stats->lookahead_input_pins_used = std::vector<std::vector<AtomNetId>>(pb->pb_graph_node->num_input_pin_class); | |
| pb->pb_stats->lookahead_output_pins_used = std::vector<std::vector<AtomNetId>>(pb->pb_graph_node->num_output_pin_class); | |
| pb->pb_stats->feasible_blocks = std::vector<PackMoleculeId>(AAPACK_MAX_FEASIBLE_BLOCK_ARRAY_SIZE); | |
| pb->pb_stats->num_feasible_blocks = NOT_VALID; | |
| pb->pb_stats->tie_break_high_fanout_net = AtomNetId::INVALID(); | |
| for (i = 0; i < pb->pb_graph_node->num_input_pin_class; i++) { | |
| pb->pb_stats->input_pins_used[i] = std::vector<AtomNetId>(pb->pb_graph_node->input_pin_class_size[i]); | |
| } | |
| for (i = 0; i < pb->pb_graph_node->num_output_pin_class; i++) { | |
| pb->pb_stats->output_pins_used[i] = std::vector<AtomNetId>(pb->pb_graph_node->output_pin_class_size[i]); | |
| } | |
| pb->pb_stats->gain.clear(); | |
| pb->pb_stats->timinggain.clear(); | |
| pb->pb_stats->connectiongain.clear(); | |
| pb->pb_stats->sharinggain.clear(); | |
| pb->pb_stats->hillgain.clear(); | |
| pb->pb_stats->num_pins_of_net_in_pb.clear(); | |
| pb->pb_stats->num_child_blocks_in_pb = 0; | |
| pb->pb_stats->explore_transitive_fanout = true; | |
| pb->pb_stats->transitive_fanout_candidates = nullptr; | |
| } | |
| /*****************************************/ | |
| /** | |
| * Try pack molecule into current cluster | |
| */ | |
| static enum e_block_pack_status try_pack_molecule( | |
| t_cluster_placement_stats& cluster_placement_stats, | |
| const PackMolecules& molecules, | |
| const PackMoleculeId molecule_id, | |
| MoleculeStats& molecule_stats, | |
| vtr::vector<MoleculeBlockId,t_pb_graph_node*>& primitives_list, | |
| t_pb * pb, const int max_models, const int max_cluster_size, | |
| const ClusterBlockId clb_index, const int detailed_routing_stage, t_lb_router_data *router_data, | |
| bool debug_clustering, | |
| bool enable_pin_feasibility_filter) { | |
| enum e_block_pack_status block_pack_status; | |
| t_pb *parent; | |
| t_pb *cur_pb; | |
| bool is_root_of_chain; | |
| t_pb_graph_pin *chain_root_pin; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| parent = nullptr; | |
| block_pack_status = BLK_STATUS_UNDEFINED; | |
| std::vector<AtomBlockId> placed_atoms; | |
| auto& molecule = molecules.pack_molecules[molecule_id]; | |
| while (block_pack_status != BLK_PASSED) { | |
| if (get_next_primitive_list(cluster_placement_stats, molecules, molecule_id, molecule_stats, primitives_list)) { | |
| block_pack_status = BLK_PASSED; | |
| for (auto molecule_blk : molecule.blocks()) { | |
| if (molecule.block_type(molecule_blk) == PackMolecule::BlockType::EXTERNAL) continue; | |
| AtomBlockId atom_blk = molecule.block_atom(molecule_blk); | |
| VTR_ASSERT(atom_blk); | |
| if (primitives_list[molecule_blk] == nullptr) continue; | |
| //TODO: remove these? | |
| chain_root_pin = nullptr; | |
| is_root_of_chain = false; | |
| block_pack_status = try_place_atom_block_rec( | |
| primitives_list[molecule_blk], | |
| atom_blk, pb, &parent, | |
| max_models, max_cluster_size, clb_index, | |
| cluster_placement_stats, is_root_of_chain, chain_root_pin, router_data); | |
| if (block_pack_status != BLK_PASSED) break; | |
| placed_atoms.push_back(atom_blk); | |
| } | |
| if (enable_pin_feasibility_filter && block_pack_status == BLK_PASSED) { | |
/* Check if pin usage is feasible for the current packing assignment */
| reset_lookahead_pins_used(pb); | |
| try_update_lookahead_pins_used(pb); | |
| if (!check_lookahead_pins_used(pb)) { | |
| block_pack_status = BLK_FAILED_FEASIBLE; | |
| } | |
| } | |
| if (block_pack_status == BLK_PASSED) { | |
| /* Try to route if heuristic is to route for every atom | |
| Skip routing if heuristic is to route at the end of packing complex block | |
| */ | |
| if (detailed_routing_stage == (int)E_DETAILED_ROUTE_FOR_EACH_ATOM && try_intra_lb_route(router_data, debug_clustering) == false) { | |
| /* Cannot pack */ | |
| block_pack_status = BLK_FAILED_ROUTE; | |
| } else { | |
| /* Pack successful, commit | |
| TODO: SW Engineering note - may want to update cluster stats here too instead of doing it outside | |
| */ | |
| VTR_ASSERT(block_pack_status == BLK_PASSED); | |
| #if 0 | |
| if (molecule.blocks().size() > 2) { | |
| /* Chained molecules often take up lots of area and are important, | |
| * if a chain is packed in, want to rename logic block to match chain name */ | |
| AtomBlockId chain_root_blk_id = molecule.root_block_atom(); | |
| cur_pb = atom_ctx.lookup.atom_pb(chain_root_blk_id)->parent_pb; | |
| while(cur_pb != nullptr) { | |
| free(cur_pb->name); | |
| cur_pb->name = vtr::strdup(atom_ctx.nlist.block_name(chain_root_blk_id).c_str()); | |
| cur_pb = cur_pb->parent_pb; | |
| } | |
| } | |
| #endif | |
| for (auto molecule_blk : molecule.blocks()) { | |
| if (molecule.block_type(molecule_blk) == PackMolecule::BlockType::EXTERNAL) continue; | |
| AtomBlockId atom_blk = molecule.block_atom(molecule_blk); | |
| VTR_ASSERT(atom_blk); | |
| if (primitives_list[molecule_blk] == nullptr) continue; | |
| /* invalidate all molecules that share atom block with current molecule */ | |
| auto rng = molecules.atom_molecules.equal_range(atom_blk); | |
| for(const auto& kv : vtr::make_range(rng.first, rng.second)) { | |
| PackMoleculeId related_molecule_id = kv.second; | |
| vtr::printf("Invalidating molecule %zu for atom '%s' from molecule %zu\n", related_molecule_id, atom_ctx.nlist.block_name(atom_blk).c_str(), molecule_id); | |
| molecule_stats.set_valid(related_molecule_id, false); | |
| } | |
| commit_primitive(cluster_placement_stats, primitives_list[molecule_blk]); | |
| } | |
| } | |
| } | |
| if (block_pack_status != BLK_PASSED) { | |
| for (AtomBlockId atom_blk : placed_atoms) { | |
| remove_atom_from_target(router_data, atom_blk); | |
| revert_place_atom_block( | |
| atom_blk, | |
| router_data, | |
| molecules, | |
| molecule_stats); | |
| } | |
| } | |
| } else { | |
| block_pack_status = BLK_FAILED_FEASIBLE; | |
| break; /* no more candidate primitives available, this molecule will not pack, return fail */ | |
| } | |
| } | |
| return block_pack_status; | |
| } | |
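/* Design note: the while loop above retries the molecule against successive
 * candidate primitive mappings from get_next_primitive_list(); any placement,
 * pin-feasibility, or routing failure reverts the partially placed atoms and
 * tries the next mapping, and the loop exits only on BLK_PASSED or when no
 * candidate mappings remain (BLK_FAILED_FEASIBLE). */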
| /** | |
| * Try place atom block into current primitive location | |
| */ | |
| static enum e_block_pack_status try_place_atom_block_rec( | |
| const t_pb_graph_node *pb_graph_node, const AtomBlockId blk_id, | |
| t_pb *cb, t_pb **parent, const int max_models, | |
| const int max_cluster_size, const ClusterBlockId clb_index, | |
| const t_cluster_placement_stats& cluster_placement_stats, | |
| const bool is_root_of_chain, const t_pb_graph_pin *chain_root_pin, t_lb_router_data *router_data) { | |
| int i, j; | |
| bool is_primitive; | |
| enum e_block_pack_status block_pack_status; | |
| t_pb *my_parent; | |
| t_pb *pb, *parent_pb; | |
| const t_pb_type *pb_type; | |
| auto& atom_ctx = g_vpr_ctx.mutable_atom(); | |
| my_parent = nullptr; | |
| block_pack_status = BLK_PASSED; | |
| /* Discover parent */ | |
| if (pb_graph_node->parent_pb_graph_node != cb->pb_graph_node) { | |
| block_pack_status = try_place_atom_block_rec( | |
| pb_graph_node->parent_pb_graph_node, blk_id, cb, | |
| &my_parent, max_models, max_cluster_size, clb_index, | |
| cluster_placement_stats, is_root_of_chain, chain_root_pin, router_data); | |
| parent_pb = my_parent; | |
| } else { | |
| parent_pb = cb; | |
| vtr::printf("Placing atom primitive block '%s' in cluster '%s' of type '%s'\n", | |
| atom_ctx.nlist.block_name(blk_id).c_str(), cb->name, cb->pb_graph_node->pb_type->name); | |
| } | |
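| /* The recursion above walks up towards the cluster root, so by the time we | |
| * get here every ancestor pb of this node has been allocated (and had its | |
| * mode set); the base case is the direct child of the cluster root. */ | |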
| /* Create siblings if siblings are not allocated */ | |
| if (parent_pb->child_pbs == nullptr) { | |
| atom_ctx.lookup.set_atom_pb(AtomBlockId::INVALID(), parent_pb); | |
| VTR_ASSERT(parent_pb->name == nullptr); | |
| parent_pb->name = vtr::strdup(atom_ctx.nlist.block_name(blk_id).c_str()); | |
| parent_pb->mode = pb_graph_node->pb_type->parent_mode->index; | |
| set_reset_pb_modes(router_data, parent_pb, true); | |
| const t_mode* mode = &parent_pb->pb_graph_node->pb_type->modes[parent_pb->mode]; | |
| parent_pb->child_pbs = new t_pb*[mode->num_pb_type_children]; | |
| for (i = 0; i < mode->num_pb_type_children; i++) { | |
| parent_pb->child_pbs[i] = new t_pb[mode->pb_type_children[i].num_pb]; | |
| for (j = 0; j < mode->pb_type_children[i].num_pb; j++) { | |
| parent_pb->child_pbs[i][j].parent_pb = parent_pb; | |
| atom_ctx.lookup.set_atom_pb(AtomBlockId::INVALID(), &parent_pb->child_pbs[i][j]); | |
| parent_pb->child_pbs[i][j].pb_graph_node = | |
| &(parent_pb->pb_graph_node->child_pb_graph_nodes[parent_pb->mode][i][j]); | |
| } | |
| } | |
| } else { | |
| VTR_ASSERT(parent_pb->mode == pb_graph_node->pb_type->parent_mode->index); | |
| } | |
| const t_mode* mode = &parent_pb->pb_graph_node->pb_type->modes[parent_pb->mode]; | |
| for (i = 0; i < mode->num_pb_type_children; i++) { | |
| if (pb_graph_node->pb_type == &mode->pb_type_children[i]) { | |
| break; | |
| } | |
| } | |
| VTR_ASSERT(i < mode->num_pb_type_children); | |
| pb = &parent_pb->child_pbs[i][pb_graph_node->placement_index]; | |
| *parent = pb; /* this pb is the parent of the child that called this function */ | |
| VTR_ASSERT(pb->pb_graph_node == pb_graph_node); | |
| if (pb->pb_stats == nullptr) { | |
| alloc_and_load_pb_stats(pb); | |
| } | |
| pb_type = pb_graph_node->pb_type; | |
| is_primitive = (pb_type->num_modes == 0); | |
| if (is_primitive) { | |
| VTR_ASSERT(!atom_ctx.lookup.pb_atom(pb) | |
| && atom_ctx.lookup.atom_pb(blk_id) == nullptr | |
| && atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()); | |
| /* try pack to location */ | |
| VTR_ASSERT(pb->name == nullptr); | |
| pb->name = vtr::strdup(atom_ctx.nlist.block_name(blk_id).c_str()); | |
| //Update the atom netlist mappings | |
| atom_ctx.lookup.set_atom_clb(blk_id, clb_index); | |
| atom_ctx.lookup.set_atom_pb(blk_id, pb); | |
| add_atom_as_target(router_data, blk_id); | |
| if (!primitive_feasible(blk_id, pb)) { | |
| /* failed location feasibility check, revert pack */ | |
| block_pack_status = BLK_FAILED_FEASIBLE; | |
| } | |
| if (block_pack_status == BLK_PASSED && is_root_of_chain == true) { | |
| /* is carry chain, must check if this carry chain spans multiple logic blocks or not */ | |
| t_model_ports *root_port = chain_root_pin->port->model_port; | |
| auto port_id = atom_ctx.nlist.find_atom_port(blk_id, root_port); | |
| if(port_id) { | |
| auto chain_net_id = atom_ctx.nlist.port_net(port_id, chain_root_pin->pin_number); | |
| if(chain_net_id) { | |
| /* this carry chain spans multiple logic blocks, must match up correctly with previous chain for this to route */ | |
| if(pb_graph_node != chain_root_pin->parent_node) { | |
| /* this location does not match with the dedicated chain input from outside logic block, therefore not feasible */ | |
| block_pack_status = BLK_FAILED_FEASIBLE; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| if (block_pack_status != BLK_PASSED) { | |
| free(pb->name); | |
| pb->name = nullptr; | |
| } | |
| return block_pack_status; | |
| } | |
| /* Revert the trial placement of atom block blk_id and free up memory space accordingly | |
| */ | |
| static void revert_place_atom_block(const AtomBlockId blk_id, t_lb_router_data *router_data, | |
| const PackMolecules& molecules, | |
| MoleculeStats& molecule_stats) { | |
| auto& atom_ctx = g_vpr_ctx.mutable_atom(); | |
| //We cast away const here since we may free the pb, and it is | |
| //being removed from the active mapping. | |
| // | |
| //In general most code works fine accessing const t_pb*, | |
| //which is why we store them as such in atom_ctx.lookup | |
| t_pb* pb = const_cast<t_pb*>(atom_ctx.lookup.atom_pb(blk_id)); | |
| //Update the atom netlist mapping | |
| atom_ctx.lookup.set_atom_clb(blk_id, ClusterBlockId::INVALID()); | |
| atom_ctx.lookup.set_atom_pb(blk_id, nullptr); | |
| if (pb != nullptr) { | |
| /* When freeing molecules, the current block might already have been freed by a prior revert. | |
| When this happens, there is no need to do anything beyond basic bookkeeping at the atom block. | |
| */ | |
| t_pb* next = pb->parent_pb; | |
| revalid_molecules(pb, molecules, molecule_stats); | |
| free_pb(pb); | |
| pb = next; | |
| while (pb != nullptr) { | |
| /* If this pb was created only to hold the new molecule, remove it. | |
| Must check whether the cluster has already been freed (which can be the case). | |
| */ | |
| next = pb->parent_pb; | |
| if (pb->child_pbs != nullptr && pb->pb_stats != nullptr | |
| && pb->pb_stats->num_child_blocks_in_pb == 0) { | |
| set_reset_pb_modes(router_data, pb, false); | |
| if (next != nullptr) { | |
| /* If the code gets here, then that means that placing the initial seed molecule | |
| * failed, don't free the actual complex block itself as the seed needs to find | |
| * another placement */ | |
| revalid_molecules(pb, molecules, molecule_stats); | |
| free_pb(pb); | |
| } | |
| } | |
| pb = next; | |
| } | |
| } | |
| } | |
| static void update_connection_gain_values(const AtomNetId net_id, const AtomBlockId clustered_blk_id, | |
| t_pb *cur_pb, | |
| enum e_net_relation_to_clustered_block net_relation_to_clustered_block) { | |
| /* This function is called when the connectiongain values on the net | |
| * net_id require updating. */ | |
| int num_internal_connections, num_open_connections, num_stuck_connections; | |
| num_internal_connections = num_open_connections = num_stuck_connections = 0; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| ClusterBlockId clb_index = atom_ctx.lookup.atom_clb(clustered_blk_id); | |
| /* may wish to speed things up by ignoring clock nets since they are high fanout */ | |
| for(auto pin_id : atom_ctx.nlist.net_pins(net_id)) { | |
| auto blk_id = atom_ctx.nlist.pin_block(pin_id); | |
| if (atom_ctx.lookup.atom_clb(blk_id) == clb_index | |
| && is_atom_blk_in_pb(blk_id, atom_ctx.lookup.atom_pb(clustered_blk_id))) { | |
| num_internal_connections++; | |
| } else if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { | |
| num_open_connections++; | |
| } else { | |
| num_stuck_connections++; | |
| } | |
| } | |
| if (net_relation_to_clustered_block == OUTPUT) { | |
| for(auto pin_id : atom_ctx.nlist.net_sinks(net_id)) { | |
| auto blk_id = atom_ctx.nlist.pin_block(pin_id); | |
| VTR_ASSERT(blk_id); | |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { | |
| /* TODO: The gain function is accurate only if the net has one connection to the block. | |
| * TODO: Should we handle the case where the net has multiple connections to the block? | |
| * The gain computation is only off by a bit in that case. */ | |
| if(cur_pb->pb_stats->connectiongain.count(blk_id) == 0) { | |
| cur_pb->pb_stats->connectiongain[blk_id] = 0; | |
| } | |
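| /* Incremental update: if the net already had an internal connection before | |
| * this block was clustered, first subtract the stale contribution, which was | |
| * computed when this sink still counted as open (hence the extra +1 in the | |
| * denominator below). */ | |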
| if (num_internal_connections > 1) { | |
| cur_pb->pb_stats->connectiongain[blk_id] -= 1 / (float) (num_open_connections + 1.5 * num_stuck_connections + 1 + 0.1); | |
| } | |
| cur_pb->pb_stats->connectiongain[blk_id] += 1 / (float) (num_open_connections + 1.5 * num_stuck_connections + 0.1); | |
| } | |
| } | |
| } | |
| if (net_relation_to_clustered_block == INPUT) { | |
| /*Calculate the connectiongain for the atom block which is driving * | |
| *the atom net that is an input to an atom block in the cluster */ | |
| auto driver_pin_id = atom_ctx.nlist.net_driver(net_id); | |
| auto blk_id = atom_ctx.nlist.pin_block(driver_pin_id); | |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { | |
| if(cur_pb->pb_stats->connectiongain.count(blk_id) == 0) { | |
| cur_pb->pb_stats->connectiongain[blk_id] = 0; | |
| } | |
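| /* Same incremental update as the OUTPUT case above: remove the stale | |
| * contribution before adding the recomputed one. */ | |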
| if (num_internal_connections > 1) { | |
| cur_pb->pb_stats->connectiongain[blk_id] -= 1 / (float) (num_open_connections + 1.5 * num_stuck_connections + 1 + 0.1); | |
| } | |
| cur_pb->pb_stats->connectiongain[blk_id] += 1 / (float) (num_open_connections + 1.5 * num_stuck_connections + 0.1); | |
| } | |
| } | |
| } | |
| /*****************************************/ | |
| static void update_timing_gain_values(const AtomNetId net_id, | |
| t_pb *cur_pb, | |
| enum e_net_relation_to_clustered_block net_relation_to_clustered_block, | |
| const SetupTimingInfo& timing_info, | |
| const std::unordered_set<AtomNetId>& is_global) { | |
| /* This function is called when the timing_gain values on the atom net | |
| * net_id require updating. */ | |
| float timinggain; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| /* Check if this atom net lists its driving atom block twice. If so, avoid * | |
| * double counting this atom block by skipping the first (driving) pin. */ | |
| auto pins = atom_ctx.nlist.net_pins(net_id); | |
| if (net_output_feeds_driving_block_input[net_id] != 0) | |
| pins = atom_ctx.nlist.net_sinks(net_id); | |
| if (net_relation_to_clustered_block == OUTPUT | |
| && !is_global.count(net_id)) { | |
| for(auto pin_id : pins) { | |
| auto blk_id = atom_ctx.nlist.pin_block(pin_id); | |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { | |
| timinggain = timing_info.setup_pin_criticality(pin_id); | |
| if(cur_pb->pb_stats->timinggain.count(blk_id) == 0) { | |
| cur_pb->pb_stats->timinggain[blk_id] = 0; | |
| } | |
| if (timinggain > cur_pb->pb_stats->timinggain[blk_id]) | |
| cur_pb->pb_stats->timinggain[blk_id] = timinggain; | |
| } | |
| } | |
| } | |
| if (net_relation_to_clustered_block == INPUT | |
| && !is_global.count(net_id)) { | |
| /*Calculate the timing gain for the atom block which is driving * | |
| *the atom net that is an input to a atom block in the cluster */ | |
| auto driver_pin = atom_ctx.nlist.net_driver(net_id); | |
| auto new_blk_id = atom_ctx.nlist.pin_block(driver_pin); | |
| if (atom_ctx.lookup.atom_clb(new_blk_id) == ClusterBlockId::INVALID()) { | |
| for (auto pin_id : atom_ctx.nlist.net_sinks(net_id)) { | |
| timinggain = timing_info.setup_pin_criticality(pin_id); | |
| if(cur_pb->pb_stats->timinggain.count(new_blk_id) == 0) { | |
| cur_pb->pb_stats->timinggain[new_blk_id] = 0; | |
| } | |
| if (timinggain > cur_pb->pb_stats->timinggain[new_blk_id]) | |
| cur_pb->pb_stats->timinggain[new_blk_id] = timinggain; | |
| } | |
| } | |
| } | |
| } | |
| /*****************************************/ | |
| static void mark_and_update_partial_gain(const AtomNetId net_id, enum e_gain_update gain_flag, | |
| const AtomBlockId clustered_blk_id, | |
| bool timing_driven, | |
| bool connection_driven, | |
| enum e_net_relation_to_clustered_block net_relation_to_clustered_block, | |
| const SetupTimingInfo& timing_info, | |
| const std::unordered_set<AtomNetId>& is_global) { | |
| /* Updates the marked data structures, and if gain_flag is GAIN, * | |
| * the gain when an atom block is added to a cluster. The * | |
| * sharinggain is the number of inputs that an atom block shares with * | |
| * blocks that are already in the cluster. Hillgain is the * | |
| * reduction in the number of pins required by adding an atom block to the * | |
| * cluster. The timinggain is the criticality of the most critical * | |
| * atom net between this atom block and an atom block in the cluster. */ | |
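| /* Note: from the updates below, hillgain[blk] always equals | |
| * sharinggain[blk] - num_ext_inputs_atom_block(blk). For example, an atom | |
| * block with 4 distinct external input nets, 3 of which are already marked | |
| * in the cluster, ends up with sharinggain = 3 and hillgain = -1. */ | |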
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| t_pb* cur_pb = atom_ctx.lookup.atom_pb(clustered_blk_id)->parent_pb; | |
| if (atom_ctx.nlist.net_sinks(net_id).size() > AAPACK_MAX_NET_SINKS_IGNORE) { | |
| /* Optimization: Marking all the sinks of a high-fanout net is too | |
| * runtime costly for a net that probably has no hope of ever being | |
| * absorbed, so ignore such high-fanout nets */ | |
| if(!is_global.count(net_id)) { | |
| /* If no low/medium fanout nets are available, we may still need to | |
| * consider high-fanout nets for packing, so remember the smallest one */ | |
| while(cur_pb->parent_pb != nullptr) { | |
| cur_pb = cur_pb->parent_pb; | |
| } | |
| AtomNetId stored_net = cur_pb->pb_stats->tie_break_high_fanout_net; | |
| if(!stored_net || atom_ctx.nlist.net_sinks(net_id).size() < atom_ctx.nlist.net_sinks(stored_net).size()) { | |
| cur_pb->pb_stats->tie_break_high_fanout_net = net_id; | |
| } | |
| } | |
| return; | |
| } | |
| while (cur_pb) { | |
| /* Mark atom net as being visited, if necessary. */ | |
| if (cur_pb->pb_stats->num_pins_of_net_in_pb.count(net_id) == 0) { | |
| cur_pb->pb_stats->marked_nets.push_back(net_id); | |
| } | |
| /* Update gains of affected blocks. */ | |
| if (gain_flag == GAIN) { | |
| /* Check if this net is connected to its driver block multiple times (i.e. as both an output and an input) | |
| * If so, avoid double counting by skipping the first (driving) pin. */ | |
| auto pins = atom_ctx.nlist.net_pins(net_id); | |
| if (net_output_feeds_driving_block_input[net_id] != 0) | |
| //We implicitly assume here that net_output_feeds_driving_block_input[net_id] is 2 | |
| //(i.e. the net loops back to the block only once) | |
| pins = atom_ctx.nlist.net_sinks(net_id); | |
| if (cur_pb->pb_stats->num_pins_of_net_in_pb.count(net_id) == 0) { | |
| for(auto pin_id : pins) { | |
| auto blk_id = atom_ctx.nlist.pin_block(pin_id); | |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { | |
| if (cur_pb->pb_stats->sharinggain.count(blk_id) == 0) { | |
| cur_pb->pb_stats->marked_blocks.push_back(blk_id); | |
| cur_pb->pb_stats->sharinggain[blk_id] = 1; | |
| cur_pb->pb_stats->hillgain[blk_id] = 1 - num_ext_inputs_atom_block(blk_id); | |
| } else { | |
| cur_pb->pb_stats->sharinggain[blk_id]++; | |
| cur_pb->pb_stats->hillgain[blk_id]++; | |
| } | |
| } | |
| } | |
| } | |
| if (connection_driven) { | |
| update_connection_gain_values(net_id, clustered_blk_id, cur_pb, | |
| net_relation_to_clustered_block); | |
| } | |
| if (timing_driven) { | |
| update_timing_gain_values(net_id, cur_pb, | |
| net_relation_to_clustered_block, | |
| timing_info, | |
| is_global); | |
| } | |
| } | |
| if(cur_pb->pb_stats->num_pins_of_net_in_pb.count(net_id) == 0) { | |
| cur_pb->pb_stats->num_pins_of_net_in_pb[net_id] = 0; | |
| } | |
| cur_pb->pb_stats->num_pins_of_net_in_pb[net_id]++; | |
| cur_pb = cur_pb->parent_pb; | |
| } | |
| } | |
| /*****************************************/ | |
| static void update_total_gain(float alpha, float beta, bool timing_driven, | |
| bool connection_driven, t_pb *pb) { | |
| /*Updates the total gain array to reflect the desired tradeoff between* | |
| *input sharing (sharinggain) and path_length minimization (timinggain)*/ | |
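| /* With both options enabled, the per-block gain computed below is: | |
| * gain = alpha * timinggain | |
| * + (1 - alpha) * ((1 - beta) * sharinggain + beta * connectiongain) / num_used_pins | |
| * where num_used_pins is the number of input plus output pins used by the block. */ | |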
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| t_pb * cur_pb = pb; | |
| while (cur_pb) { | |
| for (AtomBlockId blk_id : cur_pb->pb_stats->marked_blocks) { | |
| if(cur_pb->pb_stats->connectiongain.count(blk_id) == 0) { | |
| cur_pb->pb_stats->connectiongain[blk_id] = 0; | |
| } | |
| if(cur_pb->pb_stats->sharinggain.count(blk_id) == 0) { | |
| cur_pb->pb_stats->sharinggain[blk_id] = 0; | |
| } | |
| /* Todo: This was used to explore different normalization options, can | |
| * be made more efficient once we decide on which one to use*/ | |
| int num_used_input_pins = atom_ctx.nlist.block_input_pins(blk_id).size(); | |
| int num_used_output_pins = atom_ctx.nlist.block_output_pins(blk_id).size(); | |
| /* end todo */ | |
| /* Calculate area-only cost function */ | |
| int num_used_pins = num_used_input_pins + num_used_output_pins; | |
| VTR_ASSERT(num_used_pins > 0); | |
| if (connection_driven) { | |
| /*try to absorb as many connections as possible*/ | |
| cur_pb->pb_stats->gain[blk_id] = ((1 - beta) | |
| * (float) cur_pb->pb_stats->sharinggain[blk_id] | |
| + beta * (float) cur_pb->pb_stats->connectiongain[blk_id]) | |
| / (num_used_pins); | |
| } else { | |
| cur_pb->pb_stats->gain[blk_id] = | |
| ((float) cur_pb->pb_stats->sharinggain[blk_id]) | |
| / (num_used_pins); | |
| } | |
| /* Add in timing driven cost into cost function */ | |
| if (timing_driven) { | |
| cur_pb->pb_stats->gain[blk_id] = alpha | |
| * cur_pb->pb_stats->timinggain[blk_id] | |
| + (1.0 - alpha) * (float) cur_pb->pb_stats->gain[blk_id]; | |
| } | |
| } | |
| cur_pb = cur_pb->parent_pb; | |
| } | |
| } | |
| /*****************************************/ | |
| static void update_cluster_stats( | |
| const PackMolecules& molecules, | |
| const PackMoleculeId molecule_id, | |
| const ClusterBlockId clb_index, | |
| const std::unordered_set<AtomNetId>& is_clock, | |
| const std::unordered_set<AtomNetId>& is_global, | |
| const bool global_clocks, | |
| const float alpha, const float beta, const bool timing_driven, | |
| const bool connection_driven, | |
| const SetupTimingInfo& timing_info) { | |
| /* Updates cluster stats such as gain, used pins, and clock structures. */ | |
| t_pb *cur_pb, *cb; | |
| /* TODO: what a scary comment from Vaughn, we'll have to watch out for this causing problems */ | |
| /* Output can be open so the check is necessary. I don't change * | |
| * the gain for clock outputs when clocks are globally distributed * | |
| * because I assume there is no real need to pack similarly clocked * | |
| * FFs together then. Note that updating the gain when the * | |
| * clock driver is placed in a cluster implies that the output of * | |
| * LUTs can be connected to clock inputs internally. Probably not * | |
| * true, but it doesn't make much difference, since it will still * | |
| * make local routing of this clock very short, and none of my * | |
| * benchmarks actually generate local clocks (all come from pads). */ | |
| auto& atom_ctx = g_vpr_ctx.mutable_atom(); | |
| cb = nullptr; | |
| auto& molecule = molecules.pack_molecules[molecule_id]; | |
| for (auto molecule_blk : molecule.blocks()) { | |
| if (molecule.block_type(molecule_blk) == PackMolecule::BlockType::EXTERNAL) continue; | |
| AtomBlockId blk_id = molecule.block_atom(molecule_blk); | |
| VTR_ASSERT(blk_id); | |
| //Update atom netlist mapping | |
| atom_ctx.lookup.set_atom_clb(blk_id, clb_index); | |
| cur_pb = atom_ctx.lookup.atom_pb(blk_id)->parent_pb; | |
| while (cur_pb) { | |
| /* reset list of feasible blocks */ | |
| cur_pb->pb_stats->num_feasible_blocks = NOT_VALID; | |
| cur_pb->pb_stats->num_child_blocks_in_pb++; | |
| if (cur_pb->parent_pb == nullptr) { | |
| cb = cur_pb; | |
| } | |
| cur_pb = cur_pb->parent_pb; | |
| } | |
| /* Outputs first */ | |
| for(auto pin_id : atom_ctx.nlist.block_output_pins(blk_id)) { | |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); | |
| if (!is_clock.count(net_id) || !global_clocks) { | |
| mark_and_update_partial_gain(net_id, GAIN, blk_id, | |
| timing_driven, | |
| connection_driven, OUTPUT, | |
| timing_info, | |
| is_global); | |
| } else { | |
| mark_and_update_partial_gain(net_id, NO_GAIN, blk_id, | |
| timing_driven, | |
| connection_driven, OUTPUT, | |
| timing_info, | |
| is_global); | |
| } | |
| } | |
| /* Next Inputs */ | |
| for(auto pin_id : atom_ctx.nlist.block_input_pins(blk_id)) { | |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); | |
| mark_and_update_partial_gain(net_id, GAIN, blk_id, | |
| timing_driven, connection_driven, | |
| INPUT, | |
| timing_info, | |
| is_global); | |
| } | |
| /* Finally Clocks */ | |
| /* Note: The code below ONLY WORKS when nets that go to clock inputs * | |
| * NEVER go to the input of a VPACK_COMB. It doesn't really make electrical * | |
| * sense for that to happen, and I check this in the check_clocks * | |
| * function. Don't disable that sanity check. */ | |
| //TODO: lift above restriction (does happen on some circuits) | |
| for(auto pin_id : atom_ctx.nlist.block_clock_pins(blk_id)) { | |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); | |
| if (global_clocks) { | |
| mark_and_update_partial_gain(net_id, NO_GAIN, blk_id, | |
| timing_driven, connection_driven, INPUT, | |
| timing_info, | |
| is_global); | |
| } else { | |
| mark_and_update_partial_gain(net_id, GAIN, blk_id, | |
| timing_driven, connection_driven, INPUT, | |
| timing_info, | |
| is_global); | |
| } | |
| } | |
| update_total_gain(alpha, beta, timing_driven, connection_driven, | |
| atom_ctx.lookup.atom_pb(blk_id)->parent_pb); | |
| commit_lookahead_pins_used(cb); | |
| } | |
| } | |
| static void start_new_cluster( | |
| std::vector<t_cluster_placement_stats>& cluster_placement_stats, | |
| vtr::vector<MoleculeBlockId,t_pb_graph_node*>& primitives_list, | |
| const PackMolecules& molecules, | |
| const PackMoleculeId molecule_id, | |
| MoleculeStats& molecule_stats, | |
| ClusterBlockId clb_index, | |
| std::map<t_type_ptr, size_t>& num_used_type_instances, | |
| const float target_device_utilization, | |
| const int num_models, const int max_cluster_size, | |
| const t_arch* arch, | |
| std::string device_layout_name, | |
| const vector<t_lb_type_rr_graph>& lb_type_rr_graphs, | |
| const std::vector<t_lb_type_rr_graph_info>& lb_type_rr_graph_infos, | |
| t_lb_router_data **router_data, | |
| const int detailed_routing_stage, | |
| ClusteredNetlist *clb_nlist, | |
| const std::map<const t_model*,std::vector<t_type_ptr>>& primitive_candidate_block_types, | |
| bool debug_clustering, | |
| bool enable_pin_feasibility_filter) { | |
| /* Given a starting seed molecule, start_new_cluster determines the next cluster type to use. | |
| It expands the FPGA grid if it cannot find a legal cluster for the molecule. | |
| */ | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| auto& device_ctx = g_vpr_ctx.mutable_device(); | |
| /* Allocate a dummy initial cluster, load the seed molecule, and check that it is legal */ | |
| auto& molecule = molecules.pack_molecules[molecule_id]; | |
| std::string molecule_name = molecule.name(); //TODO: choose a better name | |
| std::vector<t_type_ptr> candidate_types = find_candidate_types_for_molecule(molecule, primitive_candidate_block_types); | |
| //We sort the candidate types in ascending order by their current utilization. | |
| //This means that the packer will prefer to use types with lower utilization. | |
| //This is a naive approach to try balancing utilization when multiple types can | |
| //support the same primitive(s). | |
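| //For example, if both a CLB-like type and a DSP-capable type can implement | |
| //the molecule, and the CLB type is 80% utilized while the DSP type is 20% | |
| //utilized, the DSP type is tried first. | |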
| std::sort(candidate_types.begin(), candidate_types.end(), | |
| [&](t_type_ptr lhs, t_type_ptr rhs) { | |
| float lhs_util = float(num_used_type_instances[lhs]) / device_ctx.grid.num_instances(lhs); | |
| float rhs_util = float(num_used_type_instances[rhs]) / device_ctx.grid.num_instances(rhs); | |
| //Lower util first | |
| return lhs_util < rhs_util; | |
| } | |
| ); | |
| //Try packing into each candidate type | |
| bool success = false; | |
| for (size_t i = 0; i < candidate_types.size(); i++) { | |
| auto type = candidate_types[i]; | |
| t_pb* pb = new t_pb; | |
| pb->pb_graph_node = type->pb_graph_head; | |
| alloc_and_load_pb_stats(pb); | |
| pb->parent_pb = nullptr; | |
| *router_data = alloc_and_load_router_data(lb_type_rr_graphs[type->index], lb_type_rr_graph_infos[type->index], type); | |
| //Try packing into each mode | |
| for (int j = 0; j < type->pb_graph_head->pb_type->num_modes && !success; j++) { | |
| pb->mode = j; | |
| reset_cluster_placement_stats(cluster_placement_stats[type->index]); | |
| set_mode_cluster_placement_stats(pb->pb_graph_node, j); | |
| auto pack_result = try_pack_molecule(cluster_placement_stats[type->index], | |
| molecules, | |
| molecule_id, | |
| molecule_stats, | |
| primitives_list, pb, | |
| num_models, max_cluster_size, clb_index, | |
| detailed_routing_stage, *router_data, | |
| debug_clustering, | |
| enable_pin_feasibility_filter); | |
| success = (pack_result == BLK_PASSED); | |
| } | |
| if (success) { | |
| //Once clustering succeeds, add it to the clb netlist | |
| if (pb->name != nullptr) { | |
| free(pb->name); | |
| } | |
| pb->name = vtr::strdup(molecule_name.c_str()); | |
| clb_index = clb_nlist->create_block(molecule_name.c_str(), pb, type); | |
| break; | |
| } else { | |
| //Free failed clustering and try again | |
| free_router_data(*router_data); | |
| free_pb(pb); | |
| delete pb; | |
| *router_data = nullptr; | |
| } | |
| } | |
| if (!success) { | |
| //Explored all candidates | |
| std::string msg; | |
| msg += "Can not find any logic block that can implement molecule '" + molecule_name + "' consisting of:\n"; | |
| auto& molecule = molecules.pack_molecules[molecule_id]; | |
| for (auto molecule_blk : molecule.blocks()) { | |
| if (molecule.block_type(molecule_blk) == PackMolecule::BlockType::EXTERNAL) continue; | |
| auto atom_blk = molecule.block_atom(molecule_blk); | |
| msg += vtr::string_fmt("\tAtom %s (%s)\n", atom_ctx.nlist.block_name(atom_blk).c_str(), atom_ctx.nlist.block_model(atom_blk)->name); | |
| } | |
| VPR_THROW(VPR_ERROR_PACK, "%s", msg.c_str()); | |
| } | |
| VTR_ASSERT(success); | |
| //Successfully create cluster | |
| num_used_type_instances[clb_nlist->block_type(clb_index)]++; | |
| /* Expand FPGA size if needed */ | |
| if (num_used_type_instances[clb_nlist->block_type(clb_index)] > device_ctx.grid.num_instances(clb_nlist->block_type(clb_index))) { | |
| device_ctx.grid = create_device_grid(device_layout_name, arch->grid_layouts, num_used_type_instances, target_device_utilization); | |
| vtr::printf_info("Not enough resources expand FPGA size to (%d x %d)\n", | |
| device_ctx.grid.width(), device_ctx.grid.height()); | |
| } | |
| } | |
| /* | |
| Get candidate molecule to pack into currently open cluster | |
| Molecule selection priority: | |
| 1. Find unpacked molecule based on criticality and strong connectedness (connected by low fanout nets) with current cluster | |
| 2. Find unpacked molecule based on weak connectedness (connected by high fanout nets) with current cluster | |
| 3. Find unpacked molecule based on transitive connections (e.g. 2 hops away) with current cluster | |
| */ | |
| static PackMoleculeId get_highest_gain_molecule( | |
| t_pb *cur_pb, | |
| const PackMolecules& molecules, | |
| const MoleculeStats& molecule_stats, | |
| const enum e_gain_type gain_mode, | |
| t_cluster_placement_stats& cluster_placement_stats, | |
| vtr::vector_map<ClusterBlockId,std::vector<AtomNetId>> &clb_inter_blk_nets, | |
| const ClusterBlockId cluster_index) { | |
| /* This routine populates a list of feasible molecules outside the cluster, | |
| * then returns the best candidate from that list. If there are no feasible | |
| * candidates it returns PackMoleculeId::INVALID(). */ | |
| int index, count; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| if (gain_mode == HILL_CLIMBING) { | |
| vpr_throw(VPR_ERROR_PACK, __FILE__, __LINE__, | |
| "Hill climbing not supported yet, error out.\n"); | |
| } | |
| // 1. Find unpacked molecule based on criticality and strong connectedness (connected by low fanout nets) with current cluster | |
| if (cur_pb->pb_stats->num_feasible_blocks == NOT_VALID) { | |
| cur_pb->pb_stats->num_feasible_blocks = 0; | |
| cur_pb->pb_stats->explore_transitive_fanout = true; /* If no legal molecules found, enable exploration of molecules two hops away */ | |
| for (AtomBlockId blk_id : cur_pb->pb_stats->marked_blocks) { | |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { | |
| auto rng = molecules.atom_molecules.equal_range(blk_id); | |
| for(const auto& kv : vtr::make_range(rng.first, rng.second)) { | |
| PackMoleculeId molecule_id = kv.second; | |
| if (molecule_stats.valid(molecule_id)) { | |
| auto& molecule = molecules.pack_molecules[molecule_id]; | |
| if (exists_free_primitives_for_molecule(molecule, cluster_placement_stats)) { | |
| add_molecule_to_pb_stats_candidates(molecules, molecule_id, molecule_stats, | |
| cur_pb->pb_stats->gain, cur_pb, AAPACK_MAX_FEASIBLE_BLOCK_ARRAY_SIZE); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| // 2. Find unpacked molecule based on weak connectedness (connected by high fanout nets) with current cluster | |
| if(cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->tie_break_high_fanout_net) { | |
| /* Because the packer ignores high-fanout nets when marking what blocks | |
| * to consider, use one of the ignored high-fanout nets to fill the | |
| * candidate list with loosely related blocks */ | |
| reset_tried_but_unused_cluster_placements(cluster_placement_stats); | |
| AtomNetId net_id = cur_pb->pb_stats->tie_break_high_fanout_net; | |
| count = 0; | |
| for(auto pin_id : atom_ctx.nlist.net_pins(net_id)) { | |
| if(count >= AAPACK_MAX_HIGH_FANOUT_EXPLORE) { | |
| break; | |
| } | |
| AtomBlockId blk_id = atom_ctx.nlist.pin_block(pin_id); | |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { | |
| auto rng = molecules.atom_molecules.equal_range(blk_id); | |
| for(const auto& kv : vtr::make_range(rng.first, rng.second)) { | |
| PackMoleculeId molecule_id = kv.second; | |
| if (molecule_stats.valid(molecule_id)) { | |
| auto& molecule = molecules.pack_molecules[molecule_id]; | |
| if (exists_free_primitives_for_molecule(molecule, cluster_placement_stats)) { | |
| add_molecule_to_pb_stats_candidates(molecules, molecule_id, molecule_stats, | |
| cur_pb->pb_stats->gain, cur_pb, min(AAPACK_MAX_FEASIBLE_BLOCK_ARRAY_SIZE, AAPACK_MAX_HIGH_FANOUT_EXPLORE)); | |
| count++; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| cur_pb->pb_stats->tie_break_high_fanout_net = AtomNetId::INVALID(); /* Mark off that this high fanout net has been considered */ | |
| } | |
| // 3. Find unpacked molecule based on transitive connections (eg. 2 hops away) with current cluster | |
| if(cur_pb->pb_stats->num_feasible_blocks == 0 && | |
| !cur_pb->pb_stats->tie_break_high_fanout_net && | |
| cur_pb->pb_stats->explore_transitive_fanout == true | |
| ) { | |
| if(cur_pb->pb_stats->transitive_fanout_candidates == nullptr) { | |
| /* First time finding transitive fanout candidates; alloc and load them */ | |
| cur_pb->pb_stats->transitive_fanout_candidates = new vector<PackMoleculeId>; | |
| load_transitive_fanout_candidates(cluster_index, | |
| molecules, | |
| molecule_stats, | |
| cur_pb->pb_stats, | |
| clb_inter_blk_nets); | |
| /* Only consider candidates that pass a very simple legality check */ | |
| for(int i = 0; i < (int) cur_pb->pb_stats->transitive_fanout_candidates->size(); i++) { | |
| PackMoleculeId molecule_id = (*cur_pb->pb_stats->transitive_fanout_candidates)[i]; | |
| if (molecule_stats.valid(molecule_id)) { | |
| auto& molecule = molecules.pack_molecules[molecule_id]; | |
| if (exists_free_primitives_for_molecule(molecule, cluster_placement_stats)) { | |
| add_molecule_to_pb_stats_candidates(molecules, molecule_id, molecule_stats, | |
| cur_pb->pb_stats->gain, cur_pb, min(AAPACK_MAX_FEASIBLE_BLOCK_ARRAY_SIZE,AAPACK_MAX_TRANSITIVE_EXPLORE)); | |
| } | |
| } | |
| } | |
| } else { | |
| /* Clean up, no more candidates in transitive fanout to consider */ | |
| delete cur_pb->pb_stats->transitive_fanout_candidates; | |
| cur_pb->pb_stats->transitive_fanout_candidates = nullptr; | |
| cur_pb->pb_stats->explore_transitive_fanout = false; | |
| } | |
| } | |
| /* Grab the highest gain molecule. feasible_blocks is maintained in order of | |
| * increasing gain (see add_molecule_to_pb_stats_candidates), so the best | |
| * candidate sits at the end of the array. */ | |
| if (cur_pb->pb_stats->num_feasible_blocks > 0) { | |
| cur_pb->pb_stats->num_feasible_blocks--; | |
| index = cur_pb->pb_stats->num_feasible_blocks; | |
| PackMoleculeId molecule_id = cur_pb->pb_stats->feasible_blocks[index]; | |
| VTR_ASSERT(molecule_stats.valid(molecule_id)); | |
| return molecule_id; | |
| } | |
| return PackMoleculeId::INVALID(); | |
| } | |
| static bool exists_free_primitives_for_molecule(const PackMolecule& molecule, t_cluster_placement_stats& cluster_placement_stats) { | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| for (auto molecule_blk : molecule.blocks()) { | |
| AtomBlockId atom_blk = molecule.block_atom(molecule_blk); | |
| VTR_ASSERT(atom_ctx.lookup.atom_clb(atom_blk) == ClusterBlockId::INVALID()); | |
| if (!exists_free_primitive_for_atom_block(cluster_placement_stats, atom_blk)) { | |
| /* TODO: debating whether to check if placement exists for molecule | |
| * (more robust) or individual atom blocks (faster) */ | |
| return false; | |
| } | |
| } | |
| return true; | |
| } | |
| /*****************************************/ | |
| static PackMoleculeId get_molecule_for_cluster( | |
| t_pb *cur_pb, | |
| const PackMolecules& molecules, | |
| const MoleculeStats& molecule_stats, | |
| const bool allow_unrelated_clustering, | |
| int *num_unrelated_clustering_attempts, | |
| t_cluster_placement_stats& cluster_placement_stats, | |
| vtr::vector_map<ClusterBlockId,std::vector<AtomNetId>> &clb_inter_blk_nets, | |
| ClusterBlockId cluster_index) { | |
| /* Finds the molecule with the greatest gain that satisfies the * | |
| * input, clock and capacity constraints of the cluster. * | |
| * If no suitable molecule is found it returns PackMoleculeId::INVALID(). | |
| */ | |
| PackMoleculeId best_molecule; | |
| /* If cannot pack into primitive, try packing into cluster */ | |
| best_molecule = get_highest_gain_molecule(cur_pb, molecules, molecule_stats, | |
| NOT_HILL_CLIMBING, cluster_placement_stats, clb_inter_blk_nets, cluster_index); | |
| /* If no blocks have any gain to the current cluster, the code above * | |
| * will not find anything. However, another atom block with no inputs in * | |
| * common with the cluster may still be inserted into the cluster. */ | |
| if (allow_unrelated_clustering) { | |
| if (!best_molecule) { | |
| if (*num_unrelated_clustering_attempts == 0) { | |
| best_molecule = get_free_molecule_with_most_ext_inputs_for_cluster( | |
| molecules, | |
| molecule_stats, | |
| cur_pb, | |
| cluster_placement_stats); | |
| (*num_unrelated_clustering_attempts)++; | |
| } | |
| } else { | |
| *num_unrelated_clustering_attempts = 0; | |
| } | |
| } | |
| return best_molecule; | |
| } | |
| /* TODO: Add more error checking! */ | |
| static void check_clustering() { | |
| std::unordered_set<AtomBlockId> atoms_checked; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| auto& cluster_ctx = g_vpr_ctx.clustering(); | |
| if(cluster_ctx.clb_nlist.blocks().size() == 0) { | |
| vtr::printf_warning(__FILE__, __LINE__, "Packing produced no clustered blocks"); | |
| } | |
| /* | |
| * Check that each atom block connects to one physical primitive and that the primitive links up to the parent clb | |
| */ | |
| for(auto blk_id : atom_ctx.nlist.blocks()) { | |
| //Each atom should be part of a pb | |
| const t_pb* atom_pb = atom_ctx.lookup.atom_pb(blk_id); | |
| if(!atom_pb) { | |
| vpr_throw(VPR_ERROR_PACK, __FILE__, __LINE__, | |
| "Atom block %s is not mapped to a pb\n", | |
| atom_ctx.nlist.block_name(blk_id).c_str()); | |
| } | |
| //Check the reverse mapping is consistent | |
| if (atom_ctx.lookup.pb_atom(atom_pb) != blk_id) { | |
| vpr_throw(VPR_ERROR_PACK, __FILE__, __LINE__, | |
| "pb %s does not contain atom block %s but atom block %s maps to pb.\n", | |
| atom_pb->name, | |
| atom_ctx.nlist.block_name(blk_id).c_str(), | |
| atom_ctx.nlist.block_name(blk_id).c_str()); | |
| } | |
| VTR_ASSERT(atom_ctx.nlist.block_name(blk_id) == atom_pb->name); | |
| const t_pb* cur_pb = atom_pb; | |
| while (cur_pb->parent_pb) { | |
| cur_pb = cur_pb->parent_pb; | |
| VTR_ASSERT(cur_pb->name); | |
| } | |
| ClusterBlockId clb_index = atom_ctx.lookup.atom_clb(blk_id); | |
| if(clb_index == ClusterBlockId::INVALID()) { | |
| vpr_throw(VPR_ERROR_PACK, __FILE__, __LINE__, | |
| "Atom %s is not mapped to a CLB\n", | |
| atom_ctx.nlist.block_name(blk_id).c_str()); | |
| } | |
| if (cur_pb != cluster_ctx.clb_nlist.block_pb(clb_index)) { | |
| vpr_throw(VPR_ERROR_PACK, __FILE__, __LINE__, | |
| "CLB %s does not match CLB contained by pb %s.\n", | |
| cur_pb->name, atom_pb->name); | |
| } | |
| } | |
| /* Check that I do not have spurious links in children pbs */ | |
| for (auto blk_id : cluster_ctx.clb_nlist.blocks()) { | |
| check_cluster_atom_blocks(cluster_ctx.clb_nlist.block_pb(blk_id), atoms_checked); | |
| } | |
| for (auto blk_id : atom_ctx.nlist.blocks()) { | |
| if (!atoms_checked.count(blk_id)) { | |
| vpr_throw(VPR_ERROR_PACK, __FILE__, __LINE__, | |
| "Atom block %s not found in any cluster.\n", | |
| atom_ctx.nlist.block_name(blk_id).c_str()); | |
| } | |
| } | |
| } | |
| /* TODO: May want to check that all atom blocks are actually reached */ | |
| static void check_cluster_atom_blocks(t_pb *pb, std::unordered_set<AtomBlockId>& blocks_checked) { | |
| int i, j; | |
| const t_pb_type *pb_type; | |
| bool has_child = false; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| pb_type = pb->pb_graph_node->pb_type; | |
| if (pb_type->num_modes == 0) { | |
| /* primitive */ | |
| auto blk_id = atom_ctx.lookup.pb_atom(pb); | |
| if (blk_id) { | |
| if (blocks_checked.count(blk_id)) { | |
| vpr_throw(VPR_ERROR_PACK, __FILE__, __LINE__, | |
| "pb %s contains atom block %s but atom block is already contained in another pb.\n", | |
| pb->name, atom_ctx.nlist.block_name(blk_id).c_str()); | |
| } | |
| blocks_checked.insert(blk_id); | |
| if (pb != atom_ctx.lookup.atom_pb(blk_id)) { | |
| vpr_throw(VPR_ERROR_PACK, __FILE__, __LINE__, | |
| "pb %s contains atom block %s but atom block does not link to pb.\n", | |
| pb->name, atom_ctx.nlist.block_name(blk_id).c_str()); | |
| } | |
| } | |
| } else { | |
| /* this is a container pb, all container pbs must contain children */ | |
| for (i = 0; i < pb_type->modes[pb->mode].num_pb_type_children; i++) { | |
| for (j = 0; j < pb_type->modes[pb->mode].pb_type_children[i].num_pb; j++) { | |
| if (pb->child_pbs[i] != nullptr) { | |
| if (pb->child_pbs[i][j].name != nullptr) { | |
| has_child = true; | |
| check_cluster_atom_blocks(&pb->child_pbs[i][j], blocks_checked); | |
| } | |
| } | |
| } | |
| } | |
| VTR_ASSERT(has_child); | |
| } | |
| } | |
| static PackMoleculeId get_highest_gain_seed_molecule(int * seedindex, const PackMolecules& molecules, const MoleculeStats& molecule_stats, bool getblend) { | |
| /* Do_timing_analysis must be called before this, or this function | |
| * will return garbage. Returns the molecule containing the most critical | |
| * block; if that block belongs to multiple molecules, the molecule with | |
| * the highest base gain is returned. */ | |
| AtomBlockId blk_id; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| VTR_ASSERT(seed_blend_index_array.size() == critindexarray.size()); | |
| PackMoleculeId best; | |
| while (*seedindex < static_cast<int>(seed_blend_index_array.size())) { | |
| if(getblend == true) { | |
| blk_id = seed_blend_index_array[(*seedindex)++]; | |
| } else { | |
| blk_id = critindexarray[(*seedindex)++]; | |
| } | |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { | |
| auto rng = molecules.atom_molecules.equal_range(blk_id); | |
| for(const auto& kv : vtr::make_range(rng.first, rng.second)) { | |
| PackMoleculeId molecule_id = kv.second; | |
| if (molecule_stats.valid(molecule_id)) { | |
| if (!best || molecule_stats.base_gain(best) < molecule_stats.base_gain(molecule_id)) { | |
| best = molecule_id; | |
| } | |
| } | |
| } | |
| VTR_ASSERT(best); | |
| return best; | |
| } | |
| } | |
| /* If it makes it to here, there are no more blocks available */ | |
| return PackMoleculeId::INVALID(); | |
| } | |
| /* Get the gain of packing the molecule into the current cluster. | |
| The gain is equal to: | |
| total_block_gain | |
| + molecule_base_gain * 0.0001 (tie breaker) | |
| - introduced_input_nets_of_unrelated_blocks_pulled_in_by_molecule * 0.001 (penalty) | |
| */ | |
| static float get_molecule_gain(const PackMolecules& molecules, const PackMoleculeId molecule_id, const MoleculeStats& molecule_stats, std::map<AtomBlockId, float> &blk_gain) { | |
| float gain; | |
| int num_introduced_inputs_of_indirectly_related_block; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| auto& molecule = molecules.pack_molecules[molecule_id]; | |
| gain = 0; | |
| num_introduced_inputs_of_indirectly_related_block = 0; | |
| for (auto molecule_blk_id : molecule.blocks()) { | |
| AtomBlockId atom_blk = molecule.block_atom(molecule_blk_id); | |
| VTR_ASSERT(atom_blk); | |
| if(blk_gain.count(atom_blk) > 0) { | |
| gain += blk_gain[atom_blk]; | |
| } else { | |
| /* This block has no connection with the current cluster; penalize the | |
| * molecule for having this block */ | |
| for(auto pin_id : atom_ctx.nlist.block_input_pins(atom_blk)) { | |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); | |
| VTR_ASSERT(net_id); | |
| auto driver_pin_id = atom_ctx.nlist.net_driver(net_id); | |
| VTR_ASSERT(driver_pin_id); | |
| auto driver_blk_id = atom_ctx.nlist.pin_block(driver_pin_id); | |
| num_introduced_inputs_of_indirectly_related_block++; | |
| for (auto molecule_blk_id2 : molecule.blocks()) { | |
| AtomBlockId atom_blk2 = molecule.block_atom(molecule_blk_id2); | |
| if (driver_blk_id == atom_blk2) { | |
| //The driver is inside the molecule, so this input is not a newly introduced external input | |
| num_introduced_inputs_of_indirectly_related_block--; | |
| break; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| gain += molecule_stats.base_gain(molecule_id) * 0.0001; /* Use base gain as tie breaker TODO: need to sweep this value and perhaps normalize */ | |
| gain -= num_introduced_inputs_of_indirectly_related_block * (0.001); | |
| return gain; | |
| } | |
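| /* qsort-style comparator: orders molecules by ascending base_gain. | |
| * Note: this operates on the legacy t_pack_molecule representation. */ | |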
| static int compare_molecule_gain(const void *a, const void *b) { | |
| float base_gain_a, base_gain_b, diff; | |
| const t_pack_molecule *molecule_a, *molecule_b; | |
| molecule_a = (*(const t_pack_molecule * const *) a); | |
| molecule_b = (*(const t_pack_molecule * const *) b); | |
| base_gain_a = molecule_a->base_gain; | |
| base_gain_b = molecule_b->base_gain; | |
| diff = base_gain_a - base_gain_b; | |
| if (diff > 0) { | |
| return 1; | |
| } | |
| if (diff < 0) { | |
| return -1; | |
| } | |
| return 0; | |
| } | |
| /* Determine if speculatively packed cur_pb is pin feasible | |
| * Runtime is actually not that bad for this. It's worst case O(k^2) where k is the | |
| * number of pb_graph pins. Can use hash tables or make incremental if becomes an issue. | |
| */ | |
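| /* Recurses through the pb hierarchy: for a non-primitive pb that is in use, | |
| * visit all allocated children; for a primitive pb with a mapped atom, mark | |
| * the lookahead pins that atom requires. */ | |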
| static void try_update_lookahead_pins_used(t_pb *cur_pb) { | |
| int i, j; | |
| const t_pb_type *pb_type = cur_pb->pb_graph_node->pb_type; | |
| if (pb_type->num_modes > 0 && cur_pb->name != nullptr) { | |
| if (cur_pb->child_pbs != nullptr) { | |
| for (i = 0; i < pb_type->modes[cur_pb->mode].num_pb_type_children; i++) { | |
| if (cur_pb->child_pbs[i] != nullptr) { | |
| for (j = 0; j < pb_type->modes[cur_pb->mode].pb_type_children[i].num_pb; j++) { | |
| try_update_lookahead_pins_used(&cur_pb->child_pbs[i][j]); | |
| } | |
| } | |
| } | |
| } | |
| } else { | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| AtomBlockId blk_id = atom_ctx.lookup.pb_atom(cur_pb); | |
| if (pb_type->blif_model != nullptr && blk_id) { | |
| compute_and_mark_lookahead_pins_used(blk_id); | |
| } | |
| } | |
| } | |
| /* Resets nets used at different pin classes for determining pin feasibility */ | |
| static void reset_lookahead_pins_used(t_pb *cur_pb) { | |
| int i, j; | |
| const t_pb_type *pb_type = cur_pb->pb_graph_node->pb_type; | |
| if (cur_pb->pb_stats == nullptr) { | |
| return; /* No pins used, no need to continue */ | |
| } | |
| if (pb_type->num_modes > 0 && cur_pb->name != nullptr) { | |
| for (i = 0; i < cur_pb->pb_graph_node->num_input_pin_class; i++) { | |
| cur_pb->pb_stats->lookahead_input_pins_used[i].clear(); | |
| } | |
| for (i = 0; i < cur_pb->pb_graph_node->num_output_pin_class; i++) { | |
| cur_pb->pb_stats->lookahead_output_pins_used[i].clear(); | |
| } | |
| if (cur_pb->child_pbs != nullptr) { | |
| for (i = 0; i < pb_type->modes[cur_pb->mode].num_pb_type_children; i++) { | |
| if (cur_pb->child_pbs[i] != nullptr) { | |
| for (j = 0; j < pb_type->modes[cur_pb->mode].pb_type_children[i].num_pb; j++) { | |
| reset_lookahead_pins_used(&cur_pb->child_pbs[i][j]); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| /* Determine if pins of speculatively packed pb are legal */ | |
| static void compute_and_mark_lookahead_pins_used(const AtomBlockId blk_id) { | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| const t_pb* cur_pb = atom_ctx.lookup.atom_pb(blk_id); | |
| VTR_ASSERT(cur_pb != nullptr); | |
| /* Walk through inputs, outputs, and clocks marking pins off of the same class */ | |
| for(auto pin_id : atom_ctx.nlist.block_pins(blk_id)) { | |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); | |
| const t_pb_graph_pin* pb_graph_pin = find_pb_graph_pin(atom_ctx.nlist, atom_ctx.lookup, pin_id); | |
| compute_and_mark_lookahead_pins_used_for_pin(pb_graph_pin, cur_pb, net_id); | |
| } | |
| } | |
| /* Given a pin and its assigned net, mark all pin classes that are affected */ | |
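| /* Walking up from the primitive pb, at each level of the hierarchy: | |
| * - for an IN_PORT pin, the net only consumes a lookahead input pin of the | |
| * enclosing pb if its driver is not inside that pb and directly reachable; | |
| * - for an OUT_PORT pin, the net only consumes a lookahead output pin if it | |
| * cannot be completely absorbed at that level (i.e. some sink lies outside | |
| * the cluster or is not reachable from the driver pin). */ | |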
| static void compute_and_mark_lookahead_pins_used_for_pin(const t_pb_graph_pin *pb_graph_pin, const t_pb *primitive_pb, const AtomNetId net_id) { | |
| int depth; | |
| int pin_class, output_port; | |
| t_pb * cur_pb; | |
| const t_pb_type *pb_type; | |
| t_port *prim_port; | |
| t_pb_graph_pin *output_pb_graph_pin; | |
| bool skip, found; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| cur_pb = primitive_pb->parent_pb; | |
| while (cur_pb) { | |
| depth = cur_pb->pb_graph_node->pb_type->depth; | |
| pin_class = pb_graph_pin->parent_pin_class[depth]; | |
| VTR_ASSERT(pin_class != OPEN); | |
| auto driver_blk_id = atom_ctx.nlist.net_driver_block(net_id); | |
| if (pb_graph_pin->port->type == IN_PORT) { | |
| /* Find the location of the net driver if it exists in the clb; nullptr otherwise */ | |
| auto driver_pin_id = atom_ctx.nlist.net_driver(net_id); | |
| auto prim_blk_id = atom_ctx.lookup.pb_atom(primitive_pb); | |
| const t_pb* driver_pb = atom_ctx.lookup.atom_pb(driver_blk_id); | |
| output_pb_graph_pin = nullptr; | |
| if (atom_ctx.lookup.atom_clb(driver_blk_id) == atom_ctx.lookup.atom_clb(prim_blk_id)) { | |
| pb_type = driver_pb->pb_graph_node->pb_type; | |
| output_port = 0; | |
| found = false; | |
| for (int i = 0; i < pb_type->num_ports && !found; i++) { | |
| prim_port = &pb_type->ports[i]; | |
| if (prim_port->type == OUT_PORT) { | |
| auto driver_port_id = atom_ctx.nlist.pin_port(driver_pin_id); | |
| auto driver_model_port = atom_ctx.nlist.port_model(driver_port_id); | |
| if (pb_type->ports[i].model_port == driver_model_port) { | |
| found = true; | |
| break; | |
| } | |
| output_port++; | |
| } | |
| } | |
| VTR_ASSERT(found); | |
| output_pb_graph_pin = &(driver_pb->pb_graph_node->output_pins[output_port][atom_ctx.nlist.pin_port_bit(driver_pin_id)]); | |
| } | |
| skip = false; | |
| /* Check if the driving pin for this input is contained within the currently | |
| * investigated cluster; if so, do nothing since no input pin needs to be used */ | |
| if (output_pb_graph_pin != nullptr) { | |
| const t_pb* check_pb = driver_pb; | |
| while (check_pb != nullptr && check_pb != cur_pb) { | |
| check_pb = check_pb->parent_pb; | |
| } | |
| if (check_pb != nullptr) { | |
| for (int i = 0; skip == false && i < output_pb_graph_pin->num_connectable_primitive_input_pins[depth]; i++) { | |
| if (pb_graph_pin == output_pb_graph_pin->list_of_connectable_input_pin_ptrs[depth][i]) { | |
| skip = true; | |
| } | |
| } | |
| } | |
| } | |
| /* Must use input pin */ | |
| if (!skip) { | |
| /* Check if already in pin class, if yes, skip */ | |
| skip = false; | |
| int la_inpins_size = (int)cur_pb->pb_stats->lookahead_input_pins_used[pin_class].size(); | |
| for (int i = 0; i < la_inpins_size; i++) { | |
| if (cur_pb->pb_stats->lookahead_input_pins_used[pin_class][i] == net_id) { | |
| skip = true; | |
| } | |
| } | |
| if (!skip) { | |
| /* Net must take up a slot */ | |
| cur_pb->pb_stats->lookahead_input_pins_used[pin_class].push_back(net_id); | |
| } | |
| } | |
| } else { | |
| VTR_ASSERT(pb_graph_pin->port->type == OUT_PORT); | |
| /* | |
| * Determine if this net (which is driven within this cluster) leaves this cluster | |
| * (and hence uses an output pin). | |
| */ | |
| bool net_exits_cluster = true; | |
| int num_net_sinks = static_cast<int>(atom_ctx.nlist.net_sinks(net_id).size()); | |
| if (pb_graph_pin->num_connectable_primitive_input_pins[depth] >= num_net_sinks) { | |
| //It is possible the net is completely absorbed in the cluster, | |
| //since this pin could (potentially) drive all the net's sinks | |
| /* Important: This runtime penalty looks a lot scarier than it really is. | |
| * For high fan-out nets, I at most look at the number of pins within the | |
| * cluster which limits runtime. | |
| * | |
| * DO NOT REMOVE THIS INITIAL FILTER WITHOUT CAREFUL ANALYSIS ON RUNTIME!!! | |
| * | |
| * Key Observation: | |
| * For LUT-based designs it is impossible for the average fanout to exceed | |
| * the number of LUT inputs so it's usually around 4-5 (pigeon-hole argument, | |
| * if the average fanout is greater than the number of LUT inputs, where do | |
| * the extra connections go? Therefore, average fanout must be capped to a | |
| * small constant where the constant is equal to the number of LUT inputs). | |
| * The real danger to runtime is when the number of sinks of a net gets doubled | |
| */ | |
| //Check if all the net sinks are, in fact, inside this cluster | |
| bool all_sinks_in_cur_cluster = true; | |
| ClusterBlockId driver_clb = atom_ctx.lookup.atom_clb(driver_blk_id); | |
| for(auto pin_id : atom_ctx.nlist.net_sinks(net_id)) { | |
| auto sink_blk_id = atom_ctx.nlist.pin_block(pin_id); | |
| if (atom_ctx.lookup.atom_clb(sink_blk_id) != driver_clb) { | |
| all_sinks_in_cur_cluster = false; | |
| break; | |
| } | |
| } | |
| if (all_sinks_in_cur_cluster) { | |
| //All the sinks are part of this cluster, so the net may be fully absorbed. | |
| // | |
| //Verify this, by counting the number of net sinks reachable from the driver pin. | |
| //If the count equals the number of net sinks then the net is fully absorbed and | |
| //the net does not exit the cluster | |
| /* TODO: I should cache the absorbed outputs, once net is absorbed, | |
| * net is forever absorbed, no point in rechecking every time */ | |
| if (net_sinks_reachable_in_cluster(pb_graph_pin, depth, net_id)) { | |
| //All the sinks are reachable inside the cluster | |
| net_exits_cluster = false; | |
| } | |
| } | |
| } | |
| if (net_exits_cluster) { | |
| /* This output must exit this cluster */ | |
| cur_pb->pb_stats->lookahead_output_pins_used[pin_class].push_back(net_id); | |
| } | |
| } | |
| cur_pb = cur_pb->parent_pb; | |
| } | |
| } | |
| int net_sinks_reachable_in_cluster(const t_pb_graph_pin* driver_pb_gpin, const int depth, const AtomNetId net_id) { | |
| size_t num_reachable_sinks = 0; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| //Record the sink pb graph pins we are looking for | |
| std::unordered_set<const t_pb_graph_pin*> sink_pb_gpins; | |
| for(const AtomPinId pin_id : atom_ctx.nlist.net_sinks(net_id)) { | |
| const t_pb_graph_pin* sink_pb_gpin = find_pb_graph_pin(atom_ctx.nlist, atom_ctx.lookup, pin_id); | |
| VTR_ASSERT(sink_pb_gpin); | |
| sink_pb_gpins.insert(sink_pb_gpin); | |
| } | |
| //Count how many sink pins are reachable | |
| for(int i_prim_pin = 0; i_prim_pin < driver_pb_gpin->num_connectable_primitive_input_pins[depth]; ++i_prim_pin) { | |
| const t_pb_graph_pin* reachable_pb_gpin = driver_pb_gpin->list_of_connectable_input_pin_ptrs[depth][i_prim_pin]; | |
| if(sink_pb_gpins.count(reachable_pb_gpin)) { | |
| ++num_reachable_sinks; | |
| if(num_reachable_sinks == atom_ctx.nlist.net_sinks(net_id).size()) { | |
| return true; | |
| } | |
| } | |
| } | |
| return false; | |
| } | |
| /* Check if the number of available inputs/outputs for a pin class is sufficient for speculatively packed blocks */ | |
| static bool check_lookahead_pins_used(t_pb *cur_pb) { | |
| int i, j; | |
| const t_pb_type *pb_type = cur_pb->pb_graph_node->pb_type; | |
| bool success; | |
| success = true; | |
| if (pb_type->num_modes > 0 && cur_pb->name != nullptr) { | |
| for (i = 0; i < cur_pb->pb_graph_node->num_input_pin_class && success; i++) { | |
| if (cur_pb->pb_stats->lookahead_input_pins_used[i].size() > (unsigned int)cur_pb->pb_graph_node->input_pin_class_size[i]) { | |
| success = false; | |
| } | |
| } | |
| for (i = 0; i < cur_pb->pb_graph_node->num_output_pin_class && success; | |
| i++) { | |
| if (cur_pb->pb_stats->lookahead_output_pins_used[i].size() > (unsigned int)cur_pb->pb_graph_node->output_pin_class_size[i]) { | |
| success = false; | |
| } | |
| } | |
| if (success && cur_pb->child_pbs != nullptr) { | |
| for (i = 0; success && i < pb_type->modes[cur_pb->mode].num_pb_type_children; i++) { | |
| if (cur_pb->child_pbs[i] != nullptr) { | |
| for (j = 0; success && j < pb_type->modes[cur_pb->mode].pb_type_children[i].num_pb; j++) { | |
| success = check_lookahead_pins_used( | |
| &cur_pb->child_pbs[i][j]); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| return success; | |
| } | |
| /* Speculation successful, commit input/output pins used */ | |
| static void commit_lookahead_pins_used(t_pb *cur_pb) { | |
| int i, j; | |
| int ipin; | |
| const t_pb_type *pb_type = cur_pb->pb_graph_node->pb_type; | |
| if (pb_type->num_modes > 0 && cur_pb->name != nullptr) { | |
| for (i = 0; i < cur_pb->pb_graph_node->num_input_pin_class; i++) { | |
| ipin = 0; | |
| VTR_ASSERT(cur_pb->pb_stats->lookahead_input_pins_used[i].size() <= (unsigned int)cur_pb->pb_graph_node->input_pin_class_size[i]); | |
| for (j = 0; j < (int) cur_pb->pb_stats->lookahead_input_pins_used[i].size(); j++) { | |
| VTR_ASSERT(cur_pb->pb_stats->lookahead_input_pins_used[i][j]); | |
| cur_pb->pb_stats->input_pins_used[i][ipin] = cur_pb->pb_stats->lookahead_input_pins_used[i][j]; | |
| ipin++; | |
| } | |
| } | |
| for (i = 0; i < cur_pb->pb_graph_node->num_output_pin_class; i++) { | |
| ipin = 0; | |
| VTR_ASSERT(cur_pb->pb_stats->lookahead_output_pins_used[i].size() <= (unsigned int)cur_pb->pb_graph_node->output_pin_class_size[i]); | |
| for (j = 0; j < (int) cur_pb->pb_stats->lookahead_output_pins_used[i].size(); j++) { | |
| VTR_ASSERT(cur_pb->pb_stats->lookahead_output_pins_used[i][j]); | |
| cur_pb->pb_stats->output_pins_used[i][ipin] = cur_pb->pb_stats->lookahead_output_pins_used[i][j]; | |
| ipin++; | |
| } | |
| } | |
| if (cur_pb->child_pbs != nullptr) { | |
| for (i = 0; i < pb_type->modes[cur_pb->mode].num_pb_type_children; | |
| i++) { | |
| if (cur_pb->child_pbs[i] != nullptr) { | |
| for (j = 0; j < pb_type->modes[cur_pb->mode].pb_type_children[i].num_pb; j++) { | |
| commit_lookahead_pins_used(&cur_pb->child_pbs[i][j]); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| /* Score unclustered atoms that are two hops away from current cluster */ | |
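| /* "Two hops" means: a marked net of the open cluster reaches an atom in some | |
| * already-packed cluster tclb; the nets crossing tclb's boundary | |
| * (clb_inter_blk_nets[tclb]) may in turn reach still-unclustered atoms. Those | |
| * atoms get a small gain bump and their molecules become candidates. */ | |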
| static void load_transitive_fanout_candidates(ClusterBlockId clb_index, | |
| const PackMolecules& molecules, | |
| const MoleculeStats& molecule_stats, | |
| t_pb_stats *pb_stats, | |
| vtr::vector_map<ClusterBlockId,std::vector<AtomNetId>> &clb_inter_blk_nets) { | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| for(const auto net_id : pb_stats->marked_nets) { | |
| if(atom_ctx.nlist.net_pins(net_id).size() < AAPACK_MAX_TRANSITIVE_FANOUT_EXPLORE + 1) { | |
| for(const auto pin_id : atom_ctx.nlist.net_pins(net_id)) { | |
| AtomBlockId atom_blk_id = atom_ctx.nlist.pin_block(pin_id); | |
| ClusterBlockId tclb = atom_ctx.lookup.atom_clb(atom_blk_id); //The transitive CLB | |
| if(tclb != clb_index && tclb != ClusterBlockId::INVALID()) { | |
| /* Explore transitive connections from already packed cluster */ | |
| for(AtomNetId tnet : clb_inter_blk_nets[tclb]) { | |
| for(AtomPinId tpin : atom_ctx.nlist.net_pins(tnet)) { | |
| auto blk_id = atom_ctx.nlist.pin_block(tpin); | |
| if(atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { | |
| /* This transitive atom is not packed, score and add */ | |
| auto &transitive_fanout_candidates = *(pb_stats->transitive_fanout_candidates); | |
| if(pb_stats->gain.count(blk_id) == 0) { | |
| pb_stats->gain[blk_id] = 0.001; | |
| } else { | |
| pb_stats->gain[blk_id] += 0.001; | |
| } | |
| auto rng = molecules.atom_molecules.equal_range(blk_id); | |
| for(const auto& kv : vtr::make_range(rng.first, rng.second)) { | |
| PackMoleculeId molecule_id = kv.second; | |
| if (molecule_stats.valid(molecule_id)) { | |
| /* The number of candidate molecules is tightly bounded, so a linear membership scan is cheap */ | |
| auto it = std::find(transitive_fanout_candidates.begin(), transitive_fanout_candidates.end(), molecule_id); | |
| if (it == transitive_fanout_candidates.end()) { | |
| /* Not yet in the candidate list, add it */ | |
| transitive_fanout_candidates.push_back(molecule_id); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
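| /* | |
|  * Sketch of how the transitive candidates are consumed (the caller shape below is an | |
|  * assumption; the real selection logic lives in the candidate-search code in this file): | |
|  * | |
|  *     load_transitive_fanout_candidates(clb_index, molecules, molecule_stats, | |
|  *                                       cur_pb->pb_stats, clb_inter_blk_nets); | |
|  *     for (PackMoleculeId mol_id : *cur_pb->pb_stats->transitive_fanout_candidates) { | |
|  *         // each atom reachable in two hops contributed a small 0.001 gain bump, | |
|  *         // enough to break ties in favour of transitively connected logic | |
|  *     } | |
|  */ | |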
| static void print_block_criticalities(const char * fname) { | |
| /* Prints the criticality, criticality index, and seed blend gain (with its index) for each atom block to a file. */ | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| FILE * fp = vtr::fopen(fname, "w"); | |
| //For pretty formatting determine the maximum name length | |
| int max_name_len = strlen("atom_block_name"); | |
| for(auto blk_id : atom_ctx.nlist.blocks()) { | |
| max_name_len = std::max(max_name_len, (int) atom_ctx.nlist.block_name(blk_id).size()); | |
| } | |
| fprintf(fp, "%-*s %s %s %s %s\n", max_name_len, "atom_block_name", "criticality", "critindexarray", "seed_blend_gain", "seed_blend_gain_index"); | |
| for(auto blk_id : atom_ctx.nlist.blocks()) { | |
| std::string name = atom_ctx.nlist.block_name(blk_id); | |
| fprintf(fp, "%-*s ", max_name_len, name.c_str()); | |
| fprintf(fp, "%*f ", (int) strlen("criticality"), block_criticality[blk_id]); | |
| auto crit_idx_iter = std::find(critindexarray.begin(), critindexarray.end(), blk_id); | |
| VTR_ASSERT(crit_idx_iter != critindexarray.end()); | |
| int crit_idx = crit_idx_iter - critindexarray.begin(); | |
| fprintf(fp, "%*d ", (int) strlen("critindexarray"), crit_idx); | |
| fprintf(fp, "%*f ", (int) strlen("seed_blend_gain"), seed_blend_gain[blk_id]); | |
| auto seed_gain_iter = std::find(seed_blend_index_array.begin(), seed_blend_index_array.end(), blk_id); | |
| VTR_ASSERT(seed_gain_iter != seed_blend_index_array.end()); | |
| int seed_blend_gain_index = seed_gain_iter - seed_blend_index_array.begin(); | |
| fprintf(fp, "%*d", (int) strlen("seed_blend_gain_index"), seed_blend_gain_index); | |
| fprintf(fp, "\n"); | |
| } | |
| fclose(fp); | |
| } | |
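| /* | |
|  * Debug usage sketch for the routine above, assuming the getEchoEnabled() query from | |
|  * echo_files.h (the output file name is hypothetical): | |
|  * | |
|  *     if (getEchoEnabled()) { | |
|  *         print_block_criticalities("block_criticalities.echo"); | |
|  *     } | |
|  */ | |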
| /* Determines, for each atom model present in the netlist, which device block types can implement that model */ | |
| static std::map<const t_model*,std::vector<t_type_ptr>> identify_primitive_candidate_block_types() { | |
| std::map<const t_model*,std::vector<t_type_ptr>> model_candidates; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| auto& atom_nlist = atom_ctx.nlist; | |
| auto& device_ctx = g_vpr_ctx.device(); | |
| std::set<const t_model*> unique_models; | |
| for(auto blk : atom_nlist.blocks()) { | |
| auto model = atom_nlist.block_model(blk); | |
| unique_models.insert(model); | |
| } | |
| for (auto model : unique_models) { | |
| model_candidates[model] = {}; | |
| for (int itype = 0; itype < device_ctx.num_block_types; ++itype ) { | |
| t_type_ptr type = &device_ctx.block_types[itype]; | |
| if (block_type_contains_blif_model(type, model->name)) { | |
| model_candidates[model].push_back(type); | |
| } | |
| } | |
| } | |
| return model_candidates; | |
| } | |
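| /* | |
|  * Example: in an architecture whose CLBs are the only block type containing LUT and FF | |
|  * primitives, the map above would send both the LUT model and the FF model to { CLB }. | |
|  * Illustrative lookup (variable names are assumptions): | |
|  * | |
|  *     auto primitive_candidates = identify_primitive_candidate_block_types(); | |
|  *     const t_model* model = g_vpr_ctx.atom().nlist.block_model(blk_id); | |
|  *     const std::vector<t_type_ptr>& types = primitive_candidates.at(model); | |
|  */ | |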
| /* Walks the given pb hierarchy and re-marks a molecule as valid once all of its atoms are unclustered */ | |
| static void revalid_molecules(const t_pb* pb, const PackMolecules& molecules, MoleculeStats& molecule_stats) { | |
| const t_pb_type* pb_type = pb->pb_graph_node->pb_type; | |
| if (pb_type->blif_model == nullptr) { | |
| int mode = pb->mode; | |
| for (int i = 0; i < pb_type->modes[mode].num_pb_type_children && pb->child_pbs != nullptr; i++) { | |
| for (int j = 0; j < pb_type->modes[mode].pb_type_children[i].num_pb && pb->child_pbs[i] != nullptr; j++) { | |
| if (pb->child_pbs[i][j].name != nullptr || pb->child_pbs[i][j].child_pbs != nullptr) { | |
| revalid_molecules(&pb->child_pbs[i][j], molecules, molecule_stats); | |
| } | |
| } | |
| } | |
| } else { | |
| //Primitive | |
| auto& atom_ctx = g_vpr_ctx.mutable_atom(); | |
| auto blk_id = atom_ctx.lookup.pb_atom(pb); | |
| if (blk_id) { | |
| //Update the atom netlist mapping to mark this atom as unclustered | |
| atom_ctx.lookup.set_atom_clb(blk_id, ClusterBlockId::INVALID()); | |
| atom_ctx.lookup.set_atom_pb(blk_id, nullptr); | |
| /* If any molecules were marked invalid because this atom was packed, mark them valid again now that it is free */ | |
| auto rng = molecules.atom_molecules.equal_range(blk_id); | |
| for(const auto& kv : vtr::make_range(rng.first, rng.second)) { | |
| PackMoleculeId cur_molecule_id = kv.second; | |
| if (!molecule_stats.valid(cur_molecule_id)) { | |
| auto& cur_molecule = molecules.pack_molecules[cur_molecule_id]; | |
| bool all_atoms_valid = true; | |
| for (auto molecule_blk_id : cur_molecule.blocks()) { | |
| AtomBlockId atom_blk = cur_molecule.block_atom(molecule_blk_id); | |
| if (atom_ctx.lookup.atom_clb(atom_blk) != ClusterBlockId::INVALID()) { | |
| all_atoms_valid = false; | |
| break; | |
| } | |
| } | |
| /* All atom blocks are open for this molecule, place back in queue */ | |
| if (all_atoms_valid) { | |
| molecule_stats.set_valid(cur_molecule_id, true); | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
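| /* | |
|  * Assumed call site for the routine above: when a speculative cluster is destroyed or a | |
|  * molecule placement is rolled back, the freed pb hierarchy is walked so that any molecule | |
|  * whose atoms are all unclustered again becomes a packing candidate (cluster_pb is a | |
|  * hypothetical name): | |
|  * | |
|  *     revalid_molecules(cluster_pb, molecules, molecule_stats); | |
|  */ | |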
| /* Pre-computes per-molecule statistics (internal block count, external inputs, base gain) and their maxima over all molecules */ | |
| MoleculeStats build_molecule_stats(const PackMolecules& molecules, const AtomNetlist& netlist) { | |
| MoleculeStats stats; | |
| for (const PackMolecule& molecule : molecules.pack_molecules) { | |
| MoleculeStats::Info info; | |
| info.num_blocks = count_molecule_blocks(molecule); | |
| info.num_ext_inputs = count_molecule_ext_inputs(molecule, netlist); | |
| info.base_gain = molecule.base_gain(); | |
| stats.info_.push_back(info); | |
| } | |
| VTR_ASSERT(molecules.pack_molecules.size() > 0); | |
| //Max external inputs | |
| auto cmp_ext_inputs = [](const MoleculeStats::Info& lhs, const MoleculeStats::Info& rhs) { | |
| return lhs.num_ext_inputs < rhs.num_ext_inputs; | |
| }; | |
| stats.max_num_ext_inputs_ = std::max_element(stats.info_.begin(), stats.info_.end(), cmp_ext_inputs)->num_ext_inputs; | |
| //Max blocks | |
| auto cmp_num_blocks = [](const MoleculeStats::Info& lhs, const MoleculeStats::Info& rhs) { | |
| return lhs.num_blocks < rhs.num_blocks; | |
| }; | |
| stats.max_num_blocks_ = std::max_element(stats.info_.begin(), stats.info_.end(), cmp_num_blocks)->num_blocks; | |
| return stats; | |
| } | |
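| /* | |
|  * Usage sketch (the wiring at the packer entry point is an assumption): | |
|  * | |
|  *     MoleculeStats molecule_stats = build_molecule_stats(molecules, g_vpr_ctx.atom().nlist); | |
|  * | |
|  * The max_num_blocks_ and max_num_ext_inputs_ maxima computed above can then serve to | |
|  * normalize per-molecule gains (e.g. when blending molecule size into seed gains). | |
|  */ | |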
| /* Counts the number of INTERNAL (i.e. netlist-atom) blocks in a molecule */ | |
| int count_molecule_blocks(const PackMolecule& molecule) { | |
| int count = 0; | |
| for (auto blk_id : molecule.blocks()) { | |
| if (molecule.block_type(blk_id) == PackMolecule::BlockType::INTERNAL) { | |
| ++count; | |
| } | |
| } | |
| return count; | |
| } | |
| /* Counts the input and clock pins of a molecule's internal blocks that are driven from outside the molecule */ | |
| int count_molecule_ext_inputs(const PackMolecule& molecule, const AtomNetlist& netlist) { | |
| //Count inputs of all internal blocks | |
| int total_inputs = 0; | |
| for (auto molecule_blk_id : molecule.blocks()) { | |
| AtomBlockId atom_blk_id = molecule.block_atom(molecule_blk_id); | |
| if (molecule.block_type(molecule_blk_id) != PackMolecule::BlockType::INTERNAL) continue; | |
| total_inputs += netlist.block_input_pins(atom_blk_id).size(); | |
| total_inputs += netlist.block_clock_pins(atom_blk_id).size(); | |
| } | |
| //Subtract away connections contained within the molecule | |
| int ext_inputs = total_inputs; | |
| for (auto molecule_edge_id : molecule.edges()) { | |
| auto molecule_driver_pin_id = molecule.edge_driver(molecule_edge_id); | |
| auto molecule_driver_blk_id = molecule.pin_block(molecule_driver_pin_id); | |
| if (molecule.block_type(molecule_driver_blk_id) != PackMolecule::BlockType::INTERNAL) continue; | |
| for (auto molecule_sink_pin_id : molecule.edge_sinks(molecule_edge_id)) { | |
| auto molecule_sink_blk_id = molecule.pin_block(molecule_sink_pin_id); | |
| if (molecule.block_type(molecule_sink_blk_id) != PackMolecule::BlockType::INTERNAL) continue; | |
| //Both driver and sink are internal to this molecule, so remove it from the external input count | |
| --ext_inputs; | |
| } | |
| } | |
| return ext_inputs; | |
| } | |
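| /* | |
|  * Worked example for the count above: a molecule consisting of a 4-input LUT driving a FF | |
|  * (both INTERNAL). total_inputs = 4 (LUT inputs) + 1 (FF data) + 1 (FF clock) = 6; the | |
|  * LUT->FF connection is a molecule-internal edge, so one sink is subtracted, giving | |
|  * ext_inputs = 5 (the four LUT inputs plus the FF clock). | |
|  */ | |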
| /* Returns the device block types capable of implementing every primitive model used inside the molecule */ | |
| static std::vector<t_type_ptr> find_candidate_types_for_molecule(const PackMolecule& molecule, | |
| const std::map<const t_model*,std::vector<t_type_ptr>>& primitive_candidate_block_types) { | |
| std::set<t_type_ptr> candidate_types; | |
| auto& atom_ctx = g_vpr_ctx.atom(); | |
| //Collect the set of models required for this molecule | |
| std::set<const t_model*> internal_models; | |
| for (auto molecule_blk_id : molecule.blocks()) { | |
| if (molecule.block_type(molecule_blk_id) != PackMolecule::BlockType::INTERNAL) continue; | |
| AtomBlockId atom_blk_id = molecule.block_atom(molecule_blk_id); | |
| internal_models.insert(atom_ctx.nlist.block_model(atom_blk_id)); | |
| } | |
| //Find the set of block types which contain all the required models | |
| bool first_model = true; | |
| for (auto model : internal_models) { | |
| auto itr = primitive_candidate_block_types.find(model); | |
| VTR_ASSERT(itr != primitive_candidate_block_types.end()); | |
| std::set<t_type_ptr> sorted_candidates(itr->second.begin(), itr->second.end()); | |
| if (first_model) { | |
| candidate_types = sorted_candidates; | |
| first_model = false; | |
| } else { | |
| //Keep only the types which can also implement the current model | |
| std::set<t_type_ptr> intersection; | |
| std::set_intersection(candidate_types.begin(), candidate_types.end(), | |
| sorted_candidates.begin(), sorted_candidates.end(), | |
| std::inserter(intersection, intersection.begin())); | |
| candidate_types = std::move(intersection); | |
| } | |
| } | |
| return std::vector<t_type_ptr>(candidate_types.begin(), candidate_types.end()); | |
| } |
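| /* | |
|  * Example of the intersection above: for a molecule containing a multiplier and an adder, | |
|  * if multipliers are implementable in { DSP } and adders in { DSP, CLB }, the running | |
|  * intersection leaves { DSP }, the only block type that can host the whole molecule. | |
|  */ | |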