| /* |
| * Main clustering algorithm |
| * Author(s): Vaughn Betz (first revision - VPack), Alexander Marquardt (second revision - T-VPack), Jason Luu (third revision - AAPack) |
| * June 8, 2011 |
| */ |
| |
| /* |
| * The clusterer uses several key data structures: |
| * |
| * t_pb_type (and related types): |
| * Represent the architecture as described in the architecture file. |
| * |
| * t_pb_graph_node (and related types): |
| * Represents a flattened version of the architecture with t_pb_types |
| * expanded (according to num_pb) into unique t_pb_graph_node instances, |
| * and the routing connectivity converted to a graph of t_pb_graph_pin (nodes) |
| * and t_pb_graph_edge. |
| * |
| * t_pb: |
| * Represents a clustered instance of a t_pb_graph_node containing netlist primitives |
| * |
| * t_pb_type and t_pb_graph_node (and related types) describe the targetted FPGA architecture, while t_pb represents |
| * the actual clustering of the user netlist. |
| * |
| * For example: |
| * Consider an architecture where CLBs contain 4 BLEs, and each BLE is a LUT + FF pair. |
| * We wish to map a netlist of 400 LUTs and 400 FFs. |
| * |
| * A BLE corresponds to one t_pb_type (which has num_pb = 4). |
| * |
| * Each of the 4 BLE positions corresponds to a t_pb_graph_node (each of which references the BLE t_pb_type). |
| * |
| * The output of clustering is 400 t_pb of type BLE which represent the clustered user netlist. |
| * Each of the 400 t_pb will reference one of the 4 BLE-type t_pb_graph_nodes. |
| */ |
| #include <cstdio> |
| #include <cstdlib> |
| #include <cstring> |
| #include <ctime> |
| #include <map> |
| #include <algorithm> |
| #include <fstream> |
| |
| #include "vtr_assert.h" |
| #include "vtr_log.h" |
| #include "vtr_math.h" |
| #include "vtr_memory.h" |
| |
| #include "vpr_types.h" |
| #include "vpr_error.h" |
| |
| #include "globals.h" |
| #include "atom_netlist.h" |
| #include "pack_types.h" |
| #include "cluster.h" |
| #include "output_clustering.h" |
| #include "SetupGrid.h" |
| #include "read_xml_arch_file.h" |
| #include "vpr_utils.h" |
| #include "cluster_placement.h" |
| #include "echo_files.h" |
| #include "cluster_router.h" |
| #include "lb_type_rr_graph.h" |
| |
| #include "timing_info.h" |
| #include "timing_reports.h" |
| #include "PreClusterDelayCalculator.h" |
| #include "PreClusterTimingGraphResolver.h" |
| #include "tatum/echo_writer.hpp" |
| #include "tatum/report/graphviz_dot_writer.hpp" |
| #include "tatum/TimingReporter.hpp" |
| |
| #define AAPACK_MAX_HIGH_FANOUT_EXPLORE 10 /* For high-fanout nets that are ignored, consider a maximum of this many sinks, must be less than packer_opts.feasible_block_array_size */ |
| #define AAPACK_MAX_TRANSITIVE_EXPLORE 40 /* When investigating transitive fanout connections in packing, consider a maximum of this many molecules, must be less than packer_opts.feasible_block_array_size */ |
| |
| //Constant allowing all cluster pins to be used |
| const t_ext_pin_util FULL_EXTERNAL_PIN_UTIL(1., 1.); |
| |
| enum e_gain_update { |
| GAIN, |
| NO_GAIN |
| }; |
| enum e_feasibility { |
| FEASIBLE, |
| INFEASIBLE |
| }; |
| enum e_gain_type { |
| HILL_CLIMBING, |
| NOT_HILL_CLIMBING |
| }; |
| enum e_removal_policy { |
| REMOVE_CLUSTERED, |
| LEAVE_CLUSTERED |
| }; |
| /* TODO: REMOVE_CLUSTERED no longer used, remove */ |
| enum e_net_relation_to_clustered_block { |
| INPUT, |
| OUTPUT |
| }; |
| |
| enum e_detailed_routing_stages { |
| E_DETAILED_ROUTE_AT_END_ONLY = 0, |
| E_DETAILED_ROUTE_FOR_EACH_ATOM, |
| E_DETAILED_ROUTE_INVALID |
| }; |
| |
| /* Linked list structure. Stores one integer (iblk). */ |
| struct t_molecule_link { |
| t_pack_molecule* moleculeptr; |
| t_molecule_link* next; |
| }; |
| |
| struct t_molecule_stats { |
| int num_blocks = 0; //Number of blocks across all primitives in molecule |
| |
| int num_pins = 0; //Number of pins across all primitives in molecule |
| int num_input_pins = 0; //Number of input pins across all primitives in molecule |
| int num_output_pins = 0; //Number of output pins across all primitives in molecule |
| |
| int num_used_ext_pins = 0; //Number of *used external* pins across all primitives in molecule |
| int num_used_ext_inputs = 0; //Number of *used external* input pins across all primitives in molecule |
| int num_used_ext_outputs = 0; //Number of *used external* output pins across all primitives in molecule |
| }; |
| |
| /* Keeps a linked list of the unclustered blocks to speed up looking for * |
| * unclustered blocks with a certain number of *external* inputs. * |
| * [0..lut_size]. Unclustered_list_head[i] points to the head of the * |
| * list of blocks with i inputs to be hooked up via external interconnect. */ |
| static t_molecule_link* unclustered_list_head; |
| int unclustered_list_head_size; |
| static t_molecule_link* memory_pool; /*Declared here so I can free easily.*/ |
| |
| /* Does the atom block that drives the output of this atom net also appear as a * |
| * receiver (input) pin of the atom net? If so, then by how much? |
| * |
| * This is used in the gain routines to avoid double counting the connections from * |
| * the current cluster to other blocks (hence yielding better clusterings). * |
| * The only time an atom block should connect to the same atom net * |
| * twice is when one connection is an output and the other is an input, * |
| * so this should take care of all multiple connections. */ |
| static std::unordered_map<AtomNetId, int> net_output_feeds_driving_block_input; |
| |
| /*****************************************/ |
| /*local functions*/ |
| /*****************************************/ |
| |
| #if 0 |
| static void check_for_duplicate_inputs (); |
| #endif |
| |
| static bool is_atom_blk_in_pb(const AtomBlockId blk_id, const t_pb* pb); |
| |
| static void add_molecule_to_pb_stats_candidates(t_pack_molecule* molecule, |
| std::map<AtomBlockId, float>& gain, |
| t_pb* pb, |
| int max_queue_size); |
| |
| static void alloc_and_init_clustering(const t_molecule_stats& max_molecule_stats, |
| t_cluster_placement_stats** cluster_placement_stats, |
| t_pb_graph_node*** primitives_list, |
| t_pack_molecule* molecules_head, |
| int num_molecules); |
| |
| static void free_pb_stats_recursive(t_pb* pb); |
| |
| static void try_update_lookahead_pins_used(t_pb* cur_pb); |
| |
| static void reset_lookahead_pins_used(t_pb* cur_pb); |
| |
| static void compute_and_mark_lookahead_pins_used(const AtomBlockId blk_id); |
| |
| static void compute_and_mark_lookahead_pins_used_for_pin(const t_pb_graph_pin* pb_graph_pin, |
| const t_pb* primitive_pb, |
| const AtomNetId net_id); |
| |
| static void commit_lookahead_pins_used(t_pb* cur_pb); |
| |
| static bool check_lookahead_pins_used(t_pb* cur_pb, t_ext_pin_util max_external_pin_util); |
| |
| static bool primitive_feasible(const AtomBlockId blk_id, t_pb* cur_pb); |
| |
| static bool primitive_memory_sibling_feasible(const AtomBlockId blk_id, const t_pb_type* cur_pb_type, const AtomBlockId sibling_memory_blk); |
| |
| static t_pack_molecule* get_molecule_by_num_ext_inputs(const int ext_inps, |
| const enum e_removal_policy remove_flag, |
| t_cluster_placement_stats* cluster_placement_stats_ptr); |
| |
| static t_pack_molecule* get_free_molecule_with_most_ext_inputs_for_cluster(t_pb* cur_pb, |
| t_cluster_placement_stats* cluster_placement_stats_ptr); |
| |
| static enum e_block_pack_status try_pack_molecule(t_cluster_placement_stats* cluster_placement_stats_ptr, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| t_pack_molecule* molecule, |
| t_pb_graph_node** primitives_list, |
| t_pb* pb, |
| const int max_models, |
| const int max_cluster_size, |
| const ClusterBlockId clb_index, |
| const int detailed_routing_stage, |
| t_lb_router_data* router_data, |
| int verbosity, |
| bool enable_pin_feasibility_filter, |
| const int feasible_block_array_size, |
| t_ext_pin_util max_external_pin_util); |
| |
| static enum e_block_pack_status try_place_atom_block_rec(const t_pb_graph_node* pb_graph_node, |
| const AtomBlockId blk_id, |
| t_pb* cb, |
| t_pb** parent, |
| const int max_models, |
| const int max_cluster_size, |
| const ClusterBlockId clb_index, |
| const t_cluster_placement_stats* cluster_placement_stats_ptr, |
| const t_pack_molecule* molecule, |
| t_lb_router_data* router_data, |
| int verbosity, |
| const int feasible_block_array_size); |
| |
| static void revert_place_atom_block(const AtomBlockId blk_id, t_lb_router_data* router_data, const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules); |
| |
| static void update_connection_gain_values(const AtomNetId net_id, const AtomBlockId clustered_blk_id, t_pb* cur_pb, enum e_net_relation_to_clustered_block net_relation_to_clustered_block); |
| |
| static void update_timing_gain_values(const AtomNetId net_id, |
| t_pb* cur_pb, |
| enum e_net_relation_to_clustered_block net_relation_to_clustered_block, |
| const SetupTimingInfo& timing_info, |
| const std::unordered_set<AtomNetId>& is_global); |
| |
| static void mark_and_update_partial_gain(const AtomNetId inet, enum e_gain_update gain_flag, const AtomBlockId clustered_blk_id, bool timing_driven, bool connection_driven, enum e_net_relation_to_clustered_block net_relation_to_clustered_block, const SetupTimingInfo& timing_info, const std::unordered_set<AtomNetId>& is_global, const int high_fanout_net_threshold); |
| |
| static void update_total_gain(float alpha, float beta, bool timing_driven, bool connection_driven, t_pb* pb); |
| |
| static void update_cluster_stats(const t_pack_molecule* molecule, |
| const ClusterBlockId clb_index, |
| const std::unordered_set<AtomNetId>& is_clock, |
| const std::unordered_set<AtomNetId>& is_global, |
| const bool global_clocks, |
| const float alpha, |
| const float beta, |
| const bool timing_driven, |
| const bool connection_driven, |
| const int high_fanout_net_threshold, |
| const SetupTimingInfo& timing_info); |
| |
| static void start_new_cluster(t_cluster_placement_stats* cluster_placement_stats, |
| t_pb_graph_node** primitives_list, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| ClusterBlockId clb_index, |
| t_pack_molecule* molecule, |
| std::map<t_logical_block_type_ptr, size_t>& num_used_type_instances, |
| const float target_device_utilization, |
| const int num_models, |
| const int max_cluster_size, |
| const t_arch* arch, |
| std::string device_layout_name, |
| std::vector<t_lb_type_rr_node>* lb_type_rr_graphs, |
| t_lb_router_data** router_data, |
| const int detailed_routing_stage, |
| ClusteredNetlist* clb_nlist, |
| const std::map<const t_model*, std::vector<t_logical_block_type_ptr>>& primitive_candidate_block_types, |
| int verbosity, |
| bool enable_pin_feasibility_filter, |
| bool balance_block_type_utilization, |
| const int feasible_block_array_size); |
| |
| static t_pack_molecule* get_highest_gain_molecule(t_pb* cur_pb, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| const enum e_gain_type gain_mode, |
| t_cluster_placement_stats* cluster_placement_stats_ptr, |
| vtr::vector<ClusterBlockId, std::vector<AtomNetId>>& clb_inter_blk_nets, |
| const ClusterBlockId cluster_index, |
| bool prioritize_transitive_connectivity, |
| int transitive_fanout_threshold, |
| const int feasible_block_array_size); |
| |
| void add_cluster_molecule_candidates_by_connectivity_and_timing(t_pb* cur_pb, |
| t_cluster_placement_stats* cluster_placement_stats_ptr, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| const int feasible_block_array_size); |
| |
| void add_cluster_molecule_candidates_by_highfanout_connectivity(t_pb* cur_pb, |
| t_cluster_placement_stats* cluster_placement_stats_ptr, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| const int feasible_block_array_size); |
| |
| void add_cluster_molecule_candidates_by_transitive_connectivity(t_pb* cur_pb, |
| t_cluster_placement_stats* cluster_placement_stats_ptr, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| vtr::vector<ClusterBlockId, std::vector<AtomNetId>>& clb_inter_blk_nets, |
| const ClusterBlockId cluster_index, |
| int transitive_fanout_threshold, |
| const int feasible_block_array_size); |
| |
| static t_pack_molecule* get_molecule_for_cluster(t_pb* cur_pb, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| const bool allow_unrelated_clustering, |
| const bool prioritize_transitive_connectivity, |
| const int transitive_fanout_threshold, |
| const int feasible_block_array_size, |
| int* num_unrelated_clustering_attempts, |
| t_cluster_placement_stats* cluster_placement_stats_ptr, |
| vtr::vector<ClusterBlockId, std::vector<AtomNetId>>& clb_inter_blk_nets, |
| ClusterBlockId cluster_index, |
| int verbosity); |
| |
| static void check_clustering(); |
| |
| static void check_cluster_atom_blocks(t_pb* pb, std::unordered_set<AtomBlockId>& blocks_checked); |
| |
| static void mark_all_molecules_valid(t_pack_molecule* molecule_head); |
| |
| static int count_molecules(t_pack_molecule* molecule_head); |
| |
| static t_molecule_stats calc_molecule_stats(const t_pack_molecule* molecule); |
| |
| static t_molecule_stats calc_max_molecules_stats(const t_pack_molecule* molecule_head); |
| |
| static std::vector<AtomBlockId> initialize_seed_atoms(const e_cluster_seed seed_type, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| const t_molecule_stats& max_molecule_stats, |
| const vtr::vector<AtomBlockId, float>& atom_criticality); |
| |
| static t_pack_molecule* get_highest_gain_seed_molecule(int* seedindex, const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, const std::vector<AtomBlockId> seed_atoms); |
| |
| static float get_molecule_gain(t_pack_molecule* molecule, std::map<AtomBlockId, float>& blk_gain); |
| static int compare_molecule_gain(const void* a, const void* b); |
| int net_sinks_reachable_in_cluster(const t_pb_graph_pin* driver_pb_gpin, const int depth, const AtomNetId net_id); |
| |
| static void print_seed_gains(const char* fname, const std::vector<AtomBlockId>& seed_atoms, const vtr::vector<AtomBlockId, float>& atom_gain, const vtr::vector<AtomBlockId, float>& atom_criticality); |
| |
| static void load_transitive_fanout_candidates(ClusterBlockId cluster_index, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| t_pb_stats* pb_stats, |
| vtr::vector<ClusterBlockId, std::vector<AtomNetId>>& clb_inter_blk_nets, |
| int transitive_fanout_threshold); |
| |
| static std::map<const t_model*, std::vector<t_logical_block_type_ptr>> identify_primitive_candidate_block_types(); |
| |
| static void update_molecule_chain_info(t_pack_molecule* chain_molecule, const t_pb_graph_node* root_primitive); |
| |
| static enum e_block_pack_status check_chain_root_placement_feasibility(const t_pb_graph_node* pb_graph_node, |
| const t_pack_molecule* molecule, |
| const AtomBlockId blk_id); |
| |
| static t_pb_graph_pin* get_driver_pb_graph_pin(const t_pb* driver_pb, const AtomPinId driver_pin_id); |
| |
| static void update_pb_type_count(const t_pb* pb, std::unordered_map<std::string, int>& pb_type_count); |
| |
| static void update_le_count(const t_pb* pb, const t_logical_block_type_ptr logic_block_type, const t_pb_type* le_pb_type, std::vector<int>& le_count); |
| |
| static void print_pb_type_count(std::unordered_map<std::string, int>& pb_type_count); |
| |
| static t_logical_block_type_ptr identify_logic_block_type(std::map<const t_model*, std::vector<t_logical_block_type_ptr>>& primitive_candidate_block_types); |
| |
| static t_pb_type* identify_le_block_type(t_logical_block_type_ptr logic_block_type); |
| |
| static bool pb_used_for_blif_model(const t_pb* pb, std::string blif_model_name); |
| |
| static void print_le_count(std::vector<int>& le_count, const t_pb_type* le_pb_type); |
| |
| /*****************************************/ |
| /*globally accessible function*/ |
| std::map<t_logical_block_type_ptr, size_t> do_clustering(const t_packer_opts& packer_opts, |
| const t_analysis_opts& analysis_opts, |
| const t_arch* arch, |
| t_pack_molecule* molecule_head, |
| int num_models, |
| const std::unordered_set<AtomNetId>& is_clock, |
| std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| const std::unordered_map<AtomBlockId, t_pb_graph_node*>& expected_lowest_cost_pb_gnode, |
| bool allow_unrelated_clustering, |
| bool balance_block_type_utilization, |
| std::vector<t_lb_type_rr_node>* lb_type_rr_graphs, |
| const t_ext_pin_util_targets& ext_pin_util_targets, |
| const t_pack_high_fanout_thresholds& high_fanout_thresholds) { |
| /* Does the actual work of clustering multiple netlist blocks * |
| * into clusters. */ |
| |
| /* Algorithm employed |
| * 1. Find type that can legally hold block and create cluster with pb info |
| * 2. Populate started cluster |
| * 3. Repeat 1 until no more blocks need to be clustered |
| * |
| */ |
| |
| /**************************************************************** |
| * Initialization |
| *****************************************************************/ |
| VTR_ASSERT(packer_opts.packer_algorithm == PACK_GREEDY); |
| |
| int num_molecules, blocks_since_last_analysis, num_clb, |
| num_blocks_hill_added, max_cluster_size, cur_cluster_size, |
| max_pb_depth, cur_pb_depth, num_unrelated_clustering_attempts, |
| seedindex, savedseedindex /* index of next most timing critical block */, |
| detailed_routing_stage, *hill_climbing_inputs_avail; |
| |
| const int verbosity = packer_opts.pack_verbosity; |
| |
| std::map<t_logical_block_type_ptr, size_t> num_used_type_instances; |
| |
| bool is_cluster_legal; |
| enum e_block_pack_status block_pack_status; |
| |
| t_cluster_placement_stats *cluster_placement_stats, *cur_cluster_placement_stats_ptr; |
| t_pb_graph_node** primitives_list; |
| t_lb_router_data* router_data = nullptr; |
| t_pack_molecule *istart, *next_molecule, *prev_molecule; |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| auto& device_ctx = g_vpr_ctx.mutable_device(); |
| auto& cluster_ctx = g_vpr_ctx.mutable_clustering(); |
| |
| vtr::vector<ClusterBlockId, std::vector<t_intra_lb_net>*> intra_lb_routing; |
| |
| std::shared_ptr<PreClusterDelayCalculator> clustering_delay_calc; |
| std::shared_ptr<SetupTimingInfo> timing_info; |
| |
| // this data structure is used to track the total number of physical blocks |
| // used for each physical block type defined in the architecture file. |
| std::unordered_map<std::string, int> pb_type_count; |
| |
| // this data structure tracks the number of Logic Elements (LEs) used. It is |
| // populated only for architectures which has LEs. The architecture is assumed |
| // to have LEs only iff it has a logic block that contains LUT primitives and is |
| // the first pb_block to have more than one instance from the top of the hierarchy |
| // (All parent pb_block have one instance only and one mode only). Index 0 holds |
| // the number of LEs that are used for both logic (LUTs/adders) and registers. |
| // Index 1 holds the number of LEs that are used for logic (LUTs/adders) only. |
| // Index 2 holds the number of LEs that are used for registers only. |
| std::vector<int> le_count(3, 0); |
| |
| num_clb = 0; |
| |
| /* TODO: This is memory inefficient, fix if causes problems */ |
| /* Store stats on nets used by packed block, useful for determining transitively connected blocks |
| * (eg. [A1, A2, ..]->[B1, B2, ..]->C implies cluster [A1, A2, ...] and C have a weak link) */ |
| vtr::vector<ClusterBlockId, std::vector<AtomNetId>> clb_inter_blk_nets(atom_ctx.nlist.blocks().size()); |
| |
| istart = nullptr; |
| |
| /* determine bound on cluster size and primitive input size */ |
| max_cluster_size = 0; |
| max_pb_depth = 0; |
| |
| seedindex = 0; |
| |
| const t_molecule_stats max_molecule_stats = calc_max_molecules_stats(molecule_head); |
| |
| mark_all_molecules_valid(molecule_head); |
| |
| num_molecules = count_molecules(molecule_head); |
| |
| for (const auto& type : device_ctx.logical_block_types) { |
| if (device_ctx.EMPTY_TYPE == physical_tile_type(&type)) |
| continue; |
| |
| cur_cluster_size = get_max_primitives_in_pb_type(type.pb_type); |
| cur_pb_depth = get_max_depth_of_pb_type(type.pb_type); |
| if (cur_cluster_size > max_cluster_size) { |
| max_cluster_size = cur_cluster_size; |
| } |
| if (cur_pb_depth > max_pb_depth) { |
| max_pb_depth = cur_pb_depth; |
| } |
| } |
| |
| if (packer_opts.hill_climbing_flag) { |
| hill_climbing_inputs_avail = (int*)vtr::calloc(max_cluster_size + 1, |
| sizeof(int)); |
| } else { |
| hill_climbing_inputs_avail = nullptr; /* if used, die hard */ |
| } |
| |
| #if 0 |
| check_for_duplicate_inputs (); |
| #endif |
| alloc_and_init_clustering(max_molecule_stats, |
| &cluster_placement_stats, &primitives_list, molecule_head, |
| num_molecules); |
| |
| auto primitive_candidate_block_types = identify_primitive_candidate_block_types(); |
| // find the cluster type that has lut primitives |
| auto logic_block_type = identify_logic_block_type(primitive_candidate_block_types); |
| // find a LE pb_type within the found logic_block_type |
| auto le_pb_type = identify_le_block_type(logic_block_type); |
| |
| blocks_since_last_analysis = 0; |
| num_blocks_hill_added = 0; |
| |
| VTR_ASSERT(max_cluster_size < MAX_SHORT); |
| /* Limit maximum number of elements for each cluster */ |
| |
| //Default criticalities set to zero (e.g. if not timing driven) |
| vtr::vector<AtomBlockId, float> atom_criticality(atom_ctx.nlist.blocks().size(), 0.); |
| |
| if (packer_opts.timing_driven) { |
| /* |
| * Initialize the timing analyzer |
| */ |
| clustering_delay_calc = std::make_shared<PreClusterDelayCalculator>(atom_ctx.nlist, atom_ctx.lookup, packer_opts.inter_cluster_net_delay, expected_lowest_cost_pb_gnode); |
| timing_info = make_setup_timing_info(clustering_delay_calc); |
| |
| //Calculate the initial timing |
| timing_info->update(); |
| |
| if (isEchoFileEnabled(E_ECHO_PRE_PACKING_TIMING_GRAPH)) { |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| tatum::write_echo(getEchoFileName(E_ECHO_PRE_PACKING_TIMING_GRAPH), |
| *timing_ctx.graph, *timing_ctx.constraints, *clustering_delay_calc, timing_info->analyzer()); |
| } |
| |
| { |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| PreClusterTimingGraphResolver resolver(atom_ctx.nlist, |
| atom_ctx.lookup, *timing_ctx.graph, *clustering_delay_calc); |
| resolver.set_detail_level(analysis_opts.timing_report_detail); |
| |
| tatum::TimingReporter timing_reporter(resolver, *timing_ctx.graph, |
| *timing_ctx.constraints); |
| |
| timing_reporter.report_timing_setup( |
| "pre_pack.report_timing.setup.rpt", |
| *timing_info->setup_analyzer(), |
| analysis_opts.timing_report_npaths); |
| } |
| |
| //Calculate true criticalities of each block |
| for (AtomBlockId blk : atom_ctx.nlist.blocks()) { |
| for (AtomPinId in_pin : atom_ctx.nlist.block_input_pins(blk)) { |
| //Max criticality over incoming nets |
| float crit = timing_info->setup_pin_criticality(in_pin); |
| atom_criticality[blk] = std::max(atom_criticality[blk], crit); |
| } |
| } |
| } |
| |
| auto seed_atoms = initialize_seed_atoms(packer_opts.cluster_seed_type, atom_molecules, max_molecule_stats, atom_criticality); |
| |
| istart = get_highest_gain_seed_molecule(&seedindex, atom_molecules, seed_atoms); |
| |
| /**************************************************************** |
| * Clustering |
| *****************************************************************/ |
| |
| while (istart != nullptr) { |
| is_cluster_legal = false; |
| savedseedindex = seedindex; |
| for (detailed_routing_stage = (int)E_DETAILED_ROUTE_AT_END_ONLY; !is_cluster_legal && detailed_routing_stage != (int)E_DETAILED_ROUTE_INVALID; detailed_routing_stage++) { |
| ClusterBlockId clb_index(num_clb); |
| |
| VTR_LOGV(verbosity > 2, "Complex block %d:\n", num_clb); |
| |
| start_new_cluster(cluster_placement_stats, primitives_list, |
| atom_molecules, clb_index, istart, |
| num_used_type_instances, |
| packer_opts.target_device_utilization, |
| num_models, max_cluster_size, |
| arch, packer_opts.device_layout, |
| lb_type_rr_graphs, &router_data, |
| detailed_routing_stage, &cluster_ctx.clb_nlist, |
| primitive_candidate_block_types, |
| packer_opts.pack_verbosity, |
| packer_opts.enable_pin_feasibility_filter, |
| balance_block_type_utilization, |
| packer_opts.feasible_block_array_size); |
| |
| VTR_LOGV(verbosity == 2, |
| "Complex block %d: '%s' (%s) ", num_clb, |
| cluster_ctx.clb_nlist.block_name(clb_index).c_str(), |
| cluster_ctx.clb_nlist.block_type(clb_index)->name); |
| VTR_LOGV(verbosity == 2, "."); //Progress dot for seed-block |
| fflush(stdout); |
| |
| t_ext_pin_util target_ext_pin_util = ext_pin_util_targets.get_pin_util(cluster_ctx.clb_nlist.block_type(clb_index)->name); |
| int high_fanout_threshold = high_fanout_thresholds.get_threshold(cluster_ctx.clb_nlist.block_type(clb_index)->name); |
| update_cluster_stats(istart, clb_index, |
| is_clock, //Set of clock nets |
| is_clock, //Set of global nets (currently all clocks) |
| packer_opts.global_clocks, |
| packer_opts.alpha, packer_opts.beta, |
| packer_opts.timing_driven, packer_opts.connection_driven, |
| high_fanout_threshold, |
| *timing_info); |
| num_clb++; |
| |
| if (packer_opts.timing_driven) { |
| blocks_since_last_analysis++; |
| /*it doesn't make sense to do a timing analysis here since there* |
| *is only one atom block clustered it would not change anything */ |
| } |
| cur_cluster_placement_stats_ptr = &cluster_placement_stats[cluster_ctx.clb_nlist.block_type(clb_index)->index]; |
| num_unrelated_clustering_attempts = 0; |
| next_molecule = get_molecule_for_cluster(cluster_ctx.clb_nlist.block_pb(clb_index), |
| atom_molecules, |
| allow_unrelated_clustering, |
| packer_opts.prioritize_transitive_connectivity, |
| packer_opts.transitive_fanout_threshold, |
| packer_opts.feasible_block_array_size, |
| &num_unrelated_clustering_attempts, |
| cur_cluster_placement_stats_ptr, |
| clb_inter_blk_nets, |
| clb_index, |
| packer_opts.pack_verbosity); |
| prev_molecule = istart; |
| while (next_molecule != nullptr && prev_molecule != next_molecule) { |
| block_pack_status = try_pack_molecule(cur_cluster_placement_stats_ptr, |
| atom_molecules, |
| next_molecule, |
| primitives_list, |
| cluster_ctx.clb_nlist.block_pb(clb_index), |
| num_models, |
| max_cluster_size, |
| clb_index, |
| detailed_routing_stage, |
| router_data, |
| packer_opts.pack_verbosity, |
| packer_opts.enable_pin_feasibility_filter, |
| packer_opts.feasible_block_array_size, |
| target_ext_pin_util); |
| prev_molecule = next_molecule; |
| |
| auto blk_id = next_molecule->atom_block_ids[next_molecule->root]; |
| VTR_ASSERT(blk_id); |
| |
| std::string blk_name = atom_ctx.nlist.block_name(blk_id); |
| const t_model* blk_model = atom_ctx.nlist.block_model(blk_id); |
| |
| if (block_pack_status != BLK_PASSED) { |
| if (verbosity > 2) { |
| if (block_pack_status == BLK_FAILED_ROUTE) { |
| VTR_LOG("\tNO_ROUTE: '%s' (%s)", blk_name.c_str(), blk_model->name); |
| VTR_LOGV(next_molecule->pack_pattern, " molecule %s molecule_size %zu", |
| next_molecule->pack_pattern->name, next_molecule->atom_block_ids.size()); |
| VTR_LOG("\n"); |
| fflush(stdout); |
| } else { |
| VTR_LOG("\tFAILED_FEASIBILITY_CHECK: '%s' (%s)", blk_name.c_str(), blk_model->name, block_pack_status); |
| VTR_LOGV(next_molecule->pack_pattern, " molecule %s molecule_size %zu", |
| next_molecule->pack_pattern->name, next_molecule->atom_block_ids.size()); |
| VTR_LOG("\n"); |
| fflush(stdout); |
| } |
| } |
| |
| next_molecule = get_molecule_for_cluster(cluster_ctx.clb_nlist.block_pb(clb_index), |
| atom_molecules, |
| allow_unrelated_clustering, |
| packer_opts.prioritize_transitive_connectivity, |
| packer_opts.transitive_fanout_threshold, |
| packer_opts.feasible_block_array_size, |
| &num_unrelated_clustering_attempts, |
| cur_cluster_placement_stats_ptr, |
| clb_inter_blk_nets, |
| clb_index, packer_opts.pack_verbosity); |
| continue; |
| } |
| |
| /* Continue packing by filling smallest cluster */ |
| if (verbosity > 2) { |
| VTR_LOG("\tPASSED: '%s' (%s)", blk_name.c_str(), blk_model->name); |
| VTR_LOGV(next_molecule->pack_pattern, " molecule %s molecule_size %zu", |
| next_molecule->pack_pattern->name, next_molecule->atom_block_ids.size()); |
| VTR_LOG("\n"); |
| } |
| VTR_LOGV(verbosity == 2, "."); |
| fflush(stdout); |
| |
| update_cluster_stats(next_molecule, clb_index, |
| is_clock, //Set of all clocks |
| is_clock, //Set of all global signals (currently clocks) |
| packer_opts.global_clocks, packer_opts.alpha, packer_opts.beta, packer_opts.timing_driven, |
| packer_opts.connection_driven, |
| high_fanout_threshold, |
| *timing_info); |
| num_unrelated_clustering_attempts = 0; |
| |
| if (packer_opts.timing_driven) { |
| blocks_since_last_analysis++; /* historically, timing slacks were recomputed after X number of blocks were packed, but this doesn't significantly alter results so I (jluu) did not port the code */ |
| } |
| next_molecule = get_molecule_for_cluster(cluster_ctx.clb_nlist.block_pb(clb_index), |
| atom_molecules, |
| allow_unrelated_clustering, |
| packer_opts.prioritize_transitive_connectivity, |
| packer_opts.transitive_fanout_threshold, |
| packer_opts.feasible_block_array_size, |
| &num_unrelated_clustering_attempts, |
| cur_cluster_placement_stats_ptr, |
| clb_inter_blk_nets, |
| clb_index, |
| packer_opts.pack_verbosity); |
| } |
| |
| VTR_LOGV(verbosity == 2, "\n"); |
| |
| if (detailed_routing_stage == (int)E_DETAILED_ROUTE_AT_END_ONLY) { |
| /* is_mode_conflict does not affect this stage. It is needed when trying to route the packed clusters. |
| * |
| * It holds a flag that is used to verify whether try_intra_lb_route ended in a mode conflict issue. |
| * If the value is TRUE the cluster has to be repacked, and its internal pb_graph_nodes will have more restrict choices |
| * for what regards the mode that has to be selected |
| */ |
| t_mode_selection_status mode_status; |
| is_cluster_legal = try_intra_lb_route(router_data, packer_opts.pack_verbosity, &mode_status); |
| if (is_cluster_legal) { |
| VTR_LOGV(verbosity > 2, "\tPassed route at end.\n"); |
| } else { |
| VTR_LOGV(verbosity > 0, "Failed route at end, repack cluster trying detailed routing at each stage.\n"); |
| } |
| } else { |
| is_cluster_legal = true; |
| } |
| |
| if (is_cluster_legal) { |
| intra_lb_routing.push_back(router_data->saved_lb_nets); |
| VTR_ASSERT((int)intra_lb_routing.size() == num_clb); |
| router_data->saved_lb_nets = nullptr; |
| |
| //Pick a new seed |
| istart = get_highest_gain_seed_molecule(&seedindex, atom_molecules, seed_atoms); |
| |
| if (packer_opts.timing_driven) { |
| if (num_blocks_hill_added > 0) { |
| blocks_since_last_analysis += num_blocks_hill_added; |
| } |
| } |
| |
| /* store info that will be used later in packing from pb_stats and free the rest */ |
| t_pb_stats* pb_stats = cluster_ctx.clb_nlist.block_pb(clb_index)->pb_stats; |
| for (const AtomNetId mnet_id : pb_stats->marked_nets) { |
| int external_terminals = atom_ctx.nlist.net_pins(mnet_id).size() - pb_stats->num_pins_of_net_in_pb[mnet_id]; |
| /* Check if external terminals of net is within the fanout limit and that there exists external terminals */ |
| if (external_terminals < packer_opts.transitive_fanout_threshold && external_terminals > 0) { |
| clb_inter_blk_nets[clb_index].push_back(mnet_id); |
| } |
| } |
| auto cur_pb = cluster_ctx.clb_nlist.block_pb(clb_index); |
| // update the pb type count by counting the used pb types in this packed cluster |
| update_pb_type_count(cur_pb, pb_type_count); |
| // update the data structure holding the LE counts |
| update_le_count(cur_pb, logic_block_type, le_pb_type, le_count); |
| free_pb_stats_recursive(cur_pb); |
| } else { |
| /* Free up data structures and requeue used molecules */ |
| num_used_type_instances[cluster_ctx.clb_nlist.block_type(clb_index)]--; |
| revalid_molecules(cluster_ctx.clb_nlist.block_pb(clb_index), atom_molecules); |
| cluster_ctx.clb_nlist.remove_block(clb_index); |
| cluster_ctx.clb_nlist.compress(); |
| num_clb--; |
| seedindex = savedseedindex; |
| } |
| free_router_data(router_data); |
| router_data = nullptr; |
| } |
| } |
| |
| // print the total number of used physical blocks for each |
| // physical block type after finishing the packing stage |
| print_pb_type_count(pb_type_count); |
| |
| // if this architecture has LE physical block, report its usage |
| if (le_pb_type) { |
| print_le_count(le_count, le_pb_type); |
| } |
| |
| /**************************************************************** |
| * Free Data Structures |
| *****************************************************************/ |
| VTR_ASSERT(num_clb == (int)cluster_ctx.clb_nlist.blocks().size()); |
| check_clustering(); |
| |
| output_clustering(intra_lb_routing, packer_opts.global_clocks, is_clock, arch->architecture_id, packer_opts.output_file.c_str(), false); |
| |
| VTR_ASSERT(cluster_ctx.clb_nlist.blocks().size() == intra_lb_routing.size()); |
| for (auto blk_id : cluster_ctx.clb_nlist.blocks()) |
| free_intra_lb_nets(intra_lb_routing[blk_id]); |
| |
| intra_lb_routing.clear(); |
| |
| if (packer_opts.hill_climbing_flag) |
| free(hill_climbing_inputs_avail); |
| |
| free_cluster_placement_stats(cluster_placement_stats); |
| |
| for (auto blk_id : cluster_ctx.clb_nlist.blocks()) |
| cluster_ctx.clb_nlist.remove_block(blk_id); |
| |
| cluster_ctx.clb_nlist = ClusteredNetlist(); |
| |
| free(unclustered_list_head); |
| free(memory_pool); |
| |
| free(primitives_list); |
| |
| return num_used_type_instances; |
| } |
| |
| /* Determine if atom block is in pb */ |
| static bool is_atom_blk_in_pb(const AtomBlockId blk_id, const t_pb* pb) { |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| const t_pb* cur_pb = atom_ctx.lookup.atom_pb(blk_id); |
| while (cur_pb) { |
| if (cur_pb == pb) { |
| return true; |
| } |
| cur_pb = cur_pb->parent_pb; |
| } |
| return false; |
| } |
| |
| /* Add blk to list of feasible blocks sorted according to gain */ |
| static void add_molecule_to_pb_stats_candidates(t_pack_molecule* molecule, |
| std::map<AtomBlockId, float>& gain, |
| t_pb* pb, |
| int max_queue_size) { |
| int i, j; |
| |
| for (i = 0; i < pb->pb_stats->num_feasible_blocks; i++) { |
| if (pb->pb_stats->feasible_blocks[i] == molecule) { |
| return; /* already in queue, do nothing */ |
| } |
| } |
| |
| if (pb->pb_stats->num_feasible_blocks >= max_queue_size - 1) { |
| /* maximum size for array, remove smallest gain element and sort */ |
| if (get_molecule_gain(molecule, gain) > get_molecule_gain(pb->pb_stats->feasible_blocks[0], gain)) { |
| /* single loop insertion sort */ |
| for (j = 0; j < pb->pb_stats->num_feasible_blocks - 1; j++) { |
| if (get_molecule_gain(molecule, gain) <= get_molecule_gain(pb->pb_stats->feasible_blocks[j + 1], gain)) { |
| pb->pb_stats->feasible_blocks[j] = molecule; |
| break; |
| } else { |
| pb->pb_stats->feasible_blocks[j] = pb->pb_stats->feasible_blocks[j + 1]; |
| } |
| } |
| if (j == pb->pb_stats->num_feasible_blocks - 1) { |
| pb->pb_stats->feasible_blocks[j] = molecule; |
| } |
| } |
| } else { |
| /* Expand array and single loop insertion sort */ |
| for (j = pb->pb_stats->num_feasible_blocks - 1; j >= 0; j--) { |
| if (get_molecule_gain(pb->pb_stats->feasible_blocks[j], gain) > get_molecule_gain(molecule, gain)) { |
| pb->pb_stats->feasible_blocks[j + 1] = pb->pb_stats->feasible_blocks[j]; |
| } else { |
| pb->pb_stats->feasible_blocks[j + 1] = molecule; |
| break; |
| } |
| } |
| if (j < 0) { |
| pb->pb_stats->feasible_blocks[0] = molecule; |
| } |
| pb->pb_stats->num_feasible_blocks++; |
| } |
| } |
| |
| /*****************************************/ |
| static void alloc_and_init_clustering(const t_molecule_stats& max_molecule_stats, |
| t_cluster_placement_stats** cluster_placement_stats, |
| t_pb_graph_node*** primitives_list, |
| t_pack_molecule* molecules_head, |
| int num_molecules) { |
| /* Allocates the main data structures used for clustering and properly * |
| * initializes them. */ |
| |
| t_molecule_link* next_ptr; |
| t_pack_molecule* cur_molecule; |
| t_pack_molecule** molecule_array; |
| int max_molecule_size; |
| |
| /* alloc and load list of molecules to pack */ |
| unclustered_list_head = (t_molecule_link*)vtr::calloc(max_molecule_stats.num_used_ext_inputs + 1, sizeof(t_molecule_link)); |
| unclustered_list_head_size = max_molecule_stats.num_used_ext_inputs + 1; |
| |
| for (int i = 0; i <= max_molecule_stats.num_used_ext_inputs; i++) { |
| unclustered_list_head[i].next = nullptr; |
| } |
| |
| molecule_array = (t_pack_molecule**)vtr::malloc(num_molecules * sizeof(t_pack_molecule*)); |
| cur_molecule = molecules_head; |
| for (int i = 0; i < num_molecules; i++) { |
| VTR_ASSERT(cur_molecule != nullptr); |
| molecule_array[i] = cur_molecule; |
| cur_molecule = cur_molecule->next; |
| } |
| VTR_ASSERT(cur_molecule == nullptr); |
| qsort((void*)molecule_array, num_molecules, sizeof(t_pack_molecule*), |
| compare_molecule_gain); |
| |
| memory_pool = (t_molecule_link*)vtr::malloc(num_molecules * sizeof(t_molecule_link)); |
| next_ptr = memory_pool; |
| |
| for (int i = 0; i < num_molecules; i++) { |
| //Figure out how many external inputs are used by this molecule |
| t_molecule_stats molecule_stats = calc_molecule_stats(molecule_array[i]); |
| int ext_inps = molecule_stats.num_used_ext_inputs; |
| |
| //Insert the molecule into the unclustered lists by number of external inputs |
| next_ptr->moleculeptr = molecule_array[i]; |
| next_ptr->next = unclustered_list_head[ext_inps].next; |
| unclustered_list_head[ext_inps].next = next_ptr; |
| |
| next_ptr++; |
| } |
| free(molecule_array); |
| |
| /* load net info */ |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| for (AtomNetId net : atom_ctx.nlist.nets()) { |
| AtomPinId driver_pin = atom_ctx.nlist.net_driver(net); |
| AtomBlockId driver_block = atom_ctx.nlist.pin_block(driver_pin); |
| |
| for (AtomPinId sink_pin : atom_ctx.nlist.net_sinks(net)) { |
| AtomBlockId sink_block = atom_ctx.nlist.pin_block(sink_pin); |
| |
| if (driver_block == sink_block) { |
| net_output_feeds_driving_block_input[net]++; |
| } |
| } |
| } |
| |
| /* alloc and load cluster placement info */ |
| *cluster_placement_stats = alloc_and_load_cluster_placement_stats(); |
| |
| /* alloc array that will store primitives that a molecule gets placed to, |
| * primitive_list is referenced by index, for example a atom block in index 2 of a molecule matches to a primitive in index 2 in primitive_list |
| * this array must be the size of the biggest molecule |
| */ |
| max_molecule_size = 1; |
| cur_molecule = molecules_head; |
| while (cur_molecule != nullptr) { |
| if (cur_molecule->num_blocks > max_molecule_size) { |
| max_molecule_size = cur_molecule->num_blocks; |
| } |
| cur_molecule = cur_molecule->next; |
| } |
| *primitives_list = (t_pb_graph_node**)vtr::calloc(max_molecule_size, sizeof(t_pb_graph_node*)); |
| } |
| |
| /*****************************************/ |
| static void free_pb_stats_recursive(t_pb* pb) { |
| int i, j; |
| /* Releases all the memory used by clustering data structures. */ |
| if (pb) { |
| if (pb->pb_graph_node != nullptr) { |
| if (!pb->pb_graph_node->is_primitive()) { |
| for (i = 0; i < pb->pb_graph_node->pb_type->modes[pb->mode].num_pb_type_children; i++) { |
| for (j = 0; j < pb->pb_graph_node->pb_type->modes[pb->mode].pb_type_children[i].num_pb; j++) { |
| if (pb->child_pbs && pb->child_pbs[i]) { |
| free_pb_stats_recursive(&pb->child_pbs[i][j]); |
| } |
| } |
| } |
| } |
| } |
| free_pb_stats(pb); |
| } |
| } |
| |
| static bool primitive_feasible(const AtomBlockId blk_id, t_pb* cur_pb) { |
| const t_pb_type* cur_pb_type = cur_pb->pb_graph_node->pb_type; |
| |
| VTR_ASSERT(cur_pb_type->num_modes == 0); /* primitive */ |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| AtomBlockId cur_pb_blk_id = atom_ctx.lookup.pb_atom(cur_pb); |
| if (cur_pb_blk_id && cur_pb_blk_id != blk_id) { |
| /* This pb already has a different logical block */ |
| return false; |
| } |
| |
| if (cur_pb_type->class_type == MEMORY_CLASS) { |
| /* Memory class has additional feasibility requirements: |
| * - all siblings must share all nets, including open nets, with the exception of data nets */ |
| |
| /* find sibling if one exists */ |
| AtomBlockId sibling_memory_blk_id = find_memory_sibling(cur_pb); |
| |
| if (sibling_memory_blk_id) { |
| //There is a sibling, see if the current block is feasible with it |
| bool sibling_feasible = primitive_memory_sibling_feasible(blk_id, cur_pb_type, sibling_memory_blk_id); |
| if (!sibling_feasible) { |
| return false; |
| } |
| } |
| } |
| |
| //Generic feasibility check |
| return primitive_type_feasible(blk_id, cur_pb_type); |
| } |
| |
| static bool primitive_memory_sibling_feasible(const AtomBlockId blk_id, const t_pb_type* cur_pb_type, const AtomBlockId sibling_blk_id) { |
| /* Check that the two atom blocks blk_id and sibling_blk_id (which should both be memory slices) |
| * are feasible, in the sence that they have precicely the same net connections (with the |
| * exception of nets in data port classes). |
| * |
| * Note that this routine does not check pin feasibility against the cur_pb_type; so |
| * primitive_type_feasible() should also be called on blk_id before concluding it is feasible. |
| */ |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| VTR_ASSERT(cur_pb_type->class_type == MEMORY_CLASS); |
| |
| //First, identify the 'data' ports by looking at the cur_pb_type |
| std::unordered_set<t_model_ports*> data_ports; |
| for (int iport = 0; iport < cur_pb_type->num_ports; ++iport) { |
| const char* port_class = cur_pb_type->ports[iport].port_class; |
| if (port_class && strstr(port_class, "data") == port_class) { |
| //The port_class starts with "data", so it is a data port |
| |
| //Record the port |
| data_ports.insert(cur_pb_type->ports[iport].model_port); |
| } |
| } |
| |
| //Now verify that all nets (except those connected to data ports) are equivalent |
| //between blk_id and sibling_blk_id |
| |
| //Since the atom netlist stores only in-use ports, we iterate over the model to ensure |
| //all ports are compared |
| const t_model* model = cur_pb_type->model; |
| for (t_model_ports* port : {model->inputs, model->outputs}) { |
| for (; port; port = port->next) { |
| if (data_ports.count(port)) { |
| //Don't check data ports |
| continue; |
| } |
| |
| //Note: VPR doesn't support multi-driven nets, so all outputs |
| //should be data ports, otherwise the siblings will both be |
| //driving the output net |
| |
| //Get the ports from each primitive |
| auto blk_port_id = atom_ctx.nlist.find_atom_port(blk_id, port); |
| auto sib_port_id = atom_ctx.nlist.find_atom_port(sibling_blk_id, port); |
| |
| //Check that all nets (including unconnected nets) match |
| for (int ipin = 0; ipin < port->size; ++ipin) { |
| //The nets are initialized as invalid (i.e. disconnected) |
| AtomNetId blk_net_id; |
| AtomNetId sib_net_id; |
| |
| //We can get the actual net provided the port exists |
| // |
| //Note that if the port did not exist, the net is left |
| //as invalid/disconneced |
| if (blk_port_id) { |
| blk_net_id = atom_ctx.nlist.port_net(blk_port_id, ipin); |
| } |
| if (sib_port_id) { |
| sib_net_id = atom_ctx.nlist.port_net(sib_port_id, ipin); |
| } |
| |
| //The sibling and block must have the same (possibly disconnected) |
| //net on this pin |
| if (blk_net_id != sib_net_id) { |
| //Nets do not match, not feasible |
| return false; |
| } |
| } |
| } |
| } |
| |
| return true; |
| } |
| |
| /*****************************************/ |
| static t_pack_molecule* get_molecule_by_num_ext_inputs(const int ext_inps, |
| const enum e_removal_policy remove_flag, |
| t_cluster_placement_stats* cluster_placement_stats_ptr) { |
| /* This routine returns an atom block which has not been clustered, has * |
| * no connection to the current cluster, satisfies the cluster * |
| * clock constraints, is a valid subblock inside the cluster, does not exceed the cluster subblock units available, |
| * and has ext_inps external inputs. If * |
| * there is no such atom block it returns ClusterBlockId::INVALID(). Remove_flag * |
| * controls whether or not blocks that have already been clustered * |
| * are removed from the unclustered_list data structures. NB: * |
| * to get a atom block regardless of clock constraints just set clocks_ * |
| * avail > 0. */ |
| |
| t_molecule_link *ptr, *prev_ptr; |
| int i; |
| bool success; |
| |
| prev_ptr = &unclustered_list_head[ext_inps]; |
| ptr = unclustered_list_head[ext_inps].next; |
| while (ptr != nullptr) { |
| /* TODO: Get better candidate atom block in future, eg. return most timing critical or some other smarter metric */ |
| if (ptr->moleculeptr->valid) { |
| success = true; |
| for (i = 0; i < get_array_size_of_molecule(ptr->moleculeptr); i++) { |
| if (ptr->moleculeptr->atom_block_ids[i]) { |
| auto blk_id = ptr->moleculeptr->atom_block_ids[i]; |
| if (!exists_free_primitive_for_atom_block(cluster_placement_stats_ptr, blk_id)) { |
| /* TODO: I should be using a better filtering check especially when I'm |
| * dealing with multiple clock/multiple global reset signals where the clock/reset |
| * packed in matters, need to do later when I have the circuits to check my work */ |
| success = false; |
| break; |
| } |
| } |
| } |
| if (success == true) { |
| return ptr->moleculeptr; |
| } |
| prev_ptr = ptr; |
| } |
| |
| else if (remove_flag == REMOVE_CLUSTERED) { |
| VTR_ASSERT(0); /* this doesn't work right now with 2 the pass packing for each complex block */ |
| prev_ptr->next = ptr->next; |
| } |
| |
| ptr = ptr->next; |
| } |
| |
| return nullptr; |
| } |
| |
| /*****************************************/ |
| static t_pack_molecule* get_free_molecule_with_most_ext_inputs_for_cluster(t_pb* cur_pb, |
| t_cluster_placement_stats* cluster_placement_stats_ptr) { |
| /* This routine is used to find new blocks for clustering when there are no feasible * |
| * blocks with any attraction to the current cluster (i.e. it finds * |
| * blocks which are unconnected from the current cluster). It returns * |
| * the atom block with the largest number of used inputs that satisfies the * |
| * clocking and number of inputs constraints. If no suitable atom block is * |
| * found, the routine returns ClusterBlockId::INVALID(). |
| * TODO: Analyze if this function is useful in more detail, also, should probably not include clock in input count |
| */ |
| |
| int inputs_avail = 0; |
| |
| for (int i = 0; i < cur_pb->pb_graph_node->num_input_pin_class; i++) { |
| inputs_avail += cur_pb->pb_stats->input_pins_used[i].size(); |
| } |
| |
| t_pack_molecule* molecule = nullptr; |
| |
| if (inputs_avail >= unclustered_list_head_size) { |
| inputs_avail = unclustered_list_head_size - 1; |
| } |
| |
| for (int ext_inps = inputs_avail; ext_inps >= 0; ext_inps--) { |
| molecule = get_molecule_by_num_ext_inputs(ext_inps, LEAVE_CLUSTERED, cluster_placement_stats_ptr); |
| if (molecule != nullptr) { |
| break; |
| } |
| } |
| return molecule; |
| } |
| |
| /*****************************************/ |
| static void alloc_and_load_pb_stats(t_pb* pb, const int feasible_block_array_size) { |
| /* Call this routine when starting to fill up a new cluster. It resets * |
| * the gain vector, etc. */ |
| |
| pb->pb_stats = new t_pb_stats; |
| |
| /* If statement below is for speed. If nets are reasonably low-fanout, * |
| * only a relatively small number of blocks will be marked, and updating * |
| * only those atom block structures will be fastest. If almost all blocks * |
| * have been touched it should be faster to just run through them all * |
| * in order (less addressing and better cache locality). */ |
| pb->pb_stats->input_pins_used = std::vector<std::unordered_map<size_t, AtomNetId>>(pb->pb_graph_node->num_input_pin_class); |
| pb->pb_stats->output_pins_used = std::vector<std::unordered_map<size_t, AtomNetId>>(pb->pb_graph_node->num_output_pin_class); |
| pb->pb_stats->lookahead_input_pins_used = std::vector<std::vector<AtomNetId>>(pb->pb_graph_node->num_input_pin_class); |
| pb->pb_stats->lookahead_output_pins_used = std::vector<std::vector<AtomNetId>>(pb->pb_graph_node->num_output_pin_class); |
| pb->pb_stats->num_feasible_blocks = NOT_VALID; |
| pb->pb_stats->feasible_blocks = (t_pack_molecule**)vtr::calloc(feasible_block_array_size, sizeof(t_pack_molecule*)); |
| |
| pb->pb_stats->tie_break_high_fanout_net = AtomNetId::INVALID(); |
| |
| pb->pb_stats->gain.clear(); |
| pb->pb_stats->timinggain.clear(); |
| pb->pb_stats->connectiongain.clear(); |
| pb->pb_stats->sharinggain.clear(); |
| pb->pb_stats->hillgain.clear(); |
| pb->pb_stats->transitive_fanout_candidates.clear(); |
| |
| pb->pb_stats->num_pins_of_net_in_pb.clear(); |
| |
| pb->pb_stats->num_child_blocks_in_pb = 0; |
| |
| pb->pb_stats->explore_transitive_fanout = true; |
| } |
| /*****************************************/ |
| |
| /** |
| * Try pack molecule into current cluster |
| */ |
| static enum e_block_pack_status try_pack_molecule(t_cluster_placement_stats* cluster_placement_stats_ptr, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| t_pack_molecule* molecule, |
| t_pb_graph_node** primitives_list, |
| t_pb* pb, |
| const int max_models, |
| const int max_cluster_size, |
| const ClusterBlockId clb_index, |
| const int detailed_routing_stage, |
| t_lb_router_data* router_data, |
| int verbosity, |
| bool enable_pin_feasibility_filter, |
| const int feasible_block_array_size, |
| t_ext_pin_util max_external_pin_util) { |
| int molecule_size, failed_location; |
| int i; |
| enum e_block_pack_status block_pack_status; |
| t_pb* parent; |
| t_pb* cur_pb; |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| parent = nullptr; |
| |
| block_pack_status = BLK_STATUS_UNDEFINED; |
| |
| molecule_size = get_array_size_of_molecule(molecule); |
| failed_location = 0; |
| |
| if (verbosity > 3) { |
| AtomBlockId root_atom = molecule->atom_block_ids[molecule->root]; |
| VTR_LOG("\t\tTry pack molecule: '%s' (%s)", |
| atom_ctx.nlist.block_name(root_atom).c_str(), |
| atom_ctx.nlist.block_model(root_atom)->name); |
| VTR_LOGV(molecule->pack_pattern, |
| " molecule_type %s molecule_size %zu", |
| molecule->pack_pattern->name, |
| molecule->atom_block_ids.size()); |
| VTR_LOG("\n"); |
| } |
| |
| // if this cluster has a molecule placed in it that is part of a long chain |
| // (a chain that consists of more than one molecule), don't allow more long chain |
| // molecules to be placed in this cluster. To avoid possibly creating cluster level |
| // blocks that have incompatible placement constraints or form very long placement |
| // macros that limit placement flexibility. |
| if (cluster_placement_stats_ptr->has_long_chain && molecule->is_chain() && molecule->chain_info->is_long_chain) { |
| VTR_LOGV(verbosity > 4, "\t\t\tFAILED Placement Feasibility Filter: Only one long chain per cluster is allowed\n"); |
| return BLK_FAILED_FEASIBLE; |
| } |
| |
| while (block_pack_status != BLK_PASSED) { |
| if (get_next_primitive_list(cluster_placement_stats_ptr, molecule, |
| primitives_list)) { |
| block_pack_status = BLK_PASSED; |
| |
| for (i = 0; i < molecule_size && block_pack_status == BLK_PASSED; i++) { |
| VTR_ASSERT((primitives_list[i] == nullptr) == (!molecule->atom_block_ids[i])); |
| failed_location = i + 1; |
| // try place atom block if it exists |
| if (molecule->atom_block_ids[i]) { |
| block_pack_status = try_place_atom_block_rec(primitives_list[i], |
| molecule->atom_block_ids[i], pb, &parent, |
| max_models, max_cluster_size, clb_index, |
| cluster_placement_stats_ptr, molecule, router_data, |
| verbosity, feasible_block_array_size); |
| } |
| } |
| if (enable_pin_feasibility_filter && block_pack_status == BLK_PASSED) { |
| /* Check if pin usage is feasible for the current packing assignment */ |
| reset_lookahead_pins_used(pb); |
| try_update_lookahead_pins_used(pb); |
| if (!check_lookahead_pins_used(pb, max_external_pin_util)) { |
| VTR_LOGV(verbosity > 4, "\t\t\tFAILED Pin Feasibility Filter\n"); |
| block_pack_status = BLK_FAILED_FEASIBLE; |
| } |
| } |
| if (block_pack_status == BLK_PASSED) { |
| /* |
| * during the clustering step of `do_clustering`, `detailed_routing_stage` is incremented at each iteration until it a cluster |
| * is correctly generated or `detailed_routing_stage` assumes an invalid value (E_DETAILED_ROUTE_INVALID). |
| * depending on its value we have different behaviors: |
| * - E_DETAILED_ROUTE_AT_END_ONLY: Skip routing if heuristic is to route at the end of packing complex block. |
| * - E_DETAILED_ROUTE_FOR_EACH_ATOM: Try to route if heuristic is to route for every atom. If the clusterer arrives at this stage, |
| * it means that more checks have to be performed as the previous stage failed to generate a new cluster. |
| * |
| * mode_status is a data structure containing the status of the mode selection. Its members are: |
| * - bool is_mode_conflict |
| * - bool try_expand_all_modes |
| * - bool expand_all_modes |
| * |
| * is_mode_conflict affects this stage. Its value determines whether the cluster failed to pack after a mode conflict issue. |
| * It holds a flag that is used to verify whether try_intra_lb_route ended in a mode conflict issue. |
| * |
| * Until is_mode_conflict is set to FALSE by try_intra_lb_route, the loop re-iterates. If all the available modes are exhausted |
| * an error will be thrown during mode conflicts checks (this to prevent infinite loops). |
| * |
| * If the value is TRUE the cluster has to be re-routed, and its internal pb_graph_nodes will have more restrict choices |
| * for what regards the mode that has to be selected. |
| * |
| * is_mode_conflict is initially set to TRUE, and, unless a mode conflict is found, it is set to false in `try_intra_lb_route`. |
| * |
| * try_expand_all_modes is set if the node expansion failed to find a valid routing path. The clusterer tries to find another route |
| * by using all the modes during node expansion. |
| * |
| * expand_all_modes is used to enable the expansion of all the nodes using all the possible modes. |
| */ |
| t_mode_selection_status mode_status; |
| bool is_routed = false; |
| bool do_detailed_routing_stage = detailed_routing_stage == (int)E_DETAILED_ROUTE_FOR_EACH_ATOM; |
| if (do_detailed_routing_stage) { |
| do { |
| reset_intra_lb_route(router_data); |
| is_routed = try_intra_lb_route(router_data, verbosity, &mode_status); |
| } while (do_detailed_routing_stage && mode_status.is_mode_issue()); |
| } |
| |
| if (do_detailed_routing_stage && is_routed == false) { |
| /* Cannot pack */ |
| VTR_LOGV(verbosity > 4, "\t\t\tFAILED Detailed Routing Legality\n"); |
| block_pack_status = BLK_FAILED_ROUTE; |
| } else { |
| /* Pack successful, commit |
| * TODO: SW Engineering note - may want to update cluster stats here too instead of doing it outside |
| */ |
| VTR_ASSERT(block_pack_status == BLK_PASSED); |
| if (molecule->is_chain()) { |
| /* Chained molecules often take up lots of area and are important, |
| * if a chain is packed in, want to rename logic block to match chain name */ |
| AtomBlockId chain_root_blk_id = molecule->atom_block_ids[molecule->pack_pattern->root_block->block_id]; |
| cur_pb = atom_ctx.lookup.atom_pb(chain_root_blk_id)->parent_pb; |
| while (cur_pb != nullptr) { |
| free(cur_pb->name); |
| cur_pb->name = vtr::strdup(atom_ctx.nlist.block_name(chain_root_blk_id).c_str()); |
| cur_pb = cur_pb->parent_pb; |
| } |
| // if this molecule is part of a chain, mark the cluster as having a long chain |
| // molecule. Also check if it's the first molecule in the chain to be packed. |
| // If so, update the chain id for this chain of molecules to make sure all |
| // molecules will be packed to the same chain id and can reach each other using |
| // the chain direct links between clusters |
| if (molecule->chain_info->is_long_chain) { |
| cluster_placement_stats_ptr->has_long_chain = true; |
| if (molecule->chain_info->chain_id == -1) { |
| update_molecule_chain_info(molecule, primitives_list[molecule->root]); |
| } |
| } |
| } |
| |
| for (i = 0; i < molecule_size; i++) { |
| if (molecule->atom_block_ids[i]) { |
| /* invalidate all molecules that share atom block with current molecule */ |
| |
| auto rng = atom_molecules.equal_range(molecule->atom_block_ids[i]); |
| for (const auto& kv : vtr::make_range(rng.first, rng.second)) { |
| t_pack_molecule* cur_molecule = kv.second; |
| cur_molecule->valid = false; |
| } |
| |
| commit_primitive(cluster_placement_stats_ptr, primitives_list[i]); |
| } |
| } |
| } |
| } |
| |
| if (block_pack_status != BLK_PASSED) { |
| for (i = 0; i < failed_location; i++) { |
| if (molecule->atom_block_ids[i]) { |
| remove_atom_from_target(router_data, molecule->atom_block_ids[i]); |
| } |
| } |
| for (i = 0; i < failed_location; i++) { |
| if (molecule->atom_block_ids[i]) { |
| revert_place_atom_block(molecule->atom_block_ids[i], router_data, atom_molecules); |
| } |
| } |
| } else { |
| VTR_LOGV(verbosity > 3, "\t\tPASSED pack molecule\n"); |
| } |
| } else { |
| VTR_LOGV(verbosity > 3, "\t\tFAILED No candidate primitives available\n"); |
| block_pack_status = BLK_FAILED_FEASIBLE; |
| break; /* no more candidate primitives available, this molecule will not pack, return fail */ |
| } |
| } |
| return block_pack_status; |
| } |
| |
| /** |
| * Try place atom block into current primitive location |
| */ |
| |
| static enum e_block_pack_status try_place_atom_block_rec(const t_pb_graph_node* pb_graph_node, |
| const AtomBlockId blk_id, |
| t_pb* cb, |
| t_pb** parent, |
| const int max_models, |
| const int max_cluster_size, |
| const ClusterBlockId clb_index, |
| const t_cluster_placement_stats* cluster_placement_stats_ptr, |
| const t_pack_molecule* molecule, |
| t_lb_router_data* router_data, |
| int verbosity, |
| const int feasible_block_array_size) { |
| int i, j; |
| bool is_primitive; |
| enum e_block_pack_status block_pack_status; |
| |
| t_pb* my_parent; |
| t_pb *pb, *parent_pb; |
| const t_pb_type* pb_type; |
| |
| auto& atom_ctx = g_vpr_ctx.mutable_atom(); |
| |
| my_parent = nullptr; |
| |
| block_pack_status = BLK_PASSED; |
| |
| /* Discover parent */ |
| if (pb_graph_node->parent_pb_graph_node != cb->pb_graph_node) { |
| block_pack_status = try_place_atom_block_rec(pb_graph_node->parent_pb_graph_node, blk_id, cb, |
| &my_parent, max_models, max_cluster_size, clb_index, |
| cluster_placement_stats_ptr, molecule, router_data, |
| verbosity, feasible_block_array_size); |
| parent_pb = my_parent; |
| } else { |
| parent_pb = cb; |
| } |
| |
| /* Create siblings if siblings are not allocated */ |
| if (parent_pb->child_pbs == nullptr) { |
| atom_ctx.lookup.set_atom_pb(AtomBlockId::INVALID(), parent_pb); |
| |
| VTR_ASSERT(parent_pb->name == nullptr); |
| parent_pb->name = vtr::strdup(atom_ctx.nlist.block_name(blk_id).c_str()); |
| parent_pb->mode = pb_graph_node->pb_type->parent_mode->index; |
| set_reset_pb_modes(router_data, parent_pb, true); |
| const t_mode* mode = &parent_pb->pb_graph_node->pb_type->modes[parent_pb->mode]; |
| parent_pb->child_pbs = new t_pb*[mode->num_pb_type_children]; |
| |
| for (i = 0; i < mode->num_pb_type_children; i++) { |
| parent_pb->child_pbs[i] = new t_pb[mode->pb_type_children[i].num_pb]; |
| |
| for (j = 0; j < mode->pb_type_children[i].num_pb; j++) { |
| parent_pb->child_pbs[i][j].parent_pb = parent_pb; |
| |
| atom_ctx.lookup.set_atom_pb(AtomBlockId::INVALID(), &parent_pb->child_pbs[i][j]); |
| |
| parent_pb->child_pbs[i][j].pb_graph_node = &(parent_pb->pb_graph_node->child_pb_graph_nodes[parent_pb->mode][i][j]); |
| } |
| } |
| } else { |
| VTR_ASSERT(parent_pb->mode == pb_graph_node->pb_type->parent_mode->index); |
| } |
| |
| const t_mode* mode = &parent_pb->pb_graph_node->pb_type->modes[parent_pb->mode]; |
| for (i = 0; i < mode->num_pb_type_children; i++) { |
| if (pb_graph_node->pb_type == &mode->pb_type_children[i]) { |
| break; |
| } |
| } |
| VTR_ASSERT(i < mode->num_pb_type_children); |
| pb = &parent_pb->child_pbs[i][pb_graph_node->placement_index]; |
| *parent = pb; /* this pb is parent of it's child that called this function */ |
| VTR_ASSERT(pb->pb_graph_node == pb_graph_node); |
| if (pb->pb_stats == nullptr) { |
| alloc_and_load_pb_stats(pb, feasible_block_array_size); |
| } |
| pb_type = pb_graph_node->pb_type; |
| |
| is_primitive = (pb_type->num_modes == 0); |
| |
| if (is_primitive) { |
| VTR_ASSERT(!atom_ctx.lookup.pb_atom(pb) |
| && atom_ctx.lookup.atom_pb(blk_id) == nullptr |
| && atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()); |
| /* try pack to location */ |
| VTR_ASSERT(pb->name == nullptr); |
| pb->name = vtr::strdup(atom_ctx.nlist.block_name(blk_id).c_str()); |
| |
| //Update the atom netlist mappings |
| atom_ctx.lookup.set_atom_clb(blk_id, clb_index); |
| atom_ctx.lookup.set_atom_pb(blk_id, pb); |
| |
| add_atom_as_target(router_data, blk_id); |
| if (!primitive_feasible(blk_id, pb)) { |
| /* failed location feasibility check, revert pack */ |
| block_pack_status = BLK_FAILED_FEASIBLE; |
| } |
| |
| // if this block passed and is part of a chained molecule |
| if (block_pack_status == BLK_PASSED && molecule->is_chain()) { |
| auto molecule_root_block = molecule->atom_block_ids[molecule->root]; |
| // if this is the root block of the chain molecule check its placmeent feasibility |
| if (blk_id == molecule_root_block) { |
| block_pack_status = check_chain_root_placement_feasibility(pb_graph_node, molecule, blk_id); |
| } |
| } |
| |
| VTR_LOGV(verbosity > 4 && block_pack_status == BLK_PASSED, |
| "\t\t\tPlaced atom '%s' (%s) at %s\n", |
| atom_ctx.nlist.block_name(blk_id).c_str(), |
| atom_ctx.nlist.block_model(blk_id)->name, |
| pb->hierarchical_type_name().c_str()); |
| } |
| |
| if (block_pack_status != BLK_PASSED) { |
| free(pb->name); |
| pb->name = nullptr; |
| } |
| |
| return block_pack_status; |
| } |
| |
| /* Revert trial atom block iblock and free up memory space accordingly |
| */ |
| static void revert_place_atom_block(const AtomBlockId blk_id, t_lb_router_data* router_data, const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules) { |
| auto& atom_ctx = g_vpr_ctx.mutable_atom(); |
| |
| //We cast away const here since we may free the pb, and it is |
| //being removed from the active mapping. |
| // |
| //In general most code works fine accessing cosnt t_pb*, |
| //which is why we store them as such in atom_ctx.lookup |
| t_pb* pb = const_cast<t_pb*>(atom_ctx.lookup.atom_pb(blk_id)); |
| |
| //Update the atom netlist mapping |
| atom_ctx.lookup.set_atom_clb(blk_id, ClusterBlockId::INVALID()); |
| atom_ctx.lookup.set_atom_pb(blk_id, nullptr); |
| |
| if (pb != nullptr) { |
| /* When freeing molecules, the current block might already have been freed by a prior revert |
| * When this happens, no need to do anything beyond basic book keeping at the atom block |
| */ |
| |
| t_pb* next = pb->parent_pb; |
| revalid_molecules(pb, atom_molecules); |
| free_pb(pb); |
| pb = next; |
| |
| while (pb != nullptr) { |
| /* If this is pb is created only for the purposes of holding new molecule, remove it |
| * Must check if cluster is already freed (which can be the case) |
| */ |
| next = pb->parent_pb; |
| |
| if (pb->child_pbs != nullptr && pb->pb_stats != nullptr |
| && pb->pb_stats->num_child_blocks_in_pb == 0) { |
| set_reset_pb_modes(router_data, pb, false); |
| if (next != nullptr) { |
| /* If the code gets here, then that means that placing the initial seed molecule |
| * failed, don't free the actual complex block itself as the seed needs to find |
| * another placement */ |
| revalid_molecules(pb, atom_molecules); |
| free_pb(pb); |
| } |
| } |
| pb = next; |
| } |
| } |
| } |
| |
| static void update_connection_gain_values(const AtomNetId net_id, const AtomBlockId clustered_blk_id, t_pb* cur_pb, enum e_net_relation_to_clustered_block net_relation_to_clustered_block) { |
| /*This function is called when the connectiongain values on the net net_id* |
| *require updating. */ |
| |
| int num_internal_connections, num_open_connections, num_stuck_connections; |
| |
| num_internal_connections = num_open_connections = num_stuck_connections = 0; |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| ClusterBlockId clb_index = atom_ctx.lookup.atom_clb(clustered_blk_id); |
| |
| /* may wish to speed things up by ignoring clock nets since they are high fanout */ |
| |
| for (auto pin_id : atom_ctx.nlist.net_pins(net_id)) { |
| auto blk_id = atom_ctx.nlist.pin_block(pin_id); |
| if (atom_ctx.lookup.atom_clb(blk_id) == clb_index |
| && is_atom_blk_in_pb(blk_id, atom_ctx.lookup.atom_pb(clustered_blk_id))) { |
| num_internal_connections++; |
| } else if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { |
| num_open_connections++; |
| } else { |
| num_stuck_connections++; |
| } |
| } |
| |
| if (net_relation_to_clustered_block == OUTPUT) { |
| for (auto pin_id : atom_ctx.nlist.net_sinks(net_id)) { |
| auto blk_id = atom_ctx.nlist.pin_block(pin_id); |
| VTR_ASSERT(blk_id); |
| |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { |
| /* TODO: Gain function accurate only if net has one connection to block, |
| * TODO: Should we handle case where net has multi-connection to block? |
| * Gain computation is only off by a bit in this case */ |
| if (cur_pb->pb_stats->connectiongain.count(blk_id) == 0) { |
| cur_pb->pb_stats->connectiongain[blk_id] = 0; |
| } |
| |
| if (num_internal_connections > 1) { |
| cur_pb->pb_stats->connectiongain[blk_id] -= 1 / (float)(num_open_connections + 1.5 * num_stuck_connections + 1 + 0.1); |
| } |
| cur_pb->pb_stats->connectiongain[blk_id] += 1 / (float)(num_open_connections + 1.5 * num_stuck_connections + 0.1); |
| } |
| } |
| } |
| |
| if (net_relation_to_clustered_block == INPUT) { |
| /*Calculate the connectiongain for the atom block which is driving * |
| *the atom net that is an input to an atom block in the cluster */ |
| |
| auto driver_pin_id = atom_ctx.nlist.net_driver(net_id); |
| auto blk_id = atom_ctx.nlist.pin_block(driver_pin_id); |
| |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { |
| if (cur_pb->pb_stats->connectiongain.count(blk_id) == 0) { |
| cur_pb->pb_stats->connectiongain[blk_id] = 0; |
| } |
| if (num_internal_connections > 1) { |
| cur_pb->pb_stats->connectiongain[blk_id] -= 1 / (float)(num_open_connections + 1.5 * num_stuck_connections + 0.1 + 1); |
| } |
| cur_pb->pb_stats->connectiongain[blk_id] += 1 / (float)(num_open_connections + 1.5 * num_stuck_connections + 0.1); |
| } |
| } |
| } |
| /*****************************************/ |
| static void update_timing_gain_values(const AtomNetId net_id, |
| t_pb* cur_pb, |
| enum e_net_relation_to_clustered_block net_relation_to_clustered_block, |
| const SetupTimingInfo& timing_info, |
| const std::unordered_set<AtomNetId>& is_global) { |
| /*This function is called when the timing_gain values on the atom net* |
| *net_id requires updating. */ |
| float timinggain; |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| /* Check if this atom net lists its driving atom block twice. If so, avoid * |
| * double counting this atom block by skipping the first (driving) pin. */ |
| auto pins = atom_ctx.nlist.net_pins(net_id); |
| if (net_output_feeds_driving_block_input[net_id] != 0) |
| pins = atom_ctx.nlist.net_sinks(net_id); |
| |
| if (net_relation_to_clustered_block == OUTPUT |
| && !is_global.count(net_id)) { |
| for (auto pin_id : pins) { |
| auto blk_id = atom_ctx.nlist.pin_block(pin_id); |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { |
| timinggain = timing_info.setup_pin_criticality(pin_id); |
| |
| if (cur_pb->pb_stats->timinggain.count(blk_id) == 0) { |
| cur_pb->pb_stats->timinggain[blk_id] = 0; |
| } |
| if (timinggain > cur_pb->pb_stats->timinggain[blk_id]) |
| cur_pb->pb_stats->timinggain[blk_id] = timinggain; |
| } |
| } |
| } |
| |
| if (net_relation_to_clustered_block == INPUT |
| && !is_global.count(net_id)) { |
| /*Calculate the timing gain for the atom block which is driving * |
| *the atom net that is an input to a atom block in the cluster */ |
| auto driver_pin = atom_ctx.nlist.net_driver(net_id); |
| auto new_blk_id = atom_ctx.nlist.pin_block(driver_pin); |
| |
| if (atom_ctx.lookup.atom_clb(new_blk_id) == ClusterBlockId::INVALID()) { |
| for (auto pin_id : atom_ctx.nlist.net_sinks(net_id)) { |
| timinggain = timing_info.setup_pin_criticality(pin_id); |
| |
| if (cur_pb->pb_stats->timinggain.count(new_blk_id) == 0) { |
| cur_pb->pb_stats->timinggain[new_blk_id] = 0; |
| } |
| if (timinggain > cur_pb->pb_stats->timinggain[new_blk_id]) |
| cur_pb->pb_stats->timinggain[new_blk_id] = timinggain; |
| } |
| } |
| } |
| } |
| |
| /*****************************************/ |
| static void mark_and_update_partial_gain(const AtomNetId net_id, enum e_gain_update gain_flag, const AtomBlockId clustered_blk_id, bool timing_driven, bool connection_driven, enum e_net_relation_to_clustered_block net_relation_to_clustered_block, const SetupTimingInfo& timing_info, const std::unordered_set<AtomNetId>& is_global, const int high_fanout_net_threshold) { |
| /* Updates the marked data structures, and if gain_flag is GAIN, * |
| * the gain when an atom block is added to a cluster. The * |
| * sharinggain is the number of inputs that a atom block shares with * |
| * blocks that are already in the cluster. Hillgain is the * |
| * reduction in number of pins-required by adding a atom block to the * |
| * cluster. The timinggain is the criticality of the most critical* |
| * atom net between this atom block and an atom block in the cluster. */ |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| t_pb* cur_pb = atom_ctx.lookup.atom_pb(clustered_blk_id)->parent_pb; |
| |
| if (int(atom_ctx.nlist.net_sinks(net_id).size()) > high_fanout_net_threshold) { |
| /* Optimization: It can be too runtime costly for marking all sinks for |
| * a high fanout-net that probably has no hope of ever getting packed, |
| * thus ignore those high fanout nets */ |
| if (!is_global.count(net_id)) { |
| /* If no low/medium fanout nets, we may need to consider |
| * high fan-out nets for packing, so select one and store it */ |
| while (cur_pb->parent_pb != nullptr) { |
| cur_pb = cur_pb->parent_pb; |
| } |
| AtomNetId stored_net = cur_pb->pb_stats->tie_break_high_fanout_net; |
| if (!stored_net || atom_ctx.nlist.net_sinks(net_id).size() < atom_ctx.nlist.net_sinks(stored_net).size()) { |
| cur_pb->pb_stats->tie_break_high_fanout_net = net_id; |
| } |
| } |
| return; |
| } |
| |
| while (cur_pb) { |
| /* Mark atom net as being visited, if necessary. */ |
| |
| if (cur_pb->pb_stats->num_pins_of_net_in_pb.count(net_id) == 0) { |
| cur_pb->pb_stats->marked_nets.push_back(net_id); |
| } |
| |
| /* Update gains of affected blocks. */ |
| |
| if (gain_flag == GAIN) { |
| /* Check if this net is connected to it's driver block multiple times (i.e. as both an output and input) |
| * If so, avoid double counting by skipping the first (driving) pin. */ |
| |
| auto pins = atom_ctx.nlist.net_pins(net_id); |
| if (net_output_feeds_driving_block_input[net_id] != 0) |
| //We implicitly assume here that net_output_feeds_driver_block_input[net_id] is 2 |
| //(i.e. the net loops back to the block only once) |
| pins = atom_ctx.nlist.net_sinks(net_id); |
| |
| if (cur_pb->pb_stats->num_pins_of_net_in_pb.count(net_id) == 0) { |
| for (auto pin_id : pins) { |
| auto blk_id = atom_ctx.nlist.pin_block(pin_id); |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { |
| if (cur_pb->pb_stats->sharinggain.count(blk_id) == 0) { |
| cur_pb->pb_stats->marked_blocks.push_back(blk_id); |
| cur_pb->pb_stats->sharinggain[blk_id] = 1; |
| cur_pb->pb_stats->hillgain[blk_id] = 1 - num_ext_inputs_atom_block(blk_id); |
| } else { |
| cur_pb->pb_stats->sharinggain[blk_id]++; |
| cur_pb->pb_stats->hillgain[blk_id]++; |
| } |
| } |
| } |
| } |
| |
| if (connection_driven) { |
| update_connection_gain_values(net_id, clustered_blk_id, cur_pb, |
| net_relation_to_clustered_block); |
| } |
| |
| if (timing_driven) { |
| update_timing_gain_values(net_id, cur_pb, |
| net_relation_to_clustered_block, |
| timing_info, |
| is_global); |
| } |
| } |
| if (cur_pb->pb_stats->num_pins_of_net_in_pb.count(net_id) == 0) { |
| cur_pb->pb_stats->num_pins_of_net_in_pb[net_id] = 0; |
| } |
| cur_pb->pb_stats->num_pins_of_net_in_pb[net_id]++; |
| cur_pb = cur_pb->parent_pb; |
| } |
| } |
| |
| /*****************************************/ |
| static void update_total_gain(float alpha, float beta, bool timing_driven, bool connection_driven, t_pb* pb) { |
| /*Updates the total gain array to reflect the desired tradeoff between* |
| *input sharing (sharinggain) and path_length minimization (timinggain)*/ |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| t_pb* cur_pb = pb; |
| while (cur_pb) { |
| for (AtomBlockId blk_id : cur_pb->pb_stats->marked_blocks) { |
| if (cur_pb->pb_stats->connectiongain.count(blk_id) == 0) { |
| cur_pb->pb_stats->connectiongain[blk_id] = 0; |
| } |
| if (cur_pb->pb_stats->sharinggain.count(blk_id) == 0) { |
| cur_pb->pb_stats->sharinggain[blk_id] = 0; |
| } |
| |
| /* Todo: This was used to explore different normalization options, can |
| * be made more efficient once we decide on which one to use*/ |
| int num_used_input_pins = atom_ctx.nlist.block_input_pins(blk_id).size(); |
| int num_used_output_pins = atom_ctx.nlist.block_output_pins(blk_id).size(); |
| /* end todo */ |
| |
| /* Calculate area-only cost function */ |
| int num_used_pins = num_used_input_pins + num_used_output_pins; |
| VTR_ASSERT(num_used_pins > 0); |
| if (connection_driven) { |
| /*try to absorb as many connections as possible*/ |
| cur_pb->pb_stats->gain[blk_id] = ((1 - beta) |
| * (float)cur_pb->pb_stats->sharinggain[blk_id] |
| + beta * (float)cur_pb->pb_stats->connectiongain[blk_id]) |
| / (num_used_pins); |
| } else { |
| cur_pb->pb_stats->gain[blk_id] = ((float)cur_pb->pb_stats->sharinggain[blk_id]) |
| / (num_used_pins); |
| } |
| |
| /* Add in timing driven cost into cost function */ |
| if (timing_driven) { |
| cur_pb->pb_stats->gain[blk_id] = alpha |
| * cur_pb->pb_stats->timinggain[blk_id] |
| + (1.0 - alpha) * (float)cur_pb->pb_stats->gain[blk_id]; |
| } |
| } |
| cur_pb = cur_pb->parent_pb; |
| } |
| } |
| |
| /*****************************************/ |
| static void update_cluster_stats(const t_pack_molecule* molecule, |
| const ClusterBlockId clb_index, |
| const std::unordered_set<AtomNetId>& is_clock, |
| const std::unordered_set<AtomNetId>& is_global, |
| const bool global_clocks, |
| const float alpha, |
| const float beta, |
| const bool timing_driven, |
| const bool connection_driven, |
| const int high_fanout_net_threshold, |
| const SetupTimingInfo& timing_info) { |
| /* Updates cluster stats such as gain, used pins, and clock structures. */ |
| |
| int molecule_size; |
| int iblock; |
| t_pb *cur_pb, *cb; |
| |
| /* TODO: what a scary comment from Vaughn, we'll have to watch out for this causing problems */ |
| |
| /* Output can be open so the check is necessary. I don't change * |
| * the gain for clock outputs when clocks are globally distributed * |
| * because I assume there is no real need to pack similarly clocked * |
| * FFs together then. Note that by updating the gain when the * |
| * clock driver is placed in a cluster implies that the output of * |
| * LUTs can be connected to clock inputs internally. Probably not * |
| * true, but it doesn't make much difference, since it will still * |
| * make local routing of this clock very short, and none of my * |
| * benchmarks actually generate local clocks (all come from pads). */ |
| |
| auto& atom_ctx = g_vpr_ctx.mutable_atom(); |
| molecule_size = get_array_size_of_molecule(molecule); |
| cb = nullptr; |
| |
| for (iblock = 0; iblock < molecule_size; iblock++) { |
| auto blk_id = molecule->atom_block_ids[iblock]; |
| if (!blk_id) { |
| continue; |
| } |
| |
| //Update atom netlist mapping |
| atom_ctx.lookup.set_atom_clb(blk_id, clb_index); |
| |
| const t_pb* atom_pb = atom_ctx.lookup.atom_pb(blk_id); |
| VTR_ASSERT(atom_pb); |
| |
| cur_pb = atom_pb->parent_pb; |
| while (cur_pb) { |
| /* reset list of feasible blocks */ |
| if (cur_pb->is_root()) { |
| cb = cur_pb; |
| } |
| cur_pb->pb_stats->num_feasible_blocks = NOT_VALID; |
| cur_pb->pb_stats->num_child_blocks_in_pb++; |
| cur_pb = cur_pb->parent_pb; |
| } |
| |
| /* Outputs first */ |
| for (auto pin_id : atom_ctx.nlist.block_output_pins(blk_id)) { |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); |
| if (!is_clock.count(net_id) || !global_clocks) { |
| mark_and_update_partial_gain(net_id, GAIN, blk_id, |
| timing_driven, |
| connection_driven, OUTPUT, |
| timing_info, |
| is_global, |
| high_fanout_net_threshold); |
| } else { |
| mark_and_update_partial_gain(net_id, NO_GAIN, blk_id, |
| timing_driven, |
| connection_driven, OUTPUT, |
| timing_info, |
| is_global, |
| high_fanout_net_threshold); |
| } |
| } |
| |
| /* Next Inputs */ |
| for (auto pin_id : atom_ctx.nlist.block_input_pins(blk_id)) { |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); |
| mark_and_update_partial_gain(net_id, GAIN, blk_id, |
| timing_driven, connection_driven, |
| INPUT, |
| timing_info, |
| is_global, |
| high_fanout_net_threshold); |
| } |
| |
| /* Finally Clocks */ |
| for (auto pin_id : atom_ctx.nlist.block_clock_pins(blk_id)) { |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); |
| if (global_clocks) { |
| mark_and_update_partial_gain(net_id, NO_GAIN, blk_id, |
| timing_driven, connection_driven, INPUT, |
| timing_info, |
| is_global, |
| high_fanout_net_threshold); |
| } else { |
| mark_and_update_partial_gain(net_id, GAIN, blk_id, |
| timing_driven, connection_driven, INPUT, |
| timing_info, |
| is_global, |
| high_fanout_net_threshold); |
| } |
| } |
| |
| update_total_gain(alpha, beta, timing_driven, connection_driven, |
| atom_pb->parent_pb); |
| |
| commit_lookahead_pins_used(cb); |
| } |
| |
| // if this molecule came from the transitive fanout candidates remove it |
| if (cb) { |
| cb->pb_stats->transitive_fanout_candidates.erase(molecule->atom_block_ids[molecule->root]); |
| cb->pb_stats->explore_transitive_fanout = true; |
| } |
| } |
| |
| static void start_new_cluster(t_cluster_placement_stats* cluster_placement_stats, |
| t_pb_graph_node** primitives_list, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| ClusterBlockId clb_index, |
| t_pack_molecule* molecule, |
| std::map<t_logical_block_type_ptr, size_t>& num_used_type_instances, |
| const float target_device_utilization, |
| const int num_models, |
| const int max_cluster_size, |
| const t_arch* arch, |
| std::string device_layout_name, |
| std::vector<t_lb_type_rr_node>* lb_type_rr_graphs, |
| t_lb_router_data** router_data, |
| const int detailed_routing_stage, |
| ClusteredNetlist* clb_nlist, |
| const std::map<const t_model*, std::vector<t_logical_block_type_ptr>>& primitive_candidate_block_types, |
| int verbosity, |
| bool enable_pin_feasibility_filter, |
| bool balance_block_type_utilization, |
| const int feasible_block_array_size) { |
| /* Given a starting seed block, start_new_cluster determines the next cluster type to use |
| * It expands the FPGA if it cannot find a legal cluster for the atom block |
| */ |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| auto& device_ctx = g_vpr_ctx.mutable_device(); |
| |
| /* Allocate a dummy initial cluster and load a atom block as a seed and check if it is legal */ |
| AtomBlockId root_atom = molecule->atom_block_ids[molecule->root]; |
| const std::string& root_atom_name = atom_ctx.nlist.block_name(root_atom); |
| const t_model* root_model = atom_ctx.nlist.block_model(root_atom); |
| |
| auto itr = primitive_candidate_block_types.find(root_model); |
| VTR_ASSERT(itr != primitive_candidate_block_types.end()); |
| std::vector<t_logical_block_type_ptr> candidate_types = itr->second; |
| |
| if (balance_block_type_utilization) { |
| //We sort the candidate types in ascending order by their current utilization. |
| //This means that the packer will prefer to use types with lower utilization. |
| //This is a naive approach to try balancing utilization when multiple types can |
| //support the same primitive(s). |
| std::stable_sort(candidate_types.begin(), candidate_types.end(), |
| [&](t_logical_block_type_ptr lhs, t_logical_block_type_ptr rhs) { |
| float lhs_util = vtr::safe_ratio<float>(num_used_type_instances[lhs], device_ctx.grid.num_instances(physical_tile_type(lhs))); |
| float rhs_util = vtr::safe_ratio<float>(num_used_type_instances[rhs], device_ctx.grid.num_instances(physical_tile_type(rhs))); |
| //Lower util first |
| return lhs_util < rhs_util; |
| }); |
| } |
| |
| if (verbosity > 2) { |
| VTR_LOG("\tSeed: '%s' (%s)", root_atom_name.c_str(), root_model->name); |
| VTR_LOGV(molecule->pack_pattern, " molecule_type %s molecule_size %zu", |
| molecule->pack_pattern->name, molecule->atom_block_ids.size()); |
| VTR_LOG("\n"); |
| } |
| |
| //Try packing into each candidate type |
| bool success = false; |
| for (size_t i = 0; i < candidate_types.size(); i++) { |
| auto type = candidate_types[i]; |
| |
| t_pb* pb = new t_pb; |
| pb->pb_graph_node = type->pb_graph_head; |
| alloc_and_load_pb_stats(pb, feasible_block_array_size); |
| pb->parent_pb = nullptr; |
| |
| *router_data = alloc_and_load_router_data(&lb_type_rr_graphs[type->index], type); |
| |
| //Try packing into each mode |
| e_block_pack_status pack_result = BLK_STATUS_UNDEFINED; |
| for (int j = 0; j < type->pb_graph_head->pb_type->num_modes && !success; j++) { |
| pb->mode = j; |
| |
| reset_cluster_placement_stats(&cluster_placement_stats[type->index]); |
| set_mode_cluster_placement_stats(pb->pb_graph_node, j); |
| |
| //Note that since we are starting a new cluster, we use FULL_EXTERNAL_PIN_UTIL, |
| //which allows all cluster pins to be used. This ensures that if we have a large |
| //molecule which would otherwise exceed the external pin utilization targets it |
| //can use the full set of cluster pins when selected as the seed block -- ensuring |
| //it is still implementable. |
| pack_result = try_pack_molecule(&cluster_placement_stats[type->index], |
| atom_molecules, |
| molecule, primitives_list, pb, |
| num_models, max_cluster_size, clb_index, |
| detailed_routing_stage, *router_data, |
| verbosity, |
| enable_pin_feasibility_filter, |
| feasible_block_array_size, |
| FULL_EXTERNAL_PIN_UTIL); |
| |
| success = (pack_result == BLK_PASSED); |
| } |
| |
| if (success) { |
| VTR_LOGV(verbosity > 2, "\tPASSED_SEED: Block Type %s\n", type->name); |
| //Once clustering succeeds, add it to the clb netlist |
| if (pb->name != nullptr) { |
| free(pb->name); |
| } |
| pb->name = vtr::strdup(root_atom_name.c_str()); |
| clb_index = clb_nlist->create_block(root_atom_name.c_str(), pb, type); |
| break; |
| } else { |
| VTR_LOGV(verbosity > 2, "\tFAILED_SEED: Block Type %s\n", type->name); |
| //Free failed clustering and try again |
| free_router_data(*router_data); |
| free_pb(pb); |
| delete pb; |
| *router_data = nullptr; |
| } |
| } |
| |
| if (!success) { |
| //Explored all candidates |
| if (molecule->type == MOLECULE_FORCED_PACK) { |
| VPR_FATAL_ERROR(VPR_ERROR_PACK, |
| "Can not find any logic block that can implement molecule.\n" |
| "\tPattern %s %s\n", |
| molecule->pack_pattern->name, |
| root_atom_name.c_str()); |
| } else { |
| VPR_FATAL_ERROR(VPR_ERROR_PACK, |
| "Can not find any logic block that can implement molecule.\n" |
| "\tAtom %s (%s)\n", |
| root_atom_name.c_str(), root_model->name); |
| } |
| } |
| |
| VTR_ASSERT(success); |
| |
| //Successfully create cluster |
| num_used_type_instances[clb_nlist->block_type(clb_index)]++; |
| |
| /* Expand FPGA size if needed */ |
| if (num_used_type_instances[clb_nlist->block_type(clb_index)] > device_ctx.grid.num_instances(physical_tile_type(clb_index))) { |
| device_ctx.grid = create_device_grid(device_layout_name, arch->grid_layouts, num_used_type_instances, target_device_utilization); |
| VTR_LOGV(verbosity > 0, "Not enough resources expand FPGA size to (%d x %d)\n", |
| device_ctx.grid.width(), device_ctx.grid.height()); |
| } |
| } |
| |
| /* |
| * Get candidate molecule to pack into currently open cluster |
| * Molecule selection priority: |
| * 1. Find unpacked molecule based on criticality and strong connectedness (connected by low fanout nets) with current cluster |
| * 2. Find unpacked molecule based on transitive connections (eg. 2 hops away) with current cluster |
| * 3. Find unpacked molecule based on weak connectedness (connected by high fanout nets) with current cluster |
| */ |
| static t_pack_molecule* get_highest_gain_molecule(t_pb* cur_pb, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| const enum e_gain_type gain_mode, |
| t_cluster_placement_stats* cluster_placement_stats_ptr, |
| vtr::vector<ClusterBlockId, std::vector<AtomNetId>>& clb_inter_blk_nets, |
| const ClusterBlockId cluster_index, |
| bool prioritize_transitive_connectivity, |
| int transitive_fanout_threshold, |
| const int feasible_block_array_size) { |
| /* This routine populates a list of feasible blocks outside the cluster then returns the best one for the list * |
| * not currently in a cluster and satisfies the feasibility * |
| * function passed in as is_feasible. If there are no feasible * |
| * blocks it returns ClusterBlockId::INVALID(). */ |
| |
| if (gain_mode == HILL_CLIMBING) { |
| VPR_FATAL_ERROR(VPR_ERROR_PACK, |
| "Hill climbing not supported yet, error out.\n"); |
| } |
| |
| // 1. Find unpacked molecule based on criticality and strong connectedness (connected by low fanout nets) with current cluster |
| if (cur_pb->pb_stats->num_feasible_blocks == NOT_VALID) { |
| add_cluster_molecule_candidates_by_connectivity_and_timing(cur_pb, cluster_placement_stats_ptr, atom_molecules, feasible_block_array_size); |
| } |
| |
| if (prioritize_transitive_connectivity) { |
| // 2. Find unpacked molecule based on transitive connections (eg. 2 hops away) with current cluster |
| if (cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->explore_transitive_fanout) { |
| add_cluster_molecule_candidates_by_transitive_connectivity(cur_pb, cluster_placement_stats_ptr, atom_molecules, clb_inter_blk_nets, |
| cluster_index, transitive_fanout_threshold, feasible_block_array_size); |
| } |
| |
| // 3. Find unpacked molecule based on weak connectedness (connected by high fanout nets) with current cluster |
| if (cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->tie_break_high_fanout_net) { |
| add_cluster_molecule_candidates_by_highfanout_connectivity(cur_pb, cluster_placement_stats_ptr, atom_molecules, feasible_block_array_size); |
| } |
| } else { //Reverse order |
| // 3. Find unpacked molecule based on weak connectedness (connected by high fanout nets) with current cluster |
| if (cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->tie_break_high_fanout_net) { |
| add_cluster_molecule_candidates_by_highfanout_connectivity(cur_pb, cluster_placement_stats_ptr, atom_molecules, feasible_block_array_size); |
| } |
| |
| // 2. Find unpacked molecule based on transitive connections (eg. 2 hops away) with current cluster |
| if (cur_pb->pb_stats->num_feasible_blocks == 0 && cur_pb->pb_stats->explore_transitive_fanout) { |
| add_cluster_molecule_candidates_by_transitive_connectivity(cur_pb, cluster_placement_stats_ptr, atom_molecules, clb_inter_blk_nets, |
| cluster_index, transitive_fanout_threshold, feasible_block_array_size); |
| } |
| } |
| |
| /* Grab highest gain molecule */ |
| t_pack_molecule* molecule = nullptr; |
| if (cur_pb->pb_stats->num_feasible_blocks > 0) { |
| cur_pb->pb_stats->num_feasible_blocks--; |
| int index = cur_pb->pb_stats->num_feasible_blocks; |
| molecule = cur_pb->pb_stats->feasible_blocks[index]; |
| VTR_ASSERT(molecule->valid == true); |
| return molecule; |
| } |
| |
| return molecule; |
| } |
| |
| void add_cluster_molecule_candidates_by_connectivity_and_timing(t_pb* cur_pb, |
| t_cluster_placement_stats* cluster_placement_stats_ptr, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| const int feasible_block_array_size) { |
| VTR_ASSERT(cur_pb->pb_stats->num_feasible_blocks == NOT_VALID); |
| |
| cur_pb->pb_stats->num_feasible_blocks = 0; |
| cur_pb->pb_stats->explore_transitive_fanout = true; /* If no legal molecules found, enable exploration of molecules two hops away */ |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| for (AtomBlockId blk_id : cur_pb->pb_stats->marked_blocks) { |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { |
| auto rng = atom_molecules.equal_range(blk_id); |
| for (const auto& kv : vtr::make_range(rng.first, rng.second)) { |
| t_pack_molecule* molecule = kv.second; |
| if (molecule->valid) { |
| bool success = true; |
| for (int j = 0; j < get_array_size_of_molecule(molecule); j++) { |
| if (molecule->atom_block_ids[j]) { |
| VTR_ASSERT(atom_ctx.lookup.atom_clb(molecule->atom_block_ids[j]) == ClusterBlockId::INVALID()); |
| auto blk_id2 = molecule->atom_block_ids[j]; |
| if (!exists_free_primitive_for_atom_block(cluster_placement_stats_ptr, blk_id2)) { |
| /* TODO: debating whether to check if placement exists for molecule |
| * (more robust) or individual atom blocks (faster) */ |
| success = false; |
| break; |
| } |
| } |
| } |
| if (success) { |
| add_molecule_to_pb_stats_candidates(molecule, |
| cur_pb->pb_stats->gain, cur_pb, feasible_block_array_size); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| void add_cluster_molecule_candidates_by_highfanout_connectivity(t_pb* cur_pb, |
| t_cluster_placement_stats* cluster_placement_stats_ptr, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| const int feasible_block_array_size) { |
| /* Because the packer ignores high fanout nets when marking what blocks |
| * to consider, use one of the ignored high fanout net to fill up lightly |
| * related blocks */ |
| reset_tried_but_unused_cluster_placements(cluster_placement_stats_ptr); |
| |
| AtomNetId net_id = cur_pb->pb_stats->tie_break_high_fanout_net; |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| int count = 0; |
| for (auto pin_id : atom_ctx.nlist.net_pins(net_id)) { |
| if (count >= AAPACK_MAX_HIGH_FANOUT_EXPLORE) { |
| break; |
| } |
| |
| AtomBlockId blk_id = atom_ctx.nlist.pin_block(pin_id); |
| |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { |
| auto rng = atom_molecules.equal_range(blk_id); |
| for (const auto& kv : vtr::make_range(rng.first, rng.second)) { |
| t_pack_molecule* molecule = kv.second; |
| if (molecule->valid) { |
| bool success = true; |
| for (int j = 0; j < get_array_size_of_molecule(molecule); j++) { |
| if (molecule->atom_block_ids[j]) { |
| VTR_ASSERT(atom_ctx.lookup.atom_clb(molecule->atom_block_ids[j]) == ClusterBlockId::INVALID()); |
| auto blk_id2 = molecule->atom_block_ids[j]; |
| if (!exists_free_primitive_for_atom_block(cluster_placement_stats_ptr, blk_id2)) { |
| /* TODO: debating whether to check if placement exists for molecule (more |
| * robust) or individual atom blocks (faster) */ |
| success = false; |
| break; |
| } |
| } |
| } |
| if (success) { |
| add_molecule_to_pb_stats_candidates(molecule, |
| cur_pb->pb_stats->gain, cur_pb, std::min(feasible_block_array_size, AAPACK_MAX_HIGH_FANOUT_EXPLORE)); |
| count++; |
| } |
| } |
| } |
| } |
| } |
| cur_pb->pb_stats->tie_break_high_fanout_net = AtomNetId::INVALID(); /* Mark off that this high fanout net has been considered */ |
| } |
| |
| void add_cluster_molecule_candidates_by_transitive_connectivity(t_pb* cur_pb, |
| t_cluster_placement_stats* cluster_placement_stats_ptr, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| vtr::vector<ClusterBlockId, std::vector<AtomNetId>>& clb_inter_blk_nets, |
| const ClusterBlockId cluster_index, |
| int transitive_fanout_threshold, |
| const int feasible_block_array_size) { |
| //TODO: For now, only done by fan-out; should also consider fan-in |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| cur_pb->pb_stats->explore_transitive_fanout = false; |
| |
| /* First time finding transitive fanout candidates therefore alloc and load them */ |
| load_transitive_fanout_candidates(cluster_index, |
| atom_molecules, |
| cur_pb->pb_stats, |
| clb_inter_blk_nets, |
| transitive_fanout_threshold); |
| /* Only consider candidates that pass a very simple legality check */ |
| for (const auto& transitive_candidate : cur_pb->pb_stats->transitive_fanout_candidates) { |
| t_pack_molecule* molecule = transitive_candidate.second; |
| if (molecule->valid) { |
| bool success = true; |
| for (int j = 0; j < get_array_size_of_molecule(molecule); j++) { |
| if (molecule->atom_block_ids[j]) { |
| VTR_ASSERT(atom_ctx.lookup.atom_clb(molecule->atom_block_ids[j]) == ClusterBlockId::INVALID()); |
| auto blk_id = molecule->atom_block_ids[j]; |
| if (!exists_free_primitive_for_atom_block(cluster_placement_stats_ptr, blk_id)) { |
| /* TODO: debating whether to check if placement exists for molecule (more |
| * robust) or individual atom blocks (faster) */ |
| success = false; |
| break; |
| } |
| } |
| } |
| if (success) { |
| add_molecule_to_pb_stats_candidates(molecule, |
| cur_pb->pb_stats->gain, cur_pb, std::min(feasible_block_array_size, AAPACK_MAX_TRANSITIVE_EXPLORE)); |
| } |
| } |
| } |
| } |
| |
| /*****************************************/ |
| static t_pack_molecule* get_molecule_for_cluster(t_pb* cur_pb, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| const bool allow_unrelated_clustering, |
| const bool prioritize_transitive_connectivity, |
| const int transitive_fanout_threshold, |
| const int feasible_block_array_size, |
| int* num_unrelated_clustering_attempts, |
| t_cluster_placement_stats* cluster_placement_stats_ptr, |
| vtr::vector<ClusterBlockId, std::vector<AtomNetId>>& clb_inter_blk_nets, |
| ClusterBlockId cluster_index, |
| int verbosity) { |
| /* Finds the block with the greatest gain that satisfies the |
| * input, clock and capacity constraints of a cluster that are |
| * passed in. If no suitable block is found it returns ClusterBlockId::INVALID(). |
| */ |
| |
| VTR_ASSERT(cur_pb->is_root()); |
| |
| /* If cannot pack into primitive, try packing into cluster */ |
| |
| auto best_molecule = get_highest_gain_molecule(cur_pb, atom_molecules, |
| NOT_HILL_CLIMBING, cluster_placement_stats_ptr, clb_inter_blk_nets, |
| cluster_index, prioritize_transitive_connectivity, |
| transitive_fanout_threshold, feasible_block_array_size); |
| |
| /* If no blocks have any gain to the current cluster, the code above * |
| * will not find anything. However, another atom block with no inputs in * |
| * common with the cluster may still be inserted into the cluster. */ |
| |
| if (allow_unrelated_clustering) { |
| if (best_molecule == nullptr) { |
| if (*num_unrelated_clustering_attempts == 0) { |
| best_molecule = get_free_molecule_with_most_ext_inputs_for_cluster(cur_pb, |
| cluster_placement_stats_ptr); |
| (*num_unrelated_clustering_attempts)++; |
| VTR_LOGV(best_molecule && verbosity > 2, "\tFound unrelated molecule to cluster\n"); |
| } |
| } else { |
| *num_unrelated_clustering_attempts = 0; |
| } |
| } else { |
| VTR_LOGV(!best_molecule && verbosity > 2, "\tNo related molecule found and unrelated clustering disabled\n"); |
| } |
| |
| return best_molecule; |
| } |
| |
| /* TODO: Add more error checking! */ |
| static void check_clustering() { |
| std::unordered_set<AtomBlockId> atoms_checked; |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| auto& cluster_ctx = g_vpr_ctx.clustering(); |
| |
| if (cluster_ctx.clb_nlist.blocks().size() == 0) { |
| VTR_LOG_WARN("Packing produced no clustered blocks"); |
| } |
| |
| /* |
| * Check that each atom block connects to one physical primitive and that the primitive links up to the parent clb |
| */ |
| for (auto blk_id : atom_ctx.nlist.blocks()) { |
| //Each atom should be part of a pb |
| const t_pb* atom_pb = atom_ctx.lookup.atom_pb(blk_id); |
| if (!atom_pb) { |
| VPR_FATAL_ERROR(VPR_ERROR_PACK, |
| "Atom block %s is not mapped to a pb\n", |
| atom_ctx.nlist.block_name(blk_id).c_str()); |
| } |
| |
| //Check the reverse mapping is consistent |
| if (atom_ctx.lookup.pb_atom(atom_pb) != blk_id) { |
| VPR_FATAL_ERROR(VPR_ERROR_PACK, |
| "pb %s does not contain atom block %s but atom block %s maps to pb.\n", |
| atom_pb->name, |
| atom_ctx.nlist.block_name(blk_id).c_str(), |
| atom_ctx.nlist.block_name(blk_id).c_str()); |
| } |
| |
| VTR_ASSERT(atom_ctx.nlist.block_name(blk_id) == atom_pb->name); |
| |
| const t_pb* cur_pb = atom_pb; |
| while (cur_pb->parent_pb) { |
| cur_pb = cur_pb->parent_pb; |
| VTR_ASSERT(cur_pb->name); |
| } |
| |
| ClusterBlockId clb_index = atom_ctx.lookup.atom_clb(blk_id); |
| if (clb_index == ClusterBlockId::INVALID()) { |
| VPR_FATAL_ERROR(VPR_ERROR_PACK, |
| "Atom %s is not mapped to a CLB\n", |
| atom_ctx.nlist.block_name(blk_id).c_str()); |
| } |
| |
| if (cur_pb != cluster_ctx.clb_nlist.block_pb(clb_index)) { |
| VPR_FATAL_ERROR(VPR_ERROR_PACK, |
| "CLB %s does not match CLB contained by pb %s.\n", |
| cur_pb->name, atom_pb->name); |
| } |
| } |
| |
| /* Check that I do not have spurious links in children pbs */ |
| for (auto blk_id : cluster_ctx.clb_nlist.blocks()) { |
| check_cluster_atom_blocks(cluster_ctx.clb_nlist.block_pb(blk_id), atoms_checked); |
| } |
| |
| for (auto blk_id : atom_ctx.nlist.blocks()) { |
| if (!atoms_checked.count(blk_id)) { |
| VPR_FATAL_ERROR(VPR_ERROR_PACK, |
| "Atom block %s not found in any cluster.\n", |
| atom_ctx.nlist.block_name(blk_id).c_str()); |
| } |
| } |
| } |
| |
| /* TODO: May want to check that all atom blocks are actually reached */ |
| static void check_cluster_atom_blocks(t_pb* pb, std::unordered_set<AtomBlockId>& blocks_checked) { |
| int i, j; |
| const t_pb_type* pb_type; |
| bool has_child = false; |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| pb_type = pb->pb_graph_node->pb_type; |
| if (pb_type->num_modes == 0) { |
| /* primitive */ |
| auto blk_id = atom_ctx.lookup.pb_atom(pb); |
| if (blk_id) { |
| if (blocks_checked.count(blk_id)) { |
| VPR_FATAL_ERROR(VPR_ERROR_PACK, |
| "pb %s contains atom block %s but atom block is already contained in another pb.\n", |
| pb->name, atom_ctx.nlist.block_name(blk_id).c_str()); |
| } |
| blocks_checked.insert(blk_id); |
| if (pb != atom_ctx.lookup.atom_pb(blk_id)) { |
| VPR_FATAL_ERROR(VPR_ERROR_PACK, |
| "pb %s contains atom block %s but atom block does not link to pb.\n", |
| pb->name, atom_ctx.nlist.block_name(blk_id).c_str()); |
| } |
| } |
| } else { |
| /* this is a container pb, all container pbs must contain children */ |
| for (i = 0; i < pb_type->modes[pb->mode].num_pb_type_children; i++) { |
| for (j = 0; j < pb_type->modes[pb->mode].pb_type_children[i].num_pb; j++) { |
| if (pb->child_pbs[i] != nullptr) { |
| if (pb->child_pbs[i][j].name != nullptr) { |
| has_child = true; |
| check_cluster_atom_blocks(&pb->child_pbs[i][j], blocks_checked); |
| } |
| } |
| } |
| } |
| VTR_ASSERT(has_child); |
| } |
| } |
| |
| static void mark_all_molecules_valid(t_pack_molecule* molecule_head) { |
| for (auto cur_molecule = molecule_head; cur_molecule != nullptr; cur_molecule = cur_molecule->next) { |
| cur_molecule->valid = true; |
| } |
| } |
| |
| static int count_molecules(t_pack_molecule* molecule_head) { |
| int num_molecules = 0; |
| for (auto cur_molecule = molecule_head; cur_molecule != nullptr; cur_molecule = cur_molecule->next) { |
| ++num_molecules; |
| } |
| return num_molecules; |
| } |
| |
| //Calculates molecule statistics for a single molecule |
| static t_molecule_stats calc_molecule_stats(const t_pack_molecule* molecule) { |
| t_molecule_stats molecule_stats; |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| //Calculate the number of available pins on primitives within the molecule |
| for (auto blk : molecule->atom_block_ids) { |
| if (!blk) continue; |
| |
| ++molecule_stats.num_blocks; //Record number of valid blocks in molecule |
| |
| const t_model* model = atom_ctx.nlist.block_model(blk); |
| |
| for (const t_model_ports* input_port = model->inputs; input_port != nullptr; input_port = input_port->next) { |
| molecule_stats.num_input_pins += input_port->size; |
| } |
| |
| for (const t_model_ports* output_port = model->outputs; output_port != nullptr; output_port = output_port->next) { |
| molecule_stats.num_output_pins += output_port->size; |
| } |
| } |
| molecule_stats.num_pins = molecule_stats.num_input_pins + molecule_stats.num_output_pins; |
| |
| //Calculate the number of externally used pins |
| std::set<AtomBlockId> molecule_atoms(molecule->atom_block_ids.begin(), molecule->atom_block_ids.end()); |
| for (auto blk : molecule->atom_block_ids) { |
| if (!blk) continue; |
| |
| for (auto pin : atom_ctx.nlist.block_pins(blk)) { |
| auto net = atom_ctx.nlist.pin_net(pin); |
| |
| auto pin_type = atom_ctx.nlist.pin_type(pin); |
| if (pin_type == PinType::SINK) { |
| auto driver_blk = atom_ctx.nlist.net_driver_block(net); |
| |
| if (molecule_atoms.count(driver_blk)) { |
| //Pin driven by a block within the molecule |
| //Does not count as an external connection |
| } else { |
| //Pin driven by a block outside the molecule |
| ++molecule_stats.num_used_ext_inputs; |
| } |
| |
| } else { |
| VTR_ASSERT(pin_type == PinType::DRIVER); |
| |
| bool net_leaves_molecule = false; |
| for (auto sink_pin : atom_ctx.nlist.net_sinks(net)) { |
| auto sink_blk = atom_ctx.nlist.pin_block(sink_pin); |
| |
| if (!molecule_atoms.count(sink_blk)) { |
| //There is at least one sink outside of the current molecule |
| net_leaves_molecule = true; |
| break; |
| } |
| } |
| |
| //We assume that any fanout occurs outside of the molecule, hence we only |
| //count one used output (even if there are multiple sinks outside the molecule) |
| if (net_leaves_molecule) { |
| ++molecule_stats.num_used_ext_outputs; |
| } |
| } |
| } |
| } |
| molecule_stats.num_used_ext_pins = molecule_stats.num_used_ext_inputs + molecule_stats.num_used_ext_outputs; |
| |
| return molecule_stats; |
| } |
| |
| //Calculates maximum molecule statistics accross all molecules in linked list |
| static t_molecule_stats calc_max_molecules_stats(const t_pack_molecule* molecule_head) { |
| t_molecule_stats max_molecules_stats; |
| |
| for (auto cur_molecule = molecule_head; cur_molecule != nullptr; cur_molecule = cur_molecule->next) { |
| //Calculate per-molecule statistics |
| t_molecule_stats cur_molecule_stats = calc_molecule_stats(cur_molecule); |
| |
| //Record the maximums (member-wise) over all molecules |
| max_molecules_stats.num_blocks = std::max(max_molecules_stats.num_blocks, cur_molecule_stats.num_blocks); |
| |
| max_molecules_stats.num_pins = std::max(max_molecules_stats.num_pins, cur_molecule_stats.num_pins); |
| max_molecules_stats.num_input_pins = std::max(max_molecules_stats.num_input_pins, cur_molecule_stats.num_input_pins); |
| max_molecules_stats.num_output_pins = std::max(max_molecules_stats.num_output_pins, cur_molecule_stats.num_output_pins); |
| |
| max_molecules_stats.num_used_ext_pins = std::max(max_molecules_stats.num_used_ext_pins, cur_molecule_stats.num_used_ext_pins); |
| max_molecules_stats.num_used_ext_inputs = std::max(max_molecules_stats.num_used_ext_inputs, cur_molecule_stats.num_used_ext_inputs); |
| max_molecules_stats.num_used_ext_outputs = std::max(max_molecules_stats.num_used_ext_outputs, cur_molecule_stats.num_used_ext_outputs); |
| } |
| |
| return max_molecules_stats; |
| } |
| |
| static std::vector<AtomBlockId> initialize_seed_atoms(const e_cluster_seed seed_type, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| const t_molecule_stats& max_molecule_stats, |
| const vtr::vector<AtomBlockId, float>& atom_criticality) { |
| std::vector<AtomBlockId> seed_atoms; |
| |
| //Put all atoms in seed list |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| for (auto blk : atom_ctx.nlist.blocks()) { |
| seed_atoms.emplace_back(blk); |
| } |
| |
| //Initially all gains are zero |
| vtr::vector<AtomBlockId, float> atom_gains(atom_ctx.nlist.blocks().size(), 0.); |
| |
| if (seed_type == e_cluster_seed::TIMING) { |
| VTR_ASSERT(atom_gains.size() == atom_criticality.size()); |
| |
| //By criticality |
| atom_gains = atom_criticality; |
| |
| } else if (seed_type == e_cluster_seed::MAX_INPUTS) { |
| //By number of used molecule input pins |
| for (auto blk : atom_ctx.nlist.blocks()) { |
| int max_molecule_inputs = 0; |
| auto molecule_rng = atom_molecules.equal_range(blk); |
| for (const auto& kv : vtr::make_range(molecule_rng.first, molecule_rng.second)) { |
| const t_pack_molecule* blk_mol = kv.second; |
| |
| const t_molecule_stats molecule_stats = calc_molecule_stats(blk_mol); |
| |
| //Keep the max over all molecules associated with the atom |
| max_molecule_inputs = std::max(max_molecule_inputs, molecule_stats.num_used_ext_inputs); |
| } |
| |
| atom_gains[blk] = max_molecule_inputs; |
| } |
| |
| } else if (seed_type == e_cluster_seed::BLEND) { |
| //By blended gain (criticality and inputs used) |
| for (auto blk : atom_ctx.nlist.blocks()) { |
| /* Score seed gain of each block as a weighted sum of timing criticality, |
| * number of tightly coupled blocks connected to it, and number of external inputs */ |
| float seed_blend_fac = 0.5; |
| float max_blend_gain = 0; |
| |
| auto molecule_rng = atom_molecules.equal_range(blk); |
| for (const auto& kv : vtr::make_range(molecule_rng.first, molecule_rng.second)) { |
| const t_pack_molecule* blk_mol = kv.second; |
| |
| const t_molecule_stats molecule_stats = calc_molecule_stats(blk_mol); |
| |
| VTR_ASSERT(max_molecule_stats.num_used_ext_inputs > 0); |
| |
| float blend_gain = (seed_blend_fac * atom_criticality[blk] |
| + (1 - seed_blend_fac) * (molecule_stats.num_used_ext_inputs / max_molecule_stats.num_used_ext_inputs)); |
| blend_gain *= (1 + 0.2 * (molecule_stats.num_blocks - 1)); |
| |
| //Keep the max over all molecules associated with the atom |
| max_blend_gain = std::max(max_blend_gain, blend_gain); |
| } |
| atom_gains[blk] = max_blend_gain; |
| } |
| |
| } else if (seed_type == e_cluster_seed::MAX_PINS || seed_type == e_cluster_seed::MAX_INPUT_PINS) { |
| //By pins per molecule (i.e. available pins on primitives, not pins in use) |
| |
| for (auto blk : atom_ctx.nlist.blocks()) { |
| int max_molecule_pins = 0; |
| auto molecule_rng = atom_molecules.equal_range(blk); |
| for (const auto& kv : vtr::make_range(molecule_rng.first, molecule_rng.second)) { |
| const t_pack_molecule* mol = kv.second; |
| |
| const t_molecule_stats molecule_stats = calc_molecule_stats(mol); |
| |
| //Keep the max over all molecules associated with the atom |
| int molecule_pins = 0; |
| if (seed_type == e_cluster_seed::MAX_PINS) { |
| //All pins |
| molecule_pins = molecule_stats.num_pins; |
| } else { |
| VTR_ASSERT(seed_type == e_cluster_seed::MAX_INPUT_PINS); |
| //Input pins only |
| molecule_pins = molecule_stats.num_input_pins; |
| } |
| |
| //Keep the max over all molecules associated with the atom |
| max_molecule_pins = std::max(max_molecule_pins, molecule_pins); |
| } |
| atom_gains[blk] = max_molecule_pins; |
| } |
| |
| } else if (seed_type == e_cluster_seed::BLEND2) { |
| for (auto blk : atom_ctx.nlist.blocks()) { |
| float max_gain = 0; |
| auto molecule_rng = atom_molecules.equal_range(blk); |
| for (const auto& kv : vtr::make_range(molecule_rng.first, molecule_rng.second)) { |
| const t_pack_molecule* mol = kv.second; |
| |
| const t_molecule_stats molecule_stats = calc_molecule_stats(mol); |
| |
| float pin_ratio = vtr::safe_ratio<float>(molecule_stats.num_pins, max_molecule_stats.num_pins); |
| float input_pin_ratio = vtr::safe_ratio<float>(molecule_stats.num_input_pins, max_molecule_stats.num_input_pins); |
| float output_pin_ratio = vtr::safe_ratio<float>(molecule_stats.num_output_pins, max_molecule_stats.num_output_pins); |
| float used_ext_pin_ratio = vtr::safe_ratio<float>(molecule_stats.num_used_ext_pins, max_molecule_stats.num_used_ext_pins); |
| float used_ext_input_pin_ratio = vtr::safe_ratio<float>(molecule_stats.num_used_ext_inputs, max_molecule_stats.num_used_ext_inputs); |
| float used_ext_output_pin_ratio = vtr::safe_ratio<float>(molecule_stats.num_used_ext_outputs, max_molecule_stats.num_used_ext_outputs); |
| float num_blocks_ratio = vtr::safe_ratio<float>(molecule_stats.num_blocks, max_molecule_stats.num_blocks); |
| float criticality = atom_criticality[blk]; |
| |
| constexpr float PIN_WEIGHT = 0.; |
| constexpr float INPUT_PIN_WEIGHT = 0.5; |
| constexpr float OUTPUT_PIN_WEIGHT = 0.; |
| constexpr float USED_PIN_WEIGHT = 0.; |
| constexpr float USED_INPUT_PIN_WEIGHT = 0.2; |
| constexpr float USED_OUTPUT_PIN_WEIGHT = 0.; |
| constexpr float BLOCKS_WEIGHT = 0.2; |
| constexpr float CRITICALITY_WEIGHT = 0.1; |
| |
| float gain = PIN_WEIGHT * pin_ratio |
| + INPUT_PIN_WEIGHT * input_pin_ratio |
| + OUTPUT_PIN_WEIGHT * output_pin_ratio |
| |
| + USED_PIN_WEIGHT * used_ext_pin_ratio |
| + USED_INPUT_PIN_WEIGHT * used_ext_input_pin_ratio |
| + USED_OUTPUT_PIN_WEIGHT * used_ext_output_pin_ratio |
| |
| + BLOCKS_WEIGHT * num_blocks_ratio |
| + CRITICALITY_WEIGHT * criticality; |
| |
| max_gain = std::max(max_gain, gain); |
| } |
| |
| atom_gains[blk] = max_gain; |
| } |
| |
| } else { |
| VPR_FATAL_ERROR(VPR_ERROR_PACK, "Unrecognized cluster seed type"); |
| } |
| |
| //Sort seeds in descending order of gain (i.e. highest gain first) |
| // |
| // Note that we use a *stable* sort here. It has been observed that different |
| // standard library implementations (e.g. gcc-4.9 vs gcc-5) use sorting algorithms |
| // which produce different orderings for seeds of equal gain (which is allowed with |
| // std::sort which does not specify how equal values are handled). Using a stable |
| // sort ensures that regardless of the underlying sorting algorithm the same seed |
| // order is produced regardless of compiler. |
| auto by_descending_gain = [&](const AtomBlockId lhs, const AtomBlockId rhs) { |
| return atom_gains[lhs] > atom_gains[rhs]; |
| }; |
| std::stable_sort(seed_atoms.begin(), seed_atoms.end(), by_descending_gain); |
| |
| if (getEchoEnabled() && isEchoFileEnabled(E_ECHO_CLUSTERING_BLOCK_CRITICALITIES)) { |
| print_seed_gains(getEchoFileName(E_ECHO_CLUSTERING_BLOCK_CRITICALITIES), seed_atoms, atom_gains, atom_criticality); |
| } |
| |
| return seed_atoms; |
| } |
| |
| static t_pack_molecule* get_highest_gain_seed_molecule(int* seedindex, const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, const std::vector<AtomBlockId> seed_atoms) { |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| while (*seedindex < static_cast<int>(seed_atoms.size())) { |
| AtomBlockId blk_id = seed_atoms[(*seedindex)++]; |
| |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { |
| t_pack_molecule* best = nullptr; |
| |
| auto rng = atom_molecules.equal_range(blk_id); |
| for (const auto& kv : vtr::make_range(rng.first, rng.second)) { |
| t_pack_molecule* molecule = kv.second; |
| if (molecule->valid) { |
| if (best == nullptr || (best->base_gain) < (molecule->base_gain)) { |
| best = molecule; |
| } |
| } |
| } |
| VTR_ASSERT(best != nullptr); |
| return best; |
| } |
| } |
| |
| /*if it makes it to here , there are no more blocks available*/ |
| return nullptr; |
| } |
| |
| /* get gain of packing molecule into current cluster |
| * gain is equal to: |
| * total_block_gain |
| * + molecule_base_gain*some_factor |
| * - introduced_input_nets_of_unrelated_blocks_pulled_in_by_molecule*some_other_factor |
| */ |
| static float get_molecule_gain(t_pack_molecule* molecule, std::map<AtomBlockId, float>& blk_gain) { |
| float gain; |
| int i; |
| int num_introduced_inputs_of_indirectly_related_block; |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| gain = 0; |
| num_introduced_inputs_of_indirectly_related_block = 0; |
| for (i = 0; i < get_array_size_of_molecule(molecule); i++) { |
| auto blk_id = molecule->atom_block_ids[i]; |
| if (blk_id) { |
| if (blk_gain.count(blk_id) > 0) { |
| gain += blk_gain[blk_id]; |
| } else { |
| /* This block has no connection with current cluster, penalize molecule for having this block |
| */ |
| for (auto pin_id : atom_ctx.nlist.block_input_pins(blk_id)) { |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); |
| VTR_ASSERT(net_id); |
| |
| auto driver_pin_id = atom_ctx.nlist.net_driver(net_id); |
| VTR_ASSERT(driver_pin_id); |
| |
| auto driver_blk_id = atom_ctx.nlist.pin_block(driver_pin_id); |
| |
| num_introduced_inputs_of_indirectly_related_block++; |
| for (int iblk = 0; iblk < get_array_size_of_molecule(molecule); iblk++) { |
| if (molecule->atom_block_ids[iblk] && driver_blk_id == molecule->atom_block_ids[iblk]) { |
| //valid block which is driver (and hence not an input) |
| num_introduced_inputs_of_indirectly_related_block--; |
| break; |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| gain += molecule->base_gain * 0.0001; /* Use base gain as tie breaker TODO: need to sweep this value and perhaps normalize */ |
| gain -= num_introduced_inputs_of_indirectly_related_block * (0.001); |
| |
| return gain; |
| } |
| |
| static int compare_molecule_gain(const void* a, const void* b) { |
| float base_gain_a, base_gain_b, diff; |
| const t_pack_molecule *molecule_a, *molecule_b; |
| molecule_a = (*(const t_pack_molecule* const*)a); |
| molecule_b = (*(const t_pack_molecule* const*)b); |
| |
| base_gain_a = molecule_a->base_gain; |
| base_gain_b = molecule_b->base_gain; |
| diff = base_gain_a - base_gain_b; |
| if (diff > 0) { |
| return 1; |
| } |
| if (diff < 0) { |
| return -1; |
| } |
| return 0; |
| } |
| |
| /* Determine if speculatively packed cur_pb is pin feasible |
| * Runtime is actually not that bad for this. It's worst case O(k^2) where k is the |
| * number of pb_graph pins. Can use hash tables or make incremental if becomes an issue. |
| */ |
| static void try_update_lookahead_pins_used(t_pb* cur_pb) { |
| int i, j; |
| const t_pb_type* pb_type = cur_pb->pb_graph_node->pb_type; |
| |
| // run recursively till a leaf (primitive) pb block is reached |
| if (pb_type->num_modes > 0 && cur_pb->name != nullptr) { |
| if (cur_pb->child_pbs != nullptr) { |
| for (i = 0; i < pb_type->modes[cur_pb->mode].num_pb_type_children; i++) { |
| if (cur_pb->child_pbs[i] != nullptr) { |
| for (j = 0; j < pb_type->modes[cur_pb->mode].pb_type_children[i].num_pb; j++) { |
| try_update_lookahead_pins_used(&cur_pb->child_pbs[i][j]); |
| } |
| } |
| } |
| } |
| } else { |
| // find if this child (primitive) pb block has an atom mapped to it, |
| // if yes compute and mark lookahead pins used for that pb block |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| AtomBlockId blk_id = atom_ctx.lookup.pb_atom(cur_pb); |
| if (pb_type->blif_model != nullptr && blk_id) { |
| compute_and_mark_lookahead_pins_used(blk_id); |
| } |
| } |
| } |
| |
| /* Resets nets used at different pin classes for determining pin feasibility */ |
| static void reset_lookahead_pins_used(t_pb* cur_pb) { |
| int i, j; |
| const t_pb_type* pb_type = cur_pb->pb_graph_node->pb_type; |
| if (cur_pb->pb_stats == nullptr) { |
| return; /* No pins used, no need to continue */ |
| } |
| |
| if (pb_type->num_modes > 0 && cur_pb->name != nullptr) { |
| for (i = 0; i < cur_pb->pb_graph_node->num_input_pin_class; i++) { |
| cur_pb->pb_stats->lookahead_input_pins_used[i].clear(); |
| } |
| |
| for (i = 0; i < cur_pb->pb_graph_node->num_output_pin_class; i++) { |
| cur_pb->pb_stats->lookahead_output_pins_used[i].clear(); |
| } |
| |
| if (cur_pb->child_pbs != nullptr) { |
| for (i = 0; i < pb_type->modes[cur_pb->mode].num_pb_type_children; i++) { |
| if (cur_pb->child_pbs[i] != nullptr) { |
| for (j = 0; j < pb_type->modes[cur_pb->mode].pb_type_children[i].num_pb; j++) { |
| reset_lookahead_pins_used(&cur_pb->child_pbs[i][j]); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /* Determine if pins of speculatively packed pb are legal */ |
| static void compute_and_mark_lookahead_pins_used(const AtomBlockId blk_id) { |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| const t_pb* cur_pb = atom_ctx.lookup.atom_pb(blk_id); |
| VTR_ASSERT(cur_pb != nullptr); |
| |
| /* Walk through inputs, outputs, and clocks marking pins off of the same class */ |
| for (auto pin_id : atom_ctx.nlist.block_pins(blk_id)) { |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); |
| |
| const t_pb_graph_pin* pb_graph_pin = find_pb_graph_pin(atom_ctx.nlist, atom_ctx.lookup, pin_id); |
| compute_and_mark_lookahead_pins_used_for_pin(pb_graph_pin, cur_pb, net_id); |
| } |
| } |
| |
| /** |
| * Given a pin and its assigned net, mark all pin classes that are affected. |
| * Check if connecting this pin to it's driver pin or to all sink pins will |
| * require leaving a pb_block starting from the parent pb_block of the |
| * primitive till the root block (depth = 0). If leaving a pb_block is |
| * required add this net to the pin class (to increment the number of used |
| * pins from this class) that should be used to leave the pb_block. |
| */ |
| static void compute_and_mark_lookahead_pins_used_for_pin(const t_pb_graph_pin* pb_graph_pin, const t_pb* primitive_pb, const AtomNetId net_id) { |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| // starting from the parent pb of the input primitive go up in the hierarchy till the root block |
| for (auto cur_pb = primitive_pb->parent_pb; cur_pb; cur_pb = cur_pb->parent_pb) { |
| const auto depth = cur_pb->pb_graph_node->pb_type->depth; |
| const auto pin_class = pb_graph_pin->parent_pin_class[depth]; |
| VTR_ASSERT(pin_class != OPEN); |
| |
| const auto driver_blk_id = atom_ctx.nlist.net_driver_block(net_id); |
| |
| // if this primitive pin is an input pin |
| if (pb_graph_pin->port->type == IN_PORT) { |
| /* find location of net driver if exist in clb, NULL otherwise */ |
| // find the driver of the input net connected to the pin being studied |
| const auto driver_pin_id = atom_ctx.nlist.net_driver(net_id); |
| // find the id of the atom occupying the input primitive_pb |
| const auto prim_blk_id = atom_ctx.lookup.pb_atom(primitive_pb); |
| // find the pb block occupied by the driving atom |
| const auto driver_pb = atom_ctx.lookup.atom_pb(driver_blk_id); |
| // pb_graph_pin driving net_id in the driver pb block |
| t_pb_graph_pin* output_pb_graph_pin = nullptr; |
| // if the driver block is in the same clb as the input primitive block |
| if (atom_ctx.lookup.atom_clb(driver_blk_id) == atom_ctx.lookup.atom_clb(prim_blk_id)) { |
| // get pb_graph_pin driving the given net |
| output_pb_graph_pin = get_driver_pb_graph_pin(driver_pb, driver_pin_id); |
| } |
| |
| bool is_reachable = false; |
| |
| // if the driver pin is within the cluster |
| if (output_pb_graph_pin) { |
| // find if the driver pin can reach the input pin of the primitive or not |
| const t_pb* check_pb = driver_pb; |
| while (check_pb && check_pb != cur_pb) { |
| check_pb = check_pb->parent_pb; |
| } |
| if (check_pb) { |
| for (int i = 0; i < output_pb_graph_pin->num_connectable_primitive_input_pins[depth]; i++) { |
| if (pb_graph_pin == output_pb_graph_pin->list_of_connectable_input_pin_ptrs[depth][i]) { |
| is_reachable = true; |
| break; |
| } |
| } |
| } |
| } |
| |
| // Must use an input pin to connect the driver to the input pin of the given primitive, either the |
| // driver atom is not contained in the cluster or is contained but cannot reach the primitive pin |
| if (!is_reachable) { |
| // add net to lookahead_input_pins_used if not already added |
| auto it = std::find(cur_pb->pb_stats->lookahead_input_pins_used[pin_class].begin(), |
| cur_pb->pb_stats->lookahead_input_pins_used[pin_class].end(), net_id); |
| if (it == cur_pb->pb_stats->lookahead_input_pins_used[pin_class].end()) { |
| cur_pb->pb_stats->lookahead_input_pins_used[pin_class].push_back(net_id); |
| } |
| } |
| } else { |
| VTR_ASSERT(pb_graph_pin->port->type == OUT_PORT); |
| /* |
| * Determine if this net (which is driven from within this cluster) leaves this cluster |
| * (and hence uses an output pin). |
| */ |
| |
| bool net_exits_cluster = true; |
| int num_net_sinks = static_cast<int>(atom_ctx.nlist.net_sinks(net_id).size()); |
| |
| if (pb_graph_pin->num_connectable_primitive_input_pins[depth] >= num_net_sinks) { |
| //It is possible the net is completely absorbed in the cluster, |
| //since this pin could (potentially) drive all the net's sinks |
| |
| /* Important: This runtime penalty looks a lot scarier than it really is. |
| * For high fan-out nets, I at most look at the number of pins within the |
| * cluster which limits runtime. |
| * |
| * DO NOT REMOVE THIS INITIAL FILTER WITHOUT CAREFUL ANALYSIS ON RUNTIME!!! |
| * |
| * Key Observation: |
| * For LUT-based designs it is impossible for the average fanout to exceed |
| * the number of LUT inputs so it's usually around 4-5 (pigeon-hole argument, |
| * if the average fanout is greater than the number of LUT inputs, where do |
| * the extra connections go? Therefore, average fanout must be capped to a |
| * small constant where the constant is equal to the number of LUT inputs). |
| * The real danger to runtime is when the number of sinks of a net gets doubled |
| */ |
| |
| //Check if all the net sinks are, in fact, inside this cluster |
| bool all_sinks_in_cur_cluster = true; |
| ClusterBlockId driver_clb = atom_ctx.lookup.atom_clb(driver_blk_id); |
| for (auto pin_id : atom_ctx.nlist.net_sinks(net_id)) { |
| auto sink_blk_id = atom_ctx.nlist.pin_block(pin_id); |
| if (atom_ctx.lookup.atom_clb(sink_blk_id) != driver_clb) { |
| all_sinks_in_cur_cluster = false; |
| break; |
| } |
| } |
| |
| if (all_sinks_in_cur_cluster) { |
| //All the sinks are part of this cluster, so the net may be fully absorbed. |
| // |
| //Verify this, by counting the number of net sinks reachable from the driver pin. |
| //If the count equals the number of net sinks then the net is fully absorbed and |
| //the net does not exit the cluster |
| /* TODO: I should cache the absorbed outputs, once net is absorbed, |
| * net is forever absorbed, no point in rechecking every time */ |
| if (net_sinks_reachable_in_cluster(pb_graph_pin, depth, net_id)) { |
| //All the sinks are reachable inside the cluster |
| net_exits_cluster = false; |
| } |
| } |
| } |
| |
| if (net_exits_cluster) { |
| /* This output must exit this cluster */ |
| cur_pb->pb_stats->lookahead_output_pins_used[pin_class].push_back(net_id); |
| } |
| } |
| } |
| } |
| |
| int net_sinks_reachable_in_cluster(const t_pb_graph_pin* driver_pb_gpin, const int depth, const AtomNetId net_id) { |
| size_t num_reachable_sinks = 0; |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| //Record the sink pb graph pins we are looking for |
| std::unordered_set<const t_pb_graph_pin*> sink_pb_gpins; |
| for (const AtomPinId pin_id : atom_ctx.nlist.net_sinks(net_id)) { |
| const t_pb_graph_pin* sink_pb_gpin = find_pb_graph_pin(atom_ctx.nlist, atom_ctx.lookup, pin_id); |
| VTR_ASSERT(sink_pb_gpin); |
| |
| sink_pb_gpins.insert(sink_pb_gpin); |
| } |
| |
| //Count how many sink pins are reachable |
| for (int i_prim_pin = 0; i_prim_pin < driver_pb_gpin->num_connectable_primitive_input_pins[depth]; ++i_prim_pin) { |
| const t_pb_graph_pin* reachable_pb_gpin = driver_pb_gpin->list_of_connectable_input_pin_ptrs[depth][i_prim_pin]; |
| |
| if (sink_pb_gpins.count(reachable_pb_gpin)) { |
| ++num_reachable_sinks; |
| if (num_reachable_sinks == atom_ctx.nlist.net_sinks(net_id).size()) { |
| return true; |
| } |
| } |
| } |
| |
| return false; |
| } |
| |
| /** |
| * Returns the pb_graph_pin of the atom pin defined by the driver_pin_id in the driver_pb |
| */ |
| static t_pb_graph_pin* get_driver_pb_graph_pin(const t_pb* driver_pb, const AtomPinId driver_pin_id) { |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| const auto driver_pb_type = driver_pb->pb_graph_node->pb_type; |
| int output_port = 0; |
| // find the port of the pin driving the net as well as the port model |
| auto driver_port_id = atom_ctx.nlist.pin_port(driver_pin_id); |
| auto driver_model_port = atom_ctx.nlist.port_model(driver_port_id); |
| // find the port id of the port containing the driving pin in the driver_pb_type |
| for (int i = 0; i < driver_pb_type->num_ports; i++) { |
| auto& prim_port = driver_pb_type->ports[i]; |
| if (prim_port.type == OUT_PORT) { |
| if (prim_port.model_port == driver_model_port) { |
| // get the output pb_graph_pin driving this input net |
| return &(driver_pb->pb_graph_node->output_pins[output_port][atom_ctx.nlist.pin_port_bit(driver_pin_id)]); |
| } |
| output_port++; |
| } |
| } |
| // the pin should be found |
| VTR_ASSERT(false); |
| return nullptr; |
| } |
| |
| /* Check if the number of available inputs/outputs for a pin class is sufficient for speculatively packed blocks */ |
| static bool check_lookahead_pins_used(t_pb* cur_pb, t_ext_pin_util max_external_pin_util) { |
| const t_pb_type* pb_type = cur_pb->pb_graph_node->pb_type; |
| |
| if (pb_type->num_modes > 0 && cur_pb->name) { |
| for (int i = 0; i < cur_pb->pb_graph_node->num_input_pin_class; i++) { |
| size_t class_size = cur_pb->pb_graph_node->input_pin_class_size[i]; |
| |
| if (cur_pb->is_root()) { |
| // Scale the class size by the maximum external pin utilization factor |
| // Use ceil to avoid classes of size 1 from being scaled to zero |
| class_size = std::ceil(max_external_pin_util.input_pin_util * class_size); |
| // if the number of pins already used is larger than class size, then the number of |
| // cluster inputs already used should be our constraint. Why is this needed? This is |
| // needed since when packing the seed block the maximum external pin utilization is |
| // used as 1.0 allowing molecules that are using up to all the cluster inputs to be |
| // packed legally. Therefore, if the seed block is already using more inputs than |
| // the allowed maximum utilization, this should become the new maximum pin utilization. |
| class_size = std::max<size_t>(class_size, cur_pb->pb_stats->input_pins_used[i].size()); |
| } |
| |
| if (cur_pb->pb_stats->lookahead_input_pins_used[i].size() > class_size) { |
| return false; |
| } |
| } |
| |
| for (int i = 0; i < cur_pb->pb_graph_node->num_output_pin_class; i++) { |
| size_t class_size = cur_pb->pb_graph_node->output_pin_class_size[i]; |
| if (cur_pb->is_root()) { |
| // Scale the class size by the maximum external pin utilization factor |
| // Use ceil to avoid classes of size 1 from being scaled to zero |
| class_size = std::ceil(max_external_pin_util.output_pin_util * class_size); |
| // if the number of pins already used is larger than class size, then the number of |
| // cluster outputs already used should be our constraint. Why is this needed? This is |
| // needed since when packing the seed block the maximum external pin utilization is |
| // used as 1.0 allowing molecules that are using up to all the cluster inputs to be |
| // packed legally. Therefore, if the seed block is already using more inputs than |
| // the allowed maximum utilization, this should become the new maximum pin utilization. |
| class_size = std::max<size_t>(class_size, cur_pb->pb_stats->output_pins_used[i].size()); |
| } |
| |
| if (cur_pb->pb_stats->lookahead_output_pins_used[i].size() > class_size) { |
| return false; |
| } |
| } |
| |
| if (cur_pb->child_pbs) { |
| for (int i = 0; i < pb_type->modes[cur_pb->mode].num_pb_type_children; i++) { |
| if (cur_pb->child_pbs[i]) { |
| for (int j = 0; j < pb_type->modes[cur_pb->mode].pb_type_children[i].num_pb; j++) { |
| if (!check_lookahead_pins_used(&cur_pb->child_pbs[i][j], max_external_pin_util)) |
| return false; |
| } |
| } |
| } |
| } |
| } |
| |
| return true; |
| } |
| |
| /* Speculation successful, commit input/output pins used */ |
| static void commit_lookahead_pins_used(t_pb* cur_pb) { |
| const t_pb_type* pb_type = cur_pb->pb_graph_node->pb_type; |
| |
| if (pb_type->num_modes > 0 && cur_pb->name) { |
| for (int i = 0; i < cur_pb->pb_graph_node->num_input_pin_class; i++) { |
| VTR_ASSERT(cur_pb->pb_stats->lookahead_input_pins_used[i].size() <= (unsigned int)cur_pb->pb_graph_node->input_pin_class_size[i]); |
| for (size_t j = 0; j < cur_pb->pb_stats->lookahead_input_pins_used[i].size(); j++) { |
| VTR_ASSERT(cur_pb->pb_stats->lookahead_input_pins_used[i][j]); |
| cur_pb->pb_stats->input_pins_used[i].insert({j, cur_pb->pb_stats->lookahead_input_pins_used[i][j]}); |
| } |
| } |
| |
| for (int i = 0; i < cur_pb->pb_graph_node->num_output_pin_class; i++) { |
| VTR_ASSERT(cur_pb->pb_stats->lookahead_output_pins_used[i].size() <= (unsigned int)cur_pb->pb_graph_node->output_pin_class_size[i]); |
| for (size_t j = 0; j < cur_pb->pb_stats->lookahead_output_pins_used[i].size(); j++) { |
| VTR_ASSERT(cur_pb->pb_stats->lookahead_output_pins_used[i][j]); |
| cur_pb->pb_stats->output_pins_used[i].insert({j, cur_pb->pb_stats->lookahead_output_pins_used[i][j]}); |
| } |
| } |
| |
| if (cur_pb->child_pbs) { |
| for (int i = 0; i < pb_type->modes[cur_pb->mode].num_pb_type_children; i++) { |
| if (cur_pb->child_pbs[i]) { |
| for (int j = 0; j < pb_type->modes[cur_pb->mode].pb_type_children[i].num_pb; j++) { |
| commit_lookahead_pins_used(&cur_pb->child_pbs[i][j]); |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * Score unclustered atoms that are two hops away from current cluster |
| * For example, consider a cluster that has a FF feeding an adder in another |
| * cluster. Since this FF is feeding an adder that is packed in another cluster |
| * this function should find other FFs that are feeding other inputs of this adder |
| * since they are two hops away from the FF packed in this cluster |
| */ |
| static void load_transitive_fanout_candidates(ClusterBlockId clb_index, |
| const std::multimap<AtomBlockId, t_pack_molecule*>& atom_molecules, |
| t_pb_stats* pb_stats, |
| vtr::vector<ClusterBlockId, std::vector<AtomNetId>>& clb_inter_blk_nets, |
| int transitive_fanout_threshold) { |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| // iterate over all the nets that have pins in this cluster |
| for (const auto net_id : pb_stats->marked_nets) { |
| // only consider small nets to constrain runtime |
| if (int(atom_ctx.nlist.net_pins(net_id).size()) < transitive_fanout_threshold + 1) { |
| // iterate over all the pins of the net |
| for (const auto pin_id : atom_ctx.nlist.net_pins(net_id)) { |
| AtomBlockId atom_blk_id = atom_ctx.nlist.pin_block(pin_id); |
| // get the transitive cluster |
| ClusterBlockId tclb = atom_ctx.lookup.atom_clb(atom_blk_id); |
| // if the block connected to this pin is packed in another cluster |
| if (tclb != clb_index && tclb != ClusterBlockId::INVALID()) { |
| // explore transitive nets from already packed cluster |
| for (AtomNetId tnet : clb_inter_blk_nets[tclb]) { |
| // iterate over all the pins of the net |
| for (AtomPinId tpin : atom_ctx.nlist.net_pins(tnet)) { |
| auto blk_id = atom_ctx.nlist.pin_block(tpin); |
| // This transitive atom is not packed, score and add |
| if (atom_ctx.lookup.atom_clb(blk_id) == ClusterBlockId::INVALID()) { |
| auto& transitive_fanout_candidates = pb_stats->transitive_fanout_candidates; |
| |
| if (pb_stats->gain.count(blk_id) == 0) { |
| pb_stats->gain[blk_id] = 0.001; |
| } else { |
| pb_stats->gain[blk_id] += 0.001; |
| } |
| auto rng = atom_molecules.equal_range(blk_id); |
| for (const auto& kv : vtr::make_range(rng.first, rng.second)) { |
| t_pack_molecule* molecule = kv.second; |
| if (molecule->valid) { |
| transitive_fanout_candidates.insert({molecule->atom_block_ids[molecule->root], molecule}); |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| static std::map<const t_model*, std::vector<t_logical_block_type_ptr>> identify_primitive_candidate_block_types() { |
| std::map<const t_model*, std::vector<t_logical_block_type_ptr>> model_candidates; |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| auto& atom_nlist = atom_ctx.nlist; |
| auto& device_ctx = g_vpr_ctx.device(); |
| |
| std::set<const t_model*> unique_models; |
| for (auto blk : atom_nlist.blocks()) { |
| auto model = atom_nlist.block_model(blk); |
| unique_models.insert(model); |
| } |
| |
| for (auto model : unique_models) { |
| model_candidates[model] = {}; |
| |
| for (auto const& type : device_ctx.logical_block_types) { |
| if (block_type_contains_blif_model(&type, model->name)) { |
| model_candidates[model].push_back(&type); |
| } |
| } |
| } |
| |
| return model_candidates; |
| } |
| |
| static void print_seed_gains(const char* fname, const std::vector<AtomBlockId>& seed_atoms, const vtr::vector<AtomBlockId, float>& atom_gain, const vtr::vector<AtomBlockId, float>& atom_criticality) { |
| FILE* fp = vtr::fopen(fname, "w"); |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| //For prett formatting determine the maximum name length |
| int max_name_len = strlen("atom_block_name"); |
| int max_type_len = strlen("atom_block_type"); |
| for (auto blk_id : atom_ctx.nlist.blocks()) { |
| max_name_len = std::max(max_name_len, (int)atom_ctx.nlist.block_name(blk_id).size()); |
| |
| const t_model* model = atom_ctx.nlist.block_model(blk_id); |
| max_type_len = std::max(max_type_len, (int)strlen(model->name)); |
| } |
| |
| fprintf(fp, "%-*s %-*s %8s %8s\n", max_name_len, "atom_block_name", max_type_len, "atom_block_type", "gain", "criticality"); |
| fprintf(fp, "\n"); |
| for (auto blk_id : seed_atoms) { |
| std::string name = atom_ctx.nlist.block_name(blk_id); |
| fprintf(fp, "%-*s ", max_name_len, name.c_str()); |
| |
| const t_model* model = atom_ctx.nlist.block_model(blk_id); |
| fprintf(fp, "%-*s ", max_type_len, model->name); |
| |
| fprintf(fp, "%*f ", std::max((int)strlen("gain"), 8), atom_gain[blk_id]); |
| fprintf(fp, "%*f ", std::max((int)strlen("criticality"), 8), atom_criticality[blk_id]); |
| fprintf(fp, "\n"); |
| } |
| |
| fclose(fp); |
| } |
| |
| /** |
| * This function takes a chain molecule, and the pb_graph_node that is chosen |
| * for packing the molecule's root block. Using the given root_primitive, this |
| * function will identify which chain id this molecule is being mapped to and |
| * will update the chain id value inside the chain info data structure of this |
| * molecule |
| */ |
| static void update_molecule_chain_info(t_pack_molecule* chain_molecule, const t_pb_graph_node* root_primitive) { |
| VTR_ASSERT(chain_molecule->chain_info->chain_id == -1 && chain_molecule->chain_info->is_long_chain); |
| |
| auto chain_root_pins = chain_molecule->pack_pattern->chain_root_pins; |
| |
| // long chains should only be placed at the beginning of the chain |
| // Since for long chains the molecule size is already equal to the |
| // total number of adders in the cluster. Therefore, it should |
| // always be placed at the very first adder in this cluster. |
| for (size_t chainId = 0; chainId < chain_root_pins.size(); chainId++) { |
| if (chain_root_pins[chainId][0]->parent_node == root_primitive) { |
| chain_molecule->chain_info->chain_id = chainId; |
| chain_molecule->chain_info->first_packed_molecule = chain_molecule; |
| return; |
| } |
| } |
| |
| VTR_ASSERT(false); |
| } |
| |
| /** |
| * This function takes the root block of a chain molecule and a proposed |
| * placement primitive for this block. The function then checks if this |
| * chain root block has a placement constraint (such as being driven from |
| * outside the cluster) and returns the status of the placement accordingly. |
| */ |
| static enum e_block_pack_status check_chain_root_placement_feasibility(const t_pb_graph_node* pb_graph_node, |
| const t_pack_molecule* molecule, |
| const AtomBlockId blk_id) { |
| enum e_block_pack_status block_pack_status = BLK_PASSED; |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| bool is_long_chain = molecule->chain_info->is_long_chain; |
| |
| const auto& chain_root_pins = molecule->pack_pattern->chain_root_pins; |
| |
| t_model_ports* root_port = chain_root_pins[0][0]->port->model_port; |
| AtomNetId chain_net_id; |
| auto port_id = atom_ctx.nlist.find_atom_port(blk_id, root_port); |
| |
| if (port_id) { |
| chain_net_id = atom_ctx.nlist.port_net(port_id, chain_root_pins[0][0]->pin_number); |
| } |
| |
| // if this block is part of a long chain or it is driven by a cluster |
| // input pin we need to check the placement legality of this block |
| // Depending on the logic synthesis even small chains that can fit within one |
| // cluster might need to start at the top of the cluster as their input can be |
| // driven by a global gnd or vdd. Therefore even if this is not a long chain |
| // but its input pin is driven by a net, the placement legality is checked. |
| if (is_long_chain || chain_net_id) { |
| auto chain_id = molecule->chain_info->chain_id; |
| // if this chain has a chain id assigned to it (implies is_long_chain too) |
| if (chain_id != -1) { |
| // the chosen primitive should be a valid starting point for the chain |
| // long chains should only be placed at the top of the chain tieOff = 0 |
| if (pb_graph_node != chain_root_pins[chain_id][0]->parent_node) { |
| block_pack_status = BLK_FAILED_FEASIBLE; |
| } |
| // the chain doesn't have an assigned chain_id yet |
| } else { |
| block_pack_status = BLK_FAILED_FEASIBLE; |
| for (const auto& chain : chain_root_pins) { |
| for (size_t tieOff = 0; tieOff < chain.size(); tieOff++) { |
| // check if this chosen primitive is one of the possible |
| // starting points for this chain. |
| if (pb_graph_node == chain[tieOff]->parent_node) { |
| // this location matches with the one of the dedicated chain |
| // input from outside logic block, therefore it is feasible |
| block_pack_status = BLK_PASSED; |
| break; |
| } |
| // long chains should only be placed at the top of the chain tieOff = 0 |
| if (is_long_chain) break; |
| } |
| } |
| } |
| } |
| |
| return block_pack_status; |
| } |
| |
| /** |
| * This function update the pb_type_count data structure by incrementing |
| * the number of used pb_types in the given packed cluster t_pb |
| */ |
| static void update_pb_type_count(const t_pb* pb, std::unordered_map<std::string, int>& pb_type_count) { |
| auto pb_graph_node = pb->pb_graph_node; |
| auto pb_type = pb_graph_node->pb_type; |
| auto mode = &pb_type->modes[pb->mode]; |
| std::string pb_type_name(pb_type->name); |
| |
| auto it = pb_type_count.find(pb_type_name); |
| // if this pb type is in the list increment its |
| // count by 1 otherwise initialize the count to 1 |
| if (it == pb_type_count.end()) |
| pb_type_count[pb_type_name] = 1; |
| else |
| pb_type_count[pb_type_name]++; |
| |
| if (pb_type->num_modes > 0) { |
| for (int i = 0; i < mode->num_pb_type_children; i++) { |
| for (int j = 0; j < mode->pb_type_children[i].num_pb; j++) { |
| if (pb->child_pbs[i] && pb->child_pbs[i][j].name) |
| update_pb_type_count(&pb->child_pbs[i][j], pb_type_count); |
| } |
| } |
| } |
| } |
| |
| /** |
| * Print the total number of used physical blocks for each pb type in the architecture |
| */ |
| static void print_pb_type_count(std::unordered_map<std::string, int>& pb_type_count) { |
| VTR_LOG("\nPb types usage...\n"); |
| for (auto& pb_type : pb_type_count) { |
| VTR_LOG(" %-12s : %d\n", pb_type.first.c_str(), pb_type.second); |
| } |
| VTR_LOG("\n"); |
| } |
| |
| /** |
| * This function identifies the logic block type which is |
| * defined by the block type which has a lut primitive |
| */ |
| static t_logical_block_type_ptr identify_logic_block_type(std::map<const t_model*, std::vector<t_logical_block_type_ptr>>& primitive_candidate_block_types) { |
| std::string lut_name = ".names"; |
| |
| for (auto& model : primitive_candidate_block_types) { |
| std::string model_name(model.first->name); |
| if (model_name == lut_name) |
| return model.second[0]; |
| } |
| |
| return nullptr; |
| } |
| |
| /** |
| * This function returns the pb_type that is similar to Logic Element (LE) in an FPGA |
| * The LE is defined as a physical block that contains a LUT primitive and |
| * is found by searching a cluster type to find the first pb_type (from the top |
| * of the hierarchy clb->LE) that has more than one instance within the cluster. |
| */ |
| static t_pb_type* identify_le_block_type(t_logical_block_type_ptr logic_block_type) { |
| // if there is no CLB-like cluster, then there is no LE pb_block |
| if (!logic_block_type) |
| return nullptr; |
| |
| // search down the hierarchy starting from the pb_graph_head |
| auto pb_graph_node = logic_block_type->pb_graph_head; |
| |
| while (pb_graph_node->child_pb_graph_nodes) { |
| // if this pb_graph_node has more than one mode or more than one pb_type in the default mode return |
| // nullptr since the logic block of this architecture is not a CLB-like logic block |
| if (pb_graph_node->pb_type->num_modes > 1 || pb_graph_node->pb_type->modes[0].num_pb_type_children > 1) |
| return nullptr; |
| // explore the only child of this pb_graph_node |
| pb_graph_node = &pb_graph_node->child_pb_graph_nodes[0][0][0]; |
| // if the child node has more than one instance in the |
| // cluster then this is the pb_type similar to a LE |
| if (pb_graph_node->pb_type->num_pb > 1) |
| return pb_graph_node->pb_type; |
| } |
| |
| return nullptr; |
| } |
| |
| /** |
| * This function updates the le_count data structure from the given packed cluster |
| */ |
| static void update_le_count(const t_pb* pb, const t_logical_block_type_ptr logic_block_type, const t_pb_type* le_pb_type, std::vector<int>& le_count) { |
| // if this cluster doesn't contain LEs or there |
| // are no les in this architecture, ignore it |
| if (!logic_block_type || pb->pb_graph_node != logic_block_type->pb_graph_head || !le_pb_type) |
| return; |
| |
| const std::string lut(".names"); |
| const std::string ff(".latch"); |
| const std::string adder("adder"); |
| |
| auto parent_pb = pb; |
| |
| // go down the hierarchy till the parent physical block of the LE is found |
| while (parent_pb->child_pbs[0][0].pb_graph_node->pb_type != le_pb_type) { |
| parent_pb = &parent_pb->child_pbs[0][0]; |
| } |
| |
| // iterate over all the LEs and update the LE count accordingly |
| for (int ile = 0; ile < parent_pb->get_num_children_of_type(0); ile++) { |
| if (!parent_pb->child_pbs[0][ile].name) |
| continue; |
| |
| auto has_used_lut = pb_used_for_blif_model(&parent_pb->child_pbs[0][ile], lut); |
| auto has_used_adder = pb_used_for_blif_model(&parent_pb->child_pbs[0][ile], adder); |
| auto has_used_ff = pb_used_for_blif_model(&parent_pb->child_pbs[0][ile], ff); |
| |
| // First type of LEs: used for logic and registers |
| if ((has_used_lut || has_used_adder) && has_used_ff) { |
| le_count[0]++; |
| // Second type of LEs: used for logic only |
| } else if (has_used_lut || has_used_adder) { |
| le_count[1]++; |
| // Third type of LEs: used for registers only |
| } else if (has_used_ff) { |
| le_count[2]++; |
| } |
| } |
| } |
| |
| /** |
| * This function returns true if the given physical block has |
| * a primitive matching the given blif model and is used |
| */ |
| static bool pb_used_for_blif_model(const t_pb* pb, std::string blif_model_name) { |
| auto pb_graph_node = pb->pb_graph_node; |
| auto pb_type = pb_graph_node->pb_type; |
| auto mode = &pb_type->modes[pb->mode]; |
| |
| // if this is a primitive check if it matches the given blif model name |
| if (pb_type->blif_model) { |
| if (blif_model_name == pb_type->blif_model || ".subckt " + blif_model_name == pb_type->blif_model) { |
| return true; |
| } |
| } |
| |
| if (pb_type->num_modes > 0) { |
| for (int i = 0; i < mode->num_pb_type_children; i++) { |
| for (int j = 0; j < mode->pb_type_children[i].num_pb; j++) { |
| if (pb->child_pbs[i] && pb->child_pbs[i][j].name) { |
| if (pb_used_for_blif_model(&pb->child_pbs[i][j], blif_model_name)) { |
| return true; |
| } |
| } |
| } |
| } |
| } |
| |
| return false; |
| } |
| |
| /** |
| * Print the LE count data strurture |
| */ |
| static void print_le_count(std::vector<int>& le_count, const t_pb_type* le_pb_type) { |
| VTR_LOG("\nLogic Element (%s) detailed count:\n", le_pb_type->name); |
| VTR_LOG(" Total number of Logic Elements used : %d\n", le_count[0] + le_count[1] + le_count[2]); |
| VTR_LOG(" LEs used for logic and registers : %d\n", le_count[0]); |
| VTR_LOG(" LEs used for logic only : %d\n", le_count[1]); |
| VTR_LOG(" LEs used for registers only : %d\n\n", le_count[2]); |
| } |