vpr/src/pack/pack.cpp - third_party/vtr-verilog-to-routing - Git at Google

 #include <cstdio>
 #include <cstring>
 #include <unordered_set>
 #include <unordered_map>
 #include <fstream>
 #include <stdlib.h>
 using namespace std;

 #include "vtr_assert.h"
 #include "vtr_log.h"
 #include "vtr_math.h"

 #include "vpr_error.h"
 #include "vpr_types.h"

 #include "read_xml_arch_file.h"
 #include "globals.h"
 #include "atom_netlist.h"
 #include "prepack.h"
 #include "pack_types.h"
 #include "pack.h"
 #include "read_blif.h"
 #include "cluster.h"

 #ifdef USE_HMETIS
 #include "hmetis_graph_writer.h"
 static vtr::vector_map<AtomBlockId, int> read_hmetis_graph(string &hmetis_output_file_name, const int num_parts);
 //TODO: CHANGE THIS HARDCODING
 static string hmetis("/cygdrive/c/Source/Repos/vtr-verilog-to-routing/vpr/hmetis-1.5-WIN32/shmetis.exe ");
 #endif

 /* #define DUMP_PB_GRAPH 1 */
 /* #define DUMP_BLIF_INPUT 1 */

 static std::unordered_set<AtomNetId> alloc_and_load_is_clock(bool global_clocks);

 void try_pack(t_packer_opts *packer_opts,
         const t_arch * arch,
 		const t_model *user_models,
         const t_model *library_models,
         float interc_delay,
         vector<t_lb_type_rr_node> *lb_type_rr_graphs
 #ifdef ENABLE_CLASSIC_VPR_STA
         , t_timing_inf timing_inf
 #endif
         ) {
     std::unordered_set<AtomNetId> is_clock;
     std::multimap<AtomBlockId,t_pack_molecule*> atom_molecules; //The molecules associated with each atom block
     std::unordered_map<AtomBlockId,t_pb_graph_node*> expected_lowest_cost_pb_gnode; //The molecules associated with each atom block
 	const t_model *cur_model;
 	int num_models;
 	t_pack_patterns *list_of_packing_patterns;
 	int num_packing_patterns;
 	t_pack_molecule *list_of_pack_molecules, * cur_pack_molecule;
 #ifdef USE_HMETIS
 	vtr::vector_map<AtomBlockId,int> partitions;
 #endif
 	vtr::printf_info("Begin packing '%s'.\n", packer_opts->blif_file_name.c_str());

 	/* determine number of models in the architecture */
 	num_models = 0;
 	cur_model = user_models;
 	while (cur_model) {
 		num_models++;
 		cur_model = cur_model->next;
 	}
 	cur_model = library_models;
 	while (cur_model) {
 		num_models++;
 		cur_model = cur_model->next;
 	}


 	is_clock = alloc_and_load_is_clock(packer_opts->global_clocks);

     auto& atom_ctx = g_vpr_ctx.atom();

     size_t num_p_inputs = 0;
     size_t num_p_outputs = 0;
     for(auto blk_id : atom_ctx.nlist.blocks()) {
         auto type = atom_ctx.nlist.block_type(blk_id);
         if(type == AtomBlockType::INPAD) {
             ++num_p_inputs;
         } else if(type == AtomBlockType::OUTPAD) {
             ++num_p_outputs;
         }
     }

 	vtr::printf_info("\n");
 	vtr::printf_info("After removing unused inputs...\n");
 	vtr::printf_info("\ttotal blocks: %zu, total nets: %zu, total inputs: %zu, total outputs: %zu\n",
 		atom_ctx.nlist.blocks().size(), atom_ctx.nlist.nets().size(), num_p_inputs, num_p_outputs);

 	vtr::printf_info("Begin prepacking.\n");
 	list_of_packing_patterns = alloc_and_load_pack_patterns(&num_packing_patterns);
     list_of_pack_molecules = alloc_and_load_pack_molecules(list_of_packing_patterns,
                                 atom_molecules,
                                 expected_lowest_cost_pb_gnode,
                                 num_packing_patterns);
 	vtr::printf_info("Finish prepacking.\n");

 	if(packer_opts->auto_compute_inter_cluster_net_delay) {
 		packer_opts->inter_cluster_net_delay = interc_delay;
 		vtr::printf_info("Using inter-cluster delay: %g\n", packer_opts->inter_cluster_net_delay);
 	}

 #ifdef USE_HMETIS
 	if (!packer_opts->hmetis_input_file.empty()) {
 		/*	hmetis.exe <GraphFile>			 <Nparts> <UBfactor> <Nruns> <Ctype> <RType> <VCycle> <Reconst> <dbglvl>
 			  or
 			hmetis.exe <GraphFile> <FixFile> <Nparts> <UBfactor> <Nruns> <Ctype> <RType> <VCycle> <Reconst> <dbglvl>
 			 GraphFile: name of the hypergraph file
 			 FixFile  : name of the file containing pre-assignment of vertices to partitions
 			 Nparts   : number of partitions desired
 			 UBfactor : balance between the partitions (e.g., use 5 for a 45-55 bisection balance)
 			 Nruns    : Number of Iterations (shmetis defaults to 10)
 			 CType    : HFC(1), FC(2), GFC(3), HEDGE(4), EDGE(5)
 			 RType    : FM(1), 1WayFM(2), EEFM(3)
 			 VCycle   : No(0), @End(1), ForMin(2), All(3)
 			 Reconst  : NoReconstruct_HE(0), Reconstruct_HE (1)
 			 dbglvl   : debug level*/

 		vtr::printf_info("Writing hmetis hypergraph to %s\n", packer_opts->hmetis_input_file);

 		// Write AtomNetlist into hGraph format
 		write_hmetis_graph(packer_opts->hmetis_input_file);

 		// Set the rest of the arguments
 		// For a reference to what the string arguments mean, refer to the manual

 		// The number of partitions is determined in one of two methods:
 		// 1. Partition tree would result in the size of subcircuits ~= 4 (Elias Vansteenkiste, et al.)
 		// 2. Partition depth = 5, i.e. num_parts = 32 (Doris Chen, et al.)
 		//int num_parts = atom_ctx.nlist.blocks().size() / 4; // Method 1.
 		//TODO: Find an appropriate value (may be from packer_opts) for num_parts
 		int num_parts = 32; //Method 2.

 		string run_hmetis = hmetis + packer_opts->hmetis_input_file + " " + to_string(num_parts) + " 5";

 		// Check if OS is Windows or Linux and run hMetis accordingly
 		//TODO: USE HMETIS WITH ALL ARGUMENTS INSTEAD FOR FURTHER REFINEMENT
 		//Using shmetis (standard hmetis) to simplify
 		system(run_hmetis.c_str());

 		/* Parse the output file from hMetis, contains |V| lines, ith line = partition of V_i.
 		*  Store the results into a vector_map.
 		*/
 		string hmetis_output_file(packer_opts->hmetis_input_file + ".part." + to_string(num_parts));

 		partitions = read_hmetis_graph(hmetis_output_file, num_parts);

 		// Print each block's partition
 		vtr::printf_info("Partitioning complete\n");

 		vector<vector<AtomBlockId>> print_partitions(num_parts);
 		for (auto blk_id : atom_ctx.nlist.blocks()) {
 			print_partitions[partitions[blk_id]].push_back(blk_id);
 		}
 		for (int i = 0; i < (int)print_partitions.size(); i++) {
 			vtr::printf_info("Blocks in partition %zu:\n\t", i);
 			for (int j = 0; j < (int)print_partitions[i].size(); j++)
 				vtr::printf_info("%zu ", size_t(print_partitions[i][j]));
 			vtr::printf_info("\n");
 		}
 	}
 #endif

     do_clustering(arch, list_of_pack_molecules, num_models,
             packer_opts->global_clocks, is_clock,
             atom_molecules,
             expected_lowest_cost_pb_gnode,
             packer_opts->hill_climbing_flag, packer_opts->output_file.c_str(),
             packer_opts->timing_driven, packer_opts->cluster_seed_type,
             packer_opts->alpha, packer_opts->beta,
             packer_opts->inter_cluster_net_delay,
             packer_opts->target_device_utilization,
             packer_opts->allow_unrelated_clustering,
             packer_opts->connection_driven,
             packer_opts->packer_algorithm,
             lb_type_rr_graphs,
             packer_opts->device_layout,
             packer_opts->debug_clustering,
             packer_opts->enable_pin_feasibility_filter
 #ifdef USE_HMETIS
 			, partitions
 #endif
 #ifdef ENABLE_CLASSIC_VPR_STA
             , timing_inf
 #endif
             );

 	/*free list_of_pack_molecules*/
 	free_list_of_pack_patterns(list_of_packing_patterns, num_packing_patterns);

 	cur_pack_molecule = list_of_pack_molecules;
 	while (cur_pack_molecule != nullptr){
 		cur_pack_molecule = list_of_pack_molecules->next;
 		delete list_of_pack_molecules;
 		list_of_pack_molecules = cur_pack_molecule;
 	}

 	vtr::printf_info("\n");
 	vtr::printf_info("Netlist conversion complete.\n");
 	vtr::printf_info("\n");
 }

 float get_arch_switch_info(short switch_index, int switch_fanin, float &Tdel_switch, float &R_switch, float &Cout_switch){
 	/* Fetches delay, resistance and output capacitance of the architecture switch at switch_index.
 	Returns the total delay through the switch. Used to calculate inter-cluster net delay. */

 	/* The intrinsic delay may depend on fanin to the switch. If the delay map of a
 	   switch from the architecture file has multiple (#inputs, delay) entries, we
 	   interpolate/extrapolate to get the delay at 'switch_fanin'. */
     auto& device_ctx = g_vpr_ctx.device();

     Tdel_switch = device_ctx.arch_switch_inf[switch_index].Tdel(switch_fanin);
 	R_switch = device_ctx.arch_switch_inf[switch_index].R;
 	Cout_switch = device_ctx.arch_switch_inf[switch_index].Cout;

 	/* The delay through a loaded switch is its intrinsic (unloaded)
 	delay plus the product of its resistance and output capacitance. */
 	return Tdel_switch + R_switch * Cout_switch;
 }

 std::unordered_set<AtomNetId> alloc_and_load_is_clock(bool global_clocks) {

 	/* Looks through all the atom blocks to find and mark all the clocks, by setting
 	 * the corresponding entry by adding the clock to is_clock.
      * global_clocks is used
 	 * only for an error check.                                                */

 	int num_clocks = 0;
     std::unordered_set<AtomNetId> is_clock;

 	/* Want to identify all the clock nets.  */
     auto& atom_ctx = g_vpr_ctx.atom();

     for(auto blk_id : atom_ctx.nlist.blocks()) {
         for(auto pin_id : atom_ctx.nlist.block_clock_pins(blk_id)) {
             auto net_id = atom_ctx.nlist.pin_net(pin_id);
             if (!is_clock.count(net_id)) {
                 is_clock.insert(net_id);
                 num_clocks++;
             }
         }
 	}

 	/* If we have multiple clocks and we're supposed to declare them global, *
 	 * print a warning message, since it looks like this circuit may have    *
 	 * locally generated clocks.                                             */

 	if (num_clocks > 1 && global_clocks) {
 		vtr::printf_warning(__FILE__, __LINE__,
 				"All %d clocks will be treated as global.\n", num_clocks);
 	}

 	return (is_clock);
 }

 #ifdef USE_HMETIS
 /* Reads in the output of the hMetis partitioner and returns
 *  A 2-D vector that contains all the blocks in each partition
 */
 static vtr::vector_map<AtomBlockId, int> read_hmetis_graph(string &hmetis_output_file, const int num_parts) {
 	ifstream fp(hmetis_output_file.c_str());
 	vtr::vector_map<AtomBlockId, int> partitions;
 	string line;
 	int block_num = 0; //Indexing for CLB's start at 0

 	// Check that # of lines in output file matches # of blocks/vertices
 	auto& atom_ctx = g_vpr_ctx.atom();
 	VTR_ASSERT((int)atom_ctx.nlist.blocks().size() == count(istreambuf_iterator<char>(fp), istreambuf_iterator<char>(), '\n'));

 	partitions.resize(atom_ctx.nlist.blocks().size());

 	// Reset the filestream to the beginning and reread
 	fp.clear();
 	fp.seekg(0, fp.beg);

 	while (getline(fp, line)) {
 		if (stoi(line) >= num_parts) {
 			vpr_throw(VPR_ERROR_OTHER, __FILE__, __LINE__,
 				"Partition for line '%s' exceeds the set number of partitions %d" , line, num_parts);
 		}

 		partitions[AtomBlockId(block_num)] = stoi(line);
 		block_num++;
 	}

 	fp.close();

 	return partitions;
 }
 #endif
	#include <cstdio>
	#include <cstring>
	#include <unordered_set>
	#include <unordered_map>
	#include <fstream>
	#include <stdlib.h>
	using namespace std;

	#include "vtr_assert.h"
	#include "vtr_log.h"
	#include "vtr_math.h"

	#include "vpr_error.h"
	#include "vpr_types.h"

	#include "read_xml_arch_file.h"
	#include "globals.h"
	#include "atom_netlist.h"
	#include "prepack.h"
	#include "pack_types.h"
	#include "pack.h"
	#include "read_blif.h"
	#include "cluster.h"

	#ifdef USE_HMETIS
	#include "hmetis_graph_writer.h"
	static vtr::vector_map<AtomBlockId, int> read_hmetis_graph(string &hmetis_output_file_name, const int num_parts);
	//TODO: CHANGE THIS HARDCODING
	static string hmetis("/cygdrive/c/Source/Repos/vtr-verilog-to-routing/vpr/hmetis-1.5-WIN32/shmetis.exe ");
	#endif

	/* #define DUMP_PB_GRAPH 1 */
	/* #define DUMP_BLIF_INPUT 1 */

	static std::unordered_set<AtomNetId> alloc_and_load_is_clock(bool global_clocks);

	void try_pack(t_packer_opts *packer_opts,
	const t_arch * arch,
	const t_model *user_models,
	const t_model *library_models,
	float interc_delay,
	vector<t_lb_type_rr_node> *lb_type_rr_graphs
	#ifdef ENABLE_CLASSIC_VPR_STA
	, t_timing_inf timing_inf
	#endif
	) {
	std::unordered_set<AtomNetId> is_clock;
	std::multimap<AtomBlockId,t_pack_molecule*> atom_molecules; //The molecules associated with each atom block
	std::unordered_map<AtomBlockId,t_pb_graph_node*> expected_lowest_cost_pb_gnode; //The molecules associated with each atom block
	const t_model *cur_model;
	int num_models;
	t_pack_patterns *list_of_packing_patterns;
	int num_packing_patterns;
	t_pack_molecule list_of_pack_molecules, cur_pack_molecule;
	#ifdef USE_HMETIS
	vtr::vector_map<AtomBlockId,int> partitions;
	#endif
	vtr::printf_info("Begin packing '%s'.\n", packer_opts->blif_file_name.c_str());

	/* determine number of models in the architecture */
	num_models = 0;
	cur_model = user_models;
	while (cur_model) {
	num_models++;
	cur_model = cur_model->next;
	}
	cur_model = library_models;
	while (cur_model) {
	num_models++;
	cur_model = cur_model->next;
	}


	is_clock = alloc_and_load_is_clock(packer_opts->global_clocks);

	auto& atom_ctx = g_vpr_ctx.atom();

	size_t num_p_inputs = 0;
	size_t num_p_outputs = 0;
	for(auto blk_id : atom_ctx.nlist.blocks()) {
	auto type = atom_ctx.nlist.block_type(blk_id);
	if(type == AtomBlockType::INPAD) {
	++num_p_inputs;
	} else if(type == AtomBlockType::OUTPAD) {
	++num_p_outputs;
	}
	}

	vtr::printf_info("\n");
	vtr::printf_info("After removing unused inputs...\n");
	vtr::printf_info("\ttotal blocks: %zu, total nets: %zu, total inputs: %zu, total outputs: %zu\n",
	atom_ctx.nlist.blocks().size(), atom_ctx.nlist.nets().size(), num_p_inputs, num_p_outputs);

	vtr::printf_info("Begin prepacking.\n");
	list_of_packing_patterns = alloc_and_load_pack_patterns(&num_packing_patterns);
	list_of_pack_molecules = alloc_and_load_pack_molecules(list_of_packing_patterns,
	atom_molecules,
	expected_lowest_cost_pb_gnode,
	num_packing_patterns);
	vtr::printf_info("Finish prepacking.\n");

	if(packer_opts->auto_compute_inter_cluster_net_delay) {
	packer_opts->inter_cluster_net_delay = interc_delay;
	vtr::printf_info("Using inter-cluster delay: %g\n", packer_opts->inter_cluster_net_delay);
	}

	#ifdef USE_HMETIS
	if (!packer_opts->hmetis_input_file.empty()) {
	/* hmetis.exe <GraphFile> <Nparts> <UBfactor> <Nruns> <Ctype> <RType> <VCycle> <Reconst> <dbglvl>
	or
	hmetis.exe <GraphFile> <FixFile> <Nparts> <UBfactor> <Nruns> <Ctype> <RType> <VCycle> <Reconst> <dbglvl>
	GraphFile: name of the hypergraph file
	FixFile : name of the file containing pre-assignment of vertices to partitions
	Nparts : number of partitions desired
	UBfactor : balance between the partitions (e.g., use 5 for a 45-55 bisection balance)
	Nruns : Number of Iterations (shmetis defaults to 10)
	CType : HFC(1), FC(2), GFC(3), HEDGE(4), EDGE(5)
	RType : FM(1), 1WayFM(2), EEFM(3)
	VCycle : No(0), @End(1), ForMin(2), All(3)
	Reconst : NoReconstruct_HE(0), Reconstruct_HE (1)
	dbglvl : debug level*/

	vtr::printf_info("Writing hmetis hypergraph to %s\n", packer_opts->hmetis_input_file);

	// Write AtomNetlist into hGraph format
	write_hmetis_graph(packer_opts->hmetis_input_file);

	// Set the rest of the arguments
	// For a reference to what the string arguments mean, refer to the manual

	// The number of partitions is determined in one of two methods:
	// 1. Partition tree would result in the size of subcircuits ~= 4 (Elias Vansteenkiste, et al.)
	// 2. Partition depth = 5, i.e. num_parts = 32 (Doris Chen, et al.)
	//int num_parts = atom_ctx.nlist.blocks().size() / 4; // Method 1.
	//TODO: Find an appropriate value (may be from packer_opts) for num_parts
	int num_parts = 32; //Method 2.

	string run_hmetis = hmetis + packer_opts->hmetis_input_file + " " + to_string(num_parts) + " 5";

	// Check if OS is Windows or Linux and run hMetis accordingly
	//TODO: USE HMETIS WITH ALL ARGUMENTS INSTEAD FOR FURTHER REFINEMENT
	//Using shmetis (standard hmetis) to simplify
	system(run_hmetis.c_str());

	/* Parse the output file from hMetis, contains \|V\| lines, ith line = partition of V_i.
	* Store the results into a vector_map.
	*/
	string hmetis_output_file(packer_opts->hmetis_input_file + ".part." + to_string(num_parts));

	partitions = read_hmetis_graph(hmetis_output_file, num_parts);

	// Print each block's partition
	vtr::printf_info("Partitioning complete\n");

	vector<vector<AtomBlockId>> print_partitions(num_parts);
	for (auto blk_id : atom_ctx.nlist.blocks()) {
	print_partitions[partitions[blk_id]].push_back(blk_id);
	}
	for (int i = 0; i < (int)print_partitions.size(); i++) {
	vtr::printf_info("Blocks in partition %zu:\n\t", i);
	for (int j = 0; j < (int)print_partitions[i].size(); j++)
	vtr::printf_info("%zu ", size_t(print_partitions[i][j]));
	vtr::printf_info("\n");
	}
	}
	#endif

	do_clustering(arch, list_of_pack_molecules, num_models,
	packer_opts->global_clocks, is_clock,
	atom_molecules,
	expected_lowest_cost_pb_gnode,
	packer_opts->hill_climbing_flag, packer_opts->output_file.c_str(),
	packer_opts->timing_driven, packer_opts->cluster_seed_type,
	packer_opts->alpha, packer_opts->beta,
	packer_opts->inter_cluster_net_delay,
	packer_opts->target_device_utilization,
	packer_opts->allow_unrelated_clustering,
	packer_opts->connection_driven,
	packer_opts->packer_algorithm,
	lb_type_rr_graphs,
	packer_opts->device_layout,
	packer_opts->debug_clustering,
	packer_opts->enable_pin_feasibility_filter
	#ifdef USE_HMETIS
	, partitions
	#endif
	#ifdef ENABLE_CLASSIC_VPR_STA
	, timing_inf
	#endif
	);

	/free list_of_pack_molecules/
	free_list_of_pack_patterns(list_of_packing_patterns, num_packing_patterns);

	cur_pack_molecule = list_of_pack_molecules;
	while (cur_pack_molecule != nullptr){
	cur_pack_molecule = list_of_pack_molecules->next;
	delete list_of_pack_molecules;
	list_of_pack_molecules = cur_pack_molecule;
	}

	vtr::printf_info("\n");
	vtr::printf_info("Netlist conversion complete.\n");
	vtr::printf_info("\n");
	}

	float get_arch_switch_info(short switch_index, int switch_fanin, float &Tdel_switch, float &R_switch, float &Cout_switch){
	/* Fetches delay, resistance and output capacitance of the architecture switch at switch_index.
	Returns the total delay through the switch. Used to calculate inter-cluster net delay. */

	/* The intrinsic delay may depend on fanin to the switch. If the delay map of a
	switch from the architecture file has multiple (#inputs, delay) entries, we
	interpolate/extrapolate to get the delay at 'switch_fanin'. */
	auto& device_ctx = g_vpr_ctx.device();

	Tdel_switch = device_ctx.arch_switch_inf[switch_index].Tdel(switch_fanin);
	R_switch = device_ctx.arch_switch_inf[switch_index].R;
	Cout_switch = device_ctx.arch_switch_inf[switch_index].Cout;

	/* The delay through a loaded switch is its intrinsic (unloaded)
	delay plus the product of its resistance and output capacitance. */
	return Tdel_switch + R_switch * Cout_switch;
	}

	std::unordered_set<AtomNetId> alloc_and_load_is_clock(bool global_clocks) {

	/* Looks through all the atom blocks to find and mark all the clocks, by setting
	* the corresponding entry by adding the clock to is_clock.
	* global_clocks is used
	* only for an error check. */

	int num_clocks = 0;
	std::unordered_set<AtomNetId> is_clock;

	/* Want to identify all the clock nets. */
	auto& atom_ctx = g_vpr_ctx.atom();

	for(auto blk_id : atom_ctx.nlist.blocks()) {
	for(auto pin_id : atom_ctx.nlist.block_clock_pins(blk_id)) {
	auto net_id = atom_ctx.nlist.pin_net(pin_id);
	if (!is_clock.count(net_id)) {
	is_clock.insert(net_id);
	num_clocks++;
	}
	}
	}

	/* If we have multiple clocks and we're supposed to declare them global, *
	* print a warning message, since it looks like this circuit may have *
	* locally generated clocks. */

	if (num_clocks > 1 && global_clocks) {
	vtr::printf_warning(__FILE__, __LINE__,
	"All %d clocks will be treated as global.\n", num_clocks);
	}

	return (is_clock);
	}

	#ifdef USE_HMETIS
	/* Reads in the output of the hMetis partitioner and returns
	* A 2-D vector that contains all the blocks in each partition
	*/
	static vtr::vector_map<AtomBlockId, int> read_hmetis_graph(string &hmetis_output_file, const int num_parts) {
	ifstream fp(hmetis_output_file.c_str());
	vtr::vector_map<AtomBlockId, int> partitions;
	string line;
	int block_num = 0; //Indexing for CLB's start at 0

	// Check that # of lines in output file matches # of blocks/vertices
	auto& atom_ctx = g_vpr_ctx.atom();
	VTR_ASSERT((int)atom_ctx.nlist.blocks().size() == count(istreambuf_iterator<char>(fp), istreambuf_iterator<char>(), '\n'));

	partitions.resize(atom_ctx.nlist.blocks().size());

	// Reset the filestream to the beginning and reread
	fp.clear();
	fp.seekg(0, fp.beg);

	while (getline(fp, line)) {
	if (stoi(line) >= num_parts) {
	vpr_throw(VPR_ERROR_OTHER, __FILE__, __LINE__,
	"Partition for line '%s' exceeds the set number of partitions %d" , line, num_parts);
	}

	partitions[AtomBlockId(block_num)] = stoi(line);
	block_num++;
	}

	fp.close();

	return partitions;
	}
	#endif