vpr/SRC/place/place.c - third_party/vtr-verilog-to-routing - Git at Google

 #include <cstdio>
 #include <cmath>
 using namespace std;

 #include <assert.h>

 #include "util.h"
 #include "vpr_types.h"
 #include "globals.h"
 #include "place.h"
 #include "read_place.h"
 #include "draw.h"
 #include "place_and_route.h"
 #include "net_delay.h"
 #include "path_delay.h"
 #include "timing_place_lookup.h"
 #include "timing_place.h"
 #include "place_stats.h"
 #include "read_xml_arch_file.h"
 #include "ReadOptions.h"
 #include "vpr_utils.h"
 #include "place_macro.h"

 /************** Types and defines local to place.c ***************************/

 /* Cut off for incremental bounding box updates.                          *
  * 4 is fastest -- I checked.                                             */
 /* To turn off incremental bounding box updates, set this to a huge value */
 #define SMALL_NET 4

 /* This defines the error tolerance for floating points variables used in *
  * cost computation. 0.01 means that there is a 1% error tolerance.       */
 #define ERROR_TOL .01

 /* This defines the maximum number of swap attempts before invoking the   *
  * once-in-a-while placement legality check as well as floating point     *
  * variables round-offs check.                                            */
 #define MAX_MOVES_BEFORE_RECOMPUTE 50000

 /* The maximum number of tries when trying to place a carry chain at a    *
  * random location before trying exhaustive placement - find the fist     *
  * legal position and place it during initial placement.                  */
 #define MAX_NUM_TRIES_TO_PLACE_MACROS_RANDOMLY 4

 /* Flags for the states of the bounding box.                              *
  * Stored as char for memory efficiency.                                  */
 #define NOT_UPDATED_YET 'N'
 #define UPDATED_ONCE 'U'
 #define GOT_FROM_SCRATCH 'S'

 /* For comp_cost.  NORMAL means use the method that generates updateable  *
  * bounding boxes for speed.  CHECK means compute all bounding boxes from *
  * scratch using a very simple routine to allow checks of the other       *
  * costs.                                                                 */
 enum cost_methods {
 	NORMAL, CHECK
 };

 /* This is for the placement swap routines. A swap attempt could be       *
  * rejected, accepted or aborted (due to the limitations placed on the    *
  * carry chain support at this point).                                    */
 enum swap_result {
 	REJECTED, ACCEPTED, ABORTED
 };

 struct s_placer_statistics {
 	double av_cost, av_bb_cost, av_timing_cost,
 	       sum_of_squares, av_delay_cost;
 	int success_sum;
 };

 typedef struct s_placer_statistics t_placer_statistics;

 #define MAX_INV_TIMING_COST 1.e9
 /* Stops inverse timing cost from going to infinity with very lax timing constraints,
 which avoids multiplying by a gigantic inverse_prev_timing_cost when auto-normalizing.
 The exact value of this cost has relatively little impact, but should not be
 large enough to be on the order of timing costs for normal constraints. */

 /********************** Variables local to place.c ***************************/

 /* Cost of a net, and a temporary cost of a net used during move assessment. */
 static float *net_cost = NULL, *temp_net_cost = NULL; /* [0..g_clbs_nlist.net.size()-1] */

 static t_legal_pos **legal_pos = NULL; /* [0..num_types-1][0..type_tsize - 1] */
 static int *num_legal_pos = NULL; /* [0..num_legal_pos-1] */

 /* [0...g_clbs_nlist.net.size()-1]                                               *
  * A flag array to indicate whether the specific bounding box has been updated   *
  * in this particular swap or not. If it has been updated before, the code       *
  * must use the updated data, instead of the out-of-date data passed into the    *
  * subroutine, particularly used in try_swap(). The value NOT_UPDATED_YET        *
  * indicates that the net has not been updated before, UPDATED_ONCE indicated    *
  * that the net has been updated once, if it is going to be updated again, the   *
  * values from the previous update must be used. GOT_FROM_SCRATCH is only        *
  * applicable for nets larger than SMALL_NETS and it indicates that the          *
  * particular bounding box cannot be updated incrementally before, hence the     *
  * bounding box is got from scratch, so the bounding box would definitely be     *
  * right, DO NOT update again.                                                   *
  * [0...g_clbs_nlist.net.size()-1]                                               */
 static char * bb_updated_before = NULL;

 /* [0..g_clbs_nlist.net.size()-1][1..num_pins-1]. What is the value of the timing   */
 /* driven portion of the cost function. These arrays will be set to  */
 /* (criticality * delay) for each point to point connection. */
 static float **point_to_point_timing_cost = NULL;
 static float **temp_point_to_point_timing_cost = NULL;

 /* [0..g_clbs_nlist.net.size()-1][1..num_pins-1]. What is the value of the delay */
 /* for each connection in the circuit */
 static float **point_to_point_delay_cost = NULL;
 static float **temp_point_to_point_delay_cost = NULL;

 /* [0..num_blocks-1][0..pins_per_clb-1]. Indicates which pin on the net */
 /* this block corresponds to, this is only required during timing-driven */
 /* placement. It is used to allow us to update individual connections on */
 /* each net */
 static int **net_pin_index = NULL;

 /* [0..g_clbs_nlist.net.size()-1].  Store the bounding box coordinates and the number of    *
  * blocks on each of a net's bounding box (to allow efficient updates),      *
  * respectively.                                                             */

 static struct s_bb *bb_coords = NULL, *bb_num_on_edges = NULL;

 /* Store the information on the blocks to be moved in a swap during     *
  * placement, in the form of array of structs instead of struct with    *
  * arrays for cache effifiency                                          *
  */
 static t_pl_blocks_to_be_moved blocks_affected;

 /* The arrays below are used to precompute the inverse of the average   *
  * number of tracks per channel between [subhigh] and [sublow].  Access *
  * them as chan?_place_cost_fac[subhigh][sublow].  They are used to     *
  * speed up the computation of the cost function that takes the length  *
  * of the net bounding box in each dimension, divided by the average    *
  * number of tracks in that direction; for other cost functions they    *
  * will never be used.                                                  *
  *                [0...ny]                [0...nx]                      */
 static float **chanx_place_cost_fac, **chany_place_cost_fac;

 /* The following arrays are used by the try_swap function for speed.   */
 /* [0...g_clbs_nlist.net.size()-1] */
 static struct s_bb *ts_bb_coord_new = NULL;
 static struct s_bb *ts_bb_edge_new = NULL;
 static int *ts_nets_to_update = NULL;

 /* The pl_macros array stores all the carry chains placement macros.   *
  * [0...num_pl_macros-1]                                                  */
 static t_pl_macro * pl_macros = NULL;
 static int num_pl_macros;

 /* These file-scoped variables keep track of the number of swaps       *
  * rejected, accepted or aborted. The total number of swap attempts    *
  * is the sum of the three number.                                     */
 static int num_swap_rejected = 0;
 static int num_swap_accepted = 0;
 static int num_swap_aborted = 0;
 static int num_ts_called = 0;

 /* Expected crossing counts for nets with different #'s of pins.  From *
  * ICCAD 94 pp. 690 - 695 (with linear interpolation applied by me).   *
  * Multiplied to bounding box of a net to better estimate wire length  *
  * for higher fanout nets. Each entry is the correction factor for the *
  * fanout index-1                                                      */
 static const float cross_count[50] = { /* [0..49] */1.0, 1.0, 1.0, 1.0828, 1.1536, 1.2206, 1.2823, 1.3385, 1.3991, 1.4493, 1.4974,
 		1.5455, 1.5937, 1.6418, 1.6899, 1.7304, 1.7709, 1.8114, 1.8519, 1.8924,
 		1.9288, 1.9652, 2.0015, 2.0379, 2.0743, 2.1061, 2.1379, 2.1698, 2.2016,
 		2.2334, 2.2646, 2.2958, 2.3271, 2.3583, 2.3895, 2.4187, 2.4479, 2.4772,
 		2.5064, 2.5356, 2.5610, 2.5864, 2.6117, 2.6371, 2.6625, 2.6887, 2.7148,
 		2.7410, 2.7671, 2.7933 };

 /********************* Static subroutines local to place.c *******************/
 #ifdef VERBOSE
 	static void print_clb_placement(const char *fname);
 #endif

 static void alloc_and_load_placement_structs(
 		float place_cost_exp, float ***old_region_occ_x,
 		float ***old_region_occ_y, struct s_placer_opts placer_opts,
 		t_direct_inf *directs, int num_directs, int num_segments);

 static void alloc_and_load_try_swap_structs();

 static void free_placement_structs(
 		float **old_region_occ_x, float **old_region_occ_y,
 		struct s_placer_opts placer_opts);

 static void alloc_and_load_for_fast_cost_update(float place_cost_exp);

 static void free_fast_cost_update(void);

 static void alloc_legal_placements();
 static void load_legal_placements();

 static void free_legal_placements();

 static int check_macro_can_be_placed(int imacro, int itype, int x, int y, int z);

 static int try_place_macro(int itype, int ipos, int imacro, int * free_locations);

 static void initial_placement_pl_macros(int macros_max_num_tries, int * free_locations);

 static void initial_placement_blocks(int * free_locations, enum e_pad_loc_type pad_loc_type,
 		bool apply_placement_regions = false);
 static void initial_placement_location(int * free_locations, int iblk,
 		int *pipos, int *px, int *py, int *pz);

 static void initial_placement(enum e_pad_loc_type pad_loc_type,
 		char *pad_loc_file);

 static float comp_bb_cost(enum cost_methods method);

 static int setup_blocks_affected(int b_from, int x_to, int y_to, int z_to);

 static int find_affected_blocks(int b_from, int x_to, int y_to, int z_to);

 static enum swap_result try_swap(float t, float *cost, float *bb_cost, float *timing_cost,
 		float rlim, float **old_region_occ_x,
 		float **old_region_occ_y,
 		enum e_place_algorithm place_algorithm, float timing_tradeoff,
 		float inverse_prev_bb_cost, float inverse_prev_timing_cost,
 		float *delay_cost);

 static void check_place(float bb_cost, float timing_cost,
 		enum e_place_algorithm place_algorithm,
 		float delay_cost);

 static float starting_t(float *cost_ptr, float *bb_cost_ptr,
 		float *timing_cost_ptr, float **old_region_occ_x,
 		float **old_region_occ_y,
 		struct s_annealing_sched annealing_sched, int max_moves, float rlim,
 		enum e_place_algorithm place_algorithm, float timing_tradeoff,
 		float inverse_prev_bb_cost, float inverse_prev_timing_cost,
 		float *delay_cost_ptr);

 static void update_t(float *t, float std_dev, float rlim, float success_rat,
 		struct s_annealing_sched annealing_sched);

 static void update_rlim(float *rlim, float success_rat);

 static int exit_crit(float t, float cost,
 		struct s_annealing_sched annealing_sched);

 static int count_connections(void);

 static double get_std_dev(int n, double sum_x_squared, double av_x);

 static float recompute_bb_cost(void);

 static float comp_td_point_to_point_delay(int inet, int ipin);

 static void update_td_cost(void);

 static void comp_delta_td_cost(float *delta_timing, float *delta_delay);

 static void comp_td_costs(float *timing_cost, float *connection_delay_sum);

 static enum swap_result assess_swap(float delta_c, float t);

 static bool find_to(t_type_ptr type, float rlim,
 		int iblk_from, int x_from, int y_from,
 		int *px_to, int *py_to, int *pz_to);
 static void find_to_location(t_type_ptr type, float rlim,
 		int iblk_from, int x_from, int y_from,
 		int *px_to, int *py_to, int *pz_to);

 static void get_non_updateable_bb(int inet, struct s_bb *bb_coord_new);

 static void update_bb(int inet, struct s_bb *bb_coord_new,
 		struct s_bb *bb_edge_new, int xold, int yold, int xnew, int ynew);

 static int find_affected_nets(int *nets_to_update);

 static float get_net_cost(int inet, struct s_bb *bb_ptr);

 static void get_bb_from_scratch(int inet, struct s_bb *coords,
 		struct s_bb *num_on_edges);

 static double get_net_wirelength_estimate(int inet, struct s_bb *bbptr);

 static void free_try_swap_arrays(void);

 static void outer_loop_recompute_criticalities(struct s_placer_opts placer_opts,
 	int num_connections, t_slack * slacks, float crit_exponent, float bb_cost,
 	float * place_delay_value, float * timing_cost, float * delay_cost,
 	int * outer_crit_iter_count, float * inverse_prev_timing_cost,
 	float * inverse_prev_bb_cost, float ** net_delay, const t_timing_inf &timing_inf);

 static void placement_inner_loop(float t, float rlim, struct s_placer_opts placer_opts,
 	float inverse_prev_bb_cost, float inverse_prev_timing_cost, int move_lim,
 	int num_connections, t_slack * slacks, float crit_exponent, int inner_recompute_limit,
 	t_placer_statistics *stats, float * cost, float * bb_cost, float * timing_cost,
 	float ** old_region_occ_x, float ** old_region_occ_y, float * delay_cost,
     float * place_delay_value, float ** net_delay, const t_timing_inf &timing_inf);

 /*****************************************************************************/
 void try_place(struct s_placer_opts placer_opts,
 		struct s_annealing_sched annealing_sched,
 		t_chan_width_dist chan_width_dist, struct s_router_opts router_opts,
 		struct s_det_routing_arch *det_routing_arch, t_segment_inf * segment_inf,
 		t_timing_inf timing_inf, t_direct_inf *directs, int num_directs) {

 	/* Does almost all the work of placing a circuit.  Width_fac gives the   *
 	 * width of the widest channel.  Place_cost_exp says what exponent the   *
 	 * width should be taken to when calculating costs.  This allows a       *
 	 * greater bias for anisotropic architectures.                           */

 	int tot_iter, move_lim, moves_since_cost_recompute, width_fac, num_connections,
 		outer_crit_iter_count, inner_recompute_limit;
 	unsigned int ipin, inet;
 	float t, success_rat, rlim, cost, timing_cost, bb_cost, new_bb_cost, new_timing_cost,
 		delay_cost, new_delay_cost, place_delay_value, inverse_prev_bb_cost, inverse_prev_timing_cost,
 		oldt, **old_region_occ_x, **old_region_occ_y, **net_delay = NULL, crit_exponent,
 		first_rlim, final_rlim, inverse_delta_rlim, critical_path_delay = UNDEFINED,
 		**remember_net_delay_original_ptr; /*used to free net_delay if it is re-assigned */
 	double std_dev;
 	int total_swap_attempts;
 	float reject_rate;
 	float accept_rate;
 	float abort_rate;
 	char msg[BUFSIZE];
 	t_placer_statistics stats;
 	t_slack * slacks = NULL;

 	/* Allocated here because it goes into timing critical code where each memory allocation is expensive */

 	remember_net_delay_original_ptr = NULL; /*prevents compiler warning */

 	/* init file scope variables */
 	num_swap_rejected = 0;
 	num_swap_accepted = 0;
 	num_swap_aborted = 0;
 	num_ts_called = 0;

 	if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE
 			|| placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE
 			|| placer_opts.enable_timing_computations) {
 		/*do this before the initial placement to avoid messing up the initial placement */
 		slacks = alloc_lookups_and_criticalities(chan_width_dist, router_opts,
 				det_routing_arch, segment_inf, timing_inf, &net_delay, directs, num_directs);

 		remember_net_delay_original_ptr = net_delay;

 		/*#define PRINT_LOWER_BOUND */
 #ifdef PRINT_LOWER_BOUND
 		/*print the crit_path, assuming delay between blocks that are*
 		 *block_dist apart*/

 		if (placer_opts.block_dist <= nx)
 		place_delay_value =
 		delta_clb_to_clb[placer_opts.block_dist][0];
 		else if (placer_opts.block_dist <= ny)
 		place_delay_value =
 		delta_clb_to_clb[0][placer_opts.block_dist];
 		else
 		place_delay_value = delta_clb_to_clb[nx][ny];

 		vpr_printf_info("\n");
 		vpr_printf_info("Lower bound assuming delay of %g\n", place_delay_value);

 		load_constant_net_delay(net_delay, place_delay_value);
 		load_timing_graph_net_delays(net_delay);
 		do_timing_analysis(slacks, timing_inf, false, true);

 		if (getEchoEnabled()) {
 			if(isEchoFileEnabled(E_ECHO_PLACEMENT_CRITICAL_PATH))
 				print_critical_path(getEchoFileName(E_ECHO_PLACEMENT_CRITICAL_PATH), timing_inf);
 			if(isEchoFileEnabled(E_ECHO_PLACEMENT_LOWER_BOUND_SINK_DELAYS))
 				print_sink_delays(getEchoFileName(E_ECHO_PLACEMENT_LOWER_BOUND_SINK_DELAYS));
 			if(isEchoFileEnabled(E_ECHO_PLACEMENT_LOGIC_SINK_DELAYS))
 				print_sink_delays(getEchoFileName(E_ECHO_PLACEMENT_LOGIC_SINK_DELAYS));
 		}

 		/*also print sink delays assuming 0 delay between blocks,
 		 * this tells us how much logic delay is on each path */

 		load_constant_net_delay(net_delay, 0);
 		load_timing_graph_net_delays(net_delay);
 		do_timing_analysis(slacks, timing_inf, false, true);

 #endif

 	}

 	width_fac = placer_opts.place_chan_width;

 	init_chan(width_fac, &router_opts.fixed_channel_width, chan_width_dist);

 	alloc_and_load_placement_structs(
 			placer_opts.place_cost_exp,
 			&old_region_occ_x, &old_region_occ_y, placer_opts,
 			directs, num_directs, det_routing_arch->num_segment);

 	initial_placement(placer_opts.pad_loc_type, placer_opts.pad_loc_file);
 	init_draw_coords((float) width_fac);

 	/* Storing the number of pins on each type of block makes the swap routine *
 	 * slightly more efficient.                                                */

 	/* Gets initial cost and loads bounding boxes. */

 	if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE
 			|| placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
 		bb_cost = comp_bb_cost(NORMAL);

 		crit_exponent = placer_opts.td_place_exp_first; /*this will be modified when rlim starts to change */

 		num_connections = count_connections();
 		vpr_printf_info("\n");
 		vpr_printf_info("There are %d point to point connections in this circuit.\n", num_connections);
 		vpr_printf_info("\n");

 		if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE) {
 			for (inet = 0; inet < g_clbs_nlist.net.size(); inet++)
 				for (ipin = 1; ipin < g_clbs_nlist.net[inet].pins.size(); ipin++)
 					timing_place_crit[inet][ipin] = 0; /*dummy crit values */

 			comp_td_costs(&timing_cost, &delay_cost); /*first pass gets delay_cost, which is used
 			 * in criticality computations in the next call
 			 * to comp_td_costs. */
 			place_delay_value = delay_cost / num_connections; /*used for computing criticalities */
 			load_constant_net_delay(net_delay, place_delay_value, g_clbs_nlist.net,
 				g_clbs_nlist.net.size());

 		} else
 			place_delay_value = 0;

 		if (placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
 			net_delay = point_to_point_delay_cost; /*this keeps net_delay up to date with      *
 			 * *the same values that the placer is using  *
 			 * *point_to_point_delay_cost is computed each*
 			 * *time that comp_td_costs is called, and is *
 			 * *also updated after any swap is accepted   */
 		}

 		load_timing_graph_net_delays(net_delay);
 		do_timing_analysis(slacks, timing_inf, false, false);
 		load_criticalities(slacks, crit_exponent);
 		if (getEchoEnabled()) {
 			if(isEchoFileEnabled(E_ECHO_INITIAL_PLACEMENT_TIMING_GRAPH))
 				print_timing_graph(getEchoFileName(E_ECHO_INITIAL_PLACEMENT_TIMING_GRAPH));
 			if(isEchoFileEnabled(E_ECHO_INITIAL_PLACEMENT_SLACK))
 				print_slack(slacks->slack, false, getEchoFileName(E_ECHO_INITIAL_PLACEMENT_SLACK));
 			if(isEchoFileEnabled(E_ECHO_INITIAL_PLACEMENT_CRITICALITY))
 				print_criticality(slacks, getEchoFileName(E_ECHO_INITIAL_PLACEMENT_CRITICALITY));
 		}
 		outer_crit_iter_count = 1;

 		/*now we can properly compute costs  */
 		comp_td_costs(&timing_cost, &delay_cost); /*also vpr_printf proper values into point_to_point_delay_cost */

 		/* Timing cost appears to be 0 for small circuits without any critical paths, this will cause costs to be
 		indefinite values later, so set timing_cost to a small number to prevent it						*/
 		if (timing_cost == 0)
 			timing_cost = 1e-9;

 		inverse_prev_timing_cost = 1 / timing_cost;
 		inverse_prev_bb_cost = 1 / bb_cost;
 		cost = 1; /*our new cost function uses normalized values of           */
 		/*bb_cost and timing_cost, the value of cost will be reset  */
 		/*to 1 at each temperature when *_TIMING_DRIVEN_PLACE is true */
 	} else { /*BOUNDING_BOX_PLACE */
 		cost = bb_cost = comp_bb_cost(NORMAL);
 		timing_cost = 0;
 		delay_cost = 0;
 		place_delay_value = 0;
 		outer_crit_iter_count = 0;
 		num_connections = 0;
 		crit_exponent = 0;

 		inverse_prev_timing_cost = 0; /*inverses not used */
 		inverse_prev_bb_cost = 0;
 	}

 	move_lim = (int) (annealing_sched.inner_num * pow(num_blocks, 1.3333));

 	if (placer_opts.inner_loop_recompute_divider != 0)
 		inner_recompute_limit = (int)
 			(0.5 + (float) move_lim	/ (float) placer_opts.inner_loop_recompute_divider);
 	else
 		/*don't do an inner recompute */
 		inner_recompute_limit = move_lim + 1;

 	/* Sometimes I want to run the router with a random placement.  Avoid *
 	 * using 0 moves to stop division by 0 and 0 length vector problems,  *
 	 * by setting move_lim to 1 (which is still too small to do any       *
 	 * significant optimization).                                         */

 	if (move_lim <= 0)
 		move_lim = 1;

 	rlim = (float) max(nx + 1, ny + 1);

 	first_rlim = rlim; /*used in timing-driven placement for exponent computation */
 	final_rlim = 1;
 	inverse_delta_rlim = 1 / (first_rlim - final_rlim);

 	t = starting_t(&cost, &bb_cost, &timing_cost, old_region_occ_x, old_region_occ_y,
 			annealing_sched, move_lim, rlim,
 			placer_opts.place_algorithm, placer_opts.timing_tradeoff,
 			inverse_prev_bb_cost, inverse_prev_timing_cost, &delay_cost);

 	tot_iter = 0;
 	moves_since_cost_recompute = 0;
 	vpr_printf_info("Initial placement cost: %g bb_cost: %g td_cost: %g delay_cost: %g\n",
 			cost, bb_cost, timing_cost, delay_cost);
 	vpr_printf_info("\n");

 #ifndef SPEC
 	vpr_printf_info("%7s %7s %10s %10s %10s %10s %7s %7s %7s %7s %6s %9s %6s\n",
 			"-------", "-------", "----------", "----------", "----------", "----------",
 			"-------", "-------", "-------", "-------", "------", "---------", "------");
 	vpr_printf_info("%7s %7s %10s %10s %10s %10s %7s %7s %7s %7s %6s %9s %6s\n",
 			"T", "Cost", "Av BB Cost", "Av TD Cost", "Av Tot Del",
 			"P to P Del", "d_max", "Ac Rate", "Std Dev", "R limit", "Exp",
 			"Tot Moves", "Alpha");
 	vpr_printf_info("%7s %7s %10s %10s %10s %10s %7s %7s %7s %7s %6s %9s %6s\n",
 			"-------", "-------", "----------", "----------", "----------", "----------",
 			"-------", "-------", "-------", "-------", "------", "---------", "------");
 #endif

 	sprintf(msg, "Initial Placement.  Cost: %g  BB Cost: %g  TD Cost %g  Delay Cost: %g \t Channel Factor: %d",
 		cost, bb_cost, timing_cost, delay_cost, width_fac);
 	update_screen(MAJOR, msg, PLACEMENT, false, timing_inf);


 	/* Outer loop of the simmulated annealing begins */
 	while (exit_crit(t, cost, annealing_sched) == 0) {

 		if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE
 				|| placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
 			cost = 1;
 		}

 		outer_loop_recompute_criticalities(placer_opts, num_connections, slacks,
 			crit_exponent, bb_cost, &place_delay_value, &timing_cost, &delay_cost,
 			&outer_crit_iter_count, &inverse_prev_timing_cost, &inverse_prev_bb_cost, net_delay, timing_inf);

 		placement_inner_loop(t, rlim, placer_opts, inverse_prev_bb_cost, inverse_prev_timing_cost,
 			move_lim, num_connections, slacks, crit_exponent, inner_recompute_limit, &stats,
 			&cost, &bb_cost, &timing_cost, old_region_occ_x, old_region_occ_y, &delay_cost,
 			&place_delay_value, net_delay, timing_inf);

 		/* Lines below prevent too much round-off error from accumulating *
 		 * in the cost over many iterations.  This round-off can lead to  *
 		 * error checks failing because the cost is different from what   *
 		 * you get when you recompute from scratch.                       */

 		moves_since_cost_recompute += move_lim;
 		if (moves_since_cost_recompute > MAX_MOVES_BEFORE_RECOMPUTE) {
 			new_bb_cost = recompute_bb_cost();
 			if (fabs(new_bb_cost - bb_cost) > bb_cost * ERROR_TOL) {
 				vpr_throw(VPR_ERROR_PLACE, __FILE__, __LINE__,
 						"in try_place: new_bb_cost = %g, old bb_cost = %g\n",
 						new_bb_cost, bb_cost);
 			}
 			bb_cost = new_bb_cost;

 			if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE
 					|| placer_opts.place_algorithm
 							== PATH_TIMING_DRIVEN_PLACE) {
 				comp_td_costs(&new_timing_cost, &new_delay_cost);
 				if (fabs(new_timing_cost - timing_cost) > timing_cost * ERROR_TOL) {
 					vpr_throw(VPR_ERROR_PLACE, __FILE__, __LINE__,
 							"in try_place: new_timing_cost = %g, old timing_cost = %g, ERROR_TOL = %g\n",
 							new_timing_cost, timing_cost, ERROR_TOL);
 				}
 				if (fabs(new_delay_cost - delay_cost) > delay_cost * ERROR_TOL) {
 					vpr_throw(VPR_ERROR_PLACE, __FILE__, __LINE__,
 							"in try_place: new_delay_cost = %g, old delay_cost = %g, ERROR_TOL = %g\n",
 							new_delay_cost, delay_cost, ERROR_TOL);
 				}
 				timing_cost = new_timing_cost;
 			}

 			if (placer_opts.place_algorithm == BOUNDING_BOX_PLACE) {
 				cost = new_bb_cost;
 			}
 			moves_since_cost_recompute = 0;
 		}

 		tot_iter += move_lim;
 		success_rat = ((float) stats.success_sum) / move_lim;
 		if (stats.success_sum == 0) {
 			stats.av_cost = cost;
 			stats.av_bb_cost = bb_cost;
 			stats.av_timing_cost = timing_cost;
 			stats.av_delay_cost = delay_cost;
 		} else {
 			stats.av_cost /= stats.success_sum;
 			stats.av_bb_cost /= stats.success_sum;
 			stats.av_timing_cost /= stats.success_sum;
 			stats.av_delay_cost /= stats.success_sum;
 		}
 		std_dev = get_std_dev(stats.success_sum, stats.sum_of_squares, stats.av_cost);

 		oldt = t; /* for finding and printing alpha. */
 		update_t(&t, std_dev, rlim, success_rat, annealing_sched);

 #ifndef SPEC
 		critical_path_delay = get_critical_path_delay();
 		if(oldt >= 100.0 - EPSILON) {
 			vpr_printf_info("%7.3f %7.5f %10.4f %-10.5g %-10.5g %-10.5g %7.4f %7.4f %7.4f %7.4f %6.3f %9d %6.3f\n",
 					oldt, stats.av_cost, stats.av_bb_cost, stats.av_timing_cost,
 					stats.av_delay_cost, place_delay_value, critical_path_delay,
 					success_rat, std_dev, rlim, crit_exponent, tot_iter, t / oldt);
 		} else if(oldt >= 10.0 - EPSILON) {
 			vpr_printf_info("%7.4f %7.5f %10.4f %-10.5g %-10.5g %-10.5g %7.4f %7.4f %7.4f %7.4f %6.3f %9d %6.3f\n",
 					oldt, stats.av_cost, stats.av_bb_cost, stats.av_timing_cost,
 					stats.av_delay_cost, place_delay_value, critical_path_delay,
 					success_rat, std_dev, rlim, crit_exponent, tot_iter, t / oldt);
 		} else {
 			vpr_printf_info("%7.5f %7.5f %10.4f %-10.5g %-10.5g %-10.5g %7.4f %7.4f %7.4f %7.4f %6.3f %9d %6.3f\n",
 					oldt, stats.av_cost, stats.av_bb_cost, stats.av_timing_cost,
 					stats.av_delay_cost, place_delay_value, critical_path_delay,
 					success_rat, std_dev, rlim, crit_exponent, tot_iter, t / oldt);
 		}
 #endif

 		sprintf(msg, "Cost: %g  BB Cost %g  TD Cost %g  Temperature: %g",
 				cost, bb_cost, timing_cost, t);
 		update_screen(MINOR, msg, PLACEMENT, false, timing_inf);
 		update_rlim(&rlim, success_rat);

 		if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE
 				|| placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
 			crit_exponent = (1 - (rlim - final_rlim) * inverse_delta_rlim)
 					* (placer_opts.td_place_exp_last - placer_opts.td_place_exp_first)
 					+ placer_opts.td_place_exp_first;
 		}
 #ifdef VERBOSE
 		if (getEchoEnabled()) {
 			print_clb_placement("first_iteration_clb_placement.echo");
 		}
 #endif
 	}
 	/* Outer loop of the simmulated annealing ends */


 	outer_loop_recompute_criticalities(placer_opts, num_connections, slacks,
 			crit_exponent, bb_cost, &place_delay_value, &timing_cost, &delay_cost,
 			&outer_crit_iter_count, &inverse_prev_timing_cost, &inverse_prev_bb_cost, net_delay, timing_inf);

 	t = 0; /* freeze out */

 	/* Run inner loop again with temperature = 0 so as to accept only swaps
 	 * which reduce the cost of the placement */
 	placement_inner_loop(t, rlim, placer_opts, inverse_prev_bb_cost, inverse_prev_timing_cost,
 			move_lim, num_connections, slacks, crit_exponent, inner_recompute_limit, &stats,
 			&cost, &bb_cost, &timing_cost, old_region_occ_x, old_region_occ_y, &delay_cost,
 			&place_delay_value, net_delay, timing_inf);

 	tot_iter += move_lim;
 	success_rat = ((float) stats.success_sum) / move_lim;
 	if (stats.success_sum == 0) {
 		stats.av_cost = cost;
 		stats.av_bb_cost = bb_cost;
 		stats.av_delay_cost = delay_cost;
 		stats.av_timing_cost = timing_cost;
 	} else {
 		stats.av_cost /= stats.success_sum;
 		stats.av_bb_cost /= stats.success_sum;
 		stats.av_delay_cost /= stats.success_sum;
 		stats.av_timing_cost /= stats.success_sum;
 	}

 	std_dev = get_std_dev(stats.success_sum, stats.sum_of_squares, stats.av_cost);

 #ifndef SPEC
 	vpr_printf_info("%7.5f %7.5f %10.4f %-10.5g %-10.5g %-10.5g %7s %7.4f %7.4f %7.4f %6.3f %9d\n",
 			t, stats.av_cost, stats.av_bb_cost, stats.av_timing_cost, stats.av_delay_cost, place_delay_value,
 			" ", success_rat, std_dev, rlim, crit_exponent, tot_iter);
 #endif

 	// TODO:
 	// 1. print a message about number of aborted moves.
 	// 2. add some subroutine hierarchy!  Too big!

 #ifdef VERBOSE
 	if (getEchoEnabled() && isEchoFileEnabled(E_ECHO_END_CLB_PLACEMENT)) {
 		print_clb_placement(getEchoFileName(E_ECHO_END_CLB_PLACEMENT));
 	}
 #endif

 	check_place(bb_cost, timing_cost,
 			placer_opts.place_algorithm, delay_cost);

 	if (placer_opts.enable_timing_computations
 			&& placer_opts.place_algorithm == BOUNDING_BOX_PLACE) {
 		/*need this done since the timing data has not been kept up to date*
 		 *in bounding_box mode */
 		for (inet = 0; inet < g_clbs_nlist.net.size(); inet++)
 			for (ipin = 1; ipin < g_clbs_nlist.net[inet].pins.size(); ipin++)
 				timing_place_crit[inet][ipin] = 0; /*dummy crit values */
 		comp_td_costs(&timing_cost, &delay_cost); /*computes point_to_point_delay_cost */
 	}

 	if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE
 			|| placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE
 			|| placer_opts.enable_timing_computations) {
 		net_delay = point_to_point_delay_cost; /*this makes net_delay up to date with    *
 		 *the same values that the placer is using*/
 		load_timing_graph_net_delays(net_delay);

 		do_timing_analysis(slacks, timing_inf, false, false);

 		if (getEchoEnabled()) {
 			if(isEchoFileEnabled(E_ECHO_PLACEMENT_SINK_DELAYS))
 				print_sink_delays(getEchoFileName(E_ECHO_PLACEMENT_SINK_DELAYS));
 			if(isEchoFileEnabled(E_ECHO_FINAL_PLACEMENT_SLACK))
 				print_slack(slacks->slack, false, getEchoFileName(E_ECHO_FINAL_PLACEMENT_SLACK));
 			if(isEchoFileEnabled(E_ECHO_FINAL_PLACEMENT_CRITICALITY))
 				print_criticality(slacks, getEchoFileName(E_ECHO_FINAL_PLACEMENT_CRITICALITY));
 			if(isEchoFileEnabled(E_ECHO_FINAL_PLACEMENT_TIMING_GRAPH))
 				print_timing_graph(getEchoFileName(E_ECHO_FINAL_PLACEMENT_TIMING_GRAPH));
 			if(isEchoFileEnabled(E_ECHO_PLACEMENT_CRIT_PATH))
 				print_critical_path(getEchoFileName(E_ECHO_PLACEMENT_CRIT_PATH), timing_inf);
 		}

 		/* Print critical path delay. */
 		critical_path_delay = get_critical_path_delay();
 		vpr_printf_info("\n");
 		vpr_printf_info("Placement estimated critical path delay: %g ns\n", critical_path_delay);
 	}

 	sprintf(msg, "Placement. Cost: %g  bb_cost: %g td_cost: %g Channel Factor: %d",
 			cost, bb_cost, timing_cost, width_fac);
 	vpr_printf_info("Placement cost: %g, bb_cost: %g, td_cost: %g, delay_cost: %g\n",
 			cost, bb_cost, timing_cost, delay_cost);
 	update_screen(MAJOR, msg, PLACEMENT, false, timing_inf);

 	// Print out swap statistics
 	total_swap_attempts = num_swap_rejected + num_swap_accepted + num_swap_aborted;
 	reject_rate = num_swap_rejected / total_swap_attempts;
 	accept_rate = num_swap_accepted / total_swap_attempts;
 	abort_rate = num_swap_aborted / total_swap_attempts;
 	vpr_printf_info("Placement total # of swap attempts: %d\n", total_swap_attempts);
 	vpr_printf_info("\tSwap reject rate: %g\n", reject_rate);
 	vpr_printf_info("\tSwap accept rate: %g\n", accept_rate);
 	vpr_printf_info("\tSwap abort rate: %g\n",	abort_rate);


 #ifdef SPEC
 	vpr_printf_info("Total moves attempted: %d.0\n", tot_iter);
 #endif

 	free_placement_structs(
 				old_region_occ_x, old_region_occ_y,
 				placer_opts);
 	if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE
 			|| placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE
 			|| placer_opts.enable_timing_computations) {

 		net_delay = remember_net_delay_original_ptr;
 		free_lookups_and_criticalities(&net_delay, slacks);
 	}

 	free_try_swap_arrays();
 }

 /* Function to recompute the criticalities before the inner loop of the annealing */
 static void outer_loop_recompute_criticalities(struct s_placer_opts placer_opts,
 	int num_connections, t_slack * slacks, float crit_exponent, float bb_cost,
 	float * place_delay_value, float * timing_cost, float * delay_cost,
 	int * outer_crit_iter_count, float * inverse_prev_timing_cost,
 	float * inverse_prev_bb_cost, float ** net_delay, const t_timing_inf &timing_inf) {

 	if (placer_opts.place_algorithm != NET_TIMING_DRIVEN_PLACE
 			&& placer_opts.place_algorithm != PATH_TIMING_DRIVEN_PLACE)
 		return;

 	/*at each temperature change we update these values to be used     */
 	/*for normalizing the tradeoff between timing and wirelength (bb)  */
 	if (*outer_crit_iter_count >= placer_opts.recompute_crit_iter
 			|| placer_opts.inner_loop_recompute_divider != 0) {
 #ifdef VERBOSE
 		vpr_printf_info("Outer loop recompute criticalities\n");
 #endif
 		*place_delay_value = (*delay_cost) / num_connections;

 		if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE)
 			load_constant_net_delay(net_delay, *place_delay_value,
 				g_clbs_nlist.net, g_clbs_nlist.net.size());
 		/*note, for path_based, the net delay is not updated since it is current,
 		 *because it accesses point_to_point_delay array */

 		load_timing_graph_net_delays(net_delay);
 		do_timing_analysis(slacks, timing_inf, false, false);
 		load_criticalities(slacks, crit_exponent);
 		/*recompute costs from scratch, based on new criticalities */
 		comp_td_costs(timing_cost, delay_cost);
 		*outer_crit_iter_count = 0;
 	}
 	(*outer_crit_iter_count)++;

 	/*at each temperature change we update these values to be used     */
 	/*for normalizing the tradeoff between timing and wirelength (bb)  */
 	*inverse_prev_bb_cost = 1 / bb_cost;
 	/*Prevent inverse timing cost from going to infinity */
 	*inverse_prev_timing_cost = min(1 / (*timing_cost), (float)MAX_INV_TIMING_COST);
 }

 /* Function which contains the inner loop of the simulated annealing */
 static void placement_inner_loop(float t, float rlim, struct s_placer_opts placer_opts,
 	float inverse_prev_bb_cost, float inverse_prev_timing_cost, int move_lim,
 	int num_connections, t_slack * slacks, float crit_exponent, int inner_recompute_limit,
 	t_placer_statistics *stats, float * cost, float * bb_cost, float * timing_cost,
 	float ** old_region_occ_x, float ** old_region_occ_y, float * delay_cost,
 	float * place_delay_value, float ** net_delay, const t_timing_inf &timing_inf) {

 	int inner_crit_iter_count, inner_iter;
 	int swap_result;

 	stats->av_cost = 0.;
 	stats->av_bb_cost = 0.;
 	stats->av_delay_cost = 0.;
 	stats->av_timing_cost = 0.;
 	stats->sum_of_squares = 0.;
 	stats->success_sum = 0;

 	inner_crit_iter_count = 1;

 	/* Inner loop begins */
 	for (inner_iter = 0; inner_iter < move_lim; inner_iter++) {
 		swap_result = try_swap(t, cost, bb_cost, timing_cost, rlim,
 				old_region_occ_x, old_region_occ_y,
 				placer_opts.place_algorithm, placer_opts.timing_tradeoff,
 				inverse_prev_bb_cost, inverse_prev_timing_cost, delay_cost);

 		if (swap_result == ACCEPTED) {
 			/* Move was accepted.  Update statistics that are useful for the annealing schedule. */
 			stats->success_sum++;
 			stats->av_cost += *cost;
 			stats->av_bb_cost += *bb_cost;
 			stats->av_timing_cost += *timing_cost;
 			stats->av_delay_cost += *delay_cost;
 			stats->sum_of_squares += (*cost) * (*cost);
 			num_swap_accepted++;
 		} else if (swap_result == ABORTED) {
 			num_swap_aborted++;
 		} else { // swap_result == REJECTED
 			num_swap_rejected++;
 		}


 		if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE
 				|| placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE) {

 			/* Do we want to re-timing analyze the circuit to get updated slack and criticality values?
 			 * We do this only once in a while, since it is expensive.
 			 */
 			if (inner_crit_iter_count >= inner_recompute_limit
 					&& inner_iter != move_lim - 1) { /*on last iteration don't recompute */

 				inner_crit_iter_count = 0;
 #ifdef VERBOSE
 				vpr_printf_trace("Inner loop recompute criticalities\n");
 #endif
 				if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE) {
 					/* Use a constant delay per connection as the delay estimate, rather than
 					 * estimating based on the current placement.  Not a great idea, but not the
 					 * default.
 					 */
 					(*place_delay_value) = (*delay_cost) / num_connections;
 					load_constant_net_delay(net_delay, *place_delay_value,
 							g_clbs_nlist.net, g_clbs_nlist.net.size());
 				}

 				/* Using the delays in net_delay, do a timing analysis to update slacks and
 				 * criticalities; then update the timing cost since it will change.
 				 */
 				load_timing_graph_net_delays(net_delay);
 				do_timing_analysis(slacks, timing_inf, false, false);
 				load_criticalities(slacks, crit_exponent);
 				comp_td_costs(timing_cost, delay_cost);
 			}
 			inner_crit_iter_count++;
 		}
 #ifdef VERBOSE
 		vpr_printf_trace("t = %g  cost = %g   bb_cost = %g timing_cost = %g move = %d dmax = %g\n",
 				t, *cost, *bb_cost, *timing_cost, inner_iter, *delay_cost);
 		if (fabs((*bb_cost) - comp_bb_cost(CHECK)) > (*bb_cost) * ERROR_TOL)
 			vpr_throw(VPR_ERROR_PLACE, __FILE__, __LINE__,
 				"fabs((*bb_cost) - comp_bb_cost(CHECK)) > (*bb_cost) * ERROR_TOL");
 #endif
 	}
 	/* Inner loop ends */
 }

 static int count_connections() {
 	/*only count non-global connections */

 	int count;
 	unsigned int inet;

 	count = 0;

 	for (inet = 0; inet < g_clbs_nlist.net.size(); inet++) {

 		if (g_clbs_nlist.net[inet].is_global)
 			continue;

 		count += g_clbs_nlist.net[inet].num_sinks();
 	}
 	return (count);
 }

 static double get_std_dev(int n, double sum_x_squared, double av_x) {

 	/* Returns the standard deviation of data set x.  There are n sample points, *
 	 * sum_x_squared is the summation over n of x^2 and av_x is the average x.   *
 	 * All operations are done in double precision, since round off error can be *
 	 * a problem in the initial temp. std_dev calculation for big circuits.      */

 	double std_dev;

 	if (n <= 1)
 		std_dev = 0.;
 	else
 		std_dev = (sum_x_squared - n * av_x * av_x) / (double) (n - 1);

 	if (std_dev > 0.) /* Very small variances sometimes round negative */
 		std_dev = sqrt(std_dev);
 	else
 		std_dev = 0.;

 	return (std_dev);
 }

 static void update_rlim(float *rlim, float success_rat) {

 	/* Update the range limited to keep acceptance prob. near 0.44.  Use *
 	 * a floating point rlim to allow gradual transitions at low temps.  */

 	float upper_lim;

 	*rlim = (*rlim) * (1. - 0.44 + success_rat);
 	upper_lim = max(nx + 1, ny + 1);
 	*rlim = min(*rlim, upper_lim);
 	*rlim = max(*rlim, (float)1.);
 }

 /* Update the temperature according to the annealing schedule selected. */
 static void update_t(float *t, float std_dev, float rlim, float success_rat,
 		struct s_annealing_sched annealing_sched) {

 	/*  float fac; */

 	if (annealing_sched.type == USER_SCHED) {
 		*t = annealing_sched.alpha_t * (*t);
 	}

 	/* Old standard deviation based stuff is below.  This bogs down horribly
 	 * for big circuits (alu4 and especially bigkey_mod). */
 	/* #define LAMBDA .7  */
 	/* ------------------------------------ */
 #if 0
 	else if (std_dev == 0.)
 	{
 		*t = 0.;
 	}
 	else
 	{
 		fac = exp(-LAMBDA * (*t) / std_dev);
 		fac = max(0.5, fac);
 		*t = (*t) * fac;
 	}
 #endif
 	/* ------------------------------------- */

 	else { /* AUTO_SCHED */
 		if (success_rat > 0.96) {
 			*t = (*t) * 0.5;
 		} else if (success_rat > 0.8) {
 			*t = (*t) * 0.9;
 		} else if (success_rat > 0.15 || rlim > 1.) {
 			*t = (*t) * 0.95;
 		} else {
 			*t = (*t) * 0.8;
 		}
 	}
 }

 static int exit_crit(float t, float cost,
 		struct s_annealing_sched annealing_sched) {

 	/* Return 1 when the exit criterion is met.                        */

 	if (annealing_sched.type == USER_SCHED) {
 		if (t < annealing_sched.exit_t) {
 			return (1);
 		} else {
 			return (0);
 		}
 	}

 	/* Automatic annealing schedule */

 	if (t < 0.005 * cost / g_clbs_nlist.net.size()) {
 		return (1);
 	} else {
 		return (0);
 	}
 }

 static float starting_t(float *cost_ptr, float *bb_cost_ptr,
 		float *timing_cost_ptr, float **old_region_occ_x,
 		float **old_region_occ_y,
 		struct s_annealing_sched annealing_sched, int max_moves, float rlim,
 		enum e_place_algorithm place_algorithm, float timing_tradeoff,
 		float inverse_prev_bb_cost, float inverse_prev_timing_cost,
 		float *delay_cost_ptr) {

 	/* Finds the starting temperature (hot condition).              */

 	int i, num_accepted, move_lim, swap_result;
 	double std_dev, av, sum_of_squares; /* Double important to avoid round off */

 	if (annealing_sched.type == USER_SCHED)
 		return (annealing_sched.init_t);

 	move_lim = min(max_moves, num_blocks);

 	num_accepted = 0;
 	av = 0.;
 	sum_of_squares = 0.;

 	/* Try one move per block.  Set t high so essentially all accepted. */

 	for (i = 0; i < move_lim; i++) {
 		swap_result = try_swap(HUGE_POSITIVE_FLOAT, cost_ptr, bb_cost_ptr, timing_cost_ptr, rlim,
 				old_region_occ_x, old_region_occ_y,
 				place_algorithm, timing_tradeoff,
 				inverse_prev_bb_cost, inverse_prev_timing_cost, delay_cost_ptr);

 		if (swap_result == ACCEPTED) {
 			num_accepted++;
 			av += *cost_ptr;
 			sum_of_squares += *cost_ptr * (*cost_ptr);
 			num_swap_accepted++;
 		} else if (swap_result == ABORTED) {
 			num_swap_aborted++;
 		} else {
 			num_swap_rejected++;
 		}
 	}

 	if (num_accepted != 0)
 		av /= num_accepted;
 	else
 		av = 0.;

 	std_dev = get_std_dev(num_accepted, sum_of_squares, av);

 #ifdef DEBUG
 	if (num_accepted != move_lim) {
 		vpr_printf_warning(__FILE__, __LINE__,
 				"Starting t: %d of %d configurations accepted.\n", num_accepted, move_lim);
 	}
 #endif

 #ifdef VERBOSE
 	vpr_printf_info("std_dev: %g, average cost: %g, starting temp: %g\n", std_dev, av, 20. * std_dev);
 #endif

 	/* Set the initial temperature to 20 times the standard of deviation */
 	/* so that the initial temperature adjusts according to the circuit */
 	return (20. * std_dev);
 }


 static int setup_blocks_affected(int b_from, int x_to, int y_to, int z_to) {

 	/* Find all the blocks affected when b_from is swapped with b_to.
 	 * Returns abort_swap.                  */

 	int imoved_blk, imacro;
 	int x_from, y_from, z_from, b_to;
 	int abort_swap = false;

 	x_from = block[b_from].x;
 	y_from = block[b_from].y;
 	z_from = block[b_from].z;

 	b_to = grid[x_to][y_to].blocks[z_to];

 	// Check whether the to_location is empty
 	if (b_to == EMPTY) {

 		// Swap the block, dont swap the nets yet
 		block[b_from].x = x_to;
 		block[b_from].y = y_to;
 		block[b_from].z = z_to;

 		// Sets up the blocks moved
 		imoved_blk = blocks_affected.num_moved_blocks;
 		blocks_affected.moved_blocks[imoved_blk].block_num = b_from;
 		blocks_affected.moved_blocks[imoved_blk].xold = x_from;
 		blocks_affected.moved_blocks[imoved_blk].xnew = x_to;
 		blocks_affected.moved_blocks[imoved_blk].yold = y_from;
 		blocks_affected.moved_blocks[imoved_blk].ynew = y_to;
 		blocks_affected.moved_blocks[imoved_blk].zold = z_from;
 		blocks_affected.moved_blocks[imoved_blk].znew = z_to;
 		blocks_affected.moved_blocks[imoved_blk].swapped_to_was_empty = true;
 		blocks_affected.moved_blocks[imoved_blk].swapped_from_is_empty = true;
 		blocks_affected.num_moved_blocks ++;

 	} else if (b_to != INVALID ) {

 		// Does not allow a swap with a macro yet
 		get_imacro_from_iblk(&imacro, b_to, pl_macros, num_pl_macros);
 		if (imacro != -1) {
 			abort_swap = true;
 			return (abort_swap);
 		}

 		// Swap the block, dont swap the nets yet
 		block[b_to].x = x_from;
 		block[b_to].y = y_from;
 		block[b_to].z = z_from;

 		block[b_from].x = x_to;
 		block[b_from].y = y_to;
 		block[b_from].z = z_to;

 		// Sets up the blocks moved
 		imoved_blk = blocks_affected.num_moved_blocks;
 		blocks_affected.moved_blocks[imoved_blk].block_num = b_from;
 		blocks_affected.moved_blocks[imoved_blk].xold = x_from;
 		blocks_affected.moved_blocks[imoved_blk].xnew = x_to;
 		blocks_affected.moved_blocks[imoved_blk].yold = y_from;
 		blocks_affected.moved_blocks[imoved_blk].ynew = y_to;
 		blocks_affected.moved_blocks[imoved_blk].zold = z_from;
 		blocks_affected.moved_blocks[imoved_blk].znew = z_to;
 		blocks_affected.moved_blocks[imoved_blk].swapped_to_was_empty = false;
 		blocks_affected.moved_blocks[imoved_blk].swapped_from_is_empty = false;
 		blocks_affected.num_moved_blocks ++;

 		imoved_blk = blocks_affected.num_moved_blocks;
 		blocks_affected.moved_blocks[imoved_blk].block_num = b_to;
 		blocks_affected.moved_blocks[imoved_blk].xold = x_to;
 		blocks_affected.moved_blocks[imoved_blk].xnew = x_from;
 		blocks_affected.moved_blocks[imoved_blk].yold = y_to;
 		blocks_affected.moved_blocks[imoved_blk].ynew = y_from;
 		blocks_affected.moved_blocks[imoved_blk].zold = z_to;
 		blocks_affected.moved_blocks[imoved_blk].znew = z_from;
 		blocks_affected.moved_blocks[imoved_blk].swapped_to_was_empty = false;
 		blocks_affected.moved_blocks[imoved_blk].swapped_from_is_empty = false;
 		blocks_affected.num_moved_blocks ++;

 	} // Finish swapping the blocks and setting up blocks_affected

 	return (abort_swap);

 }

 static int find_affected_blocks(int b_from, int x_to, int y_to, int z_to) {

 	/* Finds and set ups the affected_blocks array.
 	 * Returns abort_swap. */

 	int imacro, imember;
 	int x_swap_offset, y_swap_offset, z_swap_offset, x_from, y_from, z_from;
 	int curr_b_from, curr_x_from, curr_y_from, curr_z_from, curr_x_to, curr_y_to, curr_z_to;
 	int abort_swap = false;

 	x_from = block[b_from].x;
 	y_from = block[b_from].y;
 	z_from = block[b_from].z;

 	get_imacro_from_iblk(&imacro, b_from, pl_macros, num_pl_macros);
 	if ( imacro != -1) {
 		// b_from is part of a macro, I need to swap the whole macro

 		// Record down the relative position of the swap
 		x_swap_offset = x_to - x_from;
 		y_swap_offset = y_to - y_from;
 		z_swap_offset = z_to - z_from;

 		for (imember = 0; imember < pl_macros[imacro].num_blocks && abort_swap == false; imember++) {

 			// Gets the new from and to info for every block in the macro
 			// cannot use the old from and to info
 			curr_b_from = pl_macros[imacro].members[imember].blk_index;

 			curr_x_from = block[curr_b_from].x;
 			curr_y_from = block[curr_b_from].y;
 			curr_z_from = block[curr_b_from].z;

 			curr_x_to = curr_x_from + x_swap_offset;
 			curr_y_to = curr_y_from + y_swap_offset;
 			curr_z_to = curr_z_from + z_swap_offset;

 			// Make sure that the swap_to location is still on the chip
 			if (curr_x_to < 1 || curr_x_to > nx || curr_y_to < 1 || curr_y_to > ny || curr_z_to < 0) {
 				abort_swap = true;
 			} else {
 				abort_swap = setup_blocks_affected(curr_b_from, curr_x_to, curr_y_to, curr_z_to);
 			}
 		} // Finish going through all the blocks in the macro

 	} else {
 		// This is not a macro - I could use the from and to info from before
 		abort_swap = setup_blocks_affected(b_from, x_to, y_to, z_to);

 	} // Finish handling cases for blocks in macro and otherwise

 	return (abort_swap);

 }

 static enum swap_result try_swap(float t, float *cost, float *bb_cost, float *timing_cost,
 		float rlim, float **old_region_occ_x,
 		float **old_region_occ_y,
 		enum e_place_algorithm place_algorithm, float timing_tradeoff,
 		float inverse_prev_bb_cost, float inverse_prev_timing_cost,
 		float *delay_cost) {

 	/* Picks some block and moves it to another spot.  If this spot is   *
 	 * occupied, switch the blocks.  Assess the change in cost function  *
 	 * and accept or reject the move.  If rejected, return 0.  If        *
 	 * accepted return 1.  Pass back the new value of the cost function. *
 	 * rlim is the range limiter.                                        */

 	enum swap_result keep_switch;
 	int b_from, x_from, y_from, z_from, x_to, y_to, z_to;
 	int num_nets_affected;
 	float delta_c, bb_delta_c, timing_delta_c, delay_delta_c;
 	int inet, iblk, bnum, iblk_pin, inet_affected;
 	int abort_swap = false;

 	num_ts_called ++;

 	/* I'm using negative values of temp_net_cost as a flag, so DO NOT   *
 	 * use cost functions that can go negative.                          */

 	delta_c = 0; /* Change in cost due to this swap. */
 	bb_delta_c = 0;
 	timing_delta_c = 0;
 	delay_delta_c = 0.0;

 	/* Pick a random block to be swapped with another random block    */
 	b_from = my_irand(num_blocks - 1);

 	/* If the pins are fixed we never move them from their initial    *
 	 * random locations.  The code below could be made more efficient *
 	 * by using the fact that pins appear first in the block list,    *
 	 * but this shouldn't cause any significant slowdown and won't be *
 	 * broken if I ever change the parser so that the pins aren't     *
 	 * necessarily at the start of the block list.                    */
 	while (block[b_from].is_fixed == true) {
 		b_from = my_irand(num_blocks - 1);
 	}

 	x_from = block[b_from].x;
 	y_from = block[b_from].y;
 	z_from = block[b_from].z;

 	if (!find_to(block[b_from].type, rlim, b_from, x_from, y_from, &x_to, &y_to, &z_to))
 		return REJECTED;

 #if 0
 	int b_to = grid[x_to][y_to].blocks[z_to];
 	vpr_printf_info( "swap [%d][%d][%d] %s \"%s\" <=> [%d][%d][%d] %s \"%s\"\n",
 		x_from, y_from, z_from, grid[x_from][y_from].type->name, b_from != -1 ? block[b_from].name : "",
 		x_to, y_to, z_to, grid[x_to][y_to].type->name, b_to != -1 ? block[b_to].name : "");
 #endif

 	/* Make the switch in order to make computing the new bounding *
 	 * box simpler.  If the cost increase is too high, switch them *
 	 * back.  (block data structures switched, clbs not switched   *
 	 * until success of move is determined.)                       *
 	 * Also check that whether those are the only 2 blocks         *
 	 * to be moved - check for carry chains and other placement    *
 	 * macros.                                                     */

 	/* Check whether the from_block is part of a macro first.      *
 	 * If it is, the whole macro has to be moved. Calculate the    *
 	 * x, y, z offsets of the swap to maintain relative placements *
 	 * of the blocks. Abort the swap if the to_block is part of a  *
 	 * macro (not supported yet).                                  */

 	abort_swap = find_affected_blocks(b_from, x_to, y_to, z_to);

 	if (abort_swap == false) {

 		// Find all the nets affected by this swap
 		num_nets_affected = find_affected_nets(ts_nets_to_update);

 		/* Go through all the pins in all the blocks moved and update the bounding boxes.  *
 		 * Do not update the net cost here since it should only be updated once per net,   *
 		 * not once per pin                                                                */
 		for (iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++)
 		{
 			bnum = blocks_affected.moved_blocks[iblk].block_num;

 			/* Go through all the pins in the moved block */
 			for (iblk_pin = 0; iblk_pin < block[bnum].type->num_pins; iblk_pin++)
 			{
 				inet = block[bnum].nets[iblk_pin];
 				if (inet == OPEN)
 					continue;
 				if (g_clbs_nlist.net[inet].is_global)
 					continue;

 				if (g_clbs_nlist.net[inet].num_sinks() < SMALL_NET) {
 					if(bb_updated_before[inet] == NOT_UPDATED_YET)
 						/* Brute force bounding box recomputation, once only for speed. */
 						get_non_updateable_bb(inet, &ts_bb_coord_new[inet]);
 				} else {
 					update_bb(inet, &ts_bb_coord_new[inet],
 							&ts_bb_edge_new[inet],
 							blocks_affected.moved_blocks[iblk].xold + block[bnum].type->pin_width[iblk_pin],
 							blocks_affected.moved_blocks[iblk].yold + block[bnum].type->pin_height[iblk_pin],
 							blocks_affected.moved_blocks[iblk].xnew + block[bnum].type->pin_width[iblk_pin],
 							blocks_affected.moved_blocks[iblk].ynew + block[bnum].type->pin_height[iblk_pin]);
 				}
 			}
 		}

 		/* Now update the cost function. The cost is only updated once for every net  *
 		 * May have to do major optimizations here later.                             */
 		for (inet_affected = 0; inet_affected < num_nets_affected; inet_affected++) {
 			inet = ts_nets_to_update[inet_affected];

 			temp_net_cost[inet] = get_net_cost(inet, &ts_bb_coord_new[inet]);
 			bb_delta_c += temp_net_cost[inet] - net_cost[inet];
 		}

 		if (place_algorithm == NET_TIMING_DRIVEN_PLACE
 				|| place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
 			/*in this case we redefine delta_c as a combination of timing and bb.  *
 			 *additionally, we normalize all values, therefore delta_c is in       *
 			 *relation to 1*/

 			comp_delta_td_cost(&timing_delta_c, &delay_delta_c);

 			delta_c = (1 - timing_tradeoff) * bb_delta_c * inverse_prev_bb_cost
 					+ timing_tradeoff * timing_delta_c * inverse_prev_timing_cost;
 		} else {
 			delta_c = bb_delta_c;
 		}

 		/* 1 -> move accepted, 0 -> rejected. */
 		keep_switch = assess_swap(delta_c, t);

 		if (keep_switch == ACCEPTED) {
 			*cost = *cost + delta_c;
 			*bb_cost = *bb_cost + bb_delta_c;

 			if (place_algorithm == NET_TIMING_DRIVEN_PLACE
 					|| place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
 				/*update the point_to_point_timing_cost and point_to_point_delay_cost
 				 * values from the temporary values */
 				*timing_cost = *timing_cost + timing_delta_c;
 				*delay_cost = *delay_cost + delay_delta_c;

 				update_td_cost();
 			}

 			/* update net cost functions and reset flags. */
 			for (inet_affected = 0; inet_affected < num_nets_affected; inet_affected++) {
 				inet = ts_nets_to_update[inet_affected];

 				bb_coords[inet] = ts_bb_coord_new[inet];
 				if (g_clbs_nlist.net[inet].num_sinks() >= SMALL_NET)
 					bb_num_on_edges[inet] = ts_bb_edge_new[inet];

 				net_cost[inet] = temp_net_cost[inet];

 				/* negative temp_net_cost value is acting as a flag. */
 				temp_net_cost[inet] = -1;
 				bb_updated_before[inet] = NOT_UPDATED_YET;
 			}

 			/* Update clb data structures since we kept the move. */
 			/* Swap physical location */
 			for (iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) {

 				x_to = blocks_affected.moved_blocks[iblk].xnew;
 				y_to = blocks_affected.moved_blocks[iblk].ynew;
 				z_to = blocks_affected.moved_blocks[iblk].znew;

 				x_from = blocks_affected.moved_blocks[iblk].xold;
 				y_from = blocks_affected.moved_blocks[iblk].yold;
 				z_from = blocks_affected.moved_blocks[iblk].zold;

 				b_from = blocks_affected.moved_blocks[iblk].block_num;

 				grid[x_to][y_to].blocks[z_to] = b_from;

 				if (blocks_affected.moved_blocks[iblk].swapped_to_was_empty) {
 					grid[x_to][y_to].usage++;
 				}
 				if (blocks_affected.moved_blocks[iblk].swapped_from_is_empty) {
 					grid[x_from][y_from].usage--;
 					grid[x_from][y_from].blocks[z_from] = EMPTY;
 				}

 			} // Finish updating clb for all blocks

 		} else { /* Move was rejected.  */

 			/* Reset the net cost function flags first. */
 			for (inet_affected = 0; inet_affected < num_nets_affected; inet_affected++) {
 				inet = ts_nets_to_update[inet_affected];
 				temp_net_cost[inet] = -1;
 				bb_updated_before[inet] = NOT_UPDATED_YET;
 			}

 			/* Restore the block data structures to their state before the move. */
 			for (iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) {
 				b_from = blocks_affected.moved_blocks[iblk].block_num;

 				block[b_from].x = blocks_affected.moved_blocks[iblk].xold;
 				block[b_from].y = blocks_affected.moved_blocks[iblk].yold;
 				block[b_from].z = blocks_affected.moved_blocks[iblk].zold;
 			}
 		}

 		/* Resets the num_moved_blocks, but do not free blocks_moved array. Defensive Coding */
 		blocks_affected.num_moved_blocks = 0;

 		//check_place(*bb_cost, *timing_cost, place_algorithm, *delay_cost);

 		return (keep_switch);
 	} else {

 		/* Restore the block data structures to their state before the move. */
 		for (iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) {
 			b_from = blocks_affected.moved_blocks[iblk].block_num;

 			block[b_from].x = blocks_affected.moved_blocks[iblk].xold;
 			block[b_from].y = blocks_affected.moved_blocks[iblk].yold;
 			block[b_from].z = blocks_affected.moved_blocks[iblk].zold;
 		}

 		/* Resets the num_moved_blocks, but do not free blocks_moved array. Defensive Coding */
 		blocks_affected.num_moved_blocks = 0;

 		return ABORTED;
 	}
 }

 static int find_affected_nets(int *nets_to_update) {

 	/* Puts a list of all the nets that are changed by the swap into          *
 	 * nets_to_update.  Returns the number of affected nets.                  */

 	int iblk, iblk_pin, inet, bnum, num_affected_nets;

 	num_affected_nets = 0;
 	/* Go through all the blocks moved */
 	for (iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++)
 	{
 		bnum = blocks_affected.moved_blocks[iblk].block_num;

 		/* Go through all the pins in the moved block */
 		for (iblk_pin = 0; iblk_pin < block[bnum].type->num_pins; iblk_pin++)
 		{
 			/* Updates the pins_to_nets array, set to -1 if   *
 			 * that pin is not connected to any net or it is a  *
 			 * global pin that does not need to be updated      */
 			inet = block[bnum].nets[iblk_pin];
 			if (inet == OPEN)
 				continue;
 			if (g_clbs_nlist.net[inet].is_global)
 				continue;

 			if (temp_net_cost[inet] < 0.) {
 				/* Net not marked yet. */
 				nets_to_update[num_affected_nets] = inet;
 				num_affected_nets++;

 				/* Flag to say we've marked this net. */
 				temp_net_cost[inet] = 1.;
 			}
 		}
 	}
 	return num_affected_nets;
 }

 static bool find_to(t_type_ptr type, float rlim,
 		int iblk_from, int x_from, int y_from,
 		int *px_to, int *py_to, int *pz_to) {

 	/* Returns the point to which I want to swap, properly range limited.
 	 * rlim must always be between 1 and nx (inclusive) for this routine
 	 * to work.  Assumes that a column only contains blocks of the same type.
 	 */

 	int rlx, rly, min_x, max_x, min_y, max_y;
 	int num_tries;
 	int active_area;
 	bool is_legal;
 	int itype;

 	assert(type == grid[x_from][y_from].type);

 	rlx = (int)min((float)nx + 1, rlim);
 	rly = (int)min((float)ny + 1, rlim); /* Added rly for aspect_ratio != 1 case. */
 	active_area = 4 * rlx * rly;

 	min_x = max(0, x_from - rlx);
 	max_x = min(nx + 1, x_from + rlx);
 	min_y = max(0, y_from - rly);
 	max_y = min(ny + 1, y_from + rly);

 #ifdef DEBUG
 	if (rlx < 1 || rlx > nx + 1) {
 		vpr_throw(VPR_ERROR_PLACE, __FILE__, __LINE__,"in find_to: rlx = %d\n", rlx);
 	}
 #endif

 	num_tries = 0;
 	itype = type->index;

 	do { /* Until legal */
 		is_legal = true;

 		/* Limit the number of tries when searching for an alternative position */
 		if(num_tries >= 2 * min(active_area / (type->width * type->height), num_legal_pos[itype]) + 10) {
 			/* Tried randomly searching for a suitable position */
 			return false;
 		} else {
 			num_tries++;
 		}

 		find_to_location(type, rlim, iblk_from, x_from, y_from,
 				px_to, py_to, pz_to);

 		if((x_from == *px_to) && (y_from == *py_to)) {
 			is_legal = false;
 		} else if(*px_to > max_x || *px_to < min_x || *py_to > max_y || *py_to < min_y) {
 			is_legal = false;
 		} else if(grid[*px_to][*py_to].type != grid[x_from][y_from].type) {
 			is_legal = false;
 		} else {
 			/* Find z_to and test to validate that the "to" block is *not* fixed */
 			*pz_to = 0;
 			if (grid[*px_to][*py_to].type->capacity > 1) {
 				*pz_to = my_irand(grid[*px_to][*py_to].type->capacity - 1);
 			}
 			int b_to = grid[*px_to][*py_to].blocks[*pz_to];
 			if ((b_to != EMPTY) && (block[b_to].is_fixed == true)) {
 				is_legal = false;
 			}
 		}

 		assert(*px_to >= 0 && *px_to <= nx + 1);
 		assert(*py_to >= 0 && *py_to <= ny + 1);
 	} while (is_legal == false);

 #ifdef DEBUG
 	if (*px_to < 0 || *px_to > nx + 1 || *py_to < 0 || *py_to > ny + 1) {
 		vpr_throw(VPR_ERROR_PLACE, __FILE__, __LINE__,"in routine find_to: (x_to,y_to) = (%d,%d)\n", *px_to, *py_to);
 	}
 #endif
 	assert(type == grid[*px_to][*py_to].type);
 	return true;
 }

 static void find_to_location(t_type_ptr type, float rlim,
 		int iblk_from, int x_from, int y_from,
 		int *px_to, int *py_to, int *pz_to) {

 	int itype = type->index;


 	int rlx = (int)min((float)nx + 1, rlim);
 	int rly = (int)min((float)ny + 1, rlim); /* Added rly for aspect_ratio != 1 case. */
 	int active_area = 4 * rlx * rly;

 	int min_x = max(0, x_from - rlx);
 	int max_x = min(nx + 1, x_from + rlx);
 	int min_y = max(0, y_from - rly);
 	int max_y = min(ny + 1, y_from + rly);

 	*pz_to = 0;
 	if (nx / 4 < rlx || ny / 4 < rly || num_legal_pos[itype] < active_area) {
 		int ipos = my_irand(num_legal_pos[itype] - 1);
 		*px_to = legal_pos[itype][ipos].x;
 		*py_to = legal_pos[itype][ipos].y;
 		*pz_to = legal_pos[itype][ipos].z;
 	} else {
 		int x_rel = my_irand(max(0, max_x - min_x));
 		int y_rel = my_irand(max(0, max_y - min_y));
 		*px_to = min_x + x_rel;
 		*py_to = min_y + y_rel;
 		*px_to = (*px_to) - grid[*px_to][*py_to].width_offset; /* align it */
 		*py_to = (*py_to) - grid[*px_to][*py_to].height_offset; /* align it */
 	}
 }

 static enum swap_result assess_swap(float delta_c, float t) {

 	/* Returns: 1 -> move accepted, 0 -> rejected. */

 	enum swap_result accept;
 	float prob_fac, fnum;

 	if (delta_c <= 0) {

 #ifdef SPEC			/* Reduce variation in final solution due to round off */
 		fnum = my_frand();
 #endif

 		accept = ACCEPTED;
 		return (accept);
 	}

 	if (t == 0.)
 		return (REJECTED);

 	fnum = my_frand();
 	prob_fac = exp(-delta_c / t);
 	if (prob_fac > fnum) {
 		accept = ACCEPTED;
 	} else {
 		accept = REJECTED;
 	}
 	return (accept);
 }

 static float recompute_bb_cost(void) {

 	/* Recomputes the cost to eliminate roundoff that may have accrued.  *
 	 * This routine does as little work as possible to compute this new  *
 	 * cost.                                                             */

 	unsigned int inet;
 	float cost;

 	cost = 0;

 	for (inet = 0; inet < g_clbs_nlist.net.size(); inet++) { /* for each net ... */
 		if (g_clbs_nlist.net[inet].is_global == false) { /* Do only if not global. */

 			/* Bounding boxes don't have to be recomputed; they're correct. */
 			cost += net_cost[inet];
 		}
 	}

 	return (cost);
 }

 static float comp_td_point_to_point_delay(int inet, int ipin) {

 	/*returns the delay of one point to point connection */

 	int source_block, sink_block;
 	int delta_x, delta_y;
 	t_type_ptr source_type, sink_type;
 	float delay_source_to_sink;

 	delay_source_to_sink = 0.;

 	source_block = g_clbs_nlist.net[inet].pins[0].block;
 	source_type = block[source_block].type;

 	sink_block = g_clbs_nlist.net[inet].pins[ipin].block;
 	sink_type = block[sink_block].type;

 	assert(source_type != NULL);
 	assert(sink_type != NULL);

 	delta_x = abs(block[sink_block].x - block[source_block].x);
 	delta_y = abs(block[sink_block].y - block[source_block].y);

 	/* TODO low priority: Could be merged into one look-up table */
 	/* Note: This heuristic is terrible on Quality of Results.
 	 * A much better heuristic is to create a more comprehensive lookup table but
 	 * it's too late in the release cycle to do this.  Pushing until the next release */
 	if (source_type == IO_TYPE) {
 		if (sink_type == IO_TYPE)
 			delay_source_to_sink = delta_io_to_io[delta_x][delta_y];
 		else
 			delay_source_to_sink = delta_io_to_clb[delta_x][delta_y];
 	} else {
 		if (sink_type == IO_TYPE)
 			delay_source_to_sink = delta_clb_to_io[delta_x][delta_y];
 		else
 			delay_source_to_sink = delta_clb_to_clb[delta_x][delta_y];
 	}
 	if (delay_source_to_sink < 0) {
 		vpr_throw(VPR_ERROR_PLACE, __FILE__, __LINE__,
 				"in comp_td_point_to_point_delay: Bad delay_source_to_sink value delta(%d, %d) delay of %g\n"
 				"in comp_td_point_to_point_delay: Delay is less than 0\n",
 				delta_x, delta_y, delay_source_to_sink);
 	}

 #ifdef INTERPOSER_BASED_ARCHITECTURE
 	/* Check the number of times this connection crosses the interposer cuts
 	 * and account for the added delay
 	 */
 	if(num_cuts > 0 && delay_increase > 0)
 	{
 		float f_delay_increase = (float)delay_increase * 1e-12;
 		int y1 = std::min(block[source_block].y, block[sink_block].y);
 		int y2 = std::max(block[source_block].y, block[sink_block].y);
 		int times_crossed = 0;
 		int counter = 0;
 		int y = 0;
 		for(counter = 0; counter < num_cuts; ++counter)
 		{
 			y = arch_cut_locations[counter];
 			if(y1 <= y && y2 > y)
 			{
 				times_crossed++;
 			}
 		}

 		delay_source_to_sink += (float)times_crossed * f_delay_increase;
 	}
 #endif

 	return (delay_source_to_sink);
 }

 static void update_td_cost(void) {
 	/* Update the point_to_point_timing_cost values from the temporary *
 	 * values for all connections that have changed.                   */

 	int iblk_pin, net_pin, inet;
 	unsigned int ipin;
 	int iblk, iblk2, bnum, driven_by_moved_block;

 	/* Go through all the blocks moved. */
 	for (iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++)
 	{
 		bnum = blocks_affected.moved_blocks[iblk].block_num;
 		for (iblk_pin = 0; iblk_pin < block[bnum].type->num_pins; iblk_pin++) {

 			inet = block[bnum].nets[iblk_pin];

 			if (inet == OPEN)
 				continue;

 			if (g_clbs_nlist.net[inet].is_global)
 				continue;

 			net_pin = net_pin_index[bnum][iblk_pin];

 			if (net_pin != 0) {

 				driven_by_moved_block = false;
 				for (iblk2 = 0; iblk2 < blocks_affected.num_moved_blocks; iblk2++)
 				{	if (g_clbs_nlist.net[inet].pins[0].block == blocks_affected.moved_blocks[iblk2].block_num)
 						driven_by_moved_block = true;
 				}

 				/* The following "if" prevents the value from being updated twice. */
 				if (driven_by_moved_block == false) {
 					point_to_point_delay_cost[inet][net_pin] =
 							temp_point_to_point_delay_cost[inet][net_pin];
 					temp_point_to_point_delay_cost[inet][net_pin] = -1;
 					point_to_point_timing_cost[inet][net_pin] =
 							temp_point_to_point_timing_cost[inet][net_pin];
 					temp_point_to_point_timing_cost[inet][net_pin] = -1;
 				}
 			} else { /* This net is being driven by a moved block, recompute */
 				/* All point to point connections on this net. */
 				for (ipin = 1; ipin < g_clbs_nlist.net[inet].pins.size(); ipin++) {
 					point_to_point_delay_cost[inet][ipin] =
 							temp_point_to_point_delay_cost[inet][ipin];
 					temp_point_to_point_delay_cost[inet][ipin] = -1;
 					point_to_point_timing_cost[inet][ipin] =
 							temp_point_to_point_timing_cost[inet][ipin];
 					temp_point_to_point_timing_cost[inet][ipin] = -1;
 				} /* Finished updating the pin */
 			}
 		} /* Finished going through all the pins in the moved block */
 	} /* Finished going through all the blocks moved */
 }

 static void comp_delta_td_cost(float *delta_timing, float *delta_delay) {

 	/*a net that is being driven by a moved block must have all of its  */
 	/*sink timing costs recomputed. A net that is driving a moved block */
 	/*must only have the timing cost on the connection driving the input */
 	/*pin computed */

 	int inet, net_pin;
 	unsigned int ipin;
 	float delta_timing_cost, delta_delay_cost, temp_delay;
 	int iblk, iblk2, bnum, iblk_pin, driven_by_moved_block;

 	delta_timing_cost = 0.;
 	delta_delay_cost = 0.;

 	/* Go through all the blocks moved */
 	for (iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++)
 	{
 		bnum = blocks_affected.moved_blocks[iblk].block_num;
 		/* Go through all the pins in the moved block */
 		for (iblk_pin = 0; iblk_pin < block[bnum].type->num_pins; iblk_pin++) {
 			inet = block[bnum].nets[iblk_pin];

 			if (inet == OPEN)
 				continue;

 			if (g_clbs_nlist.net[inet].is_global)
 				continue;

 			net_pin = net_pin_index[bnum][iblk_pin];

 			if (net_pin != 0) {
 				/* If this net is being driven by a block that has moved, we do not    *
 				 * need to compute the change in the timing cost (here) since it will  *
 				 * be computed in the fanout of the net on  the driving block, also    *
 				 * computing it here would double count the change, and mess up the    *
 				 * delta_timing_cost value.                                            */
 				driven_by_moved_block = false;
 				for (iblk2 = 0; iblk2 < blocks_affected.num_moved_blocks; iblk2++)
 				{	if (g_clbs_nlist.net[inet].pins[0].block == blocks_affected.moved_blocks[iblk2].block_num)
 						driven_by_moved_block = true;
 				}

 				if (driven_by_moved_block == false) {
 					temp_delay = comp_td_point_to_point_delay(inet, net_pin);
 					temp_point_to_point_delay_cost[inet][net_pin] = temp_delay;

 					temp_point_to_point_timing_cost[inet][net_pin] =
 						timing_place_crit[inet][net_pin] * temp_delay;
 					delta_timing_cost += temp_point_to_point_timing_cost[inet][net_pin]
 						- point_to_point_timing_cost[inet][net_pin];
 					delta_delay_cost += temp_point_to_point_delay_cost[inet][net_pin]
 							- point_to_point_delay_cost[inet][net_pin];
 				}
 			} else { /* This net is being driven by a moved block, recompute */
 				/* All point to point connections on this net. */
 				for (ipin = 1; ipin < g_clbs_nlist.net[inet].pins.size(); ipin++) {
 					temp_delay = comp_td_point_to_point_delay(inet, ipin);
 					temp_point_to_point_delay_cost[inet][ipin] = temp_delay;

 					temp_point_to_point_timing_cost[inet][ipin] =
 						timing_place_crit[inet][ipin] * temp_delay;
 					delta_timing_cost += temp_point_to_point_timing_cost[inet][ipin]
 						- point_to_point_timing_cost[inet][ipin];
 					delta_delay_cost += temp_point_to_point_delay_cost[inet][ipin]
 							- point_to_point_delay_cost[inet][ipin];

 				} /* Finished updating the pin */
 			}
 		} /* Finished going through all the pins in the moved block */
 	} /* Finished going through all the blocks moved */

 	*delta_timing = delta_timing_cost;
 	*delta_delay = delta_delay_cost;
 }

 static void comp_td_costs(float *timing_cost, float *connection_delay_sum) {
 	/* Computes the cost (from scratch) due to the delays and criticalities  *
 	 * on all point to point connections, we define the timing cost of       *
 	 * each connection as criticality*delay.                                 */

 	unsigned inet, ipin;
 	float loc_timing_cost, loc_connection_delay_sum, temp_delay_cost,
 			temp_timing_cost;

 	loc_timing_cost = 0.;
 	loc_connection_delay_sum = 0.;

 	for (inet = 0; inet < g_clbs_nlist.net.size(); inet++) { /* For each net ... */
 		if (g_clbs_nlist.net[inet].is_global == false) { /* Do only if not global. */

 			for (ipin = 1; ipin < g_clbs_nlist.net[inet].pins.size(); ipin++) {

 				temp_delay_cost = comp_td_point_to_point_delay(inet, ipin);
 				temp_timing_cost = temp_delay_cost * timing_place_crit[inet][ipin];

 				loc_connection_delay_sum += temp_delay_cost;
 				point_to_point_delay_cost[inet][ipin] = temp_delay_cost;
 				temp_point_to_point_delay_cost[inet][ipin] = -1; /* Undefined */

 				point_to_point_timing_cost[inet][ipin] = temp_timing_cost;
 				temp_point_to_point_timing_cost[inet][ipin] = -1; /* Undefined */
 				loc_timing_cost += temp_timing_cost;
 			}
 		}
 	}

 	/* Make sure timing cost does not go above MIN_TIMING_COST. */
 	*timing_cost = loc_timing_cost;

 	*connection_delay_sum = loc_connection_delay_sum;
 }

 static float comp_bb_cost(enum cost_methods method) {

 	/* Finds the cost from scratch.  Done only when the placement   *
 	 * has been radically changed (i.e. after initial placement).   *
 	 * Otherwise find the cost change incrementally.  If method     *
 	 * check is NORMAL, we find bounding boxes that are updateable  *
 	 * for the larger nets.  If method is CHECK, all bounding boxes *
 	 * are found via the non_updateable_bb routine, to provide a    *
 	 * cost which can be used to check the correctness of the       *
 	 * other routine.                                               */

 	unsigned int inet;
 	float cost;
 	double expected_wirelength;

 	cost = 0;
 	expected_wirelength = 0.0;

 	for (inet = 0; inet < g_clbs_nlist.net.size(); inet++) { /* for each net ... */

 		if (g_clbs_nlist.net[inet].is_global == false) { /* Do only if not global. */

 			/* Small nets don't use incremental updating on their bounding boxes, *
 			 * so they can use a fast bounding box calculator.                    */

 			if (g_clbs_nlist.net[inet].num_sinks() >= SMALL_NET && method == NORMAL) {
 				get_bb_from_scratch(inet, &bb_coords[inet],
 						&bb_num_on_edges[inet]);
 			} else {
 				get_non_updateable_bb(inet, &bb_coords[inet]);
 			}

 			net_cost[inet] = get_net_cost(inet, &bb_coords[inet]);
 			cost += net_cost[inet];
 			if (method == CHECK)
 				expected_wirelength += get_net_wirelength_estimate(inet,
 						&bb_coords[inet]);
 		}
 	}

 	if (method == CHECK) {
 		vpr_printf_info("\n");
 		vpr_printf_info("BB estimate of min-dist (placement) wire length: %.0f\n", expected_wirelength);
 	}
 	return (cost);
 }

 static void free_placement_structs(
 		float **old_region_occ_x, float **old_region_occ_y,
 		struct s_placer_opts placer_opts) {

 	/* Frees the major structures needed by the placer (and not needed       *
 	 * elsewhere).   */

 	unsigned int inet;
 	int imacro;

 	free_legal_placements();
 	free_fast_cost_update();

 	if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE
 			|| placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE
 			|| placer_opts.enable_timing_computations) {
 		for (inet = 0; inet < g_clbs_nlist.net.size(); inet++) {
 			/*add one to the address since it is indexed from 1 not 0 */

 			point_to_point_delay_cost[inet]++;
 			free(point_to_point_delay_cost[inet]);

 			point_to_point_timing_cost[inet]++;
 			free(point_to_point_timing_cost[inet]);

 			temp_point_to_point_delay_cost[inet]++;
 			free(temp_point_to_point_delay_cost[inet]);

 			temp_point_to_point_timing_cost[inet]++;
 			free(temp_point_to_point_timing_cost[inet]);
 		}
 		free(point_to_point_delay_cost);
 		free(temp_point_to_point_delay_cost);

 		free(point_to_point_timing_cost);
 		free(temp_point_to_point_timing_cost);

 		free_matrix(net_pin_index, 0, num_blocks - 1, 0, sizeof(int));
 	}

 	free(net_cost);
 	free(temp_net_cost);
 	free(bb_num_on_edges);
 	free(bb_coords);

 	free_placement_macros_structs();

 	for (imacro = 0; imacro < num_pl_macros; imacro ++)
 		free(pl_macros[imacro].members);
 	free(pl_macros);

 	net_cost = NULL; /* Defensive coding. */
 	temp_net_cost = NULL;
 	bb_num_on_edges = NULL;
 	bb_coords = NULL;
 	pl_macros = NULL;

 	/* Frees up all the data structure used in vpr_utils. */
 	free_port_pin_from_blk_pin();
 	free_blk_pin_from_port_pin();

 }

 static void alloc_and_load_placement_structs(
 		float place_cost_exp, float ***old_region_occ_x,
 		float ***old_region_occ_y, struct s_placer_opts placer_opts,
 		t_direct_inf *directs, int num_directs, int num_segments) {

 	/* Allocates the major structures needed only by the placer, primarily for *
 	 * computing costs quickly and such.                                       */

 	int max_pins_per_clb, i;
 	unsigned int inet, ipin;

 	alloc_legal_placements();
 	load_legal_placements();

 	max_pins_per_clb = 0;
 	for (i = 0; i < num_types; i++) {
 		max_pins_per_clb = max(max_pins_per_clb, type_descriptors[i].num_pins);
 	}

 	if (placer_opts.place_algorithm == NET_TIMING_DRIVEN_PLACE
 			|| placer_opts.place_algorithm == PATH_TIMING_DRIVEN_PLACE
 			|| placer_opts.enable_timing_computations) {
 		/* Allocate structures associated with timing driven placement */
 		/* [0..g_clbs_nlist.net.size()-1][1..num_pins-1]  */
 		point_to_point_delay_cost = (float **) my_malloc(
 				g_clbs_nlist.net.size() * sizeof(float *));
 		temp_point_to_point_delay_cost = (float **) my_malloc(
 				g_clbs_nlist.net.size() * sizeof(float *));

 		point_to_point_timing_cost = (float **) my_malloc(
 				g_clbs_nlist.net.size() * sizeof(float *));
 		temp_point_to_point_timing_cost = (float **) my_malloc(
 				g_clbs_nlist.net.size() * sizeof(float *));

 		for (inet = 0; inet < g_clbs_nlist.net.size(); inet++) {

 			/* In the following, subract one so index starts at *
 			 * 1 instead of 0 */
 			point_to_point_delay_cost[inet] = (float *) my_malloc(
 					g_clbs_nlist.net[inet].num_sinks() * sizeof(float));
 			point_to_point_delay_cost[inet]--;

 			temp_point_to_point_delay_cost[inet] = (float *) my_malloc(
 					g_clbs_nlist.net[inet].num_sinks() * sizeof(float));
 			temp_point_to_point_delay_cost[inet]--;

 			point_to_point_timing_cost[inet] = (float *) my_malloc(
 					g_clbs_nlist.net[inet].num_sinks() * sizeof(float));
 			point_to_point_timing_cost[inet]--;

 			temp_point_to_point_timing_cost[inet] = (float *) my_malloc(
 					g_clbs_nlist.net[inet].num_sinks() * sizeof(float));
 			temp_point_to_point_timing_cost[inet]--;
 		}
 		for (inet = 0; inet < g_clbs_nlist.net.size(); inet++) {
 			for (ipin = 1; ipin < g_clbs_nlist.net[inet].pins.size(); ipin++) {
 				point_to_point_delay_cost[inet][ipin] = 0;
 				temp_point_to_point_delay_cost[inet][ipin] = 0;
 			}
 		}
 	}

 	net_cost = (float *) my_malloc(g_clbs_nlist.net.size() * sizeof(float));
 	temp_net_cost = (float *) my_malloc(g_clbs_nlist.net.size() * sizeof(float));
 	bb_updated_before = (char*)my_calloc(g_clbs_nlist.net.size(), sizeof(char));

 	/* Used to store costs for moves not yet made and to indicate when a net's   *
 	 * cost has been recomputed. temp_net_cost[inet] < 0 means net's cost hasn't *
 	 * been recomputed.                                                          */

 	for (inet = 0; inet < g_clbs_nlist.net.size(); inet++){
 		bb_updated_before[inet] = NOT_UPDATED_YET;
 		temp_net_cost[inet] = -1.;
 	}

 	bb_coords = (struct s_bb *) my_malloc(g_clbs_nlist.net.size() * sizeof(struct s_bb));
 	bb_num_on_edges = (struct s_bb *) my_malloc(g_clbs_nlist.net.size() * sizeof(struct s_bb));

 	/* Shouldn't use them; crash hard if I do!   */
 	*old_region_occ_x = NULL;
 	*old_region_occ_y = NULL;

 	alloc_and_load_for_fast_cost_update(place_cost_exp);

 	net_pin_index = alloc_and_load_net_pin_index();

 	alloc_and_load_try_swap_structs();

 	num_pl_macros = alloc_and_load_placement_macros(directs, num_directs, num_segments, &pl_macros);
 }

 static void alloc_and_load_try_swap_structs() {
 	/* Allocate the local bb_coordinate storage, etc. only once. */
 	/* Allocate with size g_clbs_nlist.net.size() for any number of nets affected. */
 	ts_bb_coord_new = (struct s_bb *) my_calloc(
 			g_clbs_nlist.net.size(), sizeof(struct s_bb));
 	ts_bb_edge_new = (struct s_bb *) my_calloc(
 			g_clbs_nlist.net.size(), sizeof(struct s_bb));
 	ts_nets_to_update = (int *) my_calloc(g_clbs_nlist.net.size(), sizeof(int));

 	/* Allocate with size num_blocks for any number of moved block. */
 	blocks_affected.moved_blocks = (t_pl_moved_block*)my_calloc(
 			num_blocks, sizeof(t_pl_moved_block) );
 	blocks_affected.num_moved_blocks = 0;

 }

 static void get_bb_from_scratch(int inet, struct s_bb *coords,
 		struct s_bb *num_on_edges) {

 	/* This routine finds the bounding box of each net from scratch (i.e.    *
 	 * from only the block location information).  It updates both the       *
 	 * coordinate and number of pins on each edge information.  It         *
 	 * should only be called when the bounding box information is not valid. */

 	int bnum, pnum, x, y, xmin, xmax, ymin, ymax;
 	int xmin_edge, xmax_edge, ymin_edge, ymax_edge;
 	unsigned int ipin, n_pins;

 	n_pins = g_clbs_nlist.net[inet].pins.size();

 	bnum = g_clbs_nlist.net[inet].pins[0].block;
 	pnum = g_clbs_nlist.net[inet].pins[0].block_pin;

 	x = block[bnum].x + block[bnum].type->pin_width[pnum];
 	y = block[bnum].y + block[bnum].type->pin_height[pnum];

 	x = max(min(x, nx), 1);
 	y = max(min(y, ny), 1);

 	xmin = x;
 	ymin = y;
 	xmax = x;
 	ymax = y;
 	xmin_edge = 1;
 	ymin_edge = 1;
 	xmax_edge = 1;
 	ymax_edge = 1;

 	for (ipin = 1; ipin < n_pins; ipin++) {
 		bnum = g_clbs_nlist.net[inet].pins[ipin].block;
 		pnum = g_clbs_nlist.net[inet].pins[ipin].block_pin;
 		x = block[bnum].x + block[bnum].type->pin_width[pnum];
 		y = block[bnum].y + block[bnum].type->pin_height[pnum];

 		/* Code below counts IO blocks as being within the 1..nx, 1..ny clb array. *
 		 * This is because channels do not go out of the 0..nx, 0..ny range, and   *
 		 * I always take all channels impinging on the bounding box to be within   *
 		 * that bounding box.  Hence, this "movement" of IO blocks does not affect *
 		 * the which channels are included within the bounding box, and it         *
 		 * simplifies the code a lot.                                              */

 		x = max(min(x, nx), 1);
 		y = max(min(y, ny), 1);

 		if (x == xmin) {
 			xmin_edge++;
 		}
 		if (x == xmax) { /* Recall that xmin could equal xmax -- don't use else */
 			xmax_edge++;
 		} else if (x < xmin) {
 			xmin = x;
 			xmin_edge = 1;
 		} else if (x > xmax) {
 			xmax = x;
 			xmax_edge = 1;
 		}

 		if (y == ymin) {
 			ymin_edge++;
 		}
 		if (y == ymax) {
 			ymax_edge++;
 		} else if (y < ymin) {
 			ymin = y;
 			ymin_edge = 1;
 		} else if (y > ymax) {
 			ymax = y;
 			ymax_edge = 1;
 		}
 	}

 	/* Copy the coordinates and number on edges information into the proper   *
 	 * structures.                                                            */
 	coords->xmin = xmin;
 	coords->xmax = xmax;
 	coords->ymin = ymin;
 	coords->ymax = ymax;

 	num_on_edges->xmin = xmin_edge;
 	num_on_edges->xmax = xmax_edge;
 	num_on_edges->ymin = ymin_edge;
 	num_on_edges->ymax = ymax_edge;
 }

 static double get_net_wirelength_estimate(int inet, struct s_bb *bbptr) {

 	/* WMF: Finds the estimate of wirelength due to one net by looking at   *
 	 * its coordinate bounding box.                                         */

 	double ncost, crossing;

 	/* Get the expected "crossing count" of a net, based on its number *
 	 * of pins.  Extrapolate for very large nets.                      */

 	if (((g_clbs_nlist.net[inet].pins.size()) > 50)
 			&& ((g_clbs_nlist.net[inet].pins.size()) < 85)) {
 		crossing = 2.7933 + 0.02616 * ((g_clbs_nlist.net[inet].pins.size()) - 50);
 	} else if ((g_clbs_nlist.net[inet].pins.size()) >= 85) {
 		crossing = 2.7933 + 0.011 * (g_clbs_nlist.net[inet].pins.size())
 				- 0.0000018 * (g_clbs_nlist.net[inet].pins.size())
 					* (g_clbs_nlist.net[inet].pins.size());
 	} else {
 		crossing = cross_count[g_clbs_nlist.net[inet].pins.size() - 1];
 	}

 	/* Could insert a check for xmin == xmax.  In that case, assume  *
 	 * connection will be made with no bends and hence no x-cost.    *
 	 * Same thing for y-cost.                                        */

 	/* Cost = wire length along channel * cross_count / average      *
 	 * channel capacity.   Do this for x, then y direction and add.  */

 	ncost = (bbptr->xmax - bbptr->xmin + 1) * crossing;

 	ncost += (bbptr->ymax - bbptr->ymin + 1) * crossing;

 	return (ncost);
 }

 static float get_net_cost(int inet, struct s_bb *bbptr) {

 	/* Finds the cost due to one net by looking at its coordinate bounding  *
 	 * box.                                                                 */

 	float ncost, crossing;

 	/* Get the expected "crossing count" of a net, based on its number *
 	 * of pins.  Extrapolate for very large nets.                      */

 	if ((g_clbs_nlist.net[inet].pins.size()) > 50) {
 		crossing = 2.7933 + 0.02616 * ((g_clbs_nlist.net[inet].pins.size()) - 50);
 		/*    crossing = 3.0;    Old value  */
 	} else {
 		crossing = cross_count[(g_clbs_nlist.net[inet].pins.size()) - 1];
 	}

 	/* Could insert a check for xmin == xmax.  In that case, assume  *
 	 * connection will be made with no bends and hence no x-cost.    *
 	 * Same thing for y-cost.                                        */

 	/* Cost = wire length along channel * cross_count / average      *
 	 * channel capacity.   Do this for x, then y direction and add.  */

 	ncost = (bbptr->xmax - bbptr->xmin + 1) * crossing
 			* chanx_place_cost_fac[bbptr->ymax][bbptr->ymin - 1];

 	ncost += (bbptr->ymax - bbptr->ymin + 1) * crossing
 			* chany_place_cost_fac[bbptr->xmax][bbptr->xmin - 1];

 #ifdef INTERPOSER_BASED_ARCHITECTURE
 	/* Adding extra cost factor here if the net crosses a cutline */
 	if(num_cuts > 0)
 	{
 		float C1, C2;
 		int times_crossed, counter;
 		int const_type = constant_type;

 		/* Ideas of different costs:
 		 * 0 penalty = C0 * times_crossed * width
 		 * 1 penalty = C1 * height * times_crossed
 		 * 2 penalty = C2 * height * times_crossed + C2*#crosses
 		 * 3 penalty = C3 * min(y1, y2) + C2*#crosses
 		 * 4 penalty = increase cost of ychannel
 		 * 5 penalty = C * height
 		 */
 		int closest = 11000;
 		times_crossed = 0;
 		C1 = placer_cost_constant;
 		C2 = (float)(percent_wires_cut / 100.0);

 		for(counter = 0; counter < num_cuts; ++counter)
 		{
 			int j = arch_cut_locations[counter];
 			if(bbptr->ymin <= j && bbptr->ymax > j)
 			{
 				times_crossed++;
 				closest = std::min(closest, std::min(j-bbptr->ymin+1, bbptr->ymax-j));
 			}
 		}

 		if(const_type == 0)
 		{
 			ncost += C1 * chany_place_cost_fac[nx][1] * times_crossed * (bbptr->xmax - bbptr->xmin + 1) * C2;
 		}
 		if(const_type == 1)
 		{
 			ncost += C1 * chany_place_cost_fac[nx][1] * times_crossed * (bbptr->ymax - bbptr->ymin + 1) * C2;
 		}
 		if(const_type == 2)
 		{
 			ncost += C1 * chany_place_cost_fac[nx][1] * times_crossed * (bbptr->ymax - bbptr->ymin + 1) * C2 + C1 * chany_place_cost_fac[nx][1] * times_crossed * C2;
 		}
 		if(const_type == 3)
 		{
 			if(times_crossed > 0)
 			{
 				ncost += C1 * chany_place_cost_fac[nx][1] * times_crossed * closest * C2 + C1 * chany_place_cost_fac[nx][1] * times_crossed * C2;
 			}
 		}
 		if(const_type == 4)
 		{
 			ncost += C1 * (bbptr->ymax - bbptr->ymin + 1) * crossing * chany_place_cost_fac[bbptr->xmax][bbptr->xmin - 1] * C2;
 		}
 		if(const_type == 5)
 		{
 			ncost += C1 * chany_place_cost_fac[nx][1] * (bbptr->ymax - bbptr->ymin + 1) * C2;
 		}
 	}
 #endif

 	return (ncost);
 }

 static void get_non_updateable_bb(int inet, struct s_bb *bb_coord_new) {

 	/* Finds the bounding box of a net and stores its coordinates in the  *
 	 * bb_coord_new data structure.  This routine should only be called   *
 	 * for small nets, since it does not determine enough information for *
 	 * the bounding box to be updated incrementally later.                *
 	 * Currently assumes channels on both sides of the CLBs forming the   *
 	 * edges of the bounding box can be used.  Essentially, I am assuming *
 	 * the pins always lie on the outside of the bounding box.            */

 	int xmax, ymax, xmin, ymin, x, y;
 	int bnum, pnum;
 	unsigned int k;

 	bnum = g_clbs_nlist.net[inet].pins[0].block;
 	pnum = g_clbs_nlist.net[inet].pins[0].block_pin;
 	x = block[bnum].x + block[bnum].type->pin_width[pnum];
 	y = block[bnum].y + block[bnum].type->pin_height[pnum];

 	xmin = x;
 	ymin = y;
 	xmax = x;
 	ymax = y;

 	for (k = 1; k < g_clbs_nlist.net[inet].pins.size(); k++) {
 		bnum = g_clbs_nlist.net[inet].pins[k].block;
 		pnum = g_clbs_nlist.net[inet].pins[k].block_pin;
 		x = block[bnum].x + block[bnum].type->pin_width[pnum];
 		y = block[bnum].y + block[bnum].type->pin_height[pnum];

 		if (x < xmin) {
 			xmin = x;
 		} else if (x > xmax) {
 			xmax = x;
 		}

 		if (y < ymin) {
 			ymin = y;
 		} else if (y > ymax) {
 			ymax = y;
 		}
 	}

 	/* Now I've found the coordinates of the bounding box.  There are no *
 	 * channels beyond nx and ny, so I want to clip to that.  As well,   *
 	 * since I'll always include the channel immediately below and the   *
 	 * channel immediately to the left of the bounding box, I want to    *
 	 * clip to 1 in both directions as well (since minimum channel index *
 	 * is 0).  See route.c for a channel diagram.                        */

 	bb_coord_new->xmin = max(min(xmin, nx), 1);
 	bb_coord_new->ymin = max(min(ymin, ny), 1);
 	bb_coord_new->xmax = max(min(xmax, nx), 1);
 	bb_coord_new->ymax = max(min(ymax, ny), 1);
 }

 static void update_bb(int inet, struct s_bb *bb_coord_new,
 		struct s_bb *bb_edge_new, int xold, int yold, int xnew, int ynew) {

 	/* Updates the bounding box of a net by storing its coordinates in    *
 	 * the bb_coord_new data structure and the number of blocks on each   *
 	 * edge in the bb_edge_new data structure.  This routine should only  *
 	 * be called for large nets, since it has some overhead relative to   *
 	 * just doing a brute force bounding box calculation.  The bounding   *
 	 * box coordinate and edge information for inet must be valid before  *
 	 * this routine is called.                                            *
 	 * Currently assumes channels on both sides of the CLBs forming the   *
 	 * edges of the bounding box can be used.  Essentially, I am assuming *
 	 * the pins always lie on the outside of the bounding box.            *
 	 * The x and y coordinates are the pin's x and y coordinates.         */
 	/* IO blocks are considered to be one cell in for simplicity.         */

 	struct s_bb *curr_bb_edge, *curr_bb_coord;

 	xnew = max(min(xnew, nx), 1);
 	ynew = max(min(ynew, ny), 1);
 	xold = max(min(xold, nx), 1);
 	yold = max(min(yold, ny), 1);

 	/* Check if the net had been updated before. */
 	if (bb_updated_before[inet] == GOT_FROM_SCRATCH)
 	{	/* The net had been updated from scratch, DO NOT update again! */
 		return;
 	}
 	else if (bb_updated_before[inet] == NOT_UPDATED_YET)
 	{	/* The net had NOT been updated before, could use the old values */
 		curr_bb_coord = &bb_coords[inet];
 		curr_bb_edge = &bb_num_on_edges[inet];
 		bb_updated_before[inet] = UPDATED_ONCE;
 	}
 	else
 	{	/* The net had been updated before, must use the new values */
 		curr_bb_coord = bb_coord_new;
 		curr_bb_edge = bb_edge_new;
 	}

 	/* Check if I can update the bounding box incrementally. */

 	if (xnew < xold) { /* Move to left. */

 		/* Update the xmax fields for coordinates and number of edges first. */

 		if (xold == curr_bb_coord->xmax) { /* Old position at xmax. */
 			if (curr_bb_edge->xmax == 1) {
 				get_bb_from_scratch(inet, bb_coord_new, bb_edge_new);
 				bb_updated_before[inet] = GOT_FROM_SCRATCH;
 				return;
 			} else {
 				bb_edge_new->xmax = curr_bb_edge->xmax - 1;
 				bb_coord_new->xmax = curr_bb_coord->xmax;
 			}
 		}

 		else { /* Move to left, old postion was not at xmax. */
 			bb_coord_new->xmax = curr_bb_coord->xmax;
 			bb_edge_new->xmax = curr_bb_edge->xmax;
 		}

 		/* Now do the xmin fields for coordinates and number of edges. */

 		if (xnew < curr_bb_coord->xmin) { /* Moved past xmin */
 			bb_coord_new->xmin = xnew;
 			bb_edge_new->xmin = 1;
 		}

 		else if (xnew == curr_bb_coord->xmin) { /* Moved to xmin */
 			bb_coord_new->xmin = xnew;
 			bb_edge_new->xmin = curr_bb_edge->xmin + 1;
 		}

 		else { /* Xmin unchanged. */
 			bb_coord_new->xmin = curr_bb_coord->xmin;
 			bb_edge_new->xmin = curr_bb_edge->xmin;
 		}
 	}

 	/* End of move to left case. */
 	else if (xnew > xold) { /* Move to right. */

 		/* Update the xmin fields for coordinates and number of edges first. */

 		if (xold == curr_bb_coord->xmin) { /* Old position at xmin. */
 			if (curr_bb_edge->xmin == 1) {
 				get_bb_from_scratch(inet, bb_coord_new, bb_edge_new);
 				bb_updated_before[inet] = GOT_FROM_SCRATCH;
 				return;
 			} else {
 				bb_edge_new->xmin = curr_bb_edge->xmin - 1;
 				bb_coord_new->xmin = curr_bb_coord->xmin;
 			}
 		}

 		else { /* Move to right, old position was not at xmin. */
 			bb_coord_new->xmin = curr_bb_coord->xmin;
 			bb_edge_new->xmin = curr_bb_edge->xmin;
 		}

 		/* Now do the xmax fields for coordinates and number of edges. */

 		if (xnew > curr_bb_coord->xmax) { /* Moved past xmax. */
 			bb_coord_new->xmax = xnew;
 			bb_edge_new->xmax = 1;
 		}

 		else if (xnew == curr_bb_coord->xmax) { /* Moved to xmax */
 			bb_coord_new->xmax = xnew;
 			bb_edge_new->xmax = curr_bb_edge->xmax + 1;
 		}

 		else { /* Xmax unchanged. */
 			bb_coord_new->xmax = curr_bb_coord->xmax;
 			bb_edge_new->xmax = curr_bb_edge->xmax;
 		}
 	}
 	/* End of move to right case. */
 	else { /* xnew == xold -- no x motion. */
 		bb_coord_new->xmin = curr_bb_coord->xmin;
 		bb_coord_new->xmax = curr_bb_coord->xmax;
 		bb_edge_new->xmin = curr_bb_edge->xmin;
 		bb_edge_new->xmax = curr_bb_edge->xmax;
 	}

 	/* Now account for the y-direction motion. */

 	if (ynew < yold) { /* Move down. */

 		/* Update the ymax fields for coordinates and number of edges first. */

 		if (yold == curr_bb_coord->ymax) { /* Old position at ymax. */
 			if (curr_bb_edge->ymax == 1) {
 				get_bb_from_scratch(inet, bb_coord_new, bb_edge_new);
 				bb_updated_before[inet] = GOT_FROM_SCRATCH;
 				return;
 			} else {
 				bb_edge_new->ymax = curr_bb_edge->ymax - 1;
 				bb_coord_new->ymax = curr_bb_coord->ymax;
 			}
 		}

 		else { /* Move down, old postion was not at ymax. */
 			bb_coord_new->ymax = curr_bb_coord->ymax;
 			bb_edge_new->ymax = curr_bb_edge->ymax;
 		}

 		/* Now do the ymin fields for coordinates and number of edges. */

 		if (ynew < curr_bb_coord->ymin) { /* Moved past ymin */
 			bb_coord_new->ymin = ynew;
 			bb_edge_new->ymin = 1;
 		}

 		else if (ynew == curr_bb_coord->ymin) { /* Moved to ymin */
 			bb_coord_new->ymin = ynew;
 			bb_edge_new->ymin = curr_bb_edge->ymin + 1;
 		}

 		else { /* ymin unchanged. */
 			bb_coord_new->ymin = curr_bb_coord->ymin;
 			bb_edge_new->ymin = curr_bb_edge->ymin;
 		}
 	}
 	/* End of move down case. */
 	else if (ynew > yold) { /* Moved up. */

 		/* Update the ymin fields for coordinates and number of edges first. */

 		if (yold == curr_bb_coord->ymin) { /* Old position at ymin. */
 			if (curr_bb_edge->ymin == 1) {
 				get_bb_from_scratch(inet, bb_coord_new, bb_edge_new);
 				bb_updated_before[inet] = GOT_FROM_SCRATCH;
 				return;
 			} else {
 				bb_edge_new->ymin = curr_bb_edge->ymin - 1;
 				bb_coord_new->ymin = curr_bb_coord->ymin;
 			}
 		}

 		else { /* Moved up, old position was not at ymin. */
 			bb_coord_new->ymin = curr_bb_coord->ymin;
 			bb_edge_new->ymin = curr_bb_edge->ymin;
 		}

 		/* Now do the ymax fields for coordinates and number of edges. */

 		if (ynew > curr_bb_coord->ymax) { /* Moved past ymax. */
 			bb_coord_new->ymax = ynew;
 			bb_edge_new->ymax = 1;
 		}

 		else if (ynew == curr_bb_coord->ymax) { /* Moved to ymax */
 			bb_coord_new->ymax = ynew;
 			bb_edge_new->ymax = curr_bb_edge->ymax + 1;
 		}

 		else { /* ymax unchanged. */
 			bb_coord_new->ymax = curr_bb_coord->ymax;
 			bb_edge_new->ymax = curr_bb_edge->ymax;
 		}
 	}
 	/* End of move up case. */
 	else { /* ynew == yold -- no y motion. */
 		bb_coord_new->ymin = curr_bb_coord->ymin;
 		bb_coord_new->ymax = curr_bb_coord->ymax;
 		bb_edge_new->ymin = curr_bb_edge->ymin;
 		bb_edge_new->ymax = curr_bb_edge->ymax;
 	}

 	if (bb_updated_before[inet] == NOT_UPDATED_YET)
 		bb_updated_before[inet] = UPDATED_ONCE;
 }

 static void alloc_legal_placements() {
 	int i, j, k;

 	legal_pos = (t_legal_pos **) my_malloc(num_types * sizeof(t_legal_pos *));
 	num_legal_pos = (int *) my_calloc(num_types, sizeof(int));

 	/* Initialize all occupancy to zero. */

 	for (i = 0; i <= nx + 1; i++) {
 		for (j = 0; j <= ny + 1; j++) {
 			grid[i][j].usage = 0;
 			for (k = 0; k < grid[i][j].type->capacity; k++) {
 				if (grid[i][j].blocks[k] != INVALID) {
 					grid[i][j].blocks[k] = EMPTY;
 					if (grid[i][j].width_offset == 0 && grid[i][j].height_offset == 0) {
 						num_legal_pos[grid[i][j].type->index]++;
 					}
 				}
 			}
 		}
 	}

 	for (i = 0; i < num_types; i++) {
 		legal_pos[i] = (t_legal_pos *) my_malloc(num_legal_pos[i] * sizeof(t_legal_pos));
 	}
 }

 static void load_legal_placements() {
 	int i, j, k, itype;
 	int *index;

 	index = (int *) my_calloc(num_types, sizeof(int));

 	for (i = 0; i <= nx + 1; i++) {
 		for (j = 0; j <= ny + 1; j++) {
 			for (k = 0; k < grid[i][j].type->capacity; k++) {
 				if (grid[i][j].blocks[k] == INVALID) {
 					continue;
 				}
 				if (grid[i][j].width_offset == 0 && grid[i][j].height_offset == 0) {
 					itype = grid[i][j].type->index;
 					legal_pos[itype][index[itype]].x = i;
 					legal_pos[itype][index[itype]].y = j;
 					legal_pos[itype][index[itype]].z = k;
 					index[itype]++;
 				}
 			}
 		}
 	}
 	free(index);
 }

 static void free_legal_placements() {
 	int i;
 	for (i = 0; i < num_types; i++) {
 		free(legal_pos[i]);
 	}
 	free(legal_pos); /* Free the mapping list */
 	free(num_legal_pos);
 }


 static int check_macro_can_be_placed(int imacro, int itype, int x, int y, int z) {

 	int imember;
 	int member_x, member_y, member_z;

 	// Every macro can be placed until proven otherwise
 	int macro_can_be_placed = true;

 	// Check whether all the members can be placed
 	for (imember = 0; imember < pl_macros[imacro].num_blocks; imember++) {
 		member_x = x + pl_macros[imacro].members[imember].x_offset;
 		member_y = y + pl_macros[imacro].members[imember].y_offset;
 		member_z = z + pl_macros[imacro].members[imember].z_offset;

 		// Check whether the location could accept block of this type
 		// Then check whether the location could still accomodate more blocks
 		// Also check whether the member position is valid, that is the member's location
 		// still within the chip's dimemsion and the member_z is allowed at that location on the grid
 		if (member_x <= nx+1 && member_y <= ny+1
 				&& grid[member_x][member_y].type->index == itype
 				&& grid[member_x][member_y].blocks[member_z] == EMPTY) {
 			// Can still accomodate blocks here, check the next position
 			continue;
 		} else {
 			// Cant be placed here - skip to the next try
 			macro_can_be_placed = false;
 			break;
 		}
 	}

 	return (macro_can_be_placed);
 }


 static int try_place_macro(int itype, int ipos, int imacro, int * free_locations){

 	int x, y, z, member_x, member_y, member_z, imember;

 	int macro_placed = false;

 	// Choose a random position for the head
 	x = legal_pos[itype][ipos].x;
 	y = legal_pos[itype][ipos].y;
 	z = legal_pos[itype][ipos].z;

 	// If that location is occupied, do nothing.
 	if (grid[x][y].blocks[z] != EMPTY) {
 		return (macro_placed);
 	}

 	int macro_can_be_placed = check_macro_can_be_placed(imacro, itype, x, y, z);

 	if (macro_can_be_placed) {

 		// Place down the macro
 		macro_placed = true;
 		for (imember = 0; imember < pl_macros[imacro].num_blocks; imember++) {

 			member_x = x + pl_macros[imacro].members[imember].x_offset;
 			member_y = y + pl_macros[imacro].members[imember].y_offset;
 			member_z = z + pl_macros[imacro].members[imember].z_offset;

 			block[pl_macros[imacro].members[imember].blk_index].x = member_x;
 			block[pl_macros[imacro].members[imember].blk_index].y = member_y;
 			block[pl_macros[imacro].members[imember].blk_index].z = member_z;

 			grid[member_x][member_y].blocks[member_z] = pl_macros[imacro].members[imember].blk_index;
 			grid[member_x][member_y].usage++;

 			// Could not ensure that the randomiser would not pick this location again
 			// So, would have to do a lazy removal - whenever I come across a block that could not be placed,
 			// go ahead and remove it from the legal_pos[][] array

 		} // Finish placing all the members in the macro

 	} // End of this choice of legal_pos

 	return (macro_placed);

 }


 static void initial_placement_pl_macros(int macros_max_num_tries, int * free_locations) {

 	int macro_placed;
 	int imacro, iblk, itype, itry, ipos;

 	/* Macros are harder to place.  Do them first */
 	for (imacro = 0; imacro < num_pl_macros; imacro++) {

 		// Every macro are not placed in the beginnning
 		macro_placed = false;

 		// Assume that all the blocks in the macro are of the same type
 		iblk = pl_macros[imacro].members[0].blk_index;
 		itype = block[iblk].type->index;
 		if (free_locations[itype] < pl_macros[imacro].num_blocks) {
 			vpr_throw(VPR_ERROR_PLACE, __FILE__, __LINE__,
 					"Initial placement failed.\n"
 					"Could not place macro length %d with head block %s (#%d); not enough free locations of type %s (#%d).\n"
 					"VPR cannot auto-size for your circuit, please resize the FPGA manually.\n",
 					pl_macros[imacro].num_blocks, block[iblk].name, iblk, type_descriptors[itype].name, itype);
 		}

 		// Try to place the macro first, if can be placed - place them, otherwise try again
 		for (itry = 0; itry < macros_max_num_tries && macro_placed == false; itry++) {

 			// Choose a random position for the head
 			ipos = my_irand(free_locations[itype] - 1);

 			// Try to place the macro
 			macro_placed = try_place_macro(itype, ipos, imacro, free_locations);

 		} // Finished all tries


 		if (macro_placed == false){
 			// if a macro still could not be placed after macros_max_num_tries times,
 			// go through the chip exhaustively to find a legal placement for the macro
 			// place the macro on the first location that is legal
 			// then set macro_placed = true;
 			// if there are no legal positions, error out

 			// Exhaustive placement of carry macros
 			for (ipos = 0; ipos < free_locations[itype] && macro_placed == false; ipos++) {

 				// Try to place the macro
 				macro_placed = try_place_macro(itype, ipos, imacro, free_locations);

 			} // Exhausted all the legal placement position for this macro

 			// If macro could not be placed after exhaustive placement, error out
 			if (macro_placed == false) {
 				// Error out
 				vpr_throw(VPR_ERROR_PLACE, __FILE__, __LINE__,
 						"Initial placement failed.\n"
 						"Could not place macro length %d with head block %s (#%d); not enough free locations of type %s (#%d).\n"
 						"Please manually size the FPGA because VPR can't do this yet.\n",
 						pl_macros[imacro].num_blocks, block[iblk].name, iblk, type_descriptors[itype].name, itype);
 			}

 		} else {
 			// This macro has been placed successfully, proceed to place the next macro
 			continue;
 		}
 	} // Finish placing all the pl_macros successfully
 }

 static void initial_placement_blocks(int * free_locations, enum e_pad_loc_type pad_loc_type,
 		bool apply_placement_regions) {

 	/* Place blocks that are NOT a part of any macro.
 	 * We'll randomly place each block in the clustered netlist, one by one.
 	 */

 	int iblk, itype;
 	int ipos, x, y, z;

 	for (iblk = 0; iblk < num_blocks; iblk++) {

 		if (block[iblk].x != EMPTY) {
 			// block placed.
 			continue;
 		}

 		/* Don't do IOs if the user specifies IOs; we'll read those locations later. */
 		if (!(block[iblk].type == IO_TYPE && pad_loc_type == USER)) {

 		    /* Randomly select a free location of the appropriate type
 			 * for iblk.  We have a linearized list of all the free locations
 			 * that can accomodate a block of that type in free_locations[itype].
 			 * Choose one randomly and put iblk there.  Then we don't want to pick that
 			 * location again, so remove it from the free_locations array.
 			 */
 			itype = block[iblk].type->index;
 			if (free_locations[itype] <= 0) {
 				vpr_throw(VPR_ERROR_PLACE, __FILE__, __LINE__,
 						"Initial placement failed.\n"
 						"Could not place block %s (#%d); no free locations of type %s (#%d).\n",
 						block[iblk].name, iblk, type_descriptors[itype].name, itype);
 			}

 			initial_placement_location(free_locations, iblk, &ipos, &x, &y, &z);

 			// Make sure that the position is EMPTY before placing the block down
 			assert (grid[x][y].blocks[z] == EMPTY);

 			grid[x][y].blocks[z] = iblk;
 			grid[x][y].usage++;

 			block[iblk].x = x;
 			block[iblk].y = y;
 			block[iblk].z = z;

 			/* Ensure randomizer doesn't pick this location again, since it's occupied. Could shift all the
 				* legal positions in legal_pos to remove the entry (choice) we just used, but faster to
 				* just move the last entry in legal_pos to the spot we just used and decrement the
 				* count of free_locations.
 				*/
 			legal_pos[itype][ipos] = legal_pos[itype][free_locations[itype] - 1]; /* overwrite used block position */
 			free_locations[itype]--;

 		}
 	}
 }

 static void initial_placement_location(int * free_locations, int iblk,
 		int *pipos, int *px_to, int *py_to, int *pz_to) {

 	int itype = block[iblk].type->index;

 	*pipos = my_irand(free_locations[itype] - 1);
 	*px_to = legal_pos[itype][*pipos].x;
 	*py_to = legal_pos[itype][*pipos].y;
 	*pz_to = legal_pos[itype][*pipos].z;
 }

 static void initial_placement(enum e_pad_loc_type pad_loc_type,
 		char *pad_loc_file) {

 	/* Randomly places the blocks to create an initial placement. We rely on
 	 * the legal_pos array already being loaded.  That legal_pos[itype] is an
 	 * array that gives every legal value of (x,y,z) that can accomodate a block.
 	 * The number of such locations is given by num_legal_pos[itype].
 	 */
 	int i, j, k, iblk, itype, x, y, z, ipos;
 	int *free_locations; /* [0..num_types-1].
 						  * Stores how many locations there are for this type that *might* still be free.
 						  * That is, this stores the number of entries in legal_pos[itype] that are worth considering
 						  * as you look for a free location.
 						  */

 	free_locations = (int *) my_malloc(num_types * sizeof(int));
 	for (itype = 0; itype < num_types; itype++) {
 		free_locations[itype] = num_legal_pos[itype];
 	}

 	/* We'll use the grid to record where everything goes. Initialize to the grid has no
 	 * blocks placed anywhere.
 	 */
 	for (i = 0; i <= nx + 1; i++) {
 		for (j = 0; j <= ny + 1; j++) {
 			grid[i][j].usage = 0;
 			itype = grid[i][j].type->index;
 			for (k = 0; k < type_descriptors[itype].capacity; k++) {
 				if (grid[i][j].blocks[k] != INVALID) {
 					grid[i][j].blocks[k] = EMPTY;
 				}
 			}
 		}
 	}

 	/* Similarly, mark all blocks as not being placed yet. */
 	for (iblk = 0; iblk < num_blocks; iblk++) {
 		block[iblk].x = -1;
 		block[iblk].y = -1;
 		block[iblk].z = -1;
 	}

 	initial_placement_pl_macros(MAX_NUM_TRIES_TO_PLACE_MACROS_RANDOMLY, free_locations);

 	// All the macros are placed, update the legal_pos[][] array
 	for (itype = 0; itype < num_types; itype++) {
 		assert (free_locations[itype] >= 0);
 		for (ipos = 0; ipos < free_locations[itype]; ipos++) {
 			x = legal_pos[itype][ipos].x;
 			y = legal_pos[itype][ipos].y;
 			z = legal_pos[itype][ipos].z;

 			// Check if that location is occupied.  If it is, remove from legal_pos
 			if (grid[x][y].blocks[z] != EMPTY && grid[x][y].blocks[z] != INVALID) {
 				legal_pos[itype][ipos] = legal_pos[itype][free_locations[itype] - 1];
 				free_locations[itype]--;

 				// After the move, I need to check this particular entry again
 				ipos--;
 				continue;
 			}
 		}
 	} // Finish updating the legal_pos[][] and free_locations[] array

 	bool apply_placement_regions = true;
 	initial_placement_blocks(free_locations, pad_loc_type, apply_placement_regions );
 	initial_placement_blocks(free_locations, pad_loc_type);

 	if (pad_loc_type == USER) {
 		read_user_pad_loc(pad_loc_file);
 	}

 	/* Restore legal_pos */
 	load_legal_placements();

 #ifdef VERBOSE
 	vpr_printf_info("At end of initial_placement.\n");
 	if (getEchoEnabled() && isEchoFileEnabled(E_ECHO_INITIAL_CLB_PLACEMENT)) {
 		print_clb_placement(getEchoFileName(E_ECHO_INITIAL_CLB_PLACEMENT));
 	}
 #endif
 	free(free_locations);
 }

 static void free_fast_cost_update(void) {
 	int i;

 	for (i = 0; i <= ny; i++)
 		free(chanx_place_cost_fac[i]);
 	free(chanx_place_cost_fac);
 	chanx_place_cost_fac = NULL;

 	for (i = 0; i <= nx; i++)
 		free(chany_place_cost_fac[i]);
 	free(chany_place_cost_fac);
 	chany_place_cost_fac = NULL;
 }

 static void alloc_and_load_for_fast_cost_update(float place_cost_exp) {

 	/* Allocates and loads the chanx_place_cost_fac and chany_place_cost_fac *
 	 * arrays with the inverse of the average number of tracks per channel   *
 	 * between [subhigh] and [sublow].  This is only useful for the cost     *
 	 * function that takes the length of the net bounding box in each        *
 	 * dimension divided by the average number of tracks in that direction.  *
 	 * For other cost functions, you don't have to bother calling this       *
 	 * routine; when using the cost function described above, however, you   *
 	 * must always call this routine after you call init_chan and before     *
 	 * you do any placement cost determination.  The place_cost_exp factor   *
 	 * specifies to what power the width of the channel should be taken --   *
 	 * larger numbers make narrower channels more expensive.                 */

 	int low, high, i;

 	/* Access arrays below as chan?_place_cost_fac[subhigh][sublow].  Since   *
 	 * subhigh must be greater than or equal to sublow, we only need to       *
 	 * allocate storage for the lower half of a matrix.                       */

 	chanx_place_cost_fac = (float **) my_malloc((ny + 1) * sizeof(float *));
 	for (i = 0; i <= ny; i++)
 		chanx_place_cost_fac[i] = (float *) my_malloc((i + 1) * sizeof(float));

 	chany_place_cost_fac = (float **) my_malloc((nx + 1) * sizeof(float *));
 	for (i = 0; i <= nx; i++)
 		chany_place_cost_fac[i] = (float *) my_malloc((i + 1) * sizeof(float));

 	/* First compute the number of tracks between channel high and channel *
 	 * low, inclusive, in an efficient manner.                             */

 	chanx_place_cost_fac[0][0] = chan_width.x_list[0];

 	for (high = 1; high <= ny; high++) {
 		chanx_place_cost_fac[high][high] = chan_width.x_list[high];
 		for (low = 0; low < high; low++) {
 			chanx_place_cost_fac[high][low] =
 					chanx_place_cost_fac[high - 1][low] + chan_width.x_list[high];
 		}
 	}

 	/* Now compute the inverse of the average number of tracks per channel *
 	 * between high and low.  The cost function divides by the average     *
 	 * number of tracks per channel, so by storing the inverse I convert   *
 	 * this to a faster multiplication.  Take this final number to the     *
 	 * place_cost_exp power -- numbers other than one mean this is no      *
 	 * longer a simple "average number of tracks"; it is some power of     *
 	 * that, allowing greater penalization of narrow channels.             */

 	for (high = 0; high <= ny; high++)
 		for (low = 0; low <= high; low++) {
 			chanx_place_cost_fac[high][low] = (high - low + 1.)
 					/ chanx_place_cost_fac[high][low];
 			chanx_place_cost_fac[high][low] = pow(
 					(double) chanx_place_cost_fac[high][low],
 					(double) place_cost_exp);
 		}

 	/* Now do the same thing for the y-directed channels.  First get the  *
 	 * number of tracks between channel high and channel low, inclusive.  */

 	chany_place_cost_fac[0][0] = chan_width.y_list[0];

 	for (high = 1; high <= nx; high++) {
 		chany_place_cost_fac[high][high] = chan_width.y_list[high];
 		for (low = 0; low < high; low++) {
 			chany_place_cost_fac[high][low] =
 					chany_place_cost_fac[high - 1][low] + chan_width.y_list[high];
 		}
 	}

 	/* Now compute the inverse of the average number of tracks per channel *
 	 * between high and low.  Take to specified power.                     */

 	for (high = 0; high <= nx; high++)
 		for (low = 0; low <= high; low++) {
 			chany_place_cost_fac[high][low] = (high - low + 1.)
 					/ chany_place_cost_fac[high][low];
 			chany_place_cost_fac[high][low] = pow(
 					(double) chany_place_cost_fac[high][low],
 					(double) place_cost_exp);
 		}
 }

 static void check_place(float bb_cost, float timing_cost,
 		enum e_place_algorithm place_algorithm,
 		float delay_cost) {

 	/* Checks that the placement has not confused our data structures. *
 	 * i.e. the clb and block structures agree about the locations of  *
 	 * every block, blocks are in legal spots, etc.  Also recomputes   *
 	 * the final placement cost from scratch and makes sure it is      *
 	 * within roundoff of what we think the cost is.                   */

 	static int *bdone;
 	int i, j, k, error = 0, bnum;
 	float bb_cost_check;
 	int usage_check;
 	float timing_cost_check, delay_cost_check;
 	int imacro, imember, head_iblk, member_iblk, member_x, member_y, member_z;

 	bb_cost_check = comp_bb_cost(CHECK);
 	vpr_printf_info("bb_cost recomputed from scratch: %g\n", bb_cost_check);
 	if (fabs(bb_cost_check - bb_cost) > bb_cost * ERROR_TOL) {
 		vpr_printf_error(__FILE__, __LINE__,
 				"bb_cost_check: %g and bb_cost: %g differ in check_place.\n",
 				bb_cost_check, bb_cost);
 		error++;
 	}

 	if (place_algorithm == NET_TIMING_DRIVEN_PLACE
 			|| place_algorithm == PATH_TIMING_DRIVEN_PLACE) {
 		comp_td_costs(&timing_cost_check, &delay_cost_check);
 		vpr_printf_info("timing_cost recomputed from scratch: %g\n", timing_cost_check);
 		if (fabs(timing_cost_check - timing_cost) > timing_cost * ERROR_TOL) {
 			vpr_printf_error(__FILE__, __LINE__,
 					"timing_cost_check: %g and timing_cost: %g differ in check_place.\n",
 					timing_cost_check, timing_cost);
 			error++;
 		}
 		vpr_printf_info("delay_cost recomputed from scratch: %g\n", delay_cost_check);
 		if (fabs(delay_cost_check - delay_cost) > delay_cost * ERROR_TOL) {
 			vpr_printf_error(__FILE__, __LINE__,
 					"delay_cost_check: %g and delay_cost: %g differ in check_place.\n",
 					delay_cost_check, delay_cost);
 			error++;
 		}
 	}

 	bdone = (int *) my_malloc(num_blocks * sizeof(int));
 	for (i = 0; i < num_blocks; i++)
 		bdone[i] = 0;

 	/* Step through grid array. Check it against block array. */
 	for (i = 0; i <= (nx + 1); i++)
 		for (j = 0; j <= (ny + 1); j++) {
 			if (grid[i][j].usage > grid[i][j].type->capacity) {
 				vpr_printf_error(__FILE__, __LINE__,
 						"Block at grid location (%d,%d) overused. Usage is %d.\n",
 						i, j, grid[i][j].usage);
 				error++;
 			}
 			usage_check = 0;
 			for (k = 0; k < grid[i][j].type->capacity; k++) {
 				bnum = grid[i][j].blocks[k];
 				if (EMPTY == bnum || INVALID == bnum)
 					continue;

 				if (block[bnum].type != grid[i][j].type) {
 					vpr_printf_error(__FILE__, __LINE__,
 							"Block %d type does not match grid location (%d,%d) type.\n",
 							bnum, i, j);
 					error++;
 				}
 				if ((block[bnum].x != i) || (block[bnum].y != j)) {
 					vpr_printf_error(__FILE__, __LINE__,
 							"Block %d location conflicts with grid(%d,%d) data.\n",
 							bnum, i, j);
 					error++;
 				}
 				++usage_check;
 				bdone[bnum]++;
 			}
 			if (usage_check != grid[i][j].usage) {
 				vpr_printf_error(__FILE__, __LINE__,
 						"Location (%d,%d) usage is %d, but has actual usage %d.\n",
 						i, j, grid[i][j].usage, usage_check);
 				error++;
 			}
 		}

 	/* Check that every block exists in the grid and block arrays somewhere. */
 	for (i = 0; i < num_blocks; i++)
 		if (bdone[i] != 1) {
 			vpr_printf_error(__FILE__, __LINE__,
 					"Block %d listed %d times in data structures.\n",
 					i, bdone[i]);
 			error++;
 		}
 	free(bdone);

 	/* Check the pl_macro placement are legal - blocks are in the proper relative position. */
 	for (imacro = 0; imacro < num_pl_macros; imacro++) {

 		head_iblk = pl_macros[imacro].members[0].blk_index;

 		for (imember = 0; imember < pl_macros[imacro].num_blocks; imember++) {

 			member_iblk = pl_macros[imacro].members[imember].blk_index;

 			// Compute the suppossed member's x,y,z location
 			member_x = block[head_iblk].x + pl_macros[imacro].members[imember].x_offset;
 			member_y = block[head_iblk].y + pl_macros[imacro].members[imember].y_offset;
 			member_z = block[head_iblk].z + pl_macros[imacro].members[imember].z_offset;

 			// Check the block data structure first
 			if (block[member_iblk].x != member_x
 					|| block[member_iblk].y != member_y
 					|| block[member_iblk].z != member_z) {
 				vpr_printf_error(__FILE__, __LINE__,
 						"Block %d in pl_macro #%d is not placed in the proper orientation.\n",
 						member_iblk, imacro);
 				error++;
 			}

 			// Then check the grid data structure
 			if (grid[member_x][member_y].blocks[member_z] != member_iblk) {
 				vpr_printf_error(__FILE__, __LINE__,
 						"Block %d in pl_macro #%d is not placed in the proper orientation.\n",
 						member_iblk, imacro);
 				error++;
 			}
 		} // Finish going through all the members
 	} // Finish going through all the macros

 	if (error == 0) {
 		vpr_printf_info("\n");
 		vpr_printf_info("Completed placement consistency check successfully.\n");
 		vpr_printf_info("\n");
 		vpr_printf_info("Swaps called: %d\n", num_ts_called);

 #ifdef PRINT_REL_POS_DISTR
 		print_relative_pos_distr(void);
 #endif
 	} else {
 		vpr_throw(VPR_ERROR_PLACE, __FILE__, __LINE__,
 				"\nCompleted placement consistency check, %d errors found.\n"
 				"Aborting program.\n", error);
 	}

 }

 #ifdef VERBOSE
 static void print_clb_placement(const char *fname) {

 	/* Prints out the clb placements to a file.  */

 	FILE *fp;
 	int i;

 	fp = my_fopen(fname, "w", 0);
 	fprintf(fp, "Complex block placements:\n\n");

 	fprintf(fp, "Block #\tName\t(X, Y, Z).\n");
 	for(i = 0; i < num_blocks; i++) {
 		fprintf(fp, "#%d\t%s\t(%d, %d, %d).\n", i, block[i].name, block[i].x, block[i].y, block[i].z);
 	}

 	fclose(fp);
 }
 #endif

 static void free_try_swap_arrays(void) {
 	if(ts_bb_coord_new != NULL) {
 		free(ts_bb_coord_new);
 		free(ts_bb_edge_new);
 		free(ts_nets_to_update);
 		free(blocks_affected.moved_blocks);
 		free(bb_updated_before);

 		ts_bb_coord_new = NULL;
 		ts_bb_edge_new = NULL;
 		ts_nets_to_update = NULL;
 		blocks_affected.moved_blocks = NULL;
 		blocks_affected.num_moved_blocks = 0;
 		bb_updated_before = NULL;
 	}
 }