| #include <cstring> |
| #include <climits> |
| #include <cmath> |
| #include <set> |
#include <ctime>
| using namespace std; |
| |
| #include "vtr_assert.h" |
| #include "vtr_log.h" |
| |
| #include "vpr_types.h" |
| #include "vpr_error.h" |
| |
| #include "globals.h" |
| #include "atom_netlist.h" |
| #include "path_delay.h" |
| #include "path_delay2.h" |
| #include "net_delay.h" |
| #include "vpr_utils.h" |
| #include "read_xml_arch_file.h" |
| #include "echo_files.h" |
| #include "read_sdc.h" |
| #include "stats.h" |
| |
| /**************************** Top-level summary ****************************** |
| |
| Timing analysis by Vaughn Betz, Jason Luu, and Michael Wainberg. |
| |
| Timing analysis is a three-step process: |
| 1. Interpret the constraints specified by the user in an SDC (Synopsys Design |
| Constraints) file or, if none is specified, use default constraints. |
| 2. Convert the pre- or post-packed netlist (depending on the stage of the |
| flow) into a timing graph, where nodes represent pins and edges represent |
| dependencies and delays between pins (see "Timing graph structure", below). |
| 3. Traverse the timing graph to obtain information about which connections |
| to optimize, as well as statistics for the user. |
| |
| Steps 1 and 2 are performed through one of two helper functions: |
| alloc_and_load_timing_graph and alloc_and_load_pre_packing_timing_graph. |
| The first step is to create the timing graph, which is stored in the array |
| timing_ctx.tnodes ("timing node"). This is done through alloc_and_load_tnodes (post- |
| packed) or alloc_and_load_tnodes_from_prepacked_netlist. Then, the timing |
| graph is topologically sorted ("levelized") in |
| alloc_and_load_timing_graph_levels, to allow for faster traversals later. |
| |
| read_sdc reads the SDC file, interprets its contents and stores them in |
| the data structure timing_ctx.sdc. (This data structure does not need to remain |
| global but it is probably easier, since its information is used in both |
| netlists and only needs to be read in once.) |
| |
load_clock_domain_and_clock_and_io_delay then gives each flip-flop and I/O
the index of a constrained clock from the SDC file in
timing_ctx.sdc->constrained_clocks, or -1 if an I/O is unconstrained.
| |
| process_constraints does a pre-traversal through the timing graph and prunes |
| all constraints between domains that never intersect so they are not analysed. |
| |
| Step 3 is performed through do_timing_analysis. For each constraint between |
| a pair of clock domains we do a forward traversal from the "source" domain |
| to the "sink" domain to compute each tnode's arrival time, the time when the |
| latest signal would arrive at the node. We also do a backward traversal to |
| compute required time, the time when the earliest signal has to leave the |
| node to meet the constraint. The "slack" of each sink pin on each net is |
| basically the difference between the two times. |
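
As a sketch of the arithmetic (illustrative notation, not the actual field
names), for an edge e = (u, v) with delay Tdel(e):
    T_arr(v) = max over in-edges e of (T_arr(u) + Tdel(e))
    T_req(u) = min over out-edges e of (T_req(v) - Tdel(e))
    slack(e) = T_req(v) - T_arr(u) - Tdel(e)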
| |
| If path counting is on, we calculate a forward and backward path weight in |
do_path_counting. These represent the importance of the paths fanning into
and out of this pin, respectively, with higher-slack paths discounted
exponentially in importance.
| |
| If path counting is off and we are using the pre-packed netlist, we also |
| calculate normalized costs for the clusterer (normalized arrival time, slack |
| and number of critical paths) in normalized_costs. The clusterer uses these |
| to calculate a criticality for each block. |
| |
| Finally, in update_slacks, we calculate the slack for each sink pin on each net |
| for printing, as well as a derived metric, timing criticality, which the |
| optimizers actually use. If path counting is on, we calculate a path criticality |
| from the forward and backward weights on each tnode. */ |
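
/* A minimal sketch of how a caller drives this flow (the non-static entry
   points in this file; argument lists are elided and the surrounding setup
   is illustrative only):

       t_slack* slacks = alloc_and_load_timing_graph(timing_inf); // steps 1-2
       load_timing_graph_net_delays(net_delay);  // annotate inter-CLB delays
       do_timing_analysis(slacks, ...);          // step 3
       ...
       free_timing_graph(slacks);
*/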
| |
| /************************* Timing graph structure **************************** |
| |
| Author: V. Betz |
| |
| We can build timing graphs that match either the primitive (atom_ctx.nlist) |
| netlist (of basic elements before clustering, like FFs and LUTs) or that |
| match the clustered netlist (block). You pass in the is_pre_packed flag to |
| say which kind of netlist and timing graph you are working with. |
| |
| Every used (not OPEN) block pin becomes a timing node, both on primitive |
| blocks and (if you are building the timing graph that matches a clustered |
| netlist) on clustered blocks. For the clustered (not pre_packed) timing |
| graph, every used pin within a clustered (pb_type) block also becomes a timing |
node. So a CLB that contains ALMs that contain LUTs will have nodes created
| for the CLB pins, ALM pins and LUT pins, with edges that connect them as |
| specified in the clustered netlist. Unused (OPEN) pins don't create any |
| timing nodes. |
| |
The primitive blocks have edges between the tnodes that represent their input
pins and those that represent their output pins; these edges capture the
timing dependencies within these lowest-level blocks (e.g. LUTs, FFs and
IOs). The exact edges created come from reading
| the architecture file. A LUT for example will have delays specified from all |
| its inputs to its output, and hence we will create a tedge from each tnode |
| corresponding to an input pin of the LUT to the tnode that represents the LUT |
| output pin. The delay marked on each edge is read from the architecture file. |
| |
| A FF has nodes representing its pins, but also two extra nodes, representing |
| the TN_FF_SINK and TN_FF_SOURCE. Those two nodes represent the internal storage |
| node of the FF. The TN_FF_SINK has edges going to it from the regular FF inputs |
| (but not the clock input), with appropriate delays. It has no outgoing edges, |
| since it terminates timing paths. The TN_FF_SOURCE has an edge going from it to |
the FF output pin, with the appropriate delay. TN_FF_SOURCEs have no incoming
edges; they start timing paths. FF clock pins have no outgoing edges, but the delay to
| them can be looked up, and is used in the timing traversals to compute clock |
| delay for slack computations. |
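
In code terms (as loaded in the tnode builders below), the two internal FF
edges carry the setup and clock-to-Q delays read from the architecture:
    TN_FF_IPIN   --(Tdel = tsu)-->     TN_FF_SINK   (ends a timing path)
    TN_FF_SOURCE --(Tdel = tco_max)--> TN_FF_OPIN   (starts a timing path)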
| |
| In the timing graph I create, input pads and constant generators have no |
| inputs (incoming edges), just like TN_FF_SOURCES. Every input pad and output |
| pad is represented by two tnodes -- an input pin/source and an output pin/ |
| sink. For an input pad the input source comes from off chip and has no fanin, |
| while the output pin drives signals within the chip. For output pads, the |
| input pin is driven by signal (net) within the chip, and the output sink node |
goes off chip and has no fanout (out-edges). I need two nodes to represent
| things like pads because I mark all delay on tedges, not on tnodes. |
| |
| One other subblock that needs special attention is a constant generator. |
| This has no used inputs, but its output is used. I create an extra tnode, |
| a dummy input, in addition to the output pin tnode. The dummy tnode has |
| no fanin. Since constant generators really generate their outputs at T = |
| -infinity, I set the delay from the input tnode to the output to a large- |
| magnitude negative number. This guarantees every block that needs the |
| output of a constant generator sees it available very early. |
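
In the post-packed loader below (alloc_and_load_tnodes), for example, this
shows up as retyping the output pin's tnode and marking its out-edges:
    timing_ctx.tnodes[i].out_edges[count].Tdel = HUGE_NEGATIVE_FLOAT;
    timing_ctx.tnodes[i].type = TN_CONSTANT_GEN_SOURCE;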
| |
| The main structure of the timing graph is given by the nodes and the edges |
that connect them. We also store some extra information on each tnode that
(1) lets us map from the tnode back to the netlist pin (or other item) it
represents, (2) tells us what clock domain the tnode is on, if it is a timing
path start point (no incoming edges) or end point (no outgoing edges), and
(3) gives us the delay of the clock to the tnode, if it is a timing path
start or end point.
| |
To map efficiently from tedges back to the netlist pins, we create the tedge
array driven by a tnode that represents a netlist output pin *in the same order
as the netlist net pins*. That means the edge index for the tedge array from
such a tnode is guaranteed to be iedge = net_pin_index - 1. The code that maps
slacks from the timing graph back to the netlist relies on this. */
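
/* As a concrete use of the ordering guarantee above (mirroring
   load_timing_graph_net_delays below), the delay to net pin ipin lands on
   out-edge ipin - 1 of that net's driver tnode:

       int inode = f_net_to_driver_tnode[inet];
       t_tedge* tedge = timing_ctx.tnodes[inode].out_edges;
       for (size_t ipin = 1; ipin < num_timing_net_pins(inet); ipin++)
           tedge[ipin - 1].Tdel = net_delay[(ClusterNetId)inet][ipin];
*/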
| |
| /******************** Variables local to this module ************************/ |
| |
| #define NUM_BUCKETS 5 /* Used when printing slack and criticality. */ |
| |
| /* Variables for "chunking" the tedge memory. If the head pointer in tedge_ch is NULL, * |
| * no timing graph exists now. */ |
| |
| static vtr::t_chunk tedge_ch; |
| |
| static vector<size_t> f_num_timing_net_pins; |
| |
| static t_timing_stats * f_timing_stats = nullptr; /* Critical path delay and worst-case slack per constraint. */ |
| |
| static int * f_net_to_driver_tnode; |
| /* [0..net.size() - 1]. Gives the index of the tnode that drives each net. |
| Used for both pre- and post-packed netlists. If you just want the number |
| of edges on the driver tnode, use: |
num_edges = num_timing_net_sinks(inet);
| instead of the equivalent but more convoluted: |
| driver_tnode = f_net_to_driver_tnode[inet]; |
| num_edges = timing_ctx.tnodes[driver_tnode].num_edges; |
| Only use this array if you want the actual edges themselves or the index |
| of the driver tnode. */ |
| |
| /***************** Subroutines local to this module *************************/ |
| |
| static t_slack * alloc_slacks(); |
| |
| static void update_slacks(t_slack * slacks, float criticality_denom, |
| bool update_slack, bool is_final_analysis, float smallest_slack_in_design, const t_timing_inf &timing_inf); |
| |
| static void alloc_and_load_tnodes(const t_timing_inf &timing_inf); |
| |
| static void alloc_and_load_tnodes_from_prepacked_netlist(float inter_cluster_net_delay, |
| const std::unordered_map<AtomBlockId,t_pb_graph_node*>& expected_lowest_cost_pb_gnode); |
| |
| static void mark_max_block_delay(const std::unordered_map<AtomBlockId,t_pb_graph_node*>& expected_lowest_cost_pb_gnode); |
| |
| static void alloc_timing_stats(); |
| |
| static float do_timing_analysis_for_constraint(int source_clock_domain, int sink_clock_domain, |
| bool is_prepacked, bool is_final_analysis, long * max_critical_input_paths_ptr, |
| long * max_critical_output_paths_ptr, vtr::vector_map<ClusterBlockId, t_pb **> &pin_id_to_pb_mapping, const t_timing_inf &timing_inf); |
| |
| #ifdef PATH_COUNTING |
| static void do_path_counting(float criticality_denom); |
| #endif |
| |
| static float find_least_slack(bool is_prepacked, vtr::vector_map<ClusterBlockId, t_pb **> &pin_id_to_pb_mapping); |
| |
| static void load_tnode(t_pb_graph_pin *pb_graph_pin, const ClusterBlockId iblock, |
| int *inode); |
| |
| #ifndef PATH_COUNTING |
| static void update_normalized_costs(float T_arr_max_this_domain, long max_critical_input_paths, |
| long max_critical_output_paths, const t_timing_inf &timing_inf); |
| #endif |
| |
| //static void print_primitive_as_blif(FILE *fpout, int iblk, int **lookup_tnode_from_pin_id); |
| |
| static void load_clock_domain_and_clock_and_io_delay(bool is_prepacked, vtr::vector_map<ClusterBlockId, std::vector<int>> &lookup_tnode_from_pin_id, vtr::vector_map<ClusterBlockId, t_pb **> &pin_id_to_pb_mapping); |
| |
| static const char * find_tnode_net_name(int inode, bool is_prepacked, vtr::vector_map<ClusterBlockId, t_pb **> &pin_id_to_pb_mapping); |
| |
| static t_tnode * find_ff_clock_tnode(int inode, bool is_prepacked, vtr::vector_map<ClusterBlockId, std::vector<int>> &lookup_tnode_from_pin_id); |
| |
| static inline bool has_valid_T_arr(int inode); |
| |
| static inline bool has_valid_T_req(int inode); |
| |
| static int find_clock(const char * net_name); |
| |
| static int find_input(const char * net_name); |
| |
| static int find_output(const char * net_name); |
| |
| static int find_cf_constraint(const char * source_clock_name, const char * sink_ff_name); |
| |
| static void propagate_clock_domain_and_skew(int inode); |
| |
| static void process_constraints(); |
| |
| static void print_global_criticality_stats(FILE * fp, float ** criticality, const char * singular_name, const char * capitalized_plural_name); |
| |
| static void print_timing_constraint_info(const char *fname); |
| |
| static void print_spaces(FILE * fp, int num_spaces); |
| |
| //The following functions handle counting of net pins for both |
| //the atom and clb netlist. |
//TODO: remove - these are temporarily in use until we re-unify the timing graph to be driven directly by the atom netlist only
| static std::vector<size_t> init_timing_net_pins(); |
| static std::vector<size_t> init_atom_timing_net_pins(); |
| size_t num_timing_nets(); |
| size_t num_timing_net_pins(int inet); |
| size_t num_timing_net_sinks(int inet); |
| |
| /********************* Subroutine definitions *******************************/ |
| |
| t_slack * alloc_and_load_timing_graph(t_timing_inf timing_inf) { |
| |
| /* This routine builds the graph used for timing analysis. Every cb pin is a |
| * timing node (tnode). The connectivity between pins is * |
| * represented by timing edges (tedges). All delay is marked on edges, not * |
| * on nodes. Returns two arrays that will store slack values: * |
* slack and criticality ([0..net.size()-1][1..num_pins-1]). */
| |
| /* For pads, only the first two pin locations are used (input to pad is first, |
| * output of pad is second). For CLBs, all OPEN pins on the cb have their |
| * mapping set to OPEN so I won't use it by mistake. */ |
| |
| t_slack * slacks = nullptr; |
| bool do_process_constraints = false; |
| vtr::vector_map<ClusterBlockId, std::vector<int>> lookup_tnode_from_pin_id; |
| vtr::vector_map<ClusterBlockId, t_pb **> pin_id_to_pb_mapping = alloc_and_load_pin_id_to_pb_mapping(); |
| |
| if (tedge_ch.chunk_ptr_head != nullptr) { |
| vpr_throw(VPR_ERROR_TIMING, __FILE__, __LINE__, |
| "in alloc_and_load_timing_graph: An old timing graph still exists.\n"); |
| } |
| |
| f_num_timing_net_pins = init_timing_net_pins(); |
| |
| alloc_and_load_tnodes(timing_inf); |
| |
| detect_and_fix_timing_graph_combinational_loops(); |
| |
| alloc_and_load_timing_graph_levels(); |
| |
| check_timing_graph(); |
| |
| slacks = alloc_slacks(); |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| if (timing_ctx.sdc == nullptr) { |
| /* the SDC timing constraints only need to be read in once; * |
| * if they haven't been already, do it now */ |
| read_sdc(timing_inf); |
| do_process_constraints = true; |
| } |
| |
| lookup_tnode_from_pin_id = alloc_and_load_tnode_lookup_from_pin_id(); |
| |
| load_clock_domain_and_clock_and_io_delay(false, lookup_tnode_from_pin_id, pin_id_to_pb_mapping); |
| |
| if (do_process_constraints) |
| process_constraints(); |
| |
| if (f_timing_stats == nullptr) |
| alloc_timing_stats(); |
| |
| free_tnode_lookup_from_pin_id(lookup_tnode_from_pin_id); |
| free_pin_id_to_pb_mapping(pin_id_to_pb_mapping); |
| |
| return slacks; |
| } |
| |
| t_slack * alloc_and_load_pre_packing_timing_graph(float inter_cluster_net_delay, t_timing_inf timing_inf, |
| const std::unordered_map<AtomBlockId,t_pb_graph_node*>& expected_lowest_cost_pb_gnode) { |
| |
| /* This routine builds the graph used for timing analysis. Every technology- |
| * mapped netlist pin is a timing node (tnode). The connectivity between pins is * |
| * represented by timing edges (tedges). All delay is marked on edges, not * |
| * on nodes. Returns two arrays that will store slack values: * |
* slack and criticality ([0..net.size()-1][1..num_pins-1]). */
| |
| /* For pads, only the first two pin locations are used (input to pad is first, |
| * output of pad is second). For CLBs, all OPEN pins on the cb have their |
| * mapping set to OPEN so I won't use it by mistake. */ |
| |
| t_slack * slacks = nullptr; |
| bool do_process_constraints = false; |
| vtr::vector_map<ClusterBlockId, std::vector<int>> lookup_tnode_from_pin_id; |
| vtr::vector_map<ClusterBlockId, t_pb **> pin_id_to_pb_mapping = alloc_and_load_pin_id_to_pb_mapping(); |
| |
| if (tedge_ch.chunk_ptr_head != nullptr) { |
| vpr_throw(VPR_ERROR_TIMING,__FILE__, __LINE__, |
| "in alloc_and_load_timing_graph: An old timing graph still exists.\n"); |
| } |
| |
| lookup_tnode_from_pin_id = alloc_and_load_tnode_lookup_from_pin_id(); |
| |
| f_num_timing_net_pins = init_atom_timing_net_pins(); |
| |
| alloc_and_load_tnodes_from_prepacked_netlist(inter_cluster_net_delay, expected_lowest_cost_pb_gnode); |
| |
| mark_max_block_delay(expected_lowest_cost_pb_gnode); |
| |
| detect_and_fix_timing_graph_combinational_loops(); |
| |
| alloc_and_load_timing_graph_levels(); |
| |
| slacks = alloc_slacks(); |
| |
| check_timing_graph(); |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| if (timing_ctx.sdc == nullptr) { |
| /* the SDC timing constraints only need to be read in once; * |
| * if they haven't been already, do it now */ |
| read_sdc(timing_inf); |
| do_process_constraints = true; |
| } |
| |
| load_clock_domain_and_clock_and_io_delay(true, lookup_tnode_from_pin_id, pin_id_to_pb_mapping); |
| |
| if (do_process_constraints) |
| process_constraints(); |
| |
| if (f_timing_stats == nullptr) |
| alloc_timing_stats(); |
| |
| free_pin_id_to_pb_mapping(pin_id_to_pb_mapping); |
| free_tnode_lookup_from_pin_id(lookup_tnode_from_pin_id); |
| |
| return slacks; |
| } |
| |
| static t_slack * alloc_slacks() { |
| |
| /* Allocates the slack, criticality and path_criticality structures |
| ([0..net.size()-1][1..num_pins-1]). Chunk allocated to save space. */ |
| |
| t_slack * slacks = (t_slack *) vtr::malloc(sizeof(t_slack)); |
| |
| slacks->slack = (float **) vtr::malloc(num_timing_nets() * sizeof(float *)); |
| slacks->timing_criticality = (float **) vtr::malloc(num_timing_nets() * sizeof(float *)); |
| #ifdef PATH_COUNTING |
| slacks->path_criticality = (float **) vtr::malloc(num_timing_nets() * sizeof(float *)); |
| #endif |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| slacks->slack[inet] = (float *) vtr::chunk_malloc(num_timing_net_pins(inet) * sizeof(float), &tedge_ch); |
| slacks->timing_criticality[inet] = (float *) vtr::chunk_malloc(num_timing_net_pins(inet) * sizeof(float), &tedge_ch); |
| #ifdef PATH_COUNTING |
| slacks->path_criticality[inet] = (float *) vtr::chunk_malloc(num_timing_net_pins(inet) * sizeof(float), &tedge_ch); |
| #endif |
| } |
| |
| return slacks; |
| } |
| |
| void load_timing_graph_net_delays(vtr::vector_map<ClusterNetId, float *> &net_delay) { |
| |
| /* Sets the delays of the inter-CLB nets to the values specified by * |
| * net_delay[0..net.size()-1][1..num_pins-1]. These net delays should have * |
| * been allocated and loaded with the net_delay routines. This routine * |
| * marks the corresponding edges in the timing graph with the proper delay. */ |
| |
| clock_t begin = clock(); |
| |
| VTR_ASSERT(net_delay.size()); |
| |
| int inode; |
| unsigned ipin; |
| t_tedge *tedge; |
| |
| auto& timing_ctx = g_vpr_ctx.mutable_timing(); |
| |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| inode = f_net_to_driver_tnode[inet]; |
| tedge = timing_ctx.tnodes[inode].out_edges; |
| |
| /* Note that the edges of a tnode corresponding to a CLB or INPAD opin must * |
| * be in the same order as the pins of the net driven by the tnode. */ |
| |
| for (ipin = 1; ipin < num_timing_net_pins(inet); ipin++) |
| tedge[ipin - 1].Tdel = net_delay[(ClusterNetId)inet][ipin]; |
| } |
| |
| clock_t end = clock(); |
| |
| timing_ctx.stats.old_delay_annotation_wallclock_time += double(end - begin) / CLOCKS_PER_SEC; |
| } |
| |
| void free_timing_graph(t_slack * slacks) { |
| |
| int inode; |
| |
| auto& timing_ctx = g_vpr_ctx.mutable_timing(); |
| |
| if (tedge_ch.chunk_ptr_head == nullptr) { |
| vtr::printf_warning(__FILE__, __LINE__, "in free_timing_graph: No timing graph to free.\n"); |
| } |
| |
| free_chunk_memory(&tedge_ch); |
| |
| if (timing_ctx.tnodes != nullptr && timing_ctx.tnodes[0].prepacked_data) { |
| /* If we allocated prepacked_data for the first node, |
| it must be allocated for all other nodes too. */ |
| for (inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| free(timing_ctx.tnodes[inode].prepacked_data); |
| } |
| } |
| free(timing_ctx.tnodes); |
| free(f_net_to_driver_tnode); |
| vtr::free_ivec_vector(timing_ctx.tnodes_at_level, 0, timing_ctx.num_tnode_levels - 1); |
| |
| free(slacks->slack); |
| free(slacks->timing_criticality); |
| #ifdef PATH_COUNTING |
| free(slacks->path_criticality); |
| #endif |
| free(slacks); |
| |
| timing_ctx.tnodes = nullptr; |
| timing_ctx.num_tnodes = 0; |
| f_net_to_driver_tnode = nullptr; |
| //timing_ctx.tnodes_at_level = NULL; |
| timing_ctx.num_tnode_levels = 0; |
| slacks = nullptr; |
| } |
| |
| void free_timing_stats() { |
| int i; |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| if(f_timing_stats != nullptr) { |
| for (i = 0; i < timing_ctx.sdc->num_constrained_clocks; i++) { |
| free(f_timing_stats->cpd[i]); |
| free(f_timing_stats->least_slack[i]); |
| } |
| free(f_timing_stats->cpd); |
| free(f_timing_stats->least_slack); |
| free(f_timing_stats); |
| } |
| f_timing_stats = nullptr; |
| } |
| |
| void print_slack(float ** slack, bool slack_is_normalized, const char *fname) { |
| |
| /* Prints slacks into a file. */ |
| |
| int ibucket, driver_tnode, num_unused_slacks = 0; |
| t_tedge * tedge; |
| FILE *fp; |
| float max_slack = HUGE_NEGATIVE_FLOAT, min_slack = HUGE_POSITIVE_FLOAT, |
| total_slack = 0, total_negative_slack = 0, bucket_size, slk; |
| int slacks_in_bucket[NUM_BUCKETS]; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| fp = vtr::fopen(fname, "w"); |
| |
| if (slack_is_normalized) { |
| fprintf(fp, "The following slacks have been normalized to be non-negative by " |
| "relaxing the required times to the maximum arrival time.\n\n"); |
| } |
| |
| /* Go through slack once to get the largest and smallest slack, both for reporting and |
| so that we can delimit the buckets. Also calculate the total negative slack in the design. */ |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| size_t num_edges = num_timing_net_sinks(inet); |
| for (size_t iedge = 0; iedge < num_edges; iedge++) { |
| slk = slack[inet][iedge + 1]; |
| if (slk < HUGE_POSITIVE_FLOAT - 1) { /* if slack was analysed */ |
| max_slack = max(max_slack, slk); |
| min_slack = min(min_slack, slk); |
| total_slack += slk; |
| if (slk < NEGATIVE_EPSILON) { |
| total_negative_slack -= slk; /* By convention, we'll have total_negative_slack be a positive number. */ |
| } |
| } else { /* slack was never analysed */ |
| num_unused_slacks++; |
| } |
| } |
| } |
| |
| if (max_slack > HUGE_NEGATIVE_FLOAT + 1) { |
| fprintf(fp, "Largest slack in design: %g\n", max_slack); |
| } else { |
| fprintf(fp, "Largest slack in design: --\n"); |
| } |
| |
| if (min_slack < HUGE_POSITIVE_FLOAT - 1) { |
| fprintf(fp, "Smallest slack in design: %g\n", min_slack); |
| } else { |
| fprintf(fp, "Smallest slack in design: --\n"); |
| } |
| |
| fprintf(fp, "Total slack in design: %g\n", total_slack); |
| fprintf(fp, "Total negative slack: %g\n", total_negative_slack); |
| |
| if (max_slack - min_slack > EPSILON) { /* Only sort the slacks into buckets if not all slacks are the same (if they are identical, no need to sort). */ |
| /* Initialize slacks_in_bucket, an array counting how many slacks are within certain linearly-spaced ranges (buckets). */ |
| for (ibucket = 0; ibucket < NUM_BUCKETS; ibucket++) { |
| slacks_in_bucket[ibucket] = 0; |
| } |
| |
| /* The size of each bucket is the range of slacks, divided by the number of buckets. */ |
| bucket_size = (max_slack - min_slack)/NUM_BUCKETS; |
| |
| /* Now, pass through again, incrementing the number of slacks in the nth bucket |
| for each slack between (min_slack + n*bucket_size) and (min_slack + (n+1)*bucket_size). */ |
| |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| size_t num_edges = num_timing_net_sinks(inet); |
| for (size_t iedge = 0; iedge < num_edges; iedge++) { |
| slk = slack[inet][iedge + 1]; |
| if (slk < HUGE_POSITIVE_FLOAT - 1) { |
| /* We have to watch out for the special case where slack = max_slack, in which case ibucket = NUM_BUCKETS and we go out of bounds of the array. */ |
| ibucket = min(NUM_BUCKETS - 1, (int) ((slk - min_slack)/bucket_size)); |
| VTR_ASSERT(ibucket >= 0 && ibucket < NUM_BUCKETS); |
| slacks_in_bucket[ibucket]++; |
| } |
| } |
| } |
| |
| /* Now print how many slacks are in each bucket. */ |
| fprintf(fp, "\n\nRange\t\t"); |
| for (ibucket = 0; ibucket < NUM_BUCKETS; ibucket++) { |
| fprintf(fp, "%.1e to ", min_slack); |
| min_slack += bucket_size; |
| fprintf(fp, "%.1e\t", min_slack); |
| } |
| fprintf(fp, "Not analysed"); |
| fprintf(fp, "\nSlacks in range\t\t"); |
| for (ibucket = 0; ibucket < NUM_BUCKETS; ibucket++) { |
| fprintf(fp, "%d\t\t\t", slacks_in_bucket[ibucket]); |
| } |
| fprintf(fp, "%d", num_unused_slacks); |
| } |
| |
| /* Finally, print all the slacks, organized by net. */ |
| fprintf(fp, "\n\nNet #\tDriver_tnode\tto_node\tSlack\n\n"); |
| |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| driver_tnode = f_net_to_driver_tnode[inet]; |
| size_t num_edges = timing_ctx.tnodes[driver_tnode].num_edges; |
| tedge = timing_ctx.tnodes[driver_tnode].out_edges; |
| slk = slack[inet][1]; |
| if (slk < HUGE_POSITIVE_FLOAT - 1) { |
| fprintf(fp, "%5zu\t%5d\t\t%5d\t%g\n", inet, driver_tnode, tedge[0].to_node, slk); |
| } else { /* Slack is meaningless, so replace with --. */ |
| fprintf(fp, "%5zu\t%5d\t\t%5d\t--\n", inet, driver_tnode, tedge[0].to_node); |
| } |
| for (size_t iedge = 1; iedge < num_edges; iedge++) { /* newline and indent subsequent edges after the first */ |
| slk = slack[inet][iedge+1]; |
| if (slk < HUGE_POSITIVE_FLOAT - 1) { |
| fprintf(fp, "\t\t\t%5d\t%g\n", tedge[iedge].to_node, slk); |
| } else { /* Slack is meaningless, so replace with --. */ |
| fprintf(fp, "\t\t\t%5d\t--\n", tedge[iedge].to_node); |
| } |
| } |
| } |
| |
| fclose(fp); |
| } |
| |
| void print_criticality(t_slack * slacks, const char *fname) { |
| |
| /* Prints timing criticalities (and path criticalities if enabled) into a file. */ |
| |
| int iedge, driver_tnode, num_edges; |
| t_tedge * tedge; |
| FILE *fp; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| fp = vtr::fopen(fname, "w"); |
| |
| print_global_criticality_stats(fp, slacks->timing_criticality, "timing criticality", "Timing criticalities"); |
| #ifdef PATH_COUNTING |
| print_global_criticality_stats(fp, slacks->path_criticality, "path criticality", "Path criticalities"); |
| #endif |
| |
| /* Finally, print all the criticalities, organized by net. */ |
| fprintf(fp, "\n\nNet #\tDriver_tnode\t to_node\tTiming criticality" |
| #ifdef PATH_COUNTING |
| "\tPath criticality" |
| #endif |
| "\n"); |
| |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| driver_tnode = f_net_to_driver_tnode[inet]; |
| num_edges = timing_ctx.tnodes[driver_tnode].num_edges; |
| tedge = timing_ctx.tnodes[driver_tnode].out_edges; |
| |
| fprintf(fp, "\n%5zu\t%5d\t\t%5d\t\t%.6f", inet, driver_tnode, tedge[0].to_node, slacks->timing_criticality[inet][1]); |
| #ifdef PATH_COUNTING |
| fprintf(fp, "\t\t%g", slacks->path_criticality[inet][1]); |
| #endif |
| for (iedge = 1; iedge < num_edges; iedge++) { /* newline and indent subsequent edges after the first */ |
| fprintf(fp, "\n\t\t\t%5d\t\t%.6f", tedge[iedge].to_node, slacks->timing_criticality[inet][iedge+1]); |
| #ifdef PATH_COUNTING |
| fprintf(fp, "\t\t%g", slacks->path_criticality[inet][iedge+1]); |
| #endif |
| } |
| } |
| |
| fclose(fp); |
| } |
| |
| static void print_global_criticality_stats(FILE * fp, float ** criticality, const char * singular_name, const char * capitalized_plural_name) { |
| |
| /* Prints global stats for timing or path criticality to the file pointed to by fp, |
| including maximum criticality, minimum criticality, total criticality in the design, |
| and the number of criticalities within various ranges, or buckets. */ |
| |
| int iedge, num_edges, ibucket, criticalities_in_bucket[NUM_BUCKETS]; |
| float crit, max_criticality = HUGE_NEGATIVE_FLOAT, min_criticality = HUGE_POSITIVE_FLOAT, |
| total_criticality = 0, bucket_size; |
| |
| /* Go through criticality once to get the largest and smallest timing criticality, |
| both for reporting and so that we can delimit the buckets. */ |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| num_edges = num_timing_net_sinks(inet); |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| crit = criticality[inet][iedge + 1]; |
| max_criticality = max(max_criticality, crit); |
| min_criticality = min(min_criticality, crit); |
| total_criticality += crit; |
| } |
| } |
| |
| fprintf(fp, "Largest %s in design: %g\n", singular_name, max_criticality); |
| fprintf(fp, "Smallest %s in design: %g\n", singular_name, min_criticality); |
| fprintf(fp, "Total %s in design: %g\n", singular_name, total_criticality); |
| |
| if (max_criticality - min_criticality > EPSILON) { /* Only sort the criticalities into buckets if not all criticalities are the same (if they are identical, no need to sort). */ |
| /* Initialize criticalities_in_bucket, an array counting how many criticalities are within certain linearly-spaced ranges (buckets). */ |
| for (ibucket = 0; ibucket < NUM_BUCKETS; ibucket++) { |
| criticalities_in_bucket[ibucket] = 0; |
| } |
| |
| /* The size of each bucket is the range of criticalities, divided by the number of buckets. */ |
| bucket_size = (max_criticality - min_criticality)/NUM_BUCKETS; |
| |
| /* Now, pass through again, incrementing the number of criticalities in the nth bucket |
| for each criticality between (min_criticality + n*bucket_size) and (min_criticality + (n+1)*bucket_size). */ |
| |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| num_edges = num_timing_net_sinks(inet); |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| crit = criticality[inet][iedge + 1]; |
| /* We have to watch out for the special case where criticality = max_criticality, in which case ibucket = NUM_BUCKETS and we go out of bounds of the array. */ |
| ibucket = min(NUM_BUCKETS - 1, (int) ((crit - min_criticality)/bucket_size)); |
| VTR_ASSERT(ibucket >= 0 && ibucket < NUM_BUCKETS); |
| criticalities_in_bucket[ibucket]++; |
| } |
| } |
| |
| /* Now print how many criticalities are in each bucket. */ |
| fprintf(fp, "\nRange\t\t"); |
| for (ibucket = 0; ibucket < NUM_BUCKETS; ibucket++) { |
| fprintf(fp, "%.1e to ", min_criticality); |
| min_criticality += bucket_size; |
| fprintf(fp, "%.1e\t", min_criticality); |
| } |
| fprintf(fp, "\n%s in range\t\t", capitalized_plural_name); |
| for (ibucket = 0; ibucket < NUM_BUCKETS; ibucket++) { |
| fprintf(fp, "%d\t\t\t", criticalities_in_bucket[ibucket]); |
| } |
| } |
| fprintf(fp, "\n\n"); |
| } |
| |
| void print_net_delay(vtr::vector_map<ClusterNetId, float *> &net_delay, const char *fname) { |
| |
| /* Prints the net delays into a file. */ |
| |
| int iedge, driver_tnode, num_edges; |
| t_tedge * tedge; |
| FILE *fp; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| fp = vtr::fopen(fname, "w"); |
| |
| fprintf(fp, "Net #\tDriver_tnode\tto_node\tDelay\n\n"); |
| |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| driver_tnode = f_net_to_driver_tnode[inet]; |
| num_edges = timing_ctx.tnodes[driver_tnode].num_edges; |
| tedge = timing_ctx.tnodes[driver_tnode].out_edges; |
| fprintf(fp, "%5zu\t%5d\t\t%5d\t%g\n", inet, driver_tnode, tedge[0].to_node, net_delay[(ClusterNetId)inet][1]); |
| for (iedge = 1; iedge < num_edges; iedge++) { /* newline and indent subsequent edges after the first */ |
| fprintf(fp, "\t\t\t%5d\t%g\n", tedge[iedge].to_node, net_delay[(ClusterNetId)inet][iedge+1]); |
| } |
| } |
| |
| fclose(fp); |
| } |
| |
| #ifndef PATH_COUNTING |
| void print_clustering_timing_info(const char *fname) { |
| /* Print information from tnodes which is used by the clusterer. */ |
| int inode; |
| FILE *fp; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| fp = vtr::fopen(fname, "w"); |
| |
| fprintf(fp, "inode "); |
| if (timing_ctx.sdc->num_constrained_clocks <= 1) { |
| /* These values are from the last constraint analysed, |
| so they're not meaningful unless there was only one constraint. */ |
| fprintf(fp, "Critical input paths Critical output paths "); |
| } |
| fprintf(fp, "Normalized slack Normalized Tarr Normalized total crit paths\n"); |
| for (inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| fprintf(fp, "%d\t", inode); |
| /* Only print normalized values for tnodes which have valid normalized values. (If normalized_T_arr is valid, the others will be too.) */ |
| if (has_valid_normalized_T_arr(inode)) { |
| if (timing_ctx.sdc->num_constrained_clocks <= 1) { |
| fprintf(fp, "%ld\t\t\t%ld\t\t\t", timing_ctx.tnodes[inode].prepacked_data->num_critical_input_paths, timing_ctx.tnodes[inode].prepacked_data->num_critical_output_paths); |
| } |
| fprintf(fp, "%f\t%f\t%f\n", timing_ctx.tnodes[inode].prepacked_data->normalized_slack, timing_ctx.tnodes[inode].prepacked_data->normalized_T_arr, timing_ctx.tnodes[inode].prepacked_data->normalized_total_critical_paths); |
| } else { |
| if (timing_ctx.sdc->num_constrained_clocks <= 1) { |
| fprintf(fp, "--\t\t\t--\t\t\t"); |
| } |
| fprintf(fp, "--\t\t--\t\t--\n"); |
| } |
| } |
| |
| fclose(fp); |
| } |
| #endif |
/* Counts the number of tnodes, allocates space, and loads the tnodes and their associated edges */
| static void alloc_and_load_tnodes(const t_timing_inf &timing_inf) { |
| int j, k; |
| int inode, num_nodes_in_block; |
| int count, i_pin_id; |
| int dport, dpin, dnode; |
| ClusterBlockId iblock, dblock; |
| ClusterNetId net_id; |
| int normalized_pin, normalization; |
| t_pb_graph_pin *ipb_graph_pin; |
| t_pb_route *intra_lb_route, *d_intra_lb_route; |
| int num_dangling_pins; |
| t_pb_graph_pin*** intra_lb_pb_pin_lookup; |
| vtr::vector_map<ClusterBlockId, std::vector<int>> lookup_tnode_from_pin_id; |
| |
| auto& device_ctx = g_vpr_ctx.mutable_device(); |
| auto& cluster_ctx = g_vpr_ctx.clustering(); |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| auto& timing_ctx = g_vpr_ctx.mutable_timing(); |
| |
| f_net_to_driver_tnode = (int*)vtr::malloc(num_timing_nets() * sizeof(int)); |
| |
| intra_lb_pb_pin_lookup = new t_pb_graph_pin**[device_ctx.num_block_types]; |
| for (int i = 0; i < device_ctx.num_block_types; i++) { |
| intra_lb_pb_pin_lookup[i] = alloc_and_load_pb_graph_pin_lookup_from_index(&device_ctx.block_types[i]); |
| } |
| |
| for (size_t i = 0; i < num_timing_nets(); i++) { |
| f_net_to_driver_tnode[i] = OPEN; |
| } |
| |
| /* allocate space for tnodes */ |
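/* Pins of type INPAD, OUTPAD or SEQUENTIAL get two tnodes each (the pin
 * itself plus an internal source/sink node, as described in the file header);
 * every other used pin gets one, and unused (OPEN) pins get none. */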
| timing_ctx.num_tnodes = 0; |
| for (auto blk_id : cluster_ctx.clb_nlist.blocks()) { |
| num_nodes_in_block = 0; |
| int itype = cluster_ctx.clb_nlist.block_type(blk_id)->index; |
| for (j = 0; j < cluster_ctx.clb_nlist.block_pb(blk_id)->pb_graph_node->total_pb_pins; j++) { |
| if (cluster_ctx.clb_nlist.block_pb(blk_id)->pb_route[j].atom_net_id) { |
| if (intra_lb_pb_pin_lookup[itype][j]->type == PB_PIN_INPAD |
| || intra_lb_pb_pin_lookup[itype][j]->type == PB_PIN_OUTPAD |
| || intra_lb_pb_pin_lookup[itype][j]->type == PB_PIN_SEQUENTIAL) { |
| num_nodes_in_block += 2; |
| } else { |
| num_nodes_in_block++; |
| } |
| } |
| } |
| timing_ctx.num_tnodes += num_nodes_in_block; |
| } |
| timing_ctx.tnodes = (t_tnode*)vtr::calloc(timing_ctx.num_tnodes, sizeof(t_tnode)); |
| |
| /* load tnodes with all info except edge info */ |
| /* populate tnode lookups for edge info */ |
| inode = 0; |
| for (auto blk_id : cluster_ctx.clb_nlist.blocks()) { |
| int itype = cluster_ctx.clb_nlist.block_type(blk_id)->index; |
| for (j = 0; j < cluster_ctx.clb_nlist.block_pb(blk_id)->pb_graph_node->total_pb_pins; j++) { |
| if (cluster_ctx.clb_nlist.block_pb(blk_id)->pb_route[j].atom_net_id) { |
| VTR_ASSERT(timing_ctx.tnodes[inode].pb_graph_pin == nullptr); |
| load_tnode(intra_lb_pb_pin_lookup[itype][j], blk_id, &inode); |
| } |
| } |
| } |
| VTR_ASSERT(inode == timing_ctx.num_tnodes); |
| num_dangling_pins = 0; |
| |
| lookup_tnode_from_pin_id = alloc_and_load_tnode_lookup_from_pin_id(); |
| |
| |
| /* load edge delays and initialize clock domains to OPEN |
| and prepacked_data (which is not used post-packing) to NULL. */ |
| std::set<int> const_gen_tnodes; |
| for (int i = 0; i < timing_ctx.num_tnodes; i++) { |
| timing_ctx.tnodes[i].clock_domain = OPEN; |
| timing_ctx.tnodes[i].prepacked_data = nullptr; |
| |
/* Three primary scenarios for edge delays:
   1. Point-to-point delays inside a block, from the intra-cluster routing
      (the opin/ipin cases below, with delays taken from the pb_graph edges).
   2. Pin-to-pin delays inside a primitive, read from the architecture file
      (TN_PRIMITIVE_IPIN).
   3. Inter-cluster net delays (TN_CB_OPIN); these are initialized to 0 here
      and annotated after routing by load_timing_graph_net_delays.
*/
| count = 0; |
| iblock = timing_ctx.tnodes[i].block; |
| int itype = cluster_ctx.clb_nlist.block_type(iblock)->index; |
| switch (timing_ctx.tnodes[i].type) { |
| case TN_INPAD_OPIN: |
| case TN_INTERMEDIATE_NODE: |
| case TN_PRIMITIVE_OPIN: |
| case TN_FF_OPIN: |
| case TN_CLOCK_OPIN: |
| case TN_CB_IPIN: |
| /* fanout is determined by intra-cluster connections */ |
| /* Allocate space for edges */ |
| i_pin_id = timing_ctx.tnodes[i].pb_graph_pin->pin_count_in_cluster; |
| intra_lb_route = cluster_ctx.clb_nlist.block_pb(iblock)->pb_route; |
| ipb_graph_pin = intra_lb_pb_pin_lookup[itype][i_pin_id]; |
| |
| if (ipb_graph_pin->parent_node->pb_type->max_internal_delay |
| != UNDEFINED) { |
| if (device_ctx.pb_max_internal_delay == UNDEFINED) { |
| device_ctx.pb_max_internal_delay = ipb_graph_pin->parent_node->pb_type->max_internal_delay; |
| device_ctx.pbtype_max_internal_delay = ipb_graph_pin->parent_node->pb_type; |
| } else if (device_ctx.pb_max_internal_delay < ipb_graph_pin->parent_node->pb_type->max_internal_delay) { |
| device_ctx.pb_max_internal_delay = ipb_graph_pin->parent_node->pb_type->max_internal_delay; |
| device_ctx.pbtype_max_internal_delay = ipb_graph_pin->parent_node->pb_type; |
| } |
| } |
| |
| for (j = 0; j < ipb_graph_pin->num_output_edges; j++) { |
| VTR_ASSERT(ipb_graph_pin->output_edges[j]->num_output_pins == 1); |
| dnode = ipb_graph_pin->output_edges[j]->output_pins[0]->pin_count_in_cluster; |
| if (intra_lb_route[dnode].driver_pb_pin_id == i_pin_id) { |
| count++; |
| } |
| } |
| VTR_ASSERT(count >= 0); |
| timing_ctx.tnodes[i].num_edges = count; |
| timing_ctx.tnodes[i].out_edges = (t_tedge *) vtr::chunk_malloc( |
| count * sizeof(t_tedge), &tedge_ch); |
| |
| /* Load edges */ |
| count = 0; |
| for (j = 0; j < ipb_graph_pin->num_output_edges; j++) { |
| VTR_ASSERT(ipb_graph_pin->output_edges[j]->num_output_pins == 1); |
| dnode = ipb_graph_pin->output_edges[j]->output_pins[0]->pin_count_in_cluster; |
| if (intra_lb_route[dnode].driver_pb_pin_id == i_pin_id) { |
| VTR_ASSERT(intra_lb_route[dnode].atom_net_id == intra_lb_route[i_pin_id].atom_net_id); |
| |
| timing_ctx.tnodes[i].out_edges[count].Tdel = ipb_graph_pin->output_edges[j]->delay_max; |
| timing_ctx.tnodes[i].out_edges[count].to_node = lookup_tnode_from_pin_id[iblock][dnode]; |
| VTR_ASSERT(timing_ctx.tnodes[i].out_edges[count].to_node != OPEN); |
| |
| AtomNetId atom_net_id = intra_lb_route[i_pin_id].atom_net_id; |
| if (atom_ctx.nlist.net_is_constant(atom_net_id) && timing_ctx.tnodes[i].type == TN_PRIMITIVE_OPIN) { |
| timing_ctx.tnodes[i].out_edges[count].Tdel = HUGE_NEGATIVE_FLOAT; |
| timing_ctx.tnodes[i].type = TN_CONSTANT_GEN_SOURCE; |
| const_gen_tnodes.insert(i); |
| } |
| |
| count++; |
| } |
| } |
| VTR_ASSERT(count >= 0); |
| |
| break; |
| case TN_PRIMITIVE_IPIN: |
/* Pin info comes from the pb_graph block delays. */
/* There would be no slack information if timing analysis is off. */
| if (timing_inf.timing_analysis_enabled) |
| { |
| i_pin_id = timing_ctx.tnodes[i].pb_graph_pin->pin_count_in_cluster; |
| intra_lb_route = cluster_ctx.clb_nlist.block_pb(iblock)->pb_route; |
| ipb_graph_pin = intra_lb_pb_pin_lookup[itype][i_pin_id]; |
| timing_ctx.tnodes[i].num_edges = ipb_graph_pin->num_pin_timing; |
| timing_ctx.tnodes[i].out_edges = (t_tedge *) vtr::chunk_malloc( |
| ipb_graph_pin->num_pin_timing * sizeof(t_tedge), |
| &tedge_ch); |
| k = 0; |
| |
| for (j = 0; j < timing_ctx.tnodes[i].num_edges; j++) { |
/* Some output pins aren't used; only consider those that are. */
| int cluster_pin_idx = ipb_graph_pin->pin_timing[j]->pin_count_in_cluster; |
| if (intra_lb_route[cluster_pin_idx].atom_net_id) { |
| timing_ctx.tnodes[i].out_edges[k].Tdel = ipb_graph_pin->pin_timing_del_max[j]; |
| timing_ctx.tnodes[i].out_edges[k].to_node = lookup_tnode_from_pin_id[timing_ctx.tnodes[i].block][ipb_graph_pin->pin_timing[j]->pin_count_in_cluster]; |
| VTR_ASSERT(timing_ctx.tnodes[i].out_edges[k].to_node != OPEN); |
| k++; |
| } |
| } |
| timing_ctx.tnodes[i].num_edges -= (j - k); /* remove unused edges */ |
| if (timing_ctx.tnodes[i].num_edges == 0) { |
| /* Dangling pin */ |
| num_dangling_pins++; |
| } |
| } |
| break; |
| case TN_CB_OPIN: |
| /* load up net info */ |
| i_pin_id = timing_ctx.tnodes[i].pb_graph_pin->pin_count_in_cluster; |
| intra_lb_route = cluster_ctx.clb_nlist.block_pb(iblock)->pb_route; |
| ipb_graph_pin = intra_lb_pb_pin_lookup[itype][i_pin_id]; |
| |
| VTR_ASSERT(intra_lb_route[i_pin_id].atom_net_id); |
| net_id = atom_ctx.lookup.clb_net(intra_lb_route[i_pin_id].atom_net_id); |
| VTR_ASSERT(net_id != ClusterNetId::INVALID()); |
| f_net_to_driver_tnode[(size_t)net_id] = i; |
| timing_ctx.tnodes[i].num_edges = cluster_ctx.clb_nlist.net_sinks(net_id).size(); |
| timing_ctx.tnodes[i].out_edges = (t_tedge *) vtr::chunk_malloc( |
| timing_ctx.tnodes[i].num_edges * sizeof(t_tedge), |
| &tedge_ch); |
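/* For each sink pin on the net, translate the flat physical pin index on
 * the sink block into a (port, pin) pair on its pb_graph_node. Blocks with
 * capacity > 1 (e.g. I/O sites) share one pin numbering, so the index is
 * first normalized to a single sub-block instance. We search the input
 * ports first and fall back to the clock ports. */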
| for (j = 1; j < (int)cluster_ctx.clb_nlist.net_pins(net_id).size(); j++) { |
| dblock = cluster_ctx.clb_nlist.net_pin_block(net_id, j); |
| normalization = cluster_ctx.clb_nlist.block_type(dblock)->num_pins |
| / cluster_ctx.clb_nlist.block_type(dblock)->capacity; |
| normalized_pin = cluster_ctx.clb_nlist.net_pin_physical_index(net_id, j) % normalization; |
| d_intra_lb_route = cluster_ctx.clb_nlist.block_pb(dblock)->pb_route; |
| dpin = OPEN; |
| dport = OPEN; |
| count = 0; |
| |
| for (k = 0; k < cluster_ctx.clb_nlist.block_pb(dblock)->pb_graph_node->num_input_ports && dpin == OPEN; k++) { |
| if (normalized_pin >= count |
| && (count + cluster_ctx.clb_nlist.block_pb(dblock)->pb_graph_node->num_input_pins[k] > normalized_pin)) { |
| dpin = normalized_pin - count; |
| dport = k; |
| break; |
| } |
| count += cluster_ctx.clb_nlist.block_pb(dblock)->pb_graph_node->num_input_pins[k]; |
| } |
| if (dpin == OPEN) { |
| for (k = 0; k < cluster_ctx.clb_nlist.block_pb(dblock)->pb_graph_node->num_output_ports && dpin == OPEN; k++) { |
| count += cluster_ctx.clb_nlist.block_pb(dblock)->pb_graph_node->num_output_pins[k]; |
| } |
| for (k = 0; k < cluster_ctx.clb_nlist.block_pb(dblock)->pb_graph_node->num_clock_ports && dpin == OPEN; k++) { |
| if (normalized_pin >= count |
| && (count + cluster_ctx.clb_nlist.block_pb(dblock)->pb_graph_node->num_clock_pins[k] > normalized_pin)) { |
| dpin = normalized_pin - count; |
| dport = k; |
| } |
| count += cluster_ctx.clb_nlist.block_pb(dblock)->pb_graph_node->num_clock_pins[k]; |
| } |
| VTR_ASSERT(dpin != OPEN); |
| |
| t_pb_graph_node* pb_gnode = cluster_ctx.clb_nlist.block_pb(dblock)->pb_graph_node; |
| int pin_count_in_cluster = pb_gnode->clock_pins[dport][dpin].pin_count_in_cluster; |
| ClusterNetId net_id_check = atom_ctx.lookup.clb_net(d_intra_lb_route[pin_count_in_cluster].atom_net_id); |
| VTR_ASSERT(net_id == net_id_check); |
| |
| timing_ctx.tnodes[i].out_edges[j - 1].to_node = lookup_tnode_from_pin_id[dblock][pin_count_in_cluster]; |
| VTR_ASSERT(timing_ctx.tnodes[i].out_edges[j - 1].to_node != OPEN); |
| } else { |
| VTR_ASSERT(dpin != OPEN); |
| |
| t_pb_graph_node* pb_gnode = cluster_ctx.clb_nlist.block_pb(dblock)->pb_graph_node; |
| int pin_count_in_cluster = pb_gnode->input_pins[dport][dpin].pin_count_in_cluster; |
| ClusterNetId net_id_check = atom_ctx.lookup.clb_net(d_intra_lb_route[pin_count_in_cluster].atom_net_id); |
| VTR_ASSERT(net_id == net_id_check); |
| |
| /* delays are assigned post routing */ |
| timing_ctx.tnodes[i].out_edges[j - 1].to_node = lookup_tnode_from_pin_id[dblock][pin_count_in_cluster]; |
| VTR_ASSERT(timing_ctx.tnodes[i].out_edges[j - 1].to_node != OPEN); |
| } |
| timing_ctx.tnodes[i].out_edges[j - 1].Tdel = 0; |
| VTR_ASSERT(net_id != ClusterNetId::INVALID()); |
| } |
| break; |
| case TN_OUTPAD_IPIN: |
| case TN_INPAD_SOURCE: |
| case TN_OUTPAD_SINK: |
| case TN_FF_SINK: |
| case TN_FF_SOURCE: |
| case TN_FF_IPIN: |
| case TN_FF_CLOCK: |
| case TN_CLOCK_SOURCE: |
| break; |
| default: |
| vtr::printf_error(__FILE__, __LINE__, |
| "Consistency check failed: Unknown tnode type %d.\n", timing_ctx.tnodes[i].type); |
| VTR_ASSERT(0); |
| break; |
| } |
| } |
| if(num_dangling_pins > 0) { |
| vtr::printf_warning(__FILE__, __LINE__, |
| "Unconnected logic in design, number of dangling tnodes = %d\n", num_dangling_pins); |
| } |
| |
//In some instances, we may end up with constant generators driving constant generators
| //(which is illegal in the timing graph since constant generators shouldn't have any input |
| //edges). So we remove those edges now. |
| int const_gen_edge_break_count = 0; |
| for(inode = 0; inode < timing_ctx.num_tnodes; ++inode) { |
| for(int iedge = 0; iedge <timing_ctx.tnodes[inode].num_edges; ++iedge) { |
| int& to_node = timing_ctx.tnodes[inode].out_edges[iedge].to_node; |
| if(const_gen_tnodes.count(to_node)) { |
| //This edge points to a constant generator, mark it invalid |
| VTR_ASSERT(timing_ctx.tnodes[to_node].type == TN_CONSTANT_GEN_SOURCE); |
| |
| to_node = DO_NOT_ANALYSE; |
| |
| ++const_gen_edge_break_count; |
| } |
| } |
| } |
| vtr::printf_info("Disconnected %d redundant timing edges to constant generators\n", const_gen_edge_break_count); |
| |
| |
| for (int i = 0; i < device_ctx.num_block_types; i++) { |
| free_pb_graph_pin_lookup_from_index(intra_lb_pb_pin_lookup[i]); |
| } |
| free_tnode_lookup_from_pin_id(lookup_tnode_from_pin_id); |
| delete[] intra_lb_pb_pin_lookup; |
| } |
| |
/* Allocates the timing graph for the pre-packed netlist:
   first counts the number of tnodes, then connects the tnodes up with edges. */
| static void alloc_and_load_tnodes_from_prepacked_netlist(float inter_cluster_net_delay, |
| const std::unordered_map<AtomBlockId,t_pb_graph_node*>& expected_lowest_cost_pb_gnode) { |
| t_pb_graph_pin *from_pb_graph_pin, *to_pb_graph_pin; |
| |
| auto& atom_ctx = g_vpr_ctx.mutable_atom(); |
| auto& timing_ctx = g_vpr_ctx.mutable_timing(); |
| |
/* Determine the number of tnodes */
| timing_ctx.num_tnodes = 0; |
| for(AtomBlockId blk_id : atom_ctx.nlist.blocks()) { |
| |
| const t_model* model = atom_ctx.nlist.block_model(blk_id); |
| if (atom_ctx.nlist.block_type(blk_id) == AtomBlockType::INPAD) { |
| auto pins = atom_ctx.nlist.block_output_pins(blk_id); |
| if(pins.size() == 1) { |
| timing_ctx.num_tnodes += 2; //SOURCE and OPIN |
| } else { |
| VTR_ASSERT(pins.size() == 0); //Swept |
| } |
| } else if (atom_ctx.nlist.block_type(blk_id) == AtomBlockType::OUTPAD) { |
| auto pins = atom_ctx.nlist.block_input_pins(blk_id); |
| if(pins.size() == 1) { |
timing_ctx.num_tnodes += 2; //SINK and IPIN
| } else { |
| VTR_ASSERT(pins.size() == 0); //Swept |
| } |
| } else { |
| int incr; |
| if (atom_ctx.nlist.block_is_combinational(blk_id)) { |
| incr = 1; //Non-sequential so only an IPIN or OPIN node |
| } else { |
| VTR_ASSERT(atom_ctx.nlist.block_clock_pins(blk_id).size() > 0); |
incr = 2; //Sequential, so both an IPIN/OPIN and a SOURCE/SINK node
| } |
| int j = 0; |
| const t_model_ports* model_port = model->inputs; |
| while (model_port) { |
| AtomPortId port_id = atom_ctx.nlist.find_atom_port(blk_id, model_port); |
| if(port_id) { |
| if (model_port->is_clock == false) { |
| //Non-clock port, so add tnodes for each used input pin |
| for (int k = 0; k < model_port->size; k++) { |
| if(atom_ctx.nlist.port_pin(port_id, k)) { |
| timing_ctx.num_tnodes += incr; |
| } |
| } |
| j++; |
| } else { |
| //A clock port so only an FF_CLOCK node |
| VTR_ASSERT(model_port->is_clock); |
| timing_ctx.num_tnodes++; //TODO: consider multi-bit clock ports |
| } |
| } |
| model_port = model_port->next; |
| } |
| |
| j = 0; |
| model_port = model->outputs; |
| while (model_port) { |
| AtomPortId port_id = atom_ctx.nlist.find_atom_port(blk_id, model_port); |
| if (port_id) { |
| //Add tnodes for each output pin |
| for (int k = 0; k < model_port->size; k++) { |
| if(atom_ctx.nlist.port_pin(port_id, k)) { |
| timing_ctx.num_tnodes += incr; |
| } |
| } |
| j++; |
| } |
| model_port = model_port->next; |
| } |
| } |
| } |
| |
| /* Allocate the tnodes */ |
| timing_ctx.tnodes = (t_tnode *)vtr::calloc(timing_ctx.num_tnodes, sizeof(t_tnode)); |
| |
| /* Allocate space for prepacked_data, which is only used pre-packing. */ |
| //TODO: get rid of prepacked_data |
| for (int inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| timing_ctx.tnodes[inode].prepacked_data = (t_prepacked_tnode_data *) vtr::malloc(sizeof(t_prepacked_tnode_data)); |
| } |
| |
| /* load tnodes, alloc edges for tnodes, load all known tnodes */ |
| int inode = 0; |
| for(AtomBlockId blk_id : atom_ctx.nlist.blocks()) { |
| |
| const t_model* model = atom_ctx.nlist.block_model(blk_id); |
| if (atom_ctx.nlist.block_type(blk_id) == AtomBlockType::INPAD) { |
| //A primary input |
| |
| //Single output pin |
| VTR_ASSERT(atom_ctx.nlist.block_input_pins(blk_id).empty()); |
| VTR_ASSERT(atom_ctx.nlist.block_clock_pins(blk_id).empty()); |
| auto pins = atom_ctx.nlist.block_output_pins(blk_id); |
| if(pins.size() == 1) { |
| auto pin_id = *pins.begin(); |
| |
| //Look-ups |
| atom_ctx.lookup.set_atom_pin_classic_tnode(pin_id, inode); |
| |
| //The OPIN |
| timing_ctx.tnodes[inode].prepacked_data->model_pin = 0; |
| timing_ctx.tnodes[inode].prepacked_data->model_port = 0; |
| timing_ctx.tnodes[inode].prepacked_data->model_port_ptr = model->outputs; |
| timing_ctx.tnodes[inode].block = ClusterBlockId::INVALID(); |
| timing_ctx.tnodes[inode].type = TN_INPAD_OPIN; |
| |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); |
| timing_ctx.tnodes[inode].num_edges = atom_ctx.nlist.net_sinks(net_id).size(); |
| timing_ctx.tnodes[inode].out_edges = (t_tedge *) vtr::chunk_malloc( |
| timing_ctx.tnodes[inode].num_edges * sizeof(t_tedge), |
| &tedge_ch); |
| |
| //The source |
| timing_ctx.tnodes[inode + 1].type = TN_INPAD_SOURCE; |
| timing_ctx.tnodes[inode + 1].block = ClusterBlockId::INVALID(); |
| timing_ctx.tnodes[inode + 1].num_edges = 1; |
| timing_ctx.tnodes[inode + 1].out_edges = (t_tedge *) vtr::chunk_malloc( 1 * sizeof(t_tedge), &tedge_ch); |
| timing_ctx.tnodes[inode + 1].out_edges->Tdel = 0; |
| timing_ctx.tnodes[inode + 1].out_edges->to_node = inode; |
| |
| inode += 2; |
| } else { |
| VTR_ASSERT(pins.size() == 0); //Driven net was swept |
| } |
| } else if (atom_ctx.nlist.block_type(blk_id) == AtomBlockType::OUTPAD) { |
//A primary output
| |
| //Single input pin |
| VTR_ASSERT(atom_ctx.nlist.block_output_pins(blk_id).empty()); |
| VTR_ASSERT(atom_ctx.nlist.block_clock_pins(blk_id).empty()); |
| |
| //Single pin |
| auto pins = atom_ctx.nlist.block_input_pins(blk_id); |
| if(pins.size() == 1) { |
| auto pin_id = *pins.begin(); |
| |
| //Look-ups |
| atom_ctx.lookup.set_atom_pin_classic_tnode(pin_id, inode); |
| |
| //The IPIN |
| timing_ctx.tnodes[inode].prepacked_data->model_pin = 0; |
| timing_ctx.tnodes[inode].prepacked_data->model_port = 0; |
| timing_ctx.tnodes[inode].prepacked_data->model_port_ptr = model->inputs; |
| timing_ctx.tnodes[inode].block = ClusterBlockId::INVALID(); |
| timing_ctx.tnodes[inode].type = TN_OUTPAD_IPIN; |
| timing_ctx.tnodes[inode].num_edges = 1; |
| timing_ctx.tnodes[inode].out_edges = (t_tedge *) vtr::chunk_malloc(1 * sizeof(t_tedge), &tedge_ch); |
| timing_ctx.tnodes[inode].out_edges->Tdel = 0; |
| timing_ctx.tnodes[inode].out_edges->to_node = inode + 1; |
| |
| //The sink |
| timing_ctx.tnodes[inode + 1].type = TN_OUTPAD_SINK; |
| timing_ctx.tnodes[inode + 1].block = ClusterBlockId::INVALID(); |
| timing_ctx.tnodes[inode + 1].num_edges = 0; |
| timing_ctx.tnodes[inode + 1].out_edges = nullptr; |
| |
| inode += 2; |
| } else { |
| VTR_ASSERT(pins.size() == 0); //Input net was swept |
| } |
| } else { |
| //A regular block |
| |
| //Process the block's outputs |
| int j = 0; |
| t_model_ports* model_port = model->outputs; |
| while (model_port) { |
| |
| AtomPortId port_id = atom_ctx.nlist.find_atom_port(blk_id, model_port); |
| |
| if(port_id) { |
| |
| if (model_port->is_clock == false) { |
| //A non-clock output |
| for (int k = 0; k < model_port->size; k++) { |
| auto pin_id = atom_ctx.nlist.port_pin(port_id, k); |
| if (pin_id) { |
| //Look-ups |
| atom_ctx.lookup.set_atom_pin_classic_tnode(pin_id, inode); |
| |
| //The pin's associated net |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); |
| |
| //The first tnode |
| timing_ctx.tnodes[inode].prepacked_data->model_pin = k; |
| timing_ctx.tnodes[inode].prepacked_data->model_port = j; |
| timing_ctx.tnodes[inode].prepacked_data->model_port_ptr = model_port; |
| timing_ctx.tnodes[inode].block = ClusterBlockId::INVALID(); |
| timing_ctx.tnodes[inode].num_edges = atom_ctx.nlist.net_sinks(net_id).size(); |
| timing_ctx.tnodes[inode].out_edges = (t_tedge *) vtr::chunk_malloc( timing_ctx.tnodes[inode].num_edges * sizeof(t_tedge), &tedge_ch); |
| |
| if (atom_ctx.nlist.block_is_combinational(blk_id)) { |
//Non-sequential block, so only a single OPIN tnode
| timing_ctx.tnodes[inode].type = TN_PRIMITIVE_OPIN; |
| |
| inode++; |
| } else { |
| VTR_ASSERT(atom_ctx.nlist.block_clock_pins(blk_id).size() > 0); |
//A sequential block, so the first tnode is an FF_OPIN
| timing_ctx.tnodes[inode].type = TN_FF_OPIN; |
| |
| //The second tnode is the FF_SOURCE |
| timing_ctx.tnodes[inode + 1].type = TN_FF_SOURCE; |
| timing_ctx.tnodes[inode + 1].block = ClusterBlockId::INVALID(); |
| |
| //Initialize the edge between SOURCE and OPIN with the clk-to-q delay |
| auto iter = expected_lowest_cost_pb_gnode.find(blk_id); |
| VTR_ASSERT(iter != expected_lowest_cost_pb_gnode.end()); |
| |
| from_pb_graph_pin = get_pb_graph_node_pin_from_model_port_pin(model_port, k, |
| iter->second); |
| timing_ctx.tnodes[inode + 1].num_edges = 1; |
| timing_ctx.tnodes[inode + 1].out_edges = (t_tedge *) vtr::chunk_malloc( 1 * sizeof(t_tedge), &tedge_ch); |
| timing_ctx.tnodes[inode + 1].out_edges->to_node = inode; |
| timing_ctx.tnodes[inode + 1].out_edges->Tdel = from_pb_graph_pin->tco_max; //Set the clk-to-Q delay |
| |
| inode += 2; |
| } |
| } |
| } |
| } else { |
| //An output clock port, meaning a clock source (e.g. PLL output) |
| |
| //For every non-empty pin on the clock port create a clock pin and clock source tnode |
| for (int k = 0; k < model_port->size; k++) { |
| auto pin_id = atom_ctx.nlist.port_pin(port_id, k); |
| if (pin_id) { |
| //Look-ups |
| atom_ctx.lookup.set_atom_pin_classic_tnode(pin_id, inode); |
| |
| //Create the OPIN |
| timing_ctx.tnodes[inode].type = TN_CLOCK_OPIN; |
| |
| timing_ctx.tnodes[inode].prepacked_data->model_pin = k; |
| timing_ctx.tnodes[inode].prepacked_data->model_port = j; |
| timing_ctx.tnodes[inode].prepacked_data->model_port_ptr = model_port; |
| timing_ctx.tnodes[inode].block = ClusterBlockId::INVALID(); |
| |
| //Allocate space for the output edges |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); |
| timing_ctx.tnodes[inode].num_edges = atom_ctx.nlist.net_sinks(net_id).size(); |
| timing_ctx.tnodes[inode].out_edges = (t_tedge *) vtr::chunk_malloc( timing_ctx.tnodes[inode].num_edges * sizeof(t_tedge), &tedge_ch); |
| |
| |
| //Create the clock SOURCE |
| auto iter = expected_lowest_cost_pb_gnode.find(blk_id); |
| VTR_ASSERT(iter != expected_lowest_cost_pb_gnode.end()); |
| |
| timing_ctx.tnodes[inode + 1].type = TN_CLOCK_SOURCE; |
| timing_ctx.tnodes[inode + 1].block = ClusterBlockId::INVALID(); |
| timing_ctx.tnodes[inode + 1].num_edges = 1; |
| timing_ctx.tnodes[inode + 1].prepacked_data->model_pin = k; |
| timing_ctx.tnodes[inode + 1].prepacked_data->model_port = j; |
| timing_ctx.tnodes[inode + 1].prepacked_data->model_port_ptr = model_port; |
| |
| //Initialize the edge between them |
| timing_ctx.tnodes[inode + 1].out_edges = (t_tedge *) vtr::chunk_malloc( 1 * sizeof(t_tedge), &tedge_ch); |
| timing_ctx.tnodes[inode + 1].out_edges->to_node = inode; |
timing_ctx.tnodes[inode + 1].out_edges->Tdel = 0.; //PLL output delay? Not clear what this really means... perhaps clock insertion delay from PLL?
| |
| inode += 2; |
| } |
| } |
| } |
| } |
| j++; |
| model_port = model_port->next; |
| } |
| |
| //Process the block's inputs |
| j = 0; |
| model_port = model->inputs; |
| while (model_port) { |
| |
| AtomPortId port_id = atom_ctx.nlist.find_atom_port(blk_id, model_port); |
| if(port_id) { |
| |
| if (model_port->is_clock == false) { |
| //A non-clock (i.e. data) input |
| |
| //For every non-empty pin create the associated tnode(s) |
| for (int k = 0; k < model_port->size; k++) { |
| auto pin_id = atom_ctx.nlist.port_pin(port_id, k); |
| if (pin_id) { |
| |
| //Look-ups |
| atom_ctx.lookup.set_atom_pin_classic_tnode(pin_id, inode); |
| |
| //Initialize the common part of the first tnode |
| timing_ctx.tnodes[inode].prepacked_data->model_pin = k; |
| timing_ctx.tnodes[inode].prepacked_data->model_port = j; |
| timing_ctx.tnodes[inode].prepacked_data->model_port_ptr = model_port; |
| timing_ctx.tnodes[inode].block = ClusterBlockId::INVALID(); |
| |
| auto iter = expected_lowest_cost_pb_gnode.find(blk_id); |
| VTR_ASSERT(iter != expected_lowest_cost_pb_gnode.end()); |
| |
| from_pb_graph_pin = get_pb_graph_node_pin_from_model_port_pin(model_port, k, |
| iter->second); |
| |
| if (atom_ctx.nlist.block_is_combinational(blk_id)) { |
| //A non-sequential/combinational block |
| |
| timing_ctx.tnodes[inode].type = TN_PRIMITIVE_IPIN; |
| |
| //Allocate space for all possible edges |
| timing_ctx.tnodes[inode].out_edges = (t_tedge *) vtr::chunk_malloc( from_pb_graph_pin->num_pin_timing * sizeof(t_tedge), &tedge_ch); |
| |
| //Initialize the delay and sink tnodes (i.e. PRIMITIVE_OPINs) |
| int count = 0; |
| for(int m = 0; m < from_pb_graph_pin->num_pin_timing; m++) { |
| to_pb_graph_pin = from_pb_graph_pin->pin_timing[m]; |
| |
| //Find the tnode associated with the sink port & pin |
| auto sink_blk_id = blk_id; //Within a single atom, so the source and sink blocks are the same |
| auto sink_port_id = atom_ctx.nlist.find_atom_port(sink_blk_id, to_pb_graph_pin->port->model_port); |
| if(sink_port_id) { |
| auto sink_pin_id = atom_ctx.nlist.port_pin(sink_port_id, to_pb_graph_pin->pin_number); |
| if(sink_pin_id) { |
| //Pin is in use |
| |
| int to_node = atom_ctx.lookup.atom_pin_classic_tnode(sink_pin_id); |
| VTR_ASSERT(to_node != OPEN); |
| |
| //Mark the delay |
| timing_ctx.tnodes[inode].out_edges[count].Tdel = from_pb_graph_pin->pin_timing_del_max[m]; |
| |
| VTR_ASSERT(timing_ctx.tnodes[to_node].type == TN_PRIMITIVE_OPIN); |
| |
| timing_ctx.tnodes[inode].out_edges[count].to_node = to_node; |
| |
| count++; |
| } |
| } |
| } |
| //Mark the number of used edges (note that this may be less than the number allocated, since |
| //some allocated edges may correspond to pins which were not connected) |
| timing_ctx.tnodes[inode].num_edges = count; |
| inode++; |
| } else { |
| VTR_ASSERT(atom_ctx.nlist.block_clock_pins(blk_id).size() > 0); |
| |
| //A sequential input |
| timing_ctx.tnodes[inode].type = TN_FF_IPIN; |
| |
| //Allocate the edge to the sink, and mark the setup-time on it |
| timing_ctx.tnodes[inode].num_edges = 1; |
| timing_ctx.tnodes[inode].out_edges = (t_tedge *) vtr::chunk_malloc( 1 * sizeof(t_tedge), &tedge_ch); |
| timing_ctx.tnodes[inode].out_edges->to_node = inode + 1; |
| timing_ctx.tnodes[inode].out_edges->Tdel = from_pb_graph_pin->tsu; |
| |
| //Initialize the FF_SINK node |
| timing_ctx.tnodes[inode + 1].type = TN_FF_SINK; |
| timing_ctx.tnodes[inode + 1].num_edges = 0; |
| timing_ctx.tnodes[inode + 1].out_edges = nullptr; |
| timing_ctx.tnodes[inode + 1].block = ClusterBlockId::INVALID(); |
| |
| inode += 2; |
| } |
| } |
| } |
| j++; |
| } else { |
| //A clock input |
| //TODO: consider more than one clock per primitive |
| auto pins = atom_ctx.nlist.port_pins(port_id); |
| VTR_ASSERT(pins.size() == 1); |
| auto pin_id = *pins.begin(); |
| |
| if (pin_id) { |
| //Look-up |
| atom_ctx.lookup.set_atom_pin_classic_tnode(pin_id, inode); |
| |
| //Initialize the clock tnode |
| timing_ctx.tnodes[inode].type = TN_FF_CLOCK; |
| timing_ctx.tnodes[inode].block = ClusterBlockId::INVALID(); |
| timing_ctx.tnodes[inode].prepacked_data->model_pin = 0; |
| timing_ctx.tnodes[inode].prepacked_data->model_port = 0; |
| timing_ctx.tnodes[inode].prepacked_data->model_port_ptr = model_port; |
| timing_ctx.tnodes[inode].num_edges = 0; |
| timing_ctx.tnodes[inode].out_edges = nullptr; |
| |
| inode++; |
| } |
| } |
| } |
| model_port = model_port->next; |
| } |
| } |
| } |
| VTR_ASSERT(inode == timing_ctx.num_tnodes); |
| |
| /* Load net delays and initialize clock domains to OPEN. */ |
| std::set<int> const_gen_tnodes; |
| for (int i = 0; i < timing_ctx.num_tnodes; i++) { |
| timing_ctx.tnodes[i].clock_domain = OPEN; |
| |
| //Since the setup/clock-to-q/combinational delays have already been set above, |
| //we only set the net related delays here |
| switch (timing_ctx.tnodes[i].type) { |
| case TN_INPAD_OPIN: //fallthrough |
| case TN_PRIMITIVE_OPIN: //fallthrough |
| case TN_FF_OPIN: //fallthrough |
| case TN_CLOCK_OPIN: { |
| /* An output pin tnode */ |
| |
| //Find the net driven by the OPIN |
| auto pin_id = atom_ctx.lookup.classic_tnode_atom_pin(i); |
| auto net_id = atom_ctx.nlist.pin_net(pin_id); |
| VTR_ASSERT(net_id); |
| VTR_ASSERT_MSG(pin_id == atom_ctx.nlist.net_driver(net_id), "OPIN must be net driver"); |
| |
| bool is_const_net = atom_ctx.nlist.pin_is_constant(pin_id); |
| //TODO: const generators should really be treated by sources launching at t=-inf |
| if(is_const_net) { |
| VTR_ASSERT(timing_ctx.tnodes[i].type == TN_PRIMITIVE_OPIN); |
| timing_ctx.tnodes[i].type = TN_CONSTANT_GEN_SOURCE; |
| const_gen_tnodes.insert(i); |
| } |
| |
| //Look at each sink |
| int j = 0; |
| for(auto sink_pin_id : atom_ctx.nlist.net_sinks(net_id)) { |
| if (is_const_net) { |
| //The net is driven by a constant generator (its tnode |
| //type was overridden above); set the edge delay to a |
| //large negative value so the signal is treated as having |
| //already 'arrived' |
| timing_ctx.tnodes[i].out_edges[j].Tdel = HUGE_NEGATIVE_FLOAT; |
| } else { |
| //This is a real net, so use the default delay |
| timing_ctx.tnodes[i].out_edges[j].Tdel = inter_cluster_net_delay; |
| } |
| |
| //Find the sink tnode |
| int to_node = atom_ctx.lookup.atom_pin_classic_tnode(sink_pin_id); |
| VTR_ASSERT(to_node != OPEN); |
| |
| //Connect the edge to the sink |
| timing_ctx.tnodes[i].out_edges[j].to_node = to_node; |
| ++j; |
| } |
| VTR_ASSERT(timing_ctx.tnodes[i].num_edges == static_cast<int>(atom_ctx.nlist.net_sinks(net_id).size())); |
| break; |
| } |
| case TN_PRIMITIVE_IPIN: //fallthrough |
| case TN_OUTPAD_IPIN: //fallthrough |
| case TN_INPAD_SOURCE: //fallthrough |
| case TN_OUTPAD_SINK: //fallthrough |
| case TN_FF_SINK: //fallthrough |
| case TN_FF_SOURCE: //fallthrough |
| case TN_FF_IPIN: //fallthrough |
| case TN_FF_CLOCK: //fallthrough |
| case TN_CLOCK_SOURCE: //fallthrough |
| /* All other tnodes have had their edge-delays initialized */ |
| break; |
| default: |
| VPR_THROW(VPR_ERROR_TIMING, "Unknown tnode type %d.\n", timing_ctx.tnodes[i].type); |
| break; |
| } |
| } |
| |
| //In some instances, we may end-up with constant generators driving constant generators |
| //(which is illegal in the timing graph since constant generators shouldn't have any input |
| //edges). So we remove those edges now. |
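| //Illustrative example (hypothetical netlist): if a LUT hard-wired to logic-1 drives |
| //another LUT which also reduces to a constant, both drivers become TN_CONSTANT_GEN_SOURCE |
| //tnodes above, and the loop below breaks the edge between them by setting its to_node |
| //to DO_NOT_ANALYSE. |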
| int const_gen_edge_break_count = 0; |
| for(inode = 0; inode < timing_ctx.num_tnodes; ++inode) { |
| for(int iedge = 0; iedge <timing_ctx.tnodes[inode].num_edges; ++iedge) { |
| int& to_node = timing_ctx.tnodes[inode].out_edges[iedge].to_node; |
| if(const_gen_tnodes.count(to_node)) { |
| //This edge points to a constant generator, mark it invalid |
| VTR_ASSERT(timing_ctx.tnodes[to_node].type == TN_CONSTANT_GEN_SOURCE); |
| |
| to_node = DO_NOT_ANALYSE; |
| |
| ++const_gen_edge_break_count; |
| } |
| } |
| } |
| vtr::printf_info("Disconnected %d redundant timing edges to constant generators\n", const_gen_edge_break_count); |
| |
| |
| //Build the net to driver look-up |
| // TODO: convert to use net_id directly when timing graph re-unified |
| f_net_to_driver_tnode = (int*)vtr::malloc(num_timing_nets() * sizeof(int)); |
| auto nets = atom_ctx.nlist.nets(); |
| for (size_t i = 0; i < num_timing_nets(); i++) { |
| auto net_id = *(nets.begin() + i); //XXX: Ugly hack relying on sequentially increasing net id's |
| VTR_ASSERT(net_id); |
| |
| auto driver_pin_id = atom_ctx.nlist.net_driver(net_id); |
| VTR_ASSERT(driver_pin_id); |
| |
| f_net_to_driver_tnode[i] = atom_ctx.lookup.atom_pin_classic_tnode(driver_pin_id); |
| } |
| |
| //Sanity check, every net should have a valid driver tnode |
| for (size_t i = 0; i < num_timing_nets(); i++) { |
| VTR_ASSERT(f_net_to_driver_tnode[i] != OPEN); |
| } |
| } |
| |
| static void load_tnode(t_pb_graph_pin *pb_graph_pin, const ClusterBlockId iblock, |
| int *inode) { |
| auto& atom_ctx = g_vpr_ctx.mutable_atom(); |
| auto& timing_ctx = g_vpr_ctx.mutable_timing(); |
| |
| int i = *inode; |
| timing_ctx.tnodes[i].pb_graph_pin = pb_graph_pin; |
| timing_ctx.tnodes[i].block = iblock; |
| |
| |
| if (timing_ctx.tnodes[i].pb_graph_pin->parent_node->pb_type->blif_model == nullptr) { |
| VTR_ASSERT(timing_ctx.tnodes[i].pb_graph_pin->type == PB_PIN_NORMAL); |
| if (timing_ctx.tnodes[i].pb_graph_pin->parent_node->parent_pb_graph_node == nullptr) { |
| if (timing_ctx.tnodes[i].pb_graph_pin->port->type == IN_PORT) { |
| timing_ctx.tnodes[i].type = TN_CB_IPIN; |
| } else { |
| VTR_ASSERT(timing_ctx.tnodes[i].pb_graph_pin->port->type == OUT_PORT); |
| timing_ctx.tnodes[i].type = TN_CB_OPIN; |
| } |
| } else { |
| timing_ctx.tnodes[i].type = TN_INTERMEDIATE_NODE; |
| } |
| } else { |
| AtomPinId atom_pin = find_atom_pin(iblock, pb_graph_pin); |
| |
| if (timing_ctx.tnodes[i].pb_graph_pin->type == PB_PIN_INPAD) { |
| VTR_ASSERT(timing_ctx.tnodes[i].pb_graph_pin->port->type == OUT_PORT); |
| timing_ctx.tnodes[i].type = TN_INPAD_OPIN; |
| timing_ctx.tnodes[i + 1].num_edges = 1; |
| timing_ctx.tnodes[i + 1].out_edges = (t_tedge *) vtr::chunk_malloc( |
| 1 * sizeof(t_tedge), &tedge_ch); |
| timing_ctx.tnodes[i + 1].out_edges->Tdel = 0; |
| timing_ctx.tnodes[i + 1].out_edges->to_node = i; |
| timing_ctx.tnodes[i + 1].pb_graph_pin = pb_graph_pin; /* Necessary for propagate_clock_domain_and_skew(). */ |
| timing_ctx.tnodes[i + 1].type = TN_INPAD_SOURCE; |
| timing_ctx.tnodes[i + 1].block = iblock; |
| |
| |
| atom_ctx.lookup.set_atom_pin_classic_tnode(atom_pin, i + 1); |
| |
| (*inode)++; |
| } else if (timing_ctx.tnodes[i].pb_graph_pin->type == PB_PIN_OUTPAD) { |
| VTR_ASSERT(timing_ctx.tnodes[i].pb_graph_pin->port->type == IN_PORT); |
| timing_ctx.tnodes[i].type = TN_OUTPAD_IPIN; |
| timing_ctx.tnodes[i].num_edges = 1; |
| timing_ctx.tnodes[i].out_edges = (t_tedge *) vtr::chunk_malloc( |
| 1 * sizeof(t_tedge), &tedge_ch); |
| timing_ctx.tnodes[i].out_edges->Tdel = 0; |
| timing_ctx.tnodes[i].out_edges->to_node = i + 1; |
| timing_ctx.tnodes[i + 1].pb_graph_pin = pb_graph_pin; /* Necessary for find_tnode_net_name(). */ |
| timing_ctx.tnodes[i + 1].type = TN_OUTPAD_SINK; |
| timing_ctx.tnodes[i + 1].block = iblock; |
| timing_ctx.tnodes[i + 1].num_edges = 0; |
| timing_ctx.tnodes[i + 1].out_edges = nullptr; |
| |
| atom_ctx.lookup.set_atom_pin_classic_tnode(atom_pin, i + 1); |
| (*inode)++; |
| } else if (timing_ctx.tnodes[i].pb_graph_pin->type == PB_PIN_SEQUENTIAL) { |
| if (timing_ctx.tnodes[i].pb_graph_pin->port->type == IN_PORT) { |
| timing_ctx.tnodes[i].type = TN_FF_IPIN; |
| timing_ctx.tnodes[i].num_edges = 1; |
| timing_ctx.tnodes[i].out_edges = (t_tedge *) vtr::chunk_malloc( |
| 1 * sizeof(t_tedge), &tedge_ch); |
| timing_ctx.tnodes[i].out_edges->Tdel = pb_graph_pin->tsu; |
| timing_ctx.tnodes[i].out_edges->to_node = i + 1; |
| timing_ctx.tnodes[i + 1].pb_graph_pin = pb_graph_pin; |
| timing_ctx.tnodes[i + 1].type = TN_FF_SINK; |
| timing_ctx.tnodes[i + 1].block = iblock; |
| timing_ctx.tnodes[i + 1].num_edges = 0; |
| timing_ctx.tnodes[i + 1].out_edges = nullptr; |
| |
| atom_ctx.lookup.set_atom_pin_classic_tnode(atom_pin, i + 1); |
| } else { |
| VTR_ASSERT(timing_ctx.tnodes[i].pb_graph_pin->port->type == OUT_PORT); |
| //Determine whether we are a standard clocked output pin (TN_FF_OPIN with TN_FF_SOURCE) |
| //or a clock source (TN_CLOCK_OPIN with TN_CLOCK_SOURCE) |
| t_model_ports* model_port = timing_ctx.tnodes[i].pb_graph_pin->port->model_port; |
| if(!model_port->is_clock) { |
| //Standard sequential output |
| timing_ctx.tnodes[i].type = TN_FF_OPIN; |
| timing_ctx.tnodes[i + 1].num_edges = 1; |
| timing_ctx.tnodes[i + 1].out_edges = (t_tedge *) vtr::chunk_malloc( |
| 1 * sizeof(t_tedge), &tedge_ch); |
| timing_ctx.tnodes[i + 1].out_edges->Tdel = pb_graph_pin->tco_max; |
| timing_ctx.tnodes[i + 1].out_edges->to_node = i; |
| timing_ctx.tnodes[i + 1].pb_graph_pin = pb_graph_pin; |
| timing_ctx.tnodes[i + 1].type = TN_FF_SOURCE; |
| timing_ctx.tnodes[i + 1].block = iblock; |
| |
| atom_ctx.lookup.set_atom_pin_classic_tnode(atom_pin, i + 1); |
| } else { |
| //Clock source |
| timing_ctx.tnodes[i].type = TN_CLOCK_OPIN; |
| timing_ctx.tnodes[i + 1].num_edges = 1; |
| timing_ctx.tnodes[i + 1].out_edges = (t_tedge *) vtr::chunk_malloc( |
| 1 * sizeof(t_tedge), &tedge_ch); |
| timing_ctx.tnodes[i + 1].out_edges->Tdel = 0.; //Not clear what this delay physically represents |
| timing_ctx.tnodes[i + 1].out_edges->to_node = i; |
| timing_ctx.tnodes[i + 1].pb_graph_pin = pb_graph_pin; |
| timing_ctx.tnodes[i + 1].type = TN_CLOCK_SOURCE; |
| timing_ctx.tnodes[i + 1].block = iblock; |
| |
| atom_ctx.lookup.set_atom_pin_classic_tnode(atom_pin, i + 1); |
| } |
| } |
| (*inode)++; |
| } else if (timing_ctx.tnodes[i].pb_graph_pin->type == PB_PIN_CLOCK) { |
| timing_ctx.tnodes[i].type = TN_FF_CLOCK; |
| timing_ctx.tnodes[i].num_edges = 0; |
| timing_ctx.tnodes[i].out_edges = nullptr; |
| |
| atom_ctx.lookup.set_atom_pin_classic_tnode(atom_pin, i); |
| } else { |
| if (timing_ctx.tnodes[i].pb_graph_pin->port->type == IN_PORT) { |
| VTR_ASSERT(timing_ctx.tnodes[i].pb_graph_pin->type == PB_PIN_TERMINAL); |
| timing_ctx.tnodes[i].type = TN_PRIMITIVE_IPIN; |
| |
| atom_ctx.lookup.set_atom_pin_classic_tnode(atom_pin, i); |
| } else { |
| VTR_ASSERT(timing_ctx.tnodes[i].pb_graph_pin->port->type == OUT_PORT); |
| VTR_ASSERT(timing_ctx.tnodes[i].pb_graph_pin->type == PB_PIN_TERMINAL); |
| timing_ctx.tnodes[i].type = TN_PRIMITIVE_OPIN; |
| |
| atom_ctx.lookup.set_atom_pin_classic_tnode(atom_pin, i); |
| } |
| } |
| } |
| (*inode)++; |
| } |
| |
| void print_timing_graph(const char *fname) { |
| |
| /* Prints the timing graph into a file. */ |
| |
| FILE *fp; |
| int inode, iedge, ilevel, i; |
| t_tedge *tedge; |
| e_tnode_type itype; |
| const char *tnode_type_names[] = { "TN_INPAD_SOURCE", "TN_INPAD_OPIN", "TN_OUTPAD_IPIN", |
| "TN_OUTPAD_SINK", "TN_CB_IPIN", "TN_CB_OPIN", "TN_INTERMEDIATE_NODE", |
| "TN_PRIMITIVE_IPIN", "TN_PRIMITIVE_OPIN", "TN_FF_IPIN", "TN_FF_OPIN", "TN_FF_SINK", |
| "TN_FF_SOURCE", "TN_FF_CLOCK", "TN_CLOCK_SOURCE", "TN_CLOCK_OPIN", "TN_CONSTANT_GEN_SOURCE" }; |
| |
| auto& cluster_ctx = g_vpr_ctx.clustering(); |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| fp = vtr::fopen(fname, "w"); |
| |
| fprintf(fp, "timing_ctx.num_tnodes: %d\n", timing_ctx.num_tnodes); |
| fprintf(fp, "Node #\tType\t\tipin\tiblk\tDomain\tSkew\tI/O Delay\t# edges\t" |
| "to_node Tdel\n\n"); |
| |
| for (inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| fprintf(fp, "%d\t", inode); |
| |
| itype = timing_ctx.tnodes[inode].type; |
| fprintf(fp, "%-15.15s\t", tnode_type_names[itype]); |
| |
| if (timing_ctx.tnodes[inode].pb_graph_pin != nullptr) { |
| fprintf(fp, "%d\t%zu\t", |
| timing_ctx.tnodes[inode].pb_graph_pin->pin_count_in_cluster, |
| size_t(timing_ctx.tnodes[inode].block)); |
| } else { |
| fprintf(fp, "\t%zu\t", size_t(timing_ctx.tnodes[inode].block)); |
| } |
| |
| if (itype == TN_FF_CLOCK || itype == TN_FF_SOURCE || itype == TN_FF_SINK) { |
| fprintf(fp, "%d\t%.3e\t\t", timing_ctx.tnodes[inode].clock_domain, timing_ctx.tnodes[inode].clock_delay); |
| } else if (itype == TN_INPAD_SOURCE || itype == TN_CLOCK_SOURCE) { |
| fprintf(fp, "%d\t\t%.3e\t", timing_ctx.tnodes[inode].clock_domain, timing_ctx.tnodes[inode].out_edges ? timing_ctx.tnodes[inode].out_edges[0].Tdel : -1); |
| } else if (itype == TN_OUTPAD_SINK) { |
| VTR_ASSERT(timing_ctx.tnodes[inode-1].type == TN_OUTPAD_IPIN); /* Outpad ipins should be one prior in the tnode array */ |
| fprintf(fp, "%d\t\t%.3e\t", timing_ctx.tnodes[inode].clock_domain, timing_ctx.tnodes[inode-1].out_edges[0].Tdel); |
| } else { |
| fprintf(fp, "\t\t\t\t"); |
| } |
| |
| fprintf(fp, "%d", timing_ctx.tnodes[inode].num_edges); |
| |
| /* Print all edges after edge 0 on separate lines */ |
| tedge = timing_ctx.tnodes[inode].out_edges; |
| if (timing_ctx.tnodes[inode].num_edges > 0) { |
| fprintf(fp, "\t%4d\t%7.3g", tedge[0].to_node, tedge[0].Tdel); |
| for (iedge = 1; iedge < timing_ctx.tnodes[inode].num_edges; iedge++) { |
| fprintf(fp, "\n\t\t\t\t\t\t\t\t\t\t%4d\t%7.3g", tedge[iedge].to_node, tedge[iedge].Tdel); |
| } |
| } |
| fprintf(fp, "\n"); |
| } |
| |
| fprintf(fp, "\n\ntiming_ctx.num_tnode_levels: %d\n", timing_ctx.num_tnode_levels); |
| |
| for (ilevel = 0; ilevel < timing_ctx.num_tnode_levels; ilevel++) { |
| fprintf(fp, "\n\nLevel: %d Num_nodes: %zu\nNodes:", ilevel, |
| timing_ctx.tnodes_at_level[ilevel].size()); |
| for (unsigned j = 0; j < timing_ctx.tnodes_at_level[ilevel].size(); j++) |
| fprintf(fp, "\t%d", timing_ctx.tnodes_at_level[ilevel][j]); |
| } |
| |
| fprintf(fp, "\n"); |
| fprintf(fp, "\n\nNet #\tNet_to_driver_tnode\n"); |
| |
| for (i = 0; i < (int) cluster_ctx.clb_nlist.nets().size(); i++) |
| fprintf(fp, "%4d\t%6d\n", i, f_net_to_driver_tnode[i]); |
| |
| if (timing_ctx.sdc && timing_ctx.sdc->num_constrained_clocks == 1) { |
| |
| /* Arrival and required times, and forward and backward weights, will be meaningless for multiclock |
| designs, since the values currently on the graph will only correspond to the most recent traversal. */ |
| fprintf(fp, "\n\nNode #\t\tT_arr\t\tT_req" |
| #ifdef PATH_COUNTING |
| "\tForward weight\tBackward weight" |
| #endif |
| "\n\n"); |
| |
| for (inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| if (timing_ctx.tnodes[inode].T_arr > HUGE_NEGATIVE_FLOAT + 1) { |
| fprintf(fp, "%d\t%12g", inode, timing_ctx.tnodes[inode].T_arr); |
| } else { |
| fprintf(fp, "%d\t\t -", inode); |
| } |
| if (timing_ctx.tnodes[inode].T_req < HUGE_POSITIVE_FLOAT - 1) { |
| fprintf(fp, "\t%12g", timing_ctx.tnodes[inode].T_req); |
| } else { |
| fprintf(fp, "\t\t -"); |
| } |
| #ifdef PATH_COUNTING |
| fprintf(fp, "\t%12g\t%12g", timing_ctx.tnodes[inode].forward_weight, timing_ctx.tnodes[inode].backward_weight); |
| #endif |
| fprintf(fp, "\n"); |
| } |
| } |
| |
| fclose(fp); |
| } |
| |
| static void process_constraints() { |
| /* Removes all constraints between domains which never intersect. We need to do this |
| so that criticality_denom in do_timing_analysis is not affected by unused constraints. |
| BFS through the levelized graph once for each source domain. Whenever we get to a sink, |
| mark off that we've used that sink clock domain. After each traversal, set all unused |
| constraints to DO_NOT_ANALYSE. |
| |
| Also, print timing_ctx.sdc->domain_constraint, constrained I/Os and override constraints, |
| and convert timing_ctx.sdc->domain_constraint and flip-flop-level override constraints |
| to be in seconds rather than nanoseconds. We don't need to normalize timing_ctx.sdc->cc_constraints |
| because they're already on the timing_ctx.sdc->domain_constraint matrix, and we don't need |
| to normalize constrained_ios because we already did the normalization when |
| we put the delays onto the timing graph in load_clock_domain_and_clock_and_io_delay. */ |
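| |
| /* Illustrative example (hypothetical values): with two constrained clocks clk0 and clk1 |
| and no paths crossing between them, a domain_constraint matrix of |
| { {10.0, 10.0}, |
| {10.0, 10.0} } (ns) |
| would be pruned by the traversals below to |
| { {10.0, DO_NOT_ANALYSE}, |
| {DO_NOT_ANALYSE, 10.0} } |
| so only the intra-domain constraints are analysed. */ |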
| |
| int source_clock_domain, sink_clock_domain, inode, ilevel, num_at_level, i, |
| num_edges, iedge, to_node, icf, ifc, iff; |
| t_tedge * tedge; |
| float constraint; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| bool * constraint_used = (bool *) vtr::malloc(timing_ctx.sdc->num_constrained_clocks * sizeof(bool)); |
| |
| for (source_clock_domain = 0; source_clock_domain < timing_ctx.sdc->num_constrained_clocks; source_clock_domain++) { |
| /* We're going to use arrival time to flag which nodes we've reached, |
| even though the values we put in will not correspond to actual arrival times. |
| Nodes which are reached on this traversal will get an arrival time of 0. |
| Reset arrival times now to an invalid number. */ |
| for (inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| timing_ctx.tnodes[inode].T_arr = HUGE_NEGATIVE_FLOAT; |
| } |
| |
| /* Reset all constraint_used entries. */ |
| for (sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| constraint_used[sink_clock_domain] = false; |
| } |
| |
| /* Set arrival times for each top-level tnode on this clock domain. */ |
| if(!timing_ctx.tnodes_at_level.empty()) { |
| num_at_level = timing_ctx.tnodes_at_level[0].size(); |
| } else { |
| num_at_level = 0; |
| } |
| for (i = 0; i < num_at_level; i++) { |
| inode = timing_ctx.tnodes_at_level[0][i]; |
| if (timing_ctx.tnodes[inode].clock_domain == source_clock_domain) { |
| timing_ctx.tnodes[inode].T_arr = 0.; |
| } |
| } |
| |
| for (ilevel = 0; ilevel < timing_ctx.num_tnode_levels; ilevel++) { /* Go down one level at a time. */ |
| num_at_level = timing_ctx.tnodes_at_level[ilevel].size(); |
| |
| for (i = 0; i < num_at_level; i++) { |
| inode = timing_ctx.tnodes_at_level[ilevel][i]; /* Go through each of the tnodes at the level we're on. */ |
| if (has_valid_T_arr(inode)) { /* If this tnode has been used */ |
| num_edges = timing_ctx.tnodes[inode].num_edges; |
| if (num_edges == 0) { /* sink */ |
| /* We've reached the sink domain of this tnode, so set constraint_used |
| to true for this tnode's clock domain (if it has a valid one). */ |
| sink_clock_domain = timing_ctx.tnodes[inode].clock_domain; |
| if (sink_clock_domain != -1) { |
| constraint_used[sink_clock_domain] = true; |
| } |
| } else { |
| /* Set arrival time to a valid value (0.) for each tnode in this tnode's fanout. */ |
| tedge = timing_ctx.tnodes[inode].out_edges; |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| timing_ctx.tnodes[to_node].T_arr = 0.; |
| } |
| } |
| } |
| } |
| } |
| |
| /* At the end of the source domain traversal, see which sink domains haven't been hit, |
| and set the constraint for the pair of source and sink domains to DO_NOT_ANALYSE */ |
| |
| for (sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| if (!constraint_used[sink_clock_domain]) { |
| if(timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain] != DO_NOT_ANALYSE) { |
| vtr::printf_warning(__FILE__, __LINE__, "Timing constraint from clock %d to %d of value %f will be disabled" |
| " since it is not activated by any path in the timing graph.\n", |
| source_clock_domain, sink_clock_domain, |
| timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain]); |
| } |
| timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain] = DO_NOT_ANALYSE; |
| } |
| } |
| } |
| |
| free(constraint_used); |
| |
| /* Print constraints */ |
| if (getEchoEnabled() && isEchoFileEnabled(E_ECHO_TIMING_CONSTRAINTS)) { |
| print_timing_constraint_info(getEchoFileName(E_ECHO_TIMING_CONSTRAINTS)); |
| } |
| |
| /* Convert timing_ctx.sdc->domain_constraint and ff-level override constraints to be in seconds, not nanoseconds. */ |
| for (source_clock_domain = 0; source_clock_domain < timing_ctx.sdc->num_constrained_clocks; source_clock_domain++) { |
| for (sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| constraint = timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain]; |
| if (constraint > NEGATIVE_EPSILON) { /* if constraint does not equal DO_NOT_ANALYSE */ |
| timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain] = constraint * 1e-9; |
| } |
| } |
| } |
| for (icf = 0; icf < timing_ctx.sdc->num_cf_constraints; icf++) { |
| timing_ctx.sdc->cf_constraints[icf].constraint *= 1e-9; |
| } |
| for (ifc = 0; ifc < timing_ctx.sdc->num_fc_constraints; ifc++) { |
| timing_ctx.sdc->fc_constraints[ifc].constraint *= 1e-9; |
| } |
| for (iff = 0; iff < timing_ctx.sdc->num_ff_constraints; iff++) { |
| timing_ctx.sdc->ff_constraints[iff].constraint *= 1e-9; |
| } |
| |
| /* Finally, free timing_ctx.sdc->cc_constraints since all of its info is contained in timing_ctx.sdc->domain_constraint. */ |
| free_override_constraint(timing_ctx.sdc->cc_constraints, timing_ctx.sdc->num_cc_constraints); |
| } |
| |
| static void alloc_timing_stats() { |
| |
| /* Allocate f_timing_stats data structure. */ |
| |
| int i; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| f_timing_stats = (t_timing_stats *) vtr::malloc(sizeof(t_timing_stats)); |
| f_timing_stats->cpd = (float **) vtr::malloc(timing_ctx.sdc->num_constrained_clocks * sizeof(float *)); |
| f_timing_stats->least_slack = (float **) vtr::malloc(timing_ctx.sdc->num_constrained_clocks * sizeof(float *)); |
| for (i = 0; i < timing_ctx.sdc->num_constrained_clocks; i++) { |
| f_timing_stats->cpd[i] = (float *) vtr::malloc(timing_ctx.sdc->num_constrained_clocks * sizeof(float)); |
| f_timing_stats->least_slack[i] = (float *) vtr::malloc(timing_ctx.sdc->num_constrained_clocks * sizeof(float)); |
| } |
| } |
| |
| void do_timing_analysis(t_slack * slacks, const t_timing_inf &timing_inf, bool is_prepacked, bool is_final_analysis) { |
| |
| /* Performs timing analysis on the circuit. Before this routine is called, t_slack * slacks |
| must have been allocated, and the circuit must have been converted into a timing graph. |
| The nodes of the timing graph represent pins and the edges between them represent delays |
| and dependencies from one pin to another. Most elements are modeled as a pair of nodes so |
| that the delay through the element can be marked on the edge between them (e.g. |
| TN_INPAD_SOURCE->TN_INPAD_OPIN, TN_OUTPAD_IPIN->TN_OUTPAD_SINK, TN_PRIMITIVE_IPIN-> |
| TN_PRIMITIVE_OPIN, etc.). |
| |
| The timing graph nodes are stored as an array, tnode [0..timing_ctx.num_tnodes - 1]. Each tnode |
| includes an array of all edges, tedge, which fan out from it. Each tedge includes the |
| index of the node on its far end (in the tnode array), and the delay to that node. |
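| |
| As a sketch (not the full struct definition), each edge record is conceptually |
| t_tedge { int to_node; float Tdel; } |
| so tnode[i].out_edges[k].to_node indexes the tnode array and .Tdel is the delay |
| along that edge, exactly as used by the traversals below. |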
| |
| The timing graph has sources at each TN_FF_SOURCE (Q output), TN_INPAD_SOURCE (input I/O pad) |
| and TN_CONSTANT_GEN_SOURCE (constant 1 or 0 generator) node and sinks at TN_FF_SINK (D input) |
| and TN_OUTPAD_SINK (output I/O pad) nodes. Two traversals, one forward (sources to sinks) |
| and one backward, are performed for each valid constraint (one which is not DO_NOT_ANALYSE) |
| between a source and a sink clock domain in the matrix timing_ctx.sdc->domain_constraint |
| [0..timing_ctx.sdc->num_constrained_clocks - 1][0..timing_ctx.sdc->num_constrained_clocks - 1]. This matrix has been |
| pruned so that all domain pairs with no paths between them have been set to DO_NOT_ANALYSE. |
| |
| During the traversal pair for each constraint, all nodes in the fanout of sources on the |
| source clock domain are assigned a T_arr, the arrival time of the last input signal to the node. |
| All nodes in the fanin of sinks on the sink clock domain are assigned a T_req, the required |
| arrival time of the last input signal to the node if the critical path for this constraint is |
| not to be lengthened. Nodes which receive both a valid T_arr and T_req are flagged with |
| used_on_this_traversal, and nodes which are used on at least one traversal pair are flagged |
| with has_valid_slack so that later functions know the slack is valid. |
| |
| After each traversal pair, a slack is calculated for each sink pin on each net (or equivalently, |
| each connection or tedge fanning out from that net's driver tnode). Slack is calculated as: |
| T_req (dest node) - T_arr (source node) - Tdel (edge) |
| and represents the amount of delay which could be added to this connection before the critical |
| path delay for this constraint would worsen. Edges on the critical path have a slack of 0. |
| Slacks which are never used are set to HUGE_POSITIVE_FLOAT. |
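| |
| For example (illustrative numbers only): if T_req at the destination node is 5.0 ns, |
| T_arr at the source node is 3.2 ns and the edge's Tdel is 1.0 ns, the connection's |
| slack is 5.0 - 3.2 - 1.0 = 0.8 ns, i.e. up to 0.8 ns of extra delay could be added |
| to that connection without degrading this constraint's critical path. |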
| |
| The optimizers actually use a metric called timing_criticality. Timing criticality is defined |
| as 1 - slack / criticality_denom, where the normalization factor criticality_denom is the max |
| of all arrival times in the constraint and the constraint itself (T_req-relaxed slacks) or |
| all arrival times and constraints in the design (shifted slacks). See below for a further |
| discussion of these two regimes. Timing criticality is always between 0 (not critical) and 1 |
| (very critical). Unlike slack, which is set to HUGE_POSITIVE_FLOAT for unanalysed connections, |
| timing criticality is 0 for these, meaning no special check has to be made for which connections |
| have been analysed. |
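| |
| Continuing the example above (illustrative numbers only): with slack = 0.8 ns and |
| criticality_denom = 4.0 ns, timing criticality = 1 - 0.8/4.0 = 0.8, while a |
| connection on the critical path (slack = 0) would have timing criticality 1.0. |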
| |
| If path counting is on (PATH_COUNTING is defined in vpr_types.h), the optimizers use a weighted |
| sum of timing_criticality and path_criticality, the latter of which is an estimate of the |
| importance of the number of paths using a particular connection. As a path's timing_criticality |
| decreases, it will become exponentially less important to the path_criticality of any connection |
| which this path uses. Path criticality also goes from 0 (not critical or unanalysed) to 1. |
| |
| Slack and criticalities are only calculated if both the driver of the net and the sink pin were |
| used_on_this_traversal, and are only overwritten if lower than previously-obtained values. |
| The optimizers actually use criticality rather than slack, but slack is useful for human |
| designers and so we calculate it only if we need to print it. |
| |
| This routine outputs slack and criticality to t_slack * slacks. It also stores least slack and |
| critical path delay per constraint [0..timing_ctx.sdc->num_constrained_clocks - 1][0..timing_ctx.sdc->num_constrained_clocks - 1] |
| in the file-scope variable f_timing_stats. |
| |
| Is_prepacked flags whether to calculate normalized costs for the clusterer (normalized_slack, |
| normalized_Tarr, normalized_total_critical_paths). Setting this to false saves time in post- |
| packed timing analyses. |
| |
| Is_final_analysis flags whether this is the final analysis pass. If it is, the analyser will |
| compute actual slacks instead of relaxed ones. We "relax" slacks by setting the required time to |
| the maximum arrival time for tight constraints so that no slacks are negative (which confuses |
| the optimizers). This is called "T_req-relaxed" slack. However, human designers want to see |
| actual slack values, so we report those in the final analysis. The alternative ways of making |
| slacks positive are shifting them upwards by the value of the largest negative slack, after |
| all traversals are complete ("shifted slacks"), which can be enabled by changing slack_definition |
| from 'R' to 'I'; or using either 'R' or 'I' with a criticality denominator which |
| is the same for every traversal (respectively called 'G' and 'S'). |
| |
| To do: flip-flop to flip-flop and flip-flop to clock domain constraints (set_false_path, set_max_delay, |
| and especially set_multicycle_path). All the info for these constraints is contained in timing_ctx.sdc->fc_constraints and |
| timing_ctx.sdc->ff_constraints, but graph traversals are not included yet. Probably, an entire traversal will be needed for |
| each constraint. Clock domain to flip-flop constraints are coded but not tested, and are done within |
| existing traversals. */ |
| |
| /* Description of slack_definition: |
| slack_definition determines how to normalize negative slacks for the optimizers (not in the final timing analysis |
| for output statistics): |
| 'R' (T_req-relaxed): For each constraint, set the required time at sink nodes to the max of the true required time |
| (constraint + timing_ctx.tnodes[inode].clock_skew) and the max arrival time. This means that the required time is "relaxed" |
| to the max arrival time for tight constraints which would otherwise give negative slack. |
| 'I' (Improved Shifted): After all slacks are computed, increase the value of all slacks by the magnitude of the |
| largest negative slack, if it exists. More computationally demanding. Equivalent to 'R' for a single clock. |
| 'S' (Shifted): Same as improved shifted, but criticalities are only computed after all traversals. Equivalent to 'R' |
| for a single clock. |
| 'G' (Global T_req-relaxed): Same as T_req-relaxed, but criticalities are only computed after all traversals. |
| Equivalent to 'R' for a single clock. Note: G is a global version of R, just like S is a global version of I. |
| 'C' (Clipped): All negative slacks are clipped to 0. |
| 'N' (None): Negative slacks are not normalized. |
| |
| This definition also affects how criticality is computed. For all methods except 'S', the criticality denominator |
| is the maximum required time for each constraint, and criticality is updated after each constraint. 'S' only updates |
| criticalities once after all traversals, and the criticality denominator is the maximum required time over all |
| traversals. |
| |
| Because the criticality denominator must be >= the largest slack it is used on, and slack normalization affects the |
| size of the largest slack, the denominator must also be normalized. We choose the following normalization methods: |
| |
| 'R': Denominator is already taken care of because the maximum required time now depends on the constraint. No further |
| normalization is necessary. |
| 'I': Denominator is increased by the magnitude of the largest negative slack. |
| 'S': Denominator is the maximum of the 'I' denominator over all constraints. |
| 'G': Denominator is the maximum of the 'R' denominator over all constraints. |
| 'C': Denominator is unchanged. However, if Treq_max is 0, there is division by 0. To avoid this, note that in this |
| case, all of the slacks will be clipped to zero anyway, so we can just set the criticality to 1. |
| 'N': Denominator is set to max(max_Treq, max_Tarr), so that the magnitude of criticality will at least be bounded to |
| 2. This is the same denominator as 'R', though calculated differently. |
| */ |
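| |
| /* Illustrative 'R' example (hypothetical values): if a constraint is 2.0 ns but the |
| maximum arrival time for that traversal is 3.0 ns, required times at the sinks are |
| relaxed up to 3.0 ns, so the most critical connection gets a slack of 0 rather than |
| -1.0 ns, and criticality_denom for that constraint becomes max(2.0, 3.0) = 3.0 ns. */ |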
| |
| clock_t begin = clock(); |
| |
| #ifdef PATH_COUNTING |
| float max_path_criticality = HUGE_NEGATIVE_FLOAT; /* used to normalize path_criticalities */ |
| #endif |
| |
| bool update_slack = (is_final_analysis || getEchoEnabled()); |
| /* Only update slack values if we need to print it, i.e. |
| for the final output file (is_final_analysis) or echo files. */ |
| |
| float criticality_denom; /* (slack_definition == 'R' and 'I' only) For a particular constraint, the maximum of |
| the constraint and all arrival times for the constraint's traversal. Used to |
| normalize the clusterer's normalized_slack and, more importantly, criticality. */ |
| |
| long max_critical_output_paths, max_critical_input_paths; |
| |
| float smallest_slack_in_design = HUGE_POSITIVE_FLOAT; |
| /* For slack_definition == 'S', shift all slacks upwards by this number if it is negative. */ |
| |
| float criticality_denom_global = HUGE_NEGATIVE_FLOAT; |
| /* Denominator of criticality for slack_definition == 'S' || slack_definition == 'G' - |
| max of all arrival times and all constraints. */ |
| |
| update_slack = update_slack || timing_inf.slack_definition == std::string("S") || timing_inf.slack_definition == std::string("G"); |
| /* Also update slack values for slack definitions which need them to compute timing criticalities. */ |
| |
| auto& timing_ctx = g_vpr_ctx.mutable_timing(); |
| |
| /* Reset all values which need to be reset once per |
| timing analysis, rather than once per traversal pair. */ |
| |
| /* Reset slack and criticality */ |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| for (int ipin = 1; ipin < (int) num_timing_net_pins(inet); ipin++) { |
| slacks->slack[inet][ipin] = HUGE_POSITIVE_FLOAT; |
| slacks->timing_criticality[inet][ipin] = 0.; |
| #ifdef PATH_COUNTING |
| slacks->path_criticality[inet][ipin] = 0.; |
| #endif |
| } |
| } |
| |
| /* Reset f_timing_stats. */ |
| for (int i = 0; i < timing_ctx.sdc->num_constrained_clocks; i++) { |
| for (int j = 0; j < timing_ctx.sdc->num_constrained_clocks; j++) { |
| f_timing_stats->cpd[i][j] = HUGE_NEGATIVE_FLOAT; |
| f_timing_stats->least_slack[i][j] = HUGE_POSITIVE_FLOAT; |
| } |
| } |
| |
| #ifndef PATH_COUNTING |
| /* Reset normalized values for clusterer. */ |
| if (is_prepacked) { |
| for (int inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| timing_ctx.tnodes[inode].prepacked_data->normalized_slack = HUGE_POSITIVE_FLOAT; |
| timing_ctx.tnodes[inode].prepacked_data->normalized_T_arr = HUGE_NEGATIVE_FLOAT; |
| timing_ctx.tnodes[inode].prepacked_data->normalized_total_critical_paths = HUGE_NEGATIVE_FLOAT; |
| } |
| } |
| #endif |
| |
| vtr::vector_map<ClusterBlockId, t_pb **> pin_id_to_pb_mapping = alloc_and_load_pin_id_to_pb_mapping(); |
| |
| if (timing_inf.slack_definition == std::string("I")) { |
| /* Find the smallest slack in the design, if negative. */ |
| smallest_slack_in_design = find_least_slack(is_prepacked, pin_id_to_pb_mapping); |
| if (smallest_slack_in_design > 0) smallest_slack_in_design = 0; |
| } |
| |
| /* For each valid constraint (pair of source and sink clock domains), we do one |
| forward and one backward topological traversal to find arrival and required times, |
| in do_timing_analysis_for_constraint. If path counting is on, we then do another, |
| simpler traversal to find forward and backward weights, relying on the largest |
| required time we found from the first traversal. After each constraint's traversals, |
| we update the slacks, timing criticalities and (if necessary) path criticalities or |
| normalized costs used by the clusterer. */ |
| |
| for (int source_clock_domain = 0; source_clock_domain < timing_ctx.sdc->num_constrained_clocks; source_clock_domain++) { |
| for (int sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| if (timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain] > NEGATIVE_EPSILON) { /* i.e. != DO_NOT_ANALYSE */ |
| |
| /* Perform the forward and backward traversal for this constraint. */ |
| criticality_denom = do_timing_analysis_for_constraint(source_clock_domain, sink_clock_domain, |
| is_prepacked, is_final_analysis, &max_critical_input_paths, &max_critical_output_paths, |
| pin_id_to_pb_mapping, timing_inf); |
| #ifdef PATH_COUNTING |
| /* Weight the importance of each net, used in slack calculation. */ |
| do_path_counting(criticality_denom); |
| #endif |
| |
| if (timing_inf.slack_definition == std::string("I")) { |
| criticality_denom -= smallest_slack_in_design; |
| /* Remember, smallest_slack_in_design is negative, so we're INCREASING criticality_denom. */ |
| } |
| |
| /* Update the slack and criticality for each edge of each net which was |
| analysed on the most recent traversal and has a lower (slack) or |
| higher (criticality) value than before. */ |
| update_slacks(slacks, criticality_denom, |
| update_slack, is_final_analysis, smallest_slack_in_design, timing_inf); |
| |
| #ifndef PATH_COUNTING |
| /* Update the normalized costs used by the clusterer. */ |
| if (is_prepacked) { |
| update_normalized_costs(criticality_denom, max_critical_input_paths, max_critical_output_paths, timing_inf); |
| } |
| #endif |
| |
| if (timing_inf.slack_definition == std::string("S") || timing_inf.slack_definition == std::string("G")) { |
| /* Set criticality_denom_global to the max of criticality_denom over all traversals. */ |
| criticality_denom_global = max(criticality_denom_global, criticality_denom); |
| } |
| } |
| } |
| } |
| |
| #ifdef PATH_COUNTING |
| /* Normalize path criticalities by the largest value in the |
| circuit. Otherwise, path criticalities would be unbounded. */ |
| |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| int num_edges = num_timing_net_sinks(inet); |
| for (int iedge = 0; iedge < num_edges; iedge++) { |
| max_path_criticality = max(max_path_criticality, slacks->path_criticality[inet][iedge + 1]); |
| } |
| } |
| |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| int num_edges = num_timing_net_sinks(inet); |
| for (int iedge = 0; iedge < num_edges; iedge++) { |
| slacks->path_criticality[inet][iedge + 1] /= max_path_criticality; |
| } |
| } |
| |
| #endif |
| |
| if (timing_inf.slack_definition == std::string("S") || timing_inf.slack_definition == std::string("G")) { |
| if (!is_final_analysis) { |
| if (timing_inf.slack_definition == std::string("S")) { |
| /* Find the smallest slack in the design. */ |
| for (int i = 0; i < timing_ctx.sdc->num_constrained_clocks; i++) { |
| for (int j = 0; j < timing_ctx.sdc->num_constrained_clocks; j++) { |
| smallest_slack_in_design = min(smallest_slack_in_design, f_timing_stats->least_slack[i][j]); |
| } |
| } |
| |
| /* Increase all slacks by the value of the smallest slack in the design, if it's negative. |
| Or clip all negative slacks to 0, based on how we choose to normalize slacks*/ |
| if (smallest_slack_in_design < 0) { |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| int num_edges = num_timing_net_sinks(inet); |
| for (int iedge = 0; iedge < num_edges; iedge++) { |
| slacks->slack[inet][iedge + 1] -= smallest_slack_in_design; |
| /* Remember, smallest_slack_in_design is negative, so we're INCREASING all the slacks. */ |
| /* Note that if slack was equal to HUGE_POSITIVE_FLOAT, it will still be equal to more than this, |
| so it will still be ignored when we calculate criticalities. */ |
| } |
| } |
| criticality_denom_global -= smallest_slack_in_design; |
| /* Also need to increase the criticality denominator. In some cases, slack > criticality_denom_global, causing |
| timing_criticality < 0 when calculated later. Do this shift to keep timing_criticality non-negative. |
| */ |
| } |
| } |
| |
| /* We can now calculate criticalities (for 'S', this must be done after we normalize slacks). */ |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| int num_edges = num_timing_net_sinks(inet); |
| for (int iedge = 0; iedge < num_edges; iedge++) { |
| if (slacks->slack[inet][iedge + 1] < HUGE_POSITIVE_FLOAT - 1) { /* if the slack is valid */ |
| slacks->timing_criticality[inet][iedge + 1] = 1 - slacks->slack[inet][iedge + 1]/criticality_denom_global; |
| VTR_ASSERT(slacks->timing_criticality[inet][iedge + 1] > -0.01); |
| VTR_ASSERT(slacks->timing_criticality[inet][iedge + 1] < 1.01); |
| } |
| /* otherwise, criticality remains 0, as it was initialized */ |
| } |
| } |
| } |
| } |
| free_pin_id_to_pb_mapping(pin_id_to_pb_mapping); |
| |
| clock_t end = clock(); |
| timing_ctx.stats.old_sta_wallclock_time += double(end - begin) / CLOCKS_PER_SEC; |
| timing_ctx.stats.num_old_sta_full_updates += 1; |
| } |
| |
| static float find_least_slack(bool is_prepacked, vtr::vector_map<ClusterBlockId, t_pb **> &pin_id_to_pb_mapping) { |
| /* Perform a simplified version of do_timing_analysis_for_constraint |
| to compute only the smallest slack in the design. |
| USED ONLY WHEN slack_definition == 'I'! */ |
| |
| float smallest_slack_in_design = HUGE_POSITIVE_FLOAT; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| for (int source_clock_domain = 0; source_clock_domain < timing_ctx.sdc->num_constrained_clocks; source_clock_domain++) { |
| for (int sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| if (timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain] > NEGATIVE_EPSILON) { /* i.e. != DO_NOT_ANALYSE */ |
| |
| /* Reset all arrival and required times. */ |
| for (int inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| timing_ctx.tnodes[inode].T_arr = HUGE_NEGATIVE_FLOAT; |
| timing_ctx.tnodes[inode].T_req = HUGE_POSITIVE_FLOAT; |
| } |
| |
| /* Set arrival times for each top-level tnode on this source domain. */ |
| int num_at_level = timing_ctx.tnodes_at_level[0].size(); |
| for (int i = 0; i < num_at_level; i++) { |
| int inode = timing_ctx.tnodes_at_level[0][i]; |
| if (timing_ctx.tnodes[inode].clock_domain == source_clock_domain) { |
| if (timing_ctx.tnodes[inode].type == TN_FF_SOURCE) { |
| timing_ctx.tnodes[inode].T_arr = timing_ctx.tnodes[inode].clock_delay; |
| } else if (timing_ctx.tnodes[inode].type == TN_INPAD_SOURCE) { |
| timing_ctx.tnodes[inode].T_arr = 0.; |
| } |
| } |
| } |
| |
| /* Forward traversal, to compute arrival times. */ |
| for (int ilevel = 0; ilevel < timing_ctx.num_tnode_levels; ilevel++) { |
| num_at_level = timing_ctx.tnodes_at_level[ilevel].size(); |
| |
| for (int i = 0; i < num_at_level; i++) { |
| int inode = timing_ctx.tnodes_at_level[ilevel][i]; |
| if (timing_ctx.tnodes[inode].T_arr < NEGATIVE_EPSILON) { |
| continue; |
| } |
| |
| int num_edges = timing_ctx.tnodes[inode].num_edges; |
| t_tedge *tedge = timing_ctx.tnodes[inode].out_edges; |
| |
| for (int iedge = 0; iedge < num_edges; iedge++) { |
| int to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| timing_ctx.tnodes[to_node].T_arr = max(timing_ctx.tnodes[to_node].T_arr, timing_ctx.tnodes[inode].T_arr + tedge[iedge].Tdel); |
| } |
| } |
| } |
| |
| /* Backward traversal, to compute required times and slacks. We can skip nodes which are more than |
| 1 away from a sink, because the path with the least slack must use a connection between a sink |
| and a node one edge away from that sink. */ |
| for (int ilevel = timing_ctx.num_tnode_levels - 1; ilevel >= 0; ilevel--) { |
| num_at_level = timing_ctx.tnodes_at_level[ilevel].size(); |
| |
| for (int i = 0; i < num_at_level; i++) { |
| int inode = timing_ctx.tnodes_at_level[ilevel][i]; |
| int num_edges = timing_ctx.tnodes[inode].num_edges; |
| if (num_edges == 0) { /* sink */ |
| if (timing_ctx.tnodes[inode].type == TN_FF_CLOCK || timing_ctx.tnodes[inode].T_arr < HUGE_NEGATIVE_FLOAT + 1 |
| || timing_ctx.tnodes[inode].clock_domain != sink_clock_domain) { |
| continue; |
| } |
| |
| float constraint; |
| if (timing_ctx.sdc->num_cf_constraints > 0) { |
| char *source_domain_name = timing_ctx.sdc->constrained_clocks[source_clock_domain].name; |
| const char *sink_register_name = find_tnode_net_name(inode, is_prepacked, pin_id_to_pb_mapping); |
| int icf = find_cf_constraint(source_domain_name, sink_register_name); |
| if (icf != DO_NOT_ANALYSE) { |
| constraint = timing_ctx.sdc->cf_constraints[icf].constraint; |
| if (constraint < NEGATIVE_EPSILON) { |
| continue; |
| } |
| } else { |
| constraint = timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain]; |
| } |
| } else { |
| constraint = timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain]; |
| } |
| |
| timing_ctx.tnodes[inode].T_req = constraint + timing_ctx.tnodes[inode].clock_delay; |
| } else { |
| |
| if (timing_ctx.tnodes[inode].T_arr < HUGE_NEGATIVE_FLOAT + 1) { |
| continue; |
| } |
| |
| bool found = false; |
| t_tedge *tedge = timing_ctx.tnodes[inode].out_edges; |
| for (int iedge = 0; iedge < num_edges && !found; iedge++) { |
| int to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| if(timing_ctx.tnodes[to_node].T_req < HUGE_POSITIVE_FLOAT) { |
| found = true; |
| } |
| } |
| if (!found) { |
| continue; |
| } |
| |
| for (int iedge = 0; iedge < num_edges; iedge++) { |
| int to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| if (timing_ctx.tnodes[to_node].num_edges == 0 && |
| timing_ctx.tnodes[to_node].clock_domain == sink_clock_domain) { // one away from a register on this sink domain |
| float Tdel = tedge[iedge].Tdel; |
| float T_req = timing_ctx.tnodes[to_node].T_req; |
| smallest_slack_in_design = min(smallest_slack_in_design, T_req - Tdel - timing_ctx.tnodes[inode].T_arr); |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| return smallest_slack_in_design; |
| } |
| |
| static float do_timing_analysis_for_constraint(int source_clock_domain, int sink_clock_domain, |
| bool is_prepacked, bool is_final_analysis, long * max_critical_input_paths_ptr, |
| long * max_critical_output_paths_ptr, vtr::vector_map<ClusterBlockId, t_pb **> &pin_id_to_pb_mapping, const t_timing_inf &timing_inf) { |
| |
| /* Performs a single forward and backward traversal for the domain pair |
| source_clock_domain and sink_clock_domain. Returns the denominator that |
| will be later used to normalize criticality - the maximum of all arrival |
| times from this traversal and the constraint for this pair of domains. |
| Also returns the maximum number of critical input and output paths of any |
| node analysed for this constraint, passed by reference from do_timing_analysis. */ |
| |
| int inode, num_at_level, i, total, ilevel, num_edges, iedge, to_node; |
| float constraint, Tdel, T_req; |
| |
| float max_Tarr = HUGE_NEGATIVE_FLOAT; /* Max of all arrival times for this constraint - |
| used to relax required times for slack_definition 'R' and 'G' |
| and to normalize slacks for slack_definition 'N'. */ |
| |
| float max_Treq = HUGE_NEGATIVE_FLOAT; /* Max of all required time for this constraint - |
| used in criticality denominator. */ |
| |
| auto& timing_ctx = g_vpr_ctx.mutable_timing(); |
| |
| t_tedge * tedge; |
| int num_dangling_nodes; |
| bool found; |
| long max_critical_input_paths = 0, max_critical_output_paths = 0; |
| |
| /* If not passed in, alloc and load pin_id_to_pb_mapping (and make sure to free later). */ |
| bool must_free_mapping = false; |
| if (pin_id_to_pb_mapping.size() == 0) { |
| pin_id_to_pb_mapping = alloc_and_load_pin_id_to_pb_mapping(); |
| must_free_mapping = true; |
| } |
| |
| /* Reset all values which need to be reset once per |
| traversal pair, rather than once per timing analysis. */ |
| |
| /* Reset all arrival and required times. */ |
| for (inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| timing_ctx.tnodes[inode].T_arr = HUGE_NEGATIVE_FLOAT; |
| timing_ctx.tnodes[inode].T_req = HUGE_POSITIVE_FLOAT; |
| } |
| |
| #ifndef PATH_COUNTING |
| /* Reset num_critical_output_paths. */ |
| if (is_prepacked) { |
| for (inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| timing_ctx.tnodes[inode].prepacked_data->num_critical_output_paths = 0; |
| } |
| } |
| #endif |
| |
| /* Set arrival times for each top-level tnode on this source domain. */ |
| num_at_level = timing_ctx.tnodes_at_level[0].size(); |
| for (i = 0; i < num_at_level; i++) { |
| inode = timing_ctx.tnodes_at_level[0][i]; |
| |
| if (timing_ctx.tnodes[inode].clock_domain == source_clock_domain) { |
| |
| if (timing_ctx.tnodes[inode].type == TN_FF_SOURCE) { |
| /* Set the arrival time of this flip-flop tnode to its clock skew. */ |
| timing_ctx.tnodes[inode].T_arr = timing_ctx.tnodes[inode].clock_delay; |
| |
| } else if (timing_ctx.tnodes[inode].type == TN_INPAD_SOURCE) { |
| /* There's no such thing as clock skew for external clocks, and |
| input delay is already marked on the edge coming out from this node. |
| As a result, the signal can be said to arrive at t = 0. */ |
| timing_ctx.tnodes[inode].T_arr = 0.; |
| } |
| |
| } |
| } |
| |
| /* Compute arrival times with a forward topological traversal from sources |
| (TN_FF_SOURCE, TN_INPAD_SOURCE, TN_CONSTANT_GEN_SOURCE) to sinks (TN_FF_SINK, TN_OUTPAD_SINK). */ |
| |
| total = 0; /* We count up all tnodes to error-check at the end. */ |
| for (ilevel = 0; ilevel < timing_ctx.num_tnode_levels; ilevel++) { /* For each level of our levelized timing graph... */ |
| num_at_level = timing_ctx.tnodes_at_level[ilevel].size(); /* ...there are num_at_level tnodes at that level. */ |
| total += num_at_level; |
| |
| for (i = 0; i < num_at_level; i++) { |
| inode = timing_ctx.tnodes_at_level[ilevel][i]; /* Go through each of the tnodes at the level we're on. */ |
| if (timing_ctx.tnodes[inode].T_arr < NEGATIVE_EPSILON) { /* If the arrival time is less than 0 (i.e. HUGE_NEGATIVE_FLOAT)... */ |
| continue; /* End this iteration of the num_at_level for loop since |
| this node is not part of the clock domain we're analyzing. |
| (If it were, it would have received an arrival time already.) */ |
| } |
| |
| num_edges = timing_ctx.tnodes[inode].num_edges; /* Get the number of edges fanning out from the node we're visiting */ |
| tedge = timing_ctx.tnodes[inode].out_edges; /* Get the list of edges from the node we're visiting */ |
| #ifndef PATH_COUNTING |
| if (is_prepacked && ilevel == 0) { |
| timing_ctx.tnodes[inode].prepacked_data->num_critical_input_paths = 1; /* Top-level tnodes have one locally-critical input path. */ |
| } |
| |
| /* Using a somewhat convoluted procedure inherited from T-VPack, |
| count how many locally-critical input paths fan into each tnode, |
| and also find the maximum number over all tnodes. */ |
| if (is_prepacked) { |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| if (fabs(timing_ctx.tnodes[to_node].T_arr - (timing_ctx.tnodes[inode].T_arr + tedge[iedge].Tdel)) < EPSILON) { |
| /* If the "local forward slack" (T_arr(to_node) - T_arr(inode) - T_del) for this edge |
| is 0 (i.e. the path from inode to to_node is locally as critical as any other path to |
| to_node), add to_node's num critical input paths to inode's number. */ |
| timing_ctx.tnodes[to_node].prepacked_data->num_critical_input_paths += timing_ctx.tnodes[inode].prepacked_data->num_critical_input_paths; |
| } else if (timing_ctx.tnodes[to_node].T_arr < (timing_ctx.tnodes[inode].T_arr + tedge[iedge].Tdel)) { |
| /* If the "local forward slack" for this edge is negative, |
| reset to_node's num critical input paths to inode's number. */ |
| timing_ctx.tnodes[to_node].prepacked_data->num_critical_input_paths = timing_ctx.tnodes[inode].prepacked_data->num_critical_input_paths; |
| } |
| /* Set max_critical_input_paths to the maximum number of critical |
| input paths for all tnodes analysed on this traversal. */ |
| if (timing_ctx.tnodes[to_node].prepacked_data->num_critical_input_paths > max_critical_input_paths) { |
| max_critical_input_paths = timing_ctx.tnodes[to_node].prepacked_data->num_critical_input_paths; |
| } |
| } |
| } |
| #endif |
| for (iedge = 0; iedge < num_edges; iedge++) { /* Now go through each edge coming out from this tnode */ |
| to_node = tedge[iedge].to_node; /* Get the index of the destination tnode of this edge. */ |
| if (to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| /* The arrival time T_arr at the destination node is set to the maximum of all |
| the possible arrival times from all edges fanning in to the node. The arrival |
| time represents the latest time that all inputs must arrive at a node. */ |
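| //Illustrative numbers: if two fanin edges would deliver the signal at 1.2 ns and |
| //1.5 ns, to_node's T_arr settles at max(1.2, 1.5) = 1.5 ns once both of its |
| //source nodes have been visited. |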
| timing_ctx.tnodes[to_node].T_arr = max(timing_ctx.tnodes[to_node].T_arr, timing_ctx.tnodes[inode].T_arr + tedge[iedge].Tdel); |
| |
| if (timing_inf.slack_definition == std::string("R") || timing_inf.slack_definition == std::string("G")) { |
| /* Since we updated the destination node (to_node), change the max arrival |
| time for the forward traversal if to_node's arrival time is greater than |
| the existing maximum, and it is on the sink clock domain. */ |
| if (timing_ctx.tnodes[to_node].num_edges == 0 && timing_ctx.tnodes[to_node].clock_domain == sink_clock_domain) { |
| max_Tarr = max(max_Tarr, timing_ctx.tnodes[to_node].T_arr); |
| } |
| } |
| } |
| } |
| } |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| VTR_ASSERT(total == timing_ctx.num_tnodes); |
| num_dangling_nodes = 0; |
| /* Compute required times with a backward topological traversal from sinks to sources. */ |
| for (ilevel = timing_ctx.num_tnode_levels - 1; ilevel >= 0; ilevel--) { |
| num_at_level = timing_ctx.tnodes_at_level[ilevel].size(); |
| |
| for (i = 0; i < num_at_level; i++) { |
| inode = timing_ctx.tnodes_at_level[ilevel][i]; |
| num_edges = timing_ctx.tnodes[inode].num_edges; |
| |
| if (ilevel == 0) { |
| if (!(timing_ctx.tnodes[inode].type == TN_INPAD_SOURCE || timing_ctx.tnodes[inode].type == TN_FF_SOURCE || timing_ctx.tnodes[inode].type == TN_CONSTANT_GEN_SOURCE || |
| timing_ctx.tnodes[inode].type == TN_CLOCK_SOURCE) && !timing_ctx.tnodes[inode].is_comb_loop_breakpoint) { |
| //We suppress node type errors if they have the is_comb_loop_breakpoint flag set. |
| //The flag denotes that an input edge to this node was disconnected to break a combinational |
| //loop, and hence we don't consider this an error. |
| AtomPinId pin_id = atom_ctx.lookup.classic_tnode_atom_pin(inode); |
| if(pin_id) { |
| AtomPortId port_id = atom_ctx.nlist.pin_port(pin_id); |
| AtomBlockId blk_id = atom_ctx.nlist.pin_block(pin_id); |
| vpr_throw(VPR_ERROR_TIMING,__FILE__, __LINE__, |
| "Timing graph discovered unexpected edge to node %d atom block '%s' port '%s' port_bit '%zu'.", |
| inode, |
| atom_ctx.nlist.block_name(blk_id).c_str(), |
| atom_ctx.nlist.port_name(port_id).c_str(), |
| atom_ctx.nlist.pin_port_bit(pin_id)); |
| } else if(timing_ctx.tnodes[inode].pb_graph_pin) { |
| vpr_throw(VPR_ERROR_TIMING, __FILE__, __LINE__, |
| "Timing graph started on unexpected node %d %s.%s[%d].", |
| inode, |
| timing_ctx.tnodes[inode].pb_graph_pin->parent_node->pb_type->name, |
| timing_ctx.tnodes[inode].pb_graph_pin->port->name, |
| timing_ctx.tnodes[inode].pb_graph_pin->pin_number); |
| } else { |
| vpr_throw(VPR_ERROR_TIMING, __FILE__, __LINE__, |
| "Timing graph started on unexpected node %d.", |
| inode); |
| } |
| } |
| } else { |
| if ((timing_ctx.tnodes[inode].type == TN_INPAD_SOURCE || timing_ctx.tnodes[inode].type == TN_FF_SOURCE || timing_ctx.tnodes[inode].type == TN_CONSTANT_GEN_SOURCE)) { |
| AtomPinId pin_id = atom_ctx.lookup.classic_tnode_atom_pin(inode); |
| if(pin_id) { |
| AtomPortId port_id = atom_ctx.nlist.pin_port(pin_id); |
| AtomBlockId blk_id = atom_ctx.nlist.pin_block(pin_id); |
| vpr_throw(VPR_ERROR_TIMING,__FILE__, __LINE__, |
| "Timing graph discovered unexpected edge to node %d atom block '%s' port '%s' port_bit '%zu'.", |
| inode, |
| atom_ctx.nlist.block_name(blk_id).c_str(), |
| atom_ctx.nlist.port_name(port_id).c_str(), |
| atom_ctx.nlist.pin_port_bit(pin_id)); |
| } else if(timing_ctx.tnodes[inode].pb_graph_pin) { |
| vpr_throw(VPR_ERROR_TIMING,__FILE__, __LINE__, |
| "Timing graph discovered unexpected edge to node %d %s.%s[%d].", |
| inode, |
| timing_ctx.tnodes[inode].pb_graph_pin->parent_node->pb_type->name, |
| timing_ctx.tnodes[inode].pb_graph_pin->port->name, |
| timing_ctx.tnodes[inode].pb_graph_pin->pin_number); |
| } else { |
| vpr_throw(VPR_ERROR_TIMING,__FILE__, __LINE__, |
| "Timing graph discovered unexpected edge to node %d.", |
| inode); |
| } |
| } |
| } |
| |
| /* Unlike the forward traversal, the sinks are all on different levels, so we always have to |
| check whether a node is a sink. We give every sink on the sink clock domain we're considering |
| a valid required time. Every non-sink node in the fanin of one of these sinks and the fanout of |
| some source from the forward traversal also gets a valid required time. */ |
| |
| if (num_edges == 0) { /* sink */ |
| |
| if (timing_ctx.tnodes[inode].type == TN_FF_CLOCK || timing_ctx.tnodes[inode].T_arr < HUGE_NEGATIVE_FLOAT + 1) { |
| continue; /* Skip nodes on the clock net itself, and nodes with unset arrival times. */ |
| } |
| |
| if (!(timing_ctx.tnodes[inode].type == TN_OUTPAD_SINK || timing_ctx.tnodes[inode].type == TN_FF_SINK)) { |
| if(is_prepacked) { |
| AtomPinId pin_id = atom_ctx.lookup.classic_tnode_atom_pin(inode); |
| VTR_ASSERT(pin_id); |
| AtomBlockId blk_id = atom_ctx.nlist.pin_block(pin_id); |
| vtr::printf_warning(__FILE__, __LINE__, |
| "Pin on block %s.%s[%d] not used\n", |
| atom_ctx.nlist.block_name(blk_id).c_str(), |
| timing_ctx.tnodes[inode].prepacked_data->model_port_ptr->name, |
| timing_ctx.tnodes[inode].prepacked_data->model_pin); |
| } |
| num_dangling_nodes++; |
| /* Note: We still need to do the standard traversals with dangling pins so that the algorithm runs properly, but we set T_arr and T_req to values such that dangling nodes do not affect the actual timing values. */ |
| } |
| |
| /* Skip nodes not on the sink clock domain of the |
| constraint we're currently considering */ |
| if (timing_ctx.tnodes[inode].clock_domain != sink_clock_domain) { |
| continue; |
| } |
| |
| /* See if there's an override constraint between the source clock domain (name is |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name) and the flip-flop or outpad we're at |
| now (name is find_tnode_net_name(inode, is_prepacked)). We test if |
| timing_ctx.sdc->num_cf_constraints > 0 first so that we can save time looking up names in the vast |
| majority of cases where there are no such constraints. */ |
| |
| if (timing_ctx.sdc->num_cf_constraints > 0) { |
| char *source_domain_name = timing_ctx.sdc->constrained_clocks[source_clock_domain].name; |
| const char *sink_register_name = find_tnode_net_name(inode, is_prepacked, pin_id_to_pb_mapping); |
| int icf = find_cf_constraint(source_domain_name, sink_register_name); |
| if (icf != DO_NOT_ANALYSE) { |
| constraint = timing_ctx.sdc->cf_constraints[icf].constraint; |
| if (constraint < NEGATIVE_EPSILON) { |
| /* Constraint is DO_NOT_ANALYSE (-1) for this particular sink. */ |
| continue; |
| } |
| } else { |
| /* Use the default constraint from timing_ctx.sdc->domain_constraint. */ |
| constraint = timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain]; |
| /* Constraint is guaranteed to be valid since we checked for it at the very beginning. */ |
| } |
| } else { |
| constraint = timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain]; |
| } |
| |
| /* Now we know we should analyse this tnode. */ |
| |
| if (timing_inf.slack_definition == std::string("R") || timing_inf.slack_definition == std::string("G")) { |
| /* Assign the required time T_req for this leaf node, taking into account clock skew. T_req is the |
| time all inputs to a tnode must arrive by before it would degrade this constraint's critical path delay. |
| |
| Relax the required time at the sink node to be non-negative by taking the max of the "real" required |
| time (constraint + timing_ctx.tnodes[inode].clock_delay) and the max arrival time in this domain (max_Tarr), except |
| for the final analysis where we report actual slack. We do this to prevent any slacks from being |
| negative, since negative slacks are not used effectively by the optimizers. |
| |
| E.g. if we have a 10 ns constraint and it takes 14 ns to get here, we'll have a slack of at most -4 ns |
| for any edge along the path that got us here. If we say the required time is 14 ns (no less than the |
| arrival time), we don't have a negative slack anymore. However, in the final timing analysis, the real |
| slacks are computed (that's what human designers care about), not the relaxed ones. */ |
| |
| if (is_final_analysis) { |
| timing_ctx.tnodes[inode].T_req = constraint + timing_ctx.tnodes[inode].clock_delay; |
| } else { |
| timing_ctx.tnodes[inode].T_req = max(constraint + timing_ctx.tnodes[inode].clock_delay, max_Tarr); |
| } |
| } else { |
| /* Don't do the relaxation and always set T_req equal to the "real" required time. */ |
| timing_ctx.tnodes[inode].T_req = constraint + timing_ctx.tnodes[inode].clock_delay; |
| } |
| |
| max_Treq = max(max_Treq, timing_ctx.tnodes[inode].T_req); |
| |
| /* Store the largest critical path delay for this constraint (source domain AND sink domain) |
| in the matrix critical_path_delay. C.P.D. = T_arr at destination - clock skew at destination |
| = (datapath delay + clock delay to source) - clock delay to destination. |
| |
| Critical path delay is really telling us how fast we can run the source clock before we can no |
| longer meet this constraint. e.g. If the datapath delay is 10 ns, the clock delay at source is |
| 2 ns and the clock delay at destination is 5 ns, then C.P.D. is 7 ns by the above formula. We |
| can run the source clock at 7 ns because the clock skew gives us 3 ns extra to meet the 10 ns |
| datapath delay. */ |
| |
| f_timing_stats->cpd[source_clock_domain][sink_clock_domain] = |
| max(f_timing_stats->cpd[source_clock_domain][sink_clock_domain], |
| (timing_ctx.tnodes[inode].T_arr - timing_ctx.tnodes[inode].clock_delay)); |
| |
| #ifndef PATH_COUNTING |
| if (is_prepacked) { |
| timing_ctx.tnodes[inode].prepacked_data->num_critical_output_paths = 1; /* Bottom-level tnodes have one locally-critical output path. */ |
| } |
| #endif |
| } else { /* not a sink */ |
| |
| VTR_ASSERT(!(timing_ctx.tnodes[inode].type == TN_OUTPAD_SINK || timing_ctx.tnodes[inode].type == TN_FF_SINK || timing_ctx.tnodes[inode].type == TN_FF_CLOCK)); |
| |
| /* We need to skip this node unless it is on a path from source_clock_domain to |
| sink_clock_domain. We need to skip all nodes which: |
| 1. Fan out to the sink domain but do not fan in from the source domain. |
| 2. Fan in from the source domain but do not fan out to the sink domain. |
| 3. Do not fan in or out to either domain. |
| If a node does not fan in from the source domain, it will not have a valid arrival time. |
| So cases 1 and 3 can be skipped by continuing if T_arr = HUGE_NEGATIVE_FLOAT. We cannot |
| treat case 2 so simply, since the required time for this node has not yet been assigned; |
| we have to look at the required time of every node in its immediate fanout instead. */ |
| |
| /* Cases 1 and 3 */ |
| if (timing_ctx.tnodes[inode].T_arr < HUGE_NEGATIVE_FLOAT + 1) { |
| continue; /* Skip nodes with unset arrival times. */ |
| } |
| |
| /* Case 2 */ |
| found = false; |
| tedge = timing_ctx.tnodes[inode].out_edges; |
| for (iedge = 0; iedge < num_edges && !found; iedge++) { |
| to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| if (timing_ctx.tnodes[to_node].T_req < HUGE_POSITIVE_FLOAT) { |
| found = true; |
| } |
| } |
| if (!found) { |
| continue; |
| } |
| |
| /* Now we know this node is on a path from source_clock_domain to |
| sink_clock_domain, and needs to be analyzed. */ |
| |
| /* Opposite to T_arr, set T_req to the MINIMUM of the |
| required times of all edges fanning OUT from this node. */ |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| Tdel = tedge[iedge].Tdel; |
| T_req = timing_ctx.tnodes[to_node].T_req; |
| timing_ctx.tnodes[inode].T_req = min(timing_ctx.tnodes[inode].T_req, T_req - Tdel); |
| |
| /* Update least slack per constraint. This is NOT the same as the minimum slack we will |
| calculate on this traversal for post-packed netlists, which only count inter-cluster |
| slacks. We only look at edges adjacent to sink nodes on the sink clock domain since |
| all paths go through one of these edges. */ |
| if (timing_ctx.tnodes[to_node].num_edges == 0 && timing_ctx.tnodes[to_node].clock_domain == sink_clock_domain) { |
| f_timing_stats->least_slack[source_clock_domain][sink_clock_domain] = |
| min(f_timing_stats->least_slack[source_clock_domain][sink_clock_domain], |
| (T_req - Tdel - timing_ctx.tnodes[inode].T_arr)); |
| } |
| } |
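| |
| /* Illustrative numbers for the recurrence above: if an edge (inode -> to_node) has |
| Tdel = 3 ns and to_node's T_req is 10 ns, inode's T_req is at most 7 ns; if inode's |
| T_arr is 6 ns, the edge has 10 - 6 - 3 = 1 ns of slack. */ |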
| #ifndef PATH_COUNTING |
| |
| /* Similar to before, we count how many locally-critical output paths fan out from each tnode, |
| and also find the maximum number over all tnodes. Unlike for input paths, where we haven't set |
| the arrival time at to_node before analysing it, here the required time is set at both nodes, |
| so the "local backward slack" (T_req(to_node) - T_req(inode) - T_del) will never be negative. |
| Hence, we only have to test if the "local backward slack" is 0. */ |
| if (is_prepacked) { |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| /* If the "local backward slack" (T_arr(to_node) - T_arr(inode) - T_del) for this edge |
| is 0 (i.e. the path from inode to to_node is locally as critical as any other path to |
| to_node), add to_node's num critical output paths to inode's number. */ |
| if (fabs(timing_ctx.tnodes[to_node].T_req - (timing_ctx.tnodes[inode].T_req + tedge[iedge].Tdel)) < EPSILON) { |
| timing_ctx.tnodes[inode].prepacked_data->num_critical_output_paths += timing_ctx.tnodes[to_node].prepacked_data->num_critical_output_paths; |
| } |
| /* Set max_critical_output_paths to the maximum number of critical |
| output paths for all tnodes analysed on this traversal. */ |
| if (timing_ctx.tnodes[to_node].prepacked_data->num_critical_output_paths > max_critical_output_paths) { |
| max_critical_output_paths = timing_ctx.tnodes[to_node].prepacked_data->num_critical_output_paths; |
| } |
| } |
| } |
| #endif |
| } |
| } |
| } |
| |
| if (must_free_mapping) { |
| free_pin_id_to_pb_mapping(pin_id_to_pb_mapping); |
| } |
| |
| /* Return max critical input/output paths for this constraint through |
| the pointers we passed in. */ |
| if (max_critical_input_paths_ptr && max_critical_output_paths_ptr) { |
| *max_critical_input_paths_ptr = max_critical_input_paths; |
| *max_critical_output_paths_ptr = max_critical_output_paths; |
| } |
| |
| if(num_dangling_nodes > 0 && (is_final_analysis || is_prepacked)) { |
| vtr::printf_warning(__FILE__, __LINE__, |
| "%d unused pins \n", num_dangling_nodes); |
| } |
| |
| /* The criticality denominator is the maximum required time, except for slack_definition == 'N', |
| where it is the sum of the maximum arrival and required times. |
| |
| For definitions other than 'R', 'G' and 'N', this works out to the maximum of constraint + |
| timing_ctx.tnodes[inode].clock_delay over all sink nodes. For 'R' and 'G', max_Tarr is |
| implicitly incorporated into the denominator through its inclusion in the relaxed required |
| times; for 'N', the required times are not relaxed, so we explicitly add max_Tarr. */ |
| if (timing_inf.slack_definition == std::string("N")) { |
| return max_Treq + max_Tarr; |
| } else { |
| return max_Treq; |
| } |
| } |
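| |
| /* Worked example of the return value (illustrative): with a 10 ns constraint and 1 ns of |
| clock delay at the sink, the unrelaxed required time is 11 ns. If the worst arrival in the |
| domain is 14 ns, the relaxed definitions ('R'/'G') return max(11, 14) = 14 ns, while 'N' |
| returns the sum of the maximum required and arrival times. */ |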
| #ifdef PATH_COUNTING |
| static void do_path_counting(float criticality_denom) { |
| /* Count the importance of the number of paths going through each net |
| by giving each net a forward and backward path weight. This is the first step of |
| "A Novel Net Weighting Algorithm for Timing-Driven Placement" (Kong, 2002). |
| We only visit nodes with set arrival and required times, so this function |
| must be called after do_timing_analysis_for_constraints, which sets T_arr and T_req. */ |
| |
| int inode, num_at_level, i, ilevel, num_edges, iedge, to_node; |
| t_tedge * tedge; |
| float forward_local_slack, backward_local_slack, discount; |
| |
| /* Reset forward and backward weights for all tnodes. */ |
| for (inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| timing_ctx.tnodes[inode].forward_weight = 0; |
| timing_ctx.tnodes[inode].backward_weight = 0; |
| } |
| |
| /* Set forward weights for each top-level tnode. */ |
| num_at_level = timing_ctx.tnodes_at_level[0].size(); |
| for (i = 0; i < num_at_level; i++) { |
| inode = timing_ctx.tnodes_at_level[0][i]; |
| timing_ctx.tnodes[inode].forward_weight = 1.; |
| } |
| |
| /* Do a forward topological traversal to populate forward weights. */ |
| for (ilevel = 0; ilevel < timing_ctx.num_tnode_levels; ilevel++) { |
| num_at_level = timing_ctx.tnodes_at_level[ilevel].size(); |
| |
| for (i = 0; i < num_at_level; i++) { |
| inode = timing_ctx.tnodes_at_level[ilevel][i]; |
| if (!(has_valid_T_arr(inode) && has_valid_T_req(inode))) { |
| continue; |
| } |
| tedge = timing_ctx.tnodes[inode].out_edges; |
| num_edges = timing_ctx.tnodes[inode].num_edges; |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| if (!(has_valid_T_arr(to_node) && has_valid_T_req(to_node))) { |
| continue; |
| } |
| forward_local_slack = timing_ctx.tnodes[to_node].T_arr - timing_ctx.tnodes[inode].T_arr - tedge[iedge].Tdel; |
| discount = pow((float) DISCOUNT_FUNCTION_BASE, -1 * forward_local_slack / criticality_denom); |
| timing_ctx.tnodes[to_node].forward_weight += discount * timing_ctx.tnodes[inode].forward_weight; |
| } |
| } |
| } |
| |
| /* Do a backward topological traversal to populate backward weights. |
| Since the sinks are all on different levels, we have to check for them as we go. */ |
| for (ilevel = timing_ctx.num_tnode_levels - 1; ilevel >= 0; ilevel--) { |
| num_at_level = timing_ctx.tnodes_at_level[ilevel].size(); |
| |
| for (i = 0; i < num_at_level; i++) { |
| inode = timing_ctx.tnodes_at_level[ilevel][i]; |
| if (!(has_valid_T_arr(inode) && has_valid_T_req(inode))) { |
| continue; |
| } |
| num_edges = timing_ctx.tnodes[inode].num_edges; |
| if (num_edges == 0) { /* sink */ |
| timing_ctx.tnodes[inode].backward_weight = 1.; |
| } else { |
| tedge = timing_ctx.tnodes[inode].out_edges; |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| if (!(has_valid_T_arr(to_node) && has_valid_T_req(to_node))) { |
| continue; |
| } |
| backward_local_slack = timing_ctx.tnodes[to_node].T_req - timing_ctx.tnodes[inode].T_req - tedge[iedge].Tdel; |
| discount = pow((float) DISCOUNT_FUNCTION_BASE, -1 * backward_local_slack / criticality_denom); |
| timing_ctx.tnodes[inode].backward_weight += discount * timing_ctx.tnodes[to_node].backward_weight; |
| } |
| } |
| } |
| } |
| } |
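| |
| /* Illustrative numbers for the discount above, assuming a discount base of 100: an edge with |
| zero local slack passes its weight through undiminished (100^0 = 1), while an edge whose |
| local slack equals criticality_denom contributes only 100^-1 = 1% of its weight. */ |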
| #endif |
| |
| static void update_slacks(t_slack * slacks, float criticality_denom, |
| bool update_slack, bool is_final_analysis, float smallest_slack_in_design, const t_timing_inf &timing_inf) { |
| |
| /* Updates the slack and criticality of each sink pin, or equivalently |
| each edge, of each net. Go through the list of nets. If the net's |
| driver tnode has been used, go through each tedge of that tnode and |
| take the minimum of the slack/criticality for this traversal and the |
| existing values. Since the criticality_denom can vary greatly between |
| traversals, we have to update slack and criticality separately. |
| Only update slack if we need to print it later (update_slack == true). |
| |
| Note: there is a correspondence in indexing between out_edges and the |
| net data structure: out_edges[iedge] = net[inet].node_block[iedge + 1] |
| There is an offset of 1 because net[inet].node_block includes the driver |
| node at index 0, while out_edges is part of the driver node and does |
| not bother to refer to itself. */ |
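| |
| /* Illustrative: for a net with driver D and sinks S1 and S2, node_block = {D, S1, S2} |
| while D's out_edges = { ->S1, ->S2 }, so out_edges[iedge] pairs with node_block[iedge + 1]; |
| this is why the slack and timing_criticality arrays below are indexed with [iedge + 1]. */ |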
| |
| int iedge, inode, to_node, num_edges; |
| t_tedge *tedge; |
| float T_arr, Tdel, T_req, slack; |
| float timing_criticality; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| for (size_t inet = 0; inet < num_timing_nets(); inet++) { |
| inode = f_net_to_driver_tnode[inet]; |
| T_arr = timing_ctx.tnodes[inode].T_arr; |
| |
| if (!(has_valid_T_arr(inode) && has_valid_T_req(inode))) { |
| continue; /* Only update this net on this traversal if its |
| driver node has been updated on this traversal. */ |
| } |
| |
| num_edges = timing_ctx.tnodes[inode].num_edges; |
| tedge = timing_ctx.tnodes[inode].out_edges; |
| |
| VTR_ASSERT_MSG(static_cast<size_t>(num_edges) == num_timing_net_sinks(inet), "Number of tnode edges and net sinks do not match"); |
| |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| if (!(has_valid_T_arr(to_node) && has_valid_T_req(to_node))) { |
| continue; /* Only update this edge on this traversal if this |
| particular sink node has been updated on this traversal. */ |
| } |
| Tdel = tedge[iedge].Tdel; |
| T_req = timing_ctx.tnodes[to_node].T_req; |
| |
| slack = T_req - T_arr - Tdel; |
| |
| if (!is_final_analysis) { |
| if (timing_inf.slack_definition == std::string("I")) { |
| /* Shift slack UPWARDS by subtracting the smallest slack in the |
| design (which is negative or zero). */ |
| slack -= smallest_slack_in_design; |
| } else if (timing_inf.slack_definition == std::string("C")) { |
| /* Clip all negative slacks to 0. */ |
| if (slack < 0) slack = 0; |
| } |
| } |
| |
| if (update_slack) { |
| /* Update the slack for this edge if slack is the new minimum value */ |
| slacks->slack[inet][iedge + 1] = min(slack, slacks->slack[inet][iedge + 1]); |
| } |
| |
| if (timing_inf.slack_definition != std::string("S") && timing_inf.slack_definition != std::string("G")) { |
| if (!is_final_analysis) { // Criticality is not meaningful using non-normalized slacks. |
| |
| /* Since criticality_denom is not the same on each traversal, |
| we have to update criticality separately. */ |
| |
| if (timing_inf.slack_definition == std::string("C")) { |
| /* For clipped, criticality_denom is the raw maximum required time, and this can |
| be 0 if the constraint was 0, leading to division by 0. In this case, all of the |
| slacks will be clipped to zero anyways, so we can just set the criticality to 1. */ |
| timing_criticality = criticality_denom ? 1 - slack / criticality_denom : 1; |
| } else { |
| // VTR_ASSERT(criticality_denom); |
| timing_criticality = 1 - slack / criticality_denom; |
| } |
| |
| // Criticality must be normalized to between 0 and 1 |
| // Note: floating-point error may be ~1e-5 since we are dividing by a small |
| // criticality_denom (~1e-9). |
| VTR_ASSERT(timing_criticality > -0.01); |
| VTR_ASSERT(timing_criticality < 1.01); |
| |
| slacks->timing_criticality[inet][iedge + 1] = |
| max(timing_criticality, slacks->timing_criticality[inet][iedge + 1]); |
| |
| #ifdef PATH_COUNTING |
| /* Also update path criticality separately. Kong uses slack / T_crit for the exponent, |
| which is equivalent to criticality - 1. Depending on how PATH_COUNTING is defined, |
| different functional forms are used. */ |
| #if PATH_COUNTING == 'S' /* Use sum of forward and backward weights. */ |
| slacks->path_criticality[inet][iedge + 1] = max(slacks->path_criticality[inet][iedge + 1], |
| (timing_ctx.tnodes[inode].forward_weight + timing_ctx.tnodes[to_node].backward_weight) * |
| pow((float) FINAL_DISCOUNT_FUNCTION_BASE, timing_criticality - 1)); |
| #elif PATH_COUNTING == 'P' /* Use product of forward and backward weights. */ |
| slacks->path_criticality[inet][iedge + 1] = max(slacks->path_criticality[inet][iedge + 1], |
| timing_ctx.tnodes[inode].forward_weight * timing_ctx.tnodes[to_node].backward_weight * |
| pow((float) FINAL_DISCOUNT_FUNCTION_BASE, timing_criticality - 1)); |
| #elif PATH_COUNTING == 'L' /* Use natural log of product of forward and backward weights. */ |
| slacks->path_criticality[inet][iedge + 1] = max(slacks->path_criticality[inet][iedge + 1], |
| log(timing_ctx.tnodes[inode].forward_weight * timing_ctx.tnodes[to_node].backward_weight) * |
| pow((float) FINAL_DISCOUNT_FUNCTION_BASE, timing_criticality - 1)); |
| #elif PATH_COUNTING == 'R' /* Use product of natural logs of forward and backward weights. */ |
| slacks->path_criticality[inet][iedge + 1] = max(slacks->path_criticality[inet][iedge + 1], |
| log(timing_ctx.tnodes[inode].forward_weight) * log(timing_ctx.tnodes[to_node].backward_weight) * |
| pow((float) FINAL_DISCOUNT_FUNCTION_BASE, timing_criticality - 1)); |
| #endif |
| #endif |
| } |
| } |
| } |
| } |
| } |
| |
| |
| void print_critical_path(const char *fname, const t_timing_inf &timing_inf) { |
| |
| /* Prints the critical path to a file. */ |
| |
| vtr::t_linked_int *critical_path_head, *critical_path_node; |
| FILE *fp; |
| int nets_on_crit_path, tnodes_on_crit_path, inode; |
| e_tnode_type type; |
| float total_net_delay, total_logic_delay, Tdel; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| vtr::vector_map<ClusterBlockId, t_pb **> pin_id_to_pb_mapping = alloc_and_load_pin_id_to_pb_mapping(); |
| |
| critical_path_head = allocate_and_load_critical_path(timing_inf); |
| critical_path_node = critical_path_head; |
| |
| fp = vtr::fopen(fname, "w"); |
| |
| nets_on_crit_path = 0; |
| tnodes_on_crit_path = 0; |
| total_net_delay = 0.; |
| total_logic_delay = 0.; |
| |
| while (critical_path_node != nullptr) { |
| Tdel = print_critical_path_node(fp, critical_path_node, pin_id_to_pb_mapping); |
| inode = critical_path_node->data; |
| type = timing_ctx.tnodes[inode].type; |
| tnodes_on_crit_path++; |
| |
| if (type == TN_CB_OPIN) { |
| nets_on_crit_path++; |
| |
| total_net_delay += Tdel; |
| } else { |
| total_logic_delay += Tdel; |
| } |
| |
| critical_path_node = critical_path_node->next; |
| } |
| |
| fprintf(fp, "\nTnodes on critical path: %d Nets on critical path: %d." |
| "\n", tnodes_on_crit_path, nets_on_crit_path); |
| fprintf(fp, "Total logic delay: %g (s) Total net delay: %g (s)\n", |
| total_logic_delay, total_net_delay); |
| |
| vtr::printf_info("Nets on critical path: %d normal.\n", |
| nets_on_crit_path); |
| |
| vtr::printf_info("Total logic delay: %g (s), total net delay: %g (s)\n", |
| total_logic_delay, total_net_delay); |
| |
| /* Make sure total_logic_delay and total_net_delay add |
| up to the critical path delay, within 5 decimal places. */ |
| VTR_ASSERT(std::isnan(get_critical_path_delay()) || total_logic_delay + total_net_delay - get_critical_path_delay()/1e9 < 1e-5); |
| |
| fclose(fp); |
| free_pin_id_to_pb_mapping(pin_id_to_pb_mapping); |
| free_int_list(&critical_path_head); |
| } |
| |
| vtr::t_linked_int * allocate_and_load_critical_path(const t_timing_inf &timing_inf) { |
| |
| /* Finds the critical path and returns a list of the tnodes on the critical * |
| * path in a linked list, from the path SOURCE to the path SINK. */ |
| |
| vtr::t_linked_int *critical_path_head, *curr_crit_node, *prev_crit_node; |
| int inode, iedge, to_node, num_at_level_zero, i, j, crit_node = OPEN, num_edges; |
| int source_clock_domain = UNDEFINED, sink_clock_domain = UNDEFINED; |
| float min_slack = HUGE_POSITIVE_FLOAT, slack; |
| t_tedge *tedge; |
| vtr::vector_map<ClusterBlockId, t_pb **> empty_pin_id_to_pb_mapping; //Empty vector_map for do_timing_analysis_for_constraint |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| /* If there's only one clock, we can use the arrival and required times |
| currently on the timing graph to find the critical path. If there are multiple |
| clocks, however, the values currently on the timing graph correspond to the |
| last constraint (pair of clock domains) analysed, which may not be the constraint |
| with the critical path. In this case, we have to find the constraint with the |
| least slack and redo the timing analysis for this constraint so we get the right |
| values onto the timing graph. */ |
| |
| if (timing_ctx.sdc->num_constrained_clocks > 1) { |
| /* The critical path belongs to the source and sink clock domains |
| with the least slack. Find these clock domains now. */ |
| |
| for (i = 0; i < timing_ctx.sdc->num_constrained_clocks; i++) { |
| for (j = 0; j < timing_ctx.sdc->num_constrained_clocks; j++) { |
| // Use the true, unrelaxed, least slack (constraint - critical path delay). |
| slack = timing_ctx.sdc->domain_constraint[i][j] - f_timing_stats->cpd[i][j]; |
| if (slack < min_slack) { |
| min_slack = slack; |
| source_clock_domain = i; |
| sink_clock_domain = j; |
| } |
| } |
| } |
| |
| /* Do a timing analysis for this clock domain pair only. |
| Set is_prepacked to false since we don't care about the clusterer's normalized values. |
| Set is_final_analysis to false to get actual, rather than relaxed, slacks. |
| Set max critical input/output paths to NULL since they aren't used unless is_prepacked is true. */ |
| do_timing_analysis_for_constraint(source_clock_domain, sink_clock_domain, false, false, nullptr, nullptr, empty_pin_id_to_pb_mapping, timing_inf); |
| } |
| |
| /* Start at the source (level-0) tnode with the least slack (T_req-T_arr). |
| This will form the head of our linked list of tnodes on the critical path. */ |
| min_slack = HUGE_POSITIVE_FLOAT; |
| num_at_level_zero = timing_ctx.tnodes_at_level[0].size(); |
| for (i = 0; i < num_at_level_zero; i++) { |
| inode = timing_ctx.tnodes_at_level[0][i]; |
| if (has_valid_T_arr(inode) && has_valid_T_req(inode)) { /* Valid arrival and required times */ |
| slack = timing_ctx.tnodes[inode].T_req - timing_ctx.tnodes[inode].T_arr; |
| if (slack < min_slack) { |
| crit_node = inode; |
| min_slack = slack; |
| } |
| } |
| } |
| VTR_ASSERT(crit_node != OPEN); |
| critical_path_head = (vtr::t_linked_int *) vtr::malloc(sizeof(vtr::t_linked_int)); |
| critical_path_head->data = crit_node; |
| prev_crit_node = critical_path_head; |
| num_edges = timing_ctx.tnodes[crit_node].num_edges; |
| |
| /* Keep adding the tnode in this tnode's fanout which has the least slack |
| to our critical path linked list, then jump to that tnode and repeat, until |
| we hit a tnode with no edges, which is the sink of the critical path. */ |
| while (num_edges != 0) { |
| curr_crit_node = (vtr::t_linked_int *) vtr::malloc(sizeof(vtr::t_linked_int)); |
| prev_crit_node->next = curr_crit_node; |
| tedge = timing_ctx.tnodes[crit_node].out_edges; |
| min_slack = HUGE_POSITIVE_FLOAT; |
| |
| for (iedge = 0; iedge < num_edges; iedge++) { |
| to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| if (has_valid_T_arr(to_node) && has_valid_T_req(to_node)) { /* Valid arrival and required times */ |
| slack = timing_ctx.tnodes[to_node].T_req - timing_ctx.tnodes[to_node].T_arr; |
| if (slack < min_slack) { |
| crit_node = to_node; |
| min_slack = slack; |
| } |
| } |
| } |
| |
| curr_crit_node->data = crit_node; |
| prev_crit_node = curr_crit_node; |
| num_edges = timing_ctx.tnodes[crit_node].num_edges; |
| } |
| |
| prev_crit_node->next = nullptr; |
| return (critical_path_head); |
| } |
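| |
| /* Standalone sketch (illustrative only; reuses the hypothetical SketchEdge type from the |
| earlier sketch) of the greedy walk above: starting from the least-slack source, repeatedly |
| step to the fanout node with the least slack until a node with no fanout - the path sink - |
| is reached. */ |
| #if 0 |
| std::vector<int> trace_critical_path(int src, |
| const std::vector<std::vector<SketchEdge>>& out, |
| const std::vector<float>& T_arr, |
| const std::vector<float>& T_req) { |
| std::vector<int> path{src}; |
| int node = src; |
| while (!out[node].empty()) { |
| int best = out[node][0].to; |
| for (const SketchEdge& e : out[node]) /* pick the least-slack child */ |
| if (T_req[e.to] - T_arr[e.to] < T_req[best] - T_arr[best]) best = e.to; |
| path.push_back(node = best); |
| } |
| return path; |
| } |
| #endif |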
| |
| #ifndef PATH_COUNTING |
| static void update_normalized_costs(float criticality_denom, long max_critical_input_paths, |
| long max_critical_output_paths, const t_timing_inf &timing_inf) { |
| int inode; |
| |
| /* Update the normalized costs for the clusterer. On each traversal, each cost is |
| updated for tnodes analysed on this traversal if it would give this tnode a higher |
| criticality when calculating block criticality for the clusterer. */ |
| |
| if (timing_inf.slack_definition == std::string("R") || timing_inf.slack_definition == std::string("I")) { |
| /*VTR_ASSERT(criticality_denom != 0); */ |
| /* Possible if timing analysis is being run pre-packing |
| with all delays set to 0. This is not currently done, |
| but if you're going to do it, you need to decide how |
| best to normalize these values to suit your purposes. */ |
| } |
| |
| auto& timing_ctx = g_vpr_ctx.mutable_timing(); |
| |
| for (inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| /* Only calculate for tnodes which have valid arrival and required times. */ |
| if (has_valid_T_arr(inode) && has_valid_T_req(inode)) { |
| timing_ctx.tnodes[inode].prepacked_data->normalized_slack = min(timing_ctx.tnodes[inode].prepacked_data->normalized_slack, |
| (timing_ctx.tnodes[inode].T_req - timing_ctx.tnodes[inode].T_arr)/criticality_denom); |
| timing_ctx.tnodes[inode].prepacked_data->normalized_T_arr = max(timing_ctx.tnodes[inode].prepacked_data->normalized_T_arr, |
| timing_ctx.tnodes[inode].T_arr/criticality_denom); |
| timing_ctx.tnodes[inode].prepacked_data->normalized_total_critical_paths = max(timing_ctx.tnodes[inode].prepacked_data->normalized_total_critical_paths, |
| ((float) timing_ctx.tnodes[inode].prepacked_data->num_critical_input_paths + timing_ctx.tnodes[inode].prepacked_data->num_critical_output_paths) / |
| ((float) max_critical_input_paths + max_critical_output_paths)); |
| } |
| } |
| } |
| #endif |
| |
| |
| static void load_clock_domain_and_clock_and_io_delay(bool is_prepacked, vtr::vector_map<ClusterBlockId, std::vector<int>> &lookup_tnode_from_pin_id, vtr::vector_map<ClusterBlockId, t_pb **> &pin_id_to_pb_mapping) { |
| /* Loads clock domain and clock delay onto TN_FF_SOURCE and TN_FF_SINK tnodes. |
| The clock domain of each clock is its index in timing_ctx.sdc->constrained_clocks. |
| We do this by matching each clock input pad (TN_INPAD_SOURCE), or internal clock |
| source (TN_CLOCK_SOURCE) to a constrained clock name, then propagating forward its |
| domain index to all flip-flops fed by it (TN_FF_CLOCK tnodes), then later looking |
| up the TN_FF_CLOCK tnode corresponding to every TN_FF_SOURCE and TN_FF_SINK tnode. |
| We also add up the delays along the clock net to each TN_FF_CLOCK tnode to give it |
| (and the SOURCE/SINK nodes) a clock delay. |
| |
| Also loads input delay/output delay (from set_input_delay or set_output_delay SDC |
| constraints) onto the tedges between TN_INPAD_SOURCE/OPIN and TN_OUTPAD_IPIN/SINK |
| tnodes. Finds fanout of each clock domain, including virtual (external) clocks. |
| Marks unconstrained I/Os with a dummy clock domain (-1). */ |
| |
| int i, iclock, inode, num_at_level, clock_index, input_index, output_index; |
| const char * net_name; |
| t_tnode * clock_node; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| /* Wipe fanout of each clock domain in timing_ctx.sdc->constrained_clocks. */ |
| for (iclock = 0; iclock < timing_ctx.sdc->num_constrained_clocks; iclock++) { |
| timing_ctx.sdc->constrained_clocks[iclock].fanout = 0; |
| } |
| |
| /* First, visit all TN_INPAD_SOURCE and TN_CLOCK_SOURCE tnodes |
| * We only need to check the first level of the timing graph, since all SOURCEs should appear there. */ |
| if(!timing_ctx.tnodes_at_level.empty()) { |
| num_at_level = timing_ctx.tnodes_at_level[0].size(); /* There are num_at_level top-level tnodes. */ |
| } else { |
| num_at_level = 0; |
| } |
| for(i = 0; i < num_at_level; i++) { |
| inode = timing_ctx.tnodes_at_level[0][i]; /* Iterate through each tnode. inode is the index of the tnode in the array tnode. */ |
| if (timing_ctx.tnodes[inode].type == TN_INPAD_SOURCE || timing_ctx.tnodes[inode].type == TN_CLOCK_SOURCE) { /* See if this node is the start of an I/O pad, or a clock source. */ |
| net_name = find_tnode_net_name(inode, is_prepacked, pin_id_to_pb_mapping); |
| if ((clock_index = find_clock(net_name)) != -1) { /* We have a clock inpad or clock source. */ |
| /* Set clock skew to 0 at the source and propagate skew |
| recursively to all connected nodes, adding delay as we go. |
| Set the clock domain to the index of the clock in the |
| timing_ctx.sdc->constrained_clocks array and propagate unchanged. */ |
| timing_ctx.tnodes[inode].clock_delay = 0.; |
| timing_ctx.tnodes[inode].clock_domain = clock_index; |
| propagate_clock_domain_and_skew(inode); |
| |
| /* Set the clock domain of this clock inpad to -1, so that we do not analyse it. |
| If we did not do this, the clock net would be analysed on the same iteration of |
| the timing analyzer as the flip-flops it drives! */ |
| timing_ctx.tnodes[inode].clock_domain = DO_NOT_ANALYSE; |
| |
| } else if (timing_ctx.tnodes[inode].type == TN_INPAD_SOURCE && (input_index = find_input(net_name)) != -1) { |
| /* We have a constrained non-clock inpad - find the clock it's constrained on. */ |
| clock_index = find_clock(timing_ctx.sdc->constrained_inputs[input_index].clock_name); |
| VTR_ASSERT(clock_index != -1); |
| /* The clock domain for this input is that of its virtual clock */ |
| timing_ctx.tnodes[inode].clock_domain = clock_index; |
| /* Increment the fanout of this virtual clock domain. */ |
| timing_ctx.sdc->constrained_clocks[clock_index].fanout++; |
| /* Mark input delay specified in SDC file on the timing graph edge leading out from the TN_INPAD_SOURCE node. */ |
| timing_ctx.tnodes[inode].out_edges[0].Tdel = timing_ctx.sdc->constrained_inputs[input_index].delay * 1e-9; /* normalize to be in seconds not ns */ |
| } else { /* We have an unconstrained input - mark with dummy clock domain and do not analyze. */ |
| timing_ctx.tnodes[inode].clock_domain = DO_NOT_ANALYSE; |
| } |
| } |
| } |
| |
| /* Second, visit all TN_OUTPAD_SINK tnodes. Unlike for TN_INPAD_SOURCE tnodes, |
| we have to search the entire tnode array since these are all on different levels. */ |
| for (inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| if (timing_ctx.tnodes[inode].type == TN_OUTPAD_SINK) { |
| /* Since the pb_graph_pin of TN_OUTPAD_SINK tnodes points to NULL, we have to find the net |
| from the pb_graph_pin of the corresponding TN_OUTPAD_IPIN node. |
| Exploit the fact that the TN_OUTPAD_IPIN node will always be one prior in the tnode array. */ |
| VTR_ASSERT(timing_ctx.tnodes[inode - 1].type == TN_OUTPAD_IPIN); |
| net_name = find_tnode_net_name(inode, is_prepacked, pin_id_to_pb_mapping); |
| output_index = find_output(net_name + 4); /* the + 4 removes the prefix "out:" automatically prepended to outputs */ |
| if (output_index != -1) { |
| /* We have a constrained outpad, find the clock it's constrained on. */ |
| clock_index = find_clock(timing_ctx.sdc->constrained_outputs[output_index].clock_name); |
| VTR_ASSERT(clock_index != -1); |
| /* The clock domain for this output is that of its virtual clock */ |
| timing_ctx.tnodes[inode].clock_domain = clock_index; |
| /* Increment the fanout of this virtual clock domain. */ |
| timing_ctx.sdc->constrained_clocks[clock_index].fanout++; |
| /* Mark output delay specified in SDC file on the timing graph edge leading into the TN_OUTPAD_SINK node. |
| However, this edge is part of the corresponding TN_OUTPAD_IPIN node. |
| Exploit the fact that the TN_OUTPAD_IPIN node will always be one prior in the tnode array. */ |
| timing_ctx.tnodes[inode - 1].out_edges[0].Tdel = timing_ctx.sdc->constrained_outputs[output_index].delay * 1e-9; /* normalize to be in seconds not ns */ |
| |
| } else { /* We have an unconstrained output - mark with dummy clock domain and do not analyze. */ |
| timing_ctx.tnodes[inode].clock_domain = DO_NOT_ANALYSE; |
| } |
| } |
| } |
| |
| /* Third, visit all TN_FF_SOURCE and TN_FF_SINK tnodes, and transfer the clock domain and skew from their corresponding TN_FF_CLOCK tnodes*/ |
| for (inode = 0; inode < timing_ctx.num_tnodes; inode++) { |
| if (timing_ctx.tnodes[inode].type == TN_FF_SOURCE || timing_ctx.tnodes[inode].type == TN_FF_SINK) { |
| clock_node = find_ff_clock_tnode(inode, is_prepacked, lookup_tnode_from_pin_id); |
| timing_ctx.tnodes[inode].clock_domain = clock_node->clock_domain; |
| timing_ctx.tnodes[inode].clock_delay = clock_node->clock_delay; |
| } |
| } |
| } |
| |
| static void propagate_clock_domain_and_skew(int inode) { |
| /* Given a tnode indexed by inode (which is part of a clock net), |
| * propagate forward the clock domain (unchanged) and skew (adding the delay of edges) to all nodes in its fanout. |
| * We then call recursively on all children in a depth-first search. If num_edges is 0, we should be at a TN_FF_CLOCK tnode; |
| * we implicitly rely on a tnode not being part of two separate clock nets, since undefined behaviour would result if one DFS |
| * overwrote the results of another. This may be problematic in cases of multiplexed or locally-generated clocks. */ |
| |
| int iedge, to_node; |
| t_tedge * tedge; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| tedge = timing_ctx.tnodes[inode].out_edges; /* Get the list of edges from the node we're visiting. */ |
| |
| if (!tedge) { /* Leaf/sink node; base case of the recursion. */ |
| if(timing_ctx.tnodes[inode].type != TN_FF_CLOCK && timing_ctx.tnodes[inode].type != TN_OUTPAD_SINK) { |
| vtr::printf_warning(__FILE__, __LINE__, "tnode %d appears to take clock as a data input\n", inode); |
| return; |
| } |
| |
| VTR_ASSERT(timing_ctx.tnodes[inode].clock_domain != -1); /* Make sure clock domain is valid. */ |
| |
| timing_ctx.sdc->constrained_clocks[timing_ctx.tnodes[inode].clock_domain].fanout++; |
| return; |
| } |
| |
| for (iedge = 0; iedge < timing_ctx.tnodes[inode].num_edges; iedge++) { /* Go through each edge coming out from this tnode */ |
| to_node = tedge[iedge].to_node; |
| if(to_node == DO_NOT_ANALYSE) continue; //Skip marked invalid nodes |
| |
| /* Propagate clock skew forward along this clock net, adding the delay of the wires (edges) of the clock network. */ |
| timing_ctx.tnodes[to_node].clock_delay = timing_ctx.tnodes[inode].clock_delay + tedge[iedge].Tdel; |
| /* Propagate clock domain forward unchanged */ |
| timing_ctx.tnodes[to_node].clock_domain = timing_ctx.tnodes[inode].clock_domain; |
| /* Finally, call recursively on the destination tnode. */ |
| propagate_clock_domain_and_skew(to_node); |
| } |
| } |
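| |
| /* Illustrative numbers: if a clock source reaches a TN_FF_CLOCK tnode through wire edges of |
| 0.5 ns and 0.7 ns, that flip-flop's clock_delay (skew) becomes 1.2 ns, while its |
| clock_domain index is inherited from the source unchanged. */ |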
| |
| static const char * find_tnode_net_name(int inode, bool is_prepacked, vtr::vector_map<ClusterBlockId, t_pb **> &pin_id_to_pb_mapping) { |
| /* Finds the name of the net which a tnode (inode) is on (different for pre-/post-packed netlists). */ |
| |
| t_pb_graph_pin * pb_graph_pin; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| if (is_prepacked) { |
| //We only store pin mappings for tnodes which correspond to netlist pins, |
| //so we need to convert sources/sinks to their relevant pin tnodes before |
| //looking up the netlist pin/net |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| int inode_pin = OPEN; |
| if(timing_ctx.tnodes[inode].type == TN_INPAD_SOURCE || timing_ctx.tnodes[inode].type == TN_FF_SOURCE || timing_ctx.tnodes[inode].type == TN_CLOCK_SOURCE) { |
| VTR_ASSERT(timing_ctx.tnodes[inode].num_edges == 1); |
| inode_pin = timing_ctx.tnodes[inode].out_edges[0].to_node; |
| } else if (timing_ctx.tnodes[inode].type == TN_OUTPAD_SINK || timing_ctx.tnodes[inode].type == TN_FF_SINK) { |
| inode_pin = inode - 1; |
| VTR_ASSERT(timing_ctx.tnodes[inode_pin].out_edges[0].to_node == inode); |
| } else { |
| inode_pin = inode; |
| } |
| VTR_ASSERT(inode_pin != OPEN); |
| |
| AtomPinId pin_id = atom_ctx.lookup.classic_tnode_atom_pin(inode_pin); |
| VTR_ASSERT(pin_id); |
| |
| if(timing_ctx.tnodes[inode].type == TN_INPAD_SOURCE || timing_ctx.tnodes[inode].type == TN_INPAD_OPIN || |
| timing_ctx.tnodes[inode].type == TN_OUTPAD_SINK || timing_ctx.tnodes[inode].type == TN_OUTPAD_IPIN) { |
| AtomBlockId blk_id = atom_ctx.nlist.pin_block(pin_id); |
| |
| //For input/output pads the net name is the same as the block name |
| return atom_ctx.nlist.block_name(blk_id).c_str(); |
| } else { |
| AtomNetId net_id = atom_ctx.nlist.pin_net(pin_id); |
| |
| return atom_ctx.nlist.net_name(net_id).c_str(); |
| } |
| } else { |
| auto& cluster_ctx = g_vpr_ctx.clustering(); |
| |
| ClusterBlockId iblock = timing_ctx.tnodes[inode].block; |
| if(timing_ctx.tnodes[inode].type == TN_INPAD_SOURCE || timing_ctx.tnodes[inode].type == TN_INPAD_OPIN || |
| timing_ctx.tnodes[inode].type == TN_OUTPAD_SINK || timing_ctx.tnodes[inode].type == TN_OUTPAD_IPIN) { |
| //For input/output pads the net name is the same as the block name |
| pb_graph_pin = timing_ctx.tnodes[inode].pb_graph_pin; |
| return pin_id_to_pb_mapping[iblock][pb_graph_pin->pin_count_in_cluster]->name; |
| } else { |
| //We need to find the TN_CB_OPIN/TN_CB_IPIN that this node drives, so that we can look up |
| //the net name in the global clb netlist |
| int inode_search = inode; |
| while(timing_ctx.tnodes[inode_search].type != TN_CB_IPIN && timing_ctx.tnodes[inode_search].type != TN_CB_OPIN ) { |
| //Assume we don't have any internal fanout inside the block |
| // Should be true for most source-types |
| VTR_ASSERT(timing_ctx.tnodes[inode_search].num_edges == 1); |
| inode_search = timing_ctx.tnodes[inode_search].out_edges[0].to_node; |
| } |
| VTR_ASSERT(timing_ctx.tnodes[inode_search].type == TN_CB_IPIN || timing_ctx.tnodes[inode_search].type == TN_CB_OPIN); |
| |
| //Now that we have a complex block pin, we can look-up its connected net in the CLB netlist |
| pb_graph_pin = timing_ctx.tnodes[inode_search].pb_graph_pin; |
| int ipin = pb_graph_pin->pin_count_in_cluster; //Pin into the CLB |
| |
| ClusterNetId net_id = cluster_ctx.clb_nlist.block_net(timing_ctx.tnodes[inode_search].block, ipin); //Net index into the clb netlist |
| return cluster_ctx.clb_nlist.net_name(net_id).c_str(); |
| } |
| } |
| } |
| |
| static t_tnode * find_ff_clock_tnode(int inode, bool is_prepacked, vtr::vector_map<ClusterBlockId, std::vector<int>> &lookup_tnode_from_pin_id) { |
| /* Finds the TN_FF_CLOCK tnode on the same flipflop as an TN_FF_SOURCE or TN_FF_SINK tnode. */ |
| |
| t_tnode * ff_clock_tnode; |
| int ff_tnode; |
| t_pb_graph_node * parent_pb_graph_node; |
| t_pb_graph_pin * ff_source_or_sink_pb_graph_pin, * clock_pb_graph_pin; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| if (is_prepacked) { |
| //TODO: handle multiple clocks |
| |
| int i_pin_tnode = OPEN; |
| if(timing_ctx.tnodes[inode].type == TN_FF_SOURCE) { |
| VTR_ASSERT(timing_ctx.tnodes[inode].num_edges == 1); |
| |
| i_pin_tnode = timing_ctx.tnodes[inode].out_edges[0].to_node; |
| VTR_ASSERT(timing_ctx.tnodes[i_pin_tnode].type == TN_FF_OPIN); |
| } else if (timing_ctx.tnodes[inode].type == TN_FF_SINK) { |
| |
| i_pin_tnode = inode - 1; |
| |
| VTR_ASSERT(timing_ctx.tnodes[i_pin_tnode].type == TN_FF_IPIN); |
| VTR_ASSERT(timing_ctx.tnodes[i_pin_tnode].num_edges == 1); |
| VTR_ASSERT(timing_ctx.tnodes[i_pin_tnode].out_edges[0].to_node == inode); |
| } else { |
| VTR_ASSERT(false); |
| } |
| |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| auto pin_id = atom_ctx.lookup.classic_tnode_atom_pin(i_pin_tnode); |
| VTR_ASSERT(pin_id); |
| |
| auto blk_id = atom_ctx.nlist.pin_block(pin_id); |
| VTR_ASSERT(blk_id); |
| |
| //Get the single clock pin |
| auto clock_pins = atom_ctx.nlist.block_clock_pins(blk_id); |
| VTR_ASSERT(clock_pins.size() == 1); |
| auto clock_pin_id = *clock_pins.begin(); |
| |
| //Get the associated tnode index |
| int i_ff_clock = atom_ctx.lookup.atom_pin_classic_tnode(clock_pin_id); |
| VTR_ASSERT(i_ff_clock != OPEN); |
| |
| ff_clock_tnode = &timing_ctx.tnodes[i_ff_clock]; |
| |
| } else { |
| ClusterBlockId iblock = timing_ctx.tnodes[inode].block; |
| ff_source_or_sink_pb_graph_pin = timing_ctx.tnodes[inode].pb_graph_pin; |
| parent_pb_graph_node = ff_source_or_sink_pb_graph_pin->parent_node; |
| /* Make sure there's only one clock port and only one clock pin in that port */ |
| VTR_ASSERT(parent_pb_graph_node->num_clock_ports == 1); |
| VTR_ASSERT(parent_pb_graph_node->num_clock_pins[0] == 1); |
| clock_pb_graph_pin = &parent_pb_graph_node->clock_pins[0][0]; |
| ff_tnode = lookup_tnode_from_pin_id[iblock][clock_pb_graph_pin->pin_count_in_cluster]; |
| VTR_ASSERT(ff_tnode != OPEN); |
| ff_clock_tnode = &timing_ctx.tnodes[ff_tnode]; |
| } |
| VTR_ASSERT(ff_clock_tnode != nullptr); |
| VTR_ASSERT(ff_clock_tnode->type == TN_FF_CLOCK); |
| return ff_clock_tnode; |
| } |
| |
| static int find_clock(const char * net_name) { |
| /* Given a string net_name, find whether it's the name of a clock in the array timing_ctx.sdc->constrained_clocks. * |
| * if it is, return the clock's index in timing_ctx.sdc->constrained_clocks; if it's not, return -1. */ |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| for (int index = 0; index < timing_ctx.sdc->num_constrained_clocks; index++) { |
| if (strcmp(net_name, timing_ctx.sdc->constrained_clocks[index].name) == 0) { |
| return index; |
| } |
| } |
| return -1; |
| } |
| |
| static int find_input(const char * net_name) { |
| /* Given a string net_name, find whether it's the name of a constrained input in the array timing_ctx.sdc->constrained_inputs. * |
| * if it is, return its index in timing_ctx.sdc->constrained_inputs; if it's not, return -1. */ |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| for (int index = 0; index < timing_ctx.sdc->num_constrained_inputs; index++) { |
| if (strcmp(net_name, timing_ctx.sdc->constrained_inputs[index].name) == 0) { |
| return index; |
| } |
| } |
| return -1; |
| } |
| |
| static int find_output(const char * net_name) { |
| /* Given a string net_name, find whether it's the name of a constrained output in the array timing_ctx.sdc->constrained_outputs. * |
| * if it is, return its index in timing_ctx.sdc->constrained_outputs; if it's not, return -1. */ |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| for (int index = 0; index < timing_ctx.sdc->num_constrained_outputs; index++) { |
| if (strcmp(net_name, timing_ctx.sdc->constrained_outputs[index].name) == 0) { |
| return index; |
| } |
| } |
| return -1; |
| } |
| |
| static int find_cf_constraint(const char * source_clock_name, const char * sink_ff_name) { |
| /* Given a source clock domain and a sink flip-flop, find out if there's an override constraint between them. |
| If there is, return the index in timing_ctx.sdc->cf_constraints; if there is not, return -1. */ |
| int icf, isource, isink; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| for (icf = 0; icf < timing_ctx.sdc->num_cf_constraints; icf++) { |
| for (isource = 0; isource < timing_ctx.sdc->cf_constraints[icf].num_source; isource++) { |
| if (strcmp(timing_ctx.sdc->cf_constraints[icf].source_list[isource], source_clock_name) == 0) { |
| for (isink = 0; isink < timing_ctx.sdc->cf_constraints[icf].num_sink; isink++) { |
| if (strcmp(timing_ctx.sdc->cf_constraints[icf].sink_list[isink], sink_ff_name) == 0) { |
| return icf; |
| } |
| } |
| } |
| } |
| } |
| return -1; |
| } |
| |
| static inline bool has_valid_T_arr(int inode) { |
| /* Has this tnode's arrival time been changed from its original value of HUGE_NEGATIVE_FLOAT? */ |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| return (bool) (timing_ctx.tnodes[inode].T_arr > HUGE_NEGATIVE_FLOAT + 1); |
| } |
| |
| static inline bool has_valid_T_req(int inode) { |
| /* Has this tnode's required time been changed from its original value of HUGE_POSITIVE_FLOAT? */ |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| return (bool) (timing_ctx.tnodes[inode].T_req < HUGE_POSITIVE_FLOAT - 1); |
| } |
| |
| #ifndef PATH_COUNTING |
| bool has_valid_normalized_T_arr(int inode) { |
| /* Has this tnode's normalized_T_arr been changed from its original value of HUGE_NEGATIVE_FLOAT? */ |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| return (bool) (timing_ctx.tnodes[inode].prepacked_data->normalized_T_arr > HUGE_NEGATIVE_FLOAT + 1); |
| } |
| #endif |
| |
| float get_critical_path_delay() { |
| /* Finds the critical path delay, which is the minimum clock period required to meet the constraint |
| corresponding to the pair of source and sink clock domains with the least slack in the design. */ |
| |
| int source_clock_domain, sink_clock_domain; |
| float least_slack_in_design = HUGE_POSITIVE_FLOAT, critical_path_delay = NAN; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| auto& device_ctx = g_vpr_ctx.device(); |
| |
| if (!timing_ctx.sdc) return UNDEFINED; /* If timing analysis is off, for instance. */ |
| |
| for (source_clock_domain = 0; source_clock_domain < timing_ctx.sdc->num_constrained_clocks; source_clock_domain++) { |
| for (sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| if (least_slack_in_design > f_timing_stats->least_slack[source_clock_domain][sink_clock_domain]) { |
| least_slack_in_design = f_timing_stats->least_slack[source_clock_domain][sink_clock_domain]; |
| critical_path_delay = f_timing_stats->cpd[source_clock_domain][sink_clock_domain]; |
| } |
| } |
| } |
| if (device_ctx.pb_max_internal_delay != UNDEFINED && device_ctx.pb_max_internal_delay > critical_path_delay) { |
| critical_path_delay = device_ctx.pb_max_internal_delay; |
| } |
| |
| return critical_path_delay * 1e9; /* Convert to nanoseconds */ |
| } |
| |
| void print_timing_stats() { |
| |
| /* Prints critical path delay/fmax, least slack in design, and, for multiple-clock designs, |
| minimum required clock period to meet each constraint, least slack per constraint, |
| geometric average clock frequency, and fanout-weighted geometric average clock frequency. */ |
| |
| int source_clock_domain, sink_clock_domain, clock_domain, fanout, total_fanout = 0, |
| num_netlist_clocks_with_intra_domain_paths = 0; |
| float geomean_period = 1., least_slack_in_design = HUGE_POSITIVE_FLOAT, critical_path_delay = UNDEFINED; |
| double fanout_weighted_geomean_period = 0.; |
| bool found; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| for (source_clock_domain = 0; source_clock_domain < timing_ctx.sdc->num_constrained_clocks; source_clock_domain++) { |
| for (sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| if (least_slack_in_design > f_timing_stats->least_slack[source_clock_domain][sink_clock_domain]) { |
| least_slack_in_design = f_timing_stats->least_slack[source_clock_domain][sink_clock_domain]; |
| } |
| } |
| } |
| |
| /* Find critical path delay. If the device_ctx.pb_max_internal_delay is greater than this, it becomes |
| the limiting factor on critical path delay, so print that instead, with a special message. */ |
| critical_path_delay = get_critical_path_delay(); |
| vtr::printf_info("Final critical path: %g ns", critical_path_delay); |
| |
| if (timing_ctx.sdc->num_constrained_clocks <= 1) { |
| /* Although critical path delay is always well-defined, it doesn't make sense to talk about fmax for multi-clock circuits */ |
| vtr::printf_direct(", f_max: %g MHz", 1e3 / critical_path_delay); |
| } |
| vtr::printf_direct("\n"); |
| |
| /* Also print the least slack in the design */ |
| vtr::printf_info("\n"); |
| vtr::printf_info("Least slack in design: %g ns\n", 1e9 * least_slack_in_design); |
| vtr::printf_info("\n"); |
| |
| if (timing_ctx.sdc->num_constrained_clocks > 1) { /* Multiple-clock design */ |
| |
| /* Print minimum possible clock period to meet each constraint. Convert to nanoseconds. */ |
| |
| vtr::printf_info("Minimum possible clock period to meet each constraint (including skew effects):\n"); |
| for (source_clock_domain = 0; source_clock_domain < timing_ctx.sdc->num_constrained_clocks; source_clock_domain++) { |
| /* Print the intra-domain constraint if it was analysed. */ |
| if (timing_ctx.sdc->domain_constraint[source_clock_domain][source_clock_domain] > NEGATIVE_EPSILON) { |
| vtr::printf_info("%s to %s: %g ns (%g MHz)\n", |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name, |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name, |
| 1e9 * f_timing_stats->cpd[source_clock_domain][source_clock_domain], |
| 1e-6 / f_timing_stats->cpd[source_clock_domain][source_clock_domain]); |
| } else { |
| vtr::printf_info("%s to %s: --\n", |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name, |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name); |
| } |
| /* Then, print all other constraints on separate lines, indented. We re-print |
| the source clock domain's name so there's no ambiguity when parsing. */ |
| for (sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| if (source_clock_domain == sink_clock_domain) continue; /* already done that */ |
| if (timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain] > NEGATIVE_EPSILON) { |
| /* If this domain pair was analysed */ |
| vtr::printf_info("\t%s to %s: %g ns (%g MHz)\n", |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name, |
| timing_ctx.sdc->constrained_clocks[sink_clock_domain].name, |
| 1e9 * f_timing_stats->cpd[source_clock_domain][sink_clock_domain], |
| 1e-6 / f_timing_stats->cpd[source_clock_domain][sink_clock_domain]); |
| } else { |
| vtr::printf_info("\t%s to %s: --\n", |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name, |
| timing_ctx.sdc->constrained_clocks[sink_clock_domain].name); |
| } |
| } |
| } |
| |
| /* Print least slack per constraint. */ |
| |
| vtr::printf_info("\n"); |
| vtr::printf_info("Least slack per constraint:\n"); |
| for (source_clock_domain = 0; source_clock_domain < timing_ctx.sdc->num_constrained_clocks; source_clock_domain++) { |
| /* Print the intra-domain slack if valid. */ |
| if (f_timing_stats->least_slack[source_clock_domain][source_clock_domain] < HUGE_POSITIVE_FLOAT - 1) { |
| vtr::printf_info("%s to %s: %g ns\n", |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name, |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name, |
| 1e9 * f_timing_stats->least_slack[source_clock_domain][source_clock_domain]); |
| } else { |
| vtr::printf_info("%s to %s: --\n", |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name, |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name); |
| } |
| /* Then, print all other slacks on separate lines. */ |
| for (sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| if (source_clock_domain == sink_clock_domain) continue; /* already done that */ |
| if (f_timing_stats->least_slack[source_clock_domain][sink_clock_domain] < HUGE_POSITIVE_FLOAT - 1) { |
| /* If this domain pair was analysed and has a valid slack */ |
| vtr::printf_info("\t%s to %s: %g ns\n", |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name, |
| timing_ctx.sdc->constrained_clocks[sink_clock_domain].name, |
| 1e9 * f_timing_stats->least_slack[source_clock_domain][sink_clock_domain]); |
| } else { |
| vtr::printf_info("\t%s to %s: --\n", |
| timing_ctx.sdc->constrained_clocks[source_clock_domain].name, |
| timing_ctx.sdc->constrained_clocks[sink_clock_domain].name); |
| } |
| } |
| } |
| |
| /* Calculate geometric mean f_max (fanout-weighted and unweighted) from the diagonal (intra-domain) entries of critical_path_delay, |
| excluding domains without intra-domain paths (for which the timing constraint is DO_NOT_ANALYSE) and virtual clocks. */ |
| found = false; |
| for (clock_domain = 0; clock_domain < timing_ctx.sdc->num_constrained_clocks; clock_domain++) { |
| if (timing_ctx.sdc->domain_constraint[clock_domain][clock_domain] > NEGATIVE_EPSILON && timing_ctx.sdc->constrained_clocks[clock_domain].is_netlist_clock) { |
| geomean_period *= f_timing_stats->cpd[clock_domain][clock_domain]; |
| fanout = timing_ctx.sdc->constrained_clocks[clock_domain].fanout; |
| fanout_weighted_geomean_period += log((double) f_timing_stats->cpd[clock_domain][clock_domain])*(double)fanout; |
| total_fanout += fanout; |
| num_netlist_clocks_with_intra_domain_paths++; |
| found = true; |
| } |
| } |
| if (found) { /* Only print these if we found at least one clock domain with intra-domain paths. */ |
| geomean_period = pow(geomean_period, 1.0f / num_netlist_clocks_with_intra_domain_paths);
| fanout_weighted_geomean_period = exp(fanout_weighted_geomean_period/total_fanout); |
| /* Convert to MHz */ |
| vtr::printf_info("\n"); |
| vtr::printf_info("Geometric mean intra-domain period: %g ns (%g MHz)\n", |
| 1e9 * geomean_period, 1e-6 / geomean_period); |
| vtr::printf_info("Fanout-weighted geomean intra-domain period: %g ns (%g MHz)\n", |
| 1e9 * fanout_weighted_geomean_period, 1e-6 / fanout_weighted_geomean_period); |
| } |
| |
| vtr::printf_info("\n"); |
| } |
| } |
| |
| static void print_timing_constraint_info(const char *fname) { |
| /* Prints the contents of timing_ctx.sdc->domain_constraint, timing_ctx.sdc->constrained_clocks,
| timing_ctx.sdc->constrained_inputs/outputs and the cc/cf/fc/ff override constraints to a file. */
| |
| FILE * fp; |
| int source_clock_domain, sink_clock_domain, i, j; |
| int max_clock_name_length = INT_MIN; |
| char * clock_name; |
| |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| int * clock_name_length = (int *) vtr::malloc(timing_ctx.sdc->num_constrained_clocks * sizeof(int)); /* Array of clock name lengths */ |
| |
| fp = vtr::fopen(fname, "w"); |
| |
| /* Get lengths of each clock name and max length so we can space the columns properly. */ |
| for (sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| clock_name = timing_ctx.sdc->constrained_clocks[sink_clock_domain].name; |
| clock_name_length[sink_clock_domain] = (int) strlen(clock_name);
| if (clock_name_length[sink_clock_domain] > max_clock_name_length) { |
| max_clock_name_length = clock_name_length[sink_clock_domain]; |
| } |
| } |
| |
| /* First, combine info from timing_ctx.sdc->domain_constraint and timing_ctx.sdc->constrained_clocks into a matrix |
| (they're indexed the same as each other). */ |
| |
| fprintf(fp, "Timing constraints in ns (source clock domains down left side, sink along top).\n" |
| "A value of -1.00 means the pair of source and sink domains will not be analysed.\n\n"); |
| |
| print_spaces(fp, max_clock_name_length + 4); |
| |
| for (sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| fprintf(fp, "%s", timing_ctx.sdc->constrained_clocks[sink_clock_domain].name); |
| /* Minimum column width of 8 */ |
| print_spaces(fp, max(8 - clock_name_length[sink_clock_domain], 4)); |
| } |
| fprintf(fp, "\n"); |
| |
| for (source_clock_domain = 0; source_clock_domain < timing_ctx.sdc->num_constrained_clocks; source_clock_domain++) { |
| fprintf(fp, "%s", timing_ctx.sdc->constrained_clocks[source_clock_domain].name); |
| print_spaces(fp, max_clock_name_length + 4 - clock_name_length[source_clock_domain]); |
| for (sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| fprintf(fp, "%5.2f", timing_ctx.sdc->domain_constraint[source_clock_domain][sink_clock_domain]); |
| /* Minimum column width of 8 */ |
| print_spaces(fp, max(clock_name_length[sink_clock_domain] - 1, 3)); |
| } |
| fprintf(fp, "\n"); |
| } |
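| /* For example, with two constrained clocks "clk" and "clk2" the matrix above might read
|    (names, values and exact spacing illustrative; -1.00 marks an unanalysed pair):
|
|          clk     clk2
|    clk    5.00   -1.00
|    clk2  -1.00   10.00
| */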
| |
| free(clock_name_length); |
| |
| /* Second, print I/O constraints. */ |
| fprintf(fp, "\nList of constrained inputs (note: constraining a clock net input has no effect):\n"); |
| for (i = 0; i < timing_ctx.sdc->num_constrained_inputs; i++) { |
| fprintf(fp, "Input name %s on clock %s with input delay %.2f ns\n", |
| timing_ctx.sdc->constrained_inputs[i].name, timing_ctx.sdc->constrained_inputs[i].clock_name, timing_ctx.sdc->constrained_inputs[i].delay); |
| } |
| |
| fprintf(fp, "\nList of constrained outputs:\n"); |
| for (i = 0; i < timing_ctx.sdc->num_constrained_outputs; i++) { |
| fprintf(fp, "Output name %s on clock %s with output delay %.2f ns\n", |
| timing_ctx.sdc->constrained_outputs[i].name, timing_ctx.sdc->constrained_outputs[i].clock_name, timing_ctx.sdc->constrained_outputs[i].delay); |
| } |
| |
| /* Third, print override constraints. */ |
| fprintf(fp, "\nList of override constraints (non-default constraints created by set_false_path, set_clock_groups, \nset_max_delay, and set_multicycle_path):\n"); |
| |
| for (i = 0; i < timing_ctx.sdc->num_cc_constraints; i++) { |
| fprintf(fp, "Clock domain"); |
| for (j = 0; j < timing_ctx.sdc->cc_constraints[i].num_source - 1; j++) {
| fprintf(fp, " %s,", timing_ctx.sdc->cc_constraints[i].source_list[j]);
| } /* Print the last source separately so it isn't followed by a comma. */
| fprintf(fp, " %s to clock domain", timing_ctx.sdc->cc_constraints[i].source_list[j]);
| for (j = 0; j < timing_ctx.sdc->cc_constraints[i].num_sink - 1; j++) { |
| fprintf(fp, " %s,", timing_ctx.sdc->cc_constraints[i].sink_list[j]); |
| } /* We have to print the last one separately because we don't want a comma after it. */ |
| if (timing_ctx.sdc->cc_constraints[i].num_multicycles == 0) { /* not a multicycle constraint */ |
| fprintf(fp, " %s: %.2f ns\n", timing_ctx.sdc->cc_constraints[i].sink_list[j], timing_ctx.sdc->cc_constraints[i].constraint); |
| } else { /* multicycle constraint */ |
| fprintf(fp, " %s: %d multicycles\n", timing_ctx.sdc->cc_constraints[i].sink_list[j], timing_ctx.sdc->cc_constraints[i].num_multicycles); |
| } |
| } |
| |
| for (i = 0; i < timing_ctx.sdc->num_cf_constraints; i++) { |
| fprintf(fp, "Clock domain"); |
| for (j = 0; j < timing_ctx.sdc->cf_constraints[i].num_source - 1; j++) {
| fprintf(fp, " %s,", timing_ctx.sdc->cf_constraints[i].source_list[j]);
| } /* Print the last source separately so it isn't followed by a comma. */
| fprintf(fp, " %s to flip-flop", timing_ctx.sdc->cf_constraints[i].source_list[j]);
| for (j = 0; j < timing_ctx.sdc->cf_constraints[i].num_sink - 1; j++) { |
| fprintf(fp, " %s,", timing_ctx.sdc->cf_constraints[i].sink_list[j]); |
| } /* We have to print the last one separately because we don't want a comma after it. */ |
| if (timing_ctx.sdc->cf_constraints[i].num_multicycles == 0) { /* not a multicycle constraint */ |
| fprintf(fp, " %s: %.2f ns\n", timing_ctx.sdc->cf_constraints[i].sink_list[j], timing_ctx.sdc->cf_constraints[i].constraint); |
| } else { /* multicycle constraint */ |
| fprintf(fp, " %s: %d multicycles\n", timing_ctx.sdc->cf_constraints[i].sink_list[j], timing_ctx.sdc->cf_constraints[i].num_multicycles); |
| } |
| } |
| |
| for (i = 0; i < timing_ctx.sdc->num_fc_constraints; i++) { |
| fprintf(fp, "Flip-flop"); |
| for (j = 0; j < timing_ctx.sdc->fc_constraints[i].num_source - 1; j++) {
| fprintf(fp, " %s,", timing_ctx.sdc->fc_constraints[i].source_list[j]);
| } /* Print the last source separately so it isn't followed by a comma. */
| fprintf(fp, " %s to clock domain", timing_ctx.sdc->fc_constraints[i].source_list[j]);
| for (j = 0; j < timing_ctx.sdc->fc_constraints[i].num_sink - 1; j++) { |
| fprintf(fp, " %s,", timing_ctx.sdc->fc_constraints[i].sink_list[j]); |
| } /* We have to print the last one separately because we don't want a comma after it. */ |
| if (timing_ctx.sdc->fc_constraints[i].num_multicycles == 0) { /* not a multicycle constraint */ |
| fprintf(fp, " %s: %.2f ns\n", timing_ctx.sdc->fc_constraints[i].sink_list[j], timing_ctx.sdc->fc_constraints[i].constraint); |
| } else { /* multicycle constraint */ |
| fprintf(fp, " %s: %d multicycles\n", timing_ctx.sdc->fc_constraints[i].sink_list[j], timing_ctx.sdc->fc_constraints[i].num_multicycles); |
| } |
| } |
| |
| for (i = 0; i < timing_ctx.sdc->num_ff_constraints; i++) { |
| fprintf(fp, "Flip-flop"); |
| for (j = 0; j < timing_ctx.sdc->ff_constraints[i].num_source - 1; j++) {
| fprintf(fp, " %s,", timing_ctx.sdc->ff_constraints[i].source_list[j]);
| } /* Print the last source separately so it isn't followed by a comma. */
| fprintf(fp, " %s to flip-flop", timing_ctx.sdc->ff_constraints[i].source_list[j]);
| for (j = 0; j < timing_ctx.sdc->ff_constraints[i].num_sink - 1; j++) { |
| fprintf(fp, " %s,", timing_ctx.sdc->ff_constraints[i].sink_list[j]); |
| } /* We have to print the last one separately because we don't want a comma after it. */ |
| if (timing_ctx.sdc->ff_constraints[i].num_multicycles == 0) { /* not a multicycle constraint */ |
| fprintf(fp, " %s: %.2f ns\n", timing_ctx.sdc->ff_constraints[i].sink_list[j], timing_ctx.sdc->ff_constraints[i].constraint); |
| } else { /* multicycle constraint */ |
| fprintf(fp, " %s: %d multicycles\n", timing_ctx.sdc->ff_constraints[i].sink_list[j], timing_ctx.sdc->ff_constraints[i].num_multicycles); |
| } |
| } |
| |
| fclose(fp); |
| } |
| |
| static void print_spaces(FILE * fp, int num_spaces) { |
| /* Prints num_spaces spaces to file pointed to by fp. */ |
| for ( ; num_spaces > 0; num_spaces--) { |
| fprintf(fp, " "); |
| } |
| } |
| |
| /*
| Creates a lookup table mapping [block_id][pb_graph_pin_id] to a tnode index (OPEN if no tnode corresponds to that pin).
| */
| vtr::vector_map<ClusterBlockId, std::vector<int>> alloc_and_load_tnode_lookup_from_pin_id() { |
| auto& cluster_ctx = g_vpr_ctx.clustering(); |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| vtr::vector_map<ClusterBlockId, std::vector<int>> tnode_lookup; |
| tnode_lookup.resize(cluster_ctx.clb_nlist.blocks().size()); |
| |
| for (auto blk_id : cluster_ctx.clb_nlist.blocks()) { |
| int num_pins = cluster_ctx.clb_nlist.block_type(blk_id)->pb_graph_head->total_pb_pins; |
| tnode_lookup[blk_id].resize(num_pins); |
| for (int j = 0; j < num_pins; j++) { |
| tnode_lookup[blk_id][j] = OPEN; |
| } |
| } |
| |
| for (int i = 0; i < timing_ctx.num_tnodes; i++) { |
| if (timing_ctx.tnodes[i].pb_graph_pin != nullptr) { |
| if (timing_ctx.tnodes[i].type != TN_CLOCK_SOURCE && |
| timing_ctx.tnodes[i].type != TN_FF_SOURCE && |
| timing_ctx.tnodes[i].type != TN_INPAD_SOURCE && |
| timing_ctx.tnodes[i].type != TN_FF_SINK && |
| timing_ctx.tnodes[i].type != TN_OUTPAD_SINK |
| ) { |
| int pb_pin_id = timing_ctx.tnodes[i].pb_graph_pin->pin_count_in_cluster; |
| ClusterBlockId iblk = timing_ctx.tnodes[i].block; |
| VTR_ASSERT(tnode_lookup[iblk][pb_pin_id] == OPEN); |
| tnode_lookup[iblk][pb_pin_id] = i; |
| } |
| } |
| } |
| return tnode_lookup; |
| } |
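| /* Usage sketch (hypothetical caller; blk_id and pin_index are placeholders):
|
|    auto tnode_lookup = alloc_and_load_tnode_lookup_from_pin_id();
|    int inode = tnode_lookup[blk_id][pin_index]; // OPEN if no tnode maps to this pin
|    if (inode != OPEN) {
|        // ... inspect timing_ctx.tnodes[inode] ...
|    }
|    free_tnode_lookup_from_pin_id(tnode_lookup);
| */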
| |
| void free_tnode_lookup_from_pin_id(vtr::vector_map<ClusterBlockId, std::vector<int>> &tnode_lookup) { |
| auto& cluster_ctx = g_vpr_ctx.clustering(); |
| |
| for (auto blk_id : cluster_ctx.clb_nlist.blocks()) { |
| tnode_lookup[blk_id].clear(); |
| } |
| tnode_lookup.clear(); |
| } |
| |
| std::vector<size_t> init_timing_net_pins() { |
| std::vector<size_t> timing_net_pin_counts; |
| auto& cluster_ctx = g_vpr_ctx.clustering(); |
| |
| for(auto net_id : cluster_ctx.clb_nlist.nets()) { |
| timing_net_pin_counts.push_back(cluster_ctx.clb_nlist.net_pins(net_id).size()); |
| } |
| |
| VTR_ASSERT(timing_net_pin_counts.size() == cluster_ctx.clb_nlist.nets().size()); |
| |
| return timing_net_pin_counts; |
| } |
| |
| std::vector<size_t> init_atom_timing_net_pins() { |
| std::vector<size_t> timing_net_pin_counts; |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| |
| for(auto net_id : atom_ctx.nlist.nets()) { |
| timing_net_pin_counts.push_back(atom_ctx.nlist.net_pins(net_id).size()); |
| } |
| |
| VTR_ASSERT(timing_net_pin_counts.size() == atom_ctx.nlist.nets().size()); |
| |
| return timing_net_pin_counts; |
| } |
| |
| size_t num_timing_nets() { |
| return f_num_timing_net_pins.size(); |
| } |
| |
| size_t num_timing_net_pins(int inet) { |
| return f_num_timing_net_pins[inet]; |
| } |
| |
| size_t num_timing_net_sinks(int inet) { |
| VTR_ASSERT(f_num_timing_net_pins[inet] > 0); |
| return f_num_timing_net_pins[inet] - 1; |
| } |
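| /* Usage sketch (hypothetical caller). By VPR convention pin 0 of a net is its driver, which is why
|    num_timing_net_sinks() returns the pin count minus one; the sinks are pins 1..num_pins - 1:
|
|    for (int inet = 0; inet < (int) num_timing_nets(); inet++) {
|        for (size_t ipin = 1; ipin < num_timing_net_pins(inet); ipin++) {
|            // ... process sink pin ipin of net inet ...
|        }
|    }
| */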
| |
| void print_classic_cpds() { |
| auto& timing_ctx = g_vpr_ctx.timing(); |
| |
| for (int source_clock_domain = 0; source_clock_domain < timing_ctx.sdc->num_constrained_clocks; source_clock_domain++) { |
| for (int sink_clock_domain = 0; sink_clock_domain < timing_ctx.sdc->num_constrained_clocks; sink_clock_domain++) { |
| float least_slack = f_timing_stats->least_slack[source_clock_domain][sink_clock_domain]; |
| float critical_path_delay = f_timing_stats->cpd[source_clock_domain][sink_clock_domain]; |
| |
| vtr::printf("Classic %d -> %d: least_slack=%g cpd=%g\n", source_clock_domain, sink_clock_domain, least_slack, critical_path_delay); |
| } |
| } |
| } |
| |
| static void mark_max_block_delay(const std::unordered_map<AtomBlockId,t_pb_graph_node*>& expected_lowest_cost_pb_gnode) { |
| auto& atom_ctx = g_vpr_ctx.atom(); |
| auto& device_ctx = g_vpr_ctx.mutable_device(); |
| |
| for(AtomBlockId blk : atom_ctx.nlist.blocks()) { |
| auto iter = expected_lowest_cost_pb_gnode.find(blk); |
| if(iter != expected_lowest_cost_pb_gnode.end()) { |
| const t_pb_graph_node* gnode = iter->second; |
| const t_pb_type* pb_type = gnode->pb_type; |
| |
| /* Track the largest max_internal_delay seen over all blocks' expected pb types. */
| if (pb_type->max_internal_delay != UNDEFINED) {
| if (device_ctx.pb_max_internal_delay == UNDEFINED
| || device_ctx.pb_max_internal_delay < pb_type->max_internal_delay) {
| device_ctx.pb_max_internal_delay = pb_type->max_internal_delay;
| device_ctx.pbtype_max_internal_delay = pb_type;
| }
| }
| } |
| } |
| } |