Minor revisions in response to reviewer comments.

git-svn-id: https://vtr-verilog-to-routing.googlecode.com/svn/branches/javeed@4674 8e3573b8-cf2c-4f14-ef6d-137439e28b8b
diff --git a/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.aux b/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.aux
index 5aac43c..ac73eb0 100644
--- a/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.aux
+++ b/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.aux
@@ -117,7 +117,6 @@
 \newlabel{eq:balance_constraint}{{11}{8}}
 \@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {V-C}}Hypergraph to Graph Transformation}{8}}
 \newlabel{sec:hypergraph_to_graph}{{\unhbox \voidb@x \hbox {V-C}}{8}}
-\citation{feng2014rent}
 \@writefile{lof}{\contentsline {figure}{\numberline {11}{\ignorespaces (a) Star and (b) clique graph topologies.}}{9}}
 \@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {}}}{9}}
 \@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {}}}{9}}
@@ -134,6 +133,7 @@
 \newlabel{fig:graph_topology_mcw}{{15}{9}}
 \@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {V-D}}Partitioner Stage Results}{9}}
 \newlabel{sec:partitioner_stage_results}{{\unhbox \voidb@x \hbox {V-D}}{9}}
+\citation{feng2014rent}
 \citation{hutton}
 \citation{hutton}
 \citation{hutton}
@@ -154,7 +154,7 @@
 \@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {V-F}}\% wires cut}{10}}
 \@writefile{lof}{\contentsline {figure}{\numberline {18}{\ignorespaces Impact of partitioning-based CAD flow on routability for a range of architectures.\footnotemark }}{10}}
 \newlabel{fig:wires_cut}{{18}{10}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {V-G}}Bloating of the Clustered Netlist due to Partitioning Constraints}{10}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {V-G}}Bloating of the Clustered Netlist due to Partitioning Constraints}{11}}
 \@writefile{lof}{\contentsline {figure}{\numberline {19}{\ignorespaces The number of blocks in the clustered netlist with and without partitioning constraints. This is a measure of packing bloat.}}{11}}
 \newlabel{fig:packing_bloat}{{19}{11}}
 \@writefile{toc}{\contentsline {section}{\numberline {VI}Architecture Results}{11}}
@@ -171,8 +171,8 @@
 \newlabel{fig:standard_minW}{{22}{12}}
 \@writefile{lof}{\contentsline {figure}{\numberline {23}{\ignorespaces Geometric mean of required intra-die minimum channel width vs. geometric mean of the number of wires crossing the interposer for 2 dice and $1ns$ of \textit  {delay increase}.}}{12}}
 \newlabel{fig:crossingwires}{{23}{12}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {VI-C}}Circuit Speed vs. Interposer Delay}{12}}
 \citation{xilinxTSV}
+\@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {VI-C}}Circuit Speed vs. Interposer Delay}{13}}
 \@writefile{toc}{\contentsline {subsection}{\numberline {\unhbox \voidb@x \hbox {VI-D}}Impact of Number of Dice}{13}}
 \newlabel{num_dice_impact}{{\unhbox \voidb@x \hbox {VI-D}}{13}}
 \@writefile{lof}{\contentsline {figure}{\numberline {24}{\ignorespaces Critical path delay vs. \textit  {\% wires cut} for 2 dice and $0.0$, $0.5$, $1.0$ and $1.5ns$ of \textit  {delay increase}.}}{13}}
@@ -187,8 +187,6 @@
 \@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {}}}{13}}
 \@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {}}}{13}}
 \newlabel{fig:square_tall}{{27}{13}}
-\@writefile{lot}{\contentsline {table}{\numberline {II}{\ignorespaces Square interposer and square dice minimum channel widths}}{13}}
-\newlabel{table:squares}{{II}{13}}
 \bibstyle{IEEEtran}
 \bibdata{IEEEabrv,./fpga2014paper}
 \bibcite{xilinxTSVperformance}{1}
@@ -212,12 +210,14 @@
 \bibcite{icann}{19}
 \bibcite{vtr2012}{20}
 \bibcite{karypis1998multilevelmetis}{21}
-\bibcite{karypis1999multilevelhmetis}{22}
-\bibcite{karypismanual}{23}
-\bibcite{hypergraph_to_graph_survey}{24}
+\@writefile{lot}{\contentsline {table}{\numberline {II}{\ignorespaces Square interposer and square dice minimum channel widths}}{14}}
+\newlabel{table:squares}{{II}{14}}
 \@writefile{toc}{\contentsline {section}{\numberline {VII}Conclusion}{14}}
 \newlabel{conclusionSection}{{VII}{14}}
 \@writefile{toc}{\contentsline {section}{References}{14}}
+\bibcite{karypis1999multilevelhmetis}{22}
+\bibcite{karypismanual}{23}
+\bibcite{hypergraph_to_graph_survey}{24}
 \bibcite{feng2014rent}{25}
 \bibcite{hutton}{26}
 \@writefile{toc}{\contentsline {section}{Biographies}{15}}
diff --git a/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.bib b/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.bib
index d8460c9..5572c48 100644
--- a/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.bib
+++ b/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.bib
@@ -236,26 +236,6 @@
   year={2014},
 }
 
-@article{hypergraph_to_graph_survey,
- author = {Ihler, Edmund and Wagner, Dorothea and Wagner, Frank},
- title = {Modeling Hypergraphs by Graphs with the Same Mincut Properties},
- journal = {Inf. Process. Lett.},
- issue_date = {March 22, 1993},
- volume = {45},
- number = {4},
- month = mar,
- year = {1993},
- issn = {0020-0190},
- pages = {171--175},
- numpages = {5},
- url = {http://dx.doi.org/10.1016/0020-0190(93)90115-P},
- doi = {10.1016/0020-0190(93)90115-P},
- acmid = {158825},
- publisher = {Elsevier North-Holland, Inc.},
- address = {Amsterdam, The Netherlands, The Netherlands},
- keywords = {VLSI layout, algorithm design, combinatorial problems},
-} 
-
 @article{hutton,
  author = {Hutton, Michael and Adibsamii, Khosrow and Leaver, Andrew},
  title = {Adaptive Delay Estimation for Partitioning-driven PLD Placement},
@@ -275,3 +255,22 @@
  address = {Piscataway, NJ, USA},
  keywords = {field-programmable gate arrays (FPGAs), programmable logic, programmable logic devise (PLD), timing-driven placement},
 } 
+
+@article{hypergraph_to_graph_survey,
+ author = {Alpert, Charles J. and Kahng, Andrew B.},
+ title = {Recent Directions in Netlist Partitioning: A Survey},
+ journal = {Integr. VLSI J.},
+ issue_date = {Aug. 1995},
+ volume = {19},
+ number = {1-2},
+ month = aug,
+ year = {1995},
+ issn = {0167-9260},
+ pages = {1--81},
+ numpages = {81},
+ url = {http://dx.doi.org/10.1016/0167-9260(95)00008-4},
+ doi = {10.1016/0167-9260(95)00008-4},
+ acmid = {214841},
+ publisher = {Elsevier Science Publishers B. V.},
+ address = {Amsterdam, The Netherlands},
+} 
diff --git a/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.log b/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.log
index 5ce11ea..3e728e7 100644
--- a/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.log
+++ b/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.log
@@ -1,4 +1,4 @@
-This is pdfTeX, Version 3.14159265-2.6-1.40.15 (TeX Live 2015/dev/Debian) (preloaded format=pdflatex 2014.12.6)  5 JUL 2015 17:13
+This is pdfTeX, Version 3.14159265-2.6-1.40.15 (TeX Live 2015/dev/Debian) (preloaded format=pdflatex 2014.12.6)  5 JUL 2015 17:47
 entering extended mode
  restricted \write18 enabled.
  %&-line parsing enabled.
@@ -638,14 +638,14 @@
 (epstopdf)                    size: 3009 bytes
 (epstopdf)             Command: <repstopdf --outfile=star-eps-converted-to.pdf 
 star.eps>
-(epstopdf)             \includegraphics on input line 57.
+(epstopdf)             \includegraphics on input line 56.
 Package epstopdf Info: Output file is already uptodate.
 
 <star-eps-converted-to.pdf, id=92, 183.68625pt x 171.64125pt>
 File: star-eps-converted-to.pdf Graphic file (type pdf)
 
 <use star-eps-converted-to.pdf>
-Package pdftex.def Info: star-eps-converted-to.pdf used on input line 57.
+Package pdftex.def Info: star-eps-converted-to.pdf used on input line 56.
 (pdftex.def)             Requested size: 88.20154pt x 82.41827pt.
 Package epstopdf Info: Source file: <clique.eps>
 (epstopdf)                    date: 2014-12-07 15:58:33
@@ -655,33 +655,33 @@
 (epstopdf)                    size: 3089 bytes
 (epstopdf)             Command: <repstopdf --outfile=clique-eps-converted-to.pd
 f clique.eps>
-(epstopdf)             \includegraphics on input line 58.
+(epstopdf)             \includegraphics on input line 57.
 Package epstopdf Info: Output file is already uptodate.
 
 <clique-eps-converted-to.pdf, id=93, 183.68625pt x 171.64125pt>
 File: clique-eps-converted-to.pdf Graphic file (type pdf)
 
 <use clique-eps-converted-to.pdf>
-Package pdftex.def Info: clique-eps-converted-to.pdf used on input line 58.
+Package pdftex.def Info: clique-eps-converted-to.pdf used on input line 57.
 (pdftex.def)             Requested size: 88.20154pt x 82.41827pt.
 
 <hg2g_star.pdf, id=94, 548.0475pt x 427.5975pt>
 File: hg2g_star.pdf Graphic file (type pdf)
  <use hg2g_star.pdf>
-Package pdftex.def Info: hg2g_star.pdf used on input line 69.
+Package pdftex.def Info: hg2g_star.pdf used on input line 68.
 (pdftex.def)             Requested size: 100.79846pt x 78.64105pt.
 
 <hg2g_clique.pdf, id=95, 550.055pt x 425.59pt>
 File: hg2g_clique.pdf Graphic file (type pdf)
  <use hg2g_clique.pdf>
-Package pdftex.def Info: hg2g_clique.pdf used on input line 70.
+Package pdftex.def Info: hg2g_clique.pdf used on input line 69.
 (pdftex.def)             Requested size: 100.79846pt x 77.9861pt.
  [8]
 <hyperedge_cutline.pdf, id=99, 652.4375pt x 258.9675pt>
 File: hyperedge_cutline.pdf Graphic file (type pdf)
 
 <use hyperedge_cutline.pdf>
-Package pdftex.def Info: hyperedge_cutline.pdf used on input line 79.
+Package pdftex.def Info: hyperedge_cutline.pdf used on input line 78.
 (pdftex.def)             Requested size: 189.0pt x 75.0196pt.
  (./graph_topology_cutsize.tex
 LaTeX Font Info:    External font `cmex10' loaded for size
@@ -708,7 +708,7 @@
 put line 116.
 (pdftex.def)             Requested size: 252.94438pt x 108.40472pt.
 )
-Overfull \hbox (0.93823pt too wide) in paragraph at lines 118--89
+Overfull \hbox (0.93823pt too wide) in paragraph at lines 118--88
  [][] 
  []
 
@@ -738,31 +738,7 @@
 line 110.
 (pdftex.def)             Requested size: 252.94438pt x 108.40472pt.
 )
-Overfull \hbox (0.93823pt too wide) in paragraph at lines 112--103
- [][] 
- []
-
-(./vpr_flows_mcw.tex
-Package epstopdf Info: Source file: <vpr_flows_mcw.eps>
-(epstopdf)                    date: 2015-01-06 20:40:09
-(epstopdf)                    size: 24238 bytes
-(epstopdf)             Output file: <vpr_flows_mcw-eps-converted-to.pdf>
-(epstopdf)                    date: 2015-01-06 20:40:12
-(epstopdf)                    size: 10413 bytes
-(epstopdf)             Command: <repstopdf --outfile=vpr_flows_mcw-eps-converte
-d-to.pdf vpr_flows_mcw.eps>
-(epstopdf)             \includegraphics on input line 130.
-Package epstopdf Info: Output file is already uptodate.
-
-<vpr_flows_mcw-eps-converted-to.pdf, id=102, 252.945pt x 144.54pt>
-File: vpr_flows_mcw-eps-converted-to.pdf Graphic file (type pdf)
-
-<use vpr_flows_mcw-eps-converted-to.pdf>
-Package pdftex.def Info: vpr_flows_mcw-eps-converted-to.pdf used on input line 
-130.
-(pdftex.def)             Requested size: 252.94438pt x 144.53963pt.
-)
-Overfull \hbox (0.93823pt too wide) in paragraph at lines 132--116
+Overfull \hbox (0.93823pt too wide) in paragraph at lines 112--102
  [][] 
  []
 
@@ -776,7 +752,31 @@
 pdfTeX warning: pdflatex (file ./hyperedge_cutline.pdf): PDF inclusion: multipl
 e pdfs with page group included in a single page
 > <./graph_topology_cutsize-eps-converted-to.pdf> <./graph_topology_mcw-eps-con
-verted-to.pdf>] (./vpr_flows_crit_path.tex
+verted-to.pdf>] (./vpr_flows_mcw.tex
+Package epstopdf Info: Source file: <vpr_flows_mcw.eps>
+(epstopdf)                    date: 2015-01-06 20:40:09
+(epstopdf)                    size: 24238 bytes
+(epstopdf)             Output file: <vpr_flows_mcw-eps-converted-to.pdf>
+(epstopdf)                    date: 2015-01-06 20:40:12
+(epstopdf)                    size: 10413 bytes
+(epstopdf)             Command: <repstopdf --outfile=vpr_flows_mcw-eps-converte
+d-to.pdf vpr_flows_mcw.eps>
+(epstopdf)             \includegraphics on input line 130.
+Package epstopdf Info: Output file is already uptodate.
+
+<vpr_flows_mcw-eps-converted-to.pdf, id=150, 252.945pt x 144.54pt>
+File: vpr_flows_mcw-eps-converted-to.pdf Graphic file (type pdf)
+
+<use vpr_flows_mcw-eps-converted-to.pdf>
+Package pdftex.def Info: vpr_flows_mcw-eps-converted-to.pdf used on input line 
+130.
+(pdftex.def)             Requested size: 252.94438pt x 144.53963pt.
+)
+Overfull \hbox (0.93823pt too wide) in paragraph at lines 132--115
+ [][] 
+ []
+
+(./vpr_flows_crit_path.tex
 Package epstopdf Info: Source file: <vpr_flows_crit_path.eps>
 (epstopdf)                    date: 2015-01-06 20:41:49
 (epstopdf)                    size: 23895 bytes
@@ -796,7 +796,7 @@
  line 126.
 (pdftex.def)             Requested size: 252.94438pt x 144.53963pt.
 )
-Overfull \hbox (0.93823pt too wide) in paragraph at lines 128--127
+Overfull \hbox (0.93823pt too wide) in paragraph at lines 128--126
  [][] 
  []
 
@@ -820,11 +820,12 @@
 
 (pdftex.def)             Requested size: 252.94438pt x 108.40472pt.
 )
-Overfull \hbox (0.93823pt too wide) in paragraph at lines 126--163
+Overfull \hbox (0.93823pt too wide) in paragraph at lines 126--162
  [][] 
  []
 
-(./packing_bloat.tex
+[10 <./vpr_flows_mcw-eps-converted-to.pdf> <./vpr_flows_crit_path-eps-converted
+-to.pdf> <./wires_cut-eps-converted-to.pdf>] (./packing_bloat.tex
 Package epstopdf Info: Source file: <packing_bloat.eps>
 (epstopdf)                    date: 2014-12-23 16:59:22
 (epstopdf)                    size: 20876 bytes
@@ -836,7 +837,7 @@
 (epstopdf)             \includegraphics on input line 122.
 Package epstopdf Info: Output file is already uptodate.
 
-<packing_bloat-eps-converted-to.pdf, id=153, 252.945pt x 144.54pt>
+<packing_bloat-eps-converted-to.pdf, id=162, 252.945pt x 144.54pt>
 File: packing_bloat-eps-converted-to.pdf Graphic file (type pdf)
 
 <use packing_bloat-eps-converted-to.pdf>
@@ -844,12 +845,11 @@
 122.
 (pdftex.def)             Requested size: 252.94438pt x 144.53963pt.
 )
-Overfull \hbox (0.93823pt too wide) in paragraph at lines 124--173
+Overfull \hbox (0.93823pt too wide) in paragraph at lines 124--172
  [][] 
  []
 
-) [10 <./vpr_flows_mcw-eps-converted-to.pdf> <./vpr_flows_crit_path-eps-convert
-ed-to.pdf> <./wires_cut-eps-converted-to.pdf>]
+)
 Package epstopdf Info: Source file: <arch_experiments4.eps>
 (epstopdf)                    date: 2014-12-14 13:57:40
 (epstopdf)                    size: 4208610 bytes
@@ -860,8 +860,7 @@
 erted-to.pdf arch_experiments4.eps>
 (epstopdf)             \includegraphics on input line 434.
 Package epstopdf Info: Output file is already uptodate.
-
-<arch_experiments4-eps-converted-to.pdf, id=163, 816.04875pt x 479.7925pt>
+ <arch_experiments4-eps-converted-to.pdf, id=163, 816.04875pt x 479.7925pt>
 File: arch_experiments4-eps-converted-to.pdf Graphic file (type pdf)
 
 <use arch_experiments4-eps-converted-to.pdf>
@@ -928,6 +927,8 @@
 Package pdftex.def Info: numberofcrossingwires_new_2-eps-converted-to.pdf used 
 on input line 478.
 (pdftex.def)             Requested size: 252.0pt x 115.29556pt.
+ [12 <./additional_fanins_2-eps-converted-to.pdf> <./MinChW_3-eps-converted-to.
+pdf> <./numberofcrossingwires_new_2-eps-converted-to.pdf>]
 Package epstopdf Info: Source file: <delays_crit_path_new.eps>
 (epstopdf)                    date: 2014-12-06 12:37:24
 (epstopdf)                    size: 1736258 bytes
@@ -940,16 +941,14 @@
 (epstopdf)             \includegraphics on input line 490.
 Package epstopdf Info: Output file is already uptodate.
 
-<delays_crit_path_new-eps-converted-to.pdf, id=175, 551.05875pt x 293.095pt>
+<delays_crit_path_new-eps-converted-to.pdf, id=188, 551.05875pt x 293.095pt>
 File: delays_crit_path_new-eps-converted-to.pdf Graphic file (type pdf)
 
 <use delays_crit_path_new-eps-converted-to.pdf>
 Package pdftex.def Info: delays_crit_path_new-eps-converted-to.pdf used on inpu
 t line 490.
 (pdftex.def)             Requested size: 252.0pt x 134.03374pt.
- [12 <./additional_fanins_2-eps-converted-to.pdf> <./MinChW_3-eps-converted-to.
-pdf> <./numberofcrossingwires_new_2-eps-converted-to.pdf>] (./wires_cut_4part.t
-ex
+ (./wires_cut_4part.tex
 Package epstopdf Info: Source file: <wires_cut_4part.eps>
 (epstopdf)                    date: 2014-12-30 18:34:14
 (epstopdf)                    size: 19021 bytes
@@ -1014,11 +1013,7 @@
 
 pdfTeX warning: pdflatex (file ./tall_fpga.pdf): PDF inclusion: multiple pdfs w
 ith page group included in a single page
->]
-Underfull \vbox (badness 6658) has occurred while \output is active []
-
-
-(./fpga2014paper.bbl
+>] (./fpga2014paper.bbl
 Underfull \hbox (badness 1769) in paragraph at lines 56--58
 []\OT1/ptm/m/n/8 W. Ar-den, M. Bril-lou[]et, P. Co-gez, M. Graef, B. Huiz-ing, 
 and
@@ -1069,11 +1064,11 @@
 Here is how much of TeX's memory you used:
  4444 strings out of 495020
  70702 string characters out of 6181324
- 158626 words of memory out of 5000000
+ 156813 words of memory out of 5000000
  7524 multiletter control sequences out of 15000+600000
  42909 words of font info for 77 fonts, out of 8000000 for 9000
  17 hyphenation exceptions out of 8191
- 42i,11n,38p,1651b,467s stack positions out of 5000i,500n,10000p,200000b,80000s
+ 42i,11n,38p,1651b,526s stack positions out of 5000i,500n,10000p,200000b,80000s
 {/usr/share/texlive/texmf-dist/fonts/
 enc/dvips/base/8r.enc}</usr/share/texlive/texmf-dist/fonts/type1/public/amsfont
 s/cm/cmex10.pfb></usr/share/texlive/texmf-dist/fonts/type1/public/amsfonts/cm/c
@@ -1089,7 +1084,7 @@
 mb8a.pfb></usr/share/texlive/texmf-dist/fonts/type1/urw/times/utmbi8a.pfb></usr
 /share/texlive/texmf-dist/fonts/type1/urw/times/utmr8a.pfb></usr/share/texlive/
 texmf-dist/fonts/type1/urw/times/utmri8a.pfb>
-Output written on fpga2014paper.pdf (15 pages, 1953622 bytes).
+Output written on fpga2014paper.pdf (15 pages, 1957790 bytes).
 PDF statistics:
  296 PDF objects out of 1000 (max. 8388607)
  189 compressed objects within 2 object streams
diff --git a/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.pdf b/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.pdf
index dc2082f..94faff4 100644
--- a/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.pdf
+++ b/doc/journal/fpgapaperfinal/FPGAPaper/fpga2014paper.pdf
Binary files differ
diff --git a/doc/journal/fpgapaperfinal/FPGAPaper/partitioning.tex b/doc/journal/fpgapaperfinal/FPGAPaper/partitioning.tex
index 4359d10..3901370 100644
--- a/doc/journal/fpgapaperfinal/FPGAPaper/partitioning.tex
+++ b/doc/journal/fpgapaperfinal/FPGAPaper/partitioning.tex
@@ -38,7 +38,7 @@
 \end{equation}
 where $P1_i$ is the set of all blocks of type $i$ in partition 1. We refer to this type of per-block set of constraints as heterogeneous balance constraints. Note that there are complex legality constraints governing which LUTs and FFs can be packed into legal logic blocks, as well as which multipliers can be packed into legal DSP blocks~\cite{luu2014vtr}. As the partitioner is not aware of these constraints, flow 3 above (which runs the partitioner after function block packing) will be able to more precisely balance resource use across partitions.
 
-The current version of hMetis does not support heterogeneous balance constraints, though Metis does, so we use Metis in our CAD flow. However, as the circuit netlist is naturally represented as a hypergraph, it needs to be transformed to a graph before Metis can process it.
+The current version of hMetis does not support heterogeneous balance constraints, though Metis does, so we use Metis in our CAD flow. However, as the circuit netlist is naturally represented as a hypergraph, it needs to be transformed to a graph before Metis can process it. \hl{Our experiments showed that hMetis performed 30\% better on average than Metis on our benchmark circuits with homogeneous balance constraints.}
 
 \subsection{Hypergraph to Graph Transformation}\label{sec:hypergraph_to_graph}
 We transformed the circuit netlist hypergraph to a graph in the following way:
@@ -47,8 +47,7 @@
 \item We combined all of the per-net subgraphs, summing the edge weights for edges appearing in more than one per-net subgraph, to generate the total netlist graph.
 \end{enumerate}
 
-We explored several ways to generate the per-net subgraphs, varying the graph topology and the edge weight scheme. We considered two graph topologies, clique and star, as illustrated in Figure \ref{fig:star_clique}. \hl{Ref.\mbox{\cite{hypergraph_to_graph_survey}}
-considers the addition of ``dummy'' nodes to model a hypergraph as a graph, with the objective of obtaining equivalent edge cut weights after partitioning. In our star model, we preferred to designate the net source as the center of the star, rather than introduce a dummy node. This choice was motivated by a desire to optimize timing -- the star topology clearly differentiates sources from sinks, and we expect this to guide the partitioner to keep sources and sinks in the same partition.}
+We explored several ways to generate the per-net subgraphs, varying the graph topology and the edge weight scheme. We considered two graph topologies, clique and star, as illustrated in Figure \ref{fig:star_clique}. \hl{Ref.~\mbox{\cite{hypergraph_to_graph_survey}} surveys several methods of transforming hypergraphs to graphs, and considers the addition of ``dummy'' nodes to the star model. In our star model, we preferred to designate the net source as the center of the star, rather than introduce a dummy node. This choice was motivated by a desire to optimize timing -- the star topology clearly differentiates sources from sinks, and we expect this to guide the partitioner to keep sources and sinks in the same partition.}
 
 We assigned the same edge weight to every edge of the generated subgraph. Edge weights were computed based on the number of vertices in the originating hyperedge, $n$, and we considered edge weights equal to $1$, $1/n$, and $1/n^2$.  Our intuition is that assigning lower edge weight to high-fanout nets may be beneficial, and we selected the edge weights accordingly. We refer to the combination of graph topology and edge weight scheme as the \emph{hyperedge model}.
 
@@ -90,9 +89,9 @@
 \label{fig:graph_topology_cutsize}
 \end{figure}
 
-The performance of the clique topology is significantly worse than that of the star topology, regardless of edge weight scheme. An intuitive explanation for this result is that with the clique topology, the partitioner does not have knowledge of which vertex is the source of a net and which vertices are sinks. In contrast, the star topology clearly differentiates sources from sinks and this appears to give the partitioner an “anchor point” that pulls all hyperedge fanouts toward the source. Additionally, if $n$ is the average number of vertices in a hyperedge, the clique topology generates $O(n^2)$ edges while the star topology generates only $O(n)$ edges, so the star topology saves both memory and runtime.
+The performance of the clique topology is significantly worse than that of the star topology, regardless of edge weight scheme. An intuitive explanation for this result is that with the clique topology, the partitioner does not have knowledge of which vertex is the source of a net and which vertices are sinks. In contrast, the star topology clearly differentiates sources from sinks and this appears to give the \hl{iterative-refinement-based} partitioner an ``anchor point'' that pulls all hyperedge fanouts toward the source. Additionally, if $n$ is the average number of vertices in a hyperedge, the clique topology generates $O(n^2)$ edges while the star topology generates only $O(n)$ edges, so the star topology saves both memory and runtime.
 
-For both topologies, the $1/n$ edge weight scheme gives the smallest cut size on average. This can be explained intuitively in terms of the total edge weight over all edges of the graph. The star $1$ (constant) edge weight scheme assigns a total weight of $n$ to each net, which heavily weights high-fanout nets. In contrast, the star $1/n$ model assigns the same total weight of $1$ to each net. The star $1/n^2$ model penalizes high-fanout nets relative to lower-fanout nets. Since we seek to minimize the hyperedge cut, it makes intuitive sense to weight all hyperedges equally, and the star $1/n$ model achieves this.
+\hl{For both topologies, the $1/n$ edge weight scheme gives the smallest cut size on average. This can be explained intuitively in terms of the total edge weight over all edges of the graph. The star $1$ (constant) edge weight scheme assigns a total weight of $n - 1$ to each net, which heavily weights high-fanout nets. In contrast, the star $1/n$ model assigns a total weight of $1 - \frac{1}{n}$ to each net, which lies between $\frac{1}{2}$ and $1$ regardless of fanout. The star $1/n^2$ model assigns a total weight of $(n - 1)/n^2$ to each net, which penalizes high-fanout nets relative to lower-fanout nets. Since we seek to minimize the hyperedge cut, it makes intuitive sense to weight all hyperedges equally, and the star $1/n$ model best achieves this.}
 
 To validate the choice of hyperedge cutsize as a proxy for circuit routability, we ran the partitioning CAD flow for each hyperedge model, across several circuits. We use an unbalance of $5\%$, split the clustered netlist into two partitions (constraining only the placer), cut $80\%$ of the wires crossing the interposer, and impose a $1ns$ delay penalty for wires crossing the interposer. To accommodate unbalance in the placement engine, we increase the size of the grid of complex blocks that make up the FPGA device. Relative to the minimum device size required for placement (without partitioning constraints), we add $10\%$ to the complex block grid width and $10\%$ to the grid height. Figure \ref{fig:graph_topology_mcw} shows the geometric mean of the minimum channel width required for a successful route, for each hyperedge model.
 
@@ -104,7 +103,7 @@
 \label{fig:graph_topology_mcw}
 \end{figure}
 
-The star topology with 1/n edge weights achieves the best minimum channel width, confirming that hyperedge cutsize is a good proxy for routability. Consequently, we use this model in all future results in this paper. Unlike routability, we found that the post-routing critical path delay was not strongly impacted by the hypergraph model used.
+The star topology with $1/n$ edge weights achieves the best minimum channel width, confirming that hyperedge cutsize is a good proxy for routability. Consequently, we use this model in all future results in this paper. Unlike routability, we found that the post-routing critical path delay was not strongly impacted by the hypergraph model used.
 
 \subsection{Partitioner Stage Results}\label{sec:partitioner_stage_results}
 We compare the performance of the four CAD flow variations described in Section \ref{sec:partitioner_stage} on the 8 largest VTR benchmarks. We again use an unbalance of $5\%$ and split the clustered netlist into two partitions (constraining the packer and/or the placer, as required by the flow under test). The interposer parameters (\% wires cut = $80\%$ and delay increase = 1 ns) and bloat factor ($10\%$ in each dimension) are the same as in Section \ref{sec:hypergraph_to_graph}. Figure \ref{fig:flows_mcw} shows the minimum routable channel width for each partitioning CAD flow variation.
diff --git a/doc/journal/fpgapaperfinal/FPGAPaper/response_to_reviewers.txt b/doc/journal/fpgapaperfinal/FPGAPaper/response_to_reviewers.txt
index dd62eb6..749a243 100644
--- a/doc/journal/fpgapaperfinal/FPGAPaper/response_to_reviewers.txt
+++ b/doc/journal/fpgapaperfinal/FPGAPaper/response_to_reviewers.txt
@@ -8,21 +8,21 @@
 

 The paper presents significant space talking about how to transform a hyper graph to a graph using various models.  This has been well studied in the past.  It would be useful to present the proposed techniques in the context of this previous work.  As a start, there are numerous references within reference [22] that should be considered.  Doing so would help the reader understand the novelty of the hypergraph->graph methodology and results.

 

-[Javeed to add survey paper reference and discussion on why we use a non-dummy node star topology]

+[Javeed to add survey paper reference and discussion on why we use a non-dummy node star topology][DONE]

 

 The partitioning flow does not appear to be timing driven.  Given that, I wonder how meaningful the results in Section VI-C (Circuit speed vs. interposer delay) are.  These results show that the critical path delay is strongly affected by the delay_increase, but any reasonable timing-driven CAD flow would try to keep the critical path on one side of the partition to minimize the number of critical path segments crossing a partition.  Can you comment on this?  Can you at least provide data regarding how many cut crossings are typical on the critical paths?

 

-[Javeed add table on number of cut crossings on critical paths vs. total connections on critical paths.  Also some discussion on timing-driven partitioning -- our star model should help timing, and a timing-driven partitioner might help further but will the challenge of estimating critical paths accurately is hard.  Could reference Mike Hutton paper on APEX 20k timing-driven partitioning.  

+[Javeed add table on number of cut crossings on critical paths vs. total connections on critical paths.  Also some discussion on timing-driven partitioning -- our star model should help timing, and a timing-driven partitioner might help further, but estimating critical paths accurately is hard.  Could reference Mike Hutton paper on APEX 20k timing-driven partitioning.

 

 Ref is:  Hutton, M., Adibsamii, K. and Leaver, A. “Adaptive Delay Estimation for Partitioning-Driven PLD Placement”, IEEE Trans. VLSI, 11:1, pp. 60-63, 2003.

 

 My summary of the paper is: 

-"A commercial recursive partitioning placement algorithm for the Altera Apex 20K family is described in [92]. Apex has a hierarchical routing architecture, making it well suited to partitioning-based placement. Recursive partitioning is conducted along the natural cut-lines formed by the various hierarchy levels of the routing architecture, as shown in Figure 13. Notice that the sequence of partitions in this algorithm is significantly different than that of ALTOR, showing the large impact an FPGA’s routing architecture has on placement algorithms. This algorithm is made timing-driven by weighting connections with low slack highly during each partitioning phase to encourage partitioning solutions in which such connections can be routed using only fast, lower-hierarchy-level routing. To improve the prediction of the critical path, the delay estimate for each connection is a function both of the known number of hierarchy boundaries the net must traverse due to partitionings at the higher levels of the routing hierarchy, and statistical estimates of how many hierarchy boundaries the connection will cross at future partitioning steps. "]

+"A commercial recursive partitioning placement algorithm for the Altera Apex 20K family is described in [92]. Apex has a hierarchical routing architecture, making it well suited to partitioning-based placement. Recursive partitioning is conducted along the natural cut-lines formed by the various hierarchy levels of the routing architecture, as shown in Figure 13. Notice that the sequence of partitions in this algorithm is significantly different than that of ALTOR, showing the large impact an FPGA’s routing architecture has on placement algorithms. This algorithm is made timing-driven by weighting connections with low slack highly during each partitioning phase to encourage partitioning solutions in which such connections can be routed using only fast, lower-hierarchy-level routing. To improve the prediction of the critical path, the delay estimate for each connection is a function both of the known number of hierarchy boundaries the net must traverse due to partitionings at the higher levels of the routing hierarchy, and statistical estimates of how many hierarchy boundaries the connection will cross at future partitioning steps. "] [DONE]

 

 

 A key result that would help the reader interpret the results would be a table that shows, for each benchmark circuit, the number of signals that cross the partition (independent of the place and route).   It is important to break this down by benchmark circuit.  If one was to build such a device, it would be important to consider the range of cut requirements rather than just the average.  This is especially important considering the results of Figure 23 which suggests that increasing the channel width is not an efficient way to make up for a lack of cut capacity.

 

-[Javeed to get table and add to paper if feasible.  Try to add in same table as critical path one.  Best flow, 2 partitions, cut net count vs. benchmark and cut net per channel vs. benchmark.]

+[Javeed to get table and add to paper if feasible.  Try to add in same table as critical path one.  Best flow, 2 partitions, cut net count vs. benchmark and cut net per channel vs. benchmark.] [DONE, but p&r dependent]

 

 In Figure 7, it was not clear to me why, for 30% of the wires cut, the placer optimizations do worse than the placer without optimizations.  Is this just experimental noise?

 

@@ -56,7 +56,7 @@
 [ We have added a discussion of why we choose a relatively tight balance constraint of 5% to Section V.B.  

 We assumed a 5% balance constraint to give the partitioner some flexibility to minimize cut size, while still forcing the partitions to be within 5% of balanced so that we can implement designs in fairly full FPGAs (e.g. 90% logic utilization).  If we allowed a more relaxed balance constraint of 10%, then a circuit could be split into 60% and 40% portions, and a 2-die interposer system would need to provide enough circuitry in one partition to support 60% of the design logic. This implies the interposer system would need 120% (2 x 60%) of the logic (or DSP, or RAM if they are the limiting resource) required by the circuit.  This would limit logic utilization to 83% of the system (1/1.2), but many FPGA designs go beyond that level of utilization.] 

 

-[ Javeed: try a balance constraint that reflects the real limits in each die. ]

+[ Javeed: try a balance constraint that reflects the real limits in each die. ] [ DONE? Need to respond.]

 

 4. In line 50 of Section V B, are you giving any special consideration to DSP or Block-RAM etc? Or are those treated the same way you treat simple LUTs and Flops?

 

@@ -103,15 +103,15 @@
 

 For the discussion about hypergraph to graph transformation in Sec V.C, there were many papers discussing how to set the edge weights in the 90's, please cite them (for example, see the survey paper "Recent directions in netlist partioning: a survey"). In particular, setting the edge weight to 1/(n-1) instead of 1/n is a popular choice since it can model a 2-pin net exactly. So, it makes sense to report the results for 1/(n-1). It looks like your star model picks one node of a net and connect it to other nodes of the net, but the more popular star model in the physical design literature would add a dummy node for each net that connects to all nodes of the net. So please explicitly define your star model in words. And from p.9, it looks like that you also want to distinguish the source of a net from the sinks of a net in the star model, if so, please state that in the definition of your star model.

 

-[Javeed: make explicit and correct text if it says we make all hyperedges the same weight with 1/n (they will vary from 1/2 to 1 in total hyperedge weight).]

+[Javeed: make explicit and correct text if it says we make all hyperedges the same weight with 1/n (they will vary from 1/2 to 1 in total hyperedge weight).] [DONE]

 

 p.9, middle of left column said "the star topology clearly differentiates sources from sinks and this appears to give the partitioner an anchor point that pulls all hyperedge fanouts toward the source", you should mention that Metis is an iterative improvement-based partitioner which is related to your reasoning.

 

-[ Javeed: add.]

+[ Javeed: add.] [DONE]

 

 p.9, left column, 2nd last paragraph, your star 1 edge weight scheme actually assigns a total weight of n-1 (not n) to each net. Similarly, your star 1/n model assigns a total weight of (n-1)/n (not 1) to each net.

 

-[Javeed: fix.]

+[Javeed: fix.] [DONE]

 

 Some writing issues:

 

@@ -136,7 +136,8 @@
 

 This is not satisfying to me at all. If the correct solution is to use hypergraph partitioning with heterogeneous balance constraints, then the authors should make appropriate modifications to hMETIS to produce the appropriate partitioning tool. It is unclear how much relevant information is lost by transforming the hypergraph to a graph, and what the impact on relative CAD metrics (W_min, f_max, etc) would be.

 

-[Javeed: mention hMetis is not open source (emailed person), give results on hmetis vs. metis cut size (with and without balance), and mention the star model should help timing, as timing is a connection / edge, phenomenon, not a net / hyperedge phenomenon.]

+[Javeed: mention hMetis is not open source (emailed person), give results on hmetis vs. metis cut size (with and without balance), and mention the star model should help timing, as timing is a connection / edge, phenomenon, not a net / hyperedge phenomenon.] [DONE]

+The hMetis source code is not freely available. We contacted the authors of hMetis, who stated that an hMetis update is in development and will contain support for heterogeneous balance constraints. In our experiments, hMetis achieved a 30% better cut size on average than Metis on our benchmark circuits when both were used with homogeneous balance constraints.

 

 

 (2) I disagree with the case for balance constraints

@@ -154,7 +155,7 @@
 You are correct that if we targeted lower utilization rates for the interposer systems we could relax the balance constraint, and this might help routability. However, we do not feel this is the best point at which to evaluate interposer routability. Designers are accustomed to using most of an FPGA's logic, RAM, etc. capacity and will have the same expectation for interposer-based FPGAs.  Commercial monolithic FPGAs are architected to have a high probability of successful routing at utilizations of 90% and even higher. Partially this is because the routing is cheaper and the function blocks more expensive than was assumed in DeHon's 1999 paper and partly this is because an FPGA is usually selected and integrated into a board before the FPGA design is complete. While designers can estimate logic, RAM and multiplier utlization with reasonable accuracy even before a design is complete, routability estimation is very hard, so it would be difficult to choose a chip before a design was 100% complete.]

 

 

- Javeed: do the tuned balance / limit ].

+ Javeed: do the tuned balance / limit ]. [DONE?]

 

 ==

 

@@ -212,4 +213,4 @@
 

 

 Fig. 23 is somehow redundant, as it is simply a different way to display data that was reported earlier.

-[You are correct that Fig. 23 is simply a different way to visualize the data of Fig. 22. We prefer to retain the figure however, as it highlights that the interposer wiring capacity becomes the dominant factor in routability when its absolute signal count drops below a certain level. Other figures show the within-die channel width required when one cuts a certain percentage of the normal FPGA wiring across the interposer, while this figure presents the same data as within-die channel width required vs. the absolute (not relative) interposer wiring capacity. We believe this highlights the important fact that the interposer for this FPGA architecture requires about 20 wires per vertical channel at the absolute minimum.]
\ No newline at end of file
+[You are correct that Fig. 23 is simply a different way to visualize the data of Fig. 22. We prefer to retain the figure however, as it highlights that the interposer wiring capacity becomes the dominant factor in routability when its absolute signal count drops below a certain level. Other figures show the within-die channel width required when one cuts a certain percentage of the normal FPGA wiring across the interposer, while this figure presents the same data as within-die channel width required vs. the absolute (not relative) interposer wiring capacity. We believe this highlights the important fact that the interposer for this FPGA architecture requires about 20 wires per vertical channel at the absolute minimum.]