| """ |
| A tool for extracting module port definitions from PDF specifications |
| """ |
| |
| # the following cases required specific adjustments to the base algorithm: |
| # CFGLUT5, IDDR, IDELAYCTRL, IDELAYE2, ISERDESE2, KEEPER, LUT6, LUT6_2, |
| # MMCME2_BASE, ODDR, OSERDESE2, PLLE2_BASE, RAM128X1D, RAM64M |
| # any changes to the algorithm should be checked against these entries |
| |
| # we use pdfminer to parse the PDF document and interpret the elements |
| # for python3 support you need to `pip install pdfminer.six` |
| from pdfminer.pdfparser import PDFParser |
| from pdfminer.pdfdocument import PDFDocument |
| from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter |
| from pdfminer.pdfpage import PDFPage |
| from pdfminer.converter import PDFLayoutAnalyzer |
| from pdfminer.layout import LAParams, LTContainer, LTTextLineHorizontal |
| |
| from collections import OrderedDict |
| import sys |
| import re |
| |
| from datetime import datetime |
| from lxml import objectify, etree |
| |
| PAGE_MARGIN = 60 # space ignored at the top and bottom of the page, for page header/footer |
| HEADER_MARGIN = 30 # space ignored at the start of the section due to the section header |
| COL_MARGIN = 5 # acceptable variation in x-position of entries in the same column |
| |
| |
def rev_enumerate(it, last=None):
    """Enumerate a sequence in reverse index order, so that entries can be
    popped from it while iterating; `last` is the index to start from
    (negative values count from the end), running down to index 0."""
    for i in range(len(it))[last::-1]:
        yield i, it[i]
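# illustrative behaviour of rev_enumerate (example values only):
#   list(rev_enumerate(['a', 'b', 'c']))      -> [(2, 'c'), (1, 'b'), (0, 'a')]
#   list(rev_enumerate(['a', 'b', 'c'], -2))  -> [(1, 'b'), (0, 'a')]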
| |
| |
| class PDFTableParser(PDFLayoutAnalyzer): |
| """Custom interpreter that hooks into pdfminer to process PDF text elements""" |
| |
| def __init__( |
| self, rsrcmgr, laparams=None, stop_at=None, top=None, bottom=None |
| ): |
| PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=1, laparams=laparams) |
| self.stop_at = stop_at |
| self.reset(top, bottom) |
| |
| def process_text(self, bbox, txt, obj): |
        # there is no guarantee on the order in which text strings appear,
        # so store them all and sort them into order before post-processing
| if self.stop_at is not None and txt.startswith(self.stop_at): |
| self.done = True |
| self.bottom = bbox[3] + COL_MARGIN |
| return True |
        txt = txt.replace(u'\u2019', "'")  # curly apostrophe (HEX constants)
        txt = txt.replace(u'\u2022', '\n*')  # bullet lists
        # curly double quotes, e.g. FRAME_ECCE2 (FRAME_RBT_IN_FILENAME)
        txt = txt.replace(u'\u201c', '"')
        txt = txt.replace(u'\u201d', '"')
| self.items.append([int(-bbox[3]), int(bbox[0]), txt]) |
| return True |
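    # each stored item is [int(-y_top), int(x_left), text]; negating the
    # y-coordinate means a plain ascending sort orders items top-to-bottom
    # and then left-to-right, e.g. (hypothetical coordinates):
    #   [[-700, 60, 'Port'], [-700, 200, 'Direction'], [-690, 60, 'CLKIN1']]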
| |
| def reset(self, top, bottom): |
| self.top = top |
| self.bottom = bottom |
| self.done = False |
| self.items = [] |
| |
| def receive_layout(self, ltpage): |
| if self.done: |
| return |
| |
| def render(item): |
            # skip items outside the region of interest (top/bottom bounds)
| if getattr(item, 'y1', self.bottom) <= self.bottom: |
| return True |
| if getattr(item, 'y0', self.top) >= self.top: |
| return True |
| # process individual lines of text |
| if isinstance(item, LTTextLineHorizontal): |
| if not self.process_text(item.bbox, item.get_text().strip(), |
| item): |
| return False |
| # process containers that (might) contain text (e.g. LTTextBoxHorizontal) |
| elif isinstance(item, LTContainer): |
| for child in item: |
| if not render(child): |
| return False |
| return True |
| |
| # default to processing the entire page |
| if self.top is None: |
| self.top = ltpage.mediabox[3] |
| if self.bottom is None: |
| self.bottom = ltpage.mediabox[1] |
| render(ltpage) |
| self.items[:] = sorted(self.items) |
| |
| # don't bother doing any image/line rendering |
| def render_image(self, name, stream): |
| return |
| |
| def paint_path(self, gstate, stroke, fill, evenodd, path): |
| return |
| |
| # data processing functions |
    def process_table(self):
        """Identify the table header (column titles) and the first-column row
        labels, joining multi-line labels and discarding anything beyond the
        detected end of the table"""
        # figure out the table header
| top = -self.bottom |
| self.heads = [] |
| for y, x, t in self.items: |
| if y < top: |
| top = y |
| for i, (y, x, t) in rev_enumerate(self.items): |
| if y < top + COL_MARGIN: |
| self.heads.append((x - COL_MARGIN, t)) |
| self.items.pop(i) |
| self.heads = sorted(self.heads) |
| # figure out the rows |
| leftcol = self.heads[1][0] |
| self.rows = [] |
| self.items[:] = sorted(self.items) |
| for i, (y, x, t) in rev_enumerate(self.items): |
| if x < leftcol: |
| self.items.pop(i) |
| if t[0] == '<' and t[-1] == '>': |
| continue # IDELAYE2 |
| self.rows.append((y, t)) |
| if t.startswith('NOTE:') or not ( |
| t.isupper() |
| or t in ['/', '-', 'to']) or y > -self.bottom: |
| self.bottom = -y |
| self.done = True |
| self.rows = [(y, t) for y, t in self.rows[::-1] if y < -self.bottom] |
| # join rows together if required |
| for i, x in rev_enumerate(self.rows, -3): |
| if self.rows[i + 1][1] in ('to', '/', '-'): |
| self.rows[i] = ( |
| x[0], x[1] + self.rows[i + 1][1] + self.rows.pop(i + 2)[1] |
| ) |
| self.rows.pop(i + 1) |
| for i, x in rev_enumerate(self.rows, -2): |
            if (x[1][-1] in ',_/' or x[1].endswith('to')
                    or self.rows[i + 1][1][0] == '_'
                    or x[0] == self.rows[i + 1][0]):
| self.rows[i] = (x[0], x[1] + self.rows.pop(i + 1)[1]) |
        # drop items past the end of the table and correct the y-positions
| for i, (y, x, t) in rev_enumerate(self.items): |
| if y >= -self.bottom: |
| self.items.pop(i) |
| else: |
| self.items[i][0] = int(y) |
| # now sort items by corrected y-positions |
| self.items = sorted(self.items) |
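    # after process_table() the layout is roughly (hypothetical values):
    #   self.heads = [(55, 'Port'), (195, 'Direction'), (255, 'Width'), ...]
    #   self.rows  = [(-680, 'CLKIN1'), (-660, 'CLKFBIN'), ...]
    # with self.items holding the remaining cell text for arrange_items()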
| |
    def get_row(self, y):
        """Return the index of the table row that a given y-position falls in"""
| for i, (y0, t) in enumerate(self.rows[1:]): |
| if y < y0 - COL_MARGIN: |
| return i |
| return len(self.rows) - 1 |
| |
    def arrange_items(self):
        """Assign each remaining text item to its (row, column) cell and
        return one dict per row, keyed by the column headers"""
| data = [] |
| for r in self.rows: |
| d = OrderedDict([(k[1], '') for k in self.heads]) |
| d[self.heads[0][1]] = r[1].replace(' ', '') |
| data.append(d) |
| for y, x, t in sorted(self.items): |
| for x0, head in self.heads[::-1]: |
| if x >= x0: |
| entry = data[self.get_row(y)] |
                    if (len(entry[head]) and entry[head][-1] != '_'
                            and t[0] != '_'):
| t = ' ' + t |
| entry[head] += t |
| break |
| return data |
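    # arrange_items() returns one dict per row keyed by column header, e.g.
    # (hypothetical): {'Port': 'CLKIN1', 'Direction': 'Input', 'Width': '1',
    #                  'Description': 'General clock input'}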
| |
| |
| def resolve_goto_action(doc, a): |
| """Resolves a "goto" action from the PDF outline into the associated page |
| object and position on that page""" |
| a = a.resolve() |
| assert a['S'].name == 'GoTo' |
| link = doc.get_dest(a['D']).resolve()['D'] |
| y = link[3] if link[1].name == 'XYZ' else 0 |
| return link[0], y |
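# the first element of the returned pair is the page object reference
# (matched against PDFPage.pageid via .objid in find_pages below); the
# second is the y-position of the section heading on that page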
| |
| |
| def find_pages(pgs, start, stop): |
| """Find all the pages between the resolved "start" and "stop" actions""" |
| extract = False |
| bottom = PAGE_MARGIN |
| for pg in pgs: |
| if pg.pageid == start[0].objid: |
| extract = True |
| top = start[1] - HEADER_MARGIN |
| else: |
| top = pg.mediabox[3] - PAGE_MARGIN |
| if pg.pageid == stop[0].objid: |
| bottom = stop[1] |
| if extract: |
| yield pg, top, bottom |
| if pg.pageid == stop[0].objid: |
| return |
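# each yielded (page, top, bottom) triple is later passed to
# PDFTableParser.reset() so only the relevant slice of each page is parsed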
| |
| |
| def parse_module_pages(doc, start_at): |
| """Deconstruct the PDF outline into a list of modules, with links to the |
| start and end of the associated "port descriptions" section""" |
| parts = OrderedDict() |
| module = None |
| start = None |
| process = False |
| pgs = PDFPage.create_pages(doc) |
| for (level, title, dest, a, se) in doc.get_outlines(): |
| if level == 2: # chapter titles |
| # modules are defined in chapter 4 |
| process = title.startswith('Ch. 4:') |
| elif process and level == 3: # module names are defined at level 3 of the TOC |
| module = title |
| elif level == 4 and module is not None and title.startswith(start_at): |
            # NB: possible inconsistency in "title" name (e.g. LUT6)
| start = resolve_goto_action(doc, a) |
| elif start is not None and module not in parts: # i.e. this is the FIRST following section |
| stop = resolve_goto_action(doc, a) |
| parts[module] = find_pages(pgs, start, stop) |
| start = None |
| return parts |
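# the returned mapping looks roughly like (hypothetical entry):
#   OrderedDict([('CFGLUT5', <generator of (page, top, bottom)>), ...])
# where each generator covers the pages spanning that module's section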
| |
| |
| def process_ports(tbl): |
| """Process the text elements corresponding to the "port descriptions" |
| table and return an ordered list of ports""" |
| if not len(tbl.items): |
| return [] |
| # fixes for MMCME2_BASE |
| for i, (y, x, t) in rev_enumerate(tbl.items): |
| if tbl.items[i][2] == 'Clock' and tbl.items[i + 1][2] == 'Inputs': |
| tbl.items[i][2] = 'CLKIN1' |
| tbl.items.pop(i + 1) |
| elif tbl.items[i][2] == 'Status' and tbl.items[i + 1][2] == 'Ports': |
| tbl.items[i][2] = 'LOCKED' |
| tbl.items.pop(i + 1) |
| elif t == 'Direction Width': |
| tbl.items[i][2] = 'Direction' |
| tbl.items.insert( |
| i + 1, (y, (x + tbl.items[i + 1][1]) / 2, 'Width') |
| ) |
| tbl.process_table() |
| # transform headers as necessary |
| for i, (x, name) in enumerate(tbl.heads): |
| if name.lower().startswith('direction'): |
| tbl.heads[i] = (x, 'Type') |
| # sort the items into categories |
| ports = tbl.arrange_items() |
| # process the rows one-by-one (in reverse, because we might insert new entries) |
| for i, x in rev_enumerate(ports): |
| # process the "width" entry |
| M = re.match( |
| r'([0-9]+)', x['Width'] |
| ) # remove any additional text (e.g. ODDR, KEEPER) |
| if M is None: |
| print( |
| '\tInvalid width %s on %s, skipping item' % |
| (repr(x['Width']), repr(x['Port'])) |
| ) |
| return [] |
| x['Width'] = wid = int(M.group(0)) |
| # process the "direction" entry |
| if x['Type'] == 'Input': |
| dir = 'input' |
| elif x['Type'] == 'Output': |
| dir = 'output' |
| elif x['Type'] == 'In/out': |
| dir = 'inout' |
| else: |
| assert False, 'Invalid pin type %s' % repr(x['Type']) |
| x['Type'] = dir |
| # process the port name |
| name = re.sub( |
| r'\s', '', x['Port'] |
| ) # remove any spaces from (multiline) name entries (e.g. MMCME2_BASE) |
| if '<' in name: # bus pins MIGHT be explicitly listed in name; perform sanity check |
| name, bits = name.split('<', 1) |
| assert bits == '%d:0>' % (wid - 1) |
| elif '-' in name: # entry is a range of pins (e.g. ISERDESE2) |
| n, start, stop = re.match( |
| r'([A-Z]+)([0-9]+)\s*-\s*[A-Z]+([0-9]+)', name |
| ).groups() |
| ports.pop(i) |
| for j in range(int(start), int(stop) + 1): |
| y = x.copy() |
| y['Port'] = n + '%d' % j |
| ports.insert(i + j - 1, y) |
| continue |
| # is the entry actually a LIST of pins? (e.g. CFGLUT5, OSERDESE2, ODDR) |
| M = re.split(r'[,/:]', name) |
| if len(M) > 1: |
| ports.pop(i) |
| for j, n in enumerate(M): |
| y = x.copy() |
| y['Port'] = n.strip() |
| ports.insert(i + j, y) |
| else: |
| x['Port'] = name |
| return ports |
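# a successful call returns one dict per pin, e.g. (hypothetical values):
#   {'Port': 'CLKIN1', 'Type': 'input', 'Width': 1, 'Description': '...'}
# with pin ranges (e.g. 'X1-X4') and comma/slash-separated lists already
# expanded into individual entries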
| |
| |
def process_attributes(tbl):
    """Process the text elements corresponding to the "available attributes"
    table and return an ordered list of attributes"""
    if not len(tbl.items):
        return []
| tbl.process_table() |
| # transform headers as necessary |
| for i, (x, name) in enumerate(tbl.heads): |
| if name in ('Allowed Values', 'Allowed_Values'): |
| tbl.heads[i] = (x, 'Allowed') |
| if name == 'Descriptions': |
| tbl.heads[i] = (x, 'Description') |
| # transform text as necessary |
| for i, (y, x, t) in enumerate(tbl.items): |
| t = t.replace(u'\u2122', '(tm)') # IOBUF (DRIVE) |
| t = t.replace('""', '"') # RAM18E1 (SIM_DEVICE) |
| tbl.items[i][2] = t |
| # sort the items into categories |
| attribs = tbl.arrange_items() |
| # post-process the entries |
| for i, x in rev_enumerate(attribs): |
        # ICAPE2 (SIM_CFG_FILE_NAME)
        if x['Type'] == 'STRING' and x['Default'] == 'None':
            x['Default'] = '""'
        # RAMB18E1 (WRITE_MODE_A)
        if x['Default'][0] == '"' and x['Default'][-1] != '"':
            s1, s2 = x['Default'].rsplit('"', 1)
            x['Default'] = s1 + '"'
            x['Description'] = s2 + ' ' + x['Description']
| if x['Default'].startswith('All'): |
| if 'one' in x['Default']: |
| val = 'F' |
| elif 'zero' in x['Default']: |
| val = '0' |
| else: |
| raise TypeError |
| assert x['Type'] == 'HEX' |
| M = re.search(r'(\d+)[-\s][Bb]it', x['Allowed']) |
| if M is None: |
| break |
| sz = int(M.group(1)) |
| pad = 1 if sz % 4 else 0 |
| x['Default'] = "%d'h%s" % (sz, val * ((sz // 4) + pad)) |
| elif x['Default'].startswith("0'h"): # ICAPE2 (DEVICE_ID) |
| x['Default'] = "32'h0" + x['Default'][3:] |
| if ',' in x['Attribute']: |
| attribs.pop(i) |
| for j, n in enumerate(x['Attribute'].split(',')): |
| n = n.strip() |
| if not len(n): |
| continue |
| y = x.copy() |
| y['Attribute'] = n |
| attribs.insert(i + j, y) |
| M = re.match( |
| r'([A-Z_]+)([0-9A-F]+)?(_[A-Z_]+)?to([A-Z_]+)([0-9A-F]+)(_[A-Z_]+)?', |
| x['Attribute'] |
| ) |
| if M is not None: |
| pre1, start, post1, pre2, stop, post2 = M.groups() |
| attribs.pop(i) |
| if start is None: |
| y = x.copy() |
| y['Attribute'] = pre1 |
| attribs.insert(i, y) |
| start = '0' |
| pre1 = pre2 |
| post1 = post2 |
| i += 1 |
| else: |
                assert (pre1 == pre2 and post1 == post2
                        and len(start) == len(stop))
| if post1 is None: |
| post1 = '' |
| nchar = len(stop) |
| if re.match(r'[0-9]+$', start) is not None and re.match( |
| r'[0-9]+$', stop) is not None: |
| start = int(start) |
| stop = int(stop) |
| fmt = '%s%0*d%s' |
| else: |
| start = int(start, 16) |
| stop = int(stop, 16) |
| fmt = '%s%0*X%s' |
| for j in range(start, stop + 1): |
| y = x.copy() |
| y['Attribute'] = fmt % (pre1, nchar, j, post1) |
| attribs.insert(i + j, y) |
| return attribs |
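# each returned attribute dict carries at least the keys 'Attribute', 'Type',
# 'Allowed', 'Default' and 'Description'; range-style names are expanded,
# e.g. (hypothetical) 'INIT_00toINIT_03' -> INIT_00, INIT_01, INIT_02, INIT_03,
# and 'All ones'/'All zeros' defaults become sized HEX literals, e.g. a
# 32-bit 'All ones' value is rewritten as "32'hFFFFFFFF"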
| |
| |
| def process_specs(infile, modules=None): |
| """Process the module specifications in the input PDF into an XML tree""" |
| # initialise the pdfminer interface -- |
| # we use a custom "render device" to receive the text objects in the PDF for further processing |
| resman = PDFResourceManager() |
| doc = PDFDocument(PDFParser(open(infile, 'rb'))) |
| laparams = LAParams( |
| line_margin=0.1, char_margin=0.7 |
| ) # parameters optimised to prevent incorrectly joining together words |
| |
| device = PDFTableParser(resman, laparams, stop_at='VHDL') |
| interpreter = PDFPageInterpreter(resman, device) |
| |
| # parse the PDF table of contents to figure out what modules exist and which pages to process |
| port_pages = parse_module_pages(doc, 'Port Desc') |
| if modules is None or len(modules) == 0: |
| modules = port_pages.keys() # default to processing ALL modules |
| attrib_list = parse_module_pages( |
| doc, 'Available Attrib' |
| ) # NB: not all modules have attributes |
| |
| # parse the specifications and generate an XML tree |
| E = objectify.ElementMaker(annotate=False) |
| root = E.xml(source=infile, processed=datetime.now().isoformat()) |
| |
| # run through the modules |
| for module in modules: |
| sys.stderr.write('Processing %s...\n' % module) |
| node = E.module(name=module) |
| # process the ports of this module |
| for pg, top, bottom in port_pages[module]: |
| device.reset(top, bottom) |
| interpreter.process_page(pg) |
| for P in process_ports(device): |
| node.append( |
| E.port( |
| name=P['Port'], type=P['Type'], width=str(P['Width']) |
| ) |
| ) |
| if device.done: |
| break |
| # process the attributes of this module |
| for pg, top, bottom in attrib_list.get(module, []): |
| device.reset(top, bottom) |
| interpreter.process_page(pg) |
| for A in process_attributes(device): |
| node.append( |
| E.attribute( |
| name=A['Attribute'], |
| type=A['Type'], |
| default=A['Default'].replace('"', ''), |
| values=A['Allowed'].replace('"', '') |
| ) |
| ) |
| if device.done: |
| break |
| # add it to the root object |
| root.append(node) |
| return root |
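# the resulting tree serialises to something along these lines (hypothetical
# content):
#   <xml source="..." processed="2024-01-01T00:00:00">
#     <module name="CFGLUT5">
#       <port name="CLK" type="input" width="1"/>
#       <attribute name="INIT" type="HEX" default="32'h00000000" values="..."/>
#     </module>
#   </xml>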
| |
| |
| if __name__ == '__main__': |
| import argparse |
| parser = argparse.ArgumentParser() |
| parser.add_argument( |
| '--input', |
| '-i', |
| nargs='?', |
| default='ug953-vivado-7series-libraries.pdf' |
| ) |
| parser.add_argument( |
| '--output', |
| '-o', |
| nargs='?', |
| type=argparse.FileType('w'), |
| default=sys.stdout |
| ) |
| parser.add_argument('--modules', '-m', nargs='*') |
| args = parser.parse_args() |
| |
| xml = process_specs(args.input, args.modules) |
| args.output.write(etree.tostring(xml, pretty_print=True).decode('ascii')) |
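
# example invocation (the script filename below is only a placeholder):
#   python extract_pdf_ports.py -i ug953-vivado-7series-libraries.pdf \
#       -m LUT6 RAM64M -o cells.xml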