# -*- coding: utf-8 -*-
"""
Readers for USPTO patents.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from ..scrape.clean import clean
#from ..doc.table import Table
from ..doc.table import Table
from ..doc.text import Caption, Footnote, Cell
from .markup import XmlReader
# TODO: The below has only been tested with us-patent-grant-v42
class UsptoXmlReader(XmlReader):
    """Reader for USPTO XML documents.

    Only documents rooted at ``us-patent-grant`` are handled (see ``root_css``).
    ``_parse_table`` is overridden so that the rows of a table are classified
    heuristically into label, caption, heading, body and footnote rows.
    """

    cleaners = [clean]  # tidy_nlm_references, space_labels

    # CSS selectors used to locate the parts of the document.
    root_css = 'us-patent-grant'  # TODO: Other roots
    title_css = 'invention-title'
    heading_css = 'heading, p[id^="h-"]'
    table_css = 'table'
    table_body_row_css = 'table row'
    table_cell_css = 'entry'
    # figure_css = 'img'
    reference_css = 'claim-ref'
    # citation_css = 'ref-list ref'
    ignore_css = 'us-bibliographic-data-grant *:not(invention-title)'

    # Tag names treated as inline elements.
    inline_elements = {
        'b', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'cite', 'code', 'dfn', 'em', 'kbd', 'strong', 'samp', 'var',
        'a', 'bdo', 'br', 'img', 'map', 'object', 'q', 'script', 'span', 'sub', 'sup', 'button', 'input', 'label',
        'select', 'textarea', 'blink', 'font', 'marquee', 'nobr', 's', 'strike', 'u', 'wbr',
        'xref', 'underline', 'italic', 'bold', 'inline-formula', 'alternatives', 'tex-math',
        '{http://www.w3.org/1998/math/mathml}math', '{http://www.w3.org/1998/math/mathml}msubsup',
        '{http://www.w3.org/1998/math/mathml}mrow', '{http://www.w3.org/1998/math/mathml}mo',
        '{http://www.w3.org/1998/math/mathml}mi', '{http://www.w3.org/1998/math/mathml}mn',
        'claim-ref', 'figref'
    }

    def detect(self, fstring, fname=None):
        """Return True if the input looks like a USPTO patent grant XML document.

        :param fstring: File contents, as bytes or as text.
        :param fname: Optional filename. If given and it does not end in
            ``.xml`` (case-insensitive), the input is rejected outright.
        :returns: True if the ``us-patent-grant`` marker occurs in ``fstring``.
        """
        if fname and not fname.lower().endswith('.xml'):
            return False
        # Match the marker type to the input: on Python 3, testing a bytes
        # marker against text raises TypeError. ``type('')`` is the text type
        # on both Python 2 (given this module's unicode_literals import) and 3.
        text_type = type('')
        marker = 'us-patent-grant' if isinstance(fstring, text_type) else b'us-patent-grant'
        # TODO: Other DTDs
        return marker in fstring

    def _expand_spans(self, row_els, refs, specials):
        """Parse row elements into a list of rows of cells, expanding spans.

        A cell with ``colspan``/``rowspan`` attributes is repeated in every
        grid position it covers. Positions already taken by a span from an
        earlier row are skipped over.

        :param row_els: Iterable of table row elements.
        :param refs: Passed through to ``_parse_text``.
        :param specials: Passed through to ``_parse_text``.
        :returns: Rows in ascending row order, each a list of cells in
            ascending column order.
        """
        grid = {}
        for row, tr in enumerate(row_els):
            colnum = 0
            for td in self._css(self.table_cell_css, tr):
                parsed = self._parse_text(td, refs=refs, specials=specials, element_cls=Cell)
                colspan = int(td.get('colspan', '1'))
                rowspan = int(td.get('rowspan', '1'))
                for _ in range(colspan):
                    for offset in range(rowspan):
                        occupied = grid.setdefault(row + offset, {})
                        # Skip columns already filled by an earlier span.
                        while colnum in occupied:
                            colnum += 1
                        # An entry that yields no text still needs a placeholder.
                        occupied[colnum] = parsed[0] if len(parsed) > 0 else Cell('')
                    colnum += 1
        return [[grid[r][c] for c in sorted(grid[r])] for r in sorted(grid)]

    @staticmethod
    def _pad_to_width(rows):
        """Pad every row in place with empty cells up to the longest row's length."""
        if not rows:
            return
        width = max(len(r) for r in rows)
        for r in rows:
            # One fresh Cell per slot: ``[Cell('')] * k`` would put the same
            # instance in every padded position.
            r.extend(Cell('') for _ in range(width - len(r)))

    @staticmethod
    def _is_blank_row(row):
        """Return True if every cell in the row has only whitespace text."""
        return all(cell.text.strip() == '' for cell in row)

    def _parse_table(self, el, refs, specials):
        """Parse a table element into a single-item list containing a ``Table``.

        Blank rows are dropped. Each remaining row is classified in order:

        - before any body row, a single-cell row starting with ``table `` is
          the label (first one only);
        - before any body row, any other single-cell row is the caption
          (first one only);
        - before any body row, a row is a heading if a blank row follows it
          within the heading window (the first half of the table, capped at
          10 rows);
        - a row is a footnote if it and every row after it has one cell;
        - anything else is a body row.

        :param el: The table element.
        :param refs: Passed through to ``_parse_text``.
        :param specials: Passed through to ``_parse_text``.
        :returns: ``[Table]``.
        """
        potential_rows = self._expand_spans(self._css(self.table_body_row_css, el), refs, specials)

        # Heading rows are only looked for before this row index.
        max_heading_row = min(10, int(len(potential_rows) / 2))

        # Index from which every remaining row has exactly one cell.
        footnote_start = len(potential_rows)
        while footnote_start > 0 and len(potential_rows[footnote_start - 1]) == 1:
            footnote_start -= 1

        hrows = []
        rows = []
        label = None
        caption = None
        footnotes = []
        for i, r in enumerate(potential_rows):
            # Skip empty rows
            if self._is_blank_row(r):
                continue
            if len(rows) == 0:
                # Top row label?
                if len(r) == 1 and r[0].text.lower().startswith('table ') and not label:
                    label = r[0].text
                    continue
                # Top row caption?
                if len(r) == 1 and r[0].text.strip() and not caption:
                    caption = Caption(r[0].text)
                    continue
                # Top row heading? Only if a blank row follows within the heading window.
                if i < max_heading_row:
                    following = potential_rows[i + 1:max_heading_row]
                    if any(self._is_blank_row(nextrow) for nextrow in following):
                        hrows.append(r)
                        continue
            # Footnotes in final rows? (all remaining rows only have 1 cell)
            if i >= footnote_start:
                footnotes.append(Footnote(r[0].text))
                continue
            rows.append(r)

        # Heading rows and body rows are padded independently of each other.
        self._pad_to_width(hrows)
        self._pad_to_width(rows)
        rows = [r for r in rows if any(r)]
        tab = Table(
            label=label,
            caption=caption or Caption(''),
            headings=hrows,
            rows=rows,
            footnotes=footnotes,
            id=el.get('id', None)
        )
        return [tab]

    def _parse_table_rows(self, els, refs, specials):
        """Parse row elements into equal-length rows of cells.

        :param els: Iterable of table row elements.
        :param refs: Passed through to ``_parse_text``.
        :param specials: Passed through to ``_parse_text``.
        :returns: List of rows with spans expanded, padded to equal length,
            keeping only rows for which ``any(row)`` is true.
        """
        rows = self._expand_spans(els, refs, specials)
        self._pad_to_width(rows)
        return [r for r in rows if any(r)]

    def _parse_table_footnotes(self, fns, refs, specials):
        """Parse footnote elements into a list of ``Footnote`` elements.

        The first item parsed from each element is used. Elements for which
        ``_parse_text`` returns nothing are skipped instead of raising
        ``IndexError``.

        :param fns: Iterable of footnote elements.
        :param refs: Passed through to ``_parse_text``.
        :param specials: Passed through to ``_parse_text``.
        """
        footnotes = []
        for fn in fns:
            parsed = self._parse_text(fn, refs=refs, specials=specials, element_cls=Footnote)
            if len(parsed) > 0:
                footnotes.append(parsed[0])
        return footnotes