Source code for chemdataextractor.reader.uspto

# -*- coding: utf-8 -*-
"""
Readers for USPTO patents.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from ..scrape.clean import clean
#from ..doc.table import Table
from ..doc.table import Table
from ..doc.text import Caption, Footnote, Cell
from .markup import XmlReader


# TODO: The below has only been tested with us-patent-grant-v42


class UsptoXmlReader(XmlReader):
    """Reader for USPTO XML documents.

    Declares the CSS selectors used to locate the title, headings, tables and
    references in a ``us-patent-grant`` document, and provides table parsing
    that infers a table's label, caption, heading rows and footnotes from the
    layout of its rows.
    """

    cleaners = [clean]  # tidy_nlm_references, space_labels

    root_css = 'us-patent-grant'  # TODO: Other roots
    title_css = 'invention-title'
    heading_css = 'heading, p[id^="h-"]'
    table_css = 'table'
    table_body_row_css = 'table row'
    table_cell_css = 'entry'
    # figure_css = 'img'
    reference_css = 'claim-ref'
    # citation_css = 'ref-list ref'
    ignore_css = 'us-bibliographic-data-grant *:not(invention-title)'

    # Tag names listed as inline elements.
    # NOTE(review): the W3C MathML namespace URI is mixed-case
    # ("http://www.w3.org/1998/Math/MathML"); confirm that tags are
    # lower-cased before being compared against the entries below.
    inline_elements = {
        'b', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'cite', 'code',
        'dfn', 'em', 'kbd', 'strong', 'samp', 'var', 'a', 'bdo', 'br', 'img',
        'map', 'object', 'q', 'script', 'span', 'sub', 'sup', 'button',
        'input', 'label', 'select', 'textarea', 'blink', 'font', 'marquee',
        'nobr', 's', 'strike', 'u', 'wbr', 'xref', 'underline', 'italic',
        'bold', 'inline-formula', 'alternatives', 'tex-math',
        '{http://www.w3.org/1998/math/mathml}math',
        '{http://www.w3.org/1998/math/mathml}msubsup',
        '{http://www.w3.org/1998/math/mathml}mrow',
        '{http://www.w3.org/1998/math/mathml}mo',
        '{http://www.w3.org/1998/math/mathml}mi',
        '{http://www.w3.org/1998/math/mathml}mn',
        'claim-ref', 'figref',
    }

    def detect(self, fstring, fname=None):
        """Return True if the input looks like a USPTO patent grant document.

        :param fstring: Raw file contents; searched for the byte string
            ``b'us-patent-grant'``.
        :param fname: Optional file name. When given and it does not end in
            ``.xml`` (case-insensitive), the input is rejected outright.
        :returns: True when the root marker is found, otherwise False.
        """
        if fname and not fname.lower().endswith('.xml'):
            return False
        if b'us-patent-grant' in fstring:
            return True
        # TODO: Other DTDs
        return False

    def _cell_grid(self, row_els, refs, specials):
        """Expand row elements into a list of rows of cells, honouring spans.

        Every cell element is parsed once and then placed into each
        (row, column) slot covered by its ``colspan``/``rowspan``. Slots that
        were already claimed by a spanning cell from an earlier row are
        skipped over.

        :param row_els: Iterable of row elements.
        :param refs: Passed through to ``_parse_text``.
        :param specials: Passed through to ``_parse_text``.
        :returns: List of rows (lists of cells), ordered by row index and
            then by column index. Rows may have different lengths.
        """
        # NOTE(review): spans are read from 'colspan'/'rowspan' only; confirm
        # whether USPTO tables carry CALS-style 'morerows'/'namest'/'nameend'
        # attributes instead.
        grid = {}
        for row, tr in enumerate(row_els):
            colnum = 0
            for td in self._css(self.table_cell_css, tr):
                cell = self._parse_text(td, refs=refs, specials=specials, element_cls=Cell)
                colspan = int(td.get('colspan', '1'))
                rowspan = int(td.get('rowspan', '1'))
                for _ in range(colspan):
                    for j in range(rowspan):
                        rownum = row + j
                        if rownum not in grid:
                            grid[rownum] = {}
                        # Skip columns already occupied in this row.
                        while colnum in grid[rownum]:
                            colnum += 1
                        grid[rownum][colnum] = cell[0] if len(cell) > 0 else Cell('')
                    colnum += 1
        return [[grid[r][c] for c in sorted(grid[r])] for r in sorted(grid)]

    @staticmethod
    def _is_blank_row(row):
        """Return True if every cell in ``row`` has only whitespace text."""
        return not any(cell.text.strip() for cell in row)

    @staticmethod
    def _pad_rows(rows):
        """Right-pad each row in place with empty cells to the widest row."""
        if not rows:
            return
        # Computed once: padding never makes any row longer than this.
        width = max(len(r) for r in rows)
        for r in rows:
            r.extend([Cell('')] * (width - len(r)))

    def _parse_table(self, el, refs, specials):
        """Parse a table element into a single ``Table``.

        Rows are classified in document order:

        * blank rows are dropped;
        * before any body row has been seen, a single-cell row whose text
          starts with "table " becomes the label, and the next single-cell
          row becomes the caption;
        * before any body row has been seen, a row in the top part of the
          table that is followed by a blank row is treated as a heading row;
        * once every remaining row has exactly one cell, rows are footnotes;
        * everything else is a body row.

        :param el: The table element.
        :param refs: Passed through to ``_parse_text``.
        :param specials: Passed through to ``_parse_text``.
        :returns: A one-element list containing the ``Table``.
        """
        potential_rows = self._cell_grid(self._css(self.table_body_row_css, el), refs, specials)

        hrows = []
        rows = []
        label = None
        caption = None
        footnotes = []
        # Headings are only looked for in the first 10 rows, and never
        # beyond the first half of the table.
        max_heading_row = min(10, len(potential_rows) // 2)

        for i, r in enumerate(potential_rows):
            # Skip empty rows
            if self._is_blank_row(r):
                continue
            if not rows:
                if len(r) == 1:
                    # Top row label?
                    if r[0].text.lower().startswith('table ') and not label:
                        label = r[0].text
                        continue
                    # Top row caption?
                    if r[0].text.strip() and not caption:
                        caption = Caption(r[0].text)
                        continue
                # Top row heading? If any blank rows between here and the
                # heading limit, this is a heading.
                if i < max_heading_row and any(
                    self._is_blank_row(nextrow)
                    for nextrow in potential_rows[i + 1:max_heading_row]
                ):
                    hrows.append(r)
                    continue
            # Footnotes in final rows? (all remaining rows only have 1 cell)
            if all(len(frow) == 1 for frow in potential_rows[i:]):
                footnotes.append(Footnote(r[0].text))
                continue
            rows.append(r)

        self._pad_rows(hrows)
        self._pad_rows(rows)
        rows = [r for r in rows if any(r)]
        tab = Table(
            label=label,
            caption=caption or Caption(''),
            headings=hrows,
            rows=rows,
            footnotes=footnotes,
            id=el.get('id', None),
        )
        return [tab]

    def _parse_table_rows(self, els, refs, specials):
        """Parse row elements into equal-length rows of cells.

        :param els: Iterable of row elements.
        :param refs: Passed through to ``_parse_text``.
        :param specials: Passed through to ``_parse_text``.
        :returns: List of rows, padded with empty cells to a common width,
            with rows containing no truthy cell removed.
        """
        rows = self._cell_grid(els, refs, specials)
        self._pad_rows(rows)
        return [r for r in rows if any(r)]

    def _parse_table_footnotes(self, fns, refs, specials):
        """Parse each footnote element and return the first ``Footnote`` of each.

        :param fns: Iterable of footnote elements.
        :param refs: Passed through to ``_parse_text``.
        :param specials: Passed through to ``_parse_text``.
        :returns: List with one ``Footnote`` per element in ``fns``.
        """
        return [
            self._parse_text(fn, refs=refs, specials=specials, element_cls=Footnote)[0]
            for fn in fns
        ]