# Source code for chemdataextractor.biblio.bibtex

# -*- coding: utf-8 -*-
"""
BibTeX parser.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from collections import OrderedDict
import json
import re

from ..text.latex import latex_to_unicode


class BibtexParser(object):
    """A class for parsing a BibTeX string into JSON or a python data structure.

    Example usage:

    .. code-block:: python

        with open('example.bib', 'r') as f:
            bib = BibtexParser(f.read())
            bib.parse()
            print(bib.records_list)
            print(bib.json)

    """

    def __init__(self, data, **kwargs):
        """Initialize BibtexParser with data.

        Optional metadata passed as keyword arguments will be included in the JSON output.
        e.g. collection, label, description, id, owner, created, modified, source

        Example usage:

        .. code-block:: python

            bib = BibtexParser(data, created=str(datetime.utcnow()), owner='mcs07')

        :param data: The BibTeX source text to parse.
        :param kwargs: Arbitrary metadata, merged into the ``metadata`` property.
        """
        self.data = data
        self.meta = kwargs
        self._token = None
        self.token_type = None
        # Lazy token stream: each match is either a run of ordinary characters, a single
        # whitespace character, or a single BibTeX special character.
        self._tokens = re.compile(r'([^\s"\'#%@{}()=,]+|\s|"|\'|#|%|@|{|}|\(|\)|=|,)').finditer(self.data)
        self.mode = None
        # @string definitions, keyed by name.
        self.definitions = {}
        # Parsed records, keyed by citation key, in order of appearance.
        self.records = OrderedDict()
        # Key name normalizations
        self.keynorms = {
            u'keyw': u'keyword',
            u'keywords': u'keyword',
            u'authors': u'author',
            u'editors': u'editor',
            u'url': u'link',
            u'urls': u'link',
            u'links': u'link',
            u'subjects': u'subject'
        }

    def _next_token(self, skipws=True):
        """Increment _token to the next token and return it.

        :param skipws: If True, whitespace tokens are skipped.
        :raises StopIteration: When the token stream is exhausted.
        """
        # Loop instead of recursing: every whitespace character is its own token, so a
        # long run of whitespace would otherwise exceed the recursion limit.
        while True:
            self._token = next(self._tokens).group(0)
            if not (skipws and self._token.isspace()):
                return self._token

    def parse(self):
        """Parse self.data and store the parsed BibTeX to self.records."""
        while True:
            try:
                # TODO: If self._next_token() == '%' skip to newline?
                if self._next_token() == '@':
                    self._parse_entry()
            except StopIteration:
                # End of input (possibly in the middle of an entry): stop parsing.
                break

    def _parse_entry(self):
        """Parse an entry. @comment and @preamble entries are ignored."""
        entry_type = self._next_token().lower()
        if entry_type == 'string':
            self._parse_string()
        elif entry_type not in ['comment', 'preamble']:
            self._parse_record(entry_type)

    def _parse_string(self):
        """Parse a string entry and store the definition."""
        if self._next_token() in ['{', '(']:
            field = self._parse_field()
            if field:
                self.definitions[field[0]] = field[1]

    def _parse_record(self, record_type):
        """Parse a record and store it in self.records under its citation key."""
        if self._next_token() in ['{', '(']:
            key = self._next_token()
            self.records[key] = {
                u'id': key,
                u'type': record_type.lower()
            }
            if self._next_token() == ',':
                while True:
                    field = self._parse_field()
                    if field:
                        k, v = field[0], field[1]
                        if k in self.keynorms:
                            k = self.keynorms[k]
                        if k == 'pages':
                            v = v.replace(' ', '').replace('--', '-')
                        if k == 'author' or k == 'editor':
                            v = self.parse_names(v)
                        else:
                            # Titles are deliberately not recapitalized: doing so generally
                            # causes more problems than it solves.
                            v = latex_to_unicode(v)
                        self.records[key][k] = v
                    # The token that terminated the field decides whether another follows.
                    if self._token != ',':
                        break

    def _parse_field(self):
        """Parse a Field.

        :returns: A ``(name, value)`` tuple, or None if no ``name = value`` pair is found.
        """
        name = self._next_token()
        # A closing delimiter here means the previous field ended with a trailing comma.
        # Stop without consuming another token, which could be the '@' of the next entry.
        if name in ('}', ')'):
            return None
        if self._next_token() == '=':
            value = self._parse_value()
            return name, value
        return None

    def _parse_value(self):
        """Parse a value. Digits, definitions, and the contents of double quotes or curly brackets."""
        val = []
        while True:
            t = self._next_token()
            if t == '"':
                # Quoted string: a double quote only terminates it outside nested braces.
                brac_counter = 0
                while True:
                    t = self._next_token(skipws=False)
                    if t == '{':
                        brac_counter += 1
                    if t == '}':
                        brac_counter -= 1
                    if t == '"' and brac_counter <= 0:
                        break
                    else:
                        val.append(t)
            elif t == '{':
                # Braced string: ends at the brace that closes the opening one.
                brac_counter = 0
                while True:
                    t = self._next_token(skipws=False)
                    if t == '{':
                        brac_counter += 1
                    if t == '}':
                        brac_counter -= 1
                    if brac_counter < 0:
                        break
                    else:
                        val.append(t)
            elif re.match(r'\w', t):
                # Bare word or number: substitute an @string definition if one exists,
                # otherwise keep the token itself (this also covers plain digits).
                val.extend([self.definitions.get(t, t), ' '])
            elif t == '#':
                # Concatenation operator: just carry on with the next piece.
                pass
            else:
                break
        # Collapse all runs of whitespace to single spaces.
        value = ' '.join(''.join(val).split())
        return value

    @classmethod
    def parse_names(cls, names):
        """Parse a string of names separated by "and" like in a BibTeX authors field.

        :param names: The raw field value.
        :returns: A list of names, each passed through latex_to_unicode.
        """
        names = [latex_to_unicode(n) for n in re.split(r'\sand\s(?=[^{}]*(?:\{|$))', names) if n]
        return names

    @property
    def size(self):
        """Return the number of records parsed."""
        return len(self.records)

    @property
    def records_list(self):
        """Return the records as a list of dictionaries."""
        return list(self.records.values())

    @property
    def metadata(self):
        """Return metadata for the parsed collection of records."""
        auto = {u'records': self.size}
        auto.update(self.meta)
        return auto

    @property
    def json(self):
        """Return a list of records as a JSON string. Follows the BibJSON convention."""
        # Use records_list: a dict values view is not JSON serializable on Python 3.
        return json.dumps(OrderedDict([('metadata', self.metadata), ('records', self.records_list)]))
def parse_bibtex(data):
    """Parse a BibTeX string and return its records as a list of dictionaries.

    Convenience wrapper that builds a BibtexParser for ``data``, runs it, and
    hands back its ``records_list``.
    """
    parser = BibtexParser(data)
    parser.parse()
    parsed_records = parser.records_list
    return parsed_records
# TODO: Improvements to BibtexParser
# - Initialize with options, then pass text to .parse method to reuse an instance?
# - Initialize with a single entry, and have attributes that correspond to the bibtex fields?
# - Have a classmethod that takes text containing multiple entries, then returns a list of instances
# - Have a list wrapper class that allows serialization of all at once?

# TODO: BibtexWriter - write python dict or BibJSON to BibTeX