# Source code for chemdataextractor.biblio.bibtex

# -*- coding: utf-8 -*-
"""
BibTeX parser.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from collections import OrderedDict
import json
import re

from ..text.latex import latex_to_unicode


class BibtexParser(object):
    """A class for parsing a BibTeX string into JSON or a python data structure.

    Example usage:

    .. code-block:: python

        with open('example.bib', 'r') as f:
            bib = BibtexParser(f.read())
            bib.parse()
            print(bib.records_list)
            print(bib.json)

    """

    def __init__(self, data, **kwargs):
        """Initialize BibtexParser with data.

        Optional metadata passed as keyword arguments will be included in the JSON output.
        e.g. collection, label, description, id, owner, created, modified, source

        Example usage:

        .. code-block:: python

            bib = BibtexParser(data, created='2015-01-01T00:00:00', owner='mcs07')

        :param data: The BibTeX text to parse.
        :param kwargs: Arbitrary metadata, merged into the ``metadata`` property.
        """
        self.data = data
        self.meta = kwargs
        # Most recently read token; the parse methods inspect it after a value has been read.
        self._token = None
        # token_type and mode are initialised here but not used by any method of this class.
        self.token_type = None
        # Lazy token stream: a token is either a run of ordinary characters, a single whitespace
        # character, or one of the BibTeX special characters.
        self._tokens = re.compile(r'([^\s"\'#%@{}()=,]+|\s|"|\'|#|%|@|{|}|\(|\)|=|,)').finditer(self.data)
        self.mode = None
        # @string definitions, mapping name -> replacement text.
        self.definitions = {}
        # Parsed records keyed by citation key, in order of appearance.
        self.records = OrderedDict()

        # Key name normalizations
        self.keynorms = {
            u'keyw': u'keyword',
            u'keywords': u'keyword',
            u'authors': u'author',
            u'editors': u'editor',
            u'url': u'link',
            u'urls': u'link',
            u'links': u'link',
            u'subjects': u'subject'
        }

    def _next_token(self, skipws=True):
        """Advance _token to the next token and return it.

        :param bool skipws: If True, whitespace tokens are skipped over.
        :raises StopIteration: When the token stream is exhausted.
        """
        self._token = next(self._tokens).group(0)
        # Iterate rather than recurse: every whitespace character is its own token, so a long run
        # of blank space would otherwise exceed the interpreter's recursion limit.
        while skipws and self._token.isspace():
            self._token = next(self._tokens).group(0)
        return self._token

    def parse(self):
        """Parse self.data and store the parsed BibTeX to self.records.

        Everything outside an ``@`` entry is ignored. Parsing stops silently when the input is
        exhausted, even if that happens in the middle of an entry.
        """
        while True:
            try:
                # TODO: If self._next_token() == '%' skip to newline?
                if self._next_token() == '@':
                    self._parse_entry()
            except StopIteration:
                # Raised by _next_token at any depth once there are no more tokens.
                break

    def _parse_entry(self):
        """Parse the entry that follows an ``@``, dispatching on its (case-insensitive) type.

        ``@string`` entries are stored as definitions, ``@comment`` and ``@preamble`` entries are
        ignored, and everything else is parsed as a record.
        """
        entry_type = self._next_token().lower()
        if entry_type == 'string':
            self._parse_string()
        elif entry_type not in ['comment', 'preamble']:
            self._parse_record(entry_type)

    def _parse_string(self):
        """Parse a string entry and store the definition."""
        if self._next_token() in ['{', '(']:
            field = self._parse_field()
            if field:
                self.definitions[field[0]] = field[1]

    def _parse_record(self, record_type):
        """Parse a record and store it in self.records under its citation key.

        :param record_type: The entry type that followed the ``@`` (e.g. ``article``).
        """
        if self._next_token() not in ['{', '(']:
            return
        key = self._next_token()
        # Stored before the fields are read, so a truncated record is still kept.
        record = {
            u'id': key,
            u'type': record_type.lower()
        }
        self.records[key] = record
        if self._next_token() != ',':
            return
        while True:
            field = self._parse_field()
            if field:
                k, v = field
                k = self.keynorms.get(k, k)
                if k == 'pages':
                    v = v.replace(' ', '').replace('--', '-')
                if k == 'author' or k == 'editor':
                    v = self.parse_names(v)
                else:
                    # Titles are deliberately not recapitalized; doing so generally causes more
                    # problems than it solves.
                    v = latex_to_unicode(v)
                record[k] = v
            # After a value is read, _token is the token that terminated it: a comma means
            # another field follows, anything else ends the record.
            if self._token != ',':
                break

    def _parse_field(self):
        """Parse a ``name = value`` field.

        :returns: A ``(name, value)`` tuple, or None if no field is present.
        """
        name = self._next_token()
        # A closing delimiter here means the previous field ended with a trailing comma. Stop
        # without reading further, otherwise the '@' of the following entry would be consumed
        # and that entry silently lost.
        if name in ['}', ')']:
            return None
        if self._next_token() == '=':
            return name, self._parse_value()
        return None

    def _parse_value(self):
        """Parse a value. Digits, definitions, and the contents of double quotes or curly brackets.

        Parts joined with ``#`` are concatenated, and runs of whitespace in the result are
        collapsed to single spaces. On return, self._token holds the token that ended the value.
        """
        val = []
        while True:
            t = self._next_token()
            if t == '"':
                # Quoted string: a double quote only terminates it outside nested braces.
                brac_counter = 0
                while True:
                    t = self._next_token(skipws=False)
                    if t == '{':
                        brac_counter += 1
                    if t == '}':
                        brac_counter -= 1
                    if t == '"' and brac_counter <= 0:
                        break
                    else:
                        val.append(t)
            elif t == '{':
                # Braced string: ends at the brace that closes the opening one.
                brac_counter = 0
                while True:
                    t = self._next_token(skipws=False)
                    if t == '{':
                        brac_counter += 1
                    if t == '}':
                        brac_counter -= 1
                    if brac_counter < 0:
                        break
                    else:
                        val.append(t)
            elif t.isdigit():
                # Bare number: taken literally. Checked before the general word case below,
                # which would otherwise always match first.
                val.extend([t, ' '])
            elif re.match(r'\w', t):
                # Bare word: substitute an @string definition if one exists.
                val.extend([self.definitions.get(t, t), ' '])
            elif t == '#':
                # Concatenation operator: just carry on with the next part.
                pass
            else:
                break

        value = ' '.join(''.join(val).split())
        return value

    @classmethod
    def parse_names(cls, names):
        """Parse a string of names separated by "and" like in a BibTeX authors field.

        An "and" enclosed in curly brackets is not treated as a separator.

        :param names: The raw field value.
        :returns: A list of names, each passed through latex_to_unicode.
        """
        names = [latex_to_unicode(n) for n in re.split(r'\sand\s(?=[^{}]*(?:\{|$))', names) if n]
        return names

    @property
    def size(self):
        """Return the number of records parsed."""
        return len(self.records)

    @property
    def records_list(self):
        """Return the records as a list of dictionaries."""
        return list(self.records.values())

    @property
    def metadata(self):
        """Return metadata for the parsed collection of records.

        Contains the record count plus any keyword arguments given at initialization (which take
        precedence over the automatic values).
        """
        auto = {u'records': self.size}
        auto.update(self.meta)
        return auto

    @property
    def json(self):
        """Return a list of records as a JSON string. Follows the BibJSON convention."""
        # records_list is a real list; a dict values view is not JSON serializable on Python 3.
        return json.dumps(OrderedDict([('metadata', self.metadata), ('records', self.records_list)]))


def parse_bibtex(data):
    """Parse a BibTeX string and return its records as a list of dictionaries.

    :param data: The BibTeX text to parse.
    :returns: One dictionary per record, in order of appearance.
    """
    bib = BibtexParser(data)
    bib.parse()
    return bib.records_list


# TODO: Improvements to BibtexParser
# - Initialize with options, then pass text to .parse method to reuse an instance?
# - Initialize with a single entry, and have attributes that correspond to the bibtex fields?
# - Have a classmethod that takes text containing multiple entries, then returns a list of instances
# - Have a list wrapper class that allows serialization of all at once?

# TODO: BibtexWriter - write python dict or BibJSON to BibTeX