Source code for bibtexparser.bparser

# -*- coding: utf-8 -*-

# Original source: github.com/okfn/bibserver
# Authors:
# markmacgillivray
# Etienne Posthumus (epoz)
# Francois Boulogne <fboulogne at april dot org>

import io
import logging

from bibtexparser.bibdatabase import (BibDatabase, BibDataString, as_text,
                                      BibDataStringExpression, STANDARD_TYPES)
from bibtexparser.bibtexexpression import BibtexExpression

logger = logging.getLogger(__name__)

__all__ = ['BibTexParser']


def parse(data, *args, **kwargs):
    """Build a `BibTexParser` from the given arguments and parse ``data``.

    :param data: BibTeX content handed to the parser's ``parse`` method
    :param args: positional arguments forwarded to `BibTexParser`
    :param kwargs: keyword arguments forwarded to `BibTexParser`
    :return: whatever ``BibTexParser.parse`` returns for ``data``
    """
    return BibTexParser(*args, **kwargs).parse(data)


class BibTexParser(object):
    """
    A parser for reading BibTeX bibliographic data files.

    Example::

        from bibtexparser.bparser import BibTexParser

        bibtex_str = ...

        parser = BibTexParser()
        parser.ignore_nonstandard_types = False
        parser.homogenize_fields = False
        parser.common_strings = False
        bib_database = bibtexparser.loads(bibtex_str, parser)

    :param customization: function or None (default)
        Customization to apply to parsed entries.
    :param ignore_nonstandard_types: bool (default True)
        If True ignores non-standard bibtex entry types.
    :param homogenize_fields: bool (default False)
        Common field name replacements (as set in alt_dict attribute).
    :param interpolate_strings: bool (default True)
        If True, replace bibtex string by their value, else uses
        BibDataString objects.
    :param common_strings: bool (default False)
        Include common string definitions (e.g. month abbreviations) to
        the bibtex file.
    :param add_missing_from_crossref: bool (default False)
        Resolve BibTeX references set in the crossref field for BibTeX
        entries and add the fields from the referenced entry to the
        referencing entry.
    """

    def __new__(cls, data=None, **args):
        """
        To catch the old API structure in which creating the parser would
        immediately parse and return data.

        :param data: if not None, it is parsed right away and the result of
            the module-level ``parse`` is returned instead of a parser
        :param args: keyword arguments; forwarded to ``parse`` when ``data``
            is given
        """
        if data is None:
            return super(BibTexParser, cls).__new__(cls)
        # For backwards compatibility: if data is given, parse
        # and return the `BibDatabase` object instead of the parser.
        return parse(data, **args)

    def __init__(self, data=None,
                 customization=None,
                 ignore_nonstandard_types=True,
                 homogenize_fields=False,
                 interpolate_strings=True,
                 common_strings=False,
                 add_missing_from_crossref=False):
        """
        Creates a parser for reading BibTeX files

        ``data`` is not used here; it only exists so the signature matches
        ``__new__``, which handles the non-None case itself.

        :return: parser
        :rtype: `BibTexParser`
        """
        self.bib_database = BibDatabase()

        #: Load common strings such as months abbreviation
        #: Default: `False`.
        self.common_strings = common_strings
        if self.common_strings:
            self.bib_database.load_common_strings()

        #: Callback function to process BibTeX entries after parsing,
        #: for example to create a list from a string with multiple values.
        #: By default all BibTeX values are treated as simple strings.
        #: Default: `None`.
        self.customization = customization

        #: Ignore entries whose type is not one of the standard BibTeX
        #: types (those listed in `STANDARD_TYPES`).
        #: Default: `True`.
        self.ignore_nonstandard_types = ignore_nonstandard_types

        #: Sanitize BibTeX field names, for example change `link` to `url`
        #: etc. (see `alt_dict`).
        #: Field names are always converted to lowercase names.
        #: Default: `False`.
        self.homogenize_fields = homogenize_fields

        #: Interpolate Bibtex Strings or keep the structure
        self.interpolate_strings = interpolate_strings

        # On some sample data files, the character encoding detection simply
        # hangs. We are going to default to utf8, and mandate it.
        self.encoding = 'utf8'

        # Add missing field from cross-ref
        self.add_missing_from_crossref = add_missing_from_crossref

        # pre-defined set of key changes
        self.alt_dict = {
            'keyw': u'keyword',
            'keywords': u'keyword',
            'authors': u'author',
            'editors': u'editor',
            'urls': u'url',
            'link': u'url',
            'links': u'url',
            'subjects': u'subject',
            'xref': u'crossref'
        }

        # Setup the parser expression
        self._init_expressions()

    def parse(self, bibtex_str, partial=False):
        """Parse a BibTeX string into an object

        :param bibtex_str: BibTeX string
        :type: str or bytes
        :param partial: If True, parsing failures are only logged.
            If False, the parse exception is re-raised after being logged.
        :type: boolean
        :return: bibliographic database
        :rtype: BibDatabase
        """
        bibtex_file_obj = self._bibtex_file_obj(bibtex_str)
        try:
            self._expr.parseFile(bibtex_file_obj)
        except self._expr.ParseException as exc:
            logger.error("Could not parse properly, starting at %s", exc.line)
            if not partial:
                raise

        if self.add_missing_from_crossref:
            self.bib_database.add_missing_from_crossref()

        return self.bib_database

    def parse_file(self, file, partial=False):
        """Parse a BibTeX file into an object

        :param file: BibTeX file or file-like object
        :type: file
        :param partial: If True, parsing failures are only logged.
            If False, the parse exception is re-raised after being logged.
        :type: boolean
        :return: bibliographic database
        :rtype: BibDatabase
        """
        return self.parse(file.read(), partial=partial)

    def _init_expressions(self):
        """
        Defines all parser expressions used internally.
        """
        self._expr = BibtexExpression()

        # Handle string as BibDataString object
        self._expr.set_string_name_parse_action(
            lambda s, l, t:
                BibDataString(self.bib_database, t[0]))

        # Add notice to logger
        self._expr.add_log_function(logger.debug)

        # Set actions
        self._expr.entry.addParseAction(
            lambda s, l, t: self._add_entry(
                t.get('EntryType'), t.get('Key'), t.get('Fields'))
        )
        self._expr.implicit_comment.addParseAction(
            lambda s, l, t: self._add_comment(t[0])
        )
        self._expr.explicit_comment.addParseAction(
            lambda s, l, t: self._add_comment(t[0])
        )
        self._expr.preamble_decl.addParseAction(
            lambda s, l, t: self._add_preamble(t[0])
        )
        self._expr.string_def.addParseAction(
            lambda s, l, t: self._add_string(t['StringName'].name,
                                             t['StringValue'])
        )

    def _bibtex_file_obj(self, bibtex_str):
        """
        Wrap the input in a text stream, dropping a leading byte-order mark.

        :param bibtex_str: BibTeX content
        :type bibtex_str: str or bytes
        :returns: io.StringIO -- stream over the (decoded) content
        """
        # Some files have Byte-order marks inserted at the start
        if isinstance(bibtex_str, str):
            # In already-decoded text the BOM is always the single code
            # point U+FEFF, whatever `self.encoding` is set to.
            if bibtex_str[:1] == '\ufeff':
                bibtex_str = bibtex_str[1:]
        else:
            utf8_bom = b'\xef\xbb\xbf'
            if bibtex_str[:3] == utf8_bom:
                bibtex_str = bibtex_str[3:]
            bibtex_str = bibtex_str.decode(encoding=self.encoding)
        return io.StringIO(bibtex_str)

    def _clean_val(self, val):
        """ Clean instring before adding to dictionary

        An empty/falsy value or the literal ``"{}"`` becomes ``''``.
        Otherwise the value goes through ``as_text`` when
        ``interpolate_strings`` is set, and is returned unchanged when not.

        :param val: a value
        :type val: string
        :returns: string -- value
        """
        if not val or val == "{}":
            return ''
        elif self.interpolate_strings:
            return as_text(val)
        else:
            return val

    def _clean_key(self, key):
        """ Lowercase a key and return as unicode.

        :param key: a key
        :type key: string
        :returns: (unicode) string -- value
        """
        key = key.lower()
        if not isinstance(key, str):
            return str(key, 'utf-8')
        else:
            return key

    def _clean_field_key(self, key):
        """ Clean a bibtex field key and homogenize alternative forms.

        :param key: a key
        :type key: string
        :returns: string -- value
        """
        key = self._clean_key(key)
        if self.homogenize_fields:
            # Unknown keys map to themselves.
            key = self.alt_dict.get(key, key)
        return key

    def _add_entry(self, entry_type, entry_id, fields):
        """ Adds a parsed entry.
        Includes checking type and fields, cleaning, applying customizations.

        The cleaned entry is appended to ``self.bib_database.entries``;
        nothing is returned. Entries of a non-standard type are skipped with
        a warning when ``ignore_nonstandard_types`` is set.

        :param entry_type: the entry type
        :type entry_type: string
        :param entry_id: the entry bibid
        :type entry_id: string
        :param fields: the fields and values
        :type fields: dictionary
        """
        d = {}
        entry_type = self._clean_key(entry_type)
        if self.ignore_nonstandard_types and entry_type not in STANDARD_TYPES:
            logger.warning('Entry type %s not standard. Not considered.',
                           entry_type)
            return
        for key in fields:
            d[self._clean_field_key(key)] = self._clean_val(fields[key])
        d['ENTRYTYPE'] = entry_type
        d['ID'] = entry_id

        # Keep a copy of the crossref target under a private key.
        crossref = d.get('crossref', None)
        if self.add_missing_from_crossref and crossref is not None:
            d['_crossref'] = crossref

        if self.customization is not None:
            logger.debug('Apply customizations and return dict')
            d = self.customization(d)

        self.bib_database.entries.append(d)

    def _add_comment(self, comment):
        """
        Stores a comment in the list of comment.

        :param comment: the parsed comment
        :type comment: string
        """
        # Lazy logger arguments: the repr is only built if DEBUG is enabled.
        logger.debug('Store comment in list of comments: %r', comment)
        self.bib_database.comments.append(comment)

    def _add_string(self, string_key, string):
        """
        Stores a new string in the string dictionary.

        An existing definition under the same key is overwritten, with a
        warning.

        :param string_key: the string key
        :type string_key: string
        :param string: the string value
        :type string: string
        """
        if string_key in self.bib_database.strings:
            logger.warning('Overwriting existing string for key: %s.',
                           string_key)
        logger.debug('Store string: %s -> %s', string_key, string)
        self.bib_database.strings[string_key] = self._clean_val(string)

    def _add_preamble(self, preamble):
        """
        Stores a preamble.

        :param preamble: the parsed preamble
        :type preamble: string
        """
        logger.debug('Store preamble in list of preambles')
        self.bib_database.preambles.append(preamble)