Source code for bibtexparser.bibtexexpression

import pyparsing as pp

from .bibdatabase import BibDataStringExpression


# General helpers

def _strip_after_new_lines(s):
    """Strip leading whitespace from every line of ``s`` except the first.

    The text is split with ``str.splitlines`` and re-joined with a newline
    character, so a line break at the very end of ``s`` is not kept.
    Trailing whitespace inside each line is left as is.
    """
    head_and_rest = s.splitlines()
    # Slicing keeps the empty and single-line cases uniform: the first line
    # (if any) is kept verbatim and there is simply nothing to strip after it.
    continuation = [line.lstrip() for line in head_and_rest[1:]]
    return '\n'.join(head_and_rest[:1] + continuation)


def strip_after_new_lines(s):
    """Removes leading whitespaces in all but first line.

    :param s: string or BibDataStringExpression
    :returns: for a plain string, the stripped string; for a
        BibDataStringExpression, the very same object after the stripping
        has been applied to it through its ``apply_on_strings`` method.
    """
    if isinstance(s, BibDataStringExpression):
        # The expression object is handed back, not a new string.
        s.apply_on_strings(_strip_after_new_lines)
        return s
    else:
        return _strip_after_new_lines(s)


def add_logger_parse_action(expr, log_func):
    """Register a callback on expression parsing with the adequate message.

    :param expr: expression to instrument; its ``resultsName`` is used in the
        message and the callback is attached with ``addParseAction``.
    :param log_func: callable receiving the formatted message, e.g. a
        logger method.
    """
    def action(string_, location, tokens):
        # Returns None on purpose: the action only reports what was found.
        log_func("Found {}: {}".format(expr.resultsName, tokens))
    expr.addParseAction(action)


# Parse action helpers
# Helpers for returning values from the parsed tokens. Shaped as pyparsing's
# parse actions. See pyparsing documentation for the arguments.

def first_token(string_, location, token):
    """Parse action returning the single element of ``token``.

    ``string_`` and ``location`` are accepted to fit the parse action
    signature but are not used.
    """
    # TODO Handle this case correctly!
    token_count = len(token)
    assert token_count == 1
    return token[0]


def remove_trailing_newlines(string_, location, token):
    """Parse action stripping newline characters from the end of ``token[0]``.

    Returns None when ``token[0]`` is empty (or otherwise falsy).
    """
    text = token[0]
    if not text:
        return None
    return text.rstrip('\n')


def remove_braces(string_, location, token):
    """Parse action dropping one opening and one closing brace, if present.

    Only the first character (when it is '{') and the last character (when
    it is '}') of ``token[0]`` are removed, each independently of the other.
    An empty ``token[0]`` yields an empty string.
    """
    text = token[0]
    size = len(text)
    if size < 1:
        return ''
    begin = 1 if text[0] == '{' else 0
    stop = size - 1 if text[-1] == '}' else size
    return text[begin:stop]


def field_to_pair(string_, location, token):
    """
    Looks for parsed element named 'Field'.

    Shaped as a pyparsing parse action; ``string_`` and ``location`` are not
    used. The 'Value' of the field goes through ``strip_after_new_lines``
    before being returned.

    :returns: (name, value).
    """
    field = token.get('Field')
    value = field.get('Value')
    if isinstance(value, pp.ParseResults):
        # For pyparsing >= 2.3.1 (see #225 and API change note in pyparsing's
        # Changelog).
        value = value[0]
    return (field.get('FieldName'),
            strip_after_new_lines(value))

# Expressions helpers

def in_braces_or_pars(exp):
    """
    exp -> (exp)|{exp}

    Build an expression matching ``exp`` enclosed either in braces or in
    parentheses. The enclosing delimiters are suppressed from the results.
    The braced alternative is tried first.

    :param exp: pyparsing expression to enclose
    :returns: the combined pyparsing expression
    """
    return ((pp.Suppress('{') + exp + pp.Suppress('}')) |
            (pp.Suppress('(') + exp + pp.Suppress(')')))


class BibtexExpression(object):
    """Gives access to pyparsing expressions.

    Attributes are pyparsing expressions for the following elements:

    * main_expression: the bibtex file
    * string_def: a string definition
    * preamble_decl: a preamble declaration
    * explicit_comment: an explicit comment
    * entry: an entry definition
    * implicit_comment: an implicit comment

    """

    # Alias of pyparsing's ParseException; it is the exception raised by the
    # citekey validation below (through self.ParseException).
    ParseException = pp.ParseException

    def __init__(self):
        """Build the grammar and store its top-level expressions on self."""

        # Bibtex keywords

        string_def_start = pp.CaselessKeyword("@string")
        preamble_start = pp.CaselessKeyword("@preamble")
        comment_line_start = pp.CaselessKeyword('@comment')

        # String names
        string_name = pp.Word(pp.alphanums + '_-:')('StringName')
        # Install a no-op default, then attach the bound indirection method
        # so that the effective function can be swapped later with
        # set_string_name_parse_action.
        self.set_string_name_parse_action(lambda s, l, t: None)
        string_name.addParseAction(self._string_name_parse_action)

        # Values inside bibtex fields
        # Values can be integer or string expressions. The latter may use
        # quoted or braced values.

        # Integer values
        integer = pp.Word(pp.nums)('Integer')

        # Braced values: braced values can contain nested (but balanced) braces
        braced_value_content = pp.CharsNotIn('{}')
        braced_value = pp.Forward()  # Recursive definition for nested braces
        braced_value <<= pp.originalTextFor(
            '{' + pp.ZeroOrMore(braced_value | braced_value_content) + '}'
            )('BracedValue')
        braced_value.setParseAction(remove_braces)
        # TODO add ignore for "\}" and "\{" ?
        # TODO @ are not parsed by bibtex in braces

        # Quoted values: may contain braced content with balanced braces
        brace_in_quoted = pp.nestedExpr('{', '}', ignoreExpr=None)
        text_in_quoted = pp.CharsNotIn('"{}')
        # (quotes should be escaped by braces in quoted value)
        quoted_value = pp.originalTextFor(
            '"' + pp.ZeroOrMore(text_in_quoted | brace_in_quoted) + '"'
            )('QuotedValue')
        quoted_value.addParseAction(pp.removeQuotes)

        # String expressions
        string_expr = pp.delimitedList(
            (quoted_value | braced_value | string_name), delim='#'
            )('StringExpression')
        # Same indirection as for string names: no-op default, swappable
        # later with set_string_expression_parse_action.
        self.set_string_expression_parse_action(lambda s, l, t: None)
        string_expr.addParseAction(self._string_expr_parse_action)

        value = (integer | string_expr)('Value')

        # Entries

        # @EntryType { ...
        entry_type = (pp.Suppress('@') + pp.Word(pp.alphas))('EntryType')
        entry_type.setParseAction(first_token)

        # Entry key: any character up to a ',' without leading and trailing
        # spaces. Also exclude spaces and prevent it from being empty.
        key = pp.SkipTo(',')('Key')  # TODO Maybe also exclude @',\#}{~%

        def citekeyParseAction(string_, location, token):
            """Parse action for validating citekeys.

            It ensures citekey is not empty and has no space.

            :args: see pyparsing documentation.
            """
            key = first_token(string_, location, token).strip()
            if len(key) < 1:
                raise self.ParseException(
                    string_, loc=location, msg="Empty citekeys are not allowed.")
            for i, c in enumerate(key):
                if c.isspace():
                    raise self.ParseException(
                        string_, loc=(location + i),
                        msg="Whitespace not allowed in citekeys.")
            return key

        key.setParseAction(citekeyParseAction)

        # Field name: word of letters, digits and the characters _ - ( ) . +
        field_name = pp.Word(pp.alphanums + '_-().+')('FieldName')
        field_name.setParseAction(first_token)

        # Field: field_name = value
        field = pp.Group(field_name + pp.Suppress('=') + value)('Field')
        field.setParseAction(field_to_pair)

        # List of fields: comma separated fields
        field_list = (pp.delimitedList(field) + pp.Suppress(pp.Optional(','))
                      )('Fields')
        # Turn the (name, value) pairs into a dict. Pairs are inserted in
        # reverse order, so when a field name is repeated the value of its
        # first occurrence is the one that is kept.
        field_list.setParseAction(
            lambda s, l, t: {k: v for (k, v) in reversed(t.get('Fields'))})

        # Entry: type, key, and fields
        self.entry = (entry_type +
                      in_braces_or_pars(key + pp.Suppress(',') + field_list)
                      )('Entry')

        # Other stuff: comments, string definitions, and preamble declarations

        # Explicit comments: @comment + everything up to next valid declaration
        # starting on new line.
        not_an_implicit_comment = (pp.LineEnd() + pp.Literal('@')
                                   ) | pp.StringEnd()
        self.explicit_comment = (
            pp.Suppress(comment_line_start) +
            pp.originalTextFor(pp.SkipTo(not_an_implicit_comment),
                               asString=True))('ExplicitComment')
        self.explicit_comment.addParseAction(remove_trailing_newlines)
        self.explicit_comment.addParseAction(remove_braces)
        # Previous implementation included comment until next '}'.
        # This is however not in line with bibtex behavior that is to only
        # ignore until EOL. Brace stripping is arbitrary here but avoids
        # duplication on bibtex write.

        # Empty implicit_comments lead to infinite loop of zeroOrMore
        def mustNotBeEmpty(t):
            """Parse action rejecting a match whose first token is empty."""
            if not t[0]:
                raise pp.ParseException("Match must not be empty.")

        # Implicit comments: not anything else
        self.implicit_comment = pp.originalTextFor(
            pp.SkipTo(not_an_implicit_comment).setParseAction(mustNotBeEmpty),
            asString=True)('ImplicitComment')
        self.implicit_comment.addParseAction(remove_trailing_newlines)

        # String definition
        self.string_def = (pp.Suppress(string_def_start) + in_braces_or_pars(
            string_name +
            pp.Suppress('=') +
            string_expr('StringValue')
            ))('StringDefinition')

        # Preamble declaration
        self.preamble_decl = (pp.Suppress(preamble_start) +
                              in_braces_or_pars(value))('PreambleDeclaration')

        # Main bibtex expression

        # Alternatives are tried in the order listed; implicit_comment comes
        # last ("not anything else").
        self.main_expression = pp.ZeroOrMore(
                self.string_def |
                self.preamble_decl |
                self.explicit_comment |
                self.entry |
                self.implicit_comment)

    def add_log_function(self, log_fun):
        """Add notice to logger on entry, comment, preamble, string definitions.

        :param log_fun: logger function
        """
        for e in [self.entry,
                  self.implicit_comment,
                  self.explicit_comment,
                  self.preamble_decl,
                  self.string_def]:
            add_logger_parse_action(e, log_fun)

    def set_string_name_parse_action(self, fun):
        """Set the parseAction for string name expression.

        .. Note::

            For some reason pyparsing duplicates the string_name
            expression so setting its parseAction a posteriori has no effect
            in the context of a string expression. This is why this function
            should be used instead.
        """
        self._string_name_parse_action_fun = fun

    def _string_name_parse_action(self, s, l, t):
        """Forward to the function set by set_string_name_parse_action."""
        return self._string_name_parse_action_fun(s, l, t)

    def set_string_expression_parse_action(self, fun):
        """Set the parseAction for string_expression expression.

        .. Note::

            See set_string_name_parse_action.
        """
        self._string_expr_parse_action_fun = fun

    def _string_expr_parse_action(self, s, l, t):
        """Forward to the function set by set_string_expression_parse_action."""
        return self._string_expr_parse_action_fun(s, l, t)

    def parseFile(self, file_obj):
        """Parse ``file_obj`` with main_expression, requiring a full match.

        ``file_obj`` is handed as is to pyparsing's ``parseFile``, called
        with ``parseAll=True``.

        :returns: the result of pyparsing's ``parseFile``
        """
        return self.main_expression.parseFile(file_obj, parseAll=True)