# waylan/docdata.py

## docdata.py
"""
DocData

A Meta-Data handler for lightweight markup languages.

Note: This was an experiment which was rejected in favor of a different API.
See the better implementation here: <https://github.com/waylan/docdata>

An implementation of Meta-Data as defined by MultiMarkdown. However, it can
work with any lightweight markup language and the various keys can have
custom transformations defined for them.

First, define the transformations by subclassing the Data class:

        >>> class MetaData(Data):
        ...     def _transform(self, value):
        ...         "Transform to string by default. "
        ...         return  ' '.join(value)
        ...
        ...     def _transform_summary(self, value):
        ...         "Preserve line breaks in summary. "
        ...         return '\\n'.join(value)
        ...
        ...     def _transform_tags(self, value):
        ...         "One tag per line or one line of comma-separated tags."
        ...         if len(value) == 1:
        ...             return value[0].split(',')
        ...         else:
        ...             return value

    Then, pass a document and your Data subclass (not an instance) to the
    `get_data` function:

        >>> doc = '''---
        ... Title:   A Document Title
        ... Summary: A document summary.
        ...          Line two of the document summary.
        ... Tags:    foo,bar
        ... ---
        ... The document body.
        ... '''
        >>> doc, data = get_data(doc, MetaData)

    The `get_data` function returns the document with the meta-data stripped
    and the Meta-Data as an instance of your Data subclass:

        >>> doc
        'The document body.\\n'
        >>> data['title']
        'A Document Title'
        >>> data['summary']
        'A document summary.\\nLine two of the document summary.'
        >>> data['tags']
        ['foo', 'bar']

    Notice that each of the Meta-Data values has been transformed appropriately
    and is ready to use. Now the document can be passed to your lightweight
    markup processor of choice.

    Note that you could pass some of the Meta-Data values to your lightweight
    markup parser to alter its behavior, or to choose a different parser for a
    given document.

"""
import re


class Data(dict):
    """
    A dictionary which allows custom type conversions per key.

    A subclass of Data should define a method for each known key which
    gets the raw value, converts it and returns it. When a method is not
    defined for a key, the raw value is simply returned as is.

    Only item access (`data[key]`) is overridden to apply the conversion;
    use `get_raw_item` to read the stored value without converting it.

    The methods should be named `_transform_key` where "key" is the name
    of the key. For example, if a key is named "foo" the corresponding method
    would be `_transform_foo` and might look something like this:

        >>> class FooData(Data):
        ...     def _transform_foo(self, value):
        ...         "If foo is a list, convert to a string. "
        ...         if isinstance(value, list):
        ...             return ' '.join(value)
        ...         else:
        ...             return value

    When an instance of the class contains a key named "foo" and the value
    for that key is a list, the list will be converted to a string:

        >>> data = FooData(foo=['a', 'b', 'c'])
        >>> data.get_raw_item('foo')
        ['a', 'b', 'c']
        >>> data['foo']
        'a b c'
        >>> data.get_raw_item('foo') == data['foo']
        False

    If the key named "foo" already contains a string, then that string would
    be returned unaltered:

        >>> data = FooData(foo='a string')
        >>> data.get_raw_item('foo')
        'a string'
        >>> data['foo']
        'a string'
        >>> data.get_raw_item('foo') == data['foo']
        True

    Of course, the above is one simple example. Any number of possible
    conversions could be accounted for.

    If a `_transform_*` method is not defined for a given key, then a default
    `_transform` method is tried. You can define the default `_transform`
    method to apply to all keys which do not have a specific method defined
    for them:

        >>> class BarData(FooData):
        ...     def _transform(self, value):
        ...         "Convert int to a string. "
        ...         if isinstance(value, int):
        ...             return str(value)
        ...         else:
        ...             return value
        ...
        >>> data = BarData({'foo': ['a', 'b', 'c'], 'bar': 42})
        >>> data.get_raw_item('bar')
        42
        >>> data['bar']
        '42'
        >>> # The _transform_foo method still works for 'foo'
        >>> data['foo']
        'a b c'

    If neither a matching `_transform_*` method nor a `_transform` method
    is defined, then the raw data is returned unaltered:

        >>> data = FooData(baz='some value')
        >>> data.get_raw_item('baz')
        'some value'
        >>> data['baz']
        'some value'
        >>> data.get_raw_item('baz') == data['baz']
        True

    Keys which are not strings (tuples included) are supported as well; they
    simply never match a `_transform_*` method name:

        >>> data = FooData({(1, 2): 'pair'})
        >>> data[(1, 2)]
        'pair'

    """

    def get_raw_item(self, key):
        """
        Return the raw, unconverted value stored for `key`.

        Raises KeyError if `key` is not present.
        """
        return super(Data, self).__getitem__(key)

    def __getitem__(self, key):
        """
        Return the value stored for `key` with its transformation applied.

        A `_transform_<key>` method is preferred; otherwise a generic
        `_transform` method is used; when neither exists the raw value is
        returned. Raises KeyError if `key` is not present.
        """
        raw_value = self.get_raw_item(key)
        # Wrap the key in a 1-tuple: a bare tuple key would otherwise be
        # unpacked as multiple `%` arguments and raise TypeError.
        trans = '_transform_%s' % (key,)
        if hasattr(self, trans):
            return getattr(self, trans)(raw_value)
        if hasattr(self, '_transform'):
            return self._transform(raw_value)
        return raw_value


# A `key: value` line. The key may be indented by up to three spaces and is
# made of letters, digits, underscores and hyphens; the value is the rest of
# the line after the colon and any whitespace.
META_RE = re.compile(
    r'^[ ]{0,3}(?P<key>[A-Za-z0-9_-]+):\s*(?P<value>.*)'
)

# A continuation line for the previous key: starts with four spaces or a tab.
META_MORE_RE = re.compile(
    r'^([ ]{4}|\t)(\s*)(?P<value>.*)'
)

# Optional opening delimiter of the header: a line starting with `---`.
BEGIN_RE = re.compile(
    r'^-{3}(\s.*)?'
)

# Closing delimiter of the header: a line starting with `---` or `...`.
END_RE = re.compile(
    r'^(-{3}|\.{3})(\s.*)?'
)


def get_data(doc, cls=Data):
    """
    Extract meta-data from a text document.

    Returns a tuple of (doc, data). The document is returned with the
    meta-data removed and the meta-data is returned in a container of
    type `cls` (defaults to Data). Pass in a subclass of Data to do custom
    transformations of the data for any given key.

    The header may open with a `---` line and consists of `key: value`
    lines, each optionally followed by indented continuation lines. Keys
    are lower-cased and every raw value is stored as a list holding one
    stripped string per line; a repeated key extends the existing list.

    The header ends at the first blank line or `---`/`...` line (which is
    consumed), or at the first line which is neither a `key: value` pair nor
    a continuation of a previous key (which stays in the document). When an
    opening `---` is followed directly by such a non-meta-data line, the
    `---` is kept in the document as well, since it did not open a header.

    Line endings of the returned document are normalized to LF.

    """
    lines = doc.replace('\r\n', '\n').replace('\r', '\n').split('\n')

    # Walk the header with a cursor instead of popping from the front of
    # the list, which would shift the whole remaining document every time.
    pos = 0
    has_begin = bool(lines and BEGIN_RE.match(lines[0]))
    if has_begin:
        pos = 1

    data = cls()
    key = None
    while pos < len(lines):
        line = lines[pos]

        if line.strip() == '' or END_RE.match(line):
            pos += 1  # blank line or end of YAML header - consume it
            break
        m1 = META_RE.match(line)
        if m1:
            key = m1.group('key').lower().strip()
            value = m1.group('value').strip()
            try:
                data.get_raw_item(key).append(value)
            except KeyError:
                data[key] = [value]
        else:
            m2 = META_MORE_RE.match(line)
            if m2 and key:
                # Add another line to existing key
                data.get_raw_item(key).append(m2.group('value').strip())
            else:
                # Not meta-data: this line belongs to the document.
                if has_begin and key is None:
                    # Nothing followed the opening `---`, so it was not a
                    # header delimiter; hand it back to the document too.
                    pos -= 1
                break
        pos += 1
    return '\n'.join(lines[pos:]), data


if __name__ == '__main__':
    # Run the examples embedded in the docstrings as a self-test.
    from doctest import testmod
    testmod()
	"""
	DocData

	A Meta-Data handler for lightweight markup languages.

	Note: This was an experiment which was rejected in favor of a different API.
	See the better implementation here: <https://github.com/waylan/docdata>

	An implementation of Meta-Data as defined by MultiMarkdown. However, it can
	work with any lightweight markup language and the various keys can have
	custom transformations defined for them.

	First, define the transformations by subclassing the Data class:

	>>> class MetaData(Data):
	...     def _transform(self, value):
	...         "Transform to string by default. "
	...         return ' '.join(value)
	...
	...     def _transform_summary(self, value):
	...         "Preserve line breaks in summary. "
	...         return '\\n'.join(value)
	...
	...     def _transform_tags(self, value):
	...         "One tag per line or one line of comma-separated tags."
	...         if len(value) == 1:
	...             return value[0].split(',')
	...         else:
	...             return value

	Then, pass a document and your Data subclass (not an instance) to the
	`get_data` function:

	>>> doc = '''---
	... Title:   A Document Title
	... Summary: A document summary.
	...          Line two of the document summary.
	... Tags:    foo,bar
	... ---
	... The document body.
	... '''
	>>> doc, data = get_data(doc, MetaData)

	The `get_data` function returns the document with the meta-data stripped
	and the Meta-Data as an instance of your Data subclass:

	>>> doc
	'The document body.\\n'
	>>> data['title']
	'A Document Title'
	>>> data['summary']
	'A document summary.\\nLine two of the document summary.'
	>>> data['tags']
	['foo', 'bar']

	Notice that each of the Meta-Data values has been transformed appropriately
	and is ready to use. Now the document can be passed to your lightweight
	markup processor of choice.

	Note that you could pass some of the Meta-Data values to your lightweight
	markup parser to alter its behavior, or to choose a different parser for a
	given document.

	"""
	import re


	class Data(dict):
	    """
	    A dictionary which allows custom type conversions per key.

	    A subclass of Data should define a method for each known key which
	    gets the raw value, converts it and returns it. When a method is not
	    defined for a key, the raw value is simply returned as is.

	    Only item access (`data[key]`) is overridden to apply the conversion;
	    use `get_raw_item` to read the stored value without converting it.

	    The methods should be named `_transform_key` where "key" is the name
	    of the key. For example, if a key is named "foo" the corresponding
	    method would be `_transform_foo` and might look something like this:

	        >>> class FooData(Data):
	        ...     def _transform_foo(self, value):
	        ...         "If foo is a list, convert to a string. "
	        ...         if isinstance(value, list):
	        ...             return ' '.join(value)
	        ...         else:
	        ...             return value

	    When an instance of the class contains a key named "foo" and the value
	    for that key is a list, the list will be converted to a string:

	        >>> data = FooData(foo=['a', 'b', 'c'])
	        >>> data.get_raw_item('foo')
	        ['a', 'b', 'c']
	        >>> data['foo']
	        'a b c'
	        >>> data.get_raw_item('foo') == data['foo']
	        False

	    If the key named "foo" already contains a string, then that string
	    would be returned unaltered:

	        >>> data = FooData(foo='a string')
	        >>> data.get_raw_item('foo')
	        'a string'
	        >>> data['foo']
	        'a string'
	        >>> data.get_raw_item('foo') == data['foo']
	        True

	    Of course, the above is one simple example. Any number of possible
	    conversions could be accounted for.

	    If a `_transform_*` method is not defined for a given key, then a
	    default `_transform` method is tried. You can define the default
	    `_transform` method to apply to all keys which do not have a specific
	    method defined for them:

	        >>> class BarData(FooData):
	        ...     def _transform(self, value):
	        ...         "Convert int to a string. "
	        ...         if isinstance(value, int):
	        ...             return str(value)
	        ...         else:
	        ...             return value
	        ...
	        >>> data = BarData({'foo': ['a', 'b', 'c'], 'bar': 42})
	        >>> data.get_raw_item('bar')
	        42
	        >>> data['bar']
	        '42'
	        >>> # The _transform_foo method still works for 'foo'
	        >>> data['foo']
	        'a b c'

	    If neither a matching `_transform_*` method nor a `_transform` method
	    is defined, then the raw data is returned unaltered:

	        >>> data = FooData(baz='some value')
	        >>> data.get_raw_item('baz')
	        'some value'
	        >>> data['baz']
	        'some value'
	        >>> data.get_raw_item('baz') == data['baz']
	        True

	    """

	    def get_raw_item(self, key):
	        """
	        Return the raw, unconverted value stored for `key`.

	        Raises KeyError if `key` is not present.
	        """
	        return super(Data, self).__getitem__(key)

	    def __getitem__(self, key):
	        """
	        Return the value stored for `key` with its transformation applied.

	        A `_transform_<key>` method is preferred; otherwise a generic
	        `_transform` method is used; when neither exists the raw value is
	        returned. Raises KeyError if `key` is not present.
	        """
	        raw_value = self.get_raw_item(key)
	        # Wrap the key in a 1-tuple: a bare tuple key would otherwise be
	        # unpacked as multiple `%` arguments and raise TypeError.
	        trans = '_transform_%s' % (key,)
	        if hasattr(self, trans):
	            return getattr(self, trans)(raw_value)
	        if hasattr(self, '_transform'):
	            return self._transform(raw_value)
	        return raw_value


	# A `key: value` line: key indented by at most three spaces, then a colon,
	# optional whitespace and the value (which may be empty).
	META_RE = re.compile(r'^[ ]{0,3}(?P<key>[A-Za-z0-9_-]+):\s*(?P<value>.*)')
	# A continuation line for the previous key: starts with four spaces or a tab.
	META_MORE_RE = re.compile(r'^([ ]{4}|\t)(\s*)(?P<value>.*)')
	# Optional opening delimiter of the header: a line starting with `---`.
	BEGIN_RE = re.compile(r'^-{3}(\s.*)?')
	# Closing delimiter of the header: a line starting with `---` or `...`.
	END_RE = re.compile(r'^(-{3}|\.{3})(\s.*)?')


	def get_data(doc, cls=Data):
	    """
	    Extract meta-data from a text document.

	    Returns a tuple of (doc, data). The document is returned with the
	    meta-data removed and the meta-data is returned in a container of
	    type `cls` (defaults to Data). Pass in a subclass of Data to do custom
	    transformations of the data for any given key.

	    The header may open with a `---` line and consists of `key: value`
	    lines, each optionally followed by indented continuation lines. Keys
	    are lower-cased and every raw value is stored as a list holding one
	    stripped string per line; a repeated key extends the existing list.

	    The header ends at the first blank line or `---`/`...` line (which is
	    consumed), or at the first line which is neither a `key: value` pair
	    nor a continuation of a previous key (which stays in the document).
	    When an opening `---` is followed directly by such a non-meta-data
	    line, the `---` is kept in the document as well.

	    Line endings of the returned document are normalized to LF.

	    """
	    lines = doc.replace('\r\n', '\n').replace('\r', '\n').split('\n')

	    # Walk the header with a cursor instead of popping from the front of
	    # the list, which would shift the whole remaining document every time.
	    pos = 0
	    has_begin = bool(lines and BEGIN_RE.match(lines[0]))
	    if has_begin:
	        pos = 1

	    data = cls()
	    key = None
	    while pos < len(lines):
	        line = lines[pos]

	        if line.strip() == '' or END_RE.match(line):
	            pos += 1  # blank line or end of YAML header - consume it
	            break
	        m1 = META_RE.match(line)
	        if m1:
	            key = m1.group('key').lower().strip()
	            value = m1.group('value').strip()
	            try:
	                data.get_raw_item(key).append(value)
	            except KeyError:
	                data[key] = [value]
	        else:
	            m2 = META_MORE_RE.match(line)
	            if m2 and key:
	                # Add another line to existing key
	                data.get_raw_item(key).append(m2.group('value').strip())
	            else:
	                # Not meta-data: this line belongs to the document.
	                if has_begin and key is None:
	                    # Nothing followed the opening `---`, so it was not a
	                    # header delimiter; hand it back to the document too.
	                    pos -= 1
	                break
	        pos += 1
	    return '\n'.join(lines[pos:]), data


	if __name__ == '__main__':
	    # Run the examples embedded in the docstrings as a self-test.
	    import doctest
	    doctest.testmod()