Skip to content

Instantly share code, notes, and snippets.

@waylan
Last active August 29, 2015 14:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save waylan/24bd9705b0c3e5bd213b to your computer and use it in GitHub Desktop.
Save waylan/24bd9705b0c3e5bd213b to your computer and use it in GitHub Desktop.
A better Meta-Data handler for lightweight markup languages.
"""
DocData
A Meta-Data handler for lightweight markup languages.
Note: This was an experiment which was rejected in favor of a different API.
See the better implementation here: <https://github.com/waylan/docdata>
An implementation of Meta-Data as defined by MultiMarkdown. However, it can
work with any lightweight markup language and the various keys can have
custom transformations defined for them.
First, define the transformations by subclassing the Data class:
>>> class MetaData(Data):
... def _transform(self, value):
... "Transform to string by default. "
... return ' '.join(value)
...
... def _transform_summary(self, value):
... "Preserve line breaks in summary. "
... return '\\n'.join(value)
...
... def _transform_tags(self, value):
... "One tag per line or one line of comma-separated tags."
... if len(value) == 1:
... return value[0].split(',')
... else:
... return value
Then, pass a document and your Data subclass (not an instance) to the
`get_data` function:
>>> doc = '''---
... Title: A Document Title
... Summary: A document summary.
...     Line two of the document summary.
... Tags: foo,bar
... ---
... The document body.
... '''
>>> doc, data = get_data(doc, MetaData)
The `get_data` function returns the document with the meta-data stripped
and the Meta-Data as an instance of your Data subclass:
>>> doc
'The document body.\\n'
>>> data['title']
'A Document Title'
>>> data['summary']
'A document summary.\\nLine two of the document summary.'
>>> data['tags']
['foo', 'bar']
Notice that each of the Meta-Data values has been transformed appropriately
and is ready to use. Now the document can be passed to your lightweight
markup processor of choice.
Note that you could pass some of the Meta-Data values to your lightweight
markup parser to alter its behavior, or to choose a different parser for a
given document.
"""
import re
class Data(dict):
    """
    A dictionary which allows custom type conversions per key.

    A subclass of Data should define a method for each known key which
    gets the raw value, converts it and returns it. When a method is not
    defined for a key, the raw value is simply returned as is.

    Only subscript access (``data[key]``) is overridden by this class; every
    other method is inherited from ``dict`` unchanged.

    The methods should be named `_transform_key` where "key" is the name
    of the key. For example, if a key is named "foo" the corresponding method
    would be `_transform_foo` and might look something like this:

    >>> class FooData(Data):
    ...     def _transform_foo(self, value):
    ...         "If foo is a list, convert to a string. "
    ...         if isinstance(value, list):
    ...             return ' '.join(value)
    ...         else:
    ...             return value

    When an instance of the class contains a key named "foo" and the value
    for that key is a list, the list will be converted to a string:

    >>> data = FooData(foo=['a', 'b', 'c'])
    >>> data.get_raw_item('foo')
    ['a', 'b', 'c']
    >>> data['foo']
    'a b c'
    >>> data.get_raw_item('foo') == data['foo']
    False

    If the key named "foo" already contains a string, then that string would
    be returned unaltered:

    >>> data = FooData(foo='a string')
    >>> data.get_raw_item('foo')
    'a string'
    >>> data['foo']
    'a string'
    >>> data.get_raw_item('foo') == data['foo']
    True

    Of course, the above is one simple example. Any number of possible
    conversions could be accounted for.

    If a `_transform_*` method is not defined for a given key, then a default
    `_transform` method is tried. You can define the default `_transform`
    method to apply to all keys which do not have a specific method defined
    for them:

    >>> class BarData(FooData):
    ...     def _transform(self, value):
    ...         "Convert int to a string. "
    ...         if isinstance(value, int):
    ...             return str(value)
    ...         else:
    ...             return value
    ...
    >>> data = BarData({'foo': ['a', 'b', 'c'], 'bar': 42})
    >>> data.get_raw_item('bar')
    42
    >>> data['bar']
    '42'
    >>> # The _transform_foo method still works for 'foo'
    >>> data['foo']
    'a b c'

    If neither a matching `_transform_*` method nor a `_transform` method
    is defined, then the raw data is returned unaltered:

    >>> data = FooData(baz='some value')
    >>> data.get_raw_item('baz')
    'some value'
    >>> data['baz']
    'some value'
    >>> data.get_raw_item('baz') == data['baz']
    True

    Keys do not have to be strings; any hashable key works, including
    tuples:

    >>> data = Data()
    >>> data[(1, 2)] = 'pair'
    >>> data[(1, 2)]
    'pair'
    """

    def get_raw_item(self, key):
        """Return the raw, unconverted value stored under `key`.

        Raises KeyError if `key` is not present.
        """
        return super(Data, self).__getitem__(key)

    def __getitem__(self, key):
        """Return the value for `key` after applying its transformation.

        The key-specific `_transform_<key>` method is preferred, then the
        generic `_transform` method; if neither exists the raw value is
        returned as is. Raises KeyError if `key` is not present.
        """
        raw_value = self.get_raw_item(key)
        # Wrap the key in a one-element tuple: with a bare tuple key the `%`
        # operator would treat it as the argument list and raise TypeError.
        candidates = ('_transform_%s' % (key,), '_transform')
        for name in candidates:
            if hasattr(self, name):
                return getattr(self, name)(raw_value)
        return raw_value
# A "Key: value" line: up to three leading spaces, a key made of ASCII
# letters, digits, underscores or hyphens, a colon, optional whitespace and
# then the rest of the line as the value.
META_RE = re.compile(r'^[ ]{0,3}(?P<key>[A-Za-z0-9_-]+):\s*(?P<value>.*)')
# A continuation line: starts with four spaces or a tab; any additional
# whitespace is matched separately and the remainder is captured as `value`.
META_MORE_RE = re.compile(r'^([ ]{4}|\t)(\s*)(?P<value>.*)')
# Opening delimiter: a line that starts with three hyphens.
# NOTE(review): the trailing group is optional and the pattern has no `$`,
# so with `.match()` a line such as '----' also matches -- confirm intended.
BEGIN_RE = re.compile(r'^-{3}(\s.*)?')
# Closing delimiter: a line that starts with three hyphens or three dots.
# NOTE(review): like BEGIN_RE this is not anchored at the end of the line.
END_RE = re.compile(r'^(-{3}|\.{3})(\s.*)?')
def get_data(doc, cls=Data):
    """
    Extract meta-data from a text document.

    Returns a tuple of (doc, data). The document is returned with the
    meta-data removed and the meta-data is returned in a container of
    type `cls` (defaults to Data). Pass in a subclass of Data to do custom
    transformations of the data for any given key.

    `cls` is called with no arguments and must provide `get_raw_item`, as
    Data does. Keys are lower-cased, and each raw value is stored as a list
    with one stripped string per header line (the `Key:` line plus any
    indented continuation lines, or repeated `Key:` lines).

    The header ends at the first blank line or closing delimiter (both are
    consumed), or at the first line that is neither a `Key: value` line nor
    a continuation line (which is left in the returned document). A leading
    opening delimiter line is always consumed. Line endings in the returned
    document are normalized to '\\n'.
    """
    lines = doc.replace('\r\n', '\n').replace('\r', '\n').split('\n')
    # Walk the header with a cursor instead of repeatedly popping from the
    # front of the list, which would shift the whole remaining document on
    # every header line.
    pos = 1 if lines and BEGIN_RE.match(lines[0]) else 0
    data = cls()
    key = None
    total = len(lines)
    while pos < total:
        line = lines[pos]
        if line.strip() == '' or END_RE.match(line):
            pos += 1  # blank line or end of YAML header - consume it, done
            break
        m1 = META_RE.match(line)
        if m1:
            key = m1.group('key').lower().strip()
            value = m1.group('value').strip()
            try:
                values = data.get_raw_item(key)
            except KeyError:
                data[key] = [value]
            else:
                # Repeated key: extend the list of raw lines for it.
                values.append(value)
        else:
            m2 = META_MORE_RE.match(line)
            if not (m2 and key):
                break  # no meta data - leave this line in the document
            # Add another line to existing key
            data.get_raw_item(key).append(m2.group('value').strip())
        pos += 1
    return '\n'.join(lines[pos:]), data
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    from doctest import testmod
    testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment