Last active
August 29, 2015 14:16
-
-
Save waylan/24bd9705b0c3e5bd213b to your computer and use it in GitHub Desktop.
A better Meta-Data handler for lightweight markup languages.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
DocData

A Meta-Data handler for lightweight markup languages.

Note: This was an experiment which was rejected in favor of a different API.
See the better implementation here: <https://github.com/waylan/docdata>

An implementation of Meta-Data as defined by MultiMarkdown. However, it can
work with any lightweight markup language and the various keys can have
custom transformations defined for them.

First, define the transformations by subclassing the Data class:

    >>> class MetaData(Data):
    ...     def _transform(self, value):
    ...         "Transform to string by default. "
    ...         return ' '.join(value)
    ...
    ...     def _transform_summary(self, value):
    ...         "Preserve line breaks in summary. "
    ...         return '\\n'.join(value)
    ...
    ...     def _transform_tags(self, value):
    ...         "One tag per line or one line of comma-separated tags."
    ...         if len(value) == 1:
    ...             return value[0].split(',')
    ...         else:
    ...             return value

Then, pass a document and your Data subclass (not an instance) to the
`get_data` function:

    >>> doc = '''---
    ... Title: A Document Title
    ... Summary: A document summary.
    ...     Line two of the document summary.
    ... Tags: foo,bar
    ... ---
    ... The document body.
    ... '''
    >>> doc, data = get_data(doc, MetaData)

The `get_data` function returns the document with the meta-data stripped
and the Meta-Data as an instance of your Data subclass:

    >>> doc
    'The document body.\\n'
    >>> data['title']
    'A Document Title'
    >>> data['summary']
    'A document summary.\\nLine two of the document summary.'
    >>> data['tags']
    ['foo', 'bar']

Notice that each of the Meta-Data values has been transformed appropriately
and is ready to use. Now the document can be passed to your lightweight
markup processor of choice.

Note that you could pass some of the Meta-Data values to your lightweight
markup parser to alter its behavior, or to choose a different parser for a
given document.
""" | |
import re | |
class Data(dict):
    """
    A dictionary which allows custom type conversions per key.

    A subclass of Data should define a method for each known key which
    gets the raw value, converts it and returns it. When a method is not
    defined for a key, the raw value is simply returned as is.

    The methods should be named `_transform_key` where "key" is the name
    of the key. For example, if a key is named "foo" the corresponding method
    would be `_transform_foo` and might look something like this:

        >>> class FooData(Data):
        ...     def _transform_foo(self, value):
        ...         "If foo is a list, convert to a string. "
        ...         if isinstance(value, list):
        ...             return ' '.join(value)
        ...         else:
        ...             return value

    When an instance of the class contains a key named "foo" and the value
    for that key is a list, the list will be converted to a string:

        >>> data = FooData(foo=['a', 'b', 'c'])
        >>> data.get_raw_item('foo')
        ['a', 'b', 'c']
        >>> data['foo']
        'a b c'
        >>> data.get_raw_item('foo') == data['foo']
        False

    If the key named "foo" already contains a string, then that string would
    be returned unaltered:

        >>> data = FooData(foo='a string')
        >>> data.get_raw_item('foo')
        'a string'
        >>> data['foo']
        'a string'
        >>> data.get_raw_item('foo') == data['foo']
        True

    Of course, the above is one simple example. Any number of possible
    conversions could be accounted for.

    If a `_transform_*` method is not defined for a given key, then a default
    `_transform` method is tried. You can define the default `_transform`
    method to apply to all keys which do not have a specific method defined
    for them:

        >>> class BarData(FooData):
        ...     def _transform(self, value):
        ...         "Convert int to a string. "
        ...         if isinstance(value, int):
        ...             return str(value)
        ...         else:
        ...             return value
        ...
        >>> data = BarData({'foo': ['a', 'b', 'c'], 'bar': 42})
        >>> data.get_raw_item('bar')
        42
        >>> data['bar']
        '42'
        >>> # The _transform_foo method still works for 'foo'
        >>> data['foo']
        'a b c'

    If neither a matching `_transform_*` method or `_transform` method
    are defined, then the raw data is returned unaltered:

        >>> data = FooData(baz='some value')
        >>> data.get_raw_item('baz')
        'some value'
        >>> data['baz']
        'some value'
        >>> data.get_raw_item('baz') == data['baz']
        True

    Keys which are not strings (a tuple, for example) are supported as well;
    they simply never match a `_transform_*` method:

        >>> data = Data({(1, 2): 'pair'})
        >>> data[(1, 2)]
        'pair'

    """

    def get_raw_item(self, key):
        """
        Return the raw, unconverted item stored under `key`.

        Raises KeyError when `key` is not in the dictionary.
        """
        return super(Data, self).__getitem__(key)

    def __getitem__(self, key):
        """
        Return the item stored under `key` after applying its transformation.

        The key-specific `_transform_<key>` method is used when defined,
        otherwise the default `_transform` method, otherwise the raw value
        is returned as is. Raises KeyError when `key` is not present.
        """
        # Look the value up first so a missing key always raises KeyError.
        raw_value = self.get_raw_item(key)
        # Wrap the key in a 1-tuple: a bare tuple key would otherwise be
        # unpacked as the format arguments and raise TypeError.
        trans = '_transform_%s' % (key,)
        if hasattr(self, trans):
            return getattr(self, trans)(raw_value)
        if hasattr(self, '_transform'):
            return self._transform(raw_value)
        return raw_value
# Line patterns used when scanning the meta-data header of a document.
# All of them are applied to single lines (no trailing newline) via `.match()`.

# A `Key: value` line, indented by at most three spaces.
META_RE = re.compile(r'^[ ]{0,3}(?P<key>[A-Za-z0-9_-]+):\s*(?P<value>.*)')
# A continuation line (four spaces or a tab) adding to the previous key.
META_MORE_RE = re.compile(r'^([ ]{4}|\t)(\s*)(?P<value>.*)')
# Optional YAML-style opening delimiter: exactly `---`, optionally followed
# by whitespace and arbitrary text. The `$` anchor keeps lines such as `----`
# or `---foo` from being mistaken for a delimiter.
BEGIN_RE = re.compile(r'^-{3}(\s.*)?$')
# YAML-style closing delimiter: exactly `---` or `...`, optionally followed
# by whitespace and arbitrary text. Anchored for the same reason, so a body
# line like `...and so on` is not swallowed as an end marker.
END_RE = re.compile(r'^(-{3}|\.{3})(\s.*)?$')
def get_data(doc, cls=Data):
    """
    Extract meta-data from a text document.

    Returns a tuple of (doc, data). The document is returned with the
    meta-data removed and the meta-data is returned in a container of
    type `cls` (defaults to Data). Pass in a subclass of Data to do custom
    transformations of the data for any given key.

    The meta-data header may optionally open with a `---` line and ends at
    the first blank line or `---`/`...` line (which is consumed), or at the
    first line which is not meta-data (which is kept in the document). Keys
    are lower-cased and every raw value is stored as a list of lines. Line
    endings in the returned document are normalized to `\\n`.

    If the document opens with a `---` line but no meta-data follows it, the
    `---` line is left in the returned document.
    """
    # Normalize line endings so the line patterns only ever see '\n'.
    lines = doc.replace('\r\n', '\n').replace('\r', '\n').split('\n')
    data = cls()
    # `str.split` always yields at least one item, so `lines[0]` is safe.
    has_begin = bool(BEGIN_RE.match(lines[0]))
    # `pos` is the index of the next unread line; everything from `pos`
    # onward is what remains of the document once scanning stops.
    pos = 1 if has_begin else 0
    key = None
    while pos < len(lines):
        line = lines[pos]
        pos += 1
        if line.strip() == '' or END_RE.match(line):
            break  # blank line or end of YAML header - done
        m1 = META_RE.match(line)
        if m1:
            key = m1.group('key').lower().strip()
            value = m1.group('value').strip()
            try:
                # A repeated key adds to the lines already collected.
                data.get_raw_item(key).append(value)
            except KeyError:
                data[key] = [value]
            continue
        m2 = META_MORE_RE.match(line)
        if m2 and key:
            # Add another line to existing key
            data.get_raw_item(key).append(m2.group('value').strip())
            continue
        # Not meta-data: hand the line back to the document.
        pos -= 1
        if has_begin and key is None:
            # Nothing was parsed, so the leading `---` was not a header
            # delimiter after all; keep it as part of the document.
            pos = 0
        break
    return '\n'.join(lines[pos:]), data
if __name__ == '__main__':
    # When executed as a script, run every doctest embedded in this module.
    from doctest import testmod
    testmod()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment