Created
December 19, 2012 04:22
-
-
Save photofroggy/4334363 to your computer and use it in GitHub Desktop.
Tablumps parsers using regex and string ops.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import time | |
class ReTablumps(object):
    """dAmn tablumps parser built on regular expressions.

    dAmn sends certain information formatted in a specific manner.
    Links, images, thumbs, and other forms of data are formatted
    in strings where the different attributes of these values are
    separated by tab characters (``\\t``), and usually begin with an
    ampersand.

    We refer to these items as "tablumps" because of the tab
    characters being used as delimiters. The job of this class is to
    replace tablumps with readable strings (``parse``), or to extract
    the data given in the tablumps (``capture``).

    Instance attributes populated by ``__init__``:

    * ``expressions`` -- compiled patterns used by ``capture``.
    * ``titles`` -- result keys for ``capture``, parallel to
      ``expressions``.
    * ``subs`` -- ``(compiled pattern, replacement)`` pairs used by
      ``parse`` for lumps that carry arguments.
    * ``replace`` -- ``(literal, replacement)`` pairs used by ``parse``
      for argument-less lumps.
    """

    expressions = None
    replace = None
    titles = None
    subs = None

    def __init__(self):
        """Populate the expressions and replaces used when parsing tablumps."""
        # Skip the work when the tables are already present on this object
        # (e.g. __init__ invoked a second time, or a subclass defining them).
        if self.expressions is not None:
            return

        # Pattern sources shared by ``expressions`` and ``subs``. Raw strings
        # are used so that ``\t`` and friends are interpreted by the regex
        # engine rather than relying on Python's handling of unknown escapes.
        avatar = r"&avatar\t([a-zA-Z0-9-]+)\t([0-9]+)\t"
        dev = r"&dev\t(.)\t([a-zA-Z0-9-]+)\t"
        # NOTE: ``=-_`` inside the last character class is a *range*
        # (``=`` through ``_``), so it also admits upper-case letters and a
        # few punctuation characters. Kept unchanged for compatibility.
        emote = r"&emote\t([^\t]+)\t([0-9]+)\t([0-9]+)\t(.*?)\t([a-z0-9./=-_]+)\t"
        anchor = r"&a\t([^\t]+)\t([^\t]*)\t"
        bare_link = r"&link\t([^\t]+)\t&\t"
        titled_link = r"&link\t([^\t]+)\t([^\t]+)\t&\t"
        thumb = r"&thumb\t(?P<ID>[0-9]+)\t" + r"([^\t]+)\t" * 6
        img = r"&img\t([^\t]+)\t([^\t]*)\t([^\t]*)\t"
        iframe = r"&iframe\t([^\t]+)\t([0-9%]*)\t([0-9%]*)\t&/iframe\t"

        # Regular expression objects used to find any complicated tablumps.
        # The acro/abbr bodies are matched lazily so that several lumps in
        # one message are captured individually instead of as one big span.
        self.expressions = [
            re.compile(avatar),
            re.compile(dev),
            re.compile(emote),
            re.compile(anchor),
            re.compile(bare_link),
            re.compile(titled_link),
            re.compile(r"&acro\t([^\t]+)\t(.*?)&/acro\t"),
            re.compile(r"&abbr\t([^\t]+)\t(.*?)&/abbr\t"),
            re.compile(thumb),
            re.compile(img),
            re.compile(iframe),
        ]
        # Keys under which ``capture`` files the matches of the expression at
        # the same index. 'link' appears twice: once per link pattern.
        self.titles = (
            'avatar', 'dev', 'emote', 'a', 'link', 'link',
            'acronym', 'abbr', 'thumb', 'img', 'iframe',
        )

        # Regular expression objects used to find and replace complicated
        # tablumps.
        self.subs = [
            (re.compile(avatar), r":icon\1:"),
            (re.compile(dev), r":dev\2:"),
            (re.compile(emote), r"\1"),
            (re.compile(anchor), r'<a href="\1" title="\2">'),
            (re.compile(bare_link), r"\1"),
            (re.compile(titled_link), r"\1 (\2)"),
            (re.compile(r"&acro\t([^\t]+)\t"), r'<acronym title="\1">'),
            (re.compile(r"&abbr\t([^\t]+)\t"), r'<abbr title="\1">'),
            (re.compile(thumb), r":thumb\1:"),
            (re.compile(img), r'<img src="\1" alt="\2" title="\3" />'),
            (re.compile(iframe),
             r'<iframe src="\1" width="\2" height="\3" />'),
            # Clean-up pass: drop an empty width/height/title/alt attribute
            # from a generated tag.
            (re.compile(r'<([^>]+) (width|height|title|alt)=""([^>]*?)>'),
             r"<\1\3>"),
        ]

        # Search and replace pairs used to parse simple HTML tags.
        self.replace = [
            ("&b\t", "<b>"),
            ("&/b\t", "</b>"),
            ("&i\t", "<i>"),
            ("&/i\t", "</i>"),
            ("&u\t", "<u>"),
            ("&/u\t", "</u>"),
            ("&s\t", "<s>"),
            ("&/s\t", "</s>"),
            ("&sup\t", "<sup>"),
            ("&/sup\t", "</sup>"),
            ("&sub\t", "<sub>"),
            ("&/sub\t", "</sub>"),
            ("&code\t", "<code>"),
            ("&/code\t", "</code>"),
            ("&p\t", "<p>"),
            ("&/p\t", "</p>"),
            ("&ul\t", "<ul>"),
            ("&/ul\t", "</ul>"),
            ("&ol\t", "<ol>"),
            ("&/ol\t", "</ol>"),
            ("&li\t", "<li>"),
            ("&/li\t", "</li>"),
            ("&bcode\t", "<bcode>"),
            ("&/bcode\t", "</bcode>"),
            ("&br\t", "\n"),
            ("&/a\t", "</a>"),
            ("&/acro\t", "</acronym>"),
            ("&/abbr\t", "</abbr>"),
        ]

    def parse(self, data):
        """Parse any dAmn tablumps found in our input data.

        The literal ``replace`` pairs are applied first, then every
        ``subs`` pattern in order.

        :param data: string that may contain tablumps.
        :returns: the string with tablumps rewritten into readable
            form. Input that is not string-like is returned unchanged.
        """
        try:
            for lump, repl in self.replace:
                data = data.replace(lump, repl)
            for expression, repl in self.subs:
                data = expression.sub(repl, data)
        except (AttributeError, TypeError):
            # Not a string (no ``.replace`` / rejected by ``re``): parsing is
            # best-effort, so hand back whatever we have instead of raising.
            pass
        return data

    def capture(self, text):
        """Return any dAmn tablumps found in our input data.

        Rather than parsing the tablumps, this method returns the
        data given by tablumps. This only works for tablumps where
        a regular expression is used for parsing.

        :param text: string that may contain tablumps.
        :returns: dict mapping a title from ``titles`` to the list of
            ``findall`` results for it (a string per match for
            single-group patterns, a tuple per match otherwise). Titles
            shared by several patterns, such as ``'link'``, accumulate
            the matches of all of them. Titles without matches are
            omitted.
        """
        lumps = {}
        for title, expression in zip(self.titles, self.expressions):
            found = expression.findall(text)
            if not found:
                continue
            # Extend rather than assign so a later pattern with the same
            # title does not discard what an earlier one captured.
            lumps.setdefault(title, []).extend(found)
        return lumps
class TablumpString(object):
    """
    A tokenised tablump string bound to the parser that produced it.

    Keeps the raw input, the token list and one lazily filled cache slot
    per output flavour (``_html``, ``_ansi``, ``_text``); only the text
    slot is used by the methods defined here.
    """

    def __init__(self, parser, raw, tokens):
        # Cache slots start empty; rendering is deferred until requested.
        self._text = None
        self._ansi = None
        self._html = None
        self.tokens = tokens
        self.raw = raw
        self.parser = parser

    def text(self):
        """
        Return the plain text rendering.

        The first call delegates to ``parser.render(0, tokens)`` and the
        result is remembered for subsequent calls.
        """
        cached = self._text
        if cached is not None:
            return cached
        cached = self.parser.render(0, self.tokens)
        self._text = cached
        return cached
class StrTablumps(object):
    """
    Parses tablumps using plain string operations.

    The input is first split into tokens (``tokenise``) which are either
    ``['raw', text]`` or ``[tag, [arg, ...]]``; ``render`` then turns a
    token list into a string using the renderers stored in ``self.map``.
    """

    def __init__(self):
        """
        Install the default tag map on the instance.
        """
        self.map = self.default_map()

    def default_map(self):
        """
        Default map containing renderers and argument numbers.

        Each key is a tag including its trailing tab. Each value is a list
        whose first item is the maximum number of tab-separated arguments
        that follow the tag, followed by one renderer per output format
        (index 1 is what ``render(0, ...)`` uses; later slots hold
        HTML-flavoured and ANSI-escape variants where provided). A renderer
        is either a ``str.format`` template or a callable receiving the
        argument list.
        """
        def rt_link(data):
            # Text flavour: bare URL, or "URL (title)" when a title is given.
            if len(data) == 1:
                return data[0]
            return '{0} ({1})'.format(data[0], data[1])

        def rh_link(data):
            # HTML flavour: untitled links get a "[link]" label.
            if len(data) == 1:
                return '<a href="{0}">[link]</a>'.format(data[0])
            return '<a href="{0}">{1}</a>'.format(data[0], data[1])

        return {
            '&b\t': [0, '<b>', '<b>', '\x1b[1m'],
            '&/b\t': [0, '</b>', '</b>', '\x1b[22m'],
            '&i\t': [0, '<i>', '<i>', '\x1b[3m'],
            '&/i\t': [0, '</i>', '</i>', '\x1b[23m'],
            '&u\t': [0, '<u>', '<u>', '\x1b[4m'],
            '&/u\t': [0, '</u>', '</u>', '\x1b[24m'],
            '&s\t': [0, '<s>', '<s>', '\x1b[9m'],
            '&/s\t': [0, '</s>', '</s>', '\x1b[29m'],
            '&sup\t': [0, '<sup>'],
            '&/sup\t': [0, '</sup>'],
            '&sub\t': [0, '<sub>'],
            '&/sub\t': [0, '</sub>'],
            '&code\t': [0, '<code>'],
            '&/code\t': [0, '</code>'],
            '&p\t': [0, '<p>'],
            '&/p\t': [0, '</p>'],
            '&ul\t': [0, '<ul>'],
            '&/ul\t': [0, '</ul>'],
            '&ol\t': [0, '<ol>'],
            '&li\t': [0, '<li>'],
            '&/li\t': [0, '</li>'],
            '&/ol\t': [0, '</ol>'],
            # A link carries a URL, an optional title and a closing '&'.
            '&link\t': [3, rt_link, rh_link],
            '&acro\t': [1, '<acronym title="{0}">'],
            '&/acro\t': [0, '</acronym>'],
            '&abbr\t': [1, '<abbr title="{0}">'],
            '&/abbr\t': [0, '</abbr>'],
            '&img\t': [3, '<img src="{0}" alt="{1}" title="{2}" />'],
            '&iframe\t': [3, '<iframe src="{0}" width="{1}" height="{2}" />'],
            '&/iframe\t': [0, '</iframe>'],
            '&a\t': [2, '<a href="{0}" title="{1}">'],
            '&/a\t': [0, '</a>'],
            '&br\t': [0, '<br/>'],
            '&bcode\t': [0, '<bcode>', '<span><pre><code>'],
            '&/bcode\t': [0, '</bcode>', '</code></pre></span>'],
            'EOF': [0, '', None, '\x1b[m'],
        }

    def parse(self, data):
        """
        Parse a string that may possibly contain tablumps.

        Returns a ``TablumpString`` wrapping the raw data and its tokens.
        """
        return TablumpString(self, data, self.tokenise(data))

    def tokenise(self, data):
        """
        Split `data` into raw-text and tablump tokens.

        Returns a list of ``['raw', text]`` and ``[tag, args]`` items. A raw
        token (possibly empty) precedes every tablump token and closes the
        list. Ampersands that do not start a known tablump, including
        unknown tags, stay part of the surrounding raw text.
        """
        result = []
        search = 0  # where to look for the next candidate '&' in `data`
        while True:
            amp = data.find('&', search)
            if amp == -1:
                break
            tab = data.find('\t', amp)
            if tab == -1:
                # No tab after this '&' means none after any later one.
                break
            tag = data[amp:tab + 1]
            crops = None
            if ' ' not in tag:
                crops = self.crop(tag, data[tab + 1:])
            if crops is None:
                # Not a tablump: leave the text in place and resume the scan
                # right after this ampersand so nothing is skipped or lost.
                search = amp + 1
                continue
            result.append(['raw', data[:amp]])
            result.append(crops[0])
            # Continue with whatever follows the tablump and its arguments.
            data = crops[1]
            search = 0
        result.append(['raw', data])
        return result

    def crop(self, tag, data):
        """
        Crop tablump data.

        `data` is the text following `tag`. Returns ``[[tag, args], rest]``
        or ``None`` when `tag` is not in the map, or when it expects
        arguments and none could be read.
        """
        if tag not in self.map:
            return None
        argc = self.map[tag][0]
        if argc == 0:
            return [[tag, []], data]
        found, rest = self.tokens(data, argc)
        if not found:
            # Truncated lump: treat it as plain text rather than produce a
            # token no renderer could handle.
            return None
        return [[tag, found], rest]

    def tokens(self, data, lim, sep=None, end=None):
        """
        Crop `lim` tokens from `data`.

        Tokens are delimited by `sep` (default tab). Reading stops early
        when no separator is left or when a token equal to `end` (default
        ``'&'``) is met; the end marker is consumed but not returned.
        Returns ``[tokens, remaining_data]``.
        """
        sep = sep or '\t'
        end = end or '&'
        found = []
        for _ in range(lim):
            head, hit, tail = data.partition(sep)
            if not hit:
                break
            data = tail
            if head == end:
                break
            found.append(head)
        return [found, data]

    def render(self, format, tokens):
        """
        Render a set of tablump tokens as a string.

        `format` selects the renderer slot (``format + 1`` in each map
        entry); entries lacking that slot fall back to slot 1. Tokens whose
        tag is not in the map are skipped. Template renderers missing
        arguments get empty strings for them.
        """
        format += 1
        pieces = []
        for token in tokens:
            name, payload = token[0], token[1]
            if name == 'raw':
                pieces.append(payload)
                continue
            entry = self.map.get(name)
            if entry is None:
                continue
            try:
                renderer = entry[format]
            except IndexError:
                renderer = entry[1]
            if callable(renderer):
                pieces.append(renderer(payload))
                continue
            # Pad short argument lists so a truncated lump renders with
            # blank attributes instead of raising from str.format.
            args = list(payload)
            args.extend([''] * (entry[0] - len(args)))
            pieces.append(renderer.format(*args))
        return ''.join(pieces)
if __name__ == '__main__':
    # Ad-hoc comparison: run one sample through both parsers, printing each
    # result together with the wall-clock time it took.
    pstr = StrTablumps()
    preg = ReTablumps()
    rtls = '&b\t&a\thttp://google.com\tfoo\tsomething&/a\t&/b\t&link\thttp://github.com\tgithub\t&\t'

    # Each print() gets exactly one argument, so the output is the same
    # whether this runs as a Python 2 print statement or a Python 3 call.
    print('>> Using ' + rtls)

    print('>> Testing regex method...')
    sttsre = time.time()
    ptlsre = preg.parse(rtls)
    edtsre = time.time()
    diffre = edtsre - sttsre
    print('>> Result: ' + ptlsre)
    print('>> Start: {0}; End: {1}; Diff: {2}'.format(sttsre, edtsre, diffre))

    print('>> Testing string method...')
    sttsstr = time.time()
    tls = pstr.parse(rtls)
    ptlsstr = tls.text()
    edtsstr = time.time()
    diffstr = edtsstr - sttsstr
    print('>> Result: ' + ptlsstr)
    print('>> Start: {0}; End: {1}; Diff: {2}'.format(sttsstr, edtsstr, diffstr))

    # Positive means the string-based parser was slower on this sample.
    print('>> string diff - reg diff: {0}'.format(diffstr - diffre))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment