Skip to content

Instantly share code, notes, and snippets.

@photofroggy
Created December 19, 2012 04:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save photofroggy/4334363 to your computer and use it in GitHub Desktop.
Save photofroggy/4334363 to your computer and use it in GitHub Desktop.
Tablumps parsers using regex and string ops.
import re
import time
class ReTablumps(object):
    """dAmn tablumps parser built on regular expressions.

    dAmn sends certain information formatted in a specific manner.
    Links, images, thumbs, and other forms of data are formatted
    in strings where the different attributes of these values are
    separated by tab characters (``\\t``), and usually begin with an
    ampersand.

    We refer to these items as "tablumps" because of the tab
    characters being used as delimiters. The job of this class is to
    replace tablumps with readable strings, or to extract the data
    given in the tablumps.

    Attributes (all populated by ``__init__``):
        expressions: compiled patterns used by ``capture``.
        titles: result key for each entry of ``expressions``, by position.
        subs: ``(compiled pattern, replacement)`` pairs used by ``parse``.
        replace: ``(tablump, replacement)`` pairs for simple tags.
    """

    expressions = None
    replace = None
    titles = None
    subs = None

    def __init__(self):
        """Populate the expressions and replacements used when parsing tablumps."""
        # The class-level default is None, so this only skips the setup when
        # something (e.g. a subclass) has already provided the tables.
        if self.expressions is not None:
            return

        # Raw strings: the ``\t`` escapes are interpreted by ``re`` itself,
        # and ``/`` needs no escaping in a Python regular expression.
        avatar = re.compile(r"&avatar\t([a-zA-Z0-9-]+)\t([0-9]+)\t")
        dev = re.compile(r"&dev\t(.)\t([a-zA-Z0-9-]+)\t")
        # The trailing hyphen in the character class is a literal "-", so
        # image paths such as "e/m/thumbs-up.gif" are accepted.
        emote = re.compile(
            r"&emote\t([^\t]+)\t([0-9]+)\t([0-9]+)\t(.*?)\t([a-zA-Z0-9./=_-]+)\t"
        )
        anchor = re.compile(r"&a\t([^\t]+)\t([^\t]*)\t")
        link_bare = re.compile(r"&link\t([^\t]+)\t&\t")
        link_titled = re.compile(r"&link\t([^\t]+)\t([^\t]+)\t&\t")
        thumb = re.compile(
            r"&thumb\t(?P<ID>[0-9]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)"
            r"\t([^\t]+)\t([^\t]+)\t([^\t]+)\t"
        )
        img = re.compile(r"&img\t([^\t]+)\t([^\t]*)\t([^\t]*)\t")
        iframe = re.compile(
            r"&iframe\t([^\t]+)\t([0-9%]*)\t([0-9%]*)\t&/iframe\t"
        )

        # Regular expression objects used to find any complicated tablumps.
        self.expressions = [
            avatar,
            dev,
            emote,
            anchor,
            link_bare,
            link_titled,
            re.compile(r"&acro\t([^\t]+)\t(.*)&/acro\t"),
            re.compile(r"&abbr\t([^\t]+)\t(.*)&/abbr\t"),
            thumb,
            img,
            iframe,
        ]
        # Result keys for ``capture``; positions line up with ``expressions``.
        self.titles = (
            'avatar', 'dev', 'emote', 'a', 'link', 'link',
            'acronym', 'abbr', 'thumb', 'img', 'iframe',
        )

        # Regular expression objects used to find and replace complicated
        # tablumps.
        self.subs = [
            (avatar, r':icon\1:'),
            (dev, r':dev\2:'),
            (emote, r'\1'),
            (anchor, r'<a href="\1" title="\2">'),
            (link_bare, r'\1'),
            (link_titled, r'\1 (\2)'),
            # Only the opening tag is handled here; the closing tags are in
            # the simple ``replace`` table.
            (re.compile(r"&acro\t([^\t]+)\t"), r'<acronym title="\1">'),
            (re.compile(r"&abbr\t([^\t]+)\t"), r'<abbr title="\1">'),
            (thumb, r':thumb\1:'),
            (img, r'<img src="\1" alt="\2" title="\3" />'),
            (iframe, r'<iframe src="\1" width="\2" height="\3" />'),
            # Final clean-up: strip an attribute that ended up empty.
            (
                re.compile(r'<([^>]+) (width|height|title|alt)=""([^>]*?)>'),
                r'<\1\3>',
            ),
        ]

        # Search and replace pairs used to parse simple HTML tags.
        self.replace = [
            ("&b\t", "<b>"),
            ("&/b\t", "</b>"),
            ("&i\t", "<i>"),
            ("&/i\t", "</i>"),
            ("&u\t", "<u>"),
            ("&/u\t", "</u>"),
            ("&s\t", "<s>"),
            ("&/s\t", "</s>"),
            ("&sup\t", "<sup>"),
            ("&/sup\t", "</sup>"),
            ("&sub\t", "<sub>"),
            ("&/sub\t", "</sub>"),
            ("&code\t", "<code>"),
            ("&/code\t", "</code>"),
            ("&p\t", "<p>"),
            ("&/p\t", "</p>"),
            ("&ul\t", "<ul>"),
            ("&/ul\t", "</ul>"),
            ("&ol\t", "<ol>"),
            ("&/ol\t", "</ol>"),
            ("&li\t", "<li>"),
            ("&/li\t", "</li>"),
            ("&bcode\t", "<bcode>"),
            ("&/bcode\t", "</bcode>"),
            ("&br\t", "\n"),
            ("&/a\t", "</a>"),
            ("&/acro\t", "</acronym>"),
            ("&/abbr\t", "</abbr>"),
        ]

    def parse(self, data):
        """Parse any dAmn tablumps found in the input data.

        The simple tag replacements run first, then the regular-expression
        substitutions.

        Returns the string with the tablumps rewritten into readable form.
        Input that does not support string replacement (``None``, a number,
        a mismatched bytes/text value) is returned unchanged rather than
        raising.
        """
        parsed = data
        try:
            for lump, repl in self.replace:
                parsed = parsed.replace(lump, repl)
            for expression, repl in self.subs:
                parsed = expression.sub(repl, parsed)
        except (AttributeError, TypeError):
            # Best effort: hand back exactly what we were given.
            return data
        return parsed

    def capture(self, text):
        """Return the data carried by any dAmn tablumps found in ``text``.

        Rather than parsing the tablumps, this method returns the
        data given by tablumps. This only works for tablumps where
        a regular expression is used for parsing.

        Returns a dict mapping a title from ``titles`` to the list produced
        by ``findall`` for the matching expression(s). Titles with no
        matches are omitted. Two expressions share the ``'link'`` title
        (untitled and titled links); their matches are combined in one list,
        untitled links as plain strings and titled links as
        ``(url, title)`` tuples.
        """
        lumps = {}
        for title, expression in zip(self.titles, self.expressions):
            found = expression.findall(text)
            if found:
                # extend, not assign: a shared title must not lose the
                # matches of the earlier expression.
                lumps.setdefault(title, []).extend(found)
        return lumps
class TablumpString(object):
    """
    An object representing a string containing tablumps.

    Holds the raw input, the token list produced for it, and the parser
    that produced those tokens. Each rendering is produced on first request
    by calling ``parser.render(format, tokens)`` and then cached.
    """

    def __init__(self, parser, raw, tokens):
        """
        Store the parser, the raw input and its tokens.

        ``parser`` must provide ``render(format, tokens)``.
        """
        self.parser = parser
        self.raw = raw
        self.tokens = tokens
        # Per-format render caches; None means "not rendered yet".
        self._html = None
        self._ansi = None
        self._text = None

    def _rendered(self, cache_name, format_id):
        """Return the cached rendering stored in ``cache_name``, creating it on first use."""
        value = getattr(self, cache_name)
        if value is None:
            value = self.parser.render(format_id, self.tokens)
            setattr(self, cache_name, value)
        return value

    def text(self):
        """
        Render as plain text (format 0).
        """
        return self._rendered('_text', 0)

    def html(self):
        """
        Render as HTML (format 1).
        """
        return self._rendered('_html', 1)

    def ansi(self):
        """
        Render with ANSI escape codes (format 2).
        """
        return self._rendered('_ansi', 2)
class StrTablumps(object):
    """
    Parses tablumps using plain string operations.

    Input is first split into tokens (``tokenise``), which can then be
    rendered in one of several output formats (``render``). ``self.map``
    drives both steps: each key is a tablump tag including its trailing tab,
    and each value is a list ``[argc, text, html, ansi]`` where

    * ``argc`` is the number of tab-terminated arguments following the tag,
    * ``text`` is the plain-text renderer (always present),
    * ``html`` and ``ansi`` are optional renderers; when one is missing or
      ``None`` the text renderer is used instead.

    A renderer is either a ``str.format`` template that receives the
    arguments positionally, or a callable that receives the argument list.
    """

    def __init__(self):
        """
        Install the default tag map.
        """
        self.map = self.default_map()

    def default_map(self):
        """
        Default map containing renderers and argument numbers.
        """
        def rt_link(data):
            # Plain-text link: "url" or "url (title)".
            if not data:
                return ''
            if len(data) == 1:
                return data[0]
            return '{0} ({1})'.format(data[0], data[1])

        def rh_link(data):
            # HTML link; an untitled link gets the "[link]" placeholder text.
            if not data:
                return ''
            if len(data) == 1:
                return '<a href="{0}">[link]</a>'.format(data[0])
            return '<a href="{0}">{1}</a>'.format(data[0], data[1])

        return {
            '&b\t': [0, '<b>', '<b>', '\x1b[1m'],
            '&/b\t': [0, '</b>', '</b>', '\x1b[22m'],
            '&i\t': [0, '<i>', '<i>', '\x1b[3m'],
            '&/i\t': [0, '</i>', '</i>', '\x1b[23m'],
            '&u\t': [0, '<u>', '<u>', '\x1b[4m'],
            '&/u\t': [0, '</u>', '</u>', '\x1b[24m'],
            '&s\t': [0, '<s>', '<s>', '\x1b[9m'],
            '&/s\t': [0, '</s>', '</s>', '\x1b[29m'],
            '&sup\t': [0, '<sup>'],
            '&/sup\t': [0, '</sup>'],
            '&sub\t': [0, '<sub>'],
            '&/sub\t': [0, '</sub>'],
            '&code\t': [0, '<code>'],
            '&/code\t': [0, '</code>'],
            '&p\t': [0, '<p>'],
            '&/p\t': [0, '</p>'],
            '&ul\t': [0, '<ul>'],
            '&/ul\t': [0, '</ul>'],
            '&ol\t': [0, '<ol>'],
            '&li\t': [0, '<li>'],
            '&/li\t': [0, '</li>'],
            '&/ol\t': [0, '</ol>'],
            # Up to three fields are read, but a lone "&" field ends the
            # link early, so the renderers see one or two arguments.
            '&link\t': [3, rt_link, rh_link],
            '&acro\t': [1, '<acronym title="{0}">'],
            '&/acro\t': [0, '</acronym>'],
            '&abbr\t': [1, '<abbr title="{0}">'],
            '&/abbr\t': [0, '</abbr>'],
            '&img\t': [3, '<img src="{0}" alt="{1}" title="{2}" />'],
            '&iframe\t': [3, '<iframe src="{0}" width="{1}" height="{2}" />'],
            '&/iframe\t': [0, '</iframe>'],
            '&a\t': [2, '<a href="{0}" title="{1}">'],
            '&/a\t': [0, '</a>'],
            '&br\t': [0, '<br/>'],
            '&bcode\t': [0, '<bcode>', '<span><pre><code>'],
            '&/bcode\t': [0, '</bcode>', '</code></pre></span>'],
            # Same field counts and text output as the ReTablumps patterns
            # for these tags, so both parsers agree on them.
            '&avatar\t': [2, ':icon{0}:'],
            '&dev\t': [2, ':dev{1}:'],
            '&emote\t': [5, '{0}'],
            '&thumb\t': [7, ':thumb{0}:'],
            'EOF': [0, '', None, '\x1b[m'],
        }

    def parse(self, data):
        """
        Parse a string that may possibly contain tablumps.

        Returns a ``TablumpString`` wrapping ``data`` and its tokens.
        """
        return TablumpString(self, data, self.tokenise(data))

    def tokenise(self, data):
        """
        Split ``data`` into a list of tokens.

        Each token is a two-item list: ``['raw', text]`` for literal text,
        or ``[tag, args]`` for a recognised tablump. Every tablump token is
        preceded by a raw token (possibly empty) and the list always ends
        with a raw token holding whatever text is left.

        An ``&...<tab>`` run that is not a key of ``self.map`` is not a
        tablump; it is left in the raw text and scanning resumes right
        after its ``&``.
        """
        result = []
        working = data
        search_from = 0
        while True:
            amp = working.find('&', search_from)
            if amp == -1:
                break
            tab = working.find('\t', amp)
            if tab == -1:
                # Every tag ends in a tab; with none left there can be no
                # further tablumps.
                break
            crops = self.crop(working[amp:tab + 1], working[tab + 1:])
            if crops is None:
                # Not a known tag: keep the text and look for the next "&".
                search_from = amp + 1
                continue
            result.append(['raw', working[:amp]])
            result.append(crops[0])
            working = crops[1]
            search_from = 0
        result.append(['raw', working])
        return result

    def crop(self, tag, data):
        """
        Crop tablump data.

        ``tag`` is a candidate tag (with trailing tab) and ``data`` the text
        following it. Returns ``None`` when ``tag`` is not in ``self.map``;
        otherwise ``[[tag, args], rest]`` where ``args`` are the arguments
        taken from the front of ``data`` and ``rest`` is the text after them.
        """
        try:
            spec = self.map[tag]
        except KeyError:
            return None
        wanted = spec[0]
        if wanted == 0:
            return [[tag, []], data]
        args, rest = self.tokens(data, wanted)
        return [[tag, args], rest]

    def tokens(self, data, lim, sep=None, end=None):
        """
        Crop `lim` tokens from `data`.

        Tokens are terminated by ``sep`` (default: a tab). Cropping stops
        early when no separator is left, or when a token equal to ``end``
        (default ``'&'``) is read; that end marker is consumed but not
        returned.

        Returns ``[tokens, remaining_data]``.
        """
        sep = sep or '\t'
        end = end or '&'
        tokens = []
        for _ in range(lim):
            cut = data.find(sep)
            if cut == -1:
                break
            piece = data[:cut]
            # Skip the whole separator, whatever its length.
            data = data[cut + len(sep):]
            if piece == end:
                break
            tokens.append(piece)
        return [tokens, data]

    def render(self, format, tokens):
        """
        Render a set of tablump tokens as a string.

        ``format`` selects the renderer: 0 for text, 1 for HTML, 2 for ANSI.
        When a tag has no renderer for the requested format its text
        renderer is used. Tokens whose tag is not in ``self.map`` are
        skipped. Template renderers that receive fewer arguments than the
        tag declares (truncated input) get empty strings for the missing
        ones.
        """
        # Renderers start at index 1 of a map entry; index 0 is the
        # argument count.
        slot = format + 1
        pieces = []
        for token in tokens:
            name, payload = token[0], token[1]
            if name == 'raw':
                pieces.append(payload)
                continue
            spec = self.map.get(name)
            if spec is None:
                continue
            renderer = spec[slot] if 1 <= slot < len(spec) else None
            if renderer is None:
                renderer = spec[1]
            if callable(renderer):
                pieces.append(renderer(payload))
            else:
                missing = spec[0] - len(payload)
                args = list(payload) + [''] * missing
                pieces.append(renderer.format(*args))
        return ''.join(pieces)
if __name__ == '__main__':
    # Run both parsers once over the same sample message and print each
    # result together with its wall-clock timing.
    # print() is called with a single argument throughout, so the output is
    # the same whether the script runs under Python 2 or Python 3.
    pstr = StrTablumps()
    preg = ReTablumps()
    rtls = '&b\t&a\thttp://google.com\tfoo\tsomething&/a\t&/b\t&link\thttp://github.com\tgithub\t&\t'
    print('>> Using ' + rtls)

    print('>> Testing regex method...')
    sttsre = time.time()
    ptlsre = preg.parse(rtls)
    edtsre = time.time()
    diffre = edtsre - sttsre
    print('>> Result: ' + ptlsre)
    print('>> Start: {0}; End: {1}; Diff: {2}'.format(sttsre, edtsre, diffre))

    print('>> Testing string method...')
    sttsstr = time.time()
    tls = pstr.parse(rtls)
    ptlsstr = tls.text()
    edtsstr = time.time()
    diffstr = edtsstr - sttsstr
    print('>> Result: ' + ptlsstr)
    print('>> Start: {0}; End: {1}; Diff: {2}'.format(sttsstr, edtsstr, diffstr))

    print('>> string diff - reg diff: {0}'.format(diffstr - diffre))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment