# photofroggy/tablumps.py

## tablumps.py
import re
import time


class ReTablumps(object):
    """ dAmn tablumps parser based on regular expressions.

        dAmn sends certain information formatted in a specific manner.
        Links, images, thumbs, and other forms of data are formatted
        in strings where the different attributes of these values are
        separated by tab characters (``\\t``), and usually begin with an
        ampersand.

        We refer to these items as "tablumps" because of the tab
        characters being used as delimiters. The job of this class is to
        replace tablumps with readable strings (``parse``), or to extract
        the data given in the tablumps (``capture``).
    """

    expressions = None
    replace = None
    titles = None
    subs = None

    def __init__(self):
        """Populate the expressions and replaces used when parsing tablumps."""
        # Nothing to do when the tables are already present, e.g. because a
        # subclass defines them at class level.
        if self.expressions is not None:
            return
        # Regular expression objects used to find any complicated tablumps.
        # ``titles`` below names each entry, position by position.
        # NOTE: in the emote pattern ``=-_`` is a character *range* (it also
        # admits upper-case letters and a few symbols); kept unchanged so the
        # set of accepted emote file names stays the same.
        self.expressions = [
            re.compile("&avatar\t([a-zA-Z0-9-]+)\t([0-9]+)\t"),
            re.compile("&dev\t(.)\t([a-zA-Z0-9-]+)\t"),
            re.compile("&emote\t([^\t]+)\t([0-9]+)\t([0-9]+)\t(.*?)\t([a-z0-9./=-_]+)\t"),
            re.compile("&a\t([^\t]+)\t([^\t]*)\t"),
            re.compile("&link\t([^\t]+)\t&\t"),
            re.compile("&link\t([^\t]+)\t([^\t]+)\t&\t"),
            # Non-greedy bodies so that two lumps in one message are captured
            # separately instead of as one span.
            re.compile("&acro\t([^\t]+)\t(.*?)&/acro\t"),
            re.compile("&abbr\t([^\t]+)\t(.*?)&/abbr\t"),
            re.compile("&thumb\t(?P<ID>[0-9]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t"),
            re.compile("&img\t([^\t]+)\t([^\t]*)\t([^\t]*)\t"),
            re.compile("&iframe\t([^\t]+)\t([0-9%]*)\t([0-9%]*)\t&/iframe\t"),
        ]
        self.titles = ('avatar', 'dev', 'emote', 'a', 'link', 'link', 'acronym', 'abbr', 'thumb', 'img', 'iframe')
        # Regular expression objects used to find and replace complicated tablumps.
        self.subs = [
            (re.compile("&avatar\t([a-zA-Z0-9-]+)\t([0-9]+)\t"), ":icon\\1:"),
            (re.compile("&dev\t(.)\t([a-zA-Z0-9-]+)\t"), ":dev\\2:"),
            (re.compile("&emote\t([^\t]+)\t([0-9]+)\t([0-9]+)\t(.*?)\t([a-z0-9./=-_]+)\t"), "\\1"),
            (re.compile("&a\t([^\t]+)\t([^\t]*)\t"), "<a href=\"\\1\" title=\"\\2\">"),
            (re.compile("&link\t([^\t]+)\t&\t"), "\\1"),
            (re.compile("&link\t([^\t]+)\t([^\t]+)\t&\t"), "\\1 (\\2)"),
            (re.compile("&acro\t([^\t]+)\t"), "<acronym title=\"\\1\">"),
            (re.compile("&abbr\t([^\t]+)\t"), "<abbr title=\"\\1\">"),
            (re.compile("&thumb\t([0-9]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t"), ":thumb\\1:"),
            (re.compile("&img\t([^\t]+)\t([^\t]*)\t([^\t]*)\t"), "<img src=\"\\1\" alt=\"\\2\" title=\"\\3\" />"),
            (re.compile("&iframe\t([^\t]+)\t([0-9%]*)\t([0-9%]*)\t&/iframe\t"), "<iframe src=\"\\1\" width=\"\\2\" height=\"\\3\" />"),
            # Runs last: drops an attribute that ended up empty in a tag
            # produced by the substitutions above.
            (re.compile("<([^>]+) (width|height|title|alt)=\"\"([^>]*?)>"), "<\\1\\3>"),
        ]
        # Search and replace pairs used to parse simple HTML tags.
        self.replace = [
            ("&b\t", "<b>"),
            ("&/b\t", "</b>"),
            ("&i\t", "<i>"),
            ("&/i\t", "</i>"),
            ("&u\t", "<u>"),
            ("&/u\t", "</u>"),
            ("&s\t", "<s>"),
            ("&/s\t", "</s>"),
            ("&sup\t", "<sup>"),
            ("&/sup\t", "</sup>"),
            ("&sub\t", "<sub>"),
            ("&/sub\t", "</sub>"),
            ("&code\t", "<code>"),
            ("&/code\t", "</code>"),
            ("&p\t", "<p>"),
            ("&/p\t", "</p>"),
            ("&ul\t", "<ul>"),
            ("&/ul\t", "</ul>"),
            ("&ol\t", "<ol>"),
            ("&/ol\t", "</ol>"),
            ("&li\t", "<li>"),
            ("&/li\t", "</li>"),
            ("&bcode\t", "<bcode>"),
            ("&/bcode\t", "</bcode>"),
            ("&br\t", "\n"),
            ("&/a\t", "</a>"),
            ("&/acro\t", "</acronym>"),
            ("&/abbr\t", "</abbr>"),
        ]

    def parse(self, data):
        """ Parse any dAmn Tablumps found in our input data.

            The plain ``replace`` pairs are applied first, then the regular
            expression substitutions in ``subs``. The resulting string is
            returned.

            This is best effort: input that does not behave like a string
            (``None``, numbers, ...) is returned unchanged rather than
            raising.
        """
        try:
            for lump, repl in self.replace:
                data = data.replace(lump, repl)
            for expression, repl in self.subs:
                data = expression.sub(repl, data)
        except (AttributeError, TypeError):
            # Not a string-like value; hand back whatever we have.
            pass
        return data

    def capture(self, text):
        """ Return any dAmn Tablumps found in our input data.

            Rather than parsing the tablumps, this method returns the
            data given by tablumps. This only works for tablumps where
            a regular expression is used for parsing.

            The result maps a title from ``titles`` to the list of
            ``findall`` matches for it. Titles without a match are left
            out. Expressions that share a title (the two ``link`` forms)
            contribute to the same list.
        """
        lumps = {}
        for title, expression in zip(self.titles, self.expressions):
            found = expression.findall(text)
            if found:
                # Extend instead of assign so a later expression with the
                # same title does not discard earlier matches.
                lumps.setdefault(title, []).extend(found)
        return lumps


class TablumpString(object):
    """
    Pairs a raw string with the tablump tokens parsed from it and the
    parser able to render those tokens.
    """

    def __init__(self, parser, raw, tokens):
        self.parser, self.raw, self.tokens = parser, raw, tokens
        # Render caches. Only `_text` is filled in by this class.
        self._html = self._ansi = self._text = None

    def text(self):
        """
        Return the rendering produced by ``parser.render(0, tokens)``.

        The result is computed on first use and cached afterwards.
        """
        if self._text is not None:
            return self._text
        self._text = self.parser.render(0, self.tokens)
        return self._text


class StrTablumps(object):
    """
    Parses tablumps using plain string operations.

    ``self.map`` associates each tag (``&name\\t``) with a list whose
    first item is the number of tab-separated arguments to read after
    the tag, followed by one renderer per output format. A renderer is
    either a ``str.format`` template or a callable taking the argument
    list.
    """

    def __init__(self):
        """
        Install the default tag map.
        """
        self.map = self.default_map()

    def default_map(self):
        """
        Default map containing renderers and argument numbers.

        Each value is ``[argc, renderer, ...]``. ``render(0, ...)`` uses
        the renderer at index 1, ``render(1, ...)`` index 2, and so on;
        a missing column falls back to index 1. Judging by the helper
        names and the escape codes, the columns look like text, HTML and
        ANSI output -- confirm before relying on that.
        """

        def rt_link(data):
            # `data` is [url] or [url, title]; empty for a truncated lump.
            if not data:
                return ''
            if len(data) == 1:
                return data[0]
            return '{0} ({1})'.format(data[0], data[1])

        def rh_link(data):
            if not data:
                return ''
            if len(data) == 1:
                return '<a href="{0}">[link]</a>'.format(data[0])
            return '<a href="{0}">{1}</a>'.format(data[0], data[1])

        return {
            '&b\t': [0, '<b>', '<b>', '\x1b[1m'],
            '&/b\t': [0, '</b>', '</b>', '\x1b[22m'],
            '&i\t': [0, '<i>', '<i>', '\x1b[3m'],
            '&/i\t': [0, '</i>', '</i>', '\x1b[23m'],
            '&u\t': [0, '<u>', '<u>', '\x1b[4m'],
            '&/u\t': [0, '</u>', '</u>', '\x1b[24m'],
            '&s\t': [0, '<s>', '<s>', '\x1b[9m'],
            '&/s\t': [0, '</s>', '</s>', '\x1b[29m'],
            '&sup\t': [0, '<sup>'],
            '&/sup\t': [0, '</sup>'],
            '&sub\t': [0, '<sub>'],
            '&/sub\t': [0, '</sub>'],
            '&code\t': [0, '<code>'],
            '&/code\t': [0, '</code>'],
            '&p\t': [0, '<p>'],
            '&/p\t': [0, '</p>'],
            '&ul\t': [0, '<ul>'],
            '&/ul\t': [0, '</ul>'],
            '&ol\t': [0, '<ol>'],
            '&li\t': [0, '<li>'],
            '&/li\t': [0, '</li>'],
            '&/ol\t': [0, '</ol>'],
            # Up to three fields are read: url, optional title, and the
            # closing '&' marker (which `tokens` drops).
            '&link\t': [3, rt_link, rh_link],
            '&acro\t': [1, '<acronym title="{0}">'],
            '&/acro\t': [0, '</acronym>'],
            '&abbr\t': [1, '<abbr title="{0}">'],
            '&/abbr\t': [0, '</abbr>'],
            '&img\t': [3, '<img src="{0}" alt="{1}" title="{2}" />'],
            '&iframe\t': [3, '<iframe src="{0}" width="{1}" height="{2}" />'],
            '&/iframe\t': [0, '</iframe>'],
            '&a\t': [2, '<a href="{0}" title="{1}">'],
            '&/a\t': [0, '</a>'],
            '&br\t': [0, '<br/>'],
            '&bcode\t': [0, '<bcode>', '<span><pre><code>'],
            '&/bcode\t': [0, '</bcode>', '</code></pre></span>'],
            'EOF': [0, '', None, '\x1b[m']
        }

    def parse(self, data):
        """
        Parse a string that may possibly contain tablumps.

        Returns a ``TablumpString`` wrapping this parser, the raw input
        and the tokens produced by ``tokenise``.
        """
        return TablumpString(self, data, self.tokenise(data))

    def tokenise(self, data):
        """
        Split `data` into a list of tokens.

        Each token is a two item list: ``['raw', text]`` for literal text,
        or ``[tag, args]`` for a tag found in ``self.map`` together with
        the arguments cropped for it. A ``raw`` token (possibly empty)
        precedes every tag token and one closes the list.

        An ampersand that does not start a known tag is kept as literal
        text; scanning resumes at the following character so nothing is
        dropped and later tags are still recognised.
        """
        result = []
        working = data
        i = 0

        while True:
            i = working.find('&', i)
            ti = working.find('\t', i) if i != -1 else -1

            if ti == -1:
                # No ampersand left, or no tab after it: the rest is text.
                result.append(['raw', working])
                break

            tag = working[i:ti + 1]
            # A tag name never contains a space.
            crops = None if ' ' in tag else self.crop(tag, working[ti + 1:])

            if crops is None:
                i += 1
                continue

            result.append(['raw', working[:i]])
            result.append(crops[0])
            working = crops[1]
            i = 0

        return result

    def crop(self, tag, data):
        """
        Crop tablump data.

        Returns ``None`` when `tag` is not in ``self.map``. Otherwise
        returns ``[[tag, args], rest]`` where `args` are the arguments
        read from the front of `data` and `rest` is what follows them.
        """
        if tag not in self.map:
            return None

        argc = self.map[tag][0]

        if argc == 0:
            return [[tag, []], data]

        args, rest = self.tokens(data, argc)
        return [[tag, args], rest]

    def tokens(self, data, lim, sep=None, end=None):
        """
        Crop `lim` tokens from `data`.

        Tokens are separated by `sep` (default tab). Reading stops early
        when no separator is left or when a token equal to `end` (default
        ``'&'``) is met; that end marker is consumed but not returned.

        Returns ``[tokens, remaining_data]``.
        """
        sep = sep or '\t'
        end = end or '&'
        cropped = []

        for _ in range(lim):
            head, found, tail = data.partition(sep)

            if not found:
                break

            data = tail

            if head == end:
                break

            cropped.append(head)

        return [cropped, data]

    def render(self, format, tokens):
        """
        Render a set of tablump tokens as a string.

        `format` selects the renderer column (``format + 1`` in each map
        entry). When an entry has no such column, or the column holds
        ``None``, the renderer at index 1 is used. Tokens whose tag is not
        in ``self.map`` are skipped.

        Template renderers receive the token arguments positionally;
        arguments missing from a truncated lump are rendered as empty
        strings.
        """
        column = format + 1
        pieces = []

        for token in tokens:
            name, args = token[0], token[1]

            if name == 'raw':
                pieces.append(args)
                continue

            entry = self.map.get(name)
            if entry is None:
                continue

            try:
                renderer = entry[column]
            except IndexError:
                renderer = None

            if renderer is None:
                renderer = entry[1]

            if callable(renderer):
                pieces.append(renderer(args))
                continue

            missing = entry[0] - len(args)
            if missing > 0:
                args = list(args) + [''] * missing
            pieces.append(renderer.format(*args))

        return ''.join(pieces)


if __name__ == '__main__':
    # Manual comparison of the regex based and the string based parser on
    # one sample message. Every print call takes a single argument so the
    # script behaves the same on Python 2 and Python 3.
    pstr = StrTablumps()
    preg = ReTablumps()
    rtls = '&b\t&a\thttp://google.com\tfoo\tsomething&/a\t&/b\t&link\thttp://github.com\tgithub\t&\t'

    print('>> Using ' + rtls)
    print('>> Testing regex method...')
    sttsre = time.time()
    ptlsre = preg.parse(rtls)
    edtsre = time.time()
    diffre = edtsre - sttsre
    print('>> Result: ' + ptlsre)
    print('>> Start: {0}; End: {1}; Diff: {2}'.format(sttsre, edtsre, diffre))
    print('>> Testing string method...')
    sttsstr = time.time()
    tls = pstr.parse(rtls)
    ptlsstr = tls.text()
    edtsstr = time.time()
    diffstr = edtsstr - sttsstr
    print('>> Result: ' + ptlsstr)
    print('>> Start: {0}; End: {1}; Diff: {2}'.format(sttsstr, edtsstr, diffstr))
    print('>> string diff - reg diff: {0}'.format(diffstr - diffre))
	import re
	import time


	class ReTablumps(object):
	    """ dAmn tablumps parser based on regular expressions.

	        dAmn sends certain information formatted in a specific manner.
	        Links, images, thumbs, and other forms of data are formatted
	        in strings where the different attributes of these values are
	        separated by tab characters (``\\t``), and usually begin with an
	        ampersand.

	        We refer to these items as "tablumps" because of the tab
	        characters being used as delimiters. The job of this class is to
	        replace tablumps with readable strings (``parse``), or to extract
	        the data given in the tablumps (``capture``).
	    """

	    expressions = None
	    replace = None
	    titles = None
	    subs = None

	    def __init__(self):
	        """Populate the expressions and replaces used when parsing tablumps."""
	        # Nothing to do when the tables are already present, e.g. because a
	        # subclass defines them at class level.
	        if self.expressions is not None:
	            return
	        # Regular expression objects used to find any complicated tablumps.
	        # ``titles`` below names each entry, position by position.
	        # NOTE: in the emote pattern ``=-_`` is a character *range* (it also
	        # admits upper-case letters and a few symbols); kept unchanged so the
	        # set of accepted emote file names stays the same.
	        self.expressions = [
	            re.compile("&avatar\t([a-zA-Z0-9-]+)\t([0-9]+)\t"),
	            re.compile("&dev\t(.)\t([a-zA-Z0-9-]+)\t"),
	            re.compile("&emote\t([^\t]+)\t([0-9]+)\t([0-9]+)\t(.*?)\t([a-z0-9./=-_]+)\t"),
	            re.compile("&a\t([^\t]+)\t([^\t]*)\t"),
	            re.compile("&link\t([^\t]+)\t&\t"),
	            re.compile("&link\t([^\t]+)\t([^\t]+)\t&\t"),
	            # Non-greedy bodies so that two lumps in one message are captured
	            # separately instead of as one span.
	            re.compile("&acro\t([^\t]+)\t(.*?)&/acro\t"),
	            re.compile("&abbr\t([^\t]+)\t(.*?)&/abbr\t"),
	            re.compile("&thumb\t(?P<ID>[0-9]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t"),
	            # The optional fields need ``*``: alt/title/width/height may be
	            # empty or longer than one character.
	            re.compile("&img\t([^\t]+)\t([^\t]*)\t([^\t]*)\t"),
	            re.compile("&iframe\t([^\t]+)\t([0-9%]*)\t([0-9%]*)\t&/iframe\t"),
	        ]
	        self.titles = ('avatar', 'dev', 'emote', 'a', 'link', 'link', 'acronym', 'abbr', 'thumb', 'img', 'iframe')
	        # Regular expression objects used to find and replace complicated tablumps.
	        self.subs = [
	            (re.compile("&avatar\t([a-zA-Z0-9-]+)\t([0-9]+)\t"), ":icon\\1:"),
	            (re.compile("&dev\t(.)\t([a-zA-Z0-9-]+)\t"), ":dev\\2:"),
	            (re.compile("&emote\t([^\t]+)\t([0-9]+)\t([0-9]+)\t(.*?)\t([a-z0-9./=-_]+)\t"), "\\1"),
	            (re.compile("&a\t([^\t]+)\t([^\t]*)\t"), "<a href=\"\\1\" title=\"\\2\">"),
	            (re.compile("&link\t([^\t]+)\t&\t"), "\\1"),
	            (re.compile("&link\t([^\t]+)\t([^\t]+)\t&\t"), "\\1 (\\2)"),
	            (re.compile("&acro\t([^\t]+)\t"), "<acronym title=\"\\1\">"),
	            (re.compile("&abbr\t([^\t]+)\t"), "<abbr title=\"\\1\">"),
	            (re.compile("&thumb\t([0-9]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t"), ":thumb\\1:"),
	            (re.compile("&img\t([^\t]+)\t([^\t]*)\t([^\t]*)\t"), "<img src=\"\\1\" alt=\"\\2\" title=\"\\3\" />"),
	            (re.compile("&iframe\t([^\t]+)\t([0-9%]*)\t([0-9%]*)\t&/iframe\t"), "<iframe src=\"\\1\" width=\"\\2\" height=\"\\3\" />"),
	            # Runs last: drops an attribute that ended up empty in a tag
	            # produced by the substitutions above. The alternation uses
	            # plain ``|`` (an escaped pipe would match a literal bar).
	            (re.compile("<([^>]+) (width|height|title|alt)=\"\"([^>]*?)>"), "<\\1\\3>"),
	        ]
	        # Search and replace pairs used to parse simple HTML tags.
	        self.replace = [
	            ("&b\t", "<b>"),
	            ("&/b\t", "</b>"),
	            ("&i\t", "<i>"),
	            ("&/i\t", "</i>"),
	            ("&u\t", "<u>"),
	            ("&/u\t", "</u>"),
	            ("&s\t", "<s>"),
	            ("&/s\t", "</s>"),
	            ("&sup\t", "<sup>"),
	            ("&/sup\t", "</sup>"),
	            ("&sub\t", "<sub>"),
	            ("&/sub\t", "</sub>"),
	            ("&code\t", "<code>"),
	            ("&/code\t", "</code>"),
	            ("&p\t", "<p>"),
	            ("&/p\t", "</p>"),
	            ("&ul\t", "<ul>"),
	            ("&/ul\t", "</ul>"),
	            ("&ol\t", "<ol>"),
	            ("&/ol\t", "</ol>"),
	            ("&li\t", "<li>"),
	            ("&/li\t", "</li>"),
	            ("&bcode\t", "<bcode>"),
	            ("&/bcode\t", "</bcode>"),
	            ("&br\t", "\n"),
	            ("&/a\t", "</a>"),
	            ("&/acro\t", "</acronym>"),
	            ("&/abbr\t", "</abbr>"),
	        ]

	    def parse(self, data):
	        """ Parse any dAmn Tablumps found in our input data.

	            The plain ``replace`` pairs are applied first, then the regular
	            expression substitutions in ``subs``. The resulting string is
	            returned.

	            This is best effort: input that does not behave like a string
	            (``None``, numbers, ...) is returned unchanged rather than
	            raising.
	        """
	        try:
	            for lump, repl in self.replace:
	                data = data.replace(lump, repl)
	            for expression, repl in self.subs:
	                data = expression.sub(repl, data)
	        except (AttributeError, TypeError):
	            # Not a string-like value; hand back whatever we have.
	            pass
	        return data

	    def capture(self, text):
	        """ Return any dAmn Tablumps found in our input data.

	            Rather than parsing the tablumps, this method returns the
	            data given by tablumps. This only works for tablumps where
	            a regular expression is used for parsing.

	            The result maps a title from ``titles`` to the list of
	            ``findall`` matches for it. Titles without a match are left
	            out. Expressions that share a title (the two ``link`` forms)
	            contribute to the same list.
	        """
	        lumps = {}
	        for title, expression in zip(self.titles, self.expressions):
	            found = expression.findall(text)
	            if found:
	                # Extend instead of assign so a later expression with the
	                # same title does not discard earlier matches.
	                lumps.setdefault(title, []).extend(found)
	        return lumps


	class TablumpString(object):
	    """
	    An object representing a string containing tablumps.

	    Holds the raw string, the tokens parsed from it and the parser
	    able to render those tokens.
	    """

	    def __init__(self, parser, raw, tokens):
	        self.parser = parser
	        self.raw = raw
	        self.tokens = tokens
	        # Render caches. Only `_text` is filled in by this class.
	        self._html = None
	        self._ansi = None
	        self._text = None

	    def text(self):
	        """
	        Return the rendering produced by ``parser.render(0, tokens)``.

	        The result is computed on first use and cached afterwards.
	        """
	        if self._text is None:
	            self._text = self.parser.render(0, self.tokens)
	        return self._text



	class StrTablumps(object):
	    """
	    Parses tablumps using plain string operations.

	    ``self.map`` associates each tag (``&name\\t``) with a list whose
	    first item is the number of tab-separated arguments to read after
	    the tag, followed by one renderer per output format. A renderer is
	    either a ``str.format`` template or a callable taking the argument
	    list.
	    """

	    def __init__(self):
	        """
	        Install the default tag map.
	        """
	        self.map = self.default_map()

	    def default_map(self):
	        """
	        Default map containing renderers and argument numbers.

	        Each value is ``[argc, renderer, ...]``. ``render(0, ...)`` uses
	        the renderer at index 1, ``render(1, ...)`` index 2, and so on;
	        a missing column falls back to index 1. Judging by the helper
	        names and the escape codes, the columns look like text, HTML and
	        ANSI output -- confirm before relying on that.
	        """

	        def rt_link(data):
	            # `data` is [url] or [url, title]; empty for a truncated lump.
	            if not data:
	                return ''
	            if len(data) == 1:
	                return data[0]
	            return '{0} ({1})'.format(data[0], data[1])

	        def rh_link(data):
	            if not data:
	                return ''
	            if len(data) == 1:
	                return '<a href="{0}">[link]</a>'.format(data[0])
	            return '<a href="{0}">{1}</a>'.format(data[0], data[1])

	        return {
	            '&b\t': [0, '<b>', '<b>', '\x1b[1m'],
	            '&/b\t': [0, '</b>', '</b>', '\x1b[22m'],
	            '&i\t': [0, '<i>', '<i>', '\x1b[3m'],
	            '&/i\t': [0, '</i>', '</i>', '\x1b[23m'],
	            '&u\t': [0, '<u>', '<u>', '\x1b[4m'],
	            '&/u\t': [0, '</u>', '</u>', '\x1b[24m'],
	            '&s\t': [0, '<s>', '<s>', '\x1b[9m'],
	            '&/s\t': [0, '</s>', '</s>', '\x1b[29m'],
	            '&sup\t': [0, '<sup>'],
	            '&/sup\t': [0, '</sup>'],
	            '&sub\t': [0, '<sub>'],
	            '&/sub\t': [0, '</sub>'],
	            '&code\t': [0, '<code>'],
	            '&/code\t': [0, '</code>'],
	            '&p\t': [0, '<p>'],
	            '&/p\t': [0, '</p>'],
	            '&ul\t': [0, '<ul>'],
	            '&/ul\t': [0, '</ul>'],
	            '&ol\t': [0, '<ol>'],
	            '&li\t': [0, '<li>'],
	            '&/li\t': [0, '</li>'],
	            '&/ol\t': [0, '</ol>'],
	            # Up to three fields are read: url, optional title, and the
	            # closing '&' marker (which `tokens` drops).
	            '&link\t': [3, rt_link, rh_link],
	            '&acro\t': [1, '<acronym title="{0}">'],
	            '&/acro\t': [0, '</acronym>'],
	            '&abbr\t': [1, '<abbr title="{0}">'],
	            '&/abbr\t': [0, '</abbr>'],
	            '&img\t': [3, '<img src="{0}" alt="{1}" title="{2}" />'],
	            '&iframe\t': [3, '<iframe src="{0}" width="{1}" height="{2}" />'],
	            '&/iframe\t': [0, '</iframe>'],
	            '&a\t': [2, '<a href="{0}" title="{1}">'],
	            '&/a\t': [0, '</a>'],
	            '&br\t': [0, '<br/>'],
	            '&bcode\t': [0, '<bcode>', '<span><pre><code>'],
	            '&/bcode\t': [0, '</bcode>', '</code></pre></span>'],
	            'EOF': [0, '', None, '\x1b[m']
	        }

	    def parse(self, data):
	        """
	        Parse a string that may possibly contain tablumps.

	        Returns a ``TablumpString`` wrapping this parser, the raw input
	        and the tokens produced by ``tokenise``.
	        """
	        return TablumpString(self, data, self.tokenise(data))

	    def tokenise(self, data):
	        """
	        Split `data` into a list of tokens.

	        Each token is a two item list: ``['raw', text]`` for literal text,
	        or ``[tag, args]`` for a tag found in ``self.map`` together with
	        the arguments cropped for it. A ``raw`` token (possibly empty)
	        precedes every tag token and one closes the list.

	        An ampersand that does not start a known tag is kept as literal
	        text; scanning resumes at the following character so nothing is
	        dropped and later tags are still recognised.
	        """
	        result = []
	        working = data
	        i = 0

	        while True:
	            i = working.find('&', i)
	            ti = working.find('\t', i) if i != -1 else -1

	            if ti == -1:
	                # No ampersand left, or no tab after it: the rest is text.
	                result.append(['raw', working])
	                break

	            tag = working[i:ti + 1]
	            # A tag name never contains a space.
	            crops = None if ' ' in tag else self.crop(tag, working[ti + 1:])

	            if crops is None:
	                i += 1
	                continue

	            result.append(['raw', working[:i]])
	            result.append(crops[0])
	            working = crops[1]
	            i = 0

	        return result

	    def crop(self, tag, data):
	        """
	        Crop tablump data.

	        Returns ``None`` when `tag` is not in ``self.map``. Otherwise
	        returns ``[[tag, args], rest]`` where `args` are the arguments
	        read from the front of `data` and `rest` is what follows them.
	        """
	        if tag not in self.map:
	            return None

	        argc = self.map[tag][0]

	        if argc == 0:
	            return [[tag, []], data]

	        args, rest = self.tokens(data, argc)
	        return [[tag, args], rest]

	    def tokens(self, data, lim, sep=None, end=None):
	        """
	        Crop `lim` tokens from `data`.

	        Tokens are separated by `sep` (default tab). Reading stops early
	        when no separator is left or when a token equal to `end` (default
	        ``'&'``) is met; that end marker is consumed but not returned.

	        Returns ``[tokens, remaining_data]``.
	        """
	        sep = sep or '\t'
	        end = end or '&'
	        cropped = []

	        for _ in range(lim):
	            head, found, tail = data.partition(sep)

	            if not found:
	                break

	            data = tail

	            if head == end:
	                break

	            cropped.append(head)

	        return [cropped, data]

	    def render(self, format, tokens):
	        """
	        Render a set of tablump tokens as a string.

	        `format` selects the renderer column (``format + 1`` in each map
	        entry). When an entry has no such column, or the column holds
	        ``None``, the renderer at index 1 is used. Tokens whose tag is not
	        in ``self.map`` are skipped.

	        Template renderers receive the token arguments positionally;
	        arguments missing from a truncated lump are rendered as empty
	        strings.
	        """
	        column = format + 1
	        pieces = []

	        for token in tokens:
	            name, args = token[0], token[1]

	            if name == 'raw':
	                pieces.append(args)
	                continue

	            entry = self.map.get(name)
	            if entry is None:
	                continue

	            try:
	                renderer = entry[column]
	            except IndexError:
	                renderer = None

	            if renderer is None:
	                renderer = entry[1]

	            if callable(renderer):
	                pieces.append(renderer(args))
	                continue

	            missing = entry[0] - len(args)
	            if missing > 0:
	                args = list(args) + [''] * missing
	            pieces.append(renderer.format(*args))

	        return ''.join(pieces)


	if __name__ == '__main__':
	    # Manual comparison of the regex based and the string based parser on
	    # one sample message. Every print call takes a single argument so the
	    # script behaves the same on Python 2 and Python 3.
	    pstr = StrTablumps()
	    preg = ReTablumps()
	    rtls = '&b\t&a\thttp://google.com\tfoo\tsomething&/a\t&/b\t&link\thttp://github.com\tgithub\t&\t'

	    print('>> Using ' + rtls)
	    print('>> Testing regex method...')
	    sttsre = time.time()
	    ptlsre = preg.parse(rtls)
	    edtsre = time.time()
	    diffre = edtsre - sttsre
	    print('>> Result: ' + ptlsre)
	    print('>> Start: {0}; End: {1}; Diff: {2}'.format(sttsre, edtsre, diffre))
	    print('>> Testing string method...')
	    sttsstr = time.time()
	    tls = pstr.parse(rtls)
	    ptlsstr = tls.text()
	    edtsstr = time.time()
	    diffstr = edtsstr - sttsstr
	    print('>> Result: ' + ptlsstr)
	    print('>> Start: {0}; End: {1}; Diff: {2}'.format(sttsstr, edtsstr, diffstr))
	    print('>> string diff - reg diff: {0}'.format(diffstr - diffre))