Triavanicus/gfm.py

## gfm.py
import re
from hashlib import md5
from markdown import markdown

def _align_attr(align, index):
    """Return the HTML ``align`` attribute text for table column *index*.

    *align* is the list of cleaned separator-row cells (for example
    ``':--'``, ``':-:'`` or ``'--:'``).  A column that has no separator
    cell, or whose cell carries no colon marker, yields ``''`` so that
    ragged rows no longer raise IndexError.
    """
    if index >= len(align):
        return ''
    spec = align[index]
    # Order matters: ':-:' would also satisfy the "left" pattern.
    if re.match(r'(\:\-+\:)', spec):
        return ' align="center"'
    if re.match(r'\:\-+', spec):
        return ' align="left"'
    if re.match(r'\-+\:', spec):
        return ' align="right"'
    return ''


def _render_table(matchobj, piped):
    """Build the ``<table>`` markup for one matched pipe table.

    Group 1 of *matchobj* is the header row, group 2 the separator
    (alignment) row and group 3 the newline-separated body rows.

    When *piped* is true the rows are written with leading and trailing
    pipes: each body row has its ``<p>``/``</p>`` tags removed and its
    first and last character dropped before splitting, and a newline is
    emitted after ``<table>`` and after every ``</tr>``.
    """
    sep = '\n' if piped else ''
    headers = matchobj.group(1).split('|')
    align = [spec.replace('<br />', '').replace(' ', '')
             for spec in matchobj.group(2).split('|')]

    parts = ['<table>', sep, '<tr>']
    for index, header in enumerate(headers):
        header = header.replace('<p>', '').replace('</p>', '')
        parts.append('<th%s>%s</th>' % (_align_attr(align, index), header))
    parts.append('</tr>' + sep)

    for row in matchobj.group(3).split('\n'):
        if not row:
            continue
        if piped:
            # Drop paragraph tags first so that [1:-1] removes the outer
            # characters of the row itself (expected to be the pipes).
            row = row.replace('<p>', '').replace('</p>', '')[1:-1]
        parts.append('<tr>')
        for index, content in enumerate(row.split('|')):
            for tag in ('<p>', '</p>', '<br />'):
                content = content.replace(tag, '')
            parts.append('<td%s>%s</td>' % (_align_attr(align, index), content))
        parts.append('</tr>' + sep)

    parts.append('</table>')
    return ''.join(parts)


def gfm(value):
    """Apply GitHub-flavoured tweaks around a ``markdown()`` call.

    Processing order:

    1. ``<pre>...</pre>`` blocks are swapped for md5-keyed placeholders
       so the following steps cannot touch them.
    2. On lines that do not start with four spaces or a tab, text
       containing a word with two or more inner underscores gets every
       underscore backslash-escaped.
    3. A line that starts with a word character or ``<`` and is followed
       by exactly one newline is right-stripped and given two trailing
       spaces before the newline.
    4. The placeholders are replaced by the stored blocks, each
       prefixed with a blank line.
    5. The text is passed through ``markdown()``.
    6. The result is post-processed: bare http/https/git/ftp URLs become
       links, ``~~x~~`` becomes ``<del>``, ``^(x)`` becomes ``<sup>``,
       ``==x==`` becomes ``<mark>`` and pipe tables become ``<table>``.

    Returns the post-processed string.
    """
    # Extract pre blocks.
    extractions = {}

    def pre_extraction_callback(matchobj):
        block = matchobj.group(0)
        digest = md5(block.encode('utf-8')).hexdigest()
        extractions[digest] = block
        return "{gfm-extraction-%s}" % digest
    pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
    value = re.sub(pattern, pre_extraction_callback, value)

    # Prevent foo_bar_baz from ending up with an italic word in the middle.
    def italic_callback(matchobj):
        s = matchobj.group(0)
        if s.count('_') >= 2:
            # '\\_' is the two characters backslash + underscore; the
            # former '\_' spelling relied on an invalid escape sequence.
            return s.replace('_', '\\_')
        return s
    pattern = re.compile(r'^(?! {4}|\t).*\w+(?<!_)_\w+_\w[\w_]*', re.MULTILINE | re.UNICODE)
    value = re.sub(pattern, italic_callback, value)

    # In very clear cases, let newlines become <br /> tags.
    def newline_callback(matchobj):
        if len(matchobj.group(1)) == 1:
            return matchobj.group(0).rstrip() + '  \n'
        return matchobj.group(0)
    pattern = re.compile(r'^[\w\<][^\n]*(\n+)', re.MULTILINE | re.UNICODE)
    value = re.sub(pattern, newline_callback, value)

    # Insert pre block extractions.
    def pre_insert_callback(matchobj):
        block = extractions.get(matchobj.group(1))
        if block is None:
            # Placeholder-shaped text that was in the input itself:
            # leave it alone instead of raising KeyError.
            return matchobj.group(0)
        return '\n\n' + block
    value = re.sub(r'{gfm-extraction-([0-9a-f]{32})\}', pre_insert_callback, value)

    value = markdown(value)

    # All other gfm things

    # NOTE(review): the URL character class has no digits, '-', '_', '?',
    # '=' or ':', so such URLs are cut short -- confirm whether intended.
    def link_callback(matchobj):
        url = matchobj.group(1)
        return '<a href="%s" target="_blank">%s</a>' % (url, url)
    pattern = re.compile(r'((?<!")http://[a-zA-Z./&%#]*|(?<!")https://[a-zA-Z./&%#]*|(?<!")http://[a-zA-Z./&%#]*|(?<!")git://[a-zA-Z./&%#]*|(?<!")ftp://[a-zA-Z./&%#]*)',re.MULTILINE | re.UNICODE)
    value = re.sub(pattern, link_callback, value)

    # Inline wrappers, applied in this order.  The captures are greedy,
    # so two markers on one line become a single element.
    inline_rules = (
        (r'~~(.*)~~', 'del'),
        (r'\^\((.*)\)', 'sup'),
        (r'==(.*)==', 'mark'),
    )
    for regex, tag in inline_rules:
        pattern = re.compile(regex, re.MULTILINE | re.UNICODE)
        value = re.sub(pattern, r'<%s>\1</%s>' % (tag, tag), value)

    # Tables written without outer pipes (header line ends in <br />).
    def table_callback(matchobj):
        return _render_table(matchobj, False)
    pattern = re.compile(r'^ *(\S.*\|.*)<br />\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*', re.MULTILINE | re.UNICODE)
    value = re.sub(pattern, table_callback, value)

    # Tables written with outer pipes, with and without a leading <p>.
    def ptable_callback(matchobj):
        return _render_table(matchobj, True)
    pattern = re.compile(r'^<p>\| *(\S.*\|.*) *\| *\n *\| *(\S.*\|.*) *\| *\n *((?:.*\|.*(?:\n|$))*)\n*', re.MULTILINE | re.UNICODE)
    value = re.sub(pattern, ptable_callback, value)
    pattern = re.compile(r'^ *\| *(\S.*\|.*) *\| *\n *\| *(\S.*\|.*) *\| *\n *((?:.*\|.*(?:\n|$))*)\n*', re.MULTILINE | re.UNICODE)
    value = re.sub(pattern, ptable_callback, value)

    return value


# Test suite.
try:
    from nose.tools import assert_equal
except ImportError:
    def assert_equal(a, b):
        """Fallback used when nose is missing: fail unless ``a == b``.

        Raises AssertionError explicitly rather than through an
        ``assert`` statement, which ``python -O`` removes and which
        would let every test pass silently.
        """
        if not a == b:
            raise AssertionError('%r != %r' % (a, b))

def test_single_underscores():
    """A word with one inner underscore is expected to come back as-is."""
    # NOTE(review): gfm() passes the text through markdown() before
    # returning; confirm this expected literal still matches its output.
    source = 'foo_bar'
    rendered = gfm(source)
    assert_equal(rendered, 'foo_bar')

def test_underscores_code_blocks():
    """Underscores on a four-space indented line are expected to stay put."""
    # NOTE(review): gfm() passes the text through markdown() before
    # returning; confirm this expected literal still matches its output.
    indented = '    foo_bar_baz'
    rendered = gfm(indented)
    assert_equal(rendered, '    foo_bar_baz')

def test_underscores_pre_blocks():
    """Underscores inside a <pre> block are expected to stay unescaped."""
    # NOTE(review): gfm() passes the text through markdown() before
    # returning; confirm this expected literal still matches its output.
    pre_block = '<pre>\nfoo_bar_baz\n</pre>'
    rendered = gfm(pre_block)
    # The block is expected back with a blank line in front of it.
    assert_equal(rendered, '\n\n' + pre_block)

def test_pre_block_pre_text():
    """Text in front of a <pre> block must not change how it is handled."""
    pre_block = '<pre>\nthis is `a\\_test` and this\\_too\n</pre>'
    after_blank_line = gfm('\n\n' + pre_block)
    after_text = gfm('hmm' + pre_block)
    # Skip the differing prefixes (2 and 3 characters) before comparing.
    assert_equal(after_blank_line[2:], after_text[3:])

def test_two_underscores():
    """Words with two or more inner underscores are expected escaped."""
    # NOTE(review): gfm() passes the text through markdown() before
    # returning; confirm these expected literals still match its output.
    cases = (
        ('foo_bar_baz', 'foo\\_bar\\_baz'),
        ('something else then foo_bar_baz',
         'something else then foo\\_bar\\_baz'),
    )
    for text, expected in cases:
        assert_equal(gfm(text), expected)

def test_newlines_simple():
    """A single newline between two lines is expected to become a break."""
    # NOTE(review): gfm() passes the text through markdown() before
    # returning; confirm this expected literal still matches its output.
    rendered = gfm('foo\nbar')
    assert_equal(rendered, 'foo  \nbar')

def test_newlines_group():
    """Every group of consecutive lines is expected to get line breaks."""
    # NOTE(review): gfm() passes the text through markdown() before
    # returning; confirm this expected literal still matches its output.
    source = 'apple\npear\norange\n\nruby\npython\nerlang'
    expected = 'apple  \npear  \norange\n\nruby  \npython  \nerlang'
    assert_equal(gfm(source), expected)

def test_newlines_long_group():
    """Groups longer than three lines are expected to get breaks too."""
    # NOTE(review): gfm() passes the text through markdown() before
    # returning; confirm this expected literal still matches its output.
    source = 'apple\npear\norange\nbanana\n\nruby\npython\nerlang'
    expected = 'apple  \npear  \norange  \nbanana\n\nruby  \npython  \nerlang'
    assert_equal(gfm(source), expected)

def test_newlines_list():
    """Lines starting with '#' or '*' are expected to keep their newlines."""
    # NOTE(review): gfm() passes the text through markdown() before
    # returning; confirm these expected literals still match its output.
    for text in ('# foo\n# bar', '* foo\n* bar'):
        assert_equal(gfm(text), text)
	import re
	from hashlib import md5
	from markdown import markdown

	def _align_attr(align, index):
	    """Return the HTML ``align`` attribute text for table column *index*.

	    *align* is the list of cleaned separator-row cells (for example
	    ``':--'``, ``':-:'`` or ``'--:'``).  A column that has no separator
	    cell, or whose cell carries no colon marker, yields ``''`` so that
	    ragged rows do not raise IndexError.
	    """
	    if index >= len(align):
	        return ''
	    spec = align[index]
	    # Order matters: ':-:' would also satisfy the "left" pattern.
	    if re.match(r'(\:\-+\:)', spec):
	        return ' align="center"'
	    if re.match(r'\:\-+', spec):
	        return ' align="left"'
	    if re.match(r'\-+\:', spec):
	        return ' align="right"'
	    return ''


	def _render_table(matchobj, piped):
	    """Build the ``<table>`` markup for one matched pipe table.

	    Group 1 of *matchobj* is the header row, group 2 the separator
	    (alignment) row and group 3 the newline-separated body rows.

	    When *piped* is true the rows are written with leading and trailing
	    pipes: each body row has its ``<p>``/``</p>`` tags removed and its
	    first and last character dropped before splitting, and a newline is
	    emitted after ``<table>`` and after every ``</tr>``.
	    """
	    sep = '\n' if piped else ''
	    headers = matchobj.group(1).split('|')
	    align = [spec.replace('<br />', '').replace(' ', '')
	             for spec in matchobj.group(2).split('|')]

	    parts = ['<table>', sep, '<tr>']
	    for index, header in enumerate(headers):
	        header = header.replace('<p>', '').replace('</p>', '')
	        parts.append('<th%s>%s</th>' % (_align_attr(align, index), header))
	    parts.append('</tr>' + sep)

	    for row in matchobj.group(3).split('\n'):
	        if not row:
	            continue
	        if piped:
	            # Drop paragraph tags first so that [1:-1] removes the outer
	            # characters of the row itself (expected to be the pipes).
	            row = row.replace('<p>', '').replace('</p>', '')[1:-1]
	        parts.append('<tr>')
	        for index, content in enumerate(row.split('|')):
	            for tag in ('<p>', '</p>', '<br />'):
	                content = content.replace(tag, '')
	            parts.append('<td%s>%s</td>' % (_align_attr(align, index), content))
	        parts.append('</tr>' + sep)

	    parts.append('</table>')
	    return ''.join(parts)


	def gfm(value):
	    """Apply GitHub-flavoured tweaks around a ``markdown()`` call.

	    Processing order:

	    1. ``<pre>...</pre>`` blocks are swapped for md5-keyed placeholders
	       so the following steps cannot touch them.
	    2. On lines that do not start with four spaces or a tab, text
	       containing a word with two or more inner underscores gets every
	       underscore backslash-escaped.
	    3. A line that starts with a word character or ``<`` and is followed
	       by exactly one newline is right-stripped and given two trailing
	       spaces before the newline.
	    4. The placeholders are replaced by the stored blocks, each
	       prefixed with a blank line.
	    5. The text is passed through ``markdown()``.
	    6. The result is post-processed: bare http/https/git/ftp URLs become
	       links, ``~~x~~`` becomes ``<del>``, ``^(x)`` becomes ``<sup>``,
	       ``==x==`` becomes ``<mark>`` and pipe tables become ``<table>``.

	    Returns the post-processed string.
	    """
	    # Extract pre blocks.
	    extractions = {}

	    def pre_extraction_callback(matchobj):
	        block = matchobj.group(0)
	        digest = md5(block.encode('utf-8')).hexdigest()
	        extractions[digest] = block
	        return "{gfm-extraction-%s}" % digest
	    pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
	    value = re.sub(pattern, pre_extraction_callback, value)

	    # Prevent foo_bar_baz from ending up with an italic word in the middle.
	    def italic_callback(matchobj):
	        s = matchobj.group(0)
	        if s.count('_') >= 2:
	            # '\\_' is the two characters backslash + underscore.
	            return s.replace('_', '\\_')
	        return s
	    pattern = re.compile(r'^(?! {4}|\t).*\w+(?<!_)_\w+_\w[\w_]*', re.MULTILINE | re.UNICODE)
	    value = re.sub(pattern, italic_callback, value)

	    # In very clear cases, let newlines become <br /> tags.
	    def newline_callback(matchobj):
	        if len(matchobj.group(1)) == 1:
	            return matchobj.group(0).rstrip() + '  \n'
	        return matchobj.group(0)
	    pattern = re.compile(r'^[\w\<][^\n]*(\n+)', re.MULTILINE | re.UNICODE)
	    value = re.sub(pattern, newline_callback, value)

	    # Insert pre block extractions.
	    def pre_insert_callback(matchobj):
	        block = extractions.get(matchobj.group(1))
	        if block is None:
	            # Placeholder-shaped text that was in the input itself:
	            # leave it alone instead of raising KeyError.
	            return matchobj.group(0)
	        return '\n\n' + block
	    value = re.sub(r'{gfm-extraction-([0-9a-f]{32})\}', pre_insert_callback, value)

	    value = markdown(value)

	    # All other gfm things

	    # NOTE(review): the URL character class has no digits, '-', '_', '?',
	    # '=' or ':', so such URLs are cut short -- confirm whether intended.
	    def link_callback(matchobj):
	        url = matchobj.group(1)
	        return '<a href="%s" target="_blank">%s</a>' % (url, url)
	    pattern = re.compile(r'((?<!")http://[a-zA-Z./&%#]*|(?<!")https://[a-zA-Z./&%#]*|(?<!")http://[a-zA-Z./&%#]*|(?<!")git://[a-zA-Z./&%#]*|(?<!")ftp://[a-zA-Z./&%#]*)',re.MULTILINE | re.UNICODE)
	    value = re.sub(pattern, link_callback, value)

	    # Inline wrappers, applied in this order.  The captures are greedy,
	    # so two markers on one line become a single element.
	    inline_rules = (
	        (r'~~(.*)~~', 'del'),
	        (r'\^\((.*)\)', 'sup'),
	        (r'==(.*)==', 'mark'),
	    )
	    for regex, tag in inline_rules:
	        pattern = re.compile(regex, re.MULTILINE | re.UNICODE)
	        value = re.sub(pattern, r'<%s>\1</%s>' % (tag, tag), value)

	    # Tables written without outer pipes (header line ends in <br />).
	    def table_callback(matchobj):
	        return _render_table(matchobj, False)
	    pattern = re.compile(r'^ *(\S.*\|.*)<br />\n *([-:]+ *\|[-| :]*)\n((?:.*\|.*(?:\n|$))*)\n*', re.MULTILINE | re.UNICODE)
	    value = re.sub(pattern, table_callback, value)

	    # Tables written with outer pipes, with and without a leading <p>.
	    def ptable_callback(matchobj):
	        return _render_table(matchobj, True)
	    pattern = re.compile(r'^<p>\| *(\S.*\|.*) *\| *\n *\| *(\S.*\|.*) *\| *\n *((?:.*\|.*(?:\n|$))*)\n*', re.MULTILINE | re.UNICODE)
	    value = re.sub(pattern, ptable_callback, value)
	    pattern = re.compile(r'^ *\| *(\S.*\|.*) *\| *\n *\| *(\S.*\|.*) *\| *\n *((?:.*\|.*(?:\n|$))*)\n*', re.MULTILINE | re.UNICODE)
	    value = re.sub(pattern, ptable_callback, value)

	    return value


	# Test suite.
	try:
	    from nose.tools import assert_equal
	except ImportError:
	    def assert_equal(a, b):
	        """Fallback used when nose is missing: fail unless ``a == b``.

	        Raises AssertionError explicitly rather than through an
	        ``assert`` statement, which ``python -O`` removes and which
	        would let every test pass silently.
	        """
	        if not a == b:
	            raise AssertionError('%r != %r' % (a, b))

	def test_single_underscores():
	    """A word with one inner underscore is expected to come back as-is."""
	    # NOTE(review): gfm() passes the text through markdown() before
	    # returning; confirm this expected literal still matches its output.
	    assert_equal(
	        gfm('foo_bar'),
	        'foo_bar',
	    )

	def test_underscores_code_blocks():
	    """Underscores on a four-space indented line are expected to stay put."""
	    # The input needs four leading spaces to count as a code line; a
	    # single leading space would not exercise that case.
	    # NOTE(review): gfm() passes the text through markdown() before
	    # returning; confirm this expected literal still matches its output.
	    assert_equal(
	        gfm('    foo_bar_baz'),
	        '    foo_bar_baz',
	    )

	def test_underscores_pre_blocks():
	    """Underscores inside a <pre> block are expected to stay unescaped."""
	    # NOTE(review): gfm() passes the text through markdown() before
	    # returning; confirm this expected literal still matches its output.
	    assert_equal(
	        gfm('<pre>\nfoo_bar_baz\n</pre>'),
	        '\n\n<pre>\nfoo_bar_baz\n</pre>',
	    )

	def test_pre_block_pre_text():
	    """Text in front of a <pre> block must not change how it is handled."""
	    a = '\n\n<pre>\nthis is `a\\_test` and this\\_too\n</pre>'
	    b = 'hmm<pre>\nthis is `a\\_test` and this\\_too\n</pre>'
	    # Skip the differing prefixes (2 and 3 characters) before comparing.
	    assert_equal(
	        gfm(a)[2:],
	        gfm(b)[3:],
	    )

	def test_two_underscores():
	    """Words with two or more inner underscores are expected escaped."""
	    # NOTE(review): gfm() passes the text through markdown() before
	    # returning; confirm these expected literals still match its output.
	    assert_equal(
	        gfm('foo_bar_baz'),
	        'foo\\_bar\\_baz',
	    )
	    assert_equal(
	        gfm('something else then foo_bar_baz'),
	        'something else then foo\\_bar\\_baz',
	    )

	def test_newlines_simple():
	    """A single newline between two lines is expected to become a break."""
	    # The line-break marker is two trailing spaces before the newline.
	    # NOTE(review): gfm() passes the text through markdown() before
	    # returning; confirm this expected literal still matches its output.
	    assert_equal(
	        gfm('foo\nbar'),
	        'foo  \nbar',
	    )

	def test_newlines_group():
	    """Every group of consecutive lines is expected to get line breaks."""
	    # The line-break marker is two trailing spaces before the newline.
	    # NOTE(review): gfm() passes the text through markdown() before
	    # returning; confirm this expected literal still matches its output.
	    assert_equal(
	        gfm('apple\npear\norange\n\nruby\npython\nerlang'),
	        'apple  \npear  \norange\n\nruby  \npython  \nerlang',
	    )

	def test_newlines_long_group():
	    """Groups longer than three lines are expected to get breaks too."""
	    # The line-break marker is two trailing spaces before the newline.
	    # NOTE(review): gfm() passes the text through markdown() before
	    # returning; confirm this expected literal still matches its output.
	    assert_equal(
	        gfm('apple\npear\norange\nbanana\n\nruby\npython\nerlang'),
	        'apple  \npear  \norange  \nbanana\n\nruby  \npython  \nerlang',
	    )

	def test_newlines_list():
	    """Lines starting with '#' or '*' are expected to keep their newlines."""
	    # NOTE(review): gfm() passes the text through markdown() before
	    # returning; confirm these expected literals still match its output.
	    assert_equal(
	        gfm('# foo\n# bar'),
	        '# foo\n# bar',
	    )
	    assert_equal(
	        gfm('* foo\n* bar'),
	        '* foo\n* bar',
	    )