public
Last active — forked from gasman/gfm.py

  • Download Gist
gfm.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
"""GitHub flavoured markdown: because normal markdown has some vicious
gotchas.
 
Further reading on the gotchas:
http://blog.stackoverflow.com/2009/10/markdown-one-year-later/
 
This is a Python port of GitHub code, taken from
https://gist.github.com/901706
 
To run the tests, install nose ($ easy_install nose) then:
 
$ nosetests libs/gfm.py
 
"""
 
import re
 
 
# <pre>...</pre> blocks, possibly spanning several lines.
_PRE_BLOCK_RE = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)

# Backtick-delimited inline code, possibly spanning several lines.
_INLINE_CODE_RE = re.compile(r'`.*?`', re.DOTALL)

# A line that is not indented as a code block (four spaces or a tab), matched
# from its start up to the end of a word holding two or more interior
# underscores, e.g. foo_bar_baz.
_MULTI_UNDERSCORE_RE = re.compile(
    r'^(?! {4}|\t).*\w+(?<!_)_\w+_\w[\w_]*', re.MULTILINE | re.UNICODE)

# A bare http(s) URL delimited by whitespace or line boundaries.  The trailing
# delimiter is a lookahead so that it is still available as the leading
# delimiter of a URL that follows immediately after a single space.
_NAKED_URL_RE = re.compile(r"""
    (^|\s)                            # start of line or whitespace before it
    (https?://[:/.?=&;a-zA-Z0-9_-]+)  # the URL itself, http or https only
    (?=\s|$)                          # followed by whitespace or end of line
    """, re.VERBOSE | re.MULTILINE | re.UNICODE)

# A line starting with a word character or '<', plus the newline run after it.
_LINE_AND_NEWLINES_RE = re.compile(
    r'^[\w\<][^\n]*(\n+)', re.MULTILINE | re.UNICODE)


def _stash_blocks(pattern, markdown_source, placeholder):
    """Replace each match of *pattern* with *placeholder* in a single pass.

    Returns ``(new_source, matches)`` where *matches* lists the replaced
    texts in order of appearance.
    """
    stashed = []

    def stash(matchobj):
        stashed.append(matchobj.group(0))
        # Returned from a callback, so the placeholder is inserted literally
        # (no backslash-escape processing).
        return placeholder

    return (pattern.sub(stash, markdown_source), stashed)


def _unused_token(text, stem):
    """Return a ``{stem}``-style token that does not occur in *text*.

    The token starts with ``{`` and contains no underscores, whitespace or
    ``<`` so that none of the gfm() transformations can match or alter it.
    """
    token = '{%s}' % stem
    while token in text:
        stem += '-'
        token = '{%s}' % stem
    return token


def _restore_blocks(text, token, blocks):
    """Put *blocks* back, in order, at successive occurrences of *token*.

    The substitution is a single pass, so text that has just been restored
    is never rescanned for further tokens.  A token with no block left to
    fill it is kept unchanged.
    """
    pending = iter(blocks)
    return re.sub(re.escape(token),
                  lambda matchobj: next(pending, matchobj.group(0)),
                  text)


def remove_pre_blocks(markdown_source, placeholder='{placeholder}'):
    """Replace ``<pre>`` blocks with placeholders.

    This keeps the other transformations from mucking up the contents of
    the blocks.

    :param markdown_source: the markdown text to process.
    :param placeholder: text substituted for every block removed.
    :returns: ``(markdown_source, original_blocks)`` -- the text with each
        block replaced by *placeholder*, and the removed blocks in order of
        appearance.
    """
    return _stash_blocks(_PRE_BLOCK_RE, markdown_source, placeholder)


def remove_inline_code_blocks(markdown_source, placeholder='{placeholder}'):
    """Replace backtick-delimited inline code with placeholders.

    :param markdown_source: the markdown text to process.
    :param placeholder: text substituted for every code span removed.
    :returns: ``(markdown_source, original_blocks)`` -- the text with each
        code span replaced by *placeholder*, and the removed spans in order
        of appearance.
    """
    return _stash_blocks(_INLINE_CODE_RE, markdown_source, placeholder)


def _escape_underscores(matchobj):
    """Backslash-escape the underscores of a matched line prefix."""
    matched = matchobj.group(0)
    # don't mess with URLs:
    if 'http:' in matched or 'https:' in matched:
        return matched
    return matched.replace('_', '\\_')


def _mark_line_break(matchobj):
    """Append a trailing space to a line that is followed by one newline."""
    if len(matchobj.group(1)) != 1:
        # Two or more newlines are a paragraph break; leave them alone.
        return matchobj.group(0)
    # NOTE(review): Markdown's hard-break syntax is two trailing spaces; the
    # tests in this file expect a single space -- confirm which is intended.
    return matchobj.group(0).rstrip() + ' \n'


def gfm(text):
    """Apply GitHub-flavoured tweaks to markdown source.

    Outside of ``<pre>`` blocks and inline code, which are passed through
    untouched, this:

    * backslash-escapes the underscores on lines holding a word with two or
      more interior underscores (so foo_bar_baz gets no italic word in the
      middle), skipping code-indented lines and lines containing a URL;
    * wraps naked http/https URLs as ``[url](url)``;
    * adds a trailing space to lines that start with a word character or
      ``<`` and are followed by exactly one newline.

    :param text: markdown source.
    :returns: the transformed markdown source.
    """
    # <pre> blocks and inline code get *different* tokens, neither of which
    # occurs in the text, so every block is restored into its own slot even
    # when the two kinds are interleaved or the text mentions a token.
    pre_token = _unused_token(text, 'pre-placeholder')
    text, code_blocks = remove_pre_blocks(text, pre_token)
    inline_token = _unused_token(text, 'code-placeholder')
    text, inline_blocks = remove_inline_code_blocks(text, inline_token)

    # Prevent foo_bar_baz from ending up with an italic word in the middle.
    text = _MULTI_UNDERSCORE_RE.sub(_escape_underscores, text)

    # wrap naked URLs in brackets: http://foo -> [http://foo](http://foo)
    text = _NAKED_URL_RE.sub(r'\1[\2](\2)', text)

    # In very clear cases, let newlines become <br /> tags.
    text = _LINE_AND_NEWLINES_RE.sub(_mark_line_break, text)

    # Restore in reverse order of removal: an inline code span may itself
    # contain tokens standing in for <pre> blocks.
    text = _restore_blocks(text, inline_token, inline_blocks)
    return _restore_blocks(text, pre_token, code_blocks)
 
# Test suite.
try:
    from nose.tools import assert_equal
except ImportError:
    def assert_equal(a, b, msg=None):
        """Raise ``AssertionError`` unless ``a == b``.

        Fallback used when nose is not installed.  The error is raised
        explicitly rather than with an ``assert`` statement, so the check
        still runs when Python is started with ``-O``.

        :param a: first value.
        :param b: second value.
        :param msg: optional message used instead of the default
            ``'<a> != <b>'`` text.
        """
        if not a == b:
            if msg is None:
                msg = '%r != %r' % (a, b)
            raise AssertionError(msg)
 
def test_single_underscores():
    """A word with only one underscore must pass through unchanged."""
    source = 'foo_bar'
    assert_equal(gfm(source), 'foo_bar')
 
def test_underscores_code_blocks():
    """Don't touch underscores in indented code blocks."""
    # A line only counts as a code block when it is indented by four spaces
    # (or a tab); that is the indentation gfm() exempts from escaping.
    assert_equal(
        gfm('    foo_bar_baz'),
        '    foo_bar_baz',
    )
 
def test_underscores_inline_code_blocks():
    """Underscores inside backtick-delimited inline code stay as they are."""
    source = 'foo `foo_bar_baz`'
    assert_equal(gfm(source), 'foo `foo_bar_baz`')
 
def test_underscores_pre_blocks():
    """Underscores inside a <pre> block stay as they are."""
    source = '<pre>\nfoo_bar_baz\n</pre>'
    assert_equal(gfm(source), '<pre>\nfoo_bar_baz\n</pre>')
 
def test_pre_block_pre_text():
    """A <pre> block is rendered the same whatever text comes before it."""
    pre_block = '<pre>\nthis is `a\\_test` and this\\_too\n</pre>'
    blank_lead = '\n\n'
    text_lead = 'hmm'

    # Strip each lead back off the output and compare what is left.
    after_blank = gfm(blank_lead + pre_block)[len(blank_lead):]
    after_text = gfm(text_lead + pre_block)[len(text_lead):]
    assert_equal(after_blank, after_text)
 
def test_two_underscores():
    """Words holding two or more underscores get them backslash-escaped."""
    cases = [
        ('foo_bar_baz', 'foo\\_bar\\_baz'),
        ('something else then foo_bar_baz',
         'something else then foo\\_bar\\_baz'),
    ]
    for source, expected in cases:
        assert_equal(gfm(source), expected)
 
def test_newlines_simple():
    """A lone newline between two text lines is marked as a line break."""
    source = 'foo\nbar'
    expected = 'foo \nbar'
    assert_equal(gfm(source), expected)
 
def test_newlines_group():
    """Line breaks are marked in every paragraph, not just the first."""
    source = 'apple\npear\norange\n\nruby\npython\nerlang'
    expected = 'apple \npear \norange\n\nruby \npython \nerlang'
    assert_equal(gfm(source), expected)
 
def test_newlines_long_group():
    """Line breaks are marked throughout longer paragraphs too."""
    source = 'apple\npear\norange\nbanana\n\nruby\npython\nerlang'
    expected = 'apple \npear \norange \nbanana\n\nruby \npython \nerlang'
    assert_equal(gfm(source), expected)
 
def test_newlines_list():
    """Newlines between list items are left untouched."""
    for source in ('# foo\n# bar', '* foo\n* bar'):
        # Output must be identical to the input.
        assert_equal(gfm(source), source)
 
def test_underscores_urls():
    """Underscores that are part of a link URL are not escaped."""
    source = '[foo](http://example.com/a_b_c)'
    assert_equal(gfm(source), '[foo](http://example.com/a_b_c)')
 
def test_underscores_in_html():
    """Underscores in a URL inside an HTML tag are not escaped."""
    source = '<img src="http://example.com/a_b_c" />'
    assert_equal(gfm(source), '<img src="http://example.com/a_b_c" />')
def test_linkify_naked_urls():
    """A naked URL is rewritten as [url](url) so it becomes clickable."""
    source = " http://www.example.com:80/foo?bar=bar&biz=biz"
    expected = " [http://www.example.com:80/foo?bar=bar&biz=biz](http://www.example.com:80/foo?bar=bar&biz=biz)"
    assert_equal(gfm(source), expected)

This has problems with inline code:

`foo_bar_baz` -> `foo\_bar\_baz`

I've forked and fixed with a test.

Inline code should be processed like <pre> blocks. Given the input
"a little something `foo_bar_baz`"
your version still produces wrong results.

You're right. I've fixed this.

If anyone is still following along: compared with GitHub's markdown, this is missing URL autolinking. I've updated it.

@Wilfred I'm not sure this is available on PyPI, so I've added it as a module to my coaster library for Flask: https://github.com/hasgeek/coaster/blob/master/coaster/gfm.py (The license is ambiguous, unfortunately, because several people have added patches along the way.)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.