Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
import re
from hashlib import md5
def gfm(text):
# Extract pre blocks.
extractions = {}
def pre_extraction_callback(matchobj):
digest = md5(matchobj.group(0)).hexdigest()
extractions[digest] = matchobj.group(0)
return "{gfm-extraction-%s}" % digest
pattern = re.compile(r'<pre>.*?</pre>', re.MULTILINE | re.DOTALL)
text = re.sub(pattern, pre_extraction_callback, text)
# Prevent foo_bar_baz from ending up with an italic word in the middle.
def italic_callback(matchobj):
s = matchobj.group(0)
if list(s).count('_') >= 2:
return s.replace('_', '\_')
return s
pattern = re.compile(r'^(?! {4}|\t)\w+(?<!_)_\w+_\w[\w_]*', re.MULTILINE | re.UNICODE)
text = re.sub(pattern, italic_callback, text)
# In very clear cases, let newlines become <br /> tags.
def newline_callback(matchobj):
if len(matchobj.group(1)) == 1:
return matchobj.group(0).rstrip() + ' \n'
else:
return matchobj.group(0)
pattern = re.compile(r'^[\w\<][^\n]*(\n+)', re.MULTILINE | re.UNICODE)
text = re.sub(pattern, newline_callback, text)
# Insert pre block extractions.
def pre_insert_callback(matchobj):
return '\n\n' + extractions[matchobj.group(1)]
text = re.sub(r'{gfm-extraction-([0-9a-f]{32})\}', pre_insert_callback, text)
return text
# Test suite.
try:
from nose.tools import assert_equal
except ImportError:
def assert_equal(a, b):
assert a == b, '%r != %r' % (a, b)
def test_single_underscores():
"""Don't touch single underscores inside words."""
assert_equal(
gfm('foo_bar'),
'foo_bar',
)
def test_underscores_code_blocks():
"""Don't touch underscores in code blocks."""
assert_equal(
gfm(' foo_bar_baz'),
' foo_bar_baz',
)
def test_underscores_pre_blocks():
"""Don't touch underscores in pre blocks."""
assert_equal(
gfm('<pre>\nfoo_bar_baz\n</pre>'),
'\n\n<pre>\nfoo_bar_baz\n</pre>',
)
def test_pre_block_pre_text():
"""Don't treat pre blocks with pre-text differently."""
a = '\n\n<pre>\nthis is `a\\_test` and this\\_too\n</pre>'
b = 'hmm<pre>\nthis is `a\\_test` and this\\_too\n</pre>'
assert_equal(
gfm(a)[2:],
gfm(b)[3:],
)
def test_two_underscores():
"""Escape two or more underscores inside words."""
assert_equal(
gfm('foo_bar_baz'),
'foo\\_bar\\_baz',
)
def test_newlines_simple():
"""Turn newlines into br tags in simple cases."""
assert_equal(
gfm('foo\nbar'),
'foo \nbar',
)
def test_newlines_group():
"""Convert newlines in all groups."""
assert_equal(
gfm('apple\npear\norange\n\nruby\npython\nerlang'),
'apple \npear \norange\n\nruby \npython \nerlang',
)
def test_newlines_long_group():
"""Convert newlines in even long groups."""
assert_equal(
gfm('apple\npear\norange\nbanana\n\nruby\npython\nerlang'),
'apple \npear \norange \nbanana\n\nruby \npython \nerlang',
)
def test_newlines_list():
"""Don't convert newlines in lists."""
assert_equal(
gfm('# foo\n# bar'),
'# foo\n# bar',
)
assert_equal(
gfm('* foo\n* bar'),
'* foo\n* bar',
)

Great improvement on the way that regexes are handled with unicode! What are the cases that required these changes? Can you provide unittests illustrating the problems?

Owner

mvasilkov commented Nov 23, 2010

Yup, I'll add a test or two. The basic case is with any e.g. Russian word: u'раз_джва_три'. The underscores won't get escaped unless re.UNICODE is there.

Also changed the regex pattern a bit (added (?<!_) negative lookbehind), so the usual __bold__ syntax works as expected. Not the best possible way to do this, but probably the shortest one.

gasman commented Mar 6, 2011

There seems to be a bug in the regex for dealing with underscores - it's only matching the first word in the line. See my fork for a fix.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment