Last active
August 29, 2015 14:02
-
-
Save riccardomurri/a79e1ce21ad117a39813 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import re | |
from markdown2 import markdown | |
# Patterns handed to markdown2's ``link-patterns`` extra.  Each entry is a
# ``(compiled_regex, replacement_template)`` pair: text matching the regex is
# turned into a link whose target is the template expanded against the match.
# Both URL regexes are written in verbose mode (``(?x)``), so whitespace in
# them is insignificant and ``(?# ... )`` groups are inline comments.
link_patterns = [
    # issueNNN --> /help/issueNNN
    (re.compile(r'\b(issue)\s*(\d+)\b'), r'/help/\g<1>\g<2>'),

    # web URL
    (re.compile(r'''(?xi)
      (?<!['"])    (?# not preceded by a quotes, avoid recursive processing of href="..." )
      \b
      (                                         (?# Capture group 1: entire matched URL)
        (?:ftp|https?)://                       (?# URL schema)
        (?:                                     (?# optional userinfo part: )
          [a-z0-9._~\-%!$&'()*+,;=:]+           (?# see RFC 3986, sec. 3.2.1 )
          @                                     (?# literal '@' mark )
        )?
        (?:                                     (?# host, either one of the following: )
          [a-z0-9][a-z0-9.-]+\.(?:[a-z][a-z]+)  (?# looks like an FQDN )
        |
          [0-9]+(?:\.[0-9]+){3}                 (?# IPv4 dotted-quad )
        |
          \[ [0-9a-f]+(?::[0-9a-f]+){7} \]      (?# IPv6 full address literal )
        |
          \[ (?:[0-9a-f]+(?: :[0-9a-f]+ )* )? :: (?:[0-9a-f]+(?: :[0-9a-f]+)* ) \]
                                                (?# looks like an IPv6 abbreviated address literal )
                                                (?# inner groups are non-capturing so that group 1 stays the only capture )
        )
        (?: :[0-9]+ )?                          (?# optional port number )
        (?:                                     (?# optional path: )
          /                                     (?# literal slash )
          [a-z0-9._~\-%!$&'()*+,;=:@]*          (?# see RFC 3986, sec. 2.2, 2.3, and 3.3 )
        )*                                      (?# repeat zero or more times )
        (?:                                     (?# optional query part: )
          [?]                                   (?# literal question mark )
          [a-z0-9._~\-%!$&'()*+,;=:@?/]*        (?# see RFC 3986, sec. 2.2, 2.3, and 3.4 )
        )?                                      (?# at most once )
        (?:                                     (?# optional fragment part: )
          [#]                                   (?# literal hash mark )
          [a-z0-9._~\-%!$&'()*+,;=:@?/]*        (?# see RFC 3986, sec. 2.2, 2.3, and 3.5 )
        )?                                      (?# at most once )
      )
    '''), r'\g<1>'),

    # "naked" domain, assume `http://` URL
    (re.compile(r'''(?xi)
      (?<!/)      (?# not preceded by a /, avoid matching http://example.com )
      (?<!\.)     (?# not preceded by a ., avoid matching just `com` in example.com )
      (?<!@)      (?# not preceded by a @, avoid matching foo@_gmail.com_ )
      \b
      (                                         (?# capture group 1: domain name)
        [a-z0-9][a-z0-9.-]+\.(?:[a-z][a-z]+)    (?# looks like a domain name )
        /?                                      (?# optional slash )
        (?!@)                                   (?# not followed by a @, avoid matching "foo.na" in "foo.na@example.com" )
      )
      \b
    '''), r'http://\g<1>'),
]
def _to_markdown(text):
    """Render Markdown source `text` to HTML.

    The conversion is done by ``markdown2.markdown`` with the ``footnotes``,
    ``fenced-code-blocks`` and ``link-patterns`` extras enabled; the
    module-level ``link_patterns`` list supplies the auto-linking rules.

    Returns the generated HTML encoded as UTF-8.
    """
    html = markdown(
        text,
        # each extra needs to be named only once
        extras=['footnotes', 'fenced-code-blocks', 'link-patterns'],
        link_patterns=link_patterns)
    # Ensure the code is converted to utf-8
    return html.encode('utf-8')
def test_link_patterns():
    """Check that `_to_markdown` renders each sample to the expected HTML.

    Every case is a ``(markdown_source, expected_html)`` pair; the rendered
    output is compared after stripping surrounding whitespace.  Raises
    ``AssertionError`` describing the first mismatch.
    """
    cases = (
        ("issue1", '<p><a href="/help/issue1">issue1</a></p>'),
        ("issue 2", '<p><a href="/help/issue2">issue 2</a></p>'),
        ("nonlink1", '<p>nonlink1</p>'),
        ("http://www.google.com",
         '<p><a href="http://www.google.com">http://www.google.com</a></p>'),
        ("http://www.google.com/",
         '<p><a href="http://www.google.com/">http://www.google.com/</a></p>'),
        ("http://www.google.com:80/",
         '<p><a href="http://www.google.com:80/">http://www.google.com:80/</a></p>'),
        ("http://www.google.com:80",
         '<p><a href="http://www.google.com:80">http://www.google.com:80</a></p>'),
        ("http://www.google.com/search",
         '<p><a href="http://www.google.com/search">http://www.google.com/search</a></p>'),
        ("http://www.google.com/search?q=regexp",
         '<p><a href="http://www.google.com/search?q=regexp">http://www.google.com/search?q=regexp</a></p>'),
        ("[google](https://google.com)",
         '<p><a href="https://google.com">google</a></p>'),
        ("""
[Google][1]
[1]: http://www.google.com
""",
         '<p><a href="http://www.google.com">Google</a></p>'),
        ("This is text with http://www.example.org/foo/bar/something link",
         '<p>This is text with <a href="http://www.example.org/foo/bar/something">http://www.example.org/foo/bar/something</a> link</p>'),
        ("Dotted-quad addresses are also supported, as in http://192.0.2.5/",
         '<p>Dotted-quad addresses are also supported, as in <a href="http://192.0.2.5/">http://192.0.2.5/</a></p>'),
        ("IPv6 full address literals, as in http://[2001:DB8:0:0:0:0:0:1]",
         '<p>IPv6 full address literals, as in <a href="http://[2001:DB8:0:0:0:0:0:1]">http://[2001:DB8:0:0:0:0:0:1]</a></p>'),
        ("IPv6 abbreviated address literals, as in http://[::1]",
         '<p>IPv6 abbreviated address literals, as in <a href="http://[::1]">http://[::1]</a></p>'),
        ("IPv6 abbreviated address literals w/ port number, as in http://[::1]:8080",
         '<p>IPv6 abbreviated address literals w/ port number, as in <a href="http://[::1]:8080">http://[::1]:8080</a></p>'),
        ("FTP URLs, as in ftp://ftp.example.org/",
         '<p>FTP URLs, as in <a href="ftp://ftp.example.org/">ftp://ftp.example.org/</a></p>'),
        ("FTP URLs with explicit username, as in ftp://anonymous@ftp.example.org",
         '<p>FTP URLs with explicit username, as in <a href="ftp://anonymous@ftp.example.org">ftp://anonymous@ftp.example.org</a></p>'),
        ("FTP URLs with username and password (INSECURE!), as in ftp://anonymous:guesswho@ftp.example.org",
         '<p>FTP URLs with username and password (INSECURE!), as in <a href="ftp://anonymous:guesswho@ftp.example.org">ftp://anonymous:guesswho@ftp.example.org</a></p>'),
    )
    for src, expected in cases:
        actual = _to_markdown(src).strip()
        # _to_markdown returns UTF-8 encoded output; decode it so that the
        # comparison with the text literals above also holds on Python 3,
        # where bytes never compare equal to str.
        if isinstance(actual, bytes):
            actual = actual.decode('utf-8')
        assert actual == expected, \
            ("Rendering of '%(src)s'"
             " expected to be:\n\t '%(expected)s'\n"
             " but gotten:\n\t '%(actual)s'\n instead."
             % {'src': src, 'expected': expected, 'actual': actual})


# Allow running the checks directly as a script.
if __name__ == "__main__":
    test_link_patterns()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment