Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Python script testing regular expressions to detect URLs in strings.
# -*- coding: utf-8 -*-
"""
URL Detection
I'm looking for a roughly accurate URL counter for spam detection in user-submitted content.
REFERENCES
http://daringfireball.net/2010/07/improved_regex_for_matching_urls
"""
import re
# Positive fixtures: one candidate per line, each containing a URL-like string
# that a detector is expected to find (bare URLs, URLs wrapped in parentheses,
# angle brackets or tags, non-http schemes, scheme-less hosts, non-ASCII hosts
# and paths).
# NOTE: the literal does not end with a newline (the closing quotes follow the
# last entry directly), so a separator must be added before appending any other
# newline-delimited block to it.
should_match = """\
http://foo.com/blah_blah
http://foo.com/blah_blah/
https://foo.com/blah_blah
https://foo.com/blah_blah/
(Something like http://foo.com/blah_blah)
http://foo.com/blah_blah_(wikipedia)
http://foo.com/more_(than)_one_(parens)
(Something like http://foo.com/blah_blah_(wikipedia))
http://foo.com/blah_(wikipedia)#cite-1
http://foo.com/blah_(wikipedia)_blah#cite-1
http://foo.com/unicode_(✪)_in_parens
http://foo.com/(something)?after=parens
http://foo.com/blah_blah.
http://foo.com/blah_blah/.
<http://foo.com/blah_blah>
<http://foo.com/blah_blah/>
http://foo.com/blah_blah,
http://www.extinguishedscholar.com/wpglob/?p=364.
http://✪df.ws/1234
rdar://1234
rdar:/1234
x-yojimbo-item://6303E4C1-6A6E-45A6-AB9D-3A908F59AE0E
message://%3c330e7f840905021726r6a4ba78dkf1fd71420c1bf6ff@mail.gmail.com%3e
http://➡.ws/䨹
www.c.ws/䨹
<tag>http://example.com</tag>
Just a www.example.com link.
http://example.com/something?with,commas,in,url, but not at end
What about <mailto:gruber@daringfireball.net?subject=TEST> (including brokets).
mailto:name@example.com
bit.ly/foo
“is.gd/foo/”
WWW.EXAMPLE.COM
http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))/Web_ENG/View_DetailPhoto.aspx?PicId=752
http://www.asianewsphoto.com/(S(neugxif4twuizg551ywh3f55))
http://lcweb2.loc.gov/cgi-bin/query/h?pp/horyd:@field(NUMBER+@band(thc+5a46634))
magnet:?xt=urn:btih:c12fe1c06bba254a9dc9f519b335aa7c1367a88a&dn"""
# Negative fixtures: newline-separated strings that contain a colon or a dot
# but are not URLs; test_negative() expects every detector to find nothing.
should_fail = '\n'.join((
    '6:00p',
    'filename.txt',
))
# Newline-separated URL-like inputs recorded as known failures; they are still
# fed to the positive test so the per-pattern output shows which ones are missed.
known_to_fail = '\n'.join((
    'http://example.com/quotes-are-“part”',
    '✪df.ws/1234',
    'example.com',
    'example.com/',
))
# Source: https://github.com/lepture/mistune/blob/master/mistune.py#L470
# Matches only explicit http:// or https:// URLs: the scheme, then one or more
# characters that are neither whitespace nor '<', ending on a character that is
# not '<', whitespace, or one of . , : ; " ' ) ]  (so trailing punctuation is
# left out of the match). The whole URL is the single capture group.
MISTUNE_URL = r'''(https?:\/\/[^\s<]+[^<.,:;"')\]\s])'''
# Source: https://gist.github.com/uogbuji/705383
# Note: Returns groups: url is first item in group
# (the pattern has several capture groups, so re.findall() yields one tuple per
# match and the full URL is element 0 of that tuple).
# Case-insensitive web-URL pattern. A match starts at a word boundary with one
# of: "http://" / "https://", "www" plus up to three digits and a dot, or a
# host-like run ending in a 2-4 letter suffix and "/". The body is any mix of
# non-space, non-paren, non-angle-bracket runs and balanced parentheses (nested
# at most two deep). The match must end on a balanced paren group or on a
# character that is not whitespace or one of the listed punctuation/quote marks.
# NOTE: the ur'' prefix is Python 2 syntax; Python 3 rejects it as a SyntaxError.
GRUBER_URL = ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|' \
ur'\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|' \
ur'[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))'
# Source: http://daringfireball.net/2010/07/improved_regex_for_matching_urls
# Same overall shape as GRUBER_URL, but the leading alternative accepts any
# scheme of the form <letter><word chars or '-'>: followed by one to three
# slashes or by a letter, digit or '%' (e.g. "rdar:/", "mailto:n"), in addition
# to the "www." and host-with-suffix forms. The final character class spells
# the guillemets and curly quotes as literal characters rather than escapes.
# Like GRUBER_URL it contains several capture groups, so re.findall() returns
# tuples whose first element is the full URL.
# NOTE: the ur'' prefix is Python 2 syntax; Python 3 rejects it as a SyntaxError.
GRUBER_ALL = ur'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|' \
ur'[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|' \
ur'\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|' \
ur'[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
def test_positive():
    """Run every detector over the positive fixtures and print the outcome.

    Two passes are made for each of the three patterns (GRUBER_URL,
    GRUBER_ALL, MISTUNE_URL):

    1. Line by line: each fixture line is searched on its own and reported as
       PASS (at least one match) or FAIL (no match). Per-pattern totals are
       collected as ``[passed, failed]``.
    2. Whole text: all fixture lines are searched as one newline-joined text
       and the number of matches is reported against the number of lines.

    The fixtures are the lines of ``should_match`` followed by the lines of
    ``known_to_fail``. Nothing is returned; all output goes to stdout. Every
    ``print(...)`` call takes a single argument, so the output is the same
    under the Python 2 print statement and the Python 3 print function.
    """
    # Split each fixture on its own: ``should_match`` has no trailing newline,
    # so concatenating the raw strings first would fuse its last line with the
    # first line of ``known_to_fail`` into one bogus test case.
    test_cases = should_match.split('\n') + known_to_fail.split('\n')
    # The whole-text pass searches exactly the lines counted in
    # ``len(test_cases)`` so that "finds X of Y" compares like with like.
    corpus = '\n'.join(test_cases)

    # Compile once; both passes reuse the same compiled detectors.
    detectors = [
        (name, re.compile(pattern))
        for name, pattern in (
            ('Gruber URL', GRUBER_URL),
            ('Gruber All', GRUBER_ALL),
            ('Mistune', MISTUNE_URL),
        )
    ]

    results = {}
    for name, detector in detectors:
        print('TESTING %s' % name)
        results[name] = [0, 0]  # [passed, failed]
        for test_case in test_cases:
            matches = detector.findall(test_case)
            if matches:
                print('PASS: %s -> %s' % (test_case, matches))
                results[name][0] += 1
            else:
                print('FAIL: %s' % test_case)
                results[name][1] += 1

    for name, detector in detectors:
        matches = detector.findall(corpus)
        print('%s finds %s of %s' % (name, len(matches), len(test_cases)))
    print(results)
def test_negative():
    """Check that no detector reports a URL in the ``should_fail`` fixtures.

    Each line of ``should_fail`` is searched with each of the three patterns
    (GRUBER_URL, GRUBER_ALL, MISTUNE_URL). A line with no match is printed as
    PASS, a line with any match as FAIL. The per-pattern ``[passed, failed]``
    totals are printed at the end. Nothing is returned.
    """
    detectors = (
        ('Gruber URL', GRUBER_URL),
        ('Gruber All', GRUBER_ALL),
        ('Mistune', MISTUNE_URL),
    )
    lines = should_fail.split('\n')
    results = {}
    for label, regex in detectors:
        print('NEGATIVE TEST FOR %s' % label)
        compiled = re.compile(regex)
        passed = failed = 0
        for line in lines:
            found = compiled.findall(line)
            if not found:
                # An empty match list is the desired outcome here.
                print('PASS: %s -> %s' % (line, found))
                passed += 1
            else:
                print('FAIL: %s' % line)
                failed += 1
        results[label] = [passed, failed]
    print(results)
# Script entry point: run the positive suite first, then the negative one.
if __name__ == '__main__':
    for suite in (test_positive, test_negative):
        suite()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.