Last active
August 29, 2015 14:17
-
-
Save tripleee/93d9c4c152e99fa4d976 to your computer and use it in GitHub Desktop.
Simple email regex test suite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# http://stackoverflow.com/questions/29264036/how-to-properly-escape-within-nested-parentheses-and-brackets | |
import urllib | |
import re | |
def tests (): | |
"""Download test suite; return a list of addresses to test with.""" | |
tests_url = 'http://code.iamcal.com/php/rfc822/tests/tests.xml' | |
tests_xml = urllib.urlopen(tests_url).read() | |
######## FIXME: fugly hand scraping | |
test_re = r'<test>\s*' \ | |
r'(?:<valid>(?P<valid1>(?:true|false))</valid>\s*' \ | |
r'<warning>(?P<warning1>(?:true|false))</warning>\s*)?' \ | |
r'(?:<comment>(?P<comment1>.*?)</comment>\s*)?' \ | |
r'<address>(?P<address>.*?)</address>\s*' \ | |
r'(?:<!--.*?-->\s*)?' \ | |
r'(?:<valid>(?P<valid2>(?:true|false))</valid>\s*' \ | |
r'<warning>(?P<warning2>(?:true|false))</warning>\s*)?' \ | |
r'(?:<comment>(?P<comment2>.*?)</comment>\s*)?' \ | |
r'<source>.*?</source>\s*' \ | |
r'(?:<comment>(?P<comment2>.*?)</comment>\s*)?' \ | |
r'<source>.*?</source>\s*' \ | |
r'(?:<comment>(?P<comment3>.*?)</comment>\s*)?' \ | |
r'(?:<sourcelink>.*?</sourcelink>\s*)?' \ | |
r'<id>(?P<id>.*?)</id>' | |
tests = [] | |
for m in re.finditer(test_re, tests_xml): | |
g = m.groupdict() | |
suffix = None | |
if 'valid1' in g: | |
suffix = '1' | |
elif 'valid2' in g: | |
suffix = '2' | |
if suffix: | |
g['valid'] = g['valid' + suffix] | |
del g['valid' + suffix] | |
if 'warning' + suffix in g: | |
g['warning'] = g['warning' + suffix] | |
del g['warning' + suffix] | |
suffix = None | |
if 'comment1' in g: | |
suffix = None | |
if 'comment1' in g: | |
suffix = '1' | |
elif 'comment2' in g: | |
suffix = '2' | |
elif 'comment3' in g: | |
suffix = '3' | |
if suffix: | |
g['comment'] = g['comment' + suffix] | |
del g['comment' + suffix] | |
g['address'] = g['address'].replace('␀', '\0') | |
tests.append(g) | |
return tests | |
def main(): | |
res = { | |
'orig': r"^((?P<name>[^(\s+\<)]*)\s+\<)?(?P<email>[^@]+?@[^>]+)>?$", | |
'kasra': r'(\w+)@([\w.]+)', | |
#'lostspoon_deleted': | |
# '([\\w-+]+(?:\\.[\\w-+]+)*@(?:[\\w-]+\\.)+[a-zA-Z]{2,7})', | |
'mine': r'^\s*(?P<name>[^\s<>](?:.*?[^\s<>])?)??\s*' \ | |
r'<?(?P<email>[^<>@\s]+@[^<>@\s]+)>?$', | |
'html5': r'''^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9]''' \ | |
r'(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9]' \ | |
r'(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$', | |
'rfc822pat': ''.join(r''' | |
[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\ | |
xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xf | |
f\n\015()]*)*\)[\040\t]*)*(?:(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\x | |
ff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"[^\\\x80-\xff\n\015 | |
"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*")[\040\t]*(?:\([^\\\x80-\ | |
xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80 | |
-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]* | |
)*(?:\.[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\ | |
\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\ | |
x80-\xff\n\015()]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x8 | |
0-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|"[^\\\x80-\xff\n | |
\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*")[\040\t]*(?:\([^\\\x | |
80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^ | |
\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040 | |
80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^ | |
\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040 | |
\t]*)*)*@[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([ | |
^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\ | |
\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\ | |
x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80- | |
\xff\n\015\[\]]|\\[^\x80-\xff])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015() | |
]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\ | |
x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:\.[\04 | |
0\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\ | |
n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\ | |
015()]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?! | |
[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\ | |
]]|\\[^\x80-\xff])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\ | |
x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\01 | |
5()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*)*|(?:[^(\040)<>@,;:". | |
\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff] | |
)|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*")[^ | |
()<>@,;:".\\\[\]\x80-\xff\000-\010\012-\037]*(?:(?:\([^\\\x80-\xff\n\0 | |
15()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][ | |
^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)|"[^\\\x80-\xff\ | |
15()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][ | |
^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)|"[^\\\x80-\xff\ | |
n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015"]*)*")[^()<>@,;:".\\\[\]\ | |
x80-\xff\000-\010\012-\037]*)*<[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(? | |
:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80- | |
\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:@[\040\t]* | |
(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015 | |
()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015() | |
]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\0 | |
40)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\ | |
[^\x80-\xff])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\ | |
xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]* | |
)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:\.[\040\t]*(?:\([^\\\x80 | |
-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x | |
80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t | |
]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\ | |
\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff]) | |
*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x | |
80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80 | |
-\xff\n\015()]*)*\)[\040\t]*)*)*(?:,[\040\t]*(?:\([^\\\x80-\xff\n\015( | |
)]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\ | |
\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*@[\040\t | |
]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\0 | |
15()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015 | |
()]*)*\)[\040\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^( | |
\040)<>@,;:".\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]| | |
\\[^\x80-\xff])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80 | |
-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015() | |
]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:\.[\040\t]*(?:\([^\\\x | |
80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^ | |
\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040 | |
\t]*)*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:". | |
\\\[\]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff | |
])*\])[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\ | |
\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x | |
80-\xff\n\015()]*)*\)[\040\t]*)*)*)*:[\040\t]*(?:\([^\\\x80-\xff\n\015 | |
()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\ | |
\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*)?(?:[^ | |
(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000- | |
\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\xff\ | |
n\015"]*)*")[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]| | |
\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\)) | |
[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:\.[\040\t]*(?:\([^\\\x80-\xff | |
\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\x | |
ff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*( | |
?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\ | |
000-\037\x80-\xff])|"[^\\\x80-\xff\n\015"]*(?:\\[^\x80-\xff][^\\\x80-\ | |
xff\n\015"]*)*")[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\x | |
ff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*) | |
*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*)*@[\040\t]*(?:\([^\\\x80-\x | |
ff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80- | |
\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*) | |
*(?:[^(\040)<>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\ | |
]\000-\037\x80-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\] | |
)[\040\t]*(?:\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80- | |
\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\x | |
ff\n\015()]*)*\)[\040\t]*)*(?:\.[\040\t]*(?:\([^\\\x80-\xff\n\015()]*( | |
?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()]*(?:\\[^\x80-\xff][^\\\x80 | |
-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*)*\)[\040\t]*)*(?:[^(\040)< | |
>@,;:".\\\[\]\000-\037\x80-\xff]+(?![^(\040)<>@,;:".\\\[\]\000-\037\x8 | |
0-\xff])|\[(?:[^\\\x80-\xff\n\015\[\]]|\\[^\x80-\xff])*\])[\040\t]*(?: | |
\([^\\\x80-\xff\n\015()]*(?:(?:\\[^\x80-\xff]|\([^\\\x80-\xff\n\015()] | |
*(?:\\[^\x80-\xff][^\\\x80-\xff\n\015()]*)*\))[^\\\x80-\xff\n\015()]*) | |
*\)[\040\t]*)*)*>)'''.split('\n')), | |
'final': r"^((?P<name>[^(\\s+\<)]*)\s+<)?(?P<email>[^@]+?@[^>]+)>?$"} | |
testlist = tests() | |
i=1 | |
for t in testlist: | |
print('%d id: %s regex: %s' % (i, t['id'], t['address'])) | |
i+=1 | |
tn = dict() | |
tp = dict() | |
fn = dict() | |
fp = dict() | |
success = dict() | |
failure = dict() | |
for regex in res: | |
tn[regex] = 0 | |
tp[regex] = 0 | |
fn[regex] = 0 | |
fp[regex] = 0 | |
success[regex] = 0 | |
failure[regex] = 0 | |
try: | |
r = re.compile(res[regex]) | |
except Exception, err: | |
print('ERROR: %s: failed to compile %s: %s' % ( | |
regex, res[regex], err)) | |
continue | |
for pretest in ['someone@example.com', | |
'Some One <someone@example.com>']: | |
result = r.match(pretest) | |
if result is None: | |
print('ERROR: %s: failed to match pretest %r: %s' % ( | |
regex, pretest, res[regex])) | |
break | |
print('test %s: pretest %s: %s' % ( | |
regex, pretest, result.groupdict())) | |
for test in testlist: | |
for templ in ['Firstname Lastname <%s>', '<%s>', | |
'"Lastname, Firstname" <%s>', '%s']: | |
s = templ % (test['address']) | |
result = r.match(s) | |
print('test %s: %s (regex %s): %s (expect %s)' % ( | |
test['id'], s, regex, result!=None, test['valid'])) | |
if result != None: | |
if 'email' in result.groupdict() and \ | |
result.group('email') == test['address']: | |
verdict = 'success' | |
success[regex]+=1 | |
else: | |
verdict = 'failure' | |
failure[regex]+=1 | |
print(' => %s: %r' % (verdict, result.groupdict())) | |
if test['valid'] == 'true': | |
tp[regex]+=1 | |
if test['valid'] == 'true': | |
tp[regex]+=1 | |
else: | |
fp[regex]+=1 | |
else: | |
failure[regex]+=1 | |
if test['valid'] == 'true': | |
fn[regex]+=1 | |
else: | |
tn[regex]+=1 | |
print ('Total FN FP TN TP S F Regex') | |
for regex in res: | |
total = fn[regex] + fp[regex] + tn[regex] + tp[regex] | |
print ('%4d %4d %4d %4d %4d %4d %4d %s' % ( | |
total, fn[regex], fp[regex], tn[regex], tp[regex], | |
success[regex], failure[regex], regex)) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment