-
-
Save aballant01/8eb51ba4b9184a65da40 to your computer and use it in GitHub Desktop.
Functions and tests for validating an Email address
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import re | |
def is_valid_email_address(email):
    """Check whether *email* is an acceptably formed email address.

    The address is split into a local part and a domain. The local part is
    scanned one character at a time by a small two-state machine ("local" for
    unquoted text, "escape" for text inside double quotes), and the domain is
    handed to ``is_email_domain_valid``.

    Args:
        email: The address to check, as a string.

    Returns:
        False for blank input, input with no '@' separator, an empty local
        part (e.g. "@example.com") or a local part that fails the scan;
        otherwise the result of ``is_email_domain_valid`` for the domain.

    Things missing:
    - IPv6 domains - properly formatted IPv4 domains are supported, IPv6
      domains are not
    - Unicode is not dealt with or checked properly - all emails are treated
      as ASCII strings
    - The domain is not checked to be an actual, proper domain name - e.g.
      the domain "example.web", which wouldn't actually be allowed, will pass
      the domain check
    """
    if email.strip() == '':
        return False

    # An '@' acts as the local/domain separator unless another '@' follows it
    # with at least one character in between; every such earlier '@' stays in
    # the local part. Splitting with a negative lookahead (rather than
    # swapping the extra '@'s for a placeholder token and back) means input
    # that happens to contain the placeholder text is never altered.
    parts = re.split(r"@(?!.+?@)", email)

    # If we weren't able to split it up into a local component and a domain
    # component, then the email address cannot be valid
    if len(parts) < 2:
        return False

    local = parts[0]
    # "@example.com" has no local part at all, so there is nothing to accept
    if local == '':
        return False

    # Compiled once up front; each is applied to a one- or two-character
    # window of the local part inside the loop.
    opens_escape = re.compile(r"(?:^|\.)\"")
    disallowed = re.compile(r"[\s\"(),:;<>@\[\]]")
    closes_escape = re.compile(r"[^\\]?\"(?:\.|$)")

    # A leading double quote means the scan starts inside a quoted section
    state = 'escape' if local[0] == '"' else 'local'

    skip = False
    for i, char in enumerate(local):
        # The previous character was a backslash inside quotes, so this
        # character is escaped and is not inspected
        if skip:
            skip = False
            continue

        if state == 'local':
            # NOTE(review): this looks at the character *before* the current
            # one, so it fires when the previous character is a double quote;
            # the character right after a closing quote therefore re-enters
            # the quoted state without being checked itself. Left unchanged
            # to stay backward-compatible with inputs currently accepted.
            if opens_escape.match(local[i-1:i]):
                state = 'escape'
            # No disallowed characters may appear outside of quotes
            elif disallowed.match(char):
                return False
            # No double periods outside of quotes
            elif local[i:i+2] == '..':
                return False
        elif state == 'escape':
            # A double quote that is not preceded by a backslash ends the
            # quoted section
            if closes_escape.match(local[i-1:i+1]):
                state = 'local'
            # A backslash escapes whatever character comes next
            elif char == '\\':
                skip = True

    # The local component passed the scan, so the verdict now rests on
    # whether the domain component is valid
    return is_email_domain_valid(parts[1])
def is_email_domain_valid(domain):
    """Check whether *domain* (the part of an address after '@') is acceptable.

    Two forms are accepted:

    - An IPv4 address, optionally wrapped in a matching pair of square
      brackets, whose four octets are each in the range 0-255.
    - A host name that starts with a word character, contains at least one
      period, has no two periods in sequence, does not start or end with
      '-', '_' or '.', and contains no double quote, '@', whitespace or
      parenthesis.

    No check is made that the name is a registered or resolvable domain.

    Args:
        domain: The domain portion of an email address, as a string.

    Returns:
        True if the domain matches one of the forms above, otherwise False.
    """
    # If the domain only has numbers, periods, and brackets, we'll treat it
    # as an IP address. \Z (rather than $) keeps a trailing newline from
    # slipping through as part of an otherwise valid address.
    if re.match(r"^[\d.\[\]]+\Z", domain):
        ip = re.match(
            r"^(\[?)(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})(\]?)\Z",
            domain,
        )
        if ip is None:
            return False

        # The pattern allows each bracket independently, so make sure they
        # come as a pair: either both present or both absent
        has_opening = ip.group(1) != ''
        has_closing = ip.group(6) != ''
        if has_opening != has_closing:
            return False

        # Three digits can spell values up to 999; an octet stops at 255
        return all(int(octet) <= 255 for octet in ip.group(2, 3, 4, 5))

    # The domain must begin with a word character (re.match only looks at
    # the start of the string)
    if not re.match(r"\w", domain):
        return False

    # Matches an invalid character sequence in a domain name
    #   ["@\s()]  - Domain cannot have these characters in it
    #   \.\.      - Cannot have two periods in sequence
    #   ^[^.]+$   - Must have at least one period in it
    #   ^[-_.]    - Cannot start with one of these characters
    #   [-_.]$    - Cannot end with one of these characters
    invalid_character_sequence = re.compile(
        r'["@\s()]|\.\.|^[^.]+$|^[-_.]|[-_.]$'
    )
    return invalid_character_sequence.search(domain) is None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import email_validation | |
import unittest | |
class EmailTest(unittest.TestCase):
    """Unit tests for the ``email_validation`` module.

    Each test walks a tuple of sample inputs and passes the input itself as
    the assertion message, so a failure names the offending case.
    """

    def testValidDomainsAreValid(self):
        """Ordinary host names and a bracketed IPv4 address are accepted."""
        domains = (
            "gmail.com",
            "yahoo.com",
            "example.co.uk",
            "example-one.com",
            "[123.123.123.123]",
        )
        for domain in domains:
            self.assertTrue(
                email_validation.is_email_domain_valid(domain), domain)

    def testInvalidDomainsAreInvalid(self):
        """Unbalanced brackets and an over-long octet are rejected."""
        domains = (
            "123.123.123.123]",
            "[123.123.123.123",
            "123.123.123.1233",
        )
        for domain in domains:
            self.assertFalse(
                email_validation.is_email_domain_valid(domain), domain)

    def testValidSimpleEmailAddresses(self):
        """Plain unquoted addresses are accepted."""
        addresses = (
            'niceandsimple@example.com',
            'very.common@example.com',
            'a.little.lengthy.but.fine@dept.example.com',
            'disposable.style.email_validation.with+symbol@example.com',
            'other.email-with-dash@example.com',
        )
        for address in addresses:
            self.assertTrue(
                email_validation.is_valid_email_address(address), address)

    def testValidComplexEmailAddresses(self):
        """Addresses with quoted local parts are accepted."""
        addresses = (
            '"much.more unusual"@example.com',
            '"very.unusual.@.unusual.com"@example.com',
            # NOTE: these two are ordinary (non-raw) literals, so each \"
            # reaches the validator as a bare double quote and each \\ as a
            # single backslash.
            '"very.(),:;<>[]\".VERY.\"very@\\ \"very\".unusual"@strange.example.com',
            '"()<>:,;@\\"!#$%&\'*+-/=?_`{}| ~.a"@example.org',
        )
        for address in addresses:
            self.assertTrue(
                email_validation.is_valid_email_address(address), address)

    def testInvalidEmailAddresses(self):
        """Malformed local parts and malformed domains are rejected."""
        addresses = (
            'Abc.example.com',
            'A@b@c@example.com',
            # Raw literals: the backslashes must reach the validator as
            # typed. In a non-raw literal "\a" is the BEL control character
            # and "\k" / "\ " are invalid escape sequences.
            r'a"b(c)d,e:f;g<h>i[j\k]l@example.com',
            'just"not"right@example.com',
            r'this is"not\allowed@example.com',
            # No trailing space after the domain, so the local part (not a
            # whitespace-tainted domain) is what gets this one rejected
            r'this\ still\"not\\allowed@example.com',
            'email@example',
            'email@-example.com',
            'email@111.222.333.44444',
            'email@example..com',
            'Abc..123@example.com',
        )
        for address in addresses:
            self.assertFalse(
                email_validation.is_valid_email_address(address), address)
# Run the test cases through unittest's command-line runner, but only when
# this file is executed directly rather than imported.
if __name__ == '__main__':
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
So this is my overly complicated email validation tool. I've tried to capture lots of different types of unusual valid and invalid emails, and doing that with pure regular expressions seemed... difficult. Namely, matching character sequences only inside a properly formatted pair of quotes, when other quotes can themselves appear inside that pair, is quite difficult (or impossible) for a regex. If someone can show otherwise, I'd love to see it.
The code here is... ugly. I'm working on cleaning it up, simplifying the logic and removing inconsistencies (like the mix of double- and single-quoted strings); I'll update the gist each time I do. I didn't have a completely clear idea beforehand of what steps the finite-state machine (FSM) was going to perform, and I haven't really figured out a good way to format/organize an FSM.