Skip to content

Instantly share code, notes, and snippets.

@aballant01
Last active January 3, 2016 00:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aballant01/8eb51ba4b9184a65da40 to your computer and use it in GitHub Desktop.
Save aballant01/8eb51ba4b9184a65da40 to your computer and use it in GitHub Desktop.
Functions and tests for validating an Email address
#!/usr/bin/python
import re
def is_valid_email_address(email):
    """Return whether ``email`` passes this module's email address checks.

    The address is split into a local part and a domain. Every ``@`` that is
    followed, at least one character later, by another ``@`` is kept as part
    of the local part; the split happens at the remaining ``@``. The local
    part is then scanned character by character with a two-state machine
    ("local" for unquoted text, "escape" for text inside double quotes), and
    the domain is handed to ``is_email_domain_valid``.

    Things missing:
    - IPv6 domains: properly formatted IPv4 domains are supported, IPv6
      domains are not.
    - Unicode is not given any special treatment.
    - The domain is not checked against real registries, e.g. the domain
      "example.web" passes the domain check.

    Args:
        email: The address to validate, as a string.

    Returns:
        False if the address is blank, has no ``@`` separator, has an empty
        local part, or the local part fails the character checks below.
        Otherwise, the result of ``is_email_domain_valid`` for the domain.
    """
    if email.strip() == '':
        return False

    # To properly account for the initial state of the validator, check for
    # an escape sequence initiation at the start of the email address.
    state = 'escape' if email[0] == '"' else 'local'

    # Split only on an "@" that is NOT followed (one or more characters
    # later) by another "@". Earlier "@" characters therefore stay inside
    # the local part without needing a placeholder token, so an address
    # that happens to contain placeholder-like text is left untouched.
    parts = re.split(r"@(?!.+?@)", email)

    # If we weren't able to split the address into a local component and a
    # domain component, then the email address cannot be valid.
    if len(parts) < 2:
        return False
    local, domain = parts[0], parts[1]

    # An address needs something in front of the "@".
    if local == '':
        return False

    skip = False
    for i, char in enumerate(local):
        # If this character has been escaped by a preceding backslash (as
        # determined below) then just move on to the next character.
        if skip:
            skip = False
            continue

        if state == 'local':
            # local[i-1:i] holds at most one character (and is empty when
            # i == 0, because the slice is local[-1:0]), so this fires when
            # the previous character is a double quote.
            # NOTE(review): the "\." alternative needs two characters and so
            # cannot match this slice - confirm the intended window.
            if re.match(r"(?:^|\.)\"", local[i-1:i]):
                state = 'escape'
            # Checks that no disallowed characters appear.
            elif re.match(r"[\s\"(),:;<>@\[\]]", char):
                return False
            # Ensures there are no double periods in a non-escape sequence.
            elif re.search(r"\.\.", local[i:i+2]):
                return False
        elif state == 'escape':
            # Checks for an escape sequence terminator. At i == 0 the slice
            # is local[-1:1], which is empty unless the local part is a
            # single character.
            if re.match(r"[^\\]?\"(?:\.|$)", local[i-1:i+1]):
                state = 'local'
            # A backslash escapes whatever character follows it.
            elif re.match(r"[\\]", char):
                skip = True
            # NOTE(review): this pattern needs three characters but the
            # slice holds at most two, so it does not appear able to match.
            elif re.match(r"[^\\]\\[^\"]", local[i-1:i+1]):
                return False

    # If we haven't returned by this point, the local component passed, so
    # the answer is whether the domain component is valid.
    return is_email_domain_valid(domain)
def is_email_domain_valid(domain):
    """Check a domain sequence (as split from an email address).

    Two forms are accepted:

    - An IPv4 address: four period-separated groups of one to three digits,
      each no larger than 255, optionally wrapped in a matching pair of
      square brackets.
    - A name that starts with a word character, contains at least one
      period, contains no double quote, ``@``, whitespace or parenthesis,
      has no two periods in sequence, and neither starts nor ends with
      ``-``, ``_`` or ``.``.

    The name is not checked against any registry, so a well-formed but
    non-existent domain still passes.

    Args:
        domain: The text after the ``@`` of an email address.

    Returns:
        True if the domain matches one of the forms above, False otherwise.
    """
    # If the domain only has numbers, periods, and brackets, treat it as an
    # IP address. "\Z" (rather than "$") is used so that a trailing newline
    # is not silently tolerated.
    if re.match(r"[\d.\[\]]+\Z", domain):
        ip_match = re.match(
            r"(\[?)(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})(\]?)\Z",
            domain)
        if ip_match is None:
            return False
        groups = ip_match.groups()
        opening, octets, closing = groups[0], groups[1:5], groups[5]
        # The brackets are individually optional in the pattern, so make
        # sure they are either both present or both absent.
        if bool(opening) != bool(closing):
            return False
        # Three digits allow values up to 999; an IPv4 octet stops at 255.
        return all(int(octet) <= 255 for octet in octets)

    # re.match anchors at the start, so this requires the first character
    # of the domain to be a word character.
    if not re.match(r"\w", domain):
        return False

    # Matches an invalid character sequence in a domain name
    #   [\"\@\s\(\)] - Domain cannot have these characters in it
    #   \.\.         - Cannot have two periods in sequence
    #   ^[^\.]+$     - Must have at least one period in it
    #   ^[-_\.]      - Cannot start with one of these characters
    #   [-_\.]$      - Cannot end with one of these characters
    invalid_character_sequence = re.compile(
        r'([\"\@\s\(\)]|\.\.|^[^\.]+$|^[-_\.]|[-_\.]$)')
    return re.search(invalid_character_sequence, domain) is None
#!/usr/bin/python
import email_validation
import unittest
class EmailTest(unittest.TestCase):
    """Exercises the domain and address validators in ``email_validation``."""

    def testValidDomainsAreValid(self):
        # Ordinary host names and a bracketed IPv4 address are accepted.
        self.assertTrue(email_validation.is_email_domain_valid("gmail.com"))
        self.assertTrue(email_validation.is_email_domain_valid("yahoo.com"))
        self.assertTrue(email_validation.is_email_domain_valid("example.co.uk"))
        self.assertTrue(email_validation.is_email_domain_valid("example-one.com"))
        self.assertTrue(email_validation.is_email_domain_valid("[123.123.123.123]"))

    def testInvalidDomainsAreInvalid(self):
        # Unbalanced brackets and an over-long final group are rejected.
        self.assertFalse(email_validation.is_email_domain_valid("123.123.123.123]"))
        self.assertFalse(email_validation.is_email_domain_valid("[123.123.123.123"))
        self.assertFalse(email_validation.is_email_domain_valid("123.123.123.1233"))

    def testValidSimpleEmailAddresses(self):
        # Unquoted local parts built from letters, periods, "+", "-" and "_".
        self.assertTrue(email_validation.is_valid_email_address('niceandsimple@example.com'))
        self.assertTrue(email_validation.is_valid_email_address('very.common@example.com'))
        self.assertTrue(email_validation.is_valid_email_address('a.little.lengthy.but.fine@dept.example.com'))
        self.assertTrue(email_validation.is_valid_email_address('disposable.style.email_validation.with+symbol@example.com'))
        self.assertTrue(email_validation.is_valid_email_address('other.email-with-dash@example.com'))

    def testValidComplexEmailAddresses(self):
        # Local parts that rely on double-quoted sections.
        self.assertTrue(email_validation.is_valid_email_address('"much.more unusual"@example.com'))
        self.assertTrue(email_validation.is_valid_email_address('"very.unusual.@.unusual.com"@example.com'))
        # NOTE(review): in this non-raw literal each \" collapses to a bare
        # double quote, so the address under test carries no backslash in
        # front of its inner quotes - confirm that is the intended input.
        self.assertTrue(email_validation.is_valid_email_address('"very.(),:;<>[]\".VERY.\"very@\\ \"very\".unusual"@strange.example.com'))
        self.assertTrue(email_validation.is_valid_email_address('"()<>:,;@\\"!#$%&\'*+-/=?_`{}| ~.a"@example.org'))

    def testInvalidEmailAddresses(self):
        # Missing "@", stray "@", unquoted special characters, bad domains
        # and doubled periods must all be rejected.
        self.assertFalse(email_validation.is_valid_email_address('Abc.example.com'))
        self.assertFalse(email_validation.is_valid_email_address('A@b@c@example.com'))
        # Backslashes are written as "\\" so the literals contain a real
        # backslash without relying on invalid escape sequences.
        self.assertFalse(email_validation.is_valid_email_address('a"b(c)d,e:f;g<h>i[j\\k]l@example.com'))
        self.assertFalse(email_validation.is_valid_email_address('just"not"right@example.com'))
        # "\\a" keeps a backslash followed by "a"; a bare "\a" would be the
        # BEL control character instead.
        self.assertFalse(email_validation.is_valid_email_address('this is"not\\allowed@example.com'))
        self.assertFalse(email_validation.is_valid_email_address('this\\ still\"not\\allowed@example.com '))
        self.assertFalse(email_validation.is_valid_email_address('email@example'))
        self.assertFalse(email_validation.is_valid_email_address('email@-example.com'))
        self.assertFalse(email_validation.is_valid_email_address('email@111.222.333.44444'))
        self.assertFalse(email_validation.is_valid_email_address('email@example..com'))
        self.assertFalse(email_validation.is_valid_email_address('Abc..123@example.com'))
if __name__ == '__main__':
    # Run the tests in this module when the file is executed directly.
    unittest.main()
@aballant01
Copy link
Author

So this is my overly complicated and complex email validation tool. I've tried to capture lots of different types of unusual valid and invalid emails, and to do that using pure regular expressions seemed...difficult. Namely, using a regex to match character sequences only within a properly formatted set of quotes when other quotes can appear within the former quotes is quite difficult (or impossible) for a regex. If someone can show otherwise, I'd love to see it.

The code here is...ugly. I'm working on cleaning it up and simplifying the logic/removing inconsistencies (like the mix between double and single quoted strings) - I'll update the gist each time I do. I didn't have a completely clear idea beforehand what steps the FSM was going to perform, and I haven't really figured out a good way to format/organize an FSM.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment