Skip to content

Instantly share code, notes, and snippets.

@elidickinson
Created September 30, 2014 21:58
Show Gist options
  • Save elidickinson/213ec78c54e3aab9aedb to your computer and use it in GitHub Desktop.
Save elidickinson/213ec78c54e3aab9aedb to your computer and use it in GitHub Desktop.
import dns.resolver
import csv
import re
import sys
# Results of MX lookups keyed by domain name, so that each distinct domain is
# queried at most once per run.
domain_cache = {}


def test_email_domain(domain):
    """Return True if *domain* has at least one MX record, else False.

    Results (both positive and negative) are memoised in ``domain_cache``.
    A missing domain (``None`` or an empty string) is reported as False
    without touching the cache or the network.

    Any DNS-level failure -- non-existent domain, no MX answer, no reachable
    name servers, a timeout, or a syntactically invalid name -- is treated as
    "domain not valid" rather than aborting the caller's whole run.
    """
    if not domain:
        return False

    cached = domain_cache.get(domain)
    if cached is not None:
        return cached

    # Progress output: one line per domain that actually triggers a lookup.
    # Written as a call so the statement is valid on both Python 2 and 3.
    print(domain)

    try:
        answers = dns.resolver.query(domain, 'MX')
        result = len(answers) > 0
    except dns.exception.DNSException:
        # DNSException is the common base class of NXDOMAIN, NoAnswer,
        # NoNameservers, Timeout and the dns.name syntax errors.
        result = False

    domain_cache[domain] = result
    return result


# The name starts with "test_", so tell pytest/nose explicitly that this is
# not a test case in case the module is ever imported into a test file.
test_email_domain.__test__ = False
def domain_from_email(email):
    """Return the domain part of *email*, or None if there is none.

    The domain is everything after the last "@" up to the end of the string
    (a single trailing newline is ignored). Using the last "@" keeps
    addresses with a quoted local part such as '"a@b"@example.com' from
    yielding a bogus domain.

    Falsy input (None, False, "") returns None instead of raising, since a
    row with a missing or empty e-mail column simply has no domain.
    """
    if not email:
        return None
    # [^@\n]+ : at least one character, none of them "@", and -- like the
    # "." it replaces -- never a newline.
    found = re.search(r"@([^@\n]+)$", email)
    if found is None:
        return None
    return found.group(1)
def email_looks_valid(email):
    """Heuristically decide whether *email* looks like a real address.

    Returns True only when the address has the rough shape
    ``something@something.something`` and does not hit any of the
    "obviously junk" patterns below. The comparison is case-insensitive.
    Falsy input (None, False, "") returns False.
    """
    if not email:
        return False
    email = email.lower()

    # Rough shape check: local part, "@", and a domain containing a dot.
    if re.match(r"^.+@.+\..+$", email) is None:
        return False

    # Throw-away local parts such as asdf@, aaa@, 123@, qwerty@.
    if re.match(r"^(asdf|aaa+|junk|spam|abcd?|1234?|qwerty?)@", email):
        return False

    # Single-character local part and single-character first domain label,
    # e.g. a@b.com.
    if re.match(r"^.@.\.", email):
        return False

    # Throw-away domains. This has to be re.search: re.match only tries the
    # pattern at position 0, and an address that passed the shape check
    # never starts with "@", so with re.match this rule could never fire.
    if re.search(r"@(asdf|a+|jkl|1234?|abcd?|spam|mailinator)\.", email):
        return False

    return True
# Command-line driver: read input.csv, append the columns "email_valid" and
# "domain_valid" to every row, and write the result to output.csv.
if len(sys.argv) < 3:
    sys.exit("USAGE: filter_bad_emails.py input.csv output.csv")
in_file_name = sys.argv[1]
out_file_name = sys.argv[2]

# The output file is opened lazily, once the first row is available to supply
# the column names; an input with no data rows therefore creates no output.
out_file = None
csv_out = None
try:
    with open(in_file_name) as in_file:
        csv_in = csv.DictReader(in_file)
        for row in csv_in:
            # Accept the three common spellings of the column header.
            email = row.get("email") or row.get("Email") or row.get("EMAIL")
            if email:
                row['email_valid'] = email_looks_valid(email)
                row['domain_valid'] = test_email_domain(domain_from_email(email))
            else:
                # Missing or empty e-mail cell: nothing to validate, and the
                # helper functions must not be handed a non-string value.
                row['email_valid'] = False
                row['domain_valid'] = False
            if csv_out is None:
                out_file = open(out_file_name, 'w')
                csv_out = csv.DictWriter(out_file, fieldnames=list(row.keys()))
                csv_out.writeheader()
            csv_out.writerow(row)
finally:
    # Close (and thereby flush) the output even if a row fails midway.
    if out_file is not None:
        out_file.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment