Created
September 30, 2014 21:58
-
-
Save elidickinson/213ec78c54e3aab9aedb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import re
import sys

import dns.exception
import dns.resolver
# Memo of MX-lookup verdicts keyed by domain string, so each distinct domain
# is queried at most once per run.
domain_cache = {}


def test_email_domain(domain):
    """Return True if *domain* has at least one MX record.

    Args:
        domain: Domain part of an e-mail address, or None when the address
            had no parsable domain.

    Returns:
        False for a None domain or when the lookup fails; otherwise whether
        the MX answer is non-empty. Definitive verdicts are stored in
        ``domain_cache`` and reused on later calls.
    """
    if domain is None:
        return False

    cached = domain_cache.get(domain)
    if cached is not None:
        return cached

    # Progress output: one line per domain that actually hits the network.
    print(domain)
    try:
        answers = dns.resolver.query(domain, 'MX')
        result = len(answers) > 0
    except dns.exception.Timeout:
        # A timeout is transient, not proof the domain is bad: report failure
        # for this row but do not cache it, so a later row can retry.
        return False
    except dns.exception.DNSException:
        # Covers NXDOMAIN / NoAnswer / NoNameservers as before, plus malformed
        # names (e.g. empty or over-long labels) that would otherwise abort
        # the whole run on a single bad input row.
        result = False

    domain_cache[domain] = result
    return result
def domain_from_email(email):
    """Return everything after the first ``@`` in *email*, or None.

    Args:
        email: Address string. A falsy value (None, False, "") is accepted
            and treated as "no address".

    Returns:
        The text following the first ``@`` up to the end of the string, or
        None when *email* is falsy or has no ``@`` followed by at least one
        character.
    """
    # Callers may hand over None/False for a missing cell; re.search would
    # raise TypeError on those, so bail out first.
    if not email:
        return None
    found = re.search("@(.+)$", email)
    return found.group(1) if found else None
# Compiled once at import time; all are applied to the lower-cased address.
_EMAIL_SHAPE_RE = re.compile(r"^.+@.+\..+$")
_BOGUS_LOCAL_RE = re.compile(r"^(asdf|aaa+|junk|spam|abcd?|1234?|qwerty?)@")
_ONE_CHAR_PARTS_RE = re.compile(r"^.@.\.")
_BOGUS_DOMAIN_RE = re.compile(r"@(asdf|a+|jkl|1234?|abcd?|spam|mailinator)\.")


def email_looks_valid(email):
    """Heuristically decide whether *email* looks like a real address.

    Args:
        email: Address string. A falsy value (None, False, "") is accepted
            and reported as not valid.

    Returns:
        True when the address has the shape ``something@something.something``
        and does not hit any of the throwaway patterns: a junk local part
        (``asdf``, ``spam``, ...), a single-character local part with a
        single-character first domain label, or a junk first domain label
        (``asdf``, ``mailinator``, ...). False otherwise.
    """
    if not email:
        return False
    email = email.lower()

    if _EMAIL_SHAPE_RE.match(email) is None:
        return False
    if _BOGUS_LOCAL_RE.match(email) or _ONE_CHAR_PARTS_RE.match(email):
        return False
    # search(), not match(): this pattern starts at the "@", which is never
    # at position 0 of a well-shaped address, so match() could not fire.
    if _BOGUS_DOMAIN_RE.search(email):
        return False
    return True
# Script body: copy input.csv to output.csv, appending two columns per row:
#   email_valid  - result of email_looks_valid() on the row's address
#   domain_valid - result of test_email_domain() on the address's domain
if len(sys.argv) < 3:
    sys.exit("USAGE: filter_bad_emails.py input.csv output.csv")
in_file_name = sys.argv[1]
out_file_name = sys.argv[2]

# `with` guarantees both files are flushed and closed, even if a row fails.
# NOTE(review): mode 'w' is kept from the original; depending on the Python
# version/platform the csv module may want 'wb' or newline='' here - confirm.
with open(in_file_name) as in_file, open(out_file_name, 'w') as out_file:
    csv_in = csv.DictReader(in_file)

    # Keep the input's column order and put the two new columns last, instead
    # of relying on the key order of the first row's dict.
    out_fields = list(csv_in.fieldnames or [])
    for extra in ('email_valid', 'domain_valid'):
        if extra not in out_fields:
            out_fields.append(extra)

    csv_out = csv.DictWriter(out_file, fieldnames=out_fields)
    # Written up front so a header-only input still yields a header row.
    csv_out.writeheader()

    for row in csv_in:
        # Accept the common capitalisations of the column name; fall back to
        # "" (not False/None) so the string helpers below never see a
        # non-string when the cell is blank or the column is absent.
        email = row.get("email") or row.get("Email") or row.get("EMAIL") or ""
        domain = domain_from_email(email)
        row['email_valid'] = email_looks_valid(email)
        row['domain_valid'] = test_email_domain(domain)
        csv_out.writerow(row)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment