alucard001/clean_email.py

## clean_email.py
# coding: utf-8

import pandas as pd
import numpy as np
from validate_email import validate_email
import sys

# Note: you may encounter error if you don't have pyDNS or pyDNS3 installed.
import DNS
DNS.defaults["server"] = ["8.8.8.8", "8.8.4.4"]
DNS.defaults["server_rotate"] = True
# Default is UDP.  But why TCP, because when I wrote this, I have over 300,000 emails, UDP gave me a lot of timeout error
# even if I set the timeout second very low
DNS.defaults["protocol"] = "tcp"
# Timeout second for each checking
DNS.defaults["timeout"] = 2

# This is supposed to be a command line program,
# The format would be like:
#
# python2.7 clean_email.py <your csv file with only 1 email column> <file_name_saved_valid_email> <file_name_saved_invalid_email>
#
if(sys.argv[1] != ''):
    path = sys.argv[1]
else:
    path = "data/test_count_email.csv"
    sys.exit()


email = pd.read_csv(path, header=None, dtype=object)

unique_email = email.astype(str).drop_duplicates()

# Check if email address is correct, and add it to valid_email array

valid_email = []
# vaild_email_address = []
df_email = ''

ok_file = open(sys.argv[2], "w")
fail_file = open(sys.argv[3], "w")

for i, row in unique_email.iterrows():
    try:
        email = str(row[1]).lower()

        is_valid_email = validate_email(email, verify=True, smtp_timeout=2)

        if(is_valid_email):
            ok_file.write(",".join([email, row[2]]) + "\n")
        else:
            fail_file.write(",".join([email, row[2]]) + "\n")

    except Exception as e:
        # print(str(e))
        # print("Current email address:" + email)

        # email that causes exception will be written as failed one
        fail_file.write(",".join([email, row[2]]) + "\n")
        pass

# unique_email["valid"] = valid_email
# all_valid_address = unique_email[unique_email["valid"]==True]


# Write CSV file
# all_valid_address.to_csv("valid_email.csv", header=False, encoding="utf-8", index=False)

## readallcsv.py
# This file is used to read all CSV files generated from above script
import pandas as pd
import glob

# Get all valid file content
# merge it into one, and
# create a new CSV file with everything
for filetype in ["valid", "invalid"]:
	output = open(filetype + ".csv", "w")
	for file in glob.glob("data/email_chunk_*_" + filetype + ".csv"):
		for row in open(file, "r"):
			output.write(row)


## split_and_process.py
"""
Split large email list into small chunk,
deduplicate it and save it into separated file,
then run a validation script on all those files.

Yes, like doing Hadoop process manually in single machine.
"""

import pandas as pd
import sys

from subprocess import Popen

path = sys.argv[1]
chunksize = int(sys.argv[2])

email = pd.read_csv(path, header=None, dtype=object, chunksize=chunksize)

unique_email = []
for chunk in email:
    unique_email.append(chunk.astype(str).drop_duplicates())

chunkEmailLength = len(unique_email)

for i in range(chunkEmailLength):
    filename = "data/email_chunk_" + str(i) + ".csv"
    unique_email[i].to_csv(path_or_buf = filename, encoding="utf-8", header=False, index=False)

    valid_file = "data/email_chunk_" + str(i) + "_valid.csv"
    invalid_file = "data/email_chunk_" + str(i) + "_invalid.csv"

    Popen(["python2.7", "Clean_Email.py", filename, valid_file, invalid_file])
	# coding: utf-8

	import pandas as pd
	import numpy as np
	from validate_email import validate_email
	import sys

	# Note: you may encounter error if you don't have pyDNS or pyDNS3 installed.
	import DNS
	DNS.defaults["server"] = ["8.8.8.8", "8.8.4.4"]
	DNS.defaults["server_rotate"] = True
	# Default is UDP. But why TCP, because when I wrote this, I have over 300,000 emails, UDP gave me a lot of timeout error
	# even if I set the timeout second very low
	DNS.defaults["protocol"] = "tcp"
	# Timeout second for each checking
	DNS.defaults["timeout"] = 2

	# This is supposed to be a command line program,
	# The format would be like:
	#
	# python2.7 clean_email.py <your csv file with only 1 email column> <file_name_saved_valid_email> <file_name_saved_invalid_email>
	#
	if(sys.argv[1] != ''):
	path = sys.argv[1]
	else:
	path = "data/test_count_email.csv"
	sys.exit()


	email = pd.read_csv(path, header=None, dtype=object)

	unique_email = email.astype(str).drop_duplicates()

	# Check if email address is correct, and add it to valid_email array

	valid_email = []
	# vaild_email_address = []
	df_email = ''

	ok_file = open(sys.argv[2], "w")
	fail_file = open(sys.argv[3], "w")

	for i, row in unique_email.iterrows():
	try:
	email = str(row[1]).lower()

	is_valid_email = validate_email(email, verify=True, smtp_timeout=2)

	if(is_valid_email):
	ok_file.write(",".join([email, row[2]]) + "\n")
	else:
	fail_file.write(",".join([email, row[2]]) + "\n")

	except Exception as e:
	# print(str(e))
	# print("Current email address:" + email)

	# email that causes exception will be written as failed one
	fail_file.write(",".join([email, row[2]]) + "\n")
	pass

	# unique_email["valid"] = valid_email
	# all_valid_address = unique_email[unique_email["valid"]==True]


	# Write CSV file
	# all_valid_address.to_csv("valid_email.csv", header=False, encoding="utf-8", index=False)
	# This file is used to read all CSV files generated from above script
	import pandas as pd
	import glob

	# Get all valid file content
	# merge it into one, and
	# create a new CSV file with everything
	for filetype in ["valid", "invalid"]:
	output = open(filetype + ".csv", "w")
	for file in glob.glob("data/email_chunk_*_" + filetype + ".csv"):
	for row in open(file, "r"):
	output.write(row)
	"""
	Split large email list into small chunk,
	deduplicate it and save it into separated file,
	then run a validation script on all those files.

	Yes, like doing Hadoop process manually in single machine.
	"""

	import pandas as pd
	import sys

	from subprocess import Popen

	path = sys.argv[1]
	chunksize = int(sys.argv[2])

	email = pd.read_csv(path, header=None, dtype=object, chunksize=chunksize)

	unique_email = []
	for chunk in email:
	unique_email.append(chunk.astype(str).drop_duplicates())

	chunkEmailLength = len(unique_email)

	for i in range(chunkEmailLength):
	filename = "data/email_chunk_" + str(i) + ".csv"
	unique_email[i].to_csv(path_or_buf = filename, encoding="utf-8", header=False, index=False)

	valid_file = "data/email_chunk_" + str(i) + "_valid.csv"
	invalid_file = "data/email_chunk_" + str(i) + "_invalid.csv"

	Popen(["python2.7", "Clean_Email.py", filename, valid_file, invalid_file])