Skip to content

Instantly share code, notes, and snippets.

@toinetoine
Last active September 9, 2015 00:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save toinetoine/a232ab932c7b3e8ea812 to your computer and use it in GitHub Desktop.
Save toinetoine/a232ab932c7b3e8ea812 to your computer and use it in GitHub Desktop.
Script to tally the domains of emails contained in the Ashley Madison email file
# Snapshot cadence: the running domain counts are written to a file each time
# this many emails have been read.
increment_to_save = 10 ** 6
def write_to_file(file_name, domains):
    """Write the domain tallies to ``file_name`` as ``domain,count`` lines.

    Rows are ordered from the highest count to the lowest. Entries with equal
    counts are written in the reverse of their order in ``domains``.

    Args:
        file_name: Path of the results file; an existing file is overwritten.
        domains: Iterable of dicts, each holding a "domain" string and a
            "count" integer. The caller's collection is not modified.
    """
    from operator import itemgetter

    # Stable ascending sort, walked backwards below so the largest counts
    # come first.
    ascending = sorted(domains, key=itemgetter("count"))

    # The context manager closes the file even if a write raises part-way.
    with open(file_name, "w") as results_file:
        results_file.writelines(
            entry["domain"] + "," + str(entry["count"]) + "\n"
            for entry in reversed(ascending)
        )
# Tally how many addresses in the dump belong to each email domain.
#
# ``domains`` holds one {"domain": ..., "count": ...} record per distinct
# domain, in first-seen order; it is the structure passed to write_to_file().
domains = list()
# Maps a domain string to its record in ``domains`` so that each address costs
# one dictionary lookup instead of a scan over every domain seen so far.
domain_index = {}

# 1-based number of the line currently being processed
email_number = 1

# The context manager closes the dump even if processing raises part-way.
with open("emails_dump.txt", "r") as emails_file:
    for email in emails_file:
        # lines without an "@" carry no domain and are not tallied
        if "@" in email:
            # the domain is everything after the last "@", minus the newline
            email_domain = email.rsplit("@", 1)[-1].rstrip('\n')
            record = domain_index.get(email_domain)
            if record is None:
                # first occurrence of this domain: register a new record
                record = {"domain": email_domain, "count": 0}
                domain_index[email_domain] = record
                domains.append(record)
            record["count"] += 1

        # for every 1 million email addresses read, write the current domain
        # counts to a numbered snapshot file
        # NOTE(review): the original indentation was lost; this assumes the
        # counter advances for every line read, with or without "@" -- confirm.
        if email_number % increment_to_save == 0:
            write_to_file(
                "domain_counts_" + str(email_number // increment_to_save) + ".csv",
                domains,
            )
        email_number += 1

# write the final domain counts to a file
write_to_file("final_domain_counts.csv", domains)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment