Skip to content

Instantly share code, notes, and snippets.

@mckelvin
Created April 29, 2012 12:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mckelvin/2550270 to your computer and use it in GitHub Desktop.
Save mckelvin/2550270 to your computer and use it in GitHub Desktop.
Statistics of email domains from csdn600w
#!/usr/bin/python
# author Kelvin Pan <ibmmc@live.com>
# 04/29/2012 17:13
from operator import itemgetter
# Input file that main() opens and scans line by line.
DATA_FILE='/tmp/www.csdn.net.sql'
# Alternative input path; not referenced by the code visible in this file.
TEST_DATA_FILE='./test.txt'
def cleaned_line(eachline):
    '''Extract the lower-cased, ASCII-only email domain from one input line.

    The last whitespace-separated token of the line is treated as the email
    address and everything after its final '@' as the domain (if the token
    has no '@', the whole token is used).  The '___csdn_1' marker is removed,
    the result is lower-cased, and any character with a code point of 128 or
    above is dropped.

    Returns an empty string for a blank or whitespace-only line.
    '''
    fields = eachline.split()
    if not fields:
        # Nothing to extract; indexing [-1] here would raise IndexError.
        return ''
    domain = fields[-1].split('@')[-1].replace('___csdn_1', '').lower()
    # Join explicitly so the result is a real string on Python 2 and 3 alike;
    # on Python 3 filter() would hand back a lazy iterator instead.
    return ''.join(ch for ch in domain if ord(ch) < 128)
def main():
    '''Print how often each email domain occurs in DATA_FILE.

    Every non-blank line of DATA_FILE is reduced to its email domain with
    cleaned_line(); the domains are counted and printed one per line as
    "<domain> <count>", sorted by count in descending order.
    '''
    data_store = {}
    # The with-block closes the file even if processing a line raises.
    with open(DATA_FILE) as txt:
        for eachline in txt:
            if not eachline.strip():
                # Blank lines carry no email address.
                continue
            domain = cleaned_line(eachline)
            # Count under the cleaned domain itself; the earlier code indexed
            # with an undefined name and failed with NameError.
            data_store[domain] = data_store.get(domain, 0) + 1
    data_store_after_sort = sorted(
        data_store.items(), key=itemgetter(1), reverse=True)
    for domain, count in data_store_after_sort:
        # Parenthesised single argument prints the same on Python 2 and 3.
        print('%-40s %d' % (domain, count))
# Run the statistics only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment