Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Get a list of unique domain names from comScore browsing data
#
# Get All Unique Domain Names from comScore
#
# INPUT: comScore browsing data file
#
# OUTPUT: a text file containing a list of unique domains
#
# PARAMETERS:
# + INTERNET_USAGE_FILE: path to the comScore browsing data
# + FINAL_OUTPUT_FILE: path to intended output file
#
import time
# Path to the comScore browsing data file; the script reads it line by line.
INTERNET_USAGE_FILE = "../../web_browsing_2004"
# Path of the text file the unique domains are written to, one per line.
FINAL_OUTPUT_FILE = "all_unique_domains.txt"
def simplifyDomain(myStr):
    """Reduce a URL or host string to its bare domain name.

    Surrounding whitespace, a leading ``http://`` or ``https://`` scheme,
    everything from the first ``/`` onwards, and a leading ``www.`` label
    are removed.  Only *prefixes* are stripped: a host that merely contains
    the text ``www.`` (e.g. ``awww.example.com``) is left intact.

    Args:
        myStr: Raw URL or host string.

    Returns:
        The simplified domain; an empty string if nothing is left.
    """
    domain = myStr.strip()
    # Drop the scheme only when it really is a prefix of the URL.
    for scheme in ("http://", "https://"):
        if domain.startswith(scheme):
            domain = domain[len(scheme):]
            break
    # Keep the host part only (everything before the first path separator).
    domain = domain.split("/")[0].strip()
    # Remove a leading "www." label, never an occurrence inside the host.
    if domain.startswith("www."):
        domain = domain[len("www."):]
    return domain
# Scan the browsing data once, collect every distinct simplified domain taken
# from the second comma-separated column, then write the result to
# FINAL_OUTPUT_FILE with one domain per line.
#
# print() is always called with a single pre-formatted string so the script
# behaves the same under Python 2 and Python 3.
print(" Start processing the data file %s" % (INTERNET_USAGE_FILE,))
count = 0
skipped = 0  # lines that have no second column (e.g. blank lines)
prev_timer = time.time()
UNIQUE_DOMAINS = set()
with open(INTERNET_USAGE_FILE) as myfile:
    for count, line in enumerate(myfile, 1):
        # Monitor the progress: report once every million lines, with the
        # time elapsed since the start of the scan.
        if count % 1000000 == 0:
            difference = int(time.time() - prev_timer)
            # Floor division keeps the millions counter an integer on Python 3.
            print(" + Processing line %s.000.000th time_till_now=%s(s)" % (count // 1000000, difference,))
        if count == 1:  # Skip the first row
            continue
        datas = line.split(",")
        if len(datas) < 2:
            # No domain column on this line; skip it instead of aborting the
            # whole run with an IndexError.
            skipped += 1
            continue
        UNIQUE_DOMAINS.add(simplifyDomain(datas[1]))
if skipped:
    print(" + Skipped %s line(s) without a domain column" % (skipped,))
print("len(UNIQUE_DOMAINS) = %s" % (len(UNIQUE_DOMAINS),))
# Write to file. The context manager closes the file even if a write fails;
# sorting makes the output reproducible from run to run.
with open(FINAL_OUTPUT_FILE, "w") as my_output_file:
    my_output_file.writelines(d + "\n" for d in sorted(UNIQUE_DOMAINS))
print("Done, write the final output to %s" % (FINAL_OUTPUT_FILE,))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment