Skip to content

Instantly share code, notes, and snippets.

@englehardt
Last active March 31, 2022 23:36
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save englehardt/802d1872d6bda2084723489a82540cb3 to your computer and use it in GitHub Desktop.
Save englehardt/802d1872d6bda2084723489a82540cb3 to your computer and use it in GitHub Desktop.
A utility file to retrieve and parse the Alexa Top 1 Million site list
from StringIO import StringIO
import requests
import zipfile
import random
import json
import os
EC2_LIST = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
def get_top_1m(location):
    """
    Returns the list of top 1 million sites, in the order they appear in
    the cached CSV. If no cached copy exists at `location`, a new one is
    downloaded from EC2_LIST and stored there. An existing copy is reused
    as-is regardless of its age.
    @param location directory where the raw list (top-1m.csv) is cached
    @return list of str, the last comma-separated field of every
            non-blank line of the CSV
    @raise requests.HTTPError if the download returns an error status
    """
    # Imported locally so this function does not depend on the module-level
    # (Python 2 only) StringIO import; io.BytesIO accepts the raw zip bytes
    # on both Python 2 and Python 3.
    import io

    location = os.path.expanduser(location)
    site_list = os.path.join(location, 'top-1m.csv')
    if not os.path.isfile(site_list):
        print("%s does not exist, downloading a copy." % site_list)
        resp = requests.get(EC2_LIST)
        # Fail loudly on 4xx/5xx instead of handing an error page to ZipFile.
        resp.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(resp.content), 'r') as zpf:
            raw = zpf.read(zpf.infolist()[0])
        if not os.path.isdir(location):
            os.makedirs(location)
        # The archive member is bytes; write it back unchanged.
        with open(site_list, 'wb') as f:
            f.write(raw)
        # On Python 2 bytes is str already; on Python 3 decode to text.
        contents = raw if isinstance(raw, str) else raw.decode('utf-8')
    else:
        with open(site_list, 'r') as f:
            contents = f.read()
    # splitlines() copes with both LF and CRLF endings, and blank lines
    # (including the one after a trailing newline) are skipped so no empty
    # "site" ends up in the result.
    return [line.split(',')[-1]
            for line in contents.splitlines()
            if line.strip()]
def get_sampled_sites(location, include_rank=False,
                      slices=[(10000, 0, 10000),
                              (10000, 10000, 100000),
                              (15000, 100000, 1000000)]):
    """
    Returns a sample of sites, cached on disk as `sampled_sites.json`

    Parameters
    ----------
    location : str
        Directory holding the cached sample; `~` is expanded. It is created
        if it does not exist.
    include_rank : bool
        Passed through to `sample_top_sites` when a new sample is drawn.
    slices : list of tuples
        Passed through to `sample_top_sites` when a new sample is drawn.

    Returns
    -------
    list
        The contents of `sampled_sites.json` if that file already exists
        (returned without looking at `include_rank` or `slices`), otherwise
        a freshly drawn sample, which is also written to that file.
        Note that entries read back from JSON are lists, not tuples.
    """
    cache_dir = os.path.expanduser(location)
    cache_path = os.path.join(cache_dir, 'sampled_sites.json')

    # No cached sample yet: draw one, persist it, and hand it back
    if not os.path.isfile(cache_path):
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        sampled = sample_top_sites(cache_dir, include_rank, slices)
        with open(cache_path, 'w') as out:
            json.dump(sampled, out)
        return sampled

    # A cached sample exists: reuse it verbatim
    with open(cache_path, 'r') as src:
        return json.load(src)
def sample_top_sites(location, include_rank=False,
                     slices=[(10000, 0, 10000),
                             (10000, 10000, 100000),
                             (15000, 100000, 1000000)]):
    """
    Returns a random subsample of sites from the top 1 million given by
    `slices`

    Parameters
    ----------
    location : str
        Location of top 1 million site list. If the list does not exist at
        this location it will be downloaded.
    include_rank : bool
        Indicates whether or not to pair every sampled site with its
        position in the list.
    slices : list of tuples
        List of slices to sample. Each slice should be given as follows:
        (# of sites, start_index, end_index)
        The indices are ordinary Python slice bounds into the site list.
        The default list is only read, never modified.

    Returns
    -------
    list of str or list of tuples
        List of URLs sampled from the top 1m according to `slices`. If
        `include_rank` is True, this returns a list of
        `(int: rank, str: url)`, where `rank` is the 0-based index of the
        site in the list returned by `get_top_1m`.

    Raises
    ------
    ValueError
        Raised by `random.sample` if a slice holds fewer sites than the
        number requested from it.
    """
    location = os.path.expanduser(location)
    top_1m = get_top_1m(location)
    total = len(top_1m)

    sites = list()
    for count, start, end in slices:
        # Resolve negative / None / out-of-range bounds to the real indices
        # so the attached ranks match the positions in the full list.
        first, stop, _ = slice(start, end).indices(total)
        candidates = top_1m[first:stop]
        if include_rank:
            # Pair only this slice with its indices rather than zipping the
            # whole list up front.
            candidates = list(enumerate(candidates, first))
        sites.extend(random.sample(candidates, count))
    return sites
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment