Last active
March 31, 2022 23:36
-
-
Save englehardt/802d1872d6bda2084723489a82540cb3 to your computer and use it in GitHub Desktop.
A utility file to retrieve and parse the Alexa Top 1 Million site list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from StringIO import StringIO | |
import requests | |
import zipfile | |
import random | |
import json | |
import os | |
# URL of the zipped top-1m CSV; get_top_1m downloads it when no cached copy exists.
EC2_LIST = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
def get_top_1m(location):
    """
    Returns the list of sites in the Alexa top 1 million.

    The raw CSV is cached as `top-1m.csv` inside `location`. If that file
    does not exist it is downloaded from `EC2_LIST` and written there first.
    An existing file is reused as-is; its age is not checked.

    @param location directory where the raw list is cached (`~` is expanded)
    @return list of site names in file order, one per non-blank CSV line
    """
    import io  # local import: the downloaded archive is handled as bytes

    location = os.path.expanduser(location)
    site_list = os.path.join(location, 'top-1m.csv')

    if not os.path.isfile(site_list):
        print("%s does not exist, downloading a copy." % site_list)
        resp = requests.get(EC2_LIST)
        # Fail on an HTTP error instead of handing an error page to zipfile.
        resp.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(resp.content), 'r') as zpf:
            raw = zpf.read(zpf.infolist()[0])
        if not os.path.isdir(location):
            os.makedirs(location)
        # The archive member is bytes, so it is written in binary mode.
        with open(site_list, 'wb') as f:
            f.write(raw)

    # Always parse the on-disk copy so the fresh-download and cached paths
    # return the same string type.
    with open(site_list, 'r') as f:
        contents = f.read()

    # Each line is `rank,site`. Blank lines are skipped so the file's trailing
    # newline does not add an empty site to the result.
    return [line.split(',')[-1]
            for line in contents.splitlines() if line.strip()]
def get_sampled_sites(location, include_rank=False,
                      slices=[(10000, 0, 10000),
                              (10000, 10000, 100000),
                              (15000, 100000, 1000000)]):
    """
    Returns a sample of top sites, cached as `sampled_sites.json` in `location`.

    If the cache file exists its parsed JSON content is returned unchanged,
    whatever `include_rank` and `slices` are. Otherwise a sample is drawn with
    `sample_top_sites`, saved to the cache file, and returned.

    Parameters
    ----------
    location : str
        Directory holding the cache file (`~` is expanded). It is created if
        it does not exist.
    include_rank : bool
        Passed through to `sample_top_sites` when a new sample is drawn.
    slices : list of tuples
        Passed through to `sample_top_sites` when a new sample is drawn.

    Returns
    -------
    list
        The sample as read from the cache, or the freshly drawn sample.
    """
    cache_dir = os.path.expanduser(location)
    cache_path = os.path.join(cache_dir, 'sampled_sites.json')

    if not os.path.isfile(cache_path):
        # No cached sample yet: draw one and persist it before returning.
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        sampled = sample_top_sites(cache_dir, include_rank, slices)
        with open(cache_path, 'w') as handle:
            handle.write(json.dumps(sampled))
        return sampled

    # A cached sample exists: hand it back as stored.
    with open(cache_path, 'r') as handle:
        return json.loads(handle.read())
def sample_top_sites(location, include_rank=False,
                     slices=[(10000, 0, 10000),
                             (10000, 10000, 100000),
                             (15000, 100000, 1000000)]):
    """
    Returns a subsample of sites from the top 1 million given by `slices`

    Parameters
    ----------
    location : str
        Location of top 1 million site list. If the list does not exist at this
        location it will be downloaded.
    include_rank : bool
        Indicates whether or not to include the alexa rank in the output sample
    slices : list of tuples
        List of slices to sample. Each slice should be given as follows:
        (# of sites, start_index, end_index)

    Returns
    -------
    list of str or list of tuples
        List of URLs sampled from the top 1m according to `slices`. If
        `include_rank` is True, this returns a list of `(int: rank, str: url)`
        where `rank` is the site's zero-based position in the list.

    Raises
    ------
    ValueError
        Raised by `random.sample` if a slice holds fewer sites than the
        number requested from it.
    """
    location = os.path.expanduser(location)
    top_1m = get_top_1m(location)
    if include_rank:
        # Build a real list of (position, site) pairs: the slicing below needs
        # a sequence, and zip() is only a lazy iterator on Python 3.
        top_1m = list(enumerate(top_1m))
    sites = list()
    for count, start, end in slices:
        sites.extend(random.sample(top_1m[start:end], count))
    return sites
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment