Skip to content

Instantly share code, notes, and snippets.

@haykuro
Last active July 26, 2018 17:16
Show Gist options
  • Save haykuro/cfd3d8a92abf71e4d44e41705a10af9f to your computer and use it in GitHub Desktop.
Save haykuro/cfd3d8a92abf71e4d44e41705a10af9f to your computer and use it in GitHub Desktop.
from os.path import isfile
from zipfile import ZipFile

try:
    # Python 3 location of urlretrieve.
    from urllib.request import urlretrieve
except ImportError:
    # Python 2 fallback.
    from urllib import urlretrieve
# Print the id and domain of the first MAX_ROWS lines of the CSV stored in
# the zip archive at ZIP_URL, downloading the archive on first use.
ZIP_URL = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
ZIP_PATH = '/tmp/top-1m.zip'
MAX_ROWS = 500  # number of leading CSV lines to print

# The archive is cached on disk; it is fetched only when no file exists yet.
if not isfile(ZIP_PATH):
    urlretrieve(ZIP_URL, ZIP_PATH)

CONTENT = []
# Context managers close the archive and its member even if reading fails.
with ZipFile(ZIP_PATH, 'r') as ZFILE:
    for name in ZFILE.namelist():
        if ".csv" in name:
            with ZFILE.open(name) as EX_FILE:  # this is a file-like object
                # read() returns bytes on Python 3; decode so the text
                # handling below behaves the same on Python 2 and 3.
                # splitlines() also drops a trailing '\r' from CRLF data.
                CONTENT = EX_FILE.read().decode('utf-8').splitlines()
            break  # only the first CSV member is used

for line in CONTENT[:MAX_ROWS]:
    if line:  # skip blank lines (equality by value, not identity)
        # Split on the first comma only so an unexpected extra comma in the
        # second field cannot break the two-value unpacking.
        d_id, domain = line.split(',', 1)
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("id: %s, domain: %s" % (d_id, domain))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment