# haykuro/alexa_top500.py

## alexa_top500.py
# Fetch the Alexa "top-1m" ranking archive (only when it is not already cached
# at ZIP_PATH) and print the first TOP_N rows of the CSV it contains, one per
# line, as "id: <rank>, domain: <name>".
#
# Runs unchanged on Python 2.7 and Python 3.
import os
from itertools import islice
from os.path import isfile
from zipfile import ZipFile

try:
  from urllib import urlretrieve  # Python 2
except ImportError:
  from urllib.request import urlretrieve  # Python 3

ZIP_URL = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
ZIP_PATH = '/tmp/top-1m.zip'
TOP_N = 500  # number of leading CSV lines to report

if not isfile(ZIP_PATH):
  # Download to a side file and rename it into place afterwards, so that an
  # interrupted transfer never leaves a truncated archive at ZIP_PATH that
  # every later run would mistake for a valid cache.
  PART_PATH = ZIP_PATH + '.part'
  urlretrieve(ZIP_URL, PART_PATH)
  os.rename(PART_PATH, ZIP_PATH)

# Raw lines (newline still attached) of the first CSV member, capped at TOP_N.
CONTENT = []

# Context managers close both the archive and the extracted member even if
# reading fails part-way through.
with ZipFile(ZIP_PATH, 'r') as ZFILE:
  for name in ZFILE.namelist():
    if ".csv" in name:
      with ZFILE.open(name) as EX_FILE:  # this is a file like object
        # Stream only the lines that will be printed instead of loading and
        # splitting the whole (roughly one million line) file.
        CONTENT = list(islice(EX_FILE, TOP_N))
      break  # only the first CSV member is used

for line in CONTENT:
  if not isinstance(line, str):
    # Python 3 yields bytes from a zip member; Python 2 already yields str.
    line = line.decode('utf-8')
  line = line.strip()  # drop the trailing "\n" / "\r\n"
  # Test for emptiness by value: `line is not ""` compares object identity,
  # which is not guaranteed to be False for an empty string.
  if line:
    # maxsplit=1 keeps the unpacking safe if a domain ever contains a comma.
    (d_id, domain) = line.split(',', 1)
    # Single parenthesised argument: prints identically on Python 2 and 3.
    print("id: %s, domain: %s" % (d_id, domain))