Skip to content

Instantly share code, notes, and snippets.

@sushain97
Last active May 24, 2017 07:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sushain97/5ed026c606295860c81c67d712b81811 to your computer and use it in GitHub Desktop.
Save sushain97/5ed026c606295860c81c67d712b81811 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import argparse
import hmac
import hashlib
import datetime
import urllib.parse
import collections
import operator
import base64
import logging
import functools
import itertools
import sys
import xml.etree.ElementTree as ET
import requests
# Immutable record types describing one parsed Alexa Top Sites result.

# Country-level metadata shared by every site in a single response.
Country = collections.namedtuple('Country', 'name code total_sites')

# Page-view statistics reported for a single site.
PageViews = collections.namedtuple('PageViews', 'per_million per_user')

# One ranked site; `country` holds a Country and `page_views` a PageViews.
Site = collections.namedtuple('Site', 'url country rank reach page_views')

# Logger used by this script; its level is set from the -v flag in script mode.
alexa_log = logging.getLogger('alexa_log')
class AlexaTopSites:
    """Minimal client for the ``TopSites`` action served at ``ats.amazonaws.com``.

    Requests are signed with HMAC-SHA1 (``SignatureVersion`` 2) using the
    secret given to the constructor, and the XML answers are turned into
    ``Site`` records.
    """

    # Endpoint host; it is also part of the string that gets signed.
    api = 'ats.amazonaws.com'
    # Prefix map used for every ElementTree lookup on a response.
    namespaces = {'aws': 'http://ats.amazonaws.com/doc/2005-11-21'}
    # Pagination stride: `start` advances by this much per request, and no
    # single request asks for more than this many sites.
    page_size = 100

    def __init__(self, id, secret):
        """Remember the credentials used to sign requests.

        Args:
            id: Access key id, sent as the ``AWSAccessKeyId`` parameter.
            secret: Access key secret, used as the HMAC key (must be ASCII).
        """
        self.id = id
        self.secret = secret

    @classmethod
    def _parse_response(cls, response):
        """Lazily yield one ``Site`` per ``aws:Site`` element in *response*.

        This is a generator, so nothing is parsed (and nothing is raised)
        until the result is iterated.

        Args:
            response: XML text of a ``TopSites`` response.

        Yields:
            ``Site`` tuples that all share a single ``Country`` instance.

        Raises:
            ValueError: if the document has no ``aws:TopSites`` element; the
                raw response text is the exception argument.
        """
        root = ET.fromstring(response)
        find = functools.partial(root.find, namespaces=cls.namespaces)
        findall = functools.partial(root.findall, namespaces=cls.namespaces)

        # `find` returns None when nothing matches. Compare against None
        # explicitly: an Element's truth value depends on whether it has
        # children, and testing it is deprecated.
        if find('.//aws:TopSites') is None:
            raise ValueError(response)

        country = Country(
            find('.//aws:CountryName').text,
            find('.//aws:CountryCode').text,
            int(find('.//aws:TotalSites').text),
        )

        for element in findall('.//aws:Site'):
            site_find = functools.partial(element.find, namespaces=cls.namespaces)
            page_views = PageViews(
                float(site_find('.//aws:PageViews/aws:PerMillion').text),
                float(site_find('.//aws:PageViews/aws:PerUser').text),
            )
            yield Site(
                site_find('.//aws:DataUrl').text,
                country,
                int(site_find('.//aws:Rank').text),
                float(site_find('.//aws:Reach/aws:PerMillion').text),
                page_views,
            )

    def _signed_params(self, start, count, country):
        """Build the signed query parameters for one ``TopSites`` request.

        Args:
            start: 1-based rank of the first site to request.
            count: Number of sites to request.
            country: Value for the ``CountryCode`` parameter.

        Returns:
            An ``OrderedDict`` of the request parameters sorted by name, with
            the base64-encoded ``Signature`` appended last.
        """
        timestamp = datetime.datetime.now(datetime.timezone.utc).strftime(
            '%Y-%m-%dT%H:%M:%S.000Z')
        unsigned = {
            'Action': 'TopSites',
            'AWSAccessKeyId': self.id,
            'Timestamp': timestamp,
            'ResponseGroup': 'Country',
            'Start': start,
            'Count': count,
            'CountryCode': country,
            'SignatureVersion': '2',
            'SignatureMethod': 'HmacSHA1',
        }
        # The signature is computed over the query string with parameters
        # sorted by name, so keep that order for the outgoing request too.
        params = collections.OrderedDict(
            sorted(unsigned.items(), key=operator.itemgetter(0)))
        query_string = urllib.parse.urlencode(params)
        sign_string = 'GET\n%s\n/\n%s' % (self.api, query_string)
        signature = hmac.new(
            self.secret.encode('ascii'),
            sign_string.encode('ascii'),
            hashlib.sha1,
        ).digest()
        # Added after signing: the signature is not part of the signed string.
        params['Signature'] = base64.b64encode(signature).decode('ascii')
        return params

    def sites(self, number, country='US', start=1):
        """Fetch *number* top sites for *country*, beginning at rank *start*.

        All HTTP requests are issued before this method returns; the XML of
        each page is parsed only when the returned iterator reaches it, so a
        ``ValueError`` for a response without ``aws:TopSites`` surfaces
        during iteration.

        Args:
            number: Total number of sites wanted.
            country: Country code to query.
            start: 1-based rank of the first site.

        Returns:
            An iterator of ``Site`` tuples across all requested pages.

        Raises:
            ValueError: if *start* is not positive.
        """
        if start <= 0:
            raise ValueError('start must be a natural number')

        responses = []
        while number > 0:
            # Never ask for more than one page: `start` moves forward by
            # page_size per request, so a larger Count would overlap the
            # next request's range.
            count = min(number, self.page_size)
            alexa_log.info('Requesting %d sites starting at %d.', count, start)
            params = self._signed_params(start, count, country)
            request = requests.get('http://%s' % self.api, params=params)
            responses.append(self._parse_response(request.text))
            number -= self.page_size
            start += self.page_size
        return itertools.chain.from_iterable(responses)
if __name__ == '__main__':
    # Script entry point: parse the command line, configure logging, then
    # print one line per site returned by AlexaTopSites.sites().
    cli = argparse.ArgumentParser(description='Get top X sites per Alexa')
    cli.add_argument(
        '-i', '--access-key-id',
        help='Alexa ROOT Access Key ID',
        required=True,
    )
    cli.add_argument(
        '-s', '--access-key-secret',
        help='Alexa ROOT Access Key Secret',
        required=True,
    )
    cli.add_argument(
        '-a', '--start',
        type=int,
        help='Starting position for sites',
        default=1,
    )
    cli.add_argument(
        '-n', '--number',
        type=int,
        help='Number of sites',
        default=10,
    )
    cli.add_argument('-c', '--country', help='Country code', default='US')
    cli.add_argument(
        '--verbose', '-v',
        action='count',
        help='Verbosity',
        default=0,
    )
    options = cli.parse_args()

    # cli.error() prints the usage message and exits, so these act as guards.
    if options.number <= 0:
        cli.error('Number of sites must be >= 1')
    if options.start <= 0:
        cli.error('Start must be natural number')

    logging.basicConfig(stream=sys.stdout)
    # Each -v moves one step down this list; extra flags stay at DEBUG.
    levels_by_verbosity = [
        logging.CRITICAL,
        logging.ERROR,
        logging.WARNING,
        logging.INFO,
        logging.DEBUG,
    ]
    chosen_level = levels_by_verbosity[min(options.verbose, 4)]
    alexa_log.setLevel(chosen_level)

    client = AlexaTopSites(options.access_key_id, options.access_key_secret)
    top_sites = client.sites(
        options.number,
        country=options.country,
        start=options.start,
    )
    for entry in top_sites:
        print(entry)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment