Last active
May 24, 2017 07:56
-
-
Save sushain97/5ed026c606295860c81c67d712b81811 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import hmac | |
import hashlib | |
import datetime | |
import urllib.parse | |
import collections | |
import operator | |
import base64 | |
import logging | |
import functools | |
import itertools | |
import sys | |
import xml.etree.ElementTree as ET | |
import requests | |
# Immutable records handed back to callers of AlexaTopSites.sites().
Country = collections.namedtuple('Country', ['name', 'code', 'total_sites'])
PageViews = collections.namedtuple('PageViews', ['per_million', 'per_user'])
Site = collections.namedtuple('Site', ['url', 'country', 'rank', 'reach', 'page_views'])

# Module-wide logger used for request tracing.
alexa_log = logging.getLogger('alexa_log')


class AlexaTopSites:
    """Client for the ``TopSites`` action served by ``ats.amazonaws.com``.

    Every request is signed with the supplied key pair
    (``SignatureVersion`` 2, ``HmacSHA1``) and results are fetched in pages
    of at most ``page_size`` sites.
    """

    # Host that is queried and that is part of the string to sign.
    api = 'ats.amazonaws.com'
    # Prefix map used for every XPath lookup in the XML response.
    namespaces = {'aws': 'http://ats.amazonaws.com/doc/2005-11-21'}
    # Largest number of sites asked for in one HTTP request.
    page_size = 100

    def __init__(self, id, secret):
        """Remember the access key ID and secret used to sign requests."""
        self.id = id
        self.secret = secret

    @classmethod
    def _parse_response(cls, response):
        """Lazily yield one ``Site`` per ``aws:Site`` element in *response*.

        Args:
            response: XML document (text) returned by the service.

        Yields:
            ``Site`` tuples, all sharing the ``Country`` parsed from the
            response.

        Raises:
            ValueError: if the document has no ``aws:TopSites`` element, or
                that element has no children. The raw response is the
                exception argument. Because this is a generator, the error
                surfaces on first iteration, not at call time.
        """
        root = ET.fromstring(response)
        ns = cls.namespaces

        top_sites = root.find('.//aws:TopSites', ns)
        # Explicit check instead of truth-testing the Element: Element
        # truthiness means "has children" and is deprecated.
        if top_sites is None or len(top_sites) == 0:
            raise ValueError(response)

        country = Country(
            root.find('.//aws:CountryName', ns).text,
            root.find('.//aws:CountryCode', ns).text,
            int(root.find('.//aws:TotalSites', ns).text),
        )

        for node in root.findall('.//aws:Site', ns):
            page_views = PageViews(
                float(node.find('.//aws:PageViews/aws:PerMillion', ns).text),
                float(node.find('.//aws:PageViews/aws:PerUser', ns).text),
            )
            yield Site(
                node.find('.//aws:DataUrl', ns).text,
                country,
                int(node.find('.//aws:Rank', ns).text),
                float(node.find('.//aws:Reach/aws:PerMillion', ns).text),
                page_views,
            )

    def sites(self, number, country='US', start=1):
        """Fetch *number* top sites for *country*, beginning at rank *start*.

        All HTTP requests are issued before this method returns; the XML of
        each page is parsed only when the returned iterator is consumed.

        Args:
            number: total number of sites wanted; nothing is requested when
                it is not positive.
            country: country code sent as ``CountryCode``.
            start: 1-based position of the first site.

        Returns:
            An iterator of ``Site`` tuples covering all requested pages.

        Raises:
            ValueError: immediately if *start* is not positive; during
                iteration if a response cannot be interpreted.
        """
        if start <= 0:
            raise ValueError('start must be a natural number')

        pages = []
        remaining = number
        while remaining > 0:
            # Never ask for more than one page at a time; the loop advances
            # by the same amount, so pages cannot overlap.
            count = min(remaining, self.page_size)
            alexa_log.info('Requesting %d sites starting at %d.', count, start)

            timestamp = datetime.datetime.now(datetime.timezone.utc)
            unsigned = [
                ('Action', 'TopSites'),
                ('AWSAccessKeyId', self.id),
                ('Timestamp', timestamp.strftime('%Y-%m-%dT%H:%M:%S.000Z')),
                ('ResponseGroup', 'Country'),
                ('Start', start),
                ('Count', count),
                ('CountryCode', country),
                ('SignatureVersion', '2'),
                ('SignatureMethod', 'HmacSHA1'),
            ]
            # The string to sign is built from the parameters sorted by name.
            params = collections.OrderedDict(sorted(unsigned, key=operator.itemgetter(0)))
            query_string = urllib.parse.urlencode(params)
            sign_string = 'GET\n%s\n/\n%s' % (self.api, query_string)
            digest = hmac.new(
                self.secret.encode('ascii'),
                sign_string.encode('ascii'),
                hashlib.sha1,
            ).digest()
            # Appended after signing so it is not part of the signed string.
            params['Signature'] = base64.b64encode(digest).decode('ascii')

            # NOTE(review): the request goes out over plain HTTP — confirm
            # whether the endpoint accepts HTTPS before switching schemes.
            reply = requests.get('http://%s' % self.api, params=params)
            pages.append(self._parse_response(reply.text))

            remaining -= count
            start += count

        return itertools.chain.from_iterable(pages)
if __name__ == '__main__':
    # Command-line entry point: read the credentials and the paging window,
    # map the -v count onto a log level, then print every site returned.
    cli = argparse.ArgumentParser(description='Get top X sites per Alexa')
    cli.add_argument('-i', '--access-key-id', required=True, help='Alexa ROOT Access Key ID')
    cli.add_argument('-s', '--access-key-secret', required=True, help='Alexa ROOT Access Key Secret')
    cli.add_argument('-a', '--start', default=1, type=int, help='Starting position for sites')
    cli.add_argument('-n', '--number', default=10, type=int, help='Number of sites')
    cli.add_argument('-c', '--country', default='US', help='Country code')
    cli.add_argument('--verbose', '-v', default=0, action='count', help='Verbosity')
    options = cli.parse_args()

    # Reject out-of-range values up front with a usage error.
    if options.number < 1:
        cli.error('Number of sites must be >= 1')
    if options.start < 1:
        cli.error('Start must be natural number')

    # Each additional -v selects the next, more talkative level; anything
    # beyond the last entry is clamped to DEBUG.
    verbosity_levels = (
        logging.CRITICAL,
        logging.ERROR,
        logging.WARNING,
        logging.INFO,
        logging.DEBUG,
    )
    chosen_level = verbosity_levels[min(options.verbose, len(verbosity_levels) - 1)]
    logging.basicConfig(stream=sys.stdout)
    alexa_log.setLevel(chosen_level)

    client = AlexaTopSites(options.access_key_id, options.access_key_secret)
    for entry in client.sites(options.number, country=options.country, start=options.start):
        print(entry)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment