Skip to content

Instantly share code, notes, and snippets.

@jjam3774
Created February 17, 2014 17:27
Show Gist options
  • Save jjam3774/d37d2467e2e3749c6208 to your computer and use it in GitHub Desktop.
Save jjam3774/d37d2467e2e3749c6208 to your computer and use it in GitHub Desktop.
load with akamai
#!/usr/bin/python
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <codecell>
from httplib import HTTPConnection
import time
import re
import random
# <codecell>
# User-Agent header values that test_url() can send with its GET request.
# Googlebot crawler identity; this is test_url()'s default user agent.
GOOGLEBOT_USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
# Desktop Chrome 28 on Mac OS X 10.8.4 identity; the script's main loop
# passes this one explicitly.
CHROME_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36"
# <codecell>
def convert_to_akamai_url(url):
    '''
    Map a thematic page URL onto its Akamai (brcdn.com) fetch URL.

    The scheme and host of `url` are dropped, the remaining path is
    appended to the fixed Akamai prefix, and every '-th.html' in the path
    is rewritten to '.html'.

    Returns the Akamai URL as a string. A URL with no path component
    yields just the Akamai prefix.
    '''
    akamai_prefix = 'http://thematic-id-5017.brcdn.com/fetch_thematic/5017/gmtnjthiky37113i/v1/'
    # Strip "<scheme>://<host>/" explicitly rather than slicing a fixed 28
    # characters: the fixed offset is only right for a prefix of exactly
    # that length (e.g. 'http://www.neimanmarcus.com/') and mangles the
    # path for any other host or for https URLs.
    _, sep, rest = url.partition('://')
    if not sep:
        # No scheme present; treat the whole string as "host/path".
        rest = url
    path = rest.partition('/')[2]
    return akamai_prefix + path.replace('-th.html', '.html')
# <codecell>
def test_url(url, match_url, user_agent=GOOGLEBOT_USER_AGENT):
    '''
    Fetch a thematic page and check that it declares the expected
    canonical URL.

    url        -- page to request. A leading 'http://' is optional, but a
                  '/' separating the host from the path is required.
    match_url  -- URL that must appear as the href of the page's
                  rel=canonical link for the check to succeed.
    user_agent -- value sent in the 'user-agent' request header.

    Returns a triple (success, reason, time_used):
      (True, '', seconds)       the canonical link for match_url was found;
      (False, 'Response code <status> for url: <match_url>', seconds)
                                the canonical link was not found (the HTTP
                                status is reported whatever its value);
      (False, message, -1)      the URL has no '/' after the host, or the
                                request raised an exception.
    time_used runs from just before the connection object is created
    until getresponse() returns; reading the body is not included.
    '''
    if url.startswith('http://'):
        # Slice the scheme off instead of using split('http://')[1], which
        # would also throw away everything after a second 'http://'
        # embedded later in the URL (e.g. inside a query parameter).
        url = url[len('http://'):]
    i = url.find('/')
    if i == -1:
        return (False, 'Not a valid URL.', -1)
    host, path = url[:i], url[i:]
    try:
        started = time.time()
        conn = HTTPConnection(host)
        try:
            conn.request('GET', path, headers={'user-agent': user_agent})
            res = conn.getresponse()
            time_used = time.time() - started
            msg = res.read()
        finally:
            # Release the socket even when the request or the read fails.
            conn.close()
        # uncomment to write header file (also needs `import json`)
        # with open(url.split('/')[-1] + '.json', 'w') as f:
        #     f.write(json.dumps(res.getheaders()))
        # with open(url.split('/')[-1] + '.txt', 'w') as f:
        #     f.write(msg)
        m = re.search(r'rel=["]?canonical["]?\s+href=["]?%s["]?' % re.escape(match_url), msg)
        if not m:
            return (False, 'Response code %s for url: %s' % (str(res.status), match_url), time_used)
        return (True, '', time_used)
    except Exception as e:
        # Best-effort probe: any failure is reported in the result triple
        # rather than raised, so the caller's loop keeps going.
        return (False, str(e), -1)
# <codecell>
# Load the list of page URLs to test, one URL per line.
filename = 'neimanmarcus_urls.txt'
with open(filename) as f:
    # Strip surrounding whitespace (including a '\r' left by CRLF line
    # endings) and skip blank lines, so a trailing newline in the file does
    # not produce an empty "URL" that would be tested and slept on.
    urls = [line.strip() for line in f if line.strip()]
# <codecell>
# Visit the URLs in random order. For each one, probe the page directly and
# then through its Akamai URL, printing "<url> <result triple>" for both.
random.shuffle(urls)
for url in urls:
    akamai_url = convert_to_akamai_url(url)

    # Direct request: the page must name itself as canonical.
    res = test_url(url, url, user_agent=CHROME_USER_AGENT)
    print('%s %s' % (url, res))

    # Akamai request: the page must still name the original URL as canonical.
    res_akamai = test_url(akamai_url, url, user_agent=CHROME_USER_AGENT)
    print('%s %s' % (akamai_url, res_akamai))

    # Pause one second between URLs.
    time.sleep(1)
# <codecell>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment