Skip to content

Instantly share code, notes, and snippets.

@mckoss
Created November 30, 2009 02:36
Show Gist options
  • Save mckoss/245228 to your computer and use it in GitHub Desktop.
Save mckoss/245228 to your computer and use it in GitHub Desktop.
AMZN.com Dictionary Crawler
import httplib
import urllib2
import sys
# TODO: Speed up by searching on multiple connections, say, 10
def DictSearch(sAfter):
    """Probe amzn.com for every dictionary word that sorts after *sAfter*.

    Reads one word per line from ``dict.txt`` and, for each word that
    compares greater than *sAfter*, calls ``Test`` on a single shared HTTP
    connection. Results are appended to ``output.csv`` after a header line
    that records *sAfter*.

    Passing the last word of a previous run as *sAfter* skips everything up
    to and including that word (this presumes ``dict.txt`` is sorted --
    TODO confirm).

    Ctrl-C stops the crawl quietly. The word list, the output file and the
    connection are released however the crawl ends.
    """
    with open("dict.txt") as word_file:
        with open("output.csv", 'a') as f:
            f.write("Dictionary test of AMZN.COM keywords (after: %s)\n" % sAfter)
            conn = httplib.HTTPConnection("amzn.com")
            try:
                for line in word_file:
                    word = line.strip()
                    if word > sAfter:
                        Test(word, conn, f)
            except KeyboardInterrupt:
                # Interrupting is the expected way to stop a long crawl;
                # fall through so everything is closed cleanly.
                pass
            finally:
                # Runs for every exit path, not only the Ctrl-C one.
                conn.close()
def Test(word, conn, f):
    """Issue a HEAD request for ``/<word>`` and log the outcome.

    word -- path component to probe.
    conn -- open connection object providing request() / getresponse().
    f    -- writable file; receives a "<word>, <Location>" line whenever
            the response status is anything other than 404 Not Found.

    Every probe is also echoed to stdout with its status, reason and
    Location header.
    """
    path = "/%s" % word
    conn.request("HEAD", path)
    response = conn.getresponse()
    location = response.getheader('Location')

    print("%s: %d (%s) -> %s" % (word, response.status, response.reason, location))

    if response.status != httplib.NOT_FOUND:
        f.write("%s, %s\n" % (word, location))

    # Finish with this response before returning; the caller reuses the
    # same connection for the next word.
    response.read()
def MapURLs():
    """Resolve every keyword listed in ``found.txt`` to its final URL.

    Reads one keyword per line from ``found.txt`` and passes each
    (whitespace-stripped) keyword to ``Fetch``, which appends its result to
    ``lookup.csv``. A header line is appended to ``lookup.csv`` first.

    Ctrl-C stops the run quietly. Both files are closed however the run
    ends, including when ``Fetch`` raises.
    """
    with open("found.txt") as words:
        with open("lookup.csv", 'a') as f:
            f.write("URLs of AMZN.COM keywords\n")
            try:
                for word in words:
                    Fetch(word.strip(), f)
            except KeyboardInterrupt:
                # Interrupting is the expected way to stop early; the
                # enclosing with-blocks still close both files.
                pass
def Fetch(word, f=None):
    """Open ``http://amzn.com/<word>`` and record the URL it resolves to.

    word -- keyword appended to the amzn.com base URL.
    f    -- writable file for the "<word>, <url>" result line; defaults to
            sys.stdout.

    The recorded URL is whatever ``geturl()`` reports for the opened
    resource. If opening fails for any reason, the error is printed and
    ``None`` is recorded in place of the URL, so one bad keyword does not
    abort a batch run. When *f* is a real file, progress is also echoed to
    stdout.
    """
    if f is None:
        f = sys.stdout

    sURL = None
    try:
        h = urllib2.urlopen("http://amzn.com/%s" % word)
        try:
            sURL = h.geturl()
        finally:
            # Release the socket; a batch run opens one handle per keyword.
            h.close()
    except Exception as e:
        # Deliberately broad best-effort handling: report and carry on.
        # KeyboardInterrupt is not an Exception subclass, so Ctrl-C still
        # reaches the caller.
        print(e)
        sURL = None

    if f is not sys.stdout:
        print("%s -> %s" % (word, sURL))
    f.write("%s, %s\n" % (word, sURL))
# Example usage: DictSearch('a')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment