@miglen
Created January 5, 2016 09:18
Dead simple {for devs} python crawler (script) for extracting structured data from any website into CSV
#!/usr/bin/env python
#
# Source: http://blog.webhose.io/2015/08/16/dead-simple-for-devs-python-crawler-script-for-extracting-structured-data-from-any-almost-website-into-csv/
import sys, thread, Queue, re, urllib2, urlparse, time, csv
### Set the site you want to crawl & the patterns of the fields you want to extract ###
siteToCrawl = "http://www.amazon.com/"
fields = {}
fields["Title"] = '<title>(.*?)</title>'
fields["Rating"] = 'title="(S+) out of 5 stars"'
fields["Price"] = 'data-price="(.*?)"'
fields["Image"] = 'src="(http://ecx.images-amazon.com/images/I/.*?)"'
########################################################################
dupcheck = set()  # URLs already seen, so the same page is not crawled twice
q = Queue.Queue(25)  # bounded queue of links waiting to be fetched
q.put(siteToCrawl)
csvFile = open("output.csv", "w", 0)  # 0 = unbuffered, so rows hit disk as they are written
csvTitles = dict(fields)
csvTitles["Link"] = ""  # extra column for the page URL alongside the extracted fields
writer = csv.DictWriter(csvFile, fieldnames=csvTitles)
writer.writeheader()
def queueURLs(html, origLink):
    # Find every <a href="..."> on the page and queue links that stay on the target domain
    for url in re.findall('''<a[^>]+href=["'](.[^"']+)["']''', html, re.I):
        try:
            if url.startswith("http") and urlparse.urlparse(url).netloc != urlparse.urlparse(siteToCrawl).netloc:  # Make sure we keep crawling the same domain
                continue
        except Exception:
            continue
        # Drop the fragment; resolve relative URLs against the originating page's scheme and host
        link = url.split("#", 1)[0] if url.startswith("http") else '{uri.scheme}://{uri.netloc}'.format(uri=urlparse.urlparse(origLink)) + url.split("#", 1)[0]
        if link in dupcheck:
            continue
        dupcheck.add(link)
        if len(dupcheck) > 99999:  # cap the memory used by the seen-URL set
            dupcheck.clear()
        q.put(link)
def analyzePage(html, link):
    print "Analyzing %s" % link
    output = {}
    for key, value in fields.iteritems():
        m = re.search(fields[key], html, re.I | re.S)
        if m:
            output[key] = m.group(1)
    output["Link"] = link
    writer.writerow(output)
def getHTML(link):
    try:
        request = urllib2.Request(link)
        request.add_header('User-Agent', 'Structured Data Extractor')
        html = urllib2.build_opener().open(request).read()
        analyzePage(html, link)
        queueURLs(html, link)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception, e:
        print e
# Main loop: fetch each queued link in its own thread, starting at most one new thread every 0.5 seconds
while True:
    thread.start_new_thread(getHTML, (q.get(),))
    time.sleep(0.5)
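
To point the crawler at a different site, only siteToCrawl and the fields regex dictionary need to change. Below is a minimal, standalone sketch (not part of the original gist) of how those field patterns are applied to raw HTML; the sample_html fragment is invented purely for illustration, while the Title and Price patterns are the ones defined above.

import re

# Invented HTML fragment for illustration only
sample_html = '<title>Example Product</title><span data-price="19.99"></span>'

# Two of the field patterns from the crawler above
patterns = {
    "Title": '<title>(.*?)</title>',
    "Price": 'data-price="(.*?)"',
}

for name, pattern in patterns.items():
    m = re.search(pattern, sample_html, re.I | re.S)
    if m:
        print "%s: %s" % (name, m.group(1))

Each field is matched independently with re.search, so a page that lacks one of the patterns still produces a CSV row; csv.DictWriter simply leaves that column empty.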