Skip to content

Instantly share code, notes, and snippets.

@kanzure
Created April 15, 2013 04:22
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kanzure/5385691 to your computer and use it in GitHub Desktop.
Save kanzure/5385691 to your computer and use it in GitHub Desktop.
pbase crawler
# -*- coding: utf-8 -*-
import requests
import lxml.etree
from StringIO import StringIO
def parse_html(content):
    """
    Parse HTML into an lxml tree using lxml's HTMLParser, which is presumably
    more forgiving of badly formed markup than the default XML parser.

    content may be a StringIO instance (used as-is) or a str/unicode string
    (wrapped in a StringIO first). Any other type raises Exception.

    Returns the tree object produced by lxml.etree.parse.
    """
    if isinstance(content, StringIO):
        source = content
    elif isinstance(content, (str, unicode)):
        # lxml.etree.parse wants a file-like object, so wrap raw strings
        source = StringIO(content)
    else:
        raise Exception("input content must be a str or StringIO instead of " + str(type(content)))

    html_parser = lxml.etree.HTMLParser()
    return lxml.etree.parse(source, html_parser)
def get_camera_brands(verbose=True):
    """
    Download the pbase camera index page and collect the links that point at
    camera brands.

    Returns a list of hrefs (relative paths on the remote server). When
    verbose is true, each brand href is also printed.
    """
    response = requests.get("http://www.pbase.com/cameras")
    tree = parse_html(response.content)

    # all links in table cells whose href mentions /cameras/
    candidates = tree.xpath("//table/tr/td/a[contains(@href, '/cameras/')]/@href")

    # keep only hrefs with exactly two slashes; the rest are not brand links
    brand_urls = []
    for href in candidates:
        if href.count("/") == 2:
            brand_urls.append(href)

    if verbose:
        for href in brand_urls:
            print("brand: " + str(href))

    return brand_urls
def get_cameras_by_brand(brand_urls, verbose=True):
    """
    Visit every brand page and collect the links that refer to camera models.

    brand_urls is an iterable of relative paths on www.pbase.com, one per
    camera brand. For each brand page, every <a> whose href contains the
    brand's path is collected.

    Returns a single flat list of hrefs gathered across all brands, in the
    order the brands were given. When verbose is true, each href is printed
    as it is found.
    """
    model_urls = []
    for brand_url in brand_urls:
        response = requests.get("http://www.pbase.com" + brand_url)

        # parse_html accepts str/unicode/StringIO only, so hand it the
        # response body rather than the Response object itself.
        tree = parse_html(response.content)

        # Bind the brand path as an XPath variable instead of splicing it
        # into the expression, so a quote character in a scraped href cannot
        # break (or alter) the query.
        models = tree.xpath("//a[contains(@href, $brand)]/@href", brand=brand_url)
        model_urls.extend(models)

        if verbose:
            for model_url in models:
                print("model: " + str(model_url))

    return model_urls
def main(verbose=True):
    """
    Entry point for the crawler: list the camera brands, then list the
    camera models for those brands.

    Always ends by raising NotImplementedError, because crawling past the
    model listing has not been written.
    """
    brand_urls = get_camera_brands(verbose=verbose)

    # The returned model list is not used yet; the call is kept for the
    # crawl itself and its verbose output.
    get_cameras_by_brand(brand_urls, verbose=verbose)

    raise NotImplementedError("totally don't want to use random sampling to get complete coverage of 600k+ photos dude")
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment