Created
April 15, 2013 04:22
-
-
Save kanzure/5385691 to your computer and use it in GitHub Desktop.
pbase crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import requests | |
import lxml.etree | |
from StringIO import StringIO | |
def parse_html(content):
    """
    Parse HTML with lxml's HTMLParser and return the resulting tree.

    content may be a str/unicode string or an already-built StringIO; plain
    strings are wrapped in a StringIO before being handed to lxml. Any other
    type raises an Exception naming the offending type. Using HTMLParser is
    presumably more tolerant of badly formed markup than the default parser.
    """
    if isinstance(content, StringIO):
        source = content
    elif isinstance(content, (str, unicode)):
        source = StringIO(content)
    else:
        raise Exception("input content must be a str or StringIO instead of " + str(type(content)))

    return lxml.etree.parse(source, lxml.etree.HTMLParser())
def get_camera_brands(verbose=True):
    """
    Fetch the pbase camera index page and return the relative paths found
    on it that point at camera brands.

    When verbose is true, every kept brand path is printed before the list
    is returned.
    """
    page = requests.get("http://www.pbase.com/cameras")
    document = parse_html(page.content)

    # all links inside table cells whose href mentions /cameras/
    candidates = document.xpath("//table/tr/td/a[contains(@href, '/cameras/')]/@href")

    # keep only hrefs with exactly two slashes; anything else is treated as
    # not being a brand page
    brand_paths = []
    for href in candidates:
        if href.count("/") == 2:
            brand_paths.append(href)

    if verbose:
        for href in brand_paths:
            print("brand: " + str(href))

    return brand_paths
def get_cameras_by_brand(brand_urls, verbose=True):
    """
    Returns a list of relative paths on the remote server that refer to
    different camera models for all the camera brands.

    brand_urls -- iterable of relative brand paths (such as those returned
                  by get_camera_brands); each one is appended to the pbase
                  host name and fetched.
    verbose    -- when true, print every model path as it is collected.

    The result is one flat list holding, for each brand in the order given,
    every href on that brand's page which contains the brand's own path.
    An empty brand_urls yields an empty list without any network access.
    """
    model_urls = []

    for brand_url in brand_urls:
        response = requests.get("http://www.pbase.com" + brand_url)

        # parse_html only accepts str/unicode/StringIO, so it must be given
        # the response body (as get_camera_brands does), not the Response
        # object itself.
        tree = parse_html(response.content)

        # The brand path comes from scraped markup, so pass it as an XPath
        # variable instead of splicing it into the expression; a quote in
        # the path can then no longer break the query.
        models = tree.xpath("//a[contains(@href, $brand)]/@href", brand=brand_url)
        model_urls.extend(models)

        if verbose:
            for model_url in models:
                print("model: " + str(model_url))

    return model_urls
def main(verbose=True):
    """
    Entry point for the crawler.

    Collects the brand paths, then the model paths for those brands, passing
    verbose through to both steps. Nothing beyond that is implemented, so the
    function always finishes by raising NotImplementedError.
    """
    get_cameras_by_brand(get_camera_brands(verbose=verbose), verbose=verbose)
    raise NotImplementedError("totally don't want to use random sampling to get complete coverage of 600k+ photos dude")
# Run the crawler only when this file is executed as a script, so importing
# the module does not trigger any network requests.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment