# kanzure/pbase.py

## pbase.py
# -*- coding: utf-8 -*-
import requests
import lxml.etree
from StringIO import StringIO

def parse_html(content):
    """
    Build an lxml tree from HTML content using lxml's HTMLParser, which
    presumably tolerates poorly formatted markup.

    ``content`` may be a StringIO instance, which is handed to lxml as-is, or
    a str/unicode string, which is wrapped in a StringIO first. Anything else
    raises an Exception whose message names the offending type.
    """
    if isinstance(content, StringIO):
        # already file-like, use it directly
        source = content
    elif isinstance(content, (str, unicode)):
        # lxml.etree.parse wants a file-like object, so wrap the raw text
        source = StringIO(content)
    else:
        raise Exception("input content must be a str or StringIO instead of " + str(type(content)))

    return lxml.etree.parse(source, lxml.etree.HTMLParser())

def get_camera_brands(verbose=True):
    """
    Fetch the pbase camera index page and return the relative paths found on
    it that refer to camera brands.

    When ``verbose`` is true, every returned path is also printed with a
    "brand: " prefix.
    """
    listing = requests.get("http://www.pbase.com/cameras")
    document = parse_html(listing.content)

    # hrefs of table-cell links that mention /cameras/
    camera_links = document.xpath("//table/tr/td/a[contains(@href, '/cameras/')]/@href")

    # only hrefs with exactly two slashes are kept; the others don't refer
    # to brands
    brand_links = []
    for link in camera_links:
        if link.count("/") == 2:
            brand_links.append(link)

    if verbose:
        for link in brand_links:
            # a single parenthesised argument prints the same under Python 2
            print("brand: " + str(link))

    return brand_links

def get_cameras_by_brand(brand_urls, verbose=True):
    """
    Returns a list of relative paths on the remote server that refer to
    different camera models for all the camera brands.

    ``brand_urls`` is an iterable of relative brand paths; each one is
    appended to http://www.pbase.com and fetched. Every href on the fetched
    page that contains the brand path is collected. When ``verbose`` is true,
    each collected href is printed with a "model: " prefix.
    """
    model_urls = []
    for brand_url in brand_urls:
        response = requests.get("http://www.pbase.com" + brand_url)

        # parse_html only accepts str/unicode/StringIO, so pass the response
        # body rather than the Response object itself
        tree = parse_html(response.content)

        # bind the brand path as an XPath variable so that quote characters
        # in it cannot break (or alter) the expression
        models = tree.xpath("//a[contains(@href, $brand)]/@href", brand=brand_url)
        model_urls.extend(models)

        if verbose:
            for model_url in models:
                # a single parenthesised argument prints the same under Python 2
                print("model: " + str(model_url))

    return model_urls

def main(verbose=True):
    """
    Entry point of the crawler: collect the brand paths, then the model paths
    for those brands.

    Nothing past those two crawls is implemented, so this always ends by
    raising NotImplementedError.
    """
    brand_paths = get_camera_brands(verbose=verbose)
    model_paths = get_cameras_by_brand(brand_paths, verbose=verbose)
    raise NotImplementedError("totally don't want to use random sampling to get complete coverage of 600k+ photos dude")

# run the crawler only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()
	# -*- coding: utf-8 -*-
	import requests
	import lxml.etree
	from StringIO import StringIO

	def parse_html(content):
	    """
	    Build an lxml tree from HTML content using lxml's HTMLParser, which
	    presumably tolerates poorly formatted markup.

	    ``content`` may be a StringIO instance, which is handed to lxml as-is,
	    or a str/unicode string, which is wrapped in a StringIO first. Anything
	    else raises an Exception whose message names the offending type.
	    """
	    if not isinstance(content, StringIO):
	        if not isinstance(content, (str, unicode)):
	            raise Exception("input content must be a str or StringIO instead of " + str(type(content)))
	        # lxml.etree.parse wants a file-like object, so wrap the raw text
	        content = StringIO(content)
	    parser = lxml.etree.HTMLParser()
	    tree = lxml.etree.parse(content, parser)
	    return tree

	def get_camera_brands(verbose=True):
	    """
	    Fetch the pbase camera index page and return the relative paths found
	    on it that refer to camera brands.

	    When ``verbose`` is true, every returned path is also printed with a
	    "brand: " prefix.
	    """
	    url = "http://www.pbase.com/cameras"
	    response = requests.get(url)
	    tree = parse_html(response.content)

	    # extract links to camera brands
	    urls = tree.xpath("//table/tr/td/a[contains(@href, '/cameras/')]/@href")

	    # only hrefs with exactly two slashes are kept; the others don't refer
	    # to brands
	    ok_urls = [candidate for candidate in urls if candidate.count("/") == 2]

	    if verbose:
	        for brand_url in ok_urls:
	            # a single parenthesised argument prints the same under Python 2
	            print("brand: " + str(brand_url))

	    return ok_urls

	def get_cameras_by_brand(brand_urls, verbose=True):
	    """
	    Returns a list of relative paths on the remote server that refer to
	    different camera models for all the camera brands.

	    ``brand_urls`` is an iterable of relative brand paths; each one is
	    appended to http://www.pbase.com and fetched. Every href on the fetched
	    page that contains the brand path is collected. When ``verbose`` is
	    true, each collected href is printed with a "model: " prefix.
	    """
	    model_urls = []
	    for brand_url in brand_urls:
	        url = "http://www.pbase.com" + brand_url
	        response = requests.get(url)

	        # parse_html only accepts str/unicode/StringIO, so pass the response
	        # body rather than the Response object itself
	        tree = parse_html(response.content)

	        # bind the brand path as an XPath variable so that quote characters
	        # in it cannot break (or alter) the expression
	        models = tree.xpath("//a[contains(@href, $brand)]/@href", brand=brand_url)
	        model_urls.extend(models)

	        if verbose:
	            for model_url in models:
	                # a single parenthesised argument prints the same under Python 2
	                print("model: " + str(model_url))

	    return model_urls

	def main(verbose=True):
	    """
	    Orchestrates this crawler: collects the brand paths, then the model
	    paths for those brands.

	    Nothing past those two crawls is implemented, so this always ends by
	    raising NotImplementedError.
	    """
	    brands = get_camera_brands(verbose=verbose)
	    models = get_cameras_by_brand(brands, verbose=verbose)
	    raise NotImplementedError("totally don't want to use random sampling to get complete coverage of 600k+ photos dude")

	# run the crawler only when this file is executed as a script, not on import
	if __name__ == "__main__":
	    main()