jmduke/gist:9474688

## gistfile1.txt
# -*- coding: utf-8 -*-

import re

from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

from lxml.cssselect import css_to_xpath
from HTMLParser import HTMLParser

class Review(Item):
    """One scraped review, as populated by ``AVSpider.parse_review``."""
    author = Field()   # plain text of the page's '.author' element
    url = Field()      # URL of the response the review was scraped from
    grade = Field()    # first whitespace-separated token of the '.rating' text
    show = Field()     # plain text of the '.series' element
    season = Field()   # '.season' text with its first character dropped ("S2" -> "2")
    episode = Field()  # '.episode' text with its first character dropped ("E12" -> "12")
    date = Field()     # '.published' text up to (not including) the bullet separator

# Selectors for each review field, written as CSS class selectors and
# converted once at import time with css_to_xpath.  NOTE: despite the `_css`
# suffix, these names hold the converted value, which parse_review passes
# to HtmlXPathSelector.select().
author_css = css_to_xpath('.author')
grade_css = css_to_xpath('.rating')
show_css = css_to_xpath('.series')
season_css = css_to_xpath('.season')
episode_css = css_to_xpath('.episode')
date_css = css_to_xpath('.published')

def html_to_text(s):
    """Convert an HTML fragment to plain, whitespace-normalised text.

    Inline formatting tags (``<b>``, ``<span>``, ``<a>``, ...) are removed
    outright so the text they wrap stays joined to its neighbours; every
    other tag is replaced by a space so the text on either side of it stays
    separated.  Character entities are then decoded, and all runs of
    whitespace are collapsed to single spaces.

    Args:
        s: HTML markup as a text string.

    Returns:
        The text content, without leading or trailing whitespace and with
        exactly one space between words.
    """
    # HTMLParser.unescape was never a documented API and no longer exists on
    # Python 3.9+.  Use the supported html.unescape where available and fall
    # back to the legacy method only on interpreters without it (Python 2).
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    # strip inline tags without leaving a gap ("<b>wor</b>ld" -> "world")
    s = re.sub(r'<\W*(?:b|big|i|small|tt|abbr|acronym|cite|code|dfn|em|kbd|strong|samp|var|a|bdo|q|span|sub|sup)\b[^>]*?>', '', s, flags=re.I)
    # any other tag becomes a space
    s = re.sub(r'<[^>]*?>', ' ', s)
    # replace entities
    s = unescape(s)
    # strip leading and trailing spaces
    s = s.strip()
    # Replace every run of whitespace with a single space.  re.UNICODE makes
    # \s match non-breaking spaces (e.g. from "&nbsp;") on Python 2 as well,
    # where the default \s is ASCII-only.
    s = re.sub(r'\s+', ' ', s, flags=re.UNICODE)
    return s

class AVSpider(CrawlSpider):
    """Crawl avclub.com TV Club search results and scrape the linked reviews.

    The spider is seeded with 400 search-result pages; every link on them
    whose URL matches ``/tvclub/`` is fetched and passed to ``parse_review``.
    """
    name = 'avclub'
    allowed_domains = ['www.avclub.com']
    # Search-result pages page=0 .. page=399.
    start_urls = ['http://www.avclub.com/search?feature_types=tv-club&page=' + str(i) for i in range(400)]

    rules = (
        Rule(SgmlLinkExtractor(allow=r'/tvclub/.*'), callback='parse_review'),
        # Rule(SgmlLinkExtractor(allow=r'/search\?feature_types=tv-club&page=[0-9]*'))
    )

    # U+2022 BULLET, which separates the date from the time of day in the
    # '.published' text.  Written as an escape so the literal does not depend
    # on the source encoding or on the Python-2-only str.decode().
    _DATE_SEPARATOR = u'\u2022'

    @staticmethod
    def _first_text(hxs, xpath):
        """Return the plain text of the first node matching *xpath*, or None."""
        matches = hxs.select(xpath).extract()
        return html_to_text(matches[0]) if matches else None

    def parse_review(self, response):
        """Build a ``Review`` item from one review page.

        Args:
            response: the downloaded page for a ``/tvclub/`` link.

        Returns:
            A one-element list holding the ``Review``, or an empty list when
            the page lacks any of the expected elements (the skip is logged
            instead of letting an IndexError escape the callback).
        """
        hxs = HtmlXPathSelector(response)

        text = {}
        for field, xpath in (
            ('author', author_css),
            ('grade', grade_css),
            ('show', show_css),
            ('season', season_css),
            ('episode', episode_css),
            ('date', date_css),
        ):
            text[field] = self._first_text(hxs, xpath)
            if text[field] is None:
                self.log('Skipping %s: no %s element found' % (response.url, field))
                return []

        # Only the first token of the rating text is the grade itself.
        grade_tokens = text['grade'].split()
        if not grade_tokens:
            self.log('Skipping %s: empty grade' % response.url)
            return []

        return [Review(
            author=text['author'],
            url=response.url,
            grade=grade_tokens[0],
            show=text['show'],
            # These come in the form "S2", "E12", etc., so strip the first character.
            season=text['season'][1:],
            episode=text['episode'][1:],
            # This comes in the form "Feb 13, 2014 • 10:05PM", so keep what
            # precedes the bullet.  The split runs on the decoded text so an
            # entity-encoded bullet (&bull;) is recognised too.
            date=text['date'].split(self._DATE_SEPARATOR)[0].strip(),
        )]
	# -*- coding: utf-8 -*-

	import re

	from scrapy.item import Item, Field
	from scrapy.contrib.spiders import CrawlSpider, Rule
	from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
	from scrapy.selector import HtmlXPathSelector

	from lxml.cssselect import css_to_xpath
	from HTMLParser import HTMLParser

	class Review(Item):
		"""One scraped review, as populated by ``AVSpider.parse_review``."""
		author = Field()   # plain text of the page's '.author' element
		url = Field()      # URL of the response the review was scraped from
		grade = Field()    # first whitespace-separated token of the '.rating' text
		show = Field()     # plain text of the '.series' element
		season = Field()   # '.season' text with its first character dropped ("S2" -> "2")
		episode = Field()  # '.episode' text with its first character dropped ("E12" -> "12")
		date = Field()     # '.published' text up to (not including) the bullet separator

	# Selectors for each review field, written as CSS class selectors and
	# converted once with css_to_xpath.  NOTE: despite the `_css` suffix,
	# these names hold the converted value, which parse_review passes to
	# HtmlXPathSelector.select().
	author_css = css_to_xpath('.author')
	grade_css = css_to_xpath('.rating')
	show_css = css_to_xpath('.series')
	season_css = css_to_xpath('.season')
	episode_css = css_to_xpath('.episode')
	date_css = css_to_xpath('.published')

	def html_to_text(s):
		"""Convert an HTML fragment to plain, whitespace-normalised text.

		Inline formatting tags (``<b>``, ``<span>``, ``<a>``, ...) are removed
		outright so the text they wrap stays joined to its neighbours; every
		other tag is replaced by a space so the text on either side of it
		stays separated.  Character entities are then decoded, and all runs
		of whitespace are collapsed to single spaces.

		Args:
			s: HTML markup as a text string.

		Returns:
			The text content, without leading or trailing whitespace and with
			exactly one space between words.
		"""
		# HTMLParser.unescape was never a documented API and no longer exists
		# on Python 3.9+.  Use the supported html.unescape where available and
		# fall back to the legacy method only on interpreters without it.
		try:
			from html import unescape
		except ImportError:
			unescape = HTMLParser().unescape

		# strip inline tags without leaving a gap ("<b>wor</b>ld" -> "world")
		s = re.sub(r'<\W*(?:b|big|i|small|tt|abbr|acronym|cite|code|dfn|em|kbd|strong|samp|var|a|bdo|q|span|sub|sup)\b[^>]*?>', '', s, flags=re.I)
		# any other tag becomes a space
		s = re.sub(r'<[^>]*?>', ' ', s)
		# replace entities
		s = unescape(s)
		# strip leading and trailing spaces
		s = s.strip()
		# Replace every run of whitespace with a single space.  re.UNICODE
		# makes \s match non-breaking spaces (e.g. from "&nbsp;") on Python 2
		# as well, where the default \s is ASCII-only.
		s = re.sub(r'\s+', ' ', s, flags=re.UNICODE)
		return s

	class AVSpider(CrawlSpider):
		"""Crawl avclub.com TV Club search results and scrape the linked reviews.

		The spider is seeded with 400 search-result pages; every link on them
		whose URL matches ``/tvclub/`` is fetched and passed to
		``parse_review``.
		"""
		name = 'avclub'
		allowed_domains = ['www.avclub.com']
		# Search-result pages page=0 .. page=399.
		start_urls = ['http://www.avclub.com/search?feature_types=tv-club&page=' + str(i) for i in range(400)]

		rules = (
			Rule(SgmlLinkExtractor(allow=r'/tvclub/.*'), callback='parse_review'),
			# Rule(SgmlLinkExtractor(allow=r'/search\?feature_types=tv-club&page=[0-9]*'))
		)

		# U+2022 BULLET, which separates the date from the time of day in the
		# '.published' text.  Written as an escape so the literal does not
		# depend on the source encoding or on the Python-2-only str.decode().
		_DATE_SEPARATOR = u'\u2022'

		@staticmethod
		def _first_text(hxs, xpath):
			"""Return the plain text of the first node matching *xpath*, or None."""
			matches = hxs.select(xpath).extract()
			return html_to_text(matches[0]) if matches else None

		def parse_review(self, response):
			"""Build a ``Review`` item from one review page.

			Args:
				response: the downloaded page for a ``/tvclub/`` link.

			Returns:
				A one-element list holding the ``Review``, or an empty list
				when the page lacks any of the expected elements (the skip is
				logged instead of letting an IndexError escape the callback).
			"""
			hxs = HtmlXPathSelector(response)

			text = {}
			for field, xpath in (
				('author', author_css),
				('grade', grade_css),
				('show', show_css),
				('season', season_css),
				('episode', episode_css),
				('date', date_css),
			):
				text[field] = self._first_text(hxs, xpath)
				if text[field] is None:
					self.log('Skipping %s: no %s element found' % (response.url, field))
					return []

			# Only the first token of the rating text is the grade itself.
			grade_tokens = text['grade'].split()
			if not grade_tokens:
				self.log('Skipping %s: empty grade' % response.url)
				return []

			return [Review(
				author=text['author'],
				url=response.url,
				grade=grade_tokens[0],
				show=text['show'],
				# These come in the form "S2", "E12", etc., so strip the first character.
				season=text['season'][1:],
				episode=text['episode'][1:],
				# This comes in the form "Feb 13, 2014 • 10:05PM", so keep what
				# precedes the bullet.  The split runs on the decoded text so
				# an entity-encoded bullet (&bull;) is recognised too.
				date=text['date'].split(self._DATE_SEPARATOR)[0].strip(),
			)]