@jmduke
Created March 10, 2014 21:25
# -*- coding: utf-8 -*-
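# Crawls A.V. Club "TV Club" review pages and extracts the author, letter grade,
# show, season, episode, and publication date of each review.
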
import re
from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from lxml.cssselect import css_to_xpath
from HTMLParser import HTMLParser


class Review(Item):
    author = Field()
    url = Field()
    grade = Field()
    show = Field()
    season = Field()
    episode = Field()
    date = Field()


# XPath expressions built from the CSS classes that mark each field on a review page.
author_css = css_to_xpath('.author')
grade_css = css_to_xpath('.rating')
show_css = css_to_xpath('.series')
season_css = css_to_xpath('.season')
episode_css = css_to_xpath('.episode')
date_css = css_to_xpath('.published')


def html_to_text(s):
    # remove inline tags outright, then replace any remaining tags with a space
    s = re.sub(r'<\W*(?:b|big|i|small|tt|abbr|acronym|cite|code|dfn|em|kbd|strong|samp|var|a|bdo|q|span|sub|sup)\b[^>]*?>', '', s, flags=re.I)
    s = re.sub(r'<[^>]*?>', ' ', s)
    # decode HTML entities
    s = HTMLParser().unescape(s)
    # strip leading and trailing whitespace
    s = s.strip()
    # collapse runs of whitespace into a single space
    s = re.sub(r'\s+', ' ', s)
    return s
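
# Example (illustrative):
#   html_to_text(u'<span class="rating"><em>B+</em></span>')  ->  u'B+'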


class AVSpider(CrawlSpider):
    name = 'avclub'
    allowed_domains = ['www.avclub.com']
    start_urls = ['http://www.avclub.com/search?feature_types=tv-club&page=' + str(i) for i in range(400)]
    rules = (
        Rule(SgmlLinkExtractor(allow=r'/tvclub/.*'), callback='parse_review'),
        # Rule(SgmlLinkExtractor(allow=r'/search\?feature_types=tv-club&page=[0-9]*'))
    )

    def parse_review(self, response):
        hxs = HtmlXPathSelector(response)
        author = html_to_text(hxs.select(author_css).extract()[0])
        grade = html_to_text(hxs.select(grade_css).extract()[0]).split()[0]
        show = html_to_text(hxs.select(show_css).extract()[0])
        # These come in the form "S2", "E12", etc., so strip the leading character.
        season = html_to_text(hxs.select(season_css).extract()[0])[1:]
        episode = html_to_text(hxs.select(episode_css).extract()[0])[1:]
        # This comes in the form "Feb 13, 2014 • 10:05PM", so split at the bullet.
        date = html_to_text(hxs.select(date_css).extract()[0].split(u'•')[0])
        return [Review(
            author=author,
            url=response.url,
            grade=grade,
            show=show,
            season=season,
            episode=episode,
            date=date,
        )]
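
# To run this spider outside a project (a sketch, assuming a 2014-era Scrapy
# install that still ships the scrapy.contrib modules imported above; the
# filename is arbitrary):
#   scrapy runspider avclub_spider.py -o reviews.json -t json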