Skip to content

Instantly share code, notes, and snippets.

@hofaiwong
Created May 29, 2016 17:00
Show Gist options
  • Save hofaiwong/fe6d0589c42ee798d027287d1aac604d to your computer and use it in GitHub Desktop.
Save hofaiwong/fe6d0589c42ee798d027287d1aac604d to your computer and use it in GitHub Desktop.
class ComicsSpider(Spider):
    """Spider that crawls comicbookroundup.com series pages.

    Start URLs are read from ``urls.txt`` when the class body is executed.
    ``parse`` follows the link found in each row of a listing table, and
    ``parse_series_contents`` builds one ``ComicsItem`` per series page.
    """

    name = 'comics'
    # Scrapy's offsite filtering reads ``allowed_domains``; ``allowed_urls``
    # is not an attribute it recognizes. The old name is kept as an alias so
    # any code still referring to it keeps working.
    allowed_domains = ['comicbookroundup.com']
    allowed_urls = allowed_domains

    # ``with`` closes the file even if reading fails. Blank lines are skipped
    # because an empty string is not a usable request URL.
    with open("urls.txt") as f:
        start_urls = [url.strip() for url in f if url.strip()]

    # Generic function to get details from xpath in response
    def getdetail(self, response, path, function, index, alt):
        """Extract one XPath match from ``response`` and convert it.

        Args:
            response: Object exposing ``xpath(path).extract()``.
            path: XPath expression to evaluate.
            function: Callable applied to the selected string.
            index: Position of the wanted match in the extracted list.
            alt: Fallback value returned when anything goes wrong.

        Returns:
            ``function(match)`` on success, otherwise ``alt``.
        """
        try:
            return function(response.xpath(path).extract()[index])
        except Exception:
            # Deliberately best-effort: a missing node (IndexError) or a
            # failed conversion in ``function`` both fall back to ``alt``.
            return alt

    # Main parse function
    def parse(self, response):
        """Yield a request for each series linked from a listing page.

        Rows whose second cell holds no link are skipped instead of raising
        ``IndexError`` and aborting the rest of the page.
        """
        rows = response.xpath('//*[@id="all-series"]/div/table[2]/tr')
        for row in rows:
            # Relative XPath on the row selector; avoids serializing the row
            # to HTML and re-parsing it.
            series_url = row.xpath('./td[2]/a/@href').extract()
            if not series_url:
                continue
            yield Request('http://comicbookroundup.com' + series_url[0],
                          callback=self.parse_series_contents)

    # Function to parse contents of a given series page
    def parse_series_contents(self, response):
        """Build and yield a ``ComicsItem`` for one series page.

        The item's ``issues_list`` maps ``str(issue)`` to a dict of ratings,
        credits and review counts for that issue.
        """
        item = ComicsItem()
        # Scrape data from the top of the series page
        # ...
        # Scrape data from table of issues in the series page
        rows = response.xpath('//*[@id="issues"]/div[1]/table[2]/tr').extract()
        item['issues_list'] = {}
        for row in rows:
            # Scrape issue data from each row
            # ...
            # NOTE(review): ``issue`` and the values below are presumably set
            # by the elided per-row scraping code above - confirm.
            item['issues_list'][str(issue)] = {
                'rating_critic': rating_critic,
                'rating_user': rating_user,
                'writer': writer,
                'artist': artist,
                'reviews_critic_count': reviews_critic_count,
                'reviews_user_count': reviews_user_count,
            }
        # NOTE(review): the original indentation was lost; the item is
        # yielded once after all rows are collected - confirm this placement.
        yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment