class ComicsSpider(Spider):
    """Spider that walks series listing pages and yields one item per series.

    The start URLs are read, one per line, from ``urls.txt`` (resolved against
    the current working directory) when the class body is executed.
    """

    name = 'comics'
    # NOTE(review): Scrapy's offsite filtering reads ``allowed_domains``; this
    # attribute name looks like a typo and is probably ignored -- confirm.
    allowed_urls = ['']
    # Use a context manager so the handle is closed as soon as the URLs are
    # read. Blank lines are skipped because an empty string is not a usable
    # start URL.
    with open("urls.txt") as f:
        start_urls = [line.strip() for line in f if line.strip()]

    def getdetail(self, response, path, function, index, alt):
        """Extract one XPath match from ``response`` and convert it.

        Args:
            response: Object exposing ``xpath(path).extract()``.
            path: XPath expression to evaluate against ``response``.
            function: Callable applied to the selected match.
            index: Position of the wanted match in the extracted list.
            alt: Fallback value returned when extraction or conversion fails.

        Returns:
            ``function(match)`` on success, otherwise ``alt``.
        """
        # Deliberately broad: both the lookup (e.g. IndexError when nothing
        # matched) and the caller-supplied ``function`` may fail, and either
        # failure should fall back to ``alt`` rather than abort the scrape.
        try:
            return function(response.xpath(path).extract()[index])
        except Exception:
            return alt

    def parse(self, response):
        """Yield a request for each series linked from the listing table.

        Each request is handled by ``parse_series_contents``.
        """
        rows = response.xpath('//*[@id="all-series"]/div/table[2]/tr').extract()
        for row in rows:
            series_url = Selector(text=row).xpath('//td[2]/a/@href').extract()
            # A row without a link in its second cell (presumably a header or
            # spacer row) would otherwise raise IndexError and stop the
            # generator before the remaining rows are processed.
            if not series_url:
                continue
            # NOTE(review): the prefix is an empty string here; presumably the
            # site's base URL belongs in its place -- confirm.
            yield Request('' + series_url[0], callback=self.parse_series_contents)

    def parse_series_contents(self, response):
        """Build and yield a single ``ComicsItem`` for one series page.

        Per-issue data is collected in ``item['issues_list']``, a dict keyed
        by ``str(issue)``.
        """
        item = ComicsItem()
        # Scrape data from the top of the series page
        # ...
        # Scrape data from table of issues in the series page
        rows = response.xpath('//*[@id="issues"]/div[1]/table[2]/tr').extract()
        item['issues_list'] = {}
        for row in rows:
            # Scrape issue data from each row
            # ...
            # NOTE(review): the elided step above is expected to bind
            # ``issue`` and the six values stored below.
            item['issues_list'][str(issue)] = {
                'rating_critic': rating_critic,
                'rating_user': rating_user,
                'writer': writer,
                'artist': artist,
                'reviews_critic_count': reviews_critic_count,
                'reviews_user_count': reviews_user_count,
            }
        # Yield once, after every issue row has been folded into the item.
        yield item
