-
-
Save hofaiwong/fe6d0589c42ee798d027287d1aac604d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ComicsSpider(Spider):
    """Scrapy spider for comicbookroundup.com.

    Crawls the series-listing pages named in ``urls.txt`` (one URL per
    line), follows each series link, and yields one ``ComicsItem`` per
    series with a per-issue ratings dictionary.
    """

    name = 'comics'
    allowed_urls = ['comicbookroundup.com']

    # Seed URLs are read once, at class-definition (import) time.
    # NOTE(review): urls.txt must exist in the working directory when this
    # module is imported. Using `with` guarantees the handle is closed even
    # if reading fails (the original open()/close() pair leaked on error).
    with open("urls.txt") as _url_file:
        start_urls = [url.strip() for url in _url_file]
    del _url_file  # don't leave the (closed) file object as a class attribute

    def getdetail(self, response, path, function, index, alt):
        """Extract the ``index``-th xpath match from ``response`` and pass it
        through ``function``; return ``alt`` if anything fails.

        Deliberately best-effort: a missing node, out-of-range index, or a
        conversion error in ``function`` all fall back to ``alt``.
        """
        try:
            return function(response.xpath(path).extract()[index])
        except Exception:
            # Broad catch is intentional: callers supply the fallback and
            # expect no exception to escape this helper.
            return alt

    def parse(self, response):
        """Main parse: yield one Request per series row in the listing table."""
        rows = response.xpath('//*[@id="all-series"]/div/table[2]/tr').extract()
        for row in rows:
            # Each row's second cell holds the relative link to the series page.
            series_url = Selector(text=row).xpath('//td[2]/a/@href').extract()
            yield Request('http://comicbookroundup.com' + series_url[0],
                          callback=self.parse_series_contents)

    def parse_series_contents(self, response):
        """Parse a single series page into a ComicsItem.

        Builds ``item['issues_list']``: a dict keyed by issue number (str),
        each value holding critic/user ratings, creators, and review counts.
        """
        item = ComicsItem()
        # Scrape data from the top of the series page
        # ...
        # Scrape data from table of issues in the series page
        rows = response.xpath('//*[@id="issues"]/div[1]/table[2]/tr').extract()
        item['issues_list'] = {}
        for row in rows:
            # Scrape issue data from each row
            # ... (issue, rating_critic, rating_user, writer, artist, and the
            # review counts are computed in code elided from this excerpt)
            item['issues_list'][str(issue)] = {'rating_critic': rating_critic,
                                               'rating_user': rating_user,
                                               'writer': writer,
                                               'artist': artist,
                                               'reviews_critic_count': reviews_critic_count,
                                               'reviews_user_count': reviews_user_count}
        yield item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment