-
-
Save yeere91/732b67824fafeb4c4ca8cee5cd15845d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def parse(self, response):
    """Yield one request per link found in the page's month calendar.

    Selects every ``div.month`` directly under ``div#calOver``, collects the
    ``href`` of each anchor inside that month's ``div.pop`` lists, prefixes
    it with the ``https://web.archive.org`` origin, and hands the resulting
    page to ``self.parseData``.

    Args:
        response: The response to scan; only its ``xpath`` method is used.
            Presumably a Wayback Machine calendar page -- confirm against
            the spider's start URLs.

    Yields:
        scrapy.Request: One request per extracted href, in document order,
        with ``self.parseData`` as the callback.
    """
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print('PARSING DATA VIA WAYBACK.PY.PARSE.()..')

    # The extracted hrefs are prefixed with this origin to form full URLs.
    archive_root = 'https://web.archive.org'

    # Query each month node in place with a relative XPath instead of
    # serialising it to HTML and re-parsing it into a new Selector.
    for month in response.xpath('//div[@id="calOver"]/div[@class="month"]'):
        hrefs = month.xpath('.//div[@class="pop"]/ul/li/a/@href').extract()
        for href in hrefs:
            yield scrapy.Request(url=archive_root + href, callback=self.parseData)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment