Created
November 26, 2018 01:06
-
-
Save aaferrari/bcac380b6832212c7bb65476affa41b2 to your computer and use it in GitHub Desktop.
Scrapy spider that extracts free-book metadata from the Spanish National Research Council (CSIC) publications page.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy, json | |
from scrapy.crawler import CrawlerProcess | |
class BookSpider(scrapy.Spider):
    """Crawl the CSIC free-books listing and collect one metadata record per book.

    Records are appended to the module-level ``books`` list rather than yielded
    as items, so the script can dump them after the crawl finishes.
    """

    name = "freebooks"
    start_urls = ['http://libros.csic.es/freebooks.php']

    def parse(self, response):
        """Extract book-detail links from a listing page and follow pagination."""
        # Because the HTML is malformed, the book links are extracted with a
        # regular expression instead of CSS/XPath selectors.  Raw string fixes
        # the invalid "\?" escape (a DeprecationWarning on Python 3.6+), and the
        # dots are escaped so "." matches a literal dot, not any character.
        urls = set(response.css("#principal").re(
            r"http://libros\.csic\.es/product_info\.php\?products_id=[0-9]+"))
        # A plain for-loop handles the empty case; no len() guard needed.
        for book_url in urls:
            yield scrapy.Request(book_url, callback=self.book_parser)

        next_page = response.css('a[title=" Next Page "]::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def book_parser(self, response):
        """Parse a single book page and append its metadata to the global ``books``."""
        book = response.css("#principal")
        categories = []
        collection = []
        for item in book.css('.ficha > a'):
            # The <strong> label in the anchor's parent row says whether the
            # link is a subject or a collection (looked up once, not twice).
            label = item.root.getparent().cssselect("strong")[0].text
            if label == 'Subjects: ':
                categories.append(item.css("::text").extract_first())
            elif label == 'Collection: ':
                collection.append(item.css("::text").extract_first())

        # Extract the .ficha text nodes once; index 0 is the publication date
        # and index 1 the language (per the page layout this spider targets).
        ficha_text = book.css('.ficha::text').extract()
        books.append({
            'url': response.url,
            'title': book.css('h2::text').extract_first(),
            'author': book.css('.autores::text').extract(),
            'published': ficha_text[0].strip(),
            'language': ficha_text[1].strip(),
            'category': categories,
            'collection': collection,
            # The product id is the value after "=" in the detail-page URL.
            'download_link': "http://libros.csic.es/download.php?id=%s&pdf=products_pdfcomple"
                             % response.url.split("=")[1],
        })
# Shared accumulator: BookSpider.book_parser appends one dict per book here.
books = []

# Spoof an old MSIE user agent so the site serves the same pages a browser gets.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(BookSpider)
process.start()  # blocks until the crawl finishes

# Uncomment the following lines to save the results in a JSON file.
# (Original snippet used the Python-2-only file() builtin; open() with a
# context manager works on Python 3 and guarantees the file is closed.)
"""
with open('csic-books.json', 'w') as output:
    output.write(json.dumps(books, sort_keys=True, indent=4, separators=(',', ': ')))
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment