Skip to content

Instantly share code, notes, and snippets.

Created November 26, 2018 01:06
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
What would you like to do?
Scraper that extracts book information from the Spanish National Research Council (CSIC) free-books catalogue page.
import scrapy, json
from scrapy.crawler import CrawlerProcess
class BookSpider(scrapy.Spider):
    """Crawl the CSIC (Spanish National Research Council) free-books catalogue.

    `parse` walks the paginated listing and schedules one request per book;
    `book_parser` extracts the metadata of a single book page and yields it
    as a plain dict item.
    """

    name = "freebooks"
    # NOTE(review): the catalogue listing URL was lost when this snippet was
    # pasted — fill it in before running.  TODO confirm original URL.
    start_urls = ['']

    def parse(self, response):
        """Yield a request per book on this listing page, then follow pagination."""
        # Because the HTML is malformed, the book links are extracted with a
        # regular expression over the raw markup instead of href selectors.
        # set() de-duplicates repeated links to the same product id.
        urls = list(set(response.css("#principal").re(r"\?products_id=[0-9]+")))
        for book in urls:
            # The regex yields relative query strings ("?products_id=NNN");
            # resolve against the current page so Scrapy gets an absolute URL.
            yield scrapy.Request(response.urljoin(book), callback=self.book_parser)
        next_page = response.css('a[title=" Next Page "]::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def book_parser(self, response):
        """Extract one book's metadata from its detail page and yield it as a dict."""
        book = response.css("#principal")
        categories = []
        collection = []
        # Each link in the ".ficha" box is preceded by a <strong> label that
        # says whether the link is a subject (category) or a collection entry.
        for item in book.css('.ficha > a'):
            label = item.root.getparent().cssselect("strong")[0].text
            text = item.css('::text').extract_first()
            if label == 'Subjects: ':
                categories.append(text)
            elif label == 'Collection: ':
                collection.append(text)
        yield {
            'url': response.url,
            'title': book.css('h2::text').extract_first(),
            'author': book.css('.autores::text').extract(),
            # First two bare text nodes of ".ficha" hold publication date and
            # language on this site's layout.
            'published': book.css('.ficha::text').extract()[0].strip(),
            'language': book.css('.ficha::text').extract()[1].strip(),
            'category': categories,
            'collection': collection,
            # NOTE(review): the download-URL format string was lost in the
            # paste (an empty "%"-template raises TypeError as written) —
            # restore the original template, e.g. ".../download?id=%s".
            'download_link': "" % response.url.split("=")[1],
        }
# Accumulator for scraped items; a signal/pipeline hookup (not shown in this
# excerpt) is expected to append each parsed book here before the dump below.
books = []

process = CrawlerProcess({
    # Old IE user-agent string so the server serves the plain-HTML catalogue.
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
})
process.crawl(BookSpider)
process.start()  # blocks until the crawl has finished

# Save the results in a JSON file.
# Python 3 `open` replaces the removed Python 2 `file` builtin, and the
# context manager guarantees the handle is closed even on error.
with open('csic-books.json', "w") as output:
    output.write(json.dumps(books, sort_keys=True, indent=4, separators=(',', ': ')))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment