Skip to content

Instantly share code, notes, and snippets.

Created November 26, 2018 01:06
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
What would you like to do?
Scraper that extracts book information from the Spanish National Research Council (CSIC) free-books catalogue page.
import scrapy, json
from scrapy.crawler import CrawlerProcess
class BookSpider(scrapy.Spider):
    """Crawl the CSIC (Spanish National Research Council) free-books catalogue.

    `parse` walks the paginated listing and schedules one request per book;
    `book_parser` extracts the metadata of a single book page and yields it
    as a plain dict item.
    """

    name = "freebooks"
    # NOTE(review): the catalogue listing URL was lost when this snippet was
    # pasted — fill it in before running.  TODO confirm original URL.
    start_urls = ['']

    def parse(self, response):
        """Yield a request per book on this listing page, then follow pagination."""
        # Because the HTML is malformed, the book links are extracted with a
        # regular expression over the raw markup instead of href selectors.
        # set() de-duplicates repeated links to the same product id.
        urls = list(set(response.css("#principal").re(r"\?products_id=[0-9]+")))
        for book in urls:
            # The regex yields relative query strings ("?products_id=NNN");
            # resolve against the current page so Scrapy gets an absolute URL.
            yield scrapy.Request(response.urljoin(book), callback=self.book_parser)
        next_page = response.css('a[title=" Next Page "]::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def book_parser(self, response):
        """Extract one book's metadata from its detail page and yield it as a dict."""
        book = response.css("#principal")
        categories = []
        collection = []
        # Each link in the ".ficha" box is preceded by a <strong> label that
        # says whether the link is a subject (category) or a collection entry.
        for item in book.css('.ficha > a'):
            label = item.root.getparent().cssselect("strong")[0].text
            text = item.css('::text').extract_first()
            if label == 'Subjects: ':
                categories.append(text)
            elif label == 'Collection: ':
                collection.append(text)
        yield {
            'url': response.url,
            'title': book.css('h2::text').extract_first(),
            'author': book.css('.autores::text').extract(),
            # First two bare text nodes of ".ficha" hold publication date and
            # language on this site's layout.
            'published': book.css('.ficha::text').extract()[0].strip(),
            'language': book.css('.ficha::text').extract()[1].strip(),
            'category': categories,
            'collection': collection,
            # NOTE(review): the download-URL format string was lost in the
            # paste (an empty "%"-template raises TypeError as written) —
            # restore the original template, e.g. ".../download?id=%s".
            'download_link': "" % response.url.split("=")[1],
        }
# Accumulator for scraped items; a signal/pipeline hookup (not shown in this
# excerpt) is expected to append each parsed book here before the dump below.
books = []

process = CrawlerProcess({
    # Old IE user-agent string so the server serves the plain-HTML catalogue.
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
})
process.crawl(BookSpider)
process.start()  # blocks until the crawl has finished

# Save the results in a JSON file.
# Python 3 `open` replaces the removed Python 2 `file` builtin, and the
# context manager guarantees the handle is closed even on error.
with open('csic-books.json', "w") as output:
    output.write(json.dumps(books, sort_keys=True, indent=4, separators=(',', ': ')))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment