Created
November 26, 2018 01:06
-
-
Save aaferrari/bcac380b6832212c7bb65476affa41b2 to your computer and use it in GitHub Desktop.
Scrapy spider that extracts free-book metadata from the Spanish National Research Council (CSIC) publications page.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy, json | |
from scrapy.crawler import CrawlerProcess | |
class BookSpider(scrapy.Spider):
    """Crawl the CSIC free-books listing and collect one metadata record per book.

    Records are appended to the module-level ``books`` list rather than yielded
    as items, so the script can dump them after the crawl finishes.
    """

    name = "freebooks"
    start_urls = ['http://libros.csic.es/freebooks.php']

    def parse(self, response):
        """Extract book-detail links from a listing page and follow pagination."""
        # Because the HTML is malformed, the book links are extracted with a
        # regular expression instead of CSS/XPath selectors.  Raw string fixes
        # the invalid "\?" escape (a DeprecationWarning on Python 3.6+), and the
        # dots are escaped so "." matches a literal dot, not any character.
        urls = set(response.css("#principal").re(
            r"http://libros\.csic\.es/product_info\.php\?products_id=[0-9]+"))
        # A plain for-loop handles the empty case; no len() guard needed.
        for book_url in urls:
            yield scrapy.Request(book_url, callback=self.book_parser)

        next_page = response.css('a[title=" Next Page "]::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def book_parser(self, response):
        """Parse a single book page and append its metadata to the global ``books``."""
        book = response.css("#principal")
        categories = []
        collection = []
        for item in book.css('.ficha > a'):
            # The <strong> label in the anchor's parent row says whether the
            # link is a subject or a collection (looked up once, not twice).
            label = item.root.getparent().cssselect("strong")[0].text
            if label == 'Subjects: ':
                categories.append(item.css("::text").extract_first())
            elif label == 'Collection: ':
                collection.append(item.css("::text").extract_first())

        # Extract the .ficha text nodes once; index 0 is the publication date
        # and index 1 the language (per the page layout this spider targets).
        ficha_text = book.css('.ficha::text').extract()
        books.append({
            'url': response.url,
            'title': book.css('h2::text').extract_first(),
            'author': book.css('.autores::text').extract(),
            'published': ficha_text[0].strip(),
            'language': ficha_text[1].strip(),
            'category': categories,
            'collection': collection,
            # The product id is the value after "=" in the detail-page URL.
            'download_link': "http://libros.csic.es/download.php?id=%s&pdf=products_pdfcomple"
                             % response.url.split("=")[1],
        })
# Shared accumulator: BookSpider.book_parser appends one dict per book here.
books = []

# Spoof an old MSIE user agent so the site serves the same pages a browser gets.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(BookSpider)
process.start()  # blocks until the crawl finishes

# Uncomment the following lines to save the results in a JSON file.
# (Original snippet used the Python-2-only file() builtin; open() with a
# context manager works on Python 3 and guarantees the file is closed.)
"""
with open('csic-books.json', 'w') as output:
    output.write(json.dumps(books, sort_keys=True, indent=4, separators=(',', ': ')))
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment