@ksamuel
Created February 26, 2012 13:21
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu
# Crawl a Dropbox-hosted ebook catalog and download every book exposed
# through a 'Télécharger' ("Download") link.
# Written against the pre-1.0 Scrapy API (scrapy.contrib, SgmlLinkExtractor,
# HtmlXPathSelector) and Python 2.

import os
import urllib

from scrapy.utils.url import urljoin_rfc
from scrapy.selector import HtmlXPathSelector
from scrapy.utils.response import get_base_url
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
# Destination directory for the downloaded books.
BIBLIO = "/home/kevin/Bureau/livres/biblio"

# Pre-1.0 Scrapy reads settings overrides from environment variables
# prefixed with SCRAPY_; without the prefix these two values are ignored.
os.environ['SCRAPY_DOWNLOAD_DELAY'] = "0.25"
os.environ['SCRAPY_USER_AGENT'] = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.A.B.C Safari/525.13"
class BooksSpider(CrawlSpider):
    name = "books"
    allowed_domains = ["dl.dropbox.com"]
    start_urls = [
        "http://dl.dropbox.com/u/25833073/Boudu/_catalog/82267917/82267917_books.html",
    ]

    rules = (
        # Paginated catalog pages: no callback, so the links are simply
        # followed.
        Rule(SgmlLinkExtractor(allow=(r'.*_books_.*\.html',))),
        # Individual book pages. The callback must not be named 'parse':
        # CrawlSpider uses parse() internally to apply the rules, so
        # overriding it would break link following.
        Rule(SgmlLinkExtractor(allow=(r'book_.*\.html',)), callback='parse_book'),
    )

    def parse_book(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        # Take the href of every element whose text contains
        # 'Télécharger' ("Download") and make it absolute against the
        # page's base URL.
        urls = hxs.select("//*[contains(text(), 'Télécharger')]/@href")
        urls = (urljoin_rfc(base_url, url) for url in urls.extract())
        for url in urls:
            filename = urllib.unquote(url).split('/')[-1]
            try:
                # NB: urlretrieve is synchronous, so it blocks Scrapy's
                # Twisted reactor while each file downloads; acceptable
                # for a small one-off crawl like this one.
                urllib.urlretrieve(url, os.path.join(BIBLIO, filename))
            except Exception as e:
                print e
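
# A sketch of how to run this spider, assuming a pre-1.0 Scrapy install
# (the file name books_spider.py is hypothetical):
#
#   scrapy runspider books_spider.py
#
# or, if the module lives inside a Scrapy project's spiders package:
#
#   scrapy crawl books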