Instantly share code, notes, and snippets.

Embed
What would you like to do?
Download books from Springer (RD) passing only the url of the search result or the book page to the script
#!/usr/bin/python
""" @copyleft Eliezer Silva (djosacv@gmail.com)
This script was developed mostly for recreative and educational purpose, so you use it on your own risk and as it is.
With this script you can crawl in springer book search results and download all the book listed in the page. This version also
checks if the result set includes many pages and navigate through those pages.
There's two ways of using it:
1) Link to a single book ulr in springer page:
python download-books-from-rss-springer.py 'http://rd.springer.com/book/10.1007/978-3-662-07003-1'
2) Link to a search result with many books:
python download-books-from-rss-springer.py -s 'http://rd.springer.com/search?facet-series=%223214%22&facet-content-type=%22Book%22&showAll=false'
"""
import feedparser
import urllib
import re
from HTMLParser import HTMLParser
from urlparse import urlparse
import urllib2
import sys
def downloader(url,filename=None):
file_name = filename
if(filename==None):
file_name = url.split('/')[-1]
u = urllib2.urlopen(url)
f = open(file_name, 'wb')
meta = u.info()
file_size = int(meta.getheaders("Content-Length")[0])
print "Downloading: %s Bytes to %s" % ( file_size, file_name)
file_size_dl = 0
block_sz = 8192
parcels = file_size/30
disp = False
c=1
while True:
buffer = u.read(block_sz)
if not buffer:
break
file_size_dl += len(buffer)
f.write(buffer)
status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
status = status + chr(8)*(len(status)+1)
#print status,
if (file_size_dl > c*parcels):
c+=1
disp=True
if(disp == True):
print "_"+r"%3.2f%%" % (file_size_dl * 100. / file_size),
disp=False
print ""
f.close()
class myhtmlparser(HTMLParser):
def __init__(self,link):
self.reset()
self.inLink=False
self.domain=self.get_domain(link)
self.pdflink=""
self.bookname=None
self.is_searching=True
self.inyear=False
self.year=""
self.author_list=[]
self.inauthorlist=False
self.inauth=False
def handle_starttag(self, tag, attrs):
if(tag == 'span' and ('class','copyright-year') in attrs):
self.inyear=True
if(tag == 'a' and self.find_link(attrs) and self.is_searching):
self.inLink=True
if(tag == 'div' and (('class','author-list') in attrs or ('class','editor-list') in attrs)):
self.inauthorlist=True
if(tag == 'a' and ('itemprop','name') in attrs and self.inauthorlist):
self.inauth=True
def handle_endtag(self,tag):
if(tag == 'div' and self.inauthorlist):
self.inauthorlist=False
def handle_data(self, data):
if(self.inLink):
print data
print "pdf link "+self.pdflink
year = ""
if(self.year!=""):
year="["+self.year+"]"
authn=",".join(self.author_list)
if(authn!=""):
authn="["+authn+"]"
self.is_searching=False
self.inLink=False
if(self.inyear):
self.year=data.replace("\n","").replace(" ","")
self.inyear=False
if(self.inauth):
self.author_list.append(data.replace("\n","").replace(" "," ").split()[-1])
self.inauth=False
def doDownload(self):
year = ""
if(self.year!=""):
year="["+self.year+"]"
authn=",".join(self.author_list)
if(authn!=""):
authn="["+authn+"]"
downloader(self.pdflink,year+authn+self.bookname+".pdf")
def find_link(self,attrs):
attrnames=zip(*attrs)[0]
vals=zip(*attrs)[1]
ret=False
bookn=""
if('doi' in attrnames and ('contenttype', 'Book') in attrs and 'href' in attrnames):
for pair_val in attrs:
if(pair_val[0]=='href' and pair_val[1].endswith('pdf')):
self.pdflink = self.domain+pair_val[1]
ret = True
if(pair_val[0]=='publication'):
bookn=pair_val[1].replace('/','.').split("|")[1]+"."+pair_val[1].replace('/','.').split("|")[0]
if(ret):
self.bookname=bookn
return ret
def get_domain(self,url):
parsed_uri = urlparse( url )
domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
return domain
def openurlitem(item):
fp=urllib.urlopen(item)
data=fp.read().replace('\n', '')
return data
def process_item(url_link):
parser = myhtmlparser(url_link)
parser.feed(openurlitem(url_link))
parser.doDownload()
class mylisthtmlparser(myhtmlparser):
def __init__(self,link):
self.reset()
self.inLink=False
self.inResultList=False
self.domain=self.get_domain(link)
self.nextlink=""
self.pagination=False
def handle_starttag(self, tag, attrs):
if(self.inResultList):
if(tag == 'a' and ('class','title') in attrs):
self.inLink=True
self.pdflink = self.getHref(attrs)
if(tag == 'ol' and ('id','results-list') in attrs):
self.inResultList=True
if(tag=='form' and ('class','pagination') in attrs):
self.pagination = True
if(self.pagination and tag == 'a' and ('class','next') in attrs and ('title','next') in attrs):
self.nextlink = self.pdflink = self.getHref(attrs)
print "next link = "+self.nextlink
def handle_endtag(self, tag):
if(tag == 'ol' and self.inResultList):
self.inResultList=False
if(tag == 'form' and self.pagination):
self.pagination= False
def handle_data(self, data):
if(self.inLink):
print "Opening "+data
print "url link "+self.pdflink
try:
process_item(self.pdflink)
except:
print "error : "
self.inLink=False
def hasNext(self):
return self.nextlink.startswith("http")
def getHref(self,attrs):
for pair_val in attrs:
if(pair_val[0]=='href'):
return self.domain+pair_val[1]
def process_page(url):
parser = mylisthtmlparser(url)
parser.feed(openurlitem(url))
while(parser.hasNext()):
url = parser.nextlink
parser = mylisthtmlparser(url)
parser.feed(openurlitem(url))
if(len(sys.argv) >= 2):
args=sys.argv[1:]
if(len(args)==2):
if args[0]=="-s":
try:
process_page(args[1])
except:
print "processing error. check your url"
print "format: 'python download-books-from-rss-springer.py [-s] url'"
else:
print "format: 'python download-books-from-rss-springer.py [-s] url'"
elif len(args)==1:
try:
process_item(args[0])
except:
print "processing error. check your url"
print "format: 'python download-books-from-rss-springer.py [-s] url'"
else:
print "format: 'python download-books-from-rss-springer.py [-s] url'"
print "if argument -s is passed, the script will assume the link is a rss feed with a list of books"
print "otherwise, it will assume the link is a url of a single book"
else:
print "python download-books-from-rss-springer.py [-s] url"
# process_page("http://rd.springer.com/search?facet-series=%223214%22&facet-content-type=%22Book%22&showAll=false")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment