Skip to content

Instantly share code, notes, and snippets.

@nikAizuddin
Created April 10, 2020 07:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nikAizuddin/7a3f1f0c0cb8ab430bf3643f56f48983 to your computer and use it in GitHub Desktop.
Save nikAizuddin/7a3f1f0c0cb8ab430bf3643f56f48983 to your computer and use it in GitHub Desktop.
"""Download Free Springer books.
Requirements:
* Make sure you have Python 3.7. It is recommended that you use Python from Anaconda.
* Download chromedriver https://chromedriver.chromium.org/downloads and extract it
into the same directory as this script.
How to Execute:
$ conda create --name springer-download
$ conda activate springer-download
(springer-download) $ conda install python==3.7
(springer-download) $ conda install beautifulsoup4 selenium pandas lxml html5lib requests tqdm
(springer-download) $ python springer-download.py [URL]
"""
import re
import os
import logging
import argparse
import tqdm
import requests
import pandas as pd
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
logger = logging.getLogger(__name__)
class WebScraper:
    """Base scraper that owns a Selenium Chrome driver.

    The chromedriver executable is located by walking the current
    working directory tree; see :meth:`_find_driver_in_cwd`.
    """

    def __init__(self):
        # May raise FileNotFoundError when no chromedriver is present.
        self.driver = self._init_driver()

    def __del__(self):
        # __init__ may have raised before ``self.driver`` was assigned,
        # so guard the attribute access to avoid a secondary AttributeError.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()

    def _init_driver(self):
        """Initialize and return a Chrome webdriver instance."""
        chromedriver_file = self._find_driver_in_cwd()
        driver = webdriver.Chrome(chromedriver_file)
        return driver

    @staticmethod
    def _find_driver_in_cwd():
        """Find the chromedriver executable under the current directory.

        Returns
        -------
        str
            Path to the first file named exactly ``chromedriver`` or
            ``chromedriver.exe``.

        Raises
        ------
        FileNotFoundError
            If no chromedriver executable is found.
        """
        for root, _directories, files in os.walk('.'):
            for filename in files:
                # fullmatch avoids false positives such as
                # 'chromedriver_win32.zip' (re.match only anchors the start).
                if re.fullmatch(r'chromedriver(\.exe)?', filename):
                    return os.path.join(root, filename)
        raise FileNotFoundError(
            'chromedriver not found under the current working directory')
class SpringerScraper(WebScraper):
    """Scrape Springer book listings and download book PDFs."""

    def get_dataframe_table(self, url):
        """Read the HTML table on *url* into pandas DataFrames.

        Parameters
        ----------
        url : str
            URL to the main page containing the table.

        Returns
        -------
        list of pandas.DataFrame
            DataFrames parsed from the first ``<table>`` on the page.
            NOTE: ``pd.read_html`` returns a *list*, not a single frame.
        """
        self.driver.get(url)
        soup = BeautifulSoup(self.driver.page_source, 'lxml')
        table = soup.find('table')
        return pd.read_html(str(table))

    def download_pdf(self, url, outpdf):
        """Download the PDF linked from the book page at *url*.

        Parameters
        ----------
        url : str
            URL to the web-page containing the PDF url.
        outpdf : str
            PDF filename to be written.
        """
        self.driver.get(url)
        # Re-use the browser session cookies so the request is authorized.
        cookies = {c['name']: c['value'] for c in self.driver.get_cookies()}
        try:
            pdfurl = (self.driver
                      .find_element_by_class_name('cta-button-container__item')
                      .find_element_by_css_selector('a')
                      .get_attribute('href'))
        except selenium.common.exceptions.NoSuchElementException as e:
            logger.warning('Unable to download "{}"'.format(url))
            logger.warning(str(e))
            return
        # timeout prevents an indefinite hang; stream avoids loading the
        # whole book into memory at once.
        response = requests.get(pdfurl, cookies=cookies, stream=True, timeout=60)
        if not response.ok:
            # Don't write an HTML error page to disk as a ".pdf".
            logger.warning('HTTP %s while downloading "%s"',
                           response.status_code, pdfurl)
            return
        with open(outpdf, 'wb') as f:
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)
def main():
_init_logger()
args = _parse_args()
mainscrap = SpringerScraper()
df = mainscrap.get_dataframe_table(args.url)
book_urls = df[0]['S'][2:-1]
book_titles = df[0]['A'][2:-1].apply(lambda x: x.lower().replace(' ', '-').replace('/', '').replace(',', '').replace('&', 'and'))
book_categories = df[0]['L'][2:-1].apply(lambda x: x.lower().replace(' ', '-').replace('/', '').replace(',', '').replace('&', 'and'))
pdfdir = os.path.join('downloads')
if not os.path.exists(pdfdir):
logger.info('Creating "{}"'.format(pdfdir))
os.mkdir(pdfdir)
for book_category in book_categories.drop_duplicates():
if not os.path.exists(os.path.join(pdfdir, book_category)):
logger.info('Creating "{}"'.format(os.path.join(pdfdir, book_category)))
os.mkdir(os.path.join(pdfdir, book_category))
for title, category, url in tqdm.tqdm(zip(book_titles, book_categories, book_urls), total=len(book_urls)):
pdfscrap = SpringerScraper()
outpdf = os.path.join(pdfdir, category, title + '.pdf')
logger.info('Downloading "{}" into "{}"'.format(url, outpdf))
pdfscrap.download_pdf(url, outpdf)
del pdfscrap
def _init_logger():
    """Configure the module logger with a detailed stream handler.

    Safe to call more than once: the handler is attached only when the
    logger has none yet, preventing duplicated log lines on repeated
    initialization.
    """
    if not logger.handlers:
        formatter = logging.Formatter(
            '%(asctime)s %(process)d:%(thread)d:%(levelname)s:'
            '%(name)s:%(lineno)d: %(message)s')
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        stream_handler.setLevel('INFO')
        logger.addHandler(stream_handler)
    logger.setLevel('DEBUG')
def _parse_args():
"""Parse Arguments from command-line.
"""
parser = argparse.ArgumentParser(
description='Download Free Springer books',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('url', type=str, help='URL to the list of books')
args = parser.parse_args()
return args
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment