Skip to content

Instantly share code, notes, and snippets.

@nikAizuddin
Created April 10, 2020 07:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nikAizuddin/7a3f1f0c0cb8ab430bf3643f56f48983 to your computer and use it in GitHub Desktop.
Save nikAizuddin/7a3f1f0c0cb8ab430bf3643f56f48983 to your computer and use it in GitHub Desktop.
"""Download Free Springer books.
Requirements:
* Make sure you have Python 3.7. It is recommended that you use Python from Anaconda.
* Download chromedriver https://chromedriver.chromium.org/downloads and extract it
into the same directory as this script.
How to Execute:
$ conda create --name springer-download
$ conda activate springer-download
(springer-download) $ conda install python==3.7
(springer-download) $ conda install beautifulsoup4 selenium pandas lxml html5lib requests tqdm
(springer-download) $ python springer-download.py [URL]
"""
import re
import os
import logging
import argparse
import tqdm
import requests
import pandas as pd
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
logger = logging.getLogger(__name__)
class WebScraper:
    """Base scraper that owns a Selenium Chrome driver.

    The chromedriver executable is located by walking the current
    working directory tree; see :meth:`_find_driver_in_cwd`.
    """

    def __init__(self):
        # May raise FileNotFoundError when no chromedriver is present.
        self.driver = self._init_driver()

    def __del__(self):
        # __init__ may have raised before ``self.driver`` was assigned,
        # so guard the attribute access to avoid a secondary AttributeError.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()

    def _init_driver(self):
        """Initialize and return a Chrome webdriver instance."""
        chromedriver_file = self._find_driver_in_cwd()
        driver = webdriver.Chrome(chromedriver_file)
        return driver

    @staticmethod
    def _find_driver_in_cwd():
        """Find the chromedriver executable under the current directory.

        Returns
        -------
        str
            Path to the first file named exactly ``chromedriver`` or
            ``chromedriver.exe``.

        Raises
        ------
        FileNotFoundError
            If no chromedriver executable is found.
        """
        for root, _directories, files in os.walk('.'):
            for filename in files:
                # fullmatch avoids false positives such as
                # 'chromedriver_win32.zip' (re.match only anchors the start).
                if re.fullmatch(r'chromedriver(\.exe)?', filename):
                    return os.path.join(root, filename)
        raise FileNotFoundError(
            'chromedriver not found under the current working directory')
class SpringerScraper(WebScraper):
    """Scrape Springer book listings and download book PDFs."""

    def get_dataframe_table(self, url):
        """Read the HTML table on *url* into pandas DataFrames.

        Parameters
        ----------
        url : str
            URL to the main page containing the table.

        Returns
        -------
        list of pandas.DataFrame
            DataFrames parsed from the first ``<table>`` on the page.
            NOTE: ``pd.read_html`` returns a *list*, not a single frame.
        """
        self.driver.get(url)
        soup = BeautifulSoup(self.driver.page_source, 'lxml')
        table = soup.find('table')
        return pd.read_html(str(table))

    def download_pdf(self, url, outpdf):
        """Download the PDF linked from the book page at *url*.

        Parameters
        ----------
        url : str
            URL to the web-page containing the PDF url.
        outpdf : str
            PDF filename to be written.
        """
        self.driver.get(url)
        # Re-use the browser session cookies so the request is authorized.
        cookies = {c['name']: c['value'] for c in self.driver.get_cookies()}
        try:
            pdfurl = (self.driver
                      .find_element_by_class_name('cta-button-container__item')
                      .find_element_by_css_selector('a')
                      .get_attribute('href'))
        except selenium.common.exceptions.NoSuchElementException as e:
            logger.warning('Unable to download "{}"'.format(url))
            logger.warning(str(e))
            return
        # timeout prevents an indefinite hang; stream avoids loading the
        # whole book into memory at once.
        response = requests.get(pdfurl, cookies=cookies, stream=True, timeout=60)
        if not response.ok:
            # Don't write an HTML error page to disk as a ".pdf".
            logger.warning('HTTP %s while downloading "%s"',
                           response.status_code, pdfurl)
            return
        with open(outpdf, 'wb') as f:
            for chunk in response.iter_content(chunk_size=65536):
                f.write(chunk)
def main():
_init_logger()
args = _parse_args()
mainscrap = SpringerScraper()
df = mainscrap.get_dataframe_table(args.url)
book_urls = df[0]['S'][2:-1]
book_titles = df[0]['A'][2:-1].apply(lambda x: x.lower().replace(' ', '-').replace('/', '').replace(',', '').replace('&', 'and'))
book_categories = df[0]['L'][2:-1].apply(lambda x: x.lower().replace(' ', '-').replace('/', '').replace(',', '').replace('&', 'and'))
pdfdir = os.path.join('downloads')
if not os.path.exists(pdfdir):
logger.info('Creating "{}"'.format(pdfdir))
os.mkdir(pdfdir)
for book_category in book_categories.drop_duplicates():
if not os.path.exists(os.path.join(pdfdir, book_category)):
logger.info('Creating "{}"'.format(os.path.join(pdfdir, book_category)))
os.mkdir(os.path.join(pdfdir, book_category))
for title, category, url in tqdm.tqdm(zip(book_titles, book_categories, book_urls), total=len(book_urls)):
pdfscrap = SpringerScraper()
outpdf = os.path.join(pdfdir, category, title + '.pdf')
logger.info('Downloading "{}" into "{}"'.format(url, outpdf))
pdfscrap.download_pdf(url, outpdf)
del pdfscrap
def _init_logger():
    """Configure the module logger with a detailed stream handler.

    Safe to call more than once: the handler is attached only when the
    logger has none yet, preventing duplicated log lines on repeated
    initialization.
    """
    if not logger.handlers:
        formatter = logging.Formatter(
            '%(asctime)s %(process)d:%(thread)d:%(levelname)s:'
            '%(name)s:%(lineno)d: %(message)s')
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        stream_handler.setLevel('INFO')
        logger.addHandler(stream_handler)
    logger.setLevel('DEBUG')
def _parse_args():
"""Parse Arguments from command-line.
"""
parser = argparse.ArgumentParser(
description='Download Free Springer books',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('url', type=str, help='URL to the list of books')
args = parser.parse_args()
return args
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment