Last active
June 7, 2022 14:47
-
-
Save InNoobWeTrust/f066582f5553142d7074e63deffc465b to your computer and use it in GitHub Desktop.
Script to download free books on Springer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
from bs4 import BeautifulSoup | |
import json | |
import logging | |
from multiprocessing.pool import ThreadPool | |
from pathlib import Path | |
import re | |
import requests | |
from slugify import slugify | |
import sys | |
from urllib.parse import urljoin | |
from urllib.request import urlopen | |
# Everything (logs, failure list, downloaded books) is placed relative to the
# directory the script is started from.
dir_path = Path.cwd()
pages = None

# Root logger: DEBUG and above goes both to progress.log and to stdout.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
output_file_handler = logging.FileHandler("progress.log")
stdout_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(output_file_handler)
logger.addHandler(stdout_handler)

# One target directory per book format; the keys double as file extensions.
download_dirs = {
    "pdf": dir_path / "books" / "pdf",
    "epub": dir_path / "books" / "epub",
}

for d in download_dirs.values():
    try:
        # exist_ok makes re-runs a no-op instead of relying on a bare except.
        d.mkdir(parents=True, exist_ok=True)
    except OSError as err:
        # Stay best-effort (do not abort start-up), but make the reason
        # visible instead of silently swallowing it.
        logger.warning(f"Cannot create download directory {d}: {err}")
def get(link):
    """Fetch *link* and return its body parsed as a BeautifulSoup document.

    Args:
        link: URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        Whatever ``urlopen`` raises for an unreachable or invalid URL.
    """
    logger.debug(f"link: {link}")
    # Parse inside the ``with`` so the body is read before the connection is
    # closed; the original never closed the response.
    with urlopen(link) as html:
        return BeautifulSoup(html, "html.parser")
def pdf(book_doc, base_url):
    """Return the absolute PDF download URL found in *book_doc*, or None.

    Args:
        book_doc: Parsed page exposing ``select(css_selector)``.
        base_url: URL the page was loaded from; relative hrefs are resolved
            against it.

    Returns:
        The href of the first ``a.test-bookpdf-link`` anchor joined with
        *base_url*, or ``None`` when the page has no such anchor.
    """
    anchors = book_doc.select("a.test-bookpdf-link")
    return urljoin(base_url, anchors[0]["href"]) if len(anchors) else None
def epub(book_doc, base_url):
    """Return the absolute EPUB download URL found in *book_doc*, or None.

    Args:
        book_doc: Parsed page exposing ``select(css_selector)``.
        base_url: URL the page was loaded from; relative hrefs are resolved
            against it.

    Returns:
        The href of the first ``a.test-bookepub-link`` anchor joined with
        *base_url*, or ``None`` when the page has no such anchor.
    """
    matches = book_doc.select("a.test-bookepub-link")
    if len(matches) == 0:
        return None
    first_anchor = matches[0]
    return urljoin(base_url, first_anchor["href"])
def book_download_links(book_link):
    """Load a book page and collect its download link for each format.

    Args:
        book_link: URL of the book's page.

    Returns:
        A 2-tuple ``(("pdf", pdf_link), ("epub", epub_link))``; a link is
        ``None`` when the page offers no download in that format.
    """
    page = get(book_link)
    pdf_link, epub_link = pdf(page, book_link), epub(page, book_link)
    for message in (f"pdf link: {pdf_link}", f"epub link: {epub_link}"):
        logger.debug(message)
    return (("pdf", pdf_link), ("epub", epub_link))
def _file_name_from_headers(headers, default_file_name):
    """Pick a safe file name from a Content-Disposition header, else the default."""
    content_disposition = headers.get("content-disposition")
    if not content_disposition:
        return default_file_name
    # Accept both filename=foo.pdf and filename="foo.pdf"; stop at the closing
    # quote or at the next ';' parameter.
    match = re.search(r'filename="?([^";]+)', content_disposition)
    if match is None:
        return default_file_name
    # The header is server-controlled: keep only the final path component so
    # the file cannot be written outside the download directory.
    candidate = Path(match.group(1).strip().replace("\\", "/")).name
    if candidate in ("", ".", ".."):
        return default_file_name
    return candidate


def download(book_variant):
    """Download one format of one book, recording failures in failed.txt.

    Args:
        book_variant: Tuple ``(book_title, link, download_dir,
            default_file_name)``. ``download_dir`` is a ``Path``;
            ``default_file_name`` is used when the server does not supply a
            usable file name.

    Returns:
        None. Any failure (bad link, HTTP error status, I/O error) is logged
        and appended to ``failed.txt`` instead of being raised.
    """
    book_title, link, download_dir, default_file_name = book_variant
    try:
        # The timeout keeps a stalled server from blocking a worker forever;
        # ``with`` returns the connection to the pool even on errors.
        with requests.get(link, allow_redirects=True, stream=True,
                          timeout=60) as response:
            # Without this an HTML error page would be saved as the book.
            response.raise_for_status()
            file_name = _file_name_from_headers(response.headers,
                                                default_file_name)
            file_path = download_dir / file_name
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
    except Exception:
        # ``Exception`` rather than a bare except, so Ctrl-C and SystemExit
        # are not mistaken for download failures.
        logger.exception(f"Cannot download {book_title}")
        with open("failed.txt", "a+", encoding="utf-8") as f:
            print(f"{book_title}: {link}", file=f)
def download_book(book):
    """Download every available format of a single book.

    Args:
        book: Mapping with at least the keys ``'Book Title'`` and
            ``'OpenURL'`` (the URL of the book's page).

    Returns:
        None. A failure to read the book page is logged and appended to
        ``failed.txt``; formats the page does not offer are skipped.
    """
    book_title = book['Book Title']
    logger.info(f"Downloading: {book_title}...")
    open_url = book['OpenURL']
    try:
        variant_links = book_download_links(open_url)
    except Exception:
        # Record the failure here: the caller may discard worker exceptions,
        # in which case this book would otherwise disappear without a trace.
        logger.exception(f"Cannot fetch download links for {book_title}")
        with open("failed.txt", "a+", encoding="utf-8") as f:
            print(f"{book_title}: {open_url}", file=f)
        return

    base_name = slugify(book_title)
    # Use the link scraped from the page (the original indexed ``book`` with
    # the format name and ignored it) and drop formats with no link.
    book_variants = [
        (book_title, link, download_dirs[variant], f"{base_name}.{variant}")
        for variant, link in variant_links
        if link is not None
    ]
    if not book_variants:
        logger.warning(f"No download links found for {book_title}")
        return

    pool = ThreadPool(2)
    try:
        # map() blocks until every variant is done and re-raises anything a
        # worker raised, instead of dropping it with an unused iterator.
        pool.map(download, book_variants)
    finally:
        pool.close()
        pool.join()
# Data source: https://www.springernature.com/gp/librarians/news-events/all-news-articles/industry-news-initiatives/free-access-to-textbooks-for-institutions-affected-by-coronaviru/17855960 | |
# Load the book list; read it as UTF-8 explicitly so non-ASCII titles do not
# depend on the platform's locale encoding.
with open("books.json", "r", encoding="utf-8") as book_json:
    books = json.load(book_json)
logger.info(f"Total {len(books)} books")

# Download up to four books at a time.
pool = ThreadPool(4)
results = pool.imap_unordered(download_book, books)
pool.close()
# Drain the iterator: an exception raised inside a worker only surfaces when
# its result is fetched, so an unconsumed iterator hides every failure.
while True:
    try:
        next(results)
    except StopIteration:
        break
    except Exception:
        # Keep going with the remaining books; just make the failure visible.
        logger.exception("A book download task failed")
pool.join()
logger.info("Download finished!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[build-system] | |
build-backend = "poetry.masonry.api" | |
requires = ["poetry>=0.12"] | |
[tool] | |
[tool.poetry] | |
authors = ["InNoobWeTrust"] | |
description = "Script to download free books on Springer" | |
name = "download-springer-books" | |
version = "0.1.0" | |
[tool.poetry.dependencies] | |
bs4 = "^0.0.1" | |
python = "^3.7" | |
python-slugify = "^4.0.0" | |
requests = "^2.23.0" | |
[tool.poetry.dev-dependencies] | |
pylama = "^7.7.1" | |
yapf = "^0.30.0" | |
rope = "^0.17.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Install the project's dependencies, then start the downloader in the
# background. Its output goes to nohup.log and the PID of the background job
# is written to process.pid.

# Do not launch the downloader against a broken environment.
poetry install || exit 1

nohup poetry run python3 -u ./download_springer_books.py > nohup.log 2>&1 &
echo "$!" > process.pid
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Kill the process whose PID is stored in process.pid.

if [[ ! -f process.pid ]]; then
    echo "process.pid not found; nothing to kill" >&2
    exit 1
fi

# Quote the substitution so an empty or malformed file cannot turn into a
# bare `kill -9`; drop the pid file only once the kill succeeded.
kill -9 "$(cat process.pid)" && rm -f process.pid
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment