Skip to content

Instantly share code, notes, and snippets.

@InNoobWeTrust
Last active June 7, 2022 14:47
Show Gist options
  • Save InNoobWeTrust/f066582f5553142d7074e63deffc465b to your computer and use it in GitHub Desktop.
Save InNoobWeTrust/f066582f5553142d7074e63deffc465b to your computer and use it in GitHub Desktop.
Script to download free books on Springer
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import json
import logging
from multiprocessing.pool import ThreadPool
from pathlib import Path
import re
import requests
from slugify import slugify
import sys
from urllib.parse import urljoin
from urllib.request import urlopen
# Everything is stored relative to the directory the script is started from.
dir_path = Path.cwd()
pages = None  # NOTE(review): not referenced anywhere in the visible code.

# Root logger at DEBUG level, mirrored to both progress.log and stdout.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
output_file_handler = logging.FileHandler("progress.log")
stdout_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(output_file_handler)
logger.addHandler(stdout_handler)

# One target directory per downloadable book format.
download_dirs = {
    "pdf": dir_path / "books" / "pdf",
    "epub": dir_path / "books" / "epub",
}
for d in download_dirs.values():
    # exist_ok tolerates reruns; real failures (e.g. permissions) surface
    # here instead of being swallowed and failing every download later.
    d.mkdir(parents=True, exist_ok=True)
def get(link):
    """Fetch ``link`` and return its body parsed with BeautifulSoup.

    Args:
        link: URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend.

    Raises:
        Whatever ``urlopen`` raises when the request fails.
    """
    logger.debug(f"link: {link}")
    # Close the HTTP response once it has been parsed so that worker threads
    # do not accumulate open connections.
    with urlopen(link) as html:
        return BeautifulSoup(html, "html.parser")
def pdf(book_doc, base_url):
    """Return the absolute PDF download URL found in ``book_doc``, or None.

    The first element matching the CSS selector ``a.test-bookpdf-link`` is
    used; its ``href`` is resolved against ``base_url``.
    """
    anchors = book_doc.select("a.test-bookpdf-link")
    if anchors:
        first_anchor = anchors[0]
        return urljoin(base_url, first_anchor["href"])
    return None
def epub(book_doc, base_url):
    """Return the absolute EPUB download URL found in ``book_doc``, or None.

    The first element matching the CSS selector ``a.test-bookepub-link`` is
    used; its ``href`` is resolved against ``base_url``.
    """
    matches = book_doc.select("a.test-bookepub-link")
    # Guard clause: no matching anchor means no EPUB link on this page.
    if len(matches) == 0:
        return None
    href = matches[0]["href"]
    return urljoin(base_url, href)
def book_download_links(book_link):
    """Resolve the download links offered on a book's landing page.

    Args:
        book_link: URL of the book page; also used as the base for
            resolving relative links.

    Returns:
        A pair ``(("pdf", pdf_link), ("epub", epub_link))`` in which a link
        is None when the page has no matching anchor.
    """
    page = get(book_link)
    # Evaluated in order: the PDF lookup first, then the EPUB lookup.
    pdf_link, epub_link = (finder(page, book_link) for finder in (pdf, epub))
    logger.debug(f"pdf link: {pdf_link}")
    logger.debug(f"epub link: {epub_link}")
    variants = zip(("pdf", "epub"), (pdf_link, epub_link))
    return tuple(variants)
def download(book_variant):
    """Download one file of a book and store it in its target directory.

    Args:
        book_variant: 4-tuple ``(book_title, link, download_dir,
            default_file_name)``. ``download_dir`` is combined with the file
            name using ``/``; ``default_file_name`` is used when the server
            does not supply a usable name.

    The file name is taken from the ``Content-Disposition`` response header
    when it carries a ``filename=`` parameter. Failures never propagate:
    they are logged and the ``title: link`` pair is appended to
    ``failed.txt``.
    """
    book_title, link, download_dir, default_file_name = book_variant
    try:
        # stream=True avoids loading the whole file into memory; the `with`
        # block releases the connection even when writing fails.
        with requests.get(link, allow_redirects=True,
                          stream=True) as response:
            # Without this check an HTTP error page would be written to disk
            # as if it were the book.
            response.raise_for_status()
            file_name = default_file_name
            # requests exposes headers case-insensitively.
            content_disposition = response.headers.get(
                "content-disposition", "")
            # Capture the value without surrounding quotes or any trailing
            # "; param=..." section.
            match = re.search(r'filename="?([^";]+)', content_disposition)
            if match:
                # Keep only the last path component so a server-supplied
                # name cannot escape download_dir.
                file_name = (Path(match.group(1).strip()).name
                             or default_file_name)
            file_path = download_dir / file_name
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # through; logger.exception records the traceback for diagnosis.
        logger.exception(f"Cannot download {book_title}")
        with open("failed.txt", "a+") as f:
            print(f"{book_title}: {link}", file=f)
def download_book(book):
    """Download every available format of one book.

    Args:
        book: Mapping with at least the keys ``'Book Title'`` and
            ``'OpenURL'`` (the book's landing page).

    The landing page is scraped for PDF/EPUB links and each link found is
    downloaded on its own thread. If the landing page cannot be resolved,
    the error is logged and the book is appended to ``failed.txt``.
    """
    book_title = book['Book Title']
    logger.info(f"Downloading: {book_title}...")
    try:
        variant_links = book_download_links(book['OpenURL'])
    except Exception:
        logger.exception(f"Cannot resolve download links for {book_title}")
        with open("failed.txt", "a+") as f:
            print(f"{book_title}: {book['OpenURL']}", file=f)
        return
    # Use the link scraped from the page (not a lookup in the book record)
    # and skip formats the page does not offer.
    book_variants = [(book_title, link, download_dirs[variant],
                      slugify(book_title) + '.' + variant)
                     for variant, link in variant_links
                     if link is not None]
    if not book_variants:
        logger.warning(f"No download links found for {book_title}")
        return
    pool = ThreadPool(2)
    try:
        # map() blocks until both downloads finish and re-raises worker
        # errors instead of dropping them.
        pool.map(download, book_variants)
    finally:
        pool.close()
        pool.join()
# Data source: https://www.springernature.com/gp/librarians/news-events/all-news-articles/industry-news-initiatives/free-access-to-textbooks-for-institutions-affected-by-coronaviru/17855960
# JSON is UTF-8 by specification; do not rely on the locale's default.
with open("books.json", "r", encoding="utf-8") as book_json:
    books = json.load(book_json)
logger.info(f"Total {len(books)} books")
pool = ThreadPool(4)
results = pool.imap_unordered(download_book, books)
pool.close()
# Drain the result iterator: a worker's exception is only re-raised when its
# result is fetched, so leaving it unconsumed would hide every failure.
failed_books = 0
while True:
    try:
        next(results)
    except StopIteration:
        break
    except Exception:
        # Keep going: one broken book must not abort the remaining ones.
        failed_books += 1
        logger.exception("A book could not be processed")
pool.join()
if failed_books:
    logger.warning(f"{failed_books} books could not be processed")
logger.info("Download finished!")
# Build with the standalone poetry-core backend; referencing the full
# `poetry` package as the build backend is deprecated.
[build-system]
build-backend = "poetry.core.masonry.api"
requires = ["poetry-core>=1.0.0"]

[tool]
[tool.poetry]
authors = ["InNoobWeTrust"]
description = "Script to download free books on Springer"
name = "download-springer-books"
version = "0.1.0"

# Runtime dependencies of the downloader script.
[tool.poetry.dependencies]
# `bs4` on PyPI is only a placeholder that pulls in beautifulsoup4; depend
# on the real distribution (it still provides the `bs4` import name).
beautifulsoup4 = "^4.9.0"
python = "^3.7"
python-slugify = "^4.0.0"
requests = "^2.23.0"

# Linting / formatting / refactoring tools used during development only.
[tool.poetry.dev-dependencies]
pylama = "^7.7.1"
yapf = "^0.30.0"
rope = "^0.17.0"
#!/usr/bin/env bash
# Install the dependencies, then run the downloader detached in the background.

# Stop here when installation fails rather than launching a run that cannot work.
poetry install || exit 1

# -u keeps Python's output unbuffered so nohup.log can be followed live;
# stdout and stderr both go to nohup.log.
nohup poetry run python3 -u ./download_springer_books.py > nohup.log 2>&1 &

# Remember the background job's PID so it can be stopped later.
echo "$!" > process.pid
#!/usr/bin/env bash
# Force-stop the process whose PID is stored in process.pid.

# Without a PID file `kill` would be called with no target and print a
# confusing usage error; report the real problem instead.
if [[ ! -s process.pid ]]; then
    echo "process.pid is missing or empty; nothing to stop" >&2
    exit 1
fi

kill -9 "$(cat process.pid)"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment