Last active
April 14, 2021 03:47
-
-
Save khaledkee/7bd7183842d2a84daaeb9a40461b935e to your computer and use it in GitHub Desktop.
Springer Free eBooks collector
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
from urllib.parse import urljoin
from urllib.request import urlopen

import tqdm
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileReader
# --- constants -------------------------------------------------------------

# Catalogue PDF that is parsed for book titles and landing-page URLs.
CATALOGUE_PDF = 'Springer Ebooks.pdf'
# NOTE(review): presumably the expected number of catalogue entries; the
# script never reads this value - confirm before relying on it.
BOOKS_NO = 408
# A line starting with this prefix is treated as a book's landing-page URL.
URL_PREFIX = "http"

BOOKS_DIR = "books"
PDF_EXT = ".pdf"
PDF_LINK_ATTRIB = {'title': 'Download this book in PDF format'}
ANCHOR_TAG = 'a'
ANCHOR_LINK_ATTRIB = 'href'
SPRINGER_PATH = 'https://link.springer.com'


def collect_page_books(lines, book_index):
    """Extract ``(title, url)`` pairs from the text lines of one page.

    A book entry is recognised as a line holding exactly the decimal number
    ``book_index``; the line right after it is taken as the title, and the
    first following line that starts with ``URL_PREFIX`` as the URL.

    Args:
        lines: Text lines of a single page, in reading order.
        book_index: Running number of the next book to look for.

    Returns:
        A tuple ``(entries, next_book_index)`` where ``entries`` is a list
        of ``(title, url)`` tuples found on this page.
    """
    entries = []
    total = len(lines)
    pos = 0
    while pos < total:
        # Advance to the line holding the expected running number.
        # isdecimal() (not isnumeric()) guarantees that int() cannot raise
        # on characters such as superscripts or fractions.
        while pos + 1 < total and not (
                lines[pos].isdecimal() and int(lines[pos]) == book_index):
            pos += 1
        if pos + 1 >= total:
            # No further entry (or no room for a title line) on this page.
            break
        title = lines[pos + 1]

        # Advance to the URL line, never running past the end of the page.
        while pos < total and not lines[pos].startswith(URL_PREFIX):
            pos += 1
        book_index += 1
        if pos >= total:
            # No URL on this page for this entry: skip the book rather than
            # crash, so that later books are still collected.
            break
        entries.append((title, lines[pos]))
    return entries, book_index


def collect_book_urls(pdf_path=CATALOGUE_PDF):
    """Build the ``title -> url`` dictionary from the catalogue PDF.

    Entries with the same title overwrite each other (last one wins).
    """
    # NOTE(review): PdfFileReader / getNumPages / getPage / extractText are
    # the legacy PyPDF2 names; newer PyPDF2 releases rename them - confirm
    # which version is installed.
    urls = dict()
    book_index = 1
    with open(pdf_path, 'rb') as pdf_file:
        reader = PdfFileReader(pdf_file)
        for page_no in range(reader.getNumPages()):
            lines = reader.getPage(page_no).extractText().split('\n')
            entries, book_index = collect_page_books(lines, book_index)
            urls.update(entries)
    return urls


def book_file_path(title, directory=BOOKS_DIR):
    """Return the path a book with the given title is saved to."""
    # Replace "/" so the title is not interpreted as a directory path.
    file_name = title.replace("/", ",").strip()
    return os.path.join(directory, file_name + PDF_EXT)


def pdf_download_url(href):
    """Resolve a PDF anchor's ``href`` against ``SPRINGER_PATH``.

    urljoin copes with site-relative, document-relative and absolute hrefs,
    which plain string concatenation does not.
    """
    return urljoin(SPRINGER_PATH, href)


def download_book(title, url):
    """Download the PDF linked from a book's landing page, unless present."""
    # BeautifulSoup reads the whole response while constructing the tree,
    # so the connection can be closed right afterwards.
    with urlopen(url) as html_page:
        soup = BeautifulSoup(html_page, 'html.parser')

    target = book_file_path(title)
    for link in soup.find_all(ANCHOR_TAG, attrs=PDF_LINK_ATTRIB):
        href = link.get(ANCHOR_LINK_ATTRIB)
        if not href:
            # Anchor without a usable target: nothing to download.
            continue
        if os.path.exists(target):
            # Already downloaded (by an earlier run or an earlier anchor).
            continue
        with urlopen(pdf_download_url(href)) as file_resp:
            file_resp_bin = file_resp.read()
        with open(target, "wb") as binfile:
            binfile.write(file_resp_bin)


def main():
    """Parse the catalogue PDF and download every listed book."""
    urls = collect_book_urls()
    os.makedirs(BOOKS_DIR, exist_ok=True)
    for title, url in tqdm.tqdm(urls.items()):
        download_book(title, url)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a Python script that downloads the free Springer eBooks from the list (as it was at the time the script was written).
Caution: the books come from different fields