Skip to content

Instantly share code, notes, and snippets.

@erincar
Created May 31, 2020 15:28
Show Gist options
  • Save erincar/428dd5b9f48c9b52ad0c8afe58587d54 to your computer and use it in GitHub Desktop.
Save erincar/428dd5b9f48c9b52ad0c8afe58587d54 to your computer and use it in GitHub Desktop.
[2020-04-29] Springer free e-books auto-download
import argparse
import os
import requests
import urllib.request
from lxml import html
from pathlib import Path
# Command-line interface: a single optional ``--start`` flag holding the
# 1-based position in the book list at which downloading should begin.
# It defaults to 1 (the first book) when the flag is not given; because
# nargs="?" is used without a const, a bare ``--start`` yields None.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--start", nargs="?", default=1, type=int, help="starting index for the list"
)
args = parser.parse_args()
# Page that lists the free Springer books; the first column of its table
# links to each book's landing page.
page_url = 'https://www.thebiomics.com/notes/springer-free-e-books-list.html'

# Browser-like User-Agent sent with every request made through `requests`.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:76.0) Gecko/20100101 Firefox/76.0',
}

# Without a timeout a stalled server would block the script forever, and
# without a status check an HTTP error page would be parsed as if it were
# the listing, silently producing an empty URL list.
page_response = requests.get(page_url, headers=headers, timeout=30)
page_response.raise_for_status()
page_tree = html.fromstring(page_response.content)

# Prefix for the relative PDF links found on the book pages.
base_springer = "https://link.springer.com"

# Books are saved to ~/Downloads/Springer.
download_path = os.path.join(str(Path.home()), "Downloads", "Springer")

# --start is 1-based. A bare ``--start`` gives None and values below 1 would
# turn into a negative slice offset (keeping only the tail of the list), so
# both cases are treated as "start from the first book".
urls = page_tree.xpath(
    '//table[@class="table data-table"]/tbody/tr/td[position()=1]/a/@href'
)[max((args.start or 1) - 1, 0):]

# amount: number of books to process; s: running count of skipped books.
amount, s = len(urls), 0
# urlretrieve() does not create missing directories; without this every
# download fails (and is silently skipped) when the target folder does not
# exist yet.
os.makedirs(download_path, exist_ok=True)

# Visit each book page, locate its PDF link, and save the file as
# "<authors> - <title>.pdf". A failure on one book is reported and counted
# in `s`, then the loop moves on to the next book.
for i, url in enumerate(urls):
    try:
        # Timeout so that one unresponsive page cannot stall the whole run.
        response = requests.get(url, headers=headers, timeout=60)
        response.raise_for_status()
        tree = html.fromstring(response.content)

        pdf_links = tree.xpath('//a[@data-track-action="Book download - pdf"]/@href')
        titles = tree.xpath('//div[@data-test="book-title"]/h1/text()')
        if not pdf_links or not titles:
            # Say what is missing instead of surfacing a bare IndexError.
            raise LookupError("PDF link or book title not found on page")

        # The link is relative and has URL-encoded slashes.
        pdf = pdf_links[0].replace("%2F", "/")
        download_url = base_springer + pdf
        authors = tree.xpath('//div[@class="persons__list"]/ul/li/span/text()')
        bookname = titles[0]
        filename = ", ".join(authors) + " - " + bookname + ".pdf"
        print(download_url, "->", filename)

        # Non-breaking spaces become plain spaces; path separators inside a
        # title or author name would otherwise point into a subdirectory
        # that does not exist.
        safe_name = filename.replace("\xa0", " ")
        for separator in filter(None, (os.sep, os.altsep)):
            safe_name = safe_name.replace(separator, "-")

        urllib.request.urlretrieve(
            download_url,
            os.path.join(download_path, safe_name),
        )
        print(f"Downloaded {i+1-s}/{amount}, skipped {s}")
    except Exception as error:
        # Best-effort: keep going, but report which book failed and why.
        print(f"Error during download, skipping {url}: {error!r}")
        s += 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment