Last active
June 16, 2020 18:18
-
-
Save s4yed/20e4d230cd82c4fd6e9632fe9c55602e to your computer and use it in GitHub Desktop.
Springer Machine Learning and Data Science Books.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
*------------------------------------------------------------- | |
* Copyright (c) Ahmed Sayed. All rights reserved. | |
* Licensed under the MIT License. | |
* Title: SpringerBooks.py | |
* Date: 2020-05-01 11:33:21 | |
*------------------------------------------------------------- | |
Execute the following commands to install the dependencies before running the script: | |
pip install beautifulsoup4 | |
pip install requests | |
''' | |
from bs4 import BeautifulSoup | |
import requests | |
import os | |
# ANSI escape sequences for coloured terminal output.
GREEN = '\33[32m'
YELLOW = '\33[33m'
# Article that lists the free Springer titles; scraped for link.springer URLs.
url = 'https://techgrabyte.com/springer-released-65-machine-learning-data-science-books-free/'
# Base URL used to resolve the relative PDF download links found on each book page.
springer_url = 'https://link.springer.com/'
# Folder the downloaded PDFs are saved into (created by the script below).
dir_name = 'Machine Learning and Data Science Books'
def find_links(url, text='', title=False):
    """Fetch *url* and return the set of anchor hrefs containing *text*.

    Parameters
    ----------
    url : str
        Page to download and scan for ``<a>`` tags.
    text : str
        Substring an href must contain to be kept ('' keeps all hrefs).
    title : bool
        When True, each entry is a ``(href, page_title)`` tuple, with the
        ' | SpringerLink' suffix stripped from the page title.

    Returns
    -------
    set
        Matching hrefs, or ``(href, title)`` tuples when *title* is True.
    """
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "html.parser")
    links = set()
    for link in soup.find_all('a'):
        href = link.get('href')
        # <a> tags without an href attribute yield None; guard so the
        # 'in' membership test below cannot raise TypeError.
        if href and text in href:
            if title:
                links.add((href, soup.title.string.replace(' | SpringerLink', '').strip()))
            else:
                links.add(href)
    return links
# --- Collect every link.springer.com URL from the listing article ---------
springer_links = find_links(url, 'link.springer')
books = []
print(YELLOW + '[!] Getting all links ...')
for link in springer_links:
    # Listing URLs end in '...?isbn=<ISBN>'; the book page hosts a PDF
    # whose filename contains that ISBN.
    isbn = link.split('isbn=')[-1]
    book = find_links(link, isbn + '.pdf', True)
    if book:
        books.append(book)
print(GREEN + '[+] Done')

# exist_ok=True so a second run does not crash on the existing folder
# (os.mkdir would raise FileExistsError).
os.makedirs(dir_name, exist_ok=True)

# --- Download each (relative_pdf_link, title) pair ------------------------
for book in books:
    book_link, book_name = next(iter(book))
    print(YELLOW + f'[!] Fetching {book_name} book ...')
    req = requests.get(springer_url + book_link, stream=True)
    # Replace spaces for a tidy name and strip '/' which would be treated
    # as a path separator and break open().
    file_name = book_name.replace(' ', '-').replace('/', '-')
    with open(f'{dir_name}/{file_name}.pdf', 'wb') as f:
        # iter_content decodes the transfer/content encoding and streams in
        # chunks; req.raw.read() would return the raw (possibly gzipped)
        # stream and hold the whole file in memory at once.
        for chunk in req.iter_content(chunk_size=8192):
            f.write(chunk)
print(GREEN + '[+] Done')
I already ran it, but somehow the script prints some of the links without downloading anything, and only starts downloading after those links.
Hello there, the script is cool. Could you advise me on how to write a similar script, or adapt this one, to download PDF and DOCX files from a different website? I am new to Python and wanted to automate such a thing as you did. Thanks in advance.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Run both and observe the difference, it's your call after all :D