Created
April 30, 2020 19:34
-
-
Save martisak/127a678552b3cb30247cd6648433ac73 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup
# Root directory under which one sub-directory per 'Copyright Year' is created.
BOOKS_DIR = "./books"
# Spreadsheet listing the books; read relative to the current working directory.
CATALOG_PATH = r'Free+English+textbooks.xlsx'
# Host that the PDF links found on a book page are resolved against.
SPRINGER_BASE_URL = "https://link.springer.com"
# CSS class of the anchor element that carries the PDF download link.
PDF_LINK_CLASS = 'test-bookpdf-link'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 60


def group_directory_name(group_name):
    """Return the sub-directory name used for one 'Copyright Year' group."""
    return f"{group_name}".replace("; ", "-").replace(" ", "_").replace("/", "_")


def book_filename(year, title):
    """Return ``<year>_<title>.pdf`` as a single, valid path component.

    Path separators inside the title are replaced with ``_``; otherwise a
    title such as "A/B" would be interpreted as a nested directory that does
    not exist and the write would fail.
    """
    name = f"{year}_{title}.pdf"
    return name.replace("/", "_").replace("\\", "_")


def find_pdf_url(html):
    """Return the absolute PDF URL linked from a book page, or None.

    ``html`` is the raw page content. None is returned when the page has no
    anchor with the expected class or the anchor has no ``href``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    link = soup.find('a', attrs={'class': PDF_LINK_CLASS})
    if link is None or not link.get("href"):
        return None
    return urljoin(SPRINGER_BASE_URL, link["href"])


def download_book(url, target_path):
    """Download the PDF linked from the book page at ``url`` to ``target_path``.

    Returns True when the PDF was written and False when the page contained
    no PDF link. Raises ``requests.RequestException`` on network failures,
    timeouts and HTTP error statuses, so an error page is never saved as a PDF.
    """
    headers = requests.utils.default_headers()
    # ``headers`` must be passed by keyword: the second positional argument
    # of requests.get is ``params`` (the query string).
    page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()

    pdf_url = find_pdf_url(page.content)
    if pdf_url is None:
        return False

    pdf = requests.get(pdf_url, allow_redirects=True, timeout=REQUEST_TIMEOUT)
    pdf.raise_for_status()
    with open(target_path, 'wb') as pdf_file:
        pdf_file.write(pdf.content)
    return True


def main():
    """Download every book listed in the catalog, grouped by copyright year.

    A book whose download fails or whose page has no PDF link is reported
    and skipped so the remaining books are still processed.
    """
    df = pd.read_excel(CATALOG_PATH)

    # Iterate over each group of books sharing a copyright year.
    for group_name, df_group in df.groupby('Copyright Year'):
        directory = group_directory_name(group_name)
        print(directory)
        Path(os.path.join(BOOKS_DIR, directory)).mkdir(parents=True, exist_ok=True)

        for _, row in df_group.iterrows():
            filename = book_filename(row['Copyright Year'], row['Book Title'])
            print("\t" + filename)
            target_path = os.path.join(BOOKS_DIR, directory, filename)
            try:
                saved = download_book(row['OpenURL'], target_path)
            except requests.RequestException as err:
                print(f"\t\tdownload failed: {err}")
                continue
            if not saved:
                print("\t\tno PDF link found, skipping")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment