Skip to content

Instantly share code, notes, and snippets.

@martisak
Created April 30, 2020 19:34
Show Gist options
  • Save martisak/127a678552b3cb30247cd6648433ac73 to your computer and use it in GitHub Desktop.
Save martisak/127a678552b3cb30247cd6648433ac73 to your computer and use it in GitHub Desktop.
import os
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests

try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    from bs4 import BeautifulSoup
# Download every book listed in the spreadsheet into ./books/<copyright year>/.
#
# The spreadsheet is read for the columns 'Copyright Year', 'OpenURL' and
# 'Book Title'. For each row the OpenURL page is fetched, the anchor with
# class 'test-bookpdf-link' is located, and the PDF it points to is saved as
# "<year>_<title>.pdf". Books whose page or PDF cannot be fetched are reported
# and skipped so that one failure does not abort the whole run.
df = pd.read_excel(r'Free+English+textbooks.xlsx')
df1_grouped = df.groupby('Copyright Year')

# The default headers are the same for every request, so build them once.
headers = requests.utils.default_headers()

# Iterate over each group (one output directory per copyright year).
for group_name, df_group in df1_grouped:
    directory = f"{group_name}".replace("; ", "-").replace(" ", "_").replace("/", "_")
    print(directory)
    target_dir = Path(os.path.join("./books", directory))
    target_dir.mkdir(parents=True, exist_ok=True)

    for _, row in df_group.iterrows():
        url = row['OpenURL']
        year = row['Copyright Year']
        title = row['Book Title']
        # A path separator inside the title would otherwise be interpreted as
        # a (non-existent) sub-directory and make the write fail.
        filename = f"{year}_{title}.pdf".replace("/", "_").replace(os.sep, "_")
        print("\t" + filename)

        try:
            # headers must be passed by keyword: the second positional
            # argument of requests.get is `params`, not `headers`.
            req = requests.get(url, headers=headers, timeout=60)
            req.raise_for_status()

            soup = BeautifulSoup(req.content, 'html.parser')
            link = soup.find('a', attrs={'class': 'test-bookpdf-link'})
            if link is None or not link.get("href"):
                print("\t\tno PDF link found, skipping")
                continue

            # urljoin handles both site-relative and absolute hrefs.
            bookurl = urljoin("https://link.springer.com", link["href"])
            r = requests.get(bookurl, allow_redirects=True, timeout=60)
            r.raise_for_status()
        except requests.RequestException as exc:
            print(f"\t\tdownload failed, skipping: {exc}")
            continue

        # write_bytes opens, writes and closes the file in one step.
        (target_dir / filename).write_bytes(r.content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment