Skip to content

Instantly share code, notes, and snippets.

@aakash30jan
Last active May 4, 2020 15:36
Show Gist options
  • Save aakash30jan/0c999e448d933f547ec352f1bebf3bf9 to your computer and use it in GitHub Desktop.
Save aakash30jan/0c999e448d933f547ec352f1bebf3bf9 to your computer and use it in GitHub Desktop.
Build your book library
# Tweaked version of the original chris-hamberg/springer_books script.
import itertools
import os
import sys
from subprocess import call

# Third-party packages. When one of them cannot be imported, install the
# project's requirements file once and retry the imports.
try:
    import lxml.html
    import pandas as pd
    import requests
except ImportError:
    print("Need to get some stuff . . .")
    # Run pip through the interpreter executing this script so the packages
    # land in the environment that will import them; an argument list avoids
    # going through a shell.
    call([
        sys.executable, "-m", "pip", "install", "-r",
        "https://raw.githubusercontent.com/aakash30jan/springer_books/master/requirements.txt",
    ])
    import lxml.html
    import pandas as pd
    import requests
# Number of leading spreadsheet rows to skip before downloading starts.
START_IDX = 0

# Spreadsheet listing the books; it is read from the current directory.
xlsx = 'Free+English+textbooks.xlsx'
# The context manager closes the workbook once its first sheet is loaded.
with pd.ExcelFile(xlsx) as xfile:
    df = xfile.parse()

# Every download is stored below ./Books, so it has to be a directory.
if not os.path.exists('Books'):
    os.mkdir('Books')
elif not os.path.isdir('Books'):
    print('Error: a file named "Books" cannot be in the execution directory.')
    # Non-zero status so that callers can tell the run did not succeed.
    sys.exit(1)
class Book:
    """One catalogue entry together with the files downloaded for it.

    Intended call order is ``check()`` -> ``scrape()`` -> ``save()``:
    ``check`` drops the formats that are already on disk, ``scrape``
    downloads the remaining ones into memory and ``save`` writes them to
    ``Books/<subject>/<title>, <edition>.<ext>``.

    Attributes:
        idx: Row identifier, only used in messages.
        title: Book title with ``/`` replaced by ``_``.
        edition: Edition text as given.
        name: ``"<title>, <edition>"``, the file name without extension.
        subject: First ``;``-separated entry of the subject classification.
        url: Address of the book's landing page.
        path / epat: Target file names of the PDF / EPUB.
        pdf / epub: Downloaded bytes, or None when nothing was downloaded.
        want_pdf / want_epub: Whether that format still has to be fetched
            and written.
    """

    # Host that download links found on a book page are resolved against.
    BASE_URL = 'https://link.springer.com/'
    # Seconds to wait on the server before a request is abandoned.
    TIMEOUT = 60

    # Download link when the page exposes a single format.
    _SINGLE_LINK = '//*[@id="main-content"]/article[1]/div/div/div[2]/div/div/a'
    # Download links when the page exposes the two formats side by side.
    _PDF_LINK = '//*[@id="main-content"]/article[1]/div/div/div[2]/div[1]/a'
    _EPUB_LINK = '//*[@id="main-content"]/article[1]/div/div/div[2]/div[2]/a'

    def __init__(self, idx, title, edition, subject, url):
        """Store the row data and create the subject folder for the book."""
        self.idx = idx
        self.title = self.fix(title)
        self.edition = edition
        self.name = f'{self.title}, {self.edition}'
        self.want_pdf = True
        self.want_epub = True
        # process() needs self.name, so it has to run after the line above.
        self.subject = self.process(subject)
        self.url = url
        self.pdf = None
        self.epub = None

    def __repr__(self):
        return f'{self.idx}: {self.title}, {self.edition} [{self.subject}]'

    def fix(self, title):
        """Return *title* with ``/`` replaced so it fits in one path component."""
        return title.replace('/', '_')

    def process(self, subject):
        """Create the subject folder and derive the target file names.

        Args:
            subject: Subject classification; entries are separated by ``;``
                and only the first one is used.

        Returns:
            The first subject entry, unmodified.
        """
        subject = subject.split(';')[0]
        # A '/' inside the subject would otherwise be read as a nested path.
        folder = os.path.join('Books', self.fix(subject))
        os.makedirs(folder, exist_ok=True)
        self.path = os.path.join(folder, self.name + '.pdf')
        self.epat = os.path.join(folder, self.name + '.epub')
        return subject

    def check(self):
        """Mark the formats that already exist on disk as not wanted."""
        if os.path.exists(self.path):
            print(f'Info: {self.path} already saved.')
            self.want_pdf = False
        if os.path.exists(self.epat):
            print(f'Info: {self.epat} already saved.')
            self.want_epub = False

    def scrape(self):
        """Download the wanted formats into ``self.pdf`` / ``self.epub``.

        Returns:
            False when the book page could not be read or offers no download
            link; saving is then disabled for this book. None otherwise,
            including when nothing had to be downloaded.
        """
        if not (self.want_pdf or self.want_epub):
            # Everything is on disk already; skip the network entirely.
            return None

        page = self._fetch(self.url)
        pdf_href, epub_href = (
            self._find_links(lxml.html.fromstring(page))
            if page else (None, None)
        )
        if pdf_href is None and epub_href is None:
            print(
                f'Error: {self.idx} {self.name} server access point missing'
            )
            self.want_pdf = self.want_epub = False
            return False

        if self.want_pdf and pdf_href is not None:
            self.pdf = self._fetch(self._absolute_url(pdf_href))
            # A failed download was reported by _fetch; keep save() from also
            # claiming that the format is not offered.
            self.want_pdf = self.pdf is not None
        if self.want_epub and epub_href is not None:
            self.epub = self._fetch(self._absolute_url(epub_href))
            self.want_epub = self.epub is not None
        return None

    def save(self):
        """Write every format that is still wanted to disk."""
        if self.want_pdf:
            self.save_pdf()
        if self.want_epub:
            self.save_epub()

    def save_pdf(self):
        """Write the downloaded PDF unless the target file already exists."""
        if not self.pdf:
            print('Info: Springer does not furnish this as pdf.')
        elif not os.path.exists(self.path):
            with open(self.path, 'wb') as fhand:
                fhand.write(self.pdf)
            print(f'Saved: {self.path}')

    def save_epub(self):
        """Write the downloaded EPUB unless the target file already exists."""
        if not self.epub:
            print('Info: Springer does not furnish this as epub.')
        elif not os.path.exists(self.epat):
            with open(self.epat, 'wb') as fhand:
                fhand.write(self.epub)
            print(f'Saved: {self.epat}')

    def _fetch(self, url):
        """Return the body served at *url*, or None when the request fails.

        Connection errors, timeouts and HTTP error statuses are reported and
        turned into None so that one bad book does not stop the whole run.
        """
        try:
            response = requests.get(url, timeout=self.TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as err:
            print(f'Error: {self.idx} {self.name} request failed: {err}')
            return None
        return response.content

    def _find_links(self, html):
        """Return ``(pdf_href, epub_href)`` found in *html*; each may be None."""
        single = self._first_href(html, self._SINGLE_LINK)
        if single is not None:
            # Only one format is offered; its href tells which one it is.
            return (single, None) if 'pdf' in single else (None, single)
        # The two formats are looked up independently so that a missing one
        # does not hide the other.
        return (
            self._first_href(html, self._PDF_LINK),
            self._first_href(html, self._EPUB_LINK),
        )

    @staticmethod
    def _first_href(html, xpath):
        """Return the ``href`` of the first node matching *xpath*, or None."""
        matches = html.xpath(xpath)
        return matches[0].get('href') if matches else None

    @classmethod
    def _absolute_url(cls, href):
        """Resolve *href* against ``BASE_URL`` without doubling the slash."""
        from urllib.parse import urljoin
        return urljoin(cls.BASE_URL, href)
# Walk the spreadsheet from START_IDX onwards and download each book.
# The fields are read from the row that iterrows() yields, so the loop does
# not depend on the index labels matching row positions.
for idx, row in itertools.islice(df.iterrows(), START_IDX, None):
    book = Book(
        idx,
        row['Book Title'],
        row['Edition'],
        row['Subject Classification'],
        row['OpenURL'],
    )
    print('\n', book)
    book.check()
    book.scrape()
    book.save()
# NOTE(review): the source lost its indentation; this final blank line is
# assumed to be printed once after the loop - confirm.
print('\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment