Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
PacktPub e-books downloader
from pathlib import Path
import logging
import re
import requests
from bs4 import BeautifulSoup
# Module-level logger. Only a NullHandler is attached here, so this module
# does not configure any log output itself; records are emitted only through
# whatever handlers the importing application sets up.
log = logging.getLogger(__name__)
log.addHandler(logging.NullHandler())
class PacktBooks:
    """Scraper-based client for a PacktPub account.

    Logs in through the site's HTML login form, lists the e-books on the
    "my e-books" page, downloads them in the wanted formats and can claim
    the daily free-learning book.

    NOTE(review): everything here depends on the HTML structure of the
    PacktPub pages (form id, div ids/classes, link prefixes). If the site
    layout changes, the lookups below will stop finding their elements.
    """

    _BASE_URL = 'https://www.packtpub.com'
    _URL = {'login': 'https://www.packtpub.com/',
            'freebook': 'https://www.packtpub.com/packt/offers/free-learning',
            'mybooks': 'https://www.packtpub.com/account/my-ebooks'}
    # 'code' is not an e-book format; it selects the source-code archive.
    _AVAIL_BOOK_TYPES = {'pdf', 'epub', 'mobi', 'code'}
    # Trailing "[eBook]" / "(ebook)" marker appended to product titles.
    _EBOOK_SUFFIX_RE = re.compile(r'\s*[\[(]ebook[\])]\s*$', re.IGNORECASE)

    def __init__(self, username, password):
        """Store the credentials; no network access happens here.

        Args:
            username: Account e-mail address sent as the ``email`` form field.
            password: Account password.
        """
        self._session = None
        # Copy so that the instance never aliases the class-level set.
        self._book_types = set(self._AVAIL_BOOK_TYPES)
        self._username = username
        self._password = password
        self._logged_in = False

    def _create_session(self):
        """Create the HTTP session with browser-like default headers."""
        self._session = requests.Session()
        self._session.headers.update({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        })

    @property
    def book_types(self):
        """Wanted book types as a comma-separated string (sorted)."""
        # Sorted so the result is deterministic; set order is arbitrary.
        return ','.join(sorted(self._book_types))

    @book_types.setter
    def book_types(self, types):
        """Set the wanted types from a single string or an iterable.

        Values that are not in ``_AVAIL_BOOK_TYPES`` are silently dropped.
        """
        # A bare string must be treated as one type, not iterated by char.
        wanted = {types} if isinstance(types, str) else set(types)
        self._book_types = wanted & self._AVAIL_BOOK_TYPES
        log.debug('Set wanted book types to %s', self.book_types)

    def login(self):
        """Submit the login form using the stored credentials.

        Raises:
            requests.HTTPError: If the form page or the login POST returns
                an error status.
            RuntimeError: If the login form cannot be found in the page.

        NOTE(review): only the HTTP status is checked; a successful status
        does not by itself prove that the credentials were accepted.
        """
        if self._session is None:
            self._create_session()

        payload = {'email': self._username,
                   'password': self._password,
                   'op': 'Login'}

        # Fetch the form through the session (not bare ``requests.get``) so
        # the hidden form tokens are issued for the same cookies and headers
        # that will be used for the POST.
        page = self._session.get(self._URL['login'])
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'lxml')

        form = soup.find('form', {'id': 'packt-user-login-form'})
        if form is None:
            raise RuntimeError('Login form "packt-user-login-form" not found '
                               'at {}'.format(self._URL['login']))

        # Copy the hidden fields (form_id, form_build_id, ...) to the payload.
        for elem in form.find_all('input', type='hidden'):
            name = elem.get('name')
            if name:
                payload[name] = elem.get('value', '')

        r = self._session.post(url=self._URL['login'], data=payload)
        r.raise_for_status()
        self._logged_in = True

    def logoff(self):
        """Close the HTTP session and mark the client as logged out."""
        if self._session:
            self._session.close()
            # Drop the closed session so the next login starts a fresh one.
            self._session = None
        self._logged_in = False

    @classmethod
    def _strip_ebook_suffix(cls, title):
        """Return *title* without a trailing "[eBook]" / "(ebook)" marker.

        Titles that carry no such marker are returned unchanged instead of
        losing their last characters.
        """
        return cls._EBOOK_SUFFIX_RE.sub('', title)

    def _get_books(self):
        """Yield one dict per book found on the "my e-books" page.

        Each dict has ``'title'`` (str) and ``'links'`` (dict mapping a file
        extension to the site-relative download link). Requires an existing
        session.

        Raises:
            requests.HTTPError: If the listing page returns an error status.
        """
        # 'code' is handled through its own link prefix below, so only real
        # e-book formats take part in the e-book link pattern.
        ebook_formats = sorted(self._book_types - {'code'})
        re_book_types = None
        if ebook_formats:
            # Guarding against an empty list matters: an empty alternation
            # "(?:)" would match every e-book link.
            re_book_types = re.compile(
                '^/ebook_download/.*/(?:' + '|'.join(ebook_formats) + ')')
        re_book_code = re.compile('^/code_download/[0-9]+')

        r = self._session.get(self._URL['mybooks'])
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'lxml')

        book_list = soup.find('div', id='product-account-list')
        if book_list is None:
            log.debug('No book list found on the account page')
            return

        for book in book_list.find_all('div', class_='product-line'):
            if not book.has_attr('title'):
                log.debug('Skipping fake book entry')
                continue

            title = self._strip_ebook_suffix(book['title'])
            log.debug('Found "%s"', title)

            links = {}
            if re_book_types is not None:
                for link in book.find_all('a', href=re_book_types):
                    # The last path segment of the link is the file type.
                    filetype = link['href'].rsplit('/', maxsplit=1)[-1]
                    links[filetype] = link['href']
            if 'code' in self._book_types:
                for link in book.find_all('a', href=re_book_code):
                    links['zip'] = link['href']
            log.debug('%d link(s) found', len(links))

            yield {'title': title, 'links': links}

    def download_books(self, dl_folder,
                       organize_in_folders=True,
                       overwrite=True):
        """Download every wanted file of every book of the account.

        Logs in first if needed. Failures for a single book or file are
        logged and do not stop the remaining downloads.

        Args:
            dl_folder: Target directory (str or Path); created if missing.
            organize_in_folders: If true, each book gets its own
                sub-directory named after its sanitized title.
            overwrite: If false, files that already exist are skipped.
        """
        if not self._logged_in:
            self.login()
        dl_folder = Path(dl_folder)

        for book in self._get_books():
            try:
                safe_title = self._safe_filename(book['title'])
                if organize_in_folders:
                    book_path = Path(dl_folder, safe_title)
                else:
                    book_path = dl_folder
                # exist_ok: a plain mkdir() fails on every re-run, which
                # would skip the whole book and defeat ``overwrite=False``.
                book_path.mkdir(parents=True, exist_ok=True)

                for filetype, link in book['links'].items():
                    try:
                        filename = '{0}.{1}'.format(safe_title, filetype)
                        file_path = Path(book_path, filename)
                        if not overwrite and file_path.exists():
                            log.debug('File already exists, skipping')
                            continue
                        link_url = '{}{}'.format(self._BASE_URL, link)
                        r = self._session.get(link_url)
                        r.raise_for_status()
                        with open(str(file_path), 'wb') as f:
                            f.write(r.content)
                    except Exception:
                        # Best effort: keep going with the other files.
                        log.exception('Unable to download {} for {}'.format(
                            filetype, book['title']))
                log.debug('Downloaded %s', book['title'])
            except Exception:
                # Best effort: keep going with the other books.
                log.exception('Unable to save {}'.format(book['title']))

    def claim_book(self):
        """Claim the current free-learning book for the account.

        Logs in first if needed. Never raises: any failure is logged with
        its traceback.
        """
        try:
            if not self._logged_in:
                self.login()
            page = self._session.get(self._URL['freebook'])
            page.raise_for_status()
            soup = BeautifulSoup(page.content, 'lxml')
            claim_link = soup.find('a',
                                   href=re.compile('^/freelearning-claim/.*'))
            if claim_link is None:
                raise LookupError('Free-learning claim link not found')
            claim_url = '{}{}'.format(self._BASE_URL, claim_link['href'])
            r = self._session.get(claim_url)
            r.raise_for_status()
        except Exception:
            log.exception('Unable to claim book')

    @staticmethod
    def _safe_filename(filename):
        """Replace every non-alphanumeric character with ``_``.

        Leading and trailing underscores are stripped from the result.
        """
        return "".join(x if x.isalnum() else "_" for x in filename).strip('_')
@nneul

This comment has been minimized.

Copy link

@nneul nneul commented Dec 19, 2018

This no longer works due to changes in the site. I put together a quick hack that you can look at to see a new method that uses their new REST endpoints, which is much simpler than site parsing: https://gist.github.com/nneul/6eda98fd87a58a623b857523247f3471

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment