Skip to content

Instantly share code, notes, and snippets.

@MatthieuSarter
Forked from kspeeckaert/packt_books.py
Last active March 7, 2017 18:50
Show Gist options
  • Save MatthieuSarter/78078b53495707835ba8d01eb5ef257b to your computer and use it in GitHub Desktop.
Save MatthieuSarter/78078b53495707835ba8d01eb5ef257b to your computer and use it in GitHub Desktop.
PacktPub e-books downloader
#!/usr/bin/python3
# https://gist.github.com/kspeeckaert/9d8bec27692432772913
# Requires: requests, bs4, lxml (pathlib is part of the standard library since Python 3.4)
from pathlib import Path
import logging
import re
import requests
from bs4 import BeautifulSoup
# Account credentials passed to PacktBooks by the script entry point.
# Both are intentionally blank here; fill them in before running.
username = password = ""

# Destination directory handed to download_books(); an empty string is
# interpreted by pathlib as the current working directory.
save_path = ""

# Root of the site; every relative link scraped from a page is joined to it.
base_href = "https://www.packtpub.com"

# Module logger writing through a stream handler. No level is configured
# here, so the effective level is inherited from the root logger.
log = logging.getLogger(__name__)
_stream_handler = logging.StreamHandler()
log.addHandler(_stream_handler)
class PacktBooks:
    """Scraping client for a PacktPub account.

    Logs in through the site's HTML login form, can claim the daily free
    book, and downloads the e-books (and optionally the code bundles)
    listed in the account's order history.
    """

    _URL = {'login': base_href,
            'freebook': base_href + '/packt/offers/free-learning',
            'mybooks': base_href + '/account/my-order-history'}
    _AVAIL_BOOK_TYPES = {'pdf', 'epub', 'mobi', 'code'}
    # Order-history labels look like "1 x Some Title [eBook]": an optional
    # quantity prefix, the title, and a literal "[eBook]" suffix.
    _RE_EBOOK_TITLE = re.compile(
        r'^\s*(?:\d+\s*x\s+)?(?P<title>.+?)\s*\[eBook\]\s*$')
    # Number of bytes written to disk per streamed chunk.
    _CHUNK_SIZE = 64 * 1024

    def __init__(self, username, password):
        """Store the credentials; no network traffic happens here.

        Args:
            username: Account e-mail address sent in the login form.
            password: Account password sent in the login form.
        """
        self._session = None
        # Copy so the instance never aliases the class-level set.
        self._book_types = set(self._AVAIL_BOOK_TYPES)
        self._username = username
        self._password = password
        self._logged_in = False

    def _create_session(self):
        """Create the requests session with browser-like default headers."""
        self._session = requests.Session()
        self._session.headers.update({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
        })

    @property
    def book_types(self):
        """Wanted book types as a comma-separated string (sorted for stable output)."""
        return ','.join(sorted(self._book_types))

    @book_types.setter
    def book_types(self, types):
        """Set the wanted types from one name or an iterable of names.

        Names that are not in ``_AVAIL_BOOK_TYPES`` are silently dropped.
        """
        wanted = {types} if isinstance(types, str) else set(types)
        self._book_types = wanted & self._AVAIL_BOOK_TYPES
        log.debug('Set wanted book types to {}'.format(self.book_types))

    def login(self):
        """Submit the login form and mark this client as logged in.

        The login page is fetched first so that the form's hidden fields
        can be echoed back together with the credentials. Only the HTTP
        status of the POST is checked; the response body is not inspected.

        Raises:
            requests.HTTPError: If the login page or the POST returns an
                error status.
            RuntimeError: If the login form is not present on the page.
        """
        if self._session is None:
            self._create_session()
        page = self._session.get(self._URL['login'])
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'lxml')
        form = soup.find('form', {'id': 'packt-user-login-form'})
        if form is None:
            raise RuntimeError(
                'Login form not found on {}'.format(self._URL['login']))
        payload = {'email': self._username,
                   'password': self._password,
                   'op': 'Login'}
        # Add hidden form fields (form_id, form_build_id, ...).
        for elem in form.find_all('input', type='hidden'):
            name = elem.get('name')
            if name:
                payload[name] = elem.get('value', '')
        r = self._session.post(url=self._URL['login'], data=payload)
        r.raise_for_status()
        self._logged_in = True

    def logoff(self):
        """Close the HTTP session, if any, and mark the client as logged out."""
        if self._session is not None:
            self._session.close()
            # Drop the closed session so a later login() starts a fresh one.
            self._session = None
        self._logged_in = False

    def _get_books(self):
        """Yield one ``{'title': ..., 'url': ...}`` dict per e-book in the order history.

        Links whose label does not end in ``[eBook]`` or that have no
        ``href`` are skipped. Each yielded dict is a new object.

        Raises:
            requests.HTTPError: If the order history page returns an error status.
        """
        r = self._session.get(self._URL['mybooks'])
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'lxml')
        book_list = soup.find('div', id='orders-list')
        if book_list is None:
            log.warning('No order list found on {}'.format(self._URL['mybooks']))
            return
        orders = book_list.find_all('div', class_='product-line')
        for i, order in enumerate(orders, start=1):
            order_products = order.find_all('div', class_='order-product-item-inner')
            if not order_products:
                log.debug('No products in order {}, skipping.'.format(i))
                continue
            for j, order_product in enumerate(order_products, start=1):
                product_links = order_product.find_all('a')
                if not product_links:
                    log.debug('No link in product {} from order {}, skipping.'.format(j, i))
                    continue
                for product_link in product_links:
                    if not product_link.contents:
                        log.debug('Skipping "{}", not an eBook'.format(product_link))
                        continue
                    # str() detaches the label from the parse tree; a nested
                    # tag becomes markup, which the pattern will not match.
                    label = str(product_link.contents[0])
                    match = self._RE_EBOOK_TITLE.match(label)
                    href = product_link.get('href')
                    if match is None or not href:
                        log.debug('Skipping "{}", not an eBook'.format(label))
                        continue
                    ebook = {'title': match.group('title'),
                             'url': base_href + href}
                    log.debug('Found "{}"'.format(ebook['title']))
                    yield ebook

    def _set_book_links(self, book):
        """Fetch the book's page and store its download links in ``book['links']``.

        The result maps a file extension (the last path segment of an
        e-book link, or ``'zip'`` for the code bundle) to a site-relative
        URL. Only the currently wanted book types are collected.

        Raises:
            requests.HTTPError: If the book page returns an error status.
        """
        r = self._session.get(book['url'])
        r.raise_for_status()
        book_page = BeautifulSoup(r.content, 'lxml')
        links = {}
        # 'code' is served from /code_download/, so it is not an e-book format.
        ebook_types = sorted(self._book_types - {'code'})
        # An empty alternation would match every format, so skip it entirely.
        if ebook_types:
            re_book_types = re.compile(
                '^/ebook_download/.*/(?:'
                + '|'.join(re.escape(t) for t in ebook_types) + ')')
            for link in book_page.find_all('a', href=re_book_types):
                filetype = link['href'].rsplit('/', maxsplit=1)[1]
                links[filetype] = link['href']
        if 'code' in self._book_types:
            re_book_code = re.compile('^/code_download/[0-9]+')
            for link in book_page.find_all('a', href=re_book_code):
                links['zip'] = link['href']
        log.debug('{} link(s) found'.format(len(links)))
        book['links'] = links

    def _download_file(self, url, file_path):
        """Stream ``url`` into ``file_path`` without leaving a partial file behind."""
        tmp_path = file_path.with_name(file_path.name + '.part')
        r = self._session.get(url, stream=True)
        try:
            r.raise_for_status()
            with open(str(tmp_path), 'wb') as f:
                for chunk in r.iter_content(chunk_size=self._CHUNK_SIZE):
                    if chunk:
                        f.write(chunk)
            # Only a complete download ever appears under the final name.
            tmp_path.replace(file_path)
        finally:
            r.close()
            if tmp_path.exists():
                tmp_path.unlink()

    def _download_book(self, book, dl_folder, organize_in_folders, overwrite):
        """Download every wanted file of one book and write its sentinel file."""
        book['links'] = None
        safe_title = self._safe_filename(book['title'])
        book_path = Path(dl_folder, safe_title) if organize_in_folders else dl_folder
        if not book_path.exists():
            book_path.mkdir(parents=True)
        # Sentinel file to avoid downloading every book page.
        # Once all the files for a book are downloaded, the sentinel file
        # for this book is added. If the sentinel exists when treating a
        # book, this book is skipped.
        book_sentinel = Path(book_path, safe_title + "_ok")
        if book_sentinel.exists():
            if not overwrite:
                log.debug('Book already downloaded, skipping')
                return
            book_sentinel.unlink()
        self._set_book_links(book)
        ok = True
        for filetype, link in book['links'].items():
            file_path = Path(book_path, '{0}.{1}'.format(safe_title, filetype))
            if not overwrite and file_path.exists():
                log.debug('File already exists, skipping')
                continue
            try:
                self._download_file('{}{}'.format(base_href, link), file_path)
            except Exception:
                # Best effort: keep going with the book's other files.
                ok = False
                log.exception('Unable to download {} for {}'.format(filetype,
                                                                    book['title']))
        # No links at all usually means the page did not load as expected;
        # writing the sentinel then would hide the book from later runs.
        if ok and book['links']:
            log.debug('Downloaded {}'.format(book['title']))
            book_sentinel.touch()

    def download_books(self, dl_folder,
                       organize_in_folders=True,
                       overwrite=True):
        """Download all e-books from the order history into ``dl_folder``.

        Logs in first if needed. A failure on one book is logged and does
        not stop the remaining books.

        Args:
            dl_folder: Destination directory; created if it does not exist.
            organize_in_folders: If true, each book gets its own
                sub-directory named after its sanitized title.
            overwrite: If false, books with a sentinel file and files that
                already exist are skipped; if true, they are downloaded again.
        """
        if not self._logged_in:
            self.login()
        dl_folder = Path(dl_folder)
        for book in self._get_books():
            try:
                self._download_book(book, dl_folder, organize_in_folders, overwrite)
            except Exception:
                log.exception('Unable to save {}'.format(book['title']))

    def claim_book(self):
        """Claim the current free-learning book; failures are logged, not raised.

        Logs in first if needed (errors raised by ``login`` do propagate).
        """
        if not self._logged_in:
            self.login()
        try:
            page = self._session.get(self._URL['freebook'])
            page.raise_for_status()
            soup = BeautifulSoup(page.content, 'lxml')
            claim_link = soup.find('a', href=re.compile('^/freelearning-claim/.*'))
            if claim_link is None:
                log.error('Unable to claim book: no claim link on {}'.format(
                    self._URL['freebook']))
                return
            claim_url = '{}{}'.format(base_href, claim_link['href'])
            r = self._session.get(claim_url)
            r.raise_for_status()
        except Exception:
            log.exception('Unable to claim book')

    @staticmethod
    def _safe_filename(filename):
        """Replace every non-alphanumeric character with ``_`` and trim ``_`` from both ends."""
        return "".join(x if x.isalnum() else "_" for x in filename).strip('_')
if __name__ == "__main__":
    # Script entry point: claim today's free book, then fetch everything in
    # the order history that has not been downloaded yet.
    pb = PacktBooks(username, password)
    try:
        pb.claim_book()
        pb.download_books(save_path, overwrite=False)
    finally:
        # Always release the HTTP session, even if login or a download raised.
        pb.logoff()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment