PacktPub e-books downloader
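Fill in username, password and save_path below, then run the script: it logs in to packtpub.com, claims the current Free Learning e-book, and downloads every e-book from the account's order history (PDF, EPUB, MOBI and the code archive by default).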
#!/usr/bin/python3
# https://gist.github.com/kspeeckaert/9d8bec27692432772913
# Dependencies: requests, beautifulsoup4 (bs4), lxml
from pathlib import Path
import logging
import re

import requests
from bs4 import BeautifulSoup

username = ""
password = ""
save_path = ""
base_href = "https://www.packtpub.com"

log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)  # progress messages below are logged at DEBUG level
log.addHandler(logging.StreamHandler())
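
# Scrapes packtpub.com with a requests session: logs in through the site's
# login form, claims the daily free e-book and downloads the account's
# e-books in the wanted formats.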
class PacktBooks:
    _URL = {'login': base_href,
            'freebook': base_href + '/packt/offers/free-learning',
            'mybooks': base_href + '/account/my-order-history'}
    _AVAIL_BOOK_TYPES = {'pdf', 'epub', 'mobi', 'code'}

    def __init__(self, username, password):
        self._session = None
        self._book_types = self._AVAIL_BOOK_TYPES
        self._username = username
        self._password = password
        self._logged_in = False
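
    # Build a requests session with browser-like headers; the Firefox
    # User-Agent makes the scraper look like a regular browser.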
    def _create_session(self):
        self._session = requests.Session()
        self._session.headers.update({'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                                      'Accept-Encoding': 'gzip, deflate',
                                      'Connection': 'keep-alive',
                                      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'})
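
    # Wanted formats, exposed as a comma-separated string. The setter accepts
    # a string or an iterable and silently drops anything not in
    # _AVAIL_BOOK_TYPES.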
    @property
    def book_types(self):
        return ','.join(self._book_types)

    @book_types.setter
    def book_types(self, types):
        if isinstance(types, str):
            self._book_types = {types} & self._AVAIL_BOOK_TYPES
        else:
            self._book_types = set(types) & self._AVAIL_BOOK_TYPES
        log.debug('Set wanted book types to {}'.format(self.book_types))
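
    # Log in by fetching the login page, copying the hidden form fields
    # (form_id, form_build_id, ...) into the payload and posting it back
    # with the credentials.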
    def login(self):
        if self._session is None:
            self._create_session()
        payload = {'email': self._username,
                   'password': self._password,
                   'op': 'Login'}
        # Retrieve values for form_id and form_build_id
        soup = BeautifulSoup(self._session.get(self._URL['login']).content,
                             'lxml')
        # Add hidden form fields
        form = soup.find('form', {'id': 'packt-user-login-form'})
        for elem in form.find_all('input', type='hidden'):
            payload[elem['name']] = elem['value']
        # Login
        r = self._session.post(url=self._URL['login'],
                               data=payload)
        r.raise_for_status()
        self._logged_in = True

    def logoff(self):
        if self._session:
            self._session.close()
        self._logged_in = False
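
    # Generator over the order-history page: yields one dict per owned
    # e-book with its 'title' and product page 'url'.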
    def _get_books(self):
        r = self._session.get(self._URL['mybooks'])
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'lxml')
        book_list = soup.find('div', id='orders-list')
        for i, book in enumerate(book_list.find_all('div', class_='product-line'), start=1):
            order_products = book.find_all('div', class_='order-product-item-inner')
            if not order_products:
                log.debug('No products in order {}, skipping.'.format(i))
                continue
            for j, order_product in enumerate(order_products, start=1):
                product_links = order_product.find_all('a')
                if not product_links:
                    log.debug('No link in product {} from order {}, skipping.'.format(j, i))
                    continue
                for product_link in product_links:
                    if not product_link.contents:
                        log.debug('Skipping "{}", no link text'.format(product_link))
                        continue
                    if not re.match(r'.*\[eBook\]', product_link.contents[0]):
                        log.debug('Skipping "{}", not an eBook'.format(product_link.contents[0]))
                        continue
                    ebook = {}
                    # Remove the '1 x ' prefix and ' [eBook]' postfix
                    ebook['title'] = product_link.contents[0][4:-8]
                    ebook['url'] = base_href + product_link['href']
                    log.debug('Found "{}"'.format(ebook['title']))
                    yield ebook
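
    # Scrape the book's product page and store a {filetype: relative href}
    # dict under book['links'], covering the wanted e-book formats plus the
    # code archive ('zip') when requested.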
    def _set_book_links(self, book):
        # Prepare the regular expressions for book and code download links
        re_book_types = re.compile('^/ebook_download/.*/(?:' + '|'.join(self._book_types) + ')')
        re_book_code = re.compile('^/code_download/[0-9]+')
        r = self._session.get(book['url'])
        r.raise_for_status()
        book_page = BeautifulSoup(r.content, 'lxml')
        links = {}
        for link in book_page.find_all('a', href=re_book_types):
            filetype = link['href'].rsplit('/', maxsplit=1)[1]
            links[filetype] = link['href']
        if 'code' in self._book_types:
            for link in book_page.find_all('a', href=re_book_code):
                links['zip'] = link['href']
        log.debug('{} link(s) found'.format(len(links)))
        book['links'] = links
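
    # Download every owned book into dl_folder, one sub-folder per book by
    # default. A per-book sentinel file marks completed downloads so reruns
    # can skip them when overwrite is False.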
    def download_books(self, dl_folder,
                       organize_in_folders=True,
                       overwrite=True):
        if not self._logged_in:
            self.login()
        dl_folder = Path(dl_folder)
        for book in self._get_books():
            try:
                book['links'] = None
                safe_title = self._safe_filename(book['title'])
                if organize_in_folders:
                    book_path = Path(dl_folder, safe_title)
                    if not book_path.exists():
                        book_path.mkdir()
                else:
                    book_path = dl_folder
                # Sentinel file to avoid downloading every book page.
                # Once all the files for a book are downloaded, the
                # sentinel file for this book is added. If the sentinel
                # exists when treating a book, this book is skipped.
                book_sentinel = Path(book_path, safe_title + "_ok")
                if book_sentinel.exists():
                    if not overwrite:
                        log.debug('Book already downloaded, skipping')
                        continue
                    book_sentinel.unlink()
                self._set_book_links(book)
                ok = True
                for filetype, link in book['links'].items():
                    try:
                        filename = '{0}.{1}'.format(safe_title, filetype)
                        file_path = Path(book_path, filename)
                        if not overwrite and file_path.exists():
                            log.debug('File already exists, skipping')
                            continue
                        link_url = '{}{}'.format(base_href, link)
                        r = self._session.get(link_url)
                        r.raise_for_status()
                        with open(str(file_path), 'wb') as f:
                            f.write(r.content)
                    except Exception:
                        ok = False
                        log.exception('Unable to download {} for {}'.format(filetype,
                                                                            book['title']))
                if ok:
                    log.debug('Downloaded {}'.format(book['title']))
                    # Write sentinel file
                    open(str(book_sentinel), "w").close()
            except Exception:
                log.exception('Unable to save {}'.format(book['title']))
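
    # Claim the current Free Learning offer by following its
    # /freelearning-claim/ link.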
    def claim_book(self):
        if not self._logged_in:
            self.login()
        try:
            soup = BeautifulSoup(self._session.get(self._URL['freebook']).content, 'lxml')
            claim_link = soup.find('a', href=re.compile('^/freelearning-claim/.*'))
            claim_url = '{}{}'.format(base_href, claim_link['href'])
            r = self._session.get(claim_url)
            r.raise_for_status()
        except Exception:
            log.exception('Unable to claim book')
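
    # Map every non-alphanumeric character to '_' so titles are safe to use
    # as file and folder names.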
    @staticmethod
    def _safe_filename(filename):
        return "".join(x if x.isalnum() else "_" for x in filename).strip('_')


if __name__ == "__main__":
    pb = PacktBooks(username, password)
    pb.claim_book()
    pb.download_books(save_path, overwrite=False)