MatthieuSarter/packt_books.py

## packt_books.py
#!/usr/bin/python3
# https://gist.github.com/kspeeckaert/9d8bec27692432772913
# Requires: requests, bs4 (BeautifulSoup) and lxml; pathlib is in the standard library since Python 3.4
from pathlib import Path
import logging
import re
import requests
from bs4 import BeautifulSoup

# Packt account credentials and download directory. They are passed to
# PacktBooks / download_books when this file is run as a script.
username = ""
password = ""
save_path = ""

# Site root, used to build the page and download URLs.
base_href = "https://www.packtpub.com"

# Module logger with a stream handler attached at import time. No level is
# set in this file, so which messages are emitted depends on the logging
# configuration in effect.
log = logging.getLogger(__name__)
log.addHandler(logging.StreamHandler())

class PacktBooks:
    """Claim the Packt free eBook and download the books of an account.

    A ``requests`` session is created on first login and reused for every
    later request; pages are parsed with BeautifulSoup and the ``lxml``
    parser.

    Args:
        username: Sent as the ``email`` field of the login form.
        password: Sent as the ``password`` field of the login form.
    """

    _URL = {
        'login': base_href,
        'freebook': base_href + '/packt/offers/free-learning',
        'mybooks': base_href + '/account/my-order-history',
    }

    # Every download kind this class knows about. 'code' selects the
    # code-bundle link, which is stored under the 'zip' extension.
    _AVAIL_BOOK_TYPES = {'pdf', 'epub', 'mobi', 'code'}

    # Order-history link text such as '1 x Some Title [eBook]': an optional
    # quantity prefix, the title, then the literal '[eBook]' marker.
    _RE_EBOOK_LABEL = re.compile(
        r'^\s*(?:\d+\s*x\s+)?(?P<title>.*?)\s*\[eBook\]\s*$')

    def __init__(self, username, password):
        self._session = None
        # Copy so an instance never shares the class-level set.
        self._book_types = set(self._AVAIL_BOOK_TYPES)
        self._username = username
        self._password = password
        self._logged_in = False

    def _create_session(self):
        """Create the HTTP session and give it browser-like request headers."""
        self._session = requests.Session()
        self._session.headers.update({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
        })

    @property
    def book_types(self):
        """Wanted book types as a comma-separated string (sorted, so stable)."""
        return ','.join(sorted(self._book_types))

    @book_types.setter
    def book_types(self, types):
        """Select the book types to download; unknown types are dropped.

        Args:
            types: A single type name or an iterable of type names.
        """
        wanted = {types} if isinstance(types, str) else set(types)
        self._book_types = wanted & self._AVAIL_BOOK_TYPES
        log.debug('Set wanted book types to {}'.format(self.book_types))

    def login(self):
        """Submit the login form found on the site's front page.

        The hidden inputs of the form are copied into the POST payload next
        to the credentials.

        Raises:
            requests.HTTPError: If fetching the page or posting the form
                returns an error status.
            RuntimeError: If the page does not contain the login form.
        """
        if self._session is None:
            self._create_session()

        payload = {
            'email': self._username,
            'password': self._password,
            'op': 'Login',
        }

        # Retrieve values for form_id and form_build_id
        r = self._session.get(self._URL['login'])
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'lxml')

        form = soup.find('form', {'id': 'packt-user-login-form'})
        if form is None:
            raise RuntimeError(
                'Login form not found at {}'.format(self._URL['login']))

        # Add hidden form fields
        for elem in form.find_all('input', type='hidden'):
            name = elem.get('name')
            if name:
                payload[name] = elem.get('value', '')

        r = self._session.post(url=self._URL['login'], data=payload)
        r.raise_for_status()
        # NOTE(review): only the HTTP status is checked; rejected credentials
        # answered with a success status are not detected here.
        self._logged_in = True

    def logoff(self):
        """Close the HTTP session, if any, and mark the client logged out."""
        if self._session:
            self._session.close()
            # Drop the closed session so the next login starts from a fresh
            # one instead of reusing its cookies.
            self._session = None
            self._logged_in = False

    def _get_books(self):
        """Yield one ``{'title', 'url'}`` dict per eBook in the order history.

        Links whose text does not carry the '[eBook]' marker are skipped.
        Nothing is yielded when the page has no ``orders-list`` element.

        Raises:
            requests.HTTPError: If the order history request fails.
        """
        r = self._session.get(self._URL['mybooks'])
        r.raise_for_status()

        soup = BeautifulSoup(r.content, 'lxml')
        book_list = soup.find('div', id='orders-list')
        if book_list is None:
            log.debug('No order list found in the order history page')
            return

        orders = book_list.find_all('div', class_='product-line')
        for i, order in enumerate(orders, start=1):
            order_products = order.find_all(
                'div', class_='order-product-item-inner')
            if not order_products:
                log.debug('No products in order {}, skipping.'.format(i))
                continue

            for j, order_product in enumerate(order_products, start=1):
                product_links = order_product.find_all('a')
                if not product_links:
                    log.debug('No link in product {} from order {}, skipping.'.format(j, i))
                    continue

                for product_link in product_links:
                    if not product_link.contents:
                        log.debug('Skipping "{}", not an eBook'.format(product_link))
                        continue

                    label = product_link.contents[0]
                    # The first child can be a nested tag instead of text.
                    match = (self._RE_EBOOK_LABEL.match(label)
                             if isinstance(label, str) else None)
                    href = product_link.get('href')
                    if match is None or not match.group('title') or not href:
                        log.debug('Skipping "{}", not an eBook'.format(label))
                        continue

                    # A fresh dict per book: callers add keys to it.
                    ebook = {'title': match.group('title'),
                             'url': base_href + href}
                    log.debug('Found "{}"'.format(ebook['title']))
                    yield ebook

    def _set_book_links(self, book):
        """Fetch the page at ``book['url']`` and store its download links.

        Sets ``book['links']`` to a dict mapping a file extension to the
        site-relative download path. The code bundle, when wanted, is stored
        under ``'zip'``.

        Raises:
            requests.HTTPError: If the book page request fails.
        """
        r = self._session.get(book['url'])
        r.raise_for_status()

        book_page = BeautifulSoup(r.content, 'lxml')

        links = {}
        # An empty alternation would match every eBook link, so only search
        # when at least one type is wanted.
        if self._book_types:
            wanted = '|'.join(re.escape(t) for t in sorted(self._book_types))
            re_book_types = re.compile('^/ebook_download/.*/(?:' + wanted + ')')
            for link in book_page.find_all('a', href=re_book_types):
                filetype = link['href'].rsplit('/', maxsplit=1)[1]
                links[filetype] = link['href']

        # Searched independently of the eBook links, so a page without a
        # matching eBook link can still yield its code bundle.
        if 'code' in self._book_types:
            re_book_code = re.compile('^/code_download/[0-9]+')
            for code_link in book_page.find_all('a', href=re_book_code):
                links['zip'] = code_link['href']

        log.debug('{} link(s) found'.format(len(links)))
        book['links'] = links

    def download_books(self, dl_folder,
                       organize_in_folders=True,
                       overwrite=True):
        """Download every wanted file of every eBook in the order history.

        Logs in first when needed. A failure on one file or one book is
        logged and does not stop the remaining downloads.

        Args:
            dl_folder: Destination directory; created when missing.
            organize_in_folders: When true, each book gets its own
                sub-directory named after its sanitized title.
            overwrite: When false, books with a sentinel file and files that
                already exist are skipped.
        """
        if not self._logged_in:
            self.login()
        dl_folder = Path(dl_folder)
        if not dl_folder.exists():
            dl_folder.mkdir(parents=True)

        for book in self._get_books():
            try:
                safe_title = self._safe_filename(book['title'])

                if organize_in_folders:
                    book_path = Path(dl_folder, safe_title)
                    if not book_path.exists():
                        book_path.mkdir(parents=True)
                else:
                    book_path = dl_folder

                # Sentinel file to avoid downloading every book page
                # Once all the files for a book are downloaded, the
                # sentinel file for this book is added. If sentinel
                # exists when treating a book, this book is skipped.
                book_sentinel = Path(book_path, safe_title + "_ok")
                if book_sentinel.exists():
                    if not overwrite:
                        log.debug('Book already downloaded, skipping')
                        continue
                    book_sentinel.unlink()

                self._set_book_links(book)
                if not book['links']:
                    # Nothing was fetched, so do not mark the book as done.
                    log.debug('No download link found for {}'.format(book['title']))
                    continue

                ok = True
                for filetype, link in book['links'].items():
                    try:
                        filename = '{0}.{1}'.format(safe_title, filetype)
                        file_path = Path(book_path, filename)
                        if not overwrite and file_path.exists():
                            log.debug('File already exists, skipping')
                            continue

                        r = self._session.get('{}{}'.format(base_href, link))
                        r.raise_for_status()

                        with open(str(file_path), 'wb') as f:
                            f.write(r.content)
                    except Exception:
                        ok = False
                        log.exception('Unable to download {} for {}'.format(
                            filetype, book['title']))
                    else:
                        # Reached only when the file was actually written.
                        log.debug('Downloaded {}'.format(book['title']))

                # Write sentinel file
                if ok:
                    book_sentinel.touch()
            except Exception:
                log.exception('Unable to save {}'.format(book['title']))

    def claim_book(self):
        """Claim the book offered on the free-learning page.

        Logs in first when needed. This is best effort: any failure while
        claiming is logged and not raised.
        """
        if not self._logged_in:
            self.login()
        try:
            r = self._session.get(self._URL['freebook'])
            r.raise_for_status()
            soup = BeautifulSoup(r.content, 'lxml')
            claim_link = soup.find('a', href=re.compile('^/freelearning-claim/.*'))
            if claim_link is None:
                log.error('Unable to claim book: no claim link found')
                return
            r = self._session.get('{}{}'.format(base_href, claim_link['href']))
            r.raise_for_status()
        except Exception:
            log.exception('Unable to claim book')

    @staticmethod
    def _safe_filename(filename):
        """Replace every non-alphanumeric character by '_' and trim the ends."""
        return "".join(x if x.isalnum() else "_" for x in filename).strip('_')

if __name__ == "__main__":
    # Claim the free book, then fetch whatever has not been downloaded yet.
    pb = PacktBooks(username, password)
    try:
        pb.claim_book()
        pb.download_books(save_path, overwrite=False)
    finally:
        # Always release the HTTP session, even when a step above raised.
        pb.logoff()
	#!/usr/bin/python3
	# https://gist.github.com/kspeeckaert/9d8bec27692432772913
	# bs4, lxml, pathlib
	from pathlib import Path
	import logging
	import re
	import requests
	from bs4 import BeautifulSoup

	# Packt account credentials and download directory. They are passed to
	# PacktBooks / download_books when this file is run as a script.
	username = ""
	password = ""
	save_path = ""

	# Site root, used to build the page and download URLs.
	base_href = "https://www.packtpub.com"

	# Module logger with a stream handler attached at import time. No level is
	# set in this file, so which messages are emitted depends on the logging
	# configuration in effect.
	log = logging.getLogger(__name__)
	log.addHandler(logging.StreamHandler())

	class PacktBooks:
		"""Claim the Packt free eBook and download the books of an account.

		A ``requests`` session is created on first login and reused for every
		later request; pages are parsed with BeautifulSoup and the ``lxml``
		parser.

		Args:
			username: Sent as the ``email`` field of the login form.
			password: Sent as the ``password`` field of the login form.
		"""

		_URL = {
			'login': base_href,
			'freebook': base_href + '/packt/offers/free-learning',
			'mybooks': base_href + '/account/my-order-history',
		}

		# Every download kind this class knows about. 'code' selects the
		# code-bundle link, which is stored under the 'zip' extension.
		_AVAIL_BOOK_TYPES = {'pdf', 'epub', 'mobi', 'code'}

		# Order-history link text such as '1 x Some Title [eBook]': an optional
		# quantity prefix, the title, then the literal '[eBook]' marker.
		_RE_EBOOK_LABEL = re.compile(
			r'^\s*(?:\d+\s*x\s+)?(?P<title>.*?)\s*\[eBook\]\s*$')

		def __init__(self, username, password):
			self._session = None
			# Copy so an instance never shares the class-level set.
			self._book_types = set(self._AVAIL_BOOK_TYPES)
			self._username = username
			self._password = password
			self._logged_in = False

		def _create_session(self):
			"""Create the HTTP session and give it browser-like request headers."""
			self._session = requests.Session()
			self._session.headers.update({
				'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
				'Accept-Encoding': 'gzip, deflate',
				'Connection': 'keep-alive',
				'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
			})

		@property
		def book_types(self):
			"""Wanted book types as a comma-separated string (sorted, so stable)."""
			return ','.join(sorted(self._book_types))

		@book_types.setter
		def book_types(self, types):
			"""Select the book types to download; unknown types are dropped.

			Args:
				types: A single type name or an iterable of type names.
			"""
			wanted = {types} if isinstance(types, str) else set(types)
			self._book_types = wanted & self._AVAIL_BOOK_TYPES
			log.debug('Set wanted book types to {}'.format(self.book_types))

		def login(self):
			"""Submit the login form found on the site's front page.

			The hidden inputs of the form are copied into the POST payload next
			to the credentials.

			Raises:
				requests.HTTPError: If fetching the page or posting the form
					returns an error status.
				RuntimeError: If the page does not contain the login form.
			"""
			if self._session is None:
				self._create_session()

			payload = {
				'email': self._username,
				'password': self._password,
				'op': 'Login',
			}

			# Retrieve values for form_id and form_build_id
			r = self._session.get(self._URL['login'])
			r.raise_for_status()
			soup = BeautifulSoup(r.content, 'lxml')

			form = soup.find('form', {'id': 'packt-user-login-form'})
			if form is None:
				raise RuntimeError(
					'Login form not found at {}'.format(self._URL['login']))

			# Add hidden form fields
			for elem in form.find_all('input', type='hidden'):
				name = elem.get('name')
				if name:
					payload[name] = elem.get('value', '')

			r = self._session.post(url=self._URL['login'], data=payload)
			r.raise_for_status()
			# NOTE(review): only the HTTP status is checked; rejected credentials
			# answered with a success status are not detected here.
			self._logged_in = True

		def logoff(self):
			"""Close the HTTP session, if any, and mark the client logged out."""
			if self._session:
				self._session.close()
				# Drop the closed session so the next login starts from a fresh
				# one instead of reusing its cookies.
				self._session = None
				self._logged_in = False

		def _get_books(self):
			"""Yield one ``{'title', 'url'}`` dict per eBook in the order history.

			Links whose text does not carry the '[eBook]' marker are skipped.
			Nothing is yielded when the page has no ``orders-list`` element.

			Raises:
				requests.HTTPError: If the order history request fails.
			"""
			r = self._session.get(self._URL['mybooks'])
			r.raise_for_status()

			soup = BeautifulSoup(r.content, 'lxml')
			book_list = soup.find('div', id='orders-list')
			if book_list is None:
				log.debug('No order list found in the order history page')
				return

			orders = book_list.find_all('div', class_='product-line')
			for i, order in enumerate(orders, start=1):
				order_products = order.find_all(
					'div', class_='order-product-item-inner')
				if not order_products:
					log.debug('No products in order {}, skipping.'.format(i))
					continue

				for j, order_product in enumerate(order_products, start=1):
					product_links = order_product.find_all('a')
					if not product_links:
						log.debug('No link in product {} from order {}, skipping.'.format(j, i))
						continue

					for product_link in product_links:
						if not product_link.contents:
							log.debug('Skipping "{}", not an eBook'.format(product_link))
							continue

						label = product_link.contents[0]
						# The first child can be a nested tag instead of text.
						if isinstance(label, str):
							match = self._RE_EBOOK_LABEL.match(label)
						else:
							match = None
						href = product_link.get('href')
						if match is None or not match.group('title') or not href:
							log.debug('Skipping "{}", not an eBook'.format(label))
							continue

						# A fresh dict per book: callers add keys to it.
						ebook = {
							'title': match.group('title'),
							'url': base_href + href,
						}
						log.debug('Found "{}"'.format(ebook['title']))
						yield ebook

		def _set_book_links(self, book):
			"""Fetch the page at ``book['url']`` and store its download links.

			Sets ``book['links']`` to a dict mapping a file extension to the
			site-relative download path. The code bundle, when wanted, is stored
			under ``'zip'``.

			Raises:
				requests.HTTPError: If the book page request fails.
			"""
			r = self._session.get(book['url'])
			r.raise_for_status()

			book_page = BeautifulSoup(r.content, 'lxml')

			links = {}
			# An empty alternation would match every eBook link, so only search
			# when at least one type is wanted.
			if self._book_types:
				wanted = '|'.join(re.escape(t) for t in sorted(self._book_types))
				re_book_types = re.compile('^/ebook_download/.*/(?:' + wanted + ')')
				for link in book_page.find_all('a', href=re_book_types):
					filetype = link['href'].rsplit('/', maxsplit=1)[1]
					links[filetype] = link['href']

			# Searched independently of the eBook links, so a page without a
			# matching eBook link can still yield its code bundle.
			if 'code' in self._book_types:
				re_book_code = re.compile('^/code_download/[0-9]+')
				for code_link in book_page.find_all('a', href=re_book_code):
					links['zip'] = code_link['href']

			log.debug('{} link(s) found'.format(len(links)))
			book['links'] = links

		def download_books(self, dl_folder,
				organize_in_folders=True,
				overwrite=True):
			"""Download every wanted file of every eBook in the order history.

			Logs in first when needed. A failure on one file or one book is
			logged and does not stop the remaining downloads.

			Args:
				dl_folder: Destination directory; created when missing.
				organize_in_folders: When true, each book gets its own
					sub-directory named after its sanitized title.
				overwrite: When false, books with a sentinel file and files that
					already exist are skipped.
			"""
			if not self._logged_in:
				self.login()
			dl_folder = Path(dl_folder)
			if not dl_folder.exists():
				dl_folder.mkdir(parents=True)

			for book in self._get_books():
				try:
					safe_title = self._safe_filename(book['title'])

					if organize_in_folders:
						book_path = Path(dl_folder, safe_title)
						if not book_path.exists():
							book_path.mkdir(parents=True)
					else:
						book_path = dl_folder

					# Sentinel file to avoid downloading every book page
					# Once all the files for a book are downloaded, the
					# sentinel file for this book is added. If sentinel
					# exists when treating a book, this book is skipped.
					book_sentinel = Path(book_path, safe_title + "_ok")
					if book_sentinel.exists():
						if not overwrite:
							log.debug('Book already downloaded, skipping')
							continue
						book_sentinel.unlink()

					self._set_book_links(book)
					if not book['links']:
						# Nothing was fetched, so do not mark the book as done.
						log.debug('No download link found for {}'.format(book['title']))
						continue

					ok = True
					for filetype, link in book['links'].items():
						try:
							filename = '{0}.{1}'.format(safe_title, filetype)
							file_path = Path(book_path, filename)
							if not overwrite and file_path.exists():
								log.debug('File already exists, skipping')
								continue

							r = self._session.get('{}{}'.format(base_href, link))
							r.raise_for_status()

							with open(str(file_path), 'wb') as f:
								f.write(r.content)
						except Exception:
							ok = False
							log.exception('Unable to download {} for {}'.format(
								filetype, book['title']))
						else:
							# Reached only when the file was actually written.
							log.debug('Downloaded {}'.format(book['title']))

					# Write sentinel file
					if ok:
						book_sentinel.touch()
				except Exception:
					log.exception('Unable to save {}'.format(book['title']))

		def claim_book(self):
			"""Claim the book offered on the free-learning page.

			Logs in first when needed. This is best effort: any failure while
			claiming is logged and not raised.
			"""
			if not self._logged_in:
				self.login()
			try:
				r = self._session.get(self._URL['freebook'])
				r.raise_for_status()
				soup = BeautifulSoup(r.content, 'lxml')
				claim_link = soup.find('a', href=re.compile('^/freelearning-claim/.*'))
				if claim_link is None:
					log.error('Unable to claim book: no claim link found')
					return
				r = self._session.get('{}{}'.format(base_href, claim_link['href']))
				r.raise_for_status()
			except Exception:
				log.exception('Unable to claim book')

		@staticmethod
		def _safe_filename(filename):
			"""Replace every non-alphanumeric character by '_' and trim the ends."""
			return "".join(x if x.isalnum() else "_" for x in filename).strip('_')

	if __name__ == "__main__":
		# Claim the free book, then fetch whatever has not been downloaded yet.
		pb = PacktBooks(username, password)
		try:
			pb.claim_book()
			pb.download_books(save_path, overwrite=False)
		finally:
			# Always release the HTTP session, even when a step above raised.
			pb.logoff()