kspeeckaert/packt_books.py

## packt_books.py
from pathlib import Path
import logging
import re
import requests
from bs4 import BeautifulSoup

# Module-level logger named after this module. Only a NullHandler is attached
# here, so output handling is left to whatever the application configures.
log = logging.getLogger(__name__)
log.addHandler(logging.NullHandler())

class PacktBooks:
    """Client for a Packt Publishing web account.

    Logs in with the supplied credentials, lists the books shown on the
    account's e-books page, downloads them in the wanted file types and
    claims the title offered on the free-learning page.
    """

    # Prefix for the site-relative links scraped from the pages.
    _BASE_URL = 'https://www.packtpub.com'

    _URL = {'login': 'https://www.packtpub.com/',
            'freebook': 'https://www.packtpub.com/packt/offers/free-learning',
            'mybooks': 'https://www.packtpub.com/account/my-ebooks'}

    # File types that can be selected through ``book_types``.
    _AVAIL_BOOK_TYPES = {'pdf', 'epub', 'mobi', 'code'}

    def __init__(self, username, password):
        """Store the credentials; no network request is made here.

        Args:
            username: Value sent as the ``email`` field of the login form.
            password: Value sent as the ``password`` field of the login form.
        """
        self._session = None
        # Copy so the instance never aliases the class-level set.
        self._book_types = set(self._AVAIL_BOOK_TYPES)
        self._username = username
        self._password = password
        self._logged_in = False

    def _create_session(self):
        """Create the requests session and set browser-like default headers."""
        self._session = requests.Session()
        self._session.headers.update({'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                                      'Accept-Encoding': 'gzip, deflate',
                                      'Connection': 'keep-alive',
                                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})

    @property
    def book_types(self):
        """Wanted file types as a comma-separated string (sorted)."""
        return ','.join(sorted(self._book_types))

    @book_types.setter
    def book_types(self, types):
        # A bare string is one type, not an iterable of characters.
        wanted = {types} if isinstance(types, str) else set(types)
        # Unknown types are dropped silently.
        self._book_types = wanted & self._AVAIL_BOOK_TYPES
        log.debug('Set wanted book types to %s', self.book_types)

    def login(self):
        """Submit the login form and mark this client as logged in.

        Raises:
            requests.HTTPError: If the login page or the form submission
                returns an error status.
            RuntimeError: If the login form is not present on the page.
        """
        if self._session is None:
            self._create_session()

        payload = {'email': self._username,
                   'password': self._password,
                   'op': 'Login'}

        # Fetch the form through the session (not a bare requests.get) so the
        # headers and any cookies set by this page are shared with the POST.
        r = self._session.get(self._URL['login'])
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'lxml')

        form = soup.find('form', {'id': 'packt-user-login-form'})
        if form is None:
            raise RuntimeError(
                'Login form not found on {}'.format(self._URL['login']))

        # Copy the hidden form fields (form_id, form_build_id, ...).
        for elem in form.find_all('input', type='hidden'):
            if elem.has_attr('name'):
                payload[elem['name']] = elem.get('value', '')

        r = self._session.post(url=self._URL['login'],
                               data=payload)
        r.raise_for_status()
        # NOTE(review): only the HTTP status is checked here; the response
        # body is not inspected for a rejected-credentials message.
        self._logged_in = True

    def logoff(self):
        """Close and discard the session.

        No logout request is sent to the site. Dropping the session object
        makes the next ``login()`` start from a fresh one.
        """
        if self._session is not None:
            self._session.close()
            self._session = None
            self._logged_in = False

    def _link_patterns(self):
        """Compile the patterns for the wanted download links.

        Returns:
            A ``(re_book_types, re_book_code)`` tuple. Either item is
            ``None`` when the matching kind of link is not wanted.
        """
        re_book_types = None
        if self._book_types:
            # With no types the alternation would be empty and match every
            # link, so the pattern is only built when something is wanted.
            alternatives = '|'.join(re.escape(t)
                                    for t in sorted(self._book_types))
            re_book_types = re.compile(
                r'^/ebook_download/.*/(?:' + alternatives + ')')

        re_book_code = None
        if 'code' in self._book_types:
            re_book_code = re.compile(r'^/code_download/[0-9]+')
        return re_book_types, re_book_code

    @staticmethod
    def _select_links(hrefs, re_book_types, re_book_code):
        """Pick the wanted download links out of one book's hrefs.

        Args:
            hrefs: List of href strings found in the book entry.
            re_book_types: Compiled e-book link pattern, or ``None``.
            re_book_code: Compiled code link pattern, or ``None``.

        Returns:
            Dict mapping a file type (the last path segment of the link) to
            its href. A code download is stored under the key ``'zip'``.
        """
        links = {}
        if re_book_types is not None:
            for href in hrefs:
                if re_book_types.search(href):
                    links[href.rsplit('/', maxsplit=1)[1]] = href
        # Looked up independently of the e-book links, so a code archive is
        # found even when no e-book link matched.
        if re_book_code is not None:
            for href in hrefs:
                if re_book_code.search(href):
                    links['zip'] = href
        return links

    def _get_books(self):
        """Yield one dict per book listed on the account page.

        Yields:
            ``{'title': str, 'links': {filetype: href}}``

        Raises:
            requests.HTTPError: If the page request returns an error status.
            RuntimeError: If the book list container is not on the page.
        """
        re_book_types, re_book_code = self._link_patterns()

        r = self._session.get(self._URL['mybooks'])
        r.raise_for_status()

        soup = BeautifulSoup(r.content, 'lxml')
        book_list = soup.find('div', id='product-account-list')
        if book_list is None:
            raise RuntimeError(
                'Book list not found on {}'.format(self._URL['mybooks']))

        for book in book_list.find_all('div', class_='product-line'):
            if not book.has_attr('title'):
                log.debug('Skipping fake book entry')
                continue
            # NOTE(review): the last 8 characters are dropped unconditionally,
            # presumably a ' [eBook]'-style postfix; confirm against the page.
            title = book['title'][:-8]
            log.debug('Found "%s"', title)

            hrefs = [a['href'] for a in book.find_all('a', href=True)]
            links = self._select_links(hrefs, re_book_types, re_book_code)
            log.debug('%s link(s) found', len(links))
            yield {'title': title, 'links': links}

    def _download_file(self, link, file_path):
        """Fetch a site-relative link and write the response to file_path."""
        r = self._session.get(self._BASE_URL + link)
        r.raise_for_status()

        with open(str(file_path), 'wb') as f:
            f.write(r.content)

    def download_books(self, dl_folder,
                       organize_in_folders=True,
                       overwrite=True):
        """Download every listed book in the wanted file types.

        Logs in first when needed. A failure on one file or one book is
        logged and does not stop the remaining downloads.

        Args:
            dl_folder: Target folder (string or ``Path``).
            organize_in_folders: When true, each book gets its own sub-folder
                named after its sanitized title.
            overwrite: When false, files that already exist are skipped.

        Raises:
            Whatever ``login()`` or the retrieval of the book list raises.
        """
        if not self._logged_in:
            self.login()
        dl_folder = Path(dl_folder)

        for book in self._get_books():
            try:
                safe_title = self._safe_filename(book['title'])

                if organize_in_folders:
                    book_path = Path(dl_folder, safe_title)
                    # A folder left by an earlier run must not fail the book.
                    if not book_path.is_dir():
                        book_path.mkdir(parents=True)
                else:
                    book_path = dl_folder

                for filetype, link in book['links'].items():
                    try:
                        filename = '{0}.{1}'.format(safe_title, filetype)
                        file_path = Path(book_path, filename)
                        if not overwrite and file_path.exists():
                            log.debug('File already exists, skipping')
                            continue
                        self._download_file(link, file_path)
                    except Exception:
                        log.exception('Unable to download %s for %s',
                                      filetype, book['title'])
                    else:
                        # Reached only after a completed download, not after
                        # a skip or a failure.
                        log.debug('Downloaded %s for %s',
                                  filetype, book['title'])
            except Exception:
                log.exception('Unable to save %s', book['title'])

    def claim_book(self):
        """Claim the title offered on the free-learning page.

        Logs in first when needed. Any failure is logged, never raised.
        """
        try:
            if not self._logged_in:
                self.login()

            r = self._session.get(self._URL['freebook'])
            r.raise_for_status()
            soup = BeautifulSoup(r.content, 'lxml')

            claim_link = soup.find(
                'a', href=re.compile(r'^/freelearning-claim/.*'))
            if claim_link is None:
                log.error('No claim link found on %s', self._URL['freebook'])
                return

            r = self._session.get(self._BASE_URL + claim_link['href'])
            r.raise_for_status()
        except Exception:
            log.exception('Unable to claim book')

    @staticmethod
    def _safe_filename(filename):
        """Replace non-alphanumeric characters with '_' and trim the ends."""
        return "".join(x if x.isalnum() else "_" for x in filename).strip('_')
	from pathlib import Path
	import logging
	import re
	import requests
	from bs4 import BeautifulSoup

	# Module-level logger named after this module. Only a NullHandler is attached
	# here, so output handling is left to whatever the application configures.
	log = logging.getLogger(__name__)
	log.addHandler(logging.NullHandler())

	class PacktBooks:
	    """Client for a Packt Publishing web account.

	    Logs in with the supplied credentials, lists the books shown on the
	    account's e-books page, downloads them in the wanted file types and
	    claims the title offered on the free-learning page.
	    """

	    # Prefix for the site-relative links scraped from the pages.
	    _BASE_URL = 'https://www.packtpub.com'

	    _URL = {'login': 'https://www.packtpub.com/',
	            'freebook': 'https://www.packtpub.com/packt/offers/free-learning',
	            'mybooks': 'https://www.packtpub.com/account/my-ebooks'}

	    # File types that can be selected through ``book_types``.
	    _AVAIL_BOOK_TYPES = {'pdf', 'epub', 'mobi', 'code'}

	    def __init__(self, username, password):
	        """Store the credentials; no network request is made here.

	        Args:
	            username: Value sent as the ``email`` field of the login form.
	            password: Value sent as the ``password`` field of the login form.
	        """
	        self._session = None
	        # Copy so the instance never aliases the class-level set.
	        self._book_types = set(self._AVAIL_BOOK_TYPES)
	        self._username = username
	        self._password = password
	        self._logged_in = False

	    def _create_session(self):
	        """Create the requests session and set browser-like default headers."""
	        self._session = requests.Session()
	        self._session.headers.update({'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	                                      'Accept-Encoding': 'gzip, deflate',
	                                      'Connection': 'keep-alive',
	                                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})

	    @property
	    def book_types(self):
	        """Wanted file types as a comma-separated string (sorted)."""
	        return ','.join(sorted(self._book_types))

	    @book_types.setter
	    def book_types(self, types):
	        # A bare string is one type, not an iterable of characters.
	        wanted = {types} if isinstance(types, str) else set(types)
	        # Unknown types are dropped silently.
	        self._book_types = wanted & self._AVAIL_BOOK_TYPES
	        log.debug('Set wanted book types to %s', self.book_types)

	    def login(self):
	        """Submit the login form and mark this client as logged in.

	        Raises:
	            requests.HTTPError: If the login page or the form submission
	                returns an error status.
	            RuntimeError: If the login form is not present on the page.
	        """
	        if self._session is None:
	            self._create_session()

	        payload = {'email': self._username,
	                   'password': self._password,
	                   'op': 'Login'}

	        # Fetch the form through the session (not a bare requests.get) so the
	        # headers and any cookies set by this page are shared with the POST.
	        r = self._session.get(self._URL['login'])
	        r.raise_for_status()
	        soup = BeautifulSoup(r.content, 'lxml')

	        form = soup.find('form', {'id': 'packt-user-login-form'})
	        if form is None:
	            raise RuntimeError(
	                'Login form not found on {}'.format(self._URL['login']))

	        # Copy the hidden form fields (form_id, form_build_id, ...).
	        for elem in form.find_all('input', type='hidden'):
	            if elem.has_attr('name'):
	                payload[elem['name']] = elem.get('value', '')

	        r = self._session.post(url=self._URL['login'],
	                               data=payload)
	        r.raise_for_status()
	        # NOTE(review): only the HTTP status is checked here; the response
	        # body is not inspected for a rejected-credentials message.
	        self._logged_in = True

	    def logoff(self):
	        """Close and discard the session.

	        No logout request is sent to the site. Dropping the session object
	        makes the next ``login()`` start from a fresh one.
	        """
	        if self._session is not None:
	            self._session.close()
	            self._session = None
	            self._logged_in = False

	    def _link_patterns(self):
	        """Compile the patterns for the wanted download links.

	        Returns:
	            A ``(re_book_types, re_book_code)`` tuple. Either item is
	            ``None`` when the matching kind of link is not wanted.
	        """
	        re_book_types = None
	        if self._book_types:
	            # With no types the alternation would be empty and match every
	            # link, so the pattern is only built when something is wanted.
	            alternatives = '|'.join(re.escape(t)
	                                    for t in sorted(self._book_types))
	            re_book_types = re.compile(
	                r'^/ebook_download/.*/(?:' + alternatives + ')')

	        re_book_code = None
	        if 'code' in self._book_types:
	            re_book_code = re.compile(r'^/code_download/[0-9]+')
	        return re_book_types, re_book_code

	    @staticmethod
	    def _select_links(hrefs, re_book_types, re_book_code):
	        """Pick the wanted download links out of one book's hrefs.

	        Args:
	            hrefs: List of href strings found in the book entry.
	            re_book_types: Compiled e-book link pattern, or ``None``.
	            re_book_code: Compiled code link pattern, or ``None``.

	        Returns:
	            Dict mapping a file type (the last path segment of the link) to
	            its href. A code download is stored under the key ``'zip'``.
	        """
	        links = {}
	        if re_book_types is not None:
	            for href in hrefs:
	                if re_book_types.search(href):
	                    links[href.rsplit('/', maxsplit=1)[1]] = href
	        # Looked up independently of the e-book links, so a code archive is
	        # found even when no e-book link matched.
	        if re_book_code is not None:
	            for href in hrefs:
	                if re_book_code.search(href):
	                    links['zip'] = href
	        return links

	    def _get_books(self):
	        """Yield one dict per book listed on the account page.

	        Yields:
	            ``{'title': str, 'links': {filetype: href}}``

	        Raises:
	            requests.HTTPError: If the page request returns an error status.
	            RuntimeError: If the book list container is not on the page.
	        """
	        re_book_types, re_book_code = self._link_patterns()

	        r = self._session.get(self._URL['mybooks'])
	        r.raise_for_status()

	        soup = BeautifulSoup(r.content, 'lxml')
	        book_list = soup.find('div', id='product-account-list')
	        if book_list is None:
	            raise RuntimeError(
	                'Book list not found on {}'.format(self._URL['mybooks']))

	        for book in book_list.find_all('div', class_='product-line'):
	            if not book.has_attr('title'):
	                log.debug('Skipping fake book entry')
	                continue
	            # NOTE(review): the last 8 characters are dropped unconditionally,
	            # presumably a ' [eBook]'-style postfix; confirm against the page.
	            title = book['title'][:-8]
	            log.debug('Found "%s"', title)

	            hrefs = [a['href'] for a in book.find_all('a', href=True)]
	            links = self._select_links(hrefs, re_book_types, re_book_code)
	            log.debug('%s link(s) found', len(links))
	            yield {'title': title, 'links': links}

	    def _download_file(self, link, file_path):
	        """Fetch a site-relative link and write the response to file_path."""
	        r = self._session.get(self._BASE_URL + link)
	        r.raise_for_status()

	        with open(str(file_path), 'wb') as f:
	            f.write(r.content)

	    def download_books(self, dl_folder,
	                       organize_in_folders=True,
	                       overwrite=True):
	        """Download every listed book in the wanted file types.

	        Logs in first when needed. A failure on one file or one book is
	        logged and does not stop the remaining downloads.

	        Args:
	            dl_folder: Target folder (string or ``Path``).
	            organize_in_folders: When true, each book gets its own sub-folder
	                named after its sanitized title.
	            overwrite: When false, files that already exist are skipped.

	        Raises:
	            Whatever ``login()`` or the retrieval of the book list raises.
	        """
	        if not self._logged_in:
	            self.login()
	        dl_folder = Path(dl_folder)

	        for book in self._get_books():
	            try:
	                safe_title = self._safe_filename(book['title'])

	                if organize_in_folders:
	                    book_path = Path(dl_folder, safe_title)
	                    # A folder left by an earlier run must not fail the book.
	                    if not book_path.is_dir():
	                        book_path.mkdir(parents=True)
	                else:
	                    book_path = dl_folder

	                for filetype, link in book['links'].items():
	                    try:
	                        filename = '{0}.{1}'.format(safe_title, filetype)
	                        file_path = Path(book_path, filename)
	                        if not overwrite and file_path.exists():
	                            log.debug('File already exists, skipping')
	                            continue
	                        self._download_file(link, file_path)
	                    except Exception:
	                        log.exception('Unable to download %s for %s',
	                                      filetype, book['title'])
	                    else:
	                        # Reached only after a completed download, not after
	                        # a skip or a failure.
	                        log.debug('Downloaded %s for %s',
	                                  filetype, book['title'])
	            except Exception:
	                log.exception('Unable to save %s', book['title'])

	    def claim_book(self):
	        """Claim the title offered on the free-learning page.

	        Logs in first when needed. Any failure is logged, never raised.
	        """
	        try:
	            if not self._logged_in:
	                self.login()

	            r = self._session.get(self._URL['freebook'])
	            r.raise_for_status()
	            soup = BeautifulSoup(r.content, 'lxml')

	            claim_link = soup.find(
	                'a', href=re.compile(r'^/freelearning-claim/.*'))
	            if claim_link is None:
	                log.error('No claim link found on %s', self._URL['freebook'])
	                return

	            r = self._session.get(self._BASE_URL + claim_link['href'])
	            r.raise_for_status()
	        except Exception:
	            log.exception('Unable to claim book')

	    @staticmethod
	    def _safe_filename(filename):
	        """Replace non-alphanumeric characters with '_' and trim the ends."""
	        return "".join(x if x.isalnum() else "_" for x in filename).strip('_')