# Tafkas/packt2slack.py

## packt2slack.py
import json
import re

import requests
from lxml import html

# Root of the Packt site; page paths are appended to it.
BASEURL = 'https://www.packtpub.com'
# Headers sent with the scraping requests; the User-Agent identifies as Googlebot.
HEADER = {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}

# Slack destinations keyed by an arbitrary name. Each entry holds the target
# channel and the webhook URL that fetch_ebook() posts to.
# NOTE(review): the webhook URL is committed in plain text; webhook URLs are
# normally treated as secrets -- consider loading it from configuration.
OUTPUT = {'pav': {'channel': "#ebook-monkey",
                  'webhook': "https://hooks.slack.com/services/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"},
          # 'pav-family': {'channel': "#my-channel",
          #               'webhook': "https://hooks.slack.com/services/...."}
          }

# Sender name and avatar emoji put into every Slack payload.
SLACK_USERNAME = "ebook monkey"
SLACK_ICON_EMOJI = ":monkey_face:"


def fix_string(foo):
    """Return *foo* as a UTF-8 encoded byte string.

    :param foo: a byte string, a unicode string or any other object
    :return: UTF-8 encoded bytes; byte strings are returned untouched and
             non-string objects are converted to text first
    """
    if isinstance(foo, bytes):
        # Already encoded. Calling .encode() on a byte string makes Python 2
        # decode it as ASCII first, which fails on any non-ASCII byte.
        return foo
    try:
        text_type = unicode
    except NameError:  # Python 3 has no separate ``unicode`` type
        text_type = str
    if not isinstance(foo, text_type):
        foo = text_type(foo)
    return foo.encode('utf8')


def fetch_isbn13(detail_page_url):
    """Look up the ISBN13 shown on a Packt book detail page.

    :param detail_page_url: the url of the packt book detail page
    :return: text of the first element whose ``itemprop`` attribute includes
             the token ``isbn``
    """
    isbn_xpath = '//*[contains(concat( " ", @itemprop, " " ), concat( " ", "isbn", " " ))]'
    detail_response = requests.get(detail_page_url, headers=HEADER)
    isbn_nodes = html.fromstring(detail_response.text).xpath(isbn_xpath)
    # Only the first match is used; an IndexError propagates if nothing matched.
    return isbn_nodes[0].text


def fetch_amazon_rating(isbn13):
    """Fetches the Amazon rating and ASIN for a book via the Amazon search page

    Returns ``(None, None, None)`` when Amazon answers the search with HTTP 503,
    and leaves individual fields as None when they cannot be found on the page.

    :param isbn13: An ISBN13 number, used as the search keyword
    :return: tuple of (asin, amazon rating as float, number of ratings); the
             number of ratings is always None because it is not extracted
    """
    url = '''https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords={}'''.format(isbn13)
    try:
        # The session is only used to collect the landing page cookies; the
        # context manager closes it again afterwards.
        with requests.Session() as s:
            cookies = dict(s.get('https://www.amazon.com').cookies)
        response = requests.get(url, headers=HEADER, cookies=cookies)
        if response.status_code == 503:
            response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # Parenthesised so the line parses on both Python 2 and Python 3.
        print("oops something unexpected happened: {}".format(e))
        # The error page carries no search results, so there is nothing to parse.
        return None, None, None

    tree = html.fromstring(response.text)
    rating_nodes = tree.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "a-icon-alt", " " ))]')

    # get rating: the first number found in any of the "a-icon-alt" texts
    rating_regex = re.compile(r'(\d+(\.\d+)?)')
    amazon_rating, number_of_ratings = None, None
    for node in rating_nodes:
        # .text is None for elements without direct text content
        m = rating_regex.search(node.text or '')
        if m:
            amazon_rating = float(m.group(0))
            break

    # get asin (is either isbn10 or amazon specific)
    links = tree.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "a-link-normal a-text-normal", " " ))]')
    asin = None
    if links:  # an empty result page has no product link at all
        href = links[0].attrib.get('href', '')
        # Try ten digits first, then a "B..." ASIN or an ISBN10 ending in X.
        m = re.search(r'\d{10}', href) or re.search(r'(B[0-9]{2}[0-9A-Z]{7}|[0-9]{9}[0-9X])', href)
        if m:
            asin = m.group(0)

    return asin, amazon_rating, number_of_ratings


def fetch_ebook():
    """Post today's free Packt ebook, with its Amazon rating, to every Slack target in OUTPUT.

    Scrapes the free-learning offer page for title, description and cover image,
    reads the ISBN13 from the linked detail page, looks the book up on Amazon
    and sends one webhook message per entry in ``OUTPUT``.
    """
    free_book_url = BASEURL + '/packt/offers/free-learning'
    offer_page = requests.get(free_book_url, headers=HEADER)
    dom = html.fromstring(offer_page.text)

    title_nodes = dom.xpath('//*[(@id = "deal-of-the-day")]//h2')
    title = title_nodes[0].text.strip()
    description_nodes = dom.xpath('''//*[(@id = "deal-of-the-day")]
    //div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]''')
    description = description_nodes[0].text.strip()
    cover_nodes = dom.xpath('''//*[contains(concat( " ", @class, " " ),
                                concat( " ", "imagecache-dotd_main_image", " " ))]''')

    # The first character of src is dropped and "https:/" put in front
    # (presumably src is protocol-relative, "//host/path" -- verify); spaces
    # are percent-encoded afterwards.
    cover_src = cover_nodes[0].attrib['src']
    image_url = 'https:/{}'.format(fix_string(cover_src[1:])).replace(' ', '%20')

    detail_links = dom.xpath(
        '''//*[contains(concat( " ", @class, " " ), concat( " ", "dotd-main-book-image", " " ))]//a''')
    isbn13 = fetch_isbn13(BASEURL + detail_links[0].attrib['href'])

    asin, amazon_rating, number_of_ratings = fetch_amazon_rating(isbn13)
    encoded_title = fix_string(title)
    if amazon_rating is None:
        unrated_template = ("{title} has not been rated on "
                            "<https://www.amazon.de/dp/{asin}?tag=de125725875-21|Amazon.com> yet.")
        amazon_text = unrated_template.format(title=encoded_title, asin=asin)
    else:
        # One star emoji per point of the rating, rounded to a whole number.
        stars = ':star:' * int(round(amazon_rating, 0))
        rated_template = ("{title} has been rated \n {rating} out of 5 stars {stars} on "
                          "<https://www.amazon.de/dp/{asin}?tag=de125725875-21|Amazon.com>.")
        amazon_text = rated_template.format(title=encoded_title,
                                            rating=amazon_rating,
                                            stars=stars,
                                            number_of_ratings=number_of_ratings,
                                            asin=asin)

    book_attachment = {
        "title": "Free ebook today: {title}".format(title=encoded_title),
        "title_link": free_book_url,
        "text": description,
        "image_url": image_url}
    amazon_attachment = {
        "title": "Amazon Rating",
        "text": amazon_text
    }

    for target in OUTPUT.values():
        payload = {"channel": target['channel'],
                   "username": SLACK_USERNAME,
                   "icon_emoji": SLACK_ICON_EMOJI,
                   "attachments": [book_attachment, amazon_attachment]}
        reply = requests.post(target['webhook'],
                              data=json.dumps(payload),
                              headers={"content-type": "text/javascript"})
        # Anything other than HTTP 200 is only reported, not retried.
        if reply.status_code != 200:
            print(reply)


# Run the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    fetch_ebook()
	import json
	import re

	import requests
	from lxml import html

	# Root of the Packt site; page paths are appended to it.
	BASEURL = 'https://www.packtpub.com'
	# Headers sent with the scraping requests; the User-Agent identifies as Googlebot.
	HEADER = {'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}

	# Slack destinations keyed by an arbitrary name. Each entry holds the target
	# channel and the webhook URL that fetch_ebook() posts to.
	# NOTE(review): the webhook URL is committed in plain text; webhook URLs are
	# normally treated as secrets -- consider loading it from configuration.
	OUTPUT = {'pav': {'channel': "#ebook-monkey",
	'webhook': "https://hooks.slack.com/services/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"},
	# 'pav-family': {'channel': "#my-channel",
	# 'webhook': "https://hooks.slack.com/services/...."}
	}

	# Sender name and avatar emoji put into every Slack payload.
	SLACK_USERNAME = "ebook monkey"
	SLACK_ICON_EMOJI = ":monkey_face:"


	def fix_string(foo):
	    """Return *foo* as a UTF-8 encoded byte string.

	    :param foo: a byte string, a unicode string or any other object
	    :return: UTF-8 encoded bytes; byte strings are returned untouched and
	             non-string objects are converted to text first
	    """
	    if isinstance(foo, bytes):
	        # Already encoded. Calling .encode() on a byte string makes Python 2
	        # decode it as ASCII first, which fails on any non-ASCII byte.
	        return foo
	    try:
	        text_type = unicode
	    except NameError:  # Python 3 has no separate ``unicode`` type
	        text_type = str
	    if not isinstance(foo, text_type):
	        foo = text_type(foo)
	    return foo.encode('utf8')


	def fetch_isbn13(detail_page_url):
	    """Look up the ISBN13 shown on a Packt book detail page.

	    :param detail_page_url: the url of the packt book detail page
	    :return: text of the first element whose ``itemprop`` attribute includes
	             the token ``isbn``
	    """
	    isbn_xpath = '//*[contains(concat( " ", @itemprop, " " ), concat( " ", "isbn", " " ))]'
	    detail_response = requests.get(detail_page_url, headers=HEADER)
	    isbn_nodes = html.fromstring(detail_response.text).xpath(isbn_xpath)
	    # Only the first match is used; an IndexError propagates if nothing matched.
	    return isbn_nodes[0].text


	def fetch_amazon_rating(isbn13):
	    """Fetches the Amazon rating and ASIN for a book via the Amazon search page

	    Returns ``(None, None, None)`` when Amazon answers the search with HTTP 503,
	    and leaves individual fields as None when they cannot be found on the page.

	    :param isbn13: An ISBN13 number, used as the search keyword
	    :return: tuple of (asin, amazon rating as float, number of ratings); the
	             number of ratings is always None because it is not extracted
	    """
	    url = '''https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords={}'''.format(isbn13)
	    try:
	        # The session is only used to collect the landing page cookies; the
	        # context manager closes it again afterwards.
	        with requests.Session() as s:
	            cookies = dict(s.get('https://www.amazon.com').cookies)
	        response = requests.get(url, headers=HEADER, cookies=cookies)
	        if response.status_code == 503:
	            response.raise_for_status()
	    except requests.exceptions.HTTPError as e:
	        # Parenthesised so the line parses on both Python 2 and Python 3.
	        print("oops something unexpected happened: {}".format(e))
	        # The error page carries no search results, so there is nothing to parse.
	        return None, None, None

	    tree = html.fromstring(response.text)
	    rating_nodes = tree.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "a-icon-alt", " " ))]')

	    # get rating: the first number found in any of the "a-icon-alt" texts
	    rating_regex = re.compile(r'(\d+(\.\d+)?)')
	    amazon_rating, number_of_ratings = None, None
	    for node in rating_nodes:
	        # .text is None for elements without direct text content
	        m = rating_regex.search(node.text or '')
	        if m:
	            amazon_rating = float(m.group(0))
	            break

	    # get asin (is either isbn10 or amazon specific)
	    links = tree.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "a-link-normal a-text-normal", " " ))]')
	    asin = None
	    if links:  # an empty result page has no product link at all
	        href = links[0].attrib.get('href', '')
	        # Try ten digits first, then a "B..." ASIN or an ISBN10 ending in X.
	        m = re.search(r'\d{10}', href) or re.search(r'(B[0-9]{2}[0-9A-Z]{7}|[0-9]{9}[0-9X])', href)
	        if m:
	            asin = m.group(0)

	    return asin, amazon_rating, number_of_ratings


	def fetch_ebook():
	    """Post today's free Packt ebook, with its Amazon rating, to every Slack target in OUTPUT.

	    Scrapes the free-learning offer page for title, description and cover image,
	    reads the ISBN13 from the linked detail page, looks the book up on Amazon
	    and sends one webhook message per entry in ``OUTPUT``.
	    """
	    free_book_url = BASEURL + '/packt/offers/free-learning'
	    offer_page = requests.get(free_book_url, headers=HEADER)
	    dom = html.fromstring(offer_page.text)

	    title_nodes = dom.xpath('//*[(@id = "deal-of-the-day")]//h2')
	    title = title_nodes[0].text.strip()
	    # The axis steps need an explicit node test ("*") to be valid XPath.
	    description_nodes = dom.xpath('''//*[(@id = "deal-of-the-day")]
	    //div[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]''')
	    description = description_nodes[0].text.strip()
	    cover_nodes = dom.xpath('''//*[contains(concat( " ", @class, " " ),
	                                concat( " ", "imagecache-dotd_main_image", " " ))]''')

	    # The first character of src is dropped and "https:/" put in front
	    # (presumably src is protocol-relative, "//host/path" -- verify); spaces
	    # are percent-encoded afterwards.
	    cover_src = cover_nodes[0].attrib['src']
	    image_url = 'https:/{}'.format(fix_string(cover_src[1:])).replace(' ', '%20')

	    detail_links = dom.xpath(
	        '''//*[contains(concat( " ", @class, " " ), concat( " ", "dotd-main-book-image", " " ))]//a''')
	    isbn13 = fetch_isbn13(BASEURL + detail_links[0].attrib['href'])

	    asin, amazon_rating, number_of_ratings = fetch_amazon_rating(isbn13)
	    encoded_title = fix_string(title)
	    # Slack link markup is "<url|label>"; the separator must be a bare pipe.
	    if amazon_rating is None:
	        unrated_template = ("{title} has not been rated on "
	                            "<https://www.amazon.de/dp/{asin}?tag=de125725875-21|Amazon.com> yet.")
	        amazon_text = unrated_template.format(title=encoded_title, asin=asin)
	    else:
	        # One star emoji per point of the rating, rounded to a whole number.
	        stars = ':star:' * int(round(amazon_rating, 0))
	        rated_template = ("{title} has been rated \n {rating} out of 5 stars {stars} on "
	                          "<https://www.amazon.de/dp/{asin}?tag=de125725875-21|Amazon.com>.")
	        amazon_text = rated_template.format(title=encoded_title,
	                                            rating=amazon_rating,
	                                            stars=stars,
	                                            number_of_ratings=number_of_ratings,
	                                            asin=asin)

	    book_attachment = {
	        "title": "Free ebook today: {title}".format(title=encoded_title),
	        "title_link": free_book_url,
	        "text": description,
	        "image_url": image_url}
	    amazon_attachment = {
	        "title": "Amazon Rating",
	        "text": amazon_text
	    }

	    for target in OUTPUT.values():
	        payload = {"channel": target['channel'],
	                   "username": SLACK_USERNAME,
	                   "icon_emoji": SLACK_ICON_EMOJI,
	                   "attachments": [book_attachment, amazon_attachment]}
	        reply = requests.post(target['webhook'],
	                              data=json.dumps(payload),
	                              headers={"content-type": "text/javascript"})
	        # Anything other than HTTP 200 is only reported, not retried.
	        if reply.status_code != 200:
	            print(reply)


	# Run the scraper only when the file is executed as a script, not on import.
	if __name__ == '__main__':
	    fetch_ebook()