TrishGillett/packt_scraper.py

## packt_scraper.py
# -*- coding: utf-8 -*-
"""
author: Trish Gillett (discardthree@gmail.com, @discardthree on github)
Basic scraper to check the Packt Publishing Free ebook of the day.
This version uses selenium because when I tried getting the source via the
requests package it sometimes seemed to return the source for a version of
the website that was different from the one that was live.

Adapted from this code which was used by Estela Alvarez (supita@gmail.com)
to demo webscraping at a Montreal Pyladies meeting:
https://github.com/supita/Pyladies-Python-Web-Scraping
"""


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

def get_webdriver():
    """Launch a maximized Firefox window with an overridden user agent.

    Returns:
        The ``webdriver.Firefox`` instance. It is not closed here; that is
        left to whoever called this function.
    """
    firefox_profile = webdriver.FirefoxProfile()
    # Override the user-agent string that Firefox reports to web sites.
    firefox_profile.set_preference(
        "general.useragent.override",
        "some UA string",
    )

    driver = webdriver.Firefox(firefox_profile)
    driver.maximize_window()
    return driver


def check_dotd(browser):
    """Load the Packt free-learning page and announce the book of the day.

    Waits up to 3 seconds for an element with class ``dotd-title`` to be
    present, then hands the page source to ``announce_book_of_the_day``.

    Args:
        browser: A selenium WebDriver instance. It is left open in every
            case; closing it is the caller's job, which avoids closing the
            same browser twice.

    Returns:
        None. If the title element does not appear in time, a short failure
        message is printed instead of the announcement.
    """
    # Imported here so the block carries its own dependency.
    from selenium.common.exceptions import TimeoutException

    browser.get('https://www.packtpub.com/packt/offers/free-learning/')
    try:
        # Only the wait is guarded: it is the one step expected to fail
        # (with TimeoutException) when the page layout changes or is slow.
        WebDriverWait(browser, 3).until(
            EC.presence_of_element_located((By.CLASS_NAME, "dotd-title")))
    except TimeoutException:
        print("Couldn't find out the book of the day, sorry!")
        return

    announce_book_of_the_day(browser.page_source)


def announce_book_of_the_day(page_source):
    """Print the title of the free book found in the given page source.

    The title is taken from the first child of the ``<h2>`` inside the
    ``<div class="dotd-title">`` element, stripped of surrounding whitespace.

    Args:
        page_source: HTML of the free-learning page.

    Returns:
        None. Prints the announcement, or an apology if the title cannot be
        extracted from the markup.
    """
    try:
        soup = BeautifulSoup(page_source, "html.parser")
        book_header = soup.find("div", {"class": "dotd-title"}).h2
        book_title = book_header.contents[0].strip()
    except (AttributeError, IndexError, TypeError):
        # AttributeError: the div or its h2 is missing (lookup gave None).
        # IndexError: the h2 has no children.
        # TypeError: unusable input, or the first child is a tag, not text.
        print("Couldn't find out the book of the day, sorry!")
        return

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Today's free book at is: " + book_title)
    print("Go to https://www.packtpub.com/packt/offers/free-learning/ to see for yourself!")


def main():
    """Open a browser, check the free book of the day, and shut it down.

    The browser is shut down in a ``finally`` clause so it does not stay
    open if the check raises.
    """
    browser = get_webdriver()
    try:
        check_dotd(browser)
    finally:
        # quit() ends the whole driver session, not just the current window.
        browser.quit()

# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
	# -*- coding: utf-8 -*-
	"""
	author: Trish Gillett (discardthree@gmail.com, @discardthree on github)
	Basic scraper to check the Packt Publishing Free ebook of the day.
	This version uses selenium because when I tried getting the source via the
	requests package it sometimes seemed to return the source for a version of
	the website that was different from the one that was live.

	Adapted from this code which was used by Estela Alvarez (supita@gmail.com)
	to demo webscraping at a Montreal Pyladies meeting:
	https://github.com/supita/Pyladies-Python-Web-Scraping
	"""


	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from bs4 import BeautifulSoup

	def get_webdriver():
		"""Launch a maximized Firefox window with an overridden user agent.

		Returns:
			The ``webdriver.Firefox`` instance. It is not closed here; that
			is left to whoever called this function.
		"""
		profile = webdriver.FirefoxProfile()
		# Override the user-agent string that Firefox reports to web sites.
		profile.set_preference("general.useragent.override", "some UA string")
		browser = webdriver.Firefox(profile)
		browser.maximize_window()

		return browser


	def check_dotd(browser):
		"""Load the Packt free-learning page and announce the book of the day.

		Waits up to 3 seconds for an element with class ``dotd-title`` to be
		present, then hands the page source to ``announce_book_of_the_day``.

		Args:
			browser: A selenium WebDriver instance. It is left open in every
				case; closing it is the caller's job, which avoids closing
				the same browser twice.

		Returns:
			None. If the title element does not appear in time, a short
			failure message is printed instead of the announcement.
		"""
		# Imported here so the block carries its own dependency.
		from selenium.common.exceptions import TimeoutException

		browser.get('https://www.packtpub.com/packt/offers/free-learning/')
		try:
			# Only the wait is guarded: it is the one step expected to fail
			# (with TimeoutException) when the page is slow or has changed.
			WebDriverWait(browser, 3).until(
				EC.presence_of_element_located((By.CLASS_NAME, "dotd-title")))
		except TimeoutException:
			print("Couldn't find out the book of the day, sorry!")
			return

		announce_book_of_the_day(browser.page_source)


	def announce_book_of_the_day(page_source):
		"""Print the title of the free book found in the given page source.

		The title is taken from the first child of the ``<h2>`` inside the
		``<div class="dotd-title">`` element, stripped of surrounding
		whitespace.

		Args:
			page_source: HTML of the free-learning page.

		Returns:
			None. Prints the announcement, or an apology if the title cannot
			be extracted from the markup.
		"""
		try:
			soup = BeautifulSoup(page_source, "html.parser")
			book_header = soup.find("div", {"class": "dotd-title"}).h2
			book_title = book_header.contents[0].strip()
		except (AttributeError, IndexError, TypeError):
			# AttributeError: the div or its h2 is missing (lookup gave None).
			# IndexError: the h2 has no children.
			# TypeError: unusable input, or the first child is a tag, not text.
			print("Couldn't find out the book of the day, sorry!")
			return

		# Single-argument print(...) gives the same output on Python 2 and 3.
		print("Today's free book at is: " + book_title)
		print("Go to https://www.packtpub.com/packt/offers/free-learning/ to see for yourself!")


	def main():
		"""Open a browser, check the free book of the day, and shut it down.

		The browser is shut down in a ``finally`` clause so it does not stay
		open if the check raises.
		"""
		browser = get_webdriver()
		try:
			check_dotd(browser)
		finally:
			# quit() ends the whole driver session, not just the current window.
			browser.quit()

	# Run the scraper only when this file is executed as a script.
	if __name__ == '__main__':
		main()