# alexshevchuk/pgessays.py

## pgessays.py
# -*- coding: utf-8 -*-
"""
Builds epub book out of Paul Graham's essays: http://paulgraham.com/articles.html

Author: Ola Sitarska <ola@sitarska.com>
Copyright: Licensed under the GPL-3 (http://www.gnu.org/licenses/gpl-3.0.html)

This script requires python-epub-library: http://code.google.com/p/python-epub-builder/
"""

import re, ez_epub, genshi
import urllib3
from bs4 import BeautifulSoup

def addSection(link, title):
    """Download one essay and wrap it in an ``ez_epub.Section``.

    Links that start with ``http://`` or ``https://`` are fetched as-is and
    treated as plain text; every other link is resolved against
    ``http://www.paulgraham.com/`` and parsed as an HTML essay page.

    Extraction is best-effort: if the page cannot be turned into section text
    (for example the expected table or ``<font>`` tag is missing), the problem
    is reported on stdout and the section is returned with whatever text was
    collected so far. Errors raised by the download itself are not caught.

    Args:
        link: ``href`` value taken from the essay index.
        title: Title to store on the section (also printed as progress).

    Returns:
        The populated ``ez_epub.Section``.
    """
    # Test the scheme prefix rather than a bare substring so a relative link
    # that merely contains "http" is still resolved against the site root.
    is_external = link.startswith(('http://', 'https://'))
    url = link if is_external else 'http://www.paulgraham.com/' + link
    page = http.request('GET', url).data

    section = ez_epub.Section()
    try:
        section.title = title
        print(section.title)

        if is_external:
            # urllib3 hands back bytes; str(bytes) would yield the "b'...'"
            # repr with escaped newlines, so decode before splitting.
            text = page.decode('utf-8', 'replace') if isinstance(page, bytes) else page
            chunks = text.replace("\n", "<br />").split("<br /><br />")
        else:
            soup = BeautifulSoup(page, 'lxml')
            body = soup.findAll('table', {'width': '435'})[0]
            font = str(body.findAll('font')[0])

            # A very short first <font> tag, or one carrying these phrases, is
            # presumably a site banner rather than the essay body; in that
            # case fall back to the table's <p> tags.
            banners = ('Get funded by', 'Watch how this essay was', 'Like to build things?')
            if len(font) >= 100 and not any(marker in font for marker in banners):
                content = font
            else:
                content = ''.join(str(par) for par in body.findAll('p'))

            chunks = content.split("<br /><br />")
            # exception for Subject: Airbnb -- also pick up any <pre> blocks
            chunks.extend(str(pre) for pre in soup.findAll('pre'))

        for chunk in chunks:
            section.text.append(genshi.core.Markup(chunk))
    except Exception as exc:
        # Keep building the book, but say which essay was affected instead of
        # silently dropping its content.
        print('  could not extract %r (%s): %s' % (title, link, exc))

    return section

# Shared connection pool; addSection() reads this module-level name.
http = urllib3.PoolManager()

book = ez_epub.Book()
book.title = "Paul Graham's Essays"
book.authors = ['Paul Graham']

# Fetch the essay index and parse it. The previous soup.prettify() call was
# dropped: it only returns a formatted string, which was being thrown away.
page = http.request('GET', 'http://www.paulgraham.com/articles.html').data
soup = BeautifulSoup(page, 'lxml')

# The anchors of the second width-435 table are used as the essay list.
# href=True skips anchors without an href, which would otherwise raise
# KeyError on link['href'].
links = soup.find_all('table', {'width': '435'})[1].find_all('a', href=True)
sections = [addSection(link['href'], link.text) for link in links]

book.sections = sections
# The title is passed to make(); presumably it names the output -- see ez_epub.
book.make(book.title)
	# NOTE(review): from here to the end of the file is what appears to be a
	# second copy of the script that precedes it. Every line carries a single
	# leading tab and the original nesting is gone, so Python cannot parse this
	# part as written. It looks like a paste/extraction artifact -- TODO confirm
	# and remove.
	# -- coding: utf-8 --
	"""
	Builds epub book out of Paul Graham's essays: http://paulgraham.com/articles.html

	Author: Ola Sitarska <ola@sitarska.com>
	Copyright: Licensed under the GPL-3 (http://www.gnu.org/licenses/gpl-3.0.html)

	This script requires python-epub-library: http://code.google.com/p/python-epub-builder/
	"""

	import re, ez_epub, genshi
	import urllib3
	from bs4 import BeautifulSoup

	def addSection(link, title):
	if not 'http' in link:
	page = http.request('GET', 'http://www.paulgraham.com/'+link).data
	soup = BeautifulSoup(page, 'lxml')
	soup.prettify()
	else:
	page = http.request('GET', link).data

	section = ez_epub.Section()
	try:
	section.title = title
	print(section.title)

	if not 'http' in link:
	font = str(soup.findAll('table', {'width':'435'})[0].findAll('font')[0])
	if not 'Get funded by' in font and not 'Watch how this essay was' in font and not 'Like to build things?' in font and not len(font)<100:
	content = font
	else:
	content = ''
	for par in soup.findAll('table', {'width':'435'})[0].findAll('p'):
	content += str(par)

	for p in content.split("<br /><br />"):
	section.text.append(genshi.core.Markup(p))

	#exception for Subject: Airbnb
	for pre in soup.findAll('pre'):
	section.text.append(genshi.core.Markup(pre))
	else:
	for p in str(page).replace("\n","<br />").split("<br /><br />"):
	section.text.append(genshi.core.Markup(p))
	except:
	pass

	return section

	http = urllib3.PoolManager()

	book = ez_epub.Book()
	book.title = "Paul Graham's Essays"
	book.authors = ['Paul Graham']

	page = http.request('GET', 'http://www.paulgraham.com/articles.html').data
	soup = BeautifulSoup(page, 'lxml')
	soup.prettify()

	links = soup.find_all('table', {'width': '435'})[1].find_all('a')
	sections = []
	for link in links:
	sections.append(addSection(link['href'], link.text))

	book.sections = sections
	book.make(book.title)