Skip to content

Instantly share code, notes, and snippets.

@james-ingold
Forked from olasitarska/pgessays.py
Last active October 12, 2016 17:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save james-ingold/16c64f94e212f88068ab3d55c53831a7 to your computer and use it in GitHub Desktop.
Save james-ingold/16c64f94e212f88068ab3d55c53831a7 to your computer and use it in GitHub Desktop.
Builds epub book out of Paul Graham's essays.
# -*- coding: utf-8 -*-
"""
Builds epub book out of Paul Graham's essays: http://paulgraham.com/articles.html
Author: Ola Sitarska <ola@sitarska.com>
Copyright: Licensed under the GPL-3 (http://www.gnu.org/licenses/gpl-3.0.html)
This script requires python-epub-builder (provides ez_epub): http://code.google.com/p/python-epub-builder/
It also imports the third-party BeautifulSoup and genshi packages.
"""
import re, ez_epub, urllib2, genshi
from BeautifulSoup import BeautifulSoup
def addSection(link, title):
    """Download one essay and wrap it in an ``ez_epub.Section``.

    A link that does not contain ``'http'`` is treated as relative to
    http://www.paulgraham.com/ and parsed with BeautifulSoup; the essay
    body is taken from the first ``<table width="435">`` on the page.
    Any other link is fetched as-is and its raw text is split into
    paragraphs on blank lines.

    Args:
        link: ``href`` of the essay, either relative or absolute.
        title: Title stored on the section and printed as progress output.

    Returns:
        The ``ez_epub.Section``. Extraction is best-effort: if it fails,
        a warning is written to stderr and the section is returned with
        whatever text had been collected up to that point.

    Errors raised while downloading or while constructing the soup are not
    caught here and propagate to the caller.
    """
    import sys  # local import: only needed for the warning below

    is_local = 'http' not in link
    url = 'http://www.paulgraham.com/' + link if is_local else link

    # Close the HTTP response explicitly instead of leaving it to the GC.
    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()

    soup = BeautifulSoup(page) if is_local else None

    section = ez_epub.Section()
    try:
        section.title = title
        print(section.title)  # progress output; same result as the py2 statement

        if is_local:
            essay = soup.findAll('table', {'width': '435'})[0]
            font = str(essay.findAll('font')[0])

            # The first <font> element is only used as the essay body when it
            # is not one of the known promo blurbs and is at least 100
            # characters long; otherwise fall back to the table's <p> tags.
            promo_markers = (
                'Get funded by',
                'Watch how this essay was',
                'Like to build things?',
                'Want to start',
            )
            is_promo = any(marker in font for marker in promo_markers)
            if not is_promo and len(font) >= 100:
                content = font
            else:
                content = ''.join(str(par) for par in essay.findAll('p'))

            # NOTE(review): the original indentation was lost, so it is not
            # certain whether this stripping applied to both branches above;
            # it is applied to both here -- confirm.
            for font_tag in ('<font size="2" face="verdana">',
                             '</font>',
                             '<font color="#999999">',
                             '<font color="#000000">'):
                content = content.replace(font_tag, '')

            # A double line break separates paragraphs in the essay markup.
            for p in content.split("<br /><br />"):
                section.text.append(genshi.core.Markup(p))

            # Special case noted by the original author for the essay
            # "Subject: Airbnb": also include every <pre> block of the page.
            for pre in soup.findAll('pre'):
                section.text.append(genshi.core.Markup(pre))
        else:
            # Externally hosted text: treat blank lines as paragraph breaks.
            for p in str(page).replace("\n", "<br />").split("<br /><br />"):
                section.text.append(genshi.core.Markup(p))
    except Exception as exc:
        # Stay best-effort (one bad essay must not abort the whole book), but
        # report the failure instead of hiding it, and let KeyboardInterrupt
        # and SystemExit through.
        sys.stderr.write(
            "warning: could not fully extract %r: %s\n" % (link, exc))

    return section
# Build the book: one section per essay linked from the articles index page.
book = ez_epub.Book()
book.title = "Paul Graham's Essays"
book.authors = ['Paul Graham']

# Fetch the index and close the HTTP response explicitly once it is read.
response = urllib2.urlopen('http://www.paulgraham.com/articles.html')
try:
    page = response.read()
finally:
    response.close()
soup = BeautifulSoup(page)

# The essay links are taken from the second 435px-wide table on the page.
links = soup.findAll('table', {'width': '435'})[1].findAll('a')
sections = []
for link in links:
    href = link.get('href')
    if href is None:
        # An <a> without an href has nothing to download; skip it rather
        # than fail with a KeyError.
        continue
    if link.text.startswith('html'):
        # Links whose text starts with 'html' are excluded, as in the
        # original script (the reason is not visible in this file).
        continue
    sections.append(addSection(href, link.text))

book.sections = sections
# The book title is also passed to make() as its only argument.
book.make(book.title)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment