@goc9000
Forked from olasitarska/pgessays.py
Last active February 25, 2022 22:49
# -*- coding: utf-8 -*-
"""
Builds an epub book out of Paul Graham's essays: http://paulgraham.com/articles.html
Original script: Ola Sitarska <ola@sitarska.com>
Improved version: Cristian Dinu <goc9000@gmail.com>
This script requires python-epub-library: http://code.google.com/p/python-epub-builder/
The checking facility requires epubcheck: http://code.google.com/p/epubcheck/
Embedding the 'Roots of Lisp' paper requires the programs ps2pdf and pdftoppm
to be installed.
"""
import os, base64, hashlib, imghdr, re, urllib2, genshi, shutil, epub, subprocess
from subprocess import Popen, PIPE
from genshi.template import MarkupTemplate
from BeautifulSoup import BeautifulSoup, Comment, Tag
ROOT_URL = 'http://www.paulgraham.com/'
BOOK_TITLE = "Paul Graham's Essays"
OUTPUT_FILE = BOOK_TITLE + '.epub'
OMIT_TRANSLATIONS = True
REMOVE_DEPRECATED_LINKS = True
INCLUDE_COMMENTS = True
INCLUDE_LINKS = True
INCLUDE_APPENDICES = True
INCLUDE_IMAGE_APPENDICES = True
INCLUDE_ROOTS_OF_LISP = False
CHECK_EPUB = True
KEEP_OUTPUT_DIR = False
# These articles will never be downloaded as appendices (usually because they
# are ads, downloads, or extensive theory pages)
FORCE_EXTERNAL_ARTICLES = [
    'hackpaint.html', 'piraha.html', 'arc.html', 'onlisp.html', 'acl.html',
    'onlisptext.html', 'filters.html', 'bbf.html', 'accgensub.html'
]

# These articles represent images, a separate category of appendices that may
# be treated differently
IMAGE_APPENDICES = [
    '04magnum.html', '1974-911s.html', '59eldorado.html', '75eldorado.html',
    'amcars.html', 'americangothic.html', 'baptism.html', 'bluebox.html',
    'creationofadam.html', 'denver.html', 'designedforwindows.html',
    'garage.html', 'ginevra.html', 'guggen.html', 'hunters.html', 'isetta.html',
    'largilliere-chardin.html', 'leonardo.html', 'matador.html',
    'montefeltro.html', 'nerdad.html', 'pantheon.html', 'pierced.html',
    'pilate.html', 'porsche695.html', 'sr71.html', 'symptg.html', 'tlbmac.html',
    'vwfront.html', 'womb.html', 'zero.html'
]

# Text for images representing titles (only the main title has an ALT attribute).
# So far these are needed only for one article.
TITLE_IMAGES = {
    'paulgraham_2202_12135763': 'Guiding Philosophy',
    'paulgraham_2202_12136436': 'Open Problems',
    'paulgraham_2202_12137035': 'Little-Known Secrets',
    'paulgraham_2202_12137782': 'Ideas Whose Time Has Returned',
    'paulgraham_2202_12138764': 'Pitfalls and Gotchas',
}

# These allow for the recognition of banners appearing right under the title
BANNER_ADS = ['Want to start a startup?', 'Watch how this essay was',
              'Like to build things?', 'The Suit is Back']

# Sections that contain these strings are ads and will be discarded
SECTION_ADS = ["There can't be more than a couple thousand",
               "If you liked this, you may also like Hackers & Painters",
               "You'll find this essay and 14 others in Hackers & Painters"]

# Comments that contain any of these strings are ads and will be discarded
COMMENT_ADS = ['Leave a tip', 'Winter Founders Program',
               'If you liked this', 'redditino.png']

SECTION_TEMPLATE = MarkupTemplate("""
<html xmlns="http://www.w3.org/1999/xhtml"
      xmlns:py="http://genshi.edgewall.org/">
  <head>
    <title>${title}</title>
    <style type="text/css">
      body { font-family: sans-serif; }
      h1, h2 { font-variant: small-caps; color: #800000; }
      blockquote { font-style: italic; }
      a._local_link { background-color: #e0e0e0; }
      a._external_link { }
      img._embedded_page { border: 1px solid gray; }
      ${css}
    </style>
  </head>
  <body>
    ${text}
  </body>
</html>
""")

# This keeps track of which articles are the main essays; it is initialized
# automatically later on.
MAIN_ARTICLES = []

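# Accumulates everything gathered while building the book: rendered article
# XHTML keyed by link, downloaded images keyed by MD5 digest, links that have
# been seen but not yet downloaded ("unresolved"), and the three tables of
# contents (main essays, appendices, images).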
class BookData:
    articles = None
    images = None
    unresolved = None
    toc = None
    main_articles = None

    def __init__(self):
        self.articles = {}
        self.images = {}
        self.unresolved = set()
        self.main_toc = []
        self.appendix_toc = []
        self.image_toc = []
        self.main_articles = set()

def readFile(filename):
    with open(filename, "rb") as f:
        return f.read()

def writeFile(filename, data):
    with open(filename, "wb") as f:
        f.write(data)

def htmlEntities(text):
    return text.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')

def isExternalUrl(url):
    return re.match("\\w+:", url) is not None

def cachedPageFilename(url):
    hsh = base64.b64encode(url, "()").replace('=','_')
    return "cache/{0}".format(hsh)

def getPage(url):
    if not os.path.exists("cache"):
        os.mkdir("cache")
    filename = cachedPageFilename(url)
    if os.path.isfile(filename):
        print "Retrieving {0} from cache".format(url)
        return readFile(filename)
    print "Downloading: {0}...".format(url)
    page = urllib2.urlopen(url).read()
    writeFile(filename, page)
    return page

def extractBody(page):
    return re.search("<body\\b[^>]*>.*?</body\\b[^>]*>", page, re.DOTALL).group(0)

def fixWeirdTags(page):
    page = re.sub("<(xa|ax|nota)\\s+", '<a class="_deprecated_link" ', page)
    page = re.sub("<ximg\\s+[^>]*>", '', page) # delete deprecated images
    return page

def fixXmpTags(page):
    def _convertXmp(match):
        return '<pre>' + htmlEntities(match.group(1)) + '</pre>'
    return re.sub("<xmp\\b[^>]*>(.*?)</xmp>", _convertXmp, page, 0, re.DOTALL | re.I)

def doAdhocFixes(page):
    def _adhocFix1(match):
        text = re.sub("<br><br>\\d+[.] ", '</li><li>', match.group(1))
        return '<ol><li>' + text + '</li></ol>'
    page = re.sub("<ol>\\s*1. (Catalogs are so expensive.*?)</ol>", _adhocFix1, page, 0, re.S)
    page = re.sub(" alt=\"Click to enlarge\"", '', page)

    CREDIT = 'Image: Casey Muller: Trevor Blackwell at Rehearsal Day, summer 2006'
    if CREDIT in page:
        pat = 'width=410 height=144 border=0 hspace=0 vspace=0></a>'
        pos1 = page.find(pat) + len(pat)
        pos = page.find(CREDIT)
        pos2 = page.rfind('<table', 0, pos)
        pos3 = len('</table>') + page.find('</table>', pos)
        credit_html = '<br><span style="font-size: 75%">' + CREDIT + "</span><br>"
        page = page[:pos1] + credit_html + page[pos1:pos2] + page[pos3:]

    if 'alt="Lisp for Web-Based Applications"' in page:
        text = getPage('http://lib.store.yahoo.net/lib/paulgraham/bbnexcerpts.txt')
        pat = 'BBN Labs in Cambridge, MA.<br><br></font>'
        pos = page.find(pat) + len(pat)
        bbn_html = "<pre>" + htmlEntities(text) + "</pre>"
        page = page[:pos] + bbn_html + page[pos:]

    return page

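# Banners ("Want to start a startup?" etc.) appear in a small table right
# under the title; when one is recognized, everything from the banner up to
# the "Month Year" date line that opens the essay body is dropped.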
def removeBanners(page):
    idx1 = page.find('<font size=2 face="verdana"><table width=100%')
    if idx1 != -1:
        idx2 = page.find("</table>", idx1)
        if idx2 != -1:
            idx2 += len("</table>")
            is_ad = any((ad in page[idx1:idx2] for ad in BANNER_ADS))
            if is_ad:
                pat = re.compile("(?P<ad>(<p>|<br><br>)\\s*(<[!]--.*?-->)?\\s*)\\w+\\s+\\d{4}", re.DOTALL)
                m = pat.search(page, idx2)
                if m is not None:
                    page = page[:idx1] + page[m.end('ad'):]
    return page

def convertParagraphs(page):
    return re.sub("<p(\\s+[^>]*)?>", '<br/><br/>', page)

def extractTitle(page):
    return re.search('<title>([^<]*)</title>', page).group(1).strip()

def guessTitle(text):
    if text.startswith('(This is the first chapter of ANSI Common Lisp'):
        return 'Chapter 1 of Ansi Common Lisp'
    if text.startswith('(This is Chapter 2 of ANSI Common Lisp'):
        return 'Chapter 2 of Ansi Common Lisp'
    print '### ERROR: Cannot guess the title for this text: ###'
    print text[:400],'[...]'
    print '###'
    raise RuntimeError("Please modify the program accordingly")

def extractComments(page):
    def _collectComment(match, state):
        if not INCLUDE_COMMENTS:
            return ''
        text = match.group(1)
        if any(ad in text for ad in COMMENT_ADS):
            return ''
        pos = text.find('name="')
        if pos != -1:
            pos += len('name="')
            text = text[:pos] + 'deleted_' + text[pos:]
        state['comments'].append(text)
        return '<sup><a href="#_comment{0}">({0})</a></sup>'.format(len(state['comments']))

    pat_comments = re.compile("<!--(.*?)-->", re.DOTALL)
    state = dict()
    state['comments'] = []
    page = re.sub(pat_comments, lambda match: _collectComment(match, state), page)
    if len(state['comments']) > 0:
        # Insert comments at the end of body
        pos = page.find("</body")
        comments_html = ''.join('<br/><br /><a name="_comment{0}">({0})</a> {1}'.format(idx+1, comm) for idx, comm in enumerate(state['comments']))
        comments_div = '<div id="__comments"><br /><b>Comments and Edits</b>{0}</div>'.format(comments_html)
        page = page[:pos] + comments_div + page[pos:]
    return page

def preprocessPage(page):
    page = page.encode('ascii', 'xmlcharrefreplace')
    page = extractBody(page)
    page = fixWeirdTags(page)
    page = fixXmpTags(page)
    page = doAdhocFixes(page)
    page = removeBanners(page)
    page = convertParagraphs(page)
    page = extractComments(page)
    return page

def findTitleImage(soup):
    title_img = soup.find('img', { 'alt': lambda alt: alt is not None })
    if title_img is None:
        raise RuntimeError("Title img not found")
    return title_img

def isLinksSection(table):
    if table.find('a') is None or table.find('img') is None:
        return False
    for link in table.findAll('a'):
        font = link.parent
        if font.name != 'font' or font.get('size') != '2' or font.get('face') != 'verdana':
            return False
    for img in table.findAll('img'):
        if not (img['src'].endswith('trans_1x1.gif') or img['src'].startswith('http://ep.yimg.com/ca/I/paulgraham_')):
            return False
        w = 0 if img.get('width') is None else int(img['width'])
        h = 0 if img.get('height') is None else int(img['height'])
        if w > 20 or h > 20:
            return False
    return True

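# The table of translations / related links at the bottom of an essay is
# replaced by a plain <ul> of links (translations are skipped when
# OMIT_TRANSLATIONS is set).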
def rewriteLinksSection(dom, soup, links_table):
    links = []
    for fnt in links_table.findAll('font', {'size': '2', 'face':'verdana'}):
        if str(fnt).startswith('<font size="2" face="verdana"><a href="'):
            link = fnt.find('a')
            caption = link.getText('').strip()
            if caption.endswith(' Translation') and OMIT_TRANSLATIONS:
                continue
            links.append((link['href'], caption))
    links_table.decompose()

    if not INCLUDE_LINKS or len(links) == 0:
        return

    b = Tag(soup, 'b')
    b.string = 'Links'
    dom.append(b)

    ul = Tag(soup, 'ul')
    for url, caption in links:
        li = Tag(soup, 'li')
        a = Tag(soup, 'a', {'href': url})
        a.string = caption
        li.append(a)
        ul.append(li)
    dom.append(ul)

def isAdSection(table):
    text = table.getText(' ')
    if any(ad in text for ad in SECTION_ADS):
        return True
    return False

def isDisqusSection(table):
    return table.find('div', { 'id' : 'disqus_thread' }) is not None

def isEndSection(table):
    return table.find('hr') is not None and table.getText('').strip() == ''

def appendCustomSection(dom, soup, table):
    for tr in table.contents:
        for td in tr.contents:
            if td.get('width') is not None and int(td['width']) < 10:
                continue
            for img in td.findAll('img'):
                if img['src'].endswith('trans_1x1.gif'):
                    img.decompose()
            if len(td.contents) == 0:
                continue
            for item in td.contents:
                dom.append(item)
    table.decompose()

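# Embedding "The Roots of Lisp" works by downloading the paper itself
# (jmc.ps), converting it to PDF with ps2pdf, rendering its 13 pages to
# cropped PNGs with pdftoppm (the crop box is scaled from a 600px-wide
# baseline), and appending the page images to the article.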
def embedRootsOfLispArticle(dom, soup):
    def _checkInstalled(name, cmdline, expected):
        try:
            out, err = Popen(cmdline, shell=False, stdout=PIPE, stderr=PIPE).communicate()
            out = (out + err).strip()
            if not out.startswith(expected):
                raise RuntimeError()
        except:
            raise RuntimeError(name + " does not appear to be installed")

    TEMP_DIR = 'temp_rootsoflisp'
    WIDTH = 800
    HEIGHT = 940*WIDTH / 600
    X = 176*WIDTH / 600
    Y = 170*WIDTH / 600
    DPI = 112*WIDTH / 600

    try:
        if not os.path.isdir(TEMP_DIR):
            os.mkdir(TEMP_DIR)
        data = getPage('http://lib.store.yahoo.net/lib/paulgraham/jmc.ps')
        ps_filename = os.path.join(TEMP_DIR, 'jmc.ps')
        writeFile(ps_filename, data)

        print "Checking all required programs are installed..."
        _checkInstalled('ps2pdf', ['ps2pdf'], 'Usage: ps2pdf')
        _checkInstalled('pdftoppm', ['pdftoppm', '-h'], 'pdftoppm version ')

        print "Converting to PDF..."
        pdf_filename = os.path.join(TEMP_DIR, 'jmc.pdf')
        subprocess.call(['ps2pdf', ps_filename, pdf_filename])

        print "Extracting page images..."
        page_filename = os.path.join(TEMP_DIR, 'jmc_page')
        subprocess.call(['pdftoppm', '-q', '-png', '-r', str(DPI),
                         '-x', str(X), '-y', str(Y),
                         '-W', str(WIDTH), '-H', str(HEIGHT),
                         pdf_filename, page_filename])
        for i in xrange(1, 14):
            src = page_filename + '-{0:02d}.png'.format(i)
            dest = cachedPageFilename('jmc_paper/page{0}.png'.format(i))
            shutil.copyfile(src, dest)
        shutil.rmtree(TEMP_DIR, True)

        # Add embedded pages to the DOM
        center = Tag(soup, 'center')
        for i in xrange(1, 14):
            center.append(Tag(soup, 'br'))
            img = Tag(soup, 'img', { 'src': 'jmc_paper/page{0}.png'.format(i),
                                     'width': str(WIDTH), 'height': str(HEIGHT),
                                     'class': '_embedded_page' })
            center.append(img)
            center.append(Tag(soup, 'br'))
        dom.append(center)
    except RuntimeError as e:
        shutil.rmtree(TEMP_DIR, True)
        raise RuntimeError("Cannot embed 'Roots of Lisp': {0}".format(e))

def extractMainContent(soup):
    title_img = findTitleImage(soup)
    title = title_img['alt'].strip()
    main_td = title_img.parent

    if INCLUDE_ROOTS_OF_LISP and title == 'The Roots of Lisp':
        embedRootsOfLispArticle(main_td, soup)

    main_table = main_td.parent.parent
    while True:
        section = main_table.nextSibling
        if section is None:
            break
        if section.name == 'br':
            main_td.append(section)
        elif section.name != 'table':
            raise RuntimeError("Expected <br> or <table> in main <td>!")
        elif isLinksSection(section):
            rewriteLinksSection(main_td, soup, section)
        elif isEndSection(section) or isAdSection(section) or isDisqusSection(section):
            section.decompose()
        else:
            appendCustomSection(main_td, soup, section)
    return main_td.extract()

def retrieveComments(dom, soup):
    comments = soup.find('div', {'id':'__comments'})
    if comments is not None:
        while len(comments.contents) > 0:
            item = comments.contents[0].extract()
            dom.append(item)

def replaceImageWithHeading(img, tag, title, soup):
    hdg = Tag(soup, tag)
    hdg.string = title
    img.replaceWith(hdg)

    # Delete the <br>s that follow, up to a maximum of 2
    for _ in xrange(0,2):
        for sib in hdg.nextSiblingGenerator():
            if isinstance(sib, Tag):
                if sib.name != 'br':
                    return
                sib.decompose()
                break
            else:
                if str(sib).strip() != '':
                    return

def replaceTitleImages(dom, soup):
    img = findTitleImage(dom)
    replaceImageWithHeading(img, 'h1', img['alt'], soup)

    for img in dom.findAll('img'):
        _, filename = os.path.split(img['src'])
        if filename in TITLE_IMAGES:
            replaceImageWithHeading(img, 'h2', TITLE_IMAGES[filename], soup)

def removeBottomAds(dom):
    for table in dom.findAll('table'):
        tbl_text = table.getText('')
        if "You'll find this essay and 14 others" in tbl_text:
            while type(table.nextSibling)==type(table) and table.nextSibling.name == 'br':
                table.nextSibling.decompose()
            table.decompose()

def removeScripts(dom):
    for script in dom.findAll('script'):
        script.decompose()

def fixEntities(dom):
    for text_elem in dom.findAll(text=lambda text: not isinstance(text, Comment)):
        text = str(text_elem)
        text = re.sub("&(?!(\\w\\w|#))", '&amp;', text)
        text = re.sub("&(\\w);", "&amp;\\1", text)
        text = text.replace('<', '&lt;').replace('>','&gt;')
        text_elem.replaceWith(text)

def addStyle(tag, style):
    if style == '':
        return
    sty = tag.get('style').strip() if tag.get('style') is not None else ''
    if sty != '' and not sty.endswith(';'):
        sty += ';'
    if not style.strip().endswith(';'):
        style += ';'
    tag['style'] = sty + style

def addClass(tag, cls):
    cl = tag.get('class').strip() if tag.get('class') is not None else ''
    tag['class'] = cl + ' ' + cls

def attrToCss(tag, attr, css=None):
    curr_val = tag.get(attr)
    if curr_val is None:
        return
    if css is None:
        css = attr + ':{0}'
    addStyle(tag, css.format(curr_val))
    del tag[attr]

def convertFontTags(dom):
    for font in dom.findAll('font'):
        attrToCss(font, 'color')
        del font['face'] # face changes are ignored
        del font['size'] # size changes are ignored
        if font.get('style') is not None:
            font.name = 'span'
        else:
            font.replaceWithChildren()

def convertStrikethrough(dom):
    for st in dom.findAll('s'):
        st.name = 'span'
        addStyle(st, 'text-decoration: line-through')

def stripRootUrl(url):
    if url.startswith(ROOT_URL):
        return url[len(ROOT_URL):]
    if url.startswith(ROOT_URL.replace('http://www.','http://')):
        return url[len(ROOT_URL)-4:]
    return url

def mustExternalize(link):
    if link in FORCE_EXTERNAL_ARTICLES:
        return True
    if link in MAIN_ARTICLES:
        return False
    if not INCLUDE_APPENDICES:
        return True
    if not INCLUDE_IMAGE_APPENDICES and link in IMAGE_APPENDICES:
        return True
    return False

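# Rewrite a single href: links into paulgraham.com are made relative, links
# that must stay external (or are excluded by the INCLUDE_* flags) are turned
# back into absolute URLs, and any remaining local link that has no article
# yet is queued in bookData.unresolved for a later download pass.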
def fixReference(url, bookData):
    link, sep, fragment = url.partition('#')
    if link != '':
        link = stripRootUrl(link)
        if not link.startswith(ROOT_URL) and mustExternalize(link):
            link = ROOT_URL + link
        if not isExternalUrl(link):
            if link not in bookData.articles:
                bookData.unresolved.add(link)
    return link + sep + fragment

def fixAnchors(dom, bookData):
    for link in dom.findAll('a'):
        if REMOVE_DEPRECATED_LINKS:
            if link.get('class') == '_deprecated_link':
                link.replaceWithChildren()
                continue
        if link.get('name') is not None:
            link['id'] = link['name']
            del link['name']
        if link.get('hef') is not None:
            # 'hef' (sic) appears instead of 'href' in some of the source pages
            if link.get('name') is None:
                link['href'] = link['hef']
            del link['hef']
        url = link.get('href')
        if url is not None:
            link['href'] = fixReference(url, bookData)
            addClass(link, '_external_link' if isExternalUrl(link['href']) else '_local_link')

def fixTableStyles(dom):
    for t in dom.findAll(['table','tr','td']):
        attrToCss(t, 'width')
        attrToCss(t, 'bgcolor', 'background-color:{0}')
    for cent in dom.findAll('center'):
        for tbl in cent.findAll('table'):
            addStyle(tbl, 'margin: auto')

def fixBrAndHrStyles(dom):
    for br in dom.findAll('br'):
        del br['clear']
    for hr in dom.findAll('hr'):
        del hr['color']
        del hr['height']

def fixImageStyle(img):
    if img.get('alt') is None:
        img['alt'] = ''
    attrToCss(img, 'align', 'float:{0}')
    attrToCss(img, 'border')
    attrToCss(img, 'hspace', 'margin-left:{0};margin-right:{0}')
    attrToCss(img, 'vspace', 'margin-top:{0};margin-bottom:{0}')

def resolveImages(dom, bookData):
    for img in dom.findAll('img'):
        data = getPage(img['src'])
        md5 = hashlib.md5(data).digest()
        if md5 in bookData.images:
            img['src'] = bookData.images[md5][1]
        else:
            old_path = cachedPageFilename(img['src'])
            new_path = 'img{0}.{1}'.format(len(bookData.images)+1, imghdr.what(old_path))
            bookData.images[md5] = (old_path, new_path)
            img['src'] = new_path
        fixImageStyle(img)

def processDom(soup, bookData):
    main_td = extractMainContent(soup)
    retrieveComments(main_td, soup)
    replaceTitleImages(main_td, soup)
    removeBottomAds(main_td)
    removeScripts(main_td)
    convertFontTags(main_td)
    convertStrikethrough(main_td)
    fixEntities(main_td)
    fixAnchors(main_td, bookData)
    fixTableStyles(main_td)
    fixBrAndHrStyles(main_td)
    resolveImages(main_td, bookData)
    return main_td

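# The passes below turn the flat <br/><br/> markup produced by
# convertParagraphs into well-formed paragraphs: block-level tags close the
# current <p> and open a new one, <center> becomes a centered paragraph, and
# any empty paragraphs left over are removed.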
def fixBlockquotes(page):
    page = re.sub('(</?blockquote[^>]*>)', "</p>\\1<p>", page)
    return page

def fixCenterTags(page):
    # Compensate for the new line breaks we will introduce
    page = re.sub("</center>\\s*<br />", '</center>', page)
    page = re.sub("<br />\\s*</center>", '</center>', page)
    page = re.sub("<br />\\s*<center>", '<center>', page)
    page = re.sub("<center>\\s*<br />", '<center>', page)
    page = re.sub('</center><center[^>]*>', '<br />', page)

    # Replace CENTER tags proper
    page = re.sub('(<center[^>]*>)', '</p><p style="text-align:center">', page)
    page = re.sub('(</center[^>]*>)', '</p><p>', page)
    return page

def fixBlockTags(page):
    page = re.sub('(<(hr)\\b[^>]*>)', "</p>\\1<p>", page)
    page = re.sub('(<(pre|ol|ul|table|h\\d)\\b)', "</p>\\1", page)
    page = re.sub('(</(pre|ol|ul|table|h\\d)\\b[^>]*>)', "\\1<p>", page)
    return page

def applyFinalCorrections(page):
    page = re.sub('(<(td|li)\\b[^>]*>[^<]*)</p>', "\\1", page)
    page = re.sub('<p>([^<]*</(td|li)\\b)', "\\1", page)
    page = re.sub("<p>\\s*</p>", '', page)
    return page

def addCoda(page):
    return re.sub('(\\s*<br />)*</p>$', '<br /><br /><br /><br /></p><hr />', page)

def postprocessPage(page):
    page = fixBlockquotes(page)
    page = fixCenterTags(page)
    page = fixBlockTags(page)
    page = applyFinalCorrections(page)
    page = addCoda(page)
    return page

def articleFilename(link):
    return link if not isExternalUrl(link) else os.path.split(link)[1]

def renderSection(title, css, content):
    stream = SECTION_TEMPLATE.generate(title=title, css=css, text=genshi.core.Markup(content))
    return stream.render('xhtml', doctype='xhtml11', drop_xml_decl=False, strip_whitespace=False)

def loadArticle(bookData, link):
    url = link if isExternalUrl(link) else ROOT_URL + link
    page = getPage(url).decode('iso-8859-1')

    if '.html' in link:
        title = extractTitle(page)
        page = preprocessPage(page)
        soup = BeautifulSoup(page)
        dom = processDom(soup, bookData)
        content = '<p>{0}</p>'.format(''.join(str(item) for item in dom.contents))
        content = postprocessPage(content)
    else:
        title = guessTitle(page)
        content = '<pre>{0}</pre>'.format(htmlEntities(page))

    bookData.articles[link] = renderSection(title, '', content)
    bookData.unresolved.discard(link)
    return title

def getEssayLinks():
    page = getPage(ROOT_URL + 'articles.html')
    soup = BeautifulSoup(page)
    return [link['href'] for link in soup.findAll('table', {'width': '455'})[1].findAll('a')]

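# Essays are processed in the order they appear on articles.html; every local
# link they reference is then pulled in as an appendix (or image appendix)
# until the unresolved set is drained.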
def getBookData():
    bookData = BookData()

    print "Processing essays..."
    links = getEssayLinks()
    MAIN_ARTICLES.extend(links)
    for link in links:
        title = loadArticle(bookData, link)
        bookData.main_toc.append((link, title))

    if INCLUDE_APPENDICES:
        print "Processing Appendices..."
        while len(bookData.unresolved) > 0:
            link = bookData.unresolved.pop()
            title = loadArticle(bookData, link)
            if link in IMAGE_APPENDICES:
                bookData.image_toc.append((link, title))
            else:
                bookData.appendix_toc.append((link, title))
        bookData.appendix_toc.sort(key=lambda pair: pair[1])
        bookData.image_toc.sort(key=lambda pair: pair[1])

    return bookData

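# Assemble the epub: the main essays first, then an "Appendices" and an
# "Images" section (each introduced by its own heading page), and finally the
# deduplicated image files.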
def makeBook(bookData, outputFile):
    book = epub.EpubBook()
    book.setTitle(BOOK_TITLE)
    book.setLang('en-US')
    book.addCreator('Paul Graham')
    book.addTitlePage()
    book.addTocPage()

    for link, title in bookData.main_toc:
        item = book.addHtml('', articleFilename(link), bookData.articles[link])
        book.addSpineItem(item)
        book.addTocMapNode(item.destPath, title, 1)

    for fname, heading, toc in [('_appendices.html', 'Appendices', bookData.appendix_toc),
                                ('_images.html', 'Images', bookData.image_toc)]:
        first = True
        for link, title in toc:
            if first:
                item = book.addHtml('', fname, renderSection(heading, '', '<h1>'+heading+'</h1>'))
                book.addSpineItem(item)
                book.addTocMapNode(item.destPath, heading, 1)
                first = False
            item = book.addHtml('', articleFilename(link), bookData.articles[link])
            book.addSpineItem(item)
            book.addTocMapNode(item.destPath, title, 2)

    for old_path, new_path in bookData.images.values():
        book.addImage(old_path, new_path)

    outputDir = outputFile + "_files.d"
    if os.path.isdir(outputDir):
        shutil.rmtree(outputDir)
    book.createBook(outputDir)
    book.createArchive(outputDir, outputFile)
    if not KEEP_OUTPUT_DIR:
        shutil.rmtree(outputDir)

def checkEPub(outputFile):
    checkers = sorted([f for f in os.listdir('.') if re.match('epubcheck.*[.]jar', f)])
    if len(checkers) == 0:
        print "No epubcheck-*.jar found, cannot check book!"
        return
    jar = checkers[-1]
    subprocess.call(['java', '-jar', jar, outputFile], shell=False)

def main():
    bookData = getBookData()
    makeBook(bookData, OUTPUT_FILE)
    if CHECK_EPUB:
        checkEPub(OUTPUT_FILE)

main()

@timrogers

Thanks @goc9000! Just for everyone's reference - I had to run this a couple of times as it kept failing with 502 errors from www.paulgraham.com :( Fortunately, it cached the pages loaded so far, so it wasn't a problem.

@philippludwig

Traceback (most recent call last):
  File "pgessays.py", line 830, in <module>
    main()
  File "pgessays.py", line 823, in main
    bookData = getBookData()
  File "pgessays.py", line 752, in getBookData
    links = getEssayLinks()
  File "pgessays.py", line 745, in getEssayLinks
    return [link['href'] for link in soup.findAll('table', {'width': '455'})[1].findAll('a')]
IndexError: list index out of range

Too bad that this does not work anymore.
