Skip to content

Instantly share code, notes, and snippets.

@vitorio
Last active August 29, 2015 14:03
Show Gist options
  • Save vitorio/8dc8853ee01ec651f263 to your computer and use it in GitHub Desktop.
Save vitorio/8dc8853ee01ec651f263 to your computer and use it in GitHub Desktop.
Turn a Distance ePub file into something more web-appropriate: replace the XHTML doctype with HTML5; inline zeitgeist.css, template.css, and some CSS from your.distance.cc to move the paragraph numbers; add IDs; protect the email address with JS; and resolve and embiggen the dsn.tc short URLs. Deeded to the public domain. To the extent possible …
# coding=utf-8
__author__ = 'vitorio'
import bs4
import argparse
import re
import requests
# Parse the command line: the only argument is the path of the file to convert.
parser = argparse.ArgumentParser(description='Turn a Distance ePub file into something more web-appropriate')
parser.add_argument('epubfile', help='The Distance ePub file to read from')
args = parser.parse_args()
# Read the document as bytes so Beautiful Soup detects the encoding itself
# (text mode would decode with the locale's encoding on Python 3), and close
# the file handle as soon as parsing is done.
# NOTE(review): no parser is named, so bs4 picks the best one installed;
# left as-is to avoid changing the generated markup.
with open(args.epubfile, 'rb') as epub_source:
    soup = bs4.BeautifulSoup(epub_source)
# Swap the first top-level doctype node for the HTML5 doctype
old_doctype = next(
    (node for node in soup.contents if isinstance(node, bs4.element.Doctype)),
    None,
)
if old_doctype is not None:
    old_doctype.replace_with(bs4.Doctype('html'))
# Update HTML tag: drop the xmlns attribute and declare the language
root_tag = soup.html
del root_tag['xmlns']
root_tag['lang'] = 'en'
# Add UTF-8 meta tag: a newline, then the <meta>, at the very start of <head>
headtag = soup.head
headtag.insert(0, '\n')
headtag.insert(1, soup.new_tag('meta', charset='utf-8'))
# Replace template.css so the file can stand alone: remove the first <link>
# element from the document (replacement styles are inlined further down)
external_stylesheet = soup.find('link')
external_stylesheet.decompose()
# Text of zeitgeist.css, held as a string so it can be placed inside a
# <style> element in the document head. The content is runtime data and is
# kept exactly as-is (including its line breaks).
zeitgeistcss = '''
/* zeitgeist.css */
article,aside,details,figcaption,figure,footer,header,hgroup,nav,section,summary{display:block;}audio,canvas,video{display:inline-block;*display:inline;*zoom:1;}audio:not([controls]){display:none;}[hidden]{display:none;}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;}html,button,input,select,textarea{font-family:sans-serif;color:#222;}body{width:48em;margin:0 auto;font-size:1em;line-height:1.4;}::-moz-selection{background:#b3d4fc;text-shadow:none;}::selection{background:#b3d4fc;text-shadow:none;}a{color:#00e;}a:visited{color:#551a8b;}a:hover{color:#06e;}a:focus{outline:thin dotted;}a:hover,a:active{outline:0;}body{font:1em/1.625em "Lucida Grande","Lucida Sans Unicode",sans-serif;font-size-adjust:none;font-style:normal;font-variant:normal;font-weight:normal;background-color:#FFFEF0;}h1,h2,h3,h4,h5,h6{font-weight:normal;color:#333;font-family:Georgia,serif;}h1{font-size:2.125em;margin-bottom:.765em;}h2{font-size:1.9em;margin-bottom:.855em;}h3{font-size:1.7em;margin-bottom:.956em;}h4{font-size:1.4em;margin-bottom:1.161em;}h5,h6{font-size:1.313em;margin-bottom:1.238em;}a{color:#005AF2;text-decoration:none;}a:hover{text-decoration:underline;}abbr,acronym{border-bottom:1px dotted #000;}address{margin-top:1.625em;font-style:italic;}b,strong{font-weight:bold;}blockquote{padding:1em 1em 1.625em 1em;font-family:Georgia,serif;font-style:italic;}blockquote:before{content:"\\201C";font-size:3em;margin-left:-0.625em;font-family:Georgia,serif;color:#aaa;line-height:0;}blockquote>p{padding:0;margin:0;}caption{text-align:center;font-family:Georgia,serif;}del{color:#000;}dfn,em{font-style:italic;}dfn{font-weight:bold;}dl{margin:0 0 1.625em 0;}dl dt{font-weight:bold;}dl dd{margin-left:1.625em;}hr{display:block;height:1px;border:0;border-top:1px solid #ccc;margin-bottom:1.625em;padding:0;}ins{background:#ff9;color:#000;text-decoration:none;}mark{background:#ff0;color:#000;font-style:italic;font-weight:bold;}p{padding:0 0 .8125em 
0;color:#111;font-weight:300;}p+p{text-indent:1.625em;}p.first:first-letter{float:left;font-family:Baskerville,"Palatino Linotype",serif;font-size:3em;font-weight:700;line-height:1em;margin-bottom:-0.2em;padding:.2em .1em 0 0;}p img{float:left;margin:.5em .8125em .8125em 0;padding:0;}p img.right{float:right;margin:.5em 0 .8125em .8125em;}pre,code,kbd,samp,tt{font-family:Monaco,"Lucida Mono","Liberation Mono","Courier New","Courier",monospace;_font-family:'courier new',monospace;font-size:1em;background:#eee;line-height:1.5;}pre,code{white-space:pre;white-space:pre-wrap;word-wrap:break-word;margin:1.625em 0;}q{quotes:none;}q:before,q:after{content:"";content:none;}small{font-size:85%;}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline;}sup{top:-0.5em;}sub{bottom:-0.25em;}tt{display:block;margin:1.625em 0;}ul,ol{list-style-position:outside;margin:0 0 1.625em 0;padding:0 0 0 40px;}li ul,li ol{margin:0 1.625em;}nav ul,nav ol{list-style:none;list-style-image:none;margin:0;padding:0;}img{border:0;vertical-align:middle;-ms-interpolation-mode:bicubic;}svg:not(:root){overflow:hidden;}figure{margin:0;}form{margin:0;}fieldset{border:0;margin:0;padding:0;}label{cursor:pointer;}legend{border:0;padding:0;white-space:normal;*margin-left:-7px;}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle;}button,input{line-height:normal;}button,input[type="button"],input[type="reset"],input[type="submit"]{cursor:pointer;-webkit-appearance:button;*overflow:visible;}button[disabled],input[disabled]{cursor:default;}input[type="checkbox"],input[type="radio"]{box-sizing:border-box;padding:0;*width:13px;*height:13px;}input[type="search"]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box;}input[type="search"]::-webkit-search-decoration,input[type="search"]::-webkit-search-cancel-button{-webkit-appearance:none;}button::-moz-focus-inner,input::-moz-focus-inner{border
:0;padding:0;}textarea{overflow:auto;vertical-align:top;resize:vertical;}table{border-collapse:collapse;border-spacing:0;margin-bottom:1.625em;}th{font-weight:bold;}tr,th,td{margin:0;padding:0 1.625em 0 1em;height:26px;}td{vertical-align:top;}tfoot{font-style:italic;}.chromeframe{margin:.2em 0;background:#ccc;color:#000;padding:.2em 0;}@media print{*{background:transparent!important;color:#000!important;box-shadow:none!important;text-shadow:none!important;}a,a:visited{text-decoration:underline;}a[href]:after{content:"(" attr(href) ")";}abbr[title]:after{content:"(" attr(title) ")";}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:"";}pre,blockquote{border:1px solid #999;page-break-inside:avoid;}thead{display:table-header-group;}tr,img{page-break-inside:avoid;}img{max-width:100%!important;}@page{margin:.5cm;}p,h2,h3{orphans:3;widows:3;}h2,h3{page-break-after:avoid;}}
'''
# Text of template.css, inlined into the page as a <style> element so the
# output no longer needs the external stylesheet. Runtime data: keep as-is.
templatecss = '''
/* template.css */
@page {
margin: 0.5em;
}
tbody, thead, tfoot, tr, td, th {
border-style: inherit;
border-width: inherit;
border-color: inherit;
}
.leftFloat {
float: left;
}
.rightFloat {
float: right;
}
.page-break {
page-break-before: always;
}
.pgh_no {
font-size: 0.5em;
}
.attribution {
text-align: right;
font-style: italic;
}
h1, h2, h3.byline {
text-align: center;
}
'''
# Extra rule for the .pgh_no paragraph-number spans (labelled as coming from
# your.distance.cc): floats them right with a negative right margin.
# Runtime data: keep as-is.
paragraphcss = '''
/* your.distance.cc */
.pgh_no {
display: inline;
float: right;
font-size: 0.5em;
margin-top: 1em;
text-decoration: none;
color: #613418;
/* margin-right: -2.25em; */
margin-right: -3em;
}
'''
# Append each stylesheet to <head> as its own <style> element, each one
# followed by two newlines
for stylesheet_text in (zeitgeistcss, templatecss, paragraphcss):
    style_tag = soup.new_tag('style')
    style_tag.string = stylesheet_text
    headtag.append(style_tag)
    headtag.append('\n\n')
# Rewrite the paragraph spans: strip the square brackets from the number,
# show it with a pilcrow, and give the enclosing element the id "p<number>"
for number_span in soup.find_all('span', class_='pgh_no'):
    paragraph_number = int(number_span.string.strip('[]'))
    number_span.string = u'¶ %d' % paragraph_number
    number_span.parent['id'] = 'p%d' % paragraph_number
# Rewrite the header tags: headings of every level share one counter and
# get the ids "h1", "h2", ... in document order
heading_names = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
for position, heading in enumerate(soup.find_all(heading_names), 1):
    heading['id'] = 'h%d' % position
# Replace the byline email address with JavaScript
byline = soup.find('h3', attrs={'class': 'byline'})
emaila = byline.find('a', href=re.compile('mailto:'))
# The address is taken out of the static markup: its two halves are stored
# separately in the script, which joins them and fills the link back in.
address_parts = emaila.string.split('@')
user_part = address_parts[0]
domain_part = address_parts[1]
emailjs = '''
var DISTANCE = DISTANCE || {};
DISTANCE.email = '%s';
DISTANCE.email += '@';
DISTANCE.email += '%s';
DISTANCE.bylineemail = document.getElementById('bylineemail');
DISTANCE.bylineemail.setAttribute('href', 'mailto:' + DISTANCE.email);
DISTANCE.bylineemail.innerHTML = DISTANCE.email;
''' % (user_part, domain_part)
# Leave an empty, id-tagged anchor for the script to populate
emaila['id'] = 'bylineemail'
del emaila['href']
emaila.string = ''
# The script goes at the end of <body>, followed by a newline
script_tag = soup.new_tag('script')
script_tag.string = emailjs
page_body = soup.find('body')
page_body.append(script_tag)
page_body.append('\n')
# Resolve and embiggen the dsn.tc short URLs
# Raw string so "\." is a regex escape rather than an invalid string escape.
dsntc_pattern = re.compile(r'http://dsn\.tc/')
# Cache of short URL -> redirect target, so each short URL is fetched once
# even when several anchors point at it.
resolved_urls = {}
for dsntc in soup.find_all('a', href=dsntc_pattern):
    short_url = dsntc['href']
    if short_url not in resolved_urls:
        # Only the redirect target is wanted, so the redirect is not followed;
        # the timeout keeps one unresponsive request from hanging the run.
        r = requests.get(short_url, allow_redirects=False, timeout=30)
        resolved_urls[short_url] = r.headers.get('location')
    long_url = resolved_urls[short_url]
    if long_url is None:
        # No Location header came back: leave the short link untouched
        # rather than failing with a KeyError.
        continue
    # Keep the original short URL on the anchor, then show and link to the
    # resolved address.
    dsntc['data-dsntc'] = short_url
    dsntc.string = long_url
    dsntc['href'] = long_url
# Write the converted document next to the input, as "<input>.py.html".
# The file is opened in binary mode, so write explicitly encoded UTF-8 bytes:
# str(soup) is text on Python 3 and cannot be written to a binary file
# (on Python 2 this produces the same bytes str(soup) did).
# The handle is not called `file`, which would shadow the Python 2 builtin.
with open(args.epubfile + '.py.html', 'wb') as output:
    output.write(soup.encode('utf-8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment