Skip to content

Instantly share code, notes, and snippets.

@mikezhuyuan
Created July 20, 2012 07:33
Show Gist options
  • Save mikezhuyuan/3149304 to your computer and use it in GitHub Desktop.
Save mikezhuyuan/3149304 to your computer and use it in GitHub Desktop.
convert html to pdf
import re, urllib2, subprocess, zipfile, os, shutil
# Prerequisite: wkhtmltopdf must be installed first; tool_path is the
# executable that pdf() launches to do the conversion.
tool_path = r'C:/Program Files/wkhtmltopdf/wkhtmltopdf.exe'
# File that is scanned for both the archive URLs (extract_urls) and the
# book titles (extract_titles).
data_source = r'D:\TFS\ETTfsApp\EnglishTown\Team_ELab\Development\Source\ELab\ELab\EFSchools.Englishtown.ELab.UI\_scripts\elab-ui\library\binddataelibresults.js'
def process(urls):
    """Download, unpack and convert every archive in *urls* to a PDF.

    For a URL whose last path segment is ``<name>.<ext>``, the archive is
    saved under that file name in the current directory, extracted, and
    converted to ``<name>.pdf``. ``<name>`` is the part of the file name
    before its first dot.

    A failure on one URL is reported on stdout and does not stop the
    remaining URLs from being processed.
    """
    for url in urls:
        print('processing %s' % url)
        try:
            fullname = url.split('/')[-1]
            name = fullname.split('.')[0]
            pdfname = name + '.pdf'
            download(url, fullname)
            unzip(fullname, name)
            pdf(name, name, pdfname)
        except Exception as e:
            # Deliberate best-effort batch: report which URL failed and why,
            # then carry on with the next one.
            print('failed %s: %s' % (url, e))
def download(url, name):
    """Fetch *url* into the local file *name* unless it is already cached.

    The transfer is skipped when *name* already exists and is non-empty.
    An empty file is treated as missing and fetched again. The body is
    streamed to disk in BUFFER_SIZE chunks rather than being read into
    memory in one piece, and the HTTP response is always closed.
    """
    print('downloading %s' % url)
    if os.path.exists(name) and os.stat(name).st_size != 0:
        return
    BUFFER_SIZE = 256 * 1024
    req = urllib2.Request(url)
    # NOTE(review): presumably the server rejects urllib2's default
    # User-Agent, hence the override - confirm before changing.
    req.add_header('User-Agent', 'Fiddler')
    res = urllib2.urlopen(req)
    try:
        with open(name, 'wb') as f:
            shutil.copyfileobj(res, f, BUFFER_SIZE)
    finally:
        res.close()
def unzip(path, name):
    """Extract the zip archive *path* into the current directory.

    Nothing is extracted when either *name* or ``name + '.pdf'`` already
    exists, i.e. when the archive was unpacked or converted earlier. In
    that case *path* is not opened at all.
    """
    print('unzip %s' % path)
    if os.path.exists(name) or os.path.exists(name + '.pdf'):
        return
    archive = zipfile.ZipFile(path)
    try:
        archive.extractall('.')
    finally:
        # Release the file handle even if extraction fails.
        archive.close()
def pdf(foldername, name, pdfname):
    """Convert the extracted source document to *pdfname*.

    The source is the first existing path among, in this order:
    ``name.txt``, ``foldername/name.txt``, ``name.htm``,
    ``foldername/name.htm``. The converter at ``tool_path`` is run only
    when *pdfname* does not exist yet.

    Raises:
        Exception: if none of the candidate source files exists.
    """
    candidates = [
        candidate
        for ext in ('.txt', '.htm')
        for candidate in (name + ext, foldername + '/' + name + ext)
    ]
    source = next((c for c in candidates if os.path.exists(c)), None)
    if source is None:
        raise Exception('No source file to convert')
    if os.path.exists(pdfname):
        return
    print('create pdf %s' % source)
    status = subprocess.call([tool_path, source, pdfname])
    if status != 0:
        # Report the failure instead of discarding it; do not raise, so a
        # conversion that still produced a file is not turned into an error.
        print('converter exited with status %s for %s' % (status, source))
def extract_urls(path):
    """Return every ``http://`` URL found in the text file at *path*.

    A URL runs from ``http://`` up to, but not including, the next single
    or double quote. Matches are returned in file order.
    """
    url_pattern = re.compile(r'http://[^"\']+')
    with open(path, 'r') as handle:
        contents = handle.read()
    return url_pattern.findall(contents)
def exam(path, urls):
    """Print every URL in *urls* whose PDF is missing from directory *path*.

    The expected PDF for a URL is ``<name>.pdf``, where ``<name>`` is the
    part of the URL's last path segment before its first dot.
    """
    for url in urls:
        fullname = url.split('/')[-1]
        name = fullname.split('.')[0]
        # Look inside *path*; previously the argument was ignored and the
        # current directory was always checked.
        if not os.path.exists(os.path.join(path, name + '.pdf')):
            print('%s not created' % url)
def extract_titles(path):
    """Return the book titles assigned in the text file at *path*.

    A title is the quoted text of a ``Book<digits>.Title = "...";``
    assignment. Titles are returned in file order.
    """
    title_pattern = re.compile(r'Book\d+\.Title = \"(.+?)\";')
    with open(path, 'r') as handle:
        contents = handle.read()
    return title_pattern.findall(contents)
def rename_to_title(urls, titles):
    """Copy each generated PDF to ``OUTPUT/<title>.pdf``.

    *urls* and *titles* are paired by position. The source PDF for a URL
    is ``<name>.pdf`` in the current directory, where ``<name>`` is the
    part of the URL's last path segment before its first dot. Backslash,
    slash, colon, double quote, asterisk, question mark, angle brackets
    and pipe are removed from the title before it is used as a file name.

    Despite the function name the source PDF is copied, not moved. A pair
    is skipped when the source PDF is missing or the target already
    exists. The ``OUTPUT`` directory is created when it is first needed.
    """
    illegal = re.compile(r'[\\/:"*?<>|]+')
    for url, title in zip(urls, titles):
        name = url.split('/')[-1].split('.')[0] + '.pdf'
        title = illegal.sub('', title)
        # Decode byte strings only; text that is already unicode is used
        # as-is instead of going through an implicit ASCII round-trip.
        if isinstance(title, bytes):
            title = title.decode('utf8')
        target = 'OUTPUT/' + title + '.pdf'
        if os.path.exists(name) and not os.path.exists(target):
            # copyfile does not create missing directories.
            if not os.path.isdir('OUTPUT'):
                os.makedirs('OUTPUT')
            shutil.copyfile(name, target)
# Script driver. URLs and titles are both read from data_source and are
# later paired by position in rename_to_title.
urls = extract_urls(data_source)
titles = extract_titles(data_source)
# Download, extract and convert every archive.
process(urls)
# Copy the resulting PDFs under their book titles.
rename_to_title(urls, titles)
# Finally list the URLs that did not end up with a PDF.
exam('.', urls)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment