Created
July 20, 2012 07:33
-
-
Save mikezhuyuan/3149304 to your computer and use it in GitHub Desktop.
Convert downloaded HTML/text books to PDF using wkhtmltopdf.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re, urllib2, subprocess, zipfile, os, shutil | |
#first install wkhtmltopdf
# Absolute path to the wkhtmltopdf executable (install it separately).
tool_path = r'C:/Program Files/wkhtmltopdf/wkhtmltopdf.exe'
# JS data file that the URL and title lists are scraped from.
data_source = r'D:\TFS\ETTfsApp\EnglishTown\Team_ELab\Development\Source\ELab\ELab\EFSchools.Englishtown.ELab.UI\_scripts\elab-ui\library\binddataelibresults.js'
def process(urls): | |
for url in urls: | |
print 'processing', url | |
try: | |
fullname = url.split('/')[-1] | |
name = fullname.split('.')[0] | |
ext = fullname.split('.')[-1] | |
pdfname = name + '.pdf' | |
download(url, fullname) | |
unzip(fullname, name) | |
pdf(name, name, pdfname) | |
except Exception as e: | |
print e | |
def download(url, name): | |
print 'downloading', url | |
if not os.path.exists(name) or os.stat(name).st_size == 0: | |
BUFFER_SIZE = 256 * 1024 | |
req = urllib2.Request(url) | |
req.add_header('User-Agent', 'Fiddler') | |
res = urllib2.urlopen(req) | |
with open(name, 'wb') as f: | |
f.write(res.read()) | |
def unzip(path, name): | |
print 'unzip', path | |
if not os.path.exists(name) and not os.path.exists(name+'.pdf'): | |
zip = zipfile.ZipFile(path) | |
zip.extractall('.') | |
def pdf(foldername, name, pdfname): | |
source = None | |
for ext in ['.txt', '.htm']: | |
p = name + ext | |
if os.path.exists(p): | |
source = p | |
break | |
p = foldername + '/' + name + ext | |
if os.path.exists(p): | |
source = p | |
break | |
if source is None: | |
raise Exception('No source file to convert') | |
if not os.path.exists(pdfname): | |
print 'create pdf', source | |
subprocess.call([tool_path, source, pdfname]) | |
def extract_urls(path):
    """Return every http:// URL found in the file at *path*."""
    url_pattern = re.compile(r'http://[^"\']+')
    with open(path, 'r') as handle:
        contents = handle.read()
    return url_pattern.findall(contents)
def exam(path, urls): | |
for url in urls: | |
fullname = url.split('/')[-1] | |
name = fullname.split('.')[0] | |
if not os.path.exists(name + '.pdf'): | |
print url, 'not created' | |
def extract_titles(path):
    """Return the Book<N>.Title values declared in the file at *path*."""
    title_pattern = re.compile(r'Book\d+\.Title = \"(.+?)\";')
    with open(path, 'r') as handle:
        return title_pattern.findall(handle.read())
def rename_to_title(urls, titles):
    """Copy each downloaded <name>.pdf to OUTPUT/<title>.pdf.

    Characters illegal in Windows filenames are stripped from the title.
    Existing targets and missing sources are skipped.
    """
    illegal = re.compile(r'[\\/:"*?<>|]+')
    # copyfile does not create directories: make sure OUTPUT/ exists
    # (the original crashed on first run without it)
    if not os.path.exists('OUTPUT'):
        os.makedirs('OUTPUT')
    for url, title in zip(urls, titles):
        name = url.split('/')[-1].split('.')[0] + '.pdf'
        # decode to unicode so non-ASCII titles form a valid path (Python 2)
        title = illegal.sub('', title).decode('utf8')
        target = 'OUTPUT/' + title + '.pdf'
        if os.path.exists(name) and not os.path.exists(target):
            shutil.copyfile(name, target)
# Script entry: scrape the URL and title lists from the JS data source,
# download/convert every book, copy the PDFs under their friendly titles,
# then report any URLs that never produced a PDF.
urls = extract_urls(data_source)
titles = extract_titles(data_source)
process(urls)
rename_to_title(urls, titles)
exam('.', urls)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment