Created
July 20, 2012 07:33
-
-
Save mikezhuyuan/3149304 to your computer and use it in GitHub Desktop.
Convert downloaded HTML/text books to PDF using wkhtmltopdf.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re, urllib2, subprocess, zipfile, os, shutil | |
#first install wkhtmltopdf
# Absolute path to the wkhtmltopdf executable (install it separately).
tool_path = r'C:/Program Files/wkhtmltopdf/wkhtmltopdf.exe'
# JS data file that the URL and title lists are scraped from.
data_source = r'D:\TFS\ETTfsApp\EnglishTown\Team_ELab\Development\Source\ELab\ELab\EFSchools.Englishtown.ELab.UI\_scripts\elab-ui\library\binddataelibresults.js'
def process(urls): | |
for url in urls: | |
print 'processing', url | |
try: | |
fullname = url.split('/')[-1] | |
name = fullname.split('.')[0] | |
ext = fullname.split('.')[-1] | |
pdfname = name + '.pdf' | |
download(url, fullname) | |
unzip(fullname, name) | |
pdf(name, name, pdfname) | |
except Exception as e: | |
print e | |
def download(url, name): | |
print 'downloading', url | |
if not os.path.exists(name) or os.stat(name).st_size == 0: | |
BUFFER_SIZE = 256 * 1024 | |
req = urllib2.Request(url) | |
req.add_header('User-Agent', 'Fiddler') | |
res = urllib2.urlopen(req) | |
with open(name, 'wb') as f: | |
f.write(res.read()) | |
def unzip(path, name): | |
print 'unzip', path | |
if not os.path.exists(name) and not os.path.exists(name+'.pdf'): | |
zip = zipfile.ZipFile(path) | |
zip.extractall('.') | |
def pdf(foldername, name, pdfname): | |
source = None | |
for ext in ['.txt', '.htm']: | |
p = name + ext | |
if os.path.exists(p): | |
source = p | |
break | |
p = foldername + '/' + name + ext | |
if os.path.exists(p): | |
source = p | |
break | |
if source is None: | |
raise Exception('No source file to convert') | |
if not os.path.exists(pdfname): | |
print 'create pdf', source | |
subprocess.call([tool_path, source, pdfname]) | |
def extract_urls(path):
    """Return every http:// URL found in the file at *path*."""
    url_pattern = re.compile(r'http://[^"\']+')
    with open(path, 'r') as handle:
        contents = handle.read()
    return url_pattern.findall(contents)
def exam(path, urls): | |
for url in urls: | |
fullname = url.split('/')[-1] | |
name = fullname.split('.')[0] | |
if not os.path.exists(name + '.pdf'): | |
print url, 'not created' | |
def extract_titles(path):
    """Return the Book<N>.Title values declared in the file at *path*."""
    title_pattern = re.compile(r'Book\d+\.Title = \"(.+?)\";')
    with open(path, 'r') as handle:
        return title_pattern.findall(handle.read())
def rename_to_title(urls, titles):
    """Copy each downloaded <name>.pdf to OUTPUT/<title>.pdf.

    Characters illegal in Windows filenames are stripped from the title.
    Existing targets and missing sources are skipped.
    """
    illegal = re.compile(r'[\\/:"*?<>|]+')
    # copyfile does not create directories: make sure OUTPUT/ exists
    # (the original crashed on first run without it)
    if not os.path.exists('OUTPUT'):
        os.makedirs('OUTPUT')
    for url, title in zip(urls, titles):
        name = url.split('/')[-1].split('.')[0] + '.pdf'
        # decode to unicode so non-ASCII titles form a valid path (Python 2)
        title = illegal.sub('', title).decode('utf8')
        target = 'OUTPUT/' + title + '.pdf'
        if os.path.exists(name) and not os.path.exists(target):
            shutil.copyfile(name, target)
# Script entry: scrape the URL and title lists from the JS data source,
# download/convert every book, copy the PDFs under their friendly titles,
# then report any URLs that never produced a PDF.
urls = extract_urls(data_source)
titles = extract_titles(data_source)
process(urls)
rename_to_title(urls, titles)
exam('.', urls)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment