Last active
March 20, 2023 06:29
-
-
Save mojimi/1eb30c27f9f85540cc2f2fddd40a1a96 to your computer and use it in GitHub Desktop.
WeasyPrint Async Images/asyncio
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import aiohttp | |
import asyncio | |
import base64 | |
import time | |
from io import BytesIO | |
from PIL import Image | |
from bs4 import BeautifulSoup | |
from weasyprint import HTML | |
# Test fixture: a page of progressively smaller remote placeholder images,
# all fetched over plain HTTP from via.placeholder.com.
html_with_images = """
<div>
<img style="width:100%" src="http://via.placeholder.com/3000">
<img style="width:100%" src="http://via.placeholder.com/2500">
<img style="width:100%" src="http://via.placeholder.com/2000">
<img style="width:100%" src="http://via.placeholder.com/1500">
<img style="width:100%" src="http://via.placeholder.com/1000">
<img style="width:100%" src="http://via.placeholder.com/500">
<img style="width:100%" src="http://via.placeholder.com/250">
<img style="width:100%" src="http://via.placeholder.com/100">
</div>
"""
#Should probably make this into a class
def preprocessHtmlImages(html):
    """Inline every remote ``<img>`` in *html* as a base64 ``data:`` URI.

    All http(s) image sources are downloaded concurrently with aiohttp,
    then each tag's ``src`` is rewritten to a data URI so WeasyPrint does
    not have to fetch the images one at a time.

    :param html: HTML document or fragment as a string.
    :return: the rewritten HTML (``soup.prettify()`` output).
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Collect only tags whose src is an absolute http(s) URL; everything
    # else (relative paths, existing data URIs, missing src) is untouched.
    remote_imgs = [
        img for img in soup.find_all('img')
        if img.get('src', '').startswith(('http://', 'https://'))
    ]

    # Nothing to fetch: return immediately without spinning up a session.
    if not remote_imgs:
        return soup.prettify()

    #Coroutine
    async def _replace_img(session, img):
        img_url = img['src']
        print('Downloading image of url {}'.format(img_url))
        async with session.get(img_url) as resp:
            imgBytes = await resp.read()
            # Mimetype comes from the response header (resolves the old
            # TODO); fall back to sniffing with PIL when the server sends
            # a non-image content type.
            mime = resp.content_type
        if not mime or not mime.startswith('image/'):
            mime = 'image/{}'.format(Image.open(BytesIO(imgBytes)).format.lower())
        print('Finished downloading image of url {}'.format(img_url))
        img['src'] = 'data:{};base64,{}'.format(mime, base64.b64encode(imgBytes).decode())

    async def _inline_all():
        # The session (and its connector) must be created inside a running
        # event loop, and the async-with guarantees it is closed even if a
        # download raises — the old code leaked it on the no-image path.
        conn = aiohttp.TCPConnector(limit=10, limit_per_host=10)
        async with aiohttp.ClientSession(connector=conn) as session:
            #Fetching images asynchronously
            await asyncio.gather(*(_replace_img(session, img) for img in remote_imgs))

    asyncio.run(_inline_all())
    return soup.prettify()
# Benchmark: render once with images pre-inlined (concurrent downloads),
# then once letting WeasyPrint fetch each image itself.
started = time.time()
inlined_html = preprocessHtmlImages(html_with_images)
HTML(string=inlined_html).write_pdf('output_async.pdf')
print('Finished creating pdf asynchronously, time to complete : {}'.format(time.time() - started))

started = time.time()
HTML(string=html_with_images).write_pdf('output_sync.pdf')
print('Finished creating pdf synchronously, time to complete : {}'.format(time.time() - started))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hopefully more enhancements can be made! Please post your version!