Skip to content

Instantly share code, notes, and snippets.

@mojimi
Last active March 20, 2023 06:29
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mojimi/1eb30c27f9f85540cc2f2fddd40a1a96 to your computer and use it in GitHub Desktop.
Save mojimi/1eb30c27f9f85540cc2f2fddd40a1a96 to your computer and use it in GitHub Desktop.
WeasyPrint Async Images/asyncio
import aiohttp
import asyncio
import base64
import time
from io import BytesIO
from PIL import Image
from bs4 import BeautifulSoup
from weasyprint import HTML
# Sample document for the benchmark at the bottom of this script: a single
# <div> holding eight <img> tags whose sources are remote http:// placeholder
# URLs. It is passed both to preprocessHtmlImages() and, unmodified, straight
# to WeasyPrint.
html_with_images = """
<div>
<img style="width:100%" src="http://via.placeholder.com/3000">
<img style="width:100%" src="http://via.placeholder.com/2500">
<img style="width:100%" src="http://via.placeholder.com/2000">
<img style="width:100%" src="http://via.placeholder.com/1500">
<img style="width:100%" src="http://via.placeholder.com/1000">
<img style="width:100%" src="http://via.placeholder.com/500">
<img style="width:100%" src="http://via.placeholder.com/250">
<img style="width:100%" src="http://via.placeholder.com/100">
</div>
"""
#Should probably make this into a class
def preprocessHtmlImages(html):
    """Return `html` with every remote <img> source inlined as a data URI.

    The markup is parsed with BeautifulSoup's 'html.parser'. Each <img> whose
    `src` starts with 'http://' or 'https://' is downloaded concurrently with
    aiohttp and its `src` is replaced by a
    'data:image/<format>;base64,<payload>' URI, where <format> is the image
    format Pillow detects from the downloaded bytes. Other <img> tags are
    left untouched.

    Args:
        html: HTML markup as a string.

    Returns:
        The serialized document as a string. When no remote image is present
        no network session or event loop is created.

    Raises:
        Any exception raised while downloading or identifying an image
        propagates to the caller (nothing is swallowed here).

    Note:
        This function drives its own event loop via `asyncio.run`, so it must
        be called from synchronous code, not from inside a running loop.
    """
    #Reading img tags in HTML string using BeautifulSoup4
    soup = BeautifulSoup(html, 'html.parser')
    #Only absolute http(s) sources are fetched; a missing src counts as ''
    remote_imgs = [
        img for img in soup.find_all('img')
        if (img.get('src') or '').startswith(('http://', 'https://'))
    ]

    #Coroutine: download one image and rewrite its tag in place
    async def replaceImg(session, img):
        img_url = img['src']
        print('Downloading image of url {}'.format(img_url))
        async with session.get(img_url) as resp:
            imgBytes = await resp.read()
            #Pillow is only used to sniff the format; close it right away
            with Image.open(BytesIO(imgBytes)) as image:
                img_format = image.format.lower()
            print('Finished downloading image of url {}'.format(img_url))
            #TODO: Get image mimetype from request
            img['src'] = 'data:image/{};base64,{}'.format(
                img_format, base64.b64encode(imgBytes).decode())

    #Coroutine: own the session so it is created inside the running loop and
    #always closed, even when one of the downloads raises
    async def replaceAll():
        #aiohttp client, set limit according to your tests
        conn = aiohttp.TCPConnector(limit=10, limit_per_host=10)
        async with aiohttp.ClientSession(connector=conn) as session:
            await asyncio.gather(
                *(replaceImg(session, img) for img in remote_imgs))

    #Fetching images asynchronously
    if remote_imgs:
        asyncio.run(replaceAll())

    #str() serializes the tree as-is; prettify() would insert whitespace
    #between tags, which can change how the document renders
    return str(soup)
# Benchmark 1: inline the remote images concurrently first, then render.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# skewed by system clock adjustments the way time.time() differences can.
timea = time.perf_counter()
html_string = preprocessHtmlImages(html_with_images)
HTML(string=html_string).write_pdf('output_async.pdf')
print('Finished creating pdf asynchronously, time to complete : {}'.format(time.perf_counter() - timea))

# Benchmark 2: hand the untouched markup to WeasyPrint and let it fetch the
# images itself while rendering.
timea = time.perf_counter()
HTML(string=html_with_images).write_pdf('output_sync.pdf')
print('Finished creating pdf synchronously, time to complete : {}'.format(time.perf_counter() - timea))
@mojimi
Copy link
Author

mojimi commented Mar 29, 2019

Hopefully more enhancements can be made! Please post your version!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment