Last active
March 20, 2023 06:29
-
-
Save mojimi/1eb30c27f9f85540cc2f2fddd40a1a96 to your computer and use it in GitHub Desktop.
WeasyPrint Async Images/asyncio
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import aiohttp | |
import asyncio | |
import base64 | |
import time | |
from io import BytesIO | |
from PIL import Image | |
from bs4 import BeautifulSoup | |
from weasyprint import HTML | |
# Test fixture: a page of progressively smaller remote placeholder images,
# all fetched over plain HTTP from via.placeholder.com.
html_with_images = """
<div>
<img style="width:100%" src="http://via.placeholder.com/3000">
<img style="width:100%" src="http://via.placeholder.com/2500">
<img style="width:100%" src="http://via.placeholder.com/2000">
<img style="width:100%" src="http://via.placeholder.com/1500">
<img style="width:100%" src="http://via.placeholder.com/1000">
<img style="width:100%" src="http://via.placeholder.com/500">
<img style="width:100%" src="http://via.placeholder.com/250">
<img style="width:100%" src="http://via.placeholder.com/100">
</div>
"""
#Should probably make this into a class
def preprocessHtmlImages(html):
    """Inline every remote ``<img>`` in *html* as a base64 ``data:`` URI.

    All http(s) image sources are downloaded concurrently with aiohttp,
    then each tag's ``src`` is rewritten to a data URI so WeasyPrint does
    not have to fetch the images one at a time.

    :param html: HTML document or fragment as a string.
    :return: the rewritten HTML (``soup.prettify()`` output).
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Collect only tags whose src is an absolute http(s) URL; everything
    # else (relative paths, existing data URIs, missing src) is untouched.
    remote_imgs = [
        img for img in soup.find_all('img')
        if img.get('src', '').startswith(('http://', 'https://'))
    ]

    # Nothing to fetch: return immediately without spinning up a session.
    if not remote_imgs:
        return soup.prettify()

    #Coroutine
    async def _replace_img(session, img):
        img_url = img['src']
        print('Downloading image of url {}'.format(img_url))
        async with session.get(img_url) as resp:
            imgBytes = await resp.read()
            # Mimetype comes from the response header (resolves the old
            # TODO); fall back to sniffing with PIL when the server sends
            # a non-image content type.
            mime = resp.content_type
        if not mime or not mime.startswith('image/'):
            mime = 'image/{}'.format(Image.open(BytesIO(imgBytes)).format.lower())
        print('Finished downloading image of url {}'.format(img_url))
        img['src'] = 'data:{};base64,{}'.format(mime, base64.b64encode(imgBytes).decode())

    async def _inline_all():
        # The session (and its connector) must be created inside a running
        # event loop, and the async-with guarantees it is closed even if a
        # download raises — the old code leaked it on the no-image path.
        conn = aiohttp.TCPConnector(limit=10, limit_per_host=10)
        async with aiohttp.ClientSession(connector=conn) as session:
            #Fetching images asynchronously
            await asyncio.gather(*(_replace_img(session, img) for img in remote_imgs))

    asyncio.run(_inline_all())
    return soup.prettify()
# Benchmark: render once with images pre-inlined (concurrent downloads),
# then once letting WeasyPrint fetch each image itself.
started = time.time()
inlined_html = preprocessHtmlImages(html_with_images)
HTML(string=inlined_html).write_pdf('output_async.pdf')
print('Finished creating pdf asynchronously, time to complete : {}'.format(time.time() - started))

started = time.time()
HTML(string=html_with_images).write_pdf('output_sync.pdf')
print('Finished creating pdf synchronously, time to complete : {}'.format(time.time() - started))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hopefully more enhancements can be made! Please post your version!