Skip to content

Instantly share code, notes, and snippets.

@agucova
Created September 30, 2023 00:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save agucova/a34ea38c014121ba388154e8e0a5ae38 to your computer and use it in GitHub Desktop.
Save agucova/a34ea38c014121ba388154e8e0a5ae38 to your computer and use it in GitHub Desktop.
Script for converting remote <img> references in an Anki CSV export into inline, base64-encoded WebP images.
#!/usr/bin/env python3
import typer
import pandas as pd
import requests
import base64
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
from typing import Optional
def img_to_base64(url: str, timeout: float = 30.0) -> str:
    """Download an image and return it as a base64-encoded WebP data URI.

    Args:
        url: Address of the image, fetched with ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``data:image/webp;base64,...`` string containing the image
        re-encoded as WebP.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # requests has no default timeout; without one a stalled server would
    # block the whole conversion indefinitely.
    response = requests.get(url, timeout=timeout)
    # Surface the HTTP failure itself instead of handing an error page to
    # Pillow and getting an unrelated image-decoding error.
    response.raise_for_status()

    buffered = BytesIO()
    with Image.open(BytesIO(response.content)) as img:
        img.save(buffered, format="WEBP")
    image_data = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return f'data:image/webp;base64,{image_data}'
def convert_images_to_base64(html: Optional[str]) -> Optional[str]:
    """Replace every non-inline ``<img>`` source in an HTML fragment.

    Each ``<img>`` whose ``src`` is not already a ``data:`` URI has that
    attribute replaced by the data URI returned from ``img_to_base64``.

    Args:
        html: The HTML fragment to rewrite. Values that are not strings
            (``None``, NaN, numbers) are returned unchanged.

    Returns:
        The re-serialised HTML with image sources inlined, or the input
        itself when it is not a string.
    """
    # Only text can contain markup; passing None/NaN/numeric cells straight
    # through avoids handing them to the HTML parser.
    if not isinstance(html, str):
        return html

    soup = BeautifulSoup(html, 'html.parser')
    for img in soup.find_all('img'):
        src = img.get('src')
        # An <img> with no (or an empty) src has nothing to download, and
        # data: URIs are already inline.
        if not src or src.startswith('data:'):
            continue
        img['src'] = img_to_base64(src)
    return str(soup)
def main(input_file: str, output_file: str):
    """
    Convert image URLs in a CSV file to base64 encoded WebP images.

    Arguments:
        input_file: Path to the input CSV file.
        output_file: Path to the output CSV file.
    """
    # Columns whose cells hold the HTML that may reference images.
    html_columns = ('Question', 'Answer')

    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_file)

    # Check the expected columns up front so a wrong file produces a clear
    # command-line error rather than a bare KeyError.
    missing = [column for column in html_columns if column not in df.columns]
    if missing:
        missing_names = ', '.join(missing)
        raise typer.BadParameter(
            f"{input_file} is missing required column(s): {missing_names}"
        )

    # Remove rows where all elements are NaN
    df = df.dropna(how='all')

    # Convert each HTML column in one pass instead of writing cells back
    # one at a time while iterating over rows.
    for column in html_columns:
        df[column] = df[column].map(convert_images_to_base64)

    # Save the DataFrame back to a new CSV file
    df.to_csv(output_file, index=False)
    typer.echo(f"Converted images in {input_file} and saved to {output_file}")
# Run the converter as a command-line program only when this file is executed
# directly; main is handed to typer.run as the command to invoke.
if __name__ == "__main__":
    typer.run(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment