Skip to content

Instantly share code, notes, and snippets.

@dado3212
Created April 3, 2023 19:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dado3212/43aed1d0d702d7392fd82c97e3166a81 to your computer and use it in GitHub Desktop.
Save dado3212/43aed1d0d702d7392fd82c97e3166a81 to your computer and use it in GitHub Desktop.
A script to extract the alt text from images in an ePub
from ebooklib import epub
import io
from PIL import Image
from bs4 import BeautifulSoup
# ebooklib item-type codes (the values of ebooklib.ITEM_IMAGE and
# ebooklib.ITEM_DOCUMENT), named here so the comparisons below are not bare
# magic numbers.
_ITEM_IMAGE = 1
_ITEM_DOCUMENT = 9

# Path prefixes this book uses: <img src> values inside the XHTML files are
# relative ('../Images/foo.jpg'), while image item names are rooted at the
# package ('Images/foo.jpg'). Stripping both yields the same bare file name,
# so the two dicts below share keys.
_SRC_PREFIX = '../Images/'
_NAME_PREFIX = 'Images/'


def _strip_prefix(path, prefix):
    """Return `path` without a leading `prefix`.

    The path is returned unchanged when it does not start with `prefix`,
    instead of blindly chopping a fixed number of characters off of it.
    """
    if path.startswith(prefix):
        return path[len(prefix):]
    return path


# Open the EPUB file
book = epub.read_epub('frugal.epub')

# Maps image file name -> alt text (None when the <img> has no alt attribute).
image_alts = {}
# Maps image file name -> PIL image opened from the item's raw bytes.
raw_images = {}

# Iterate over each item in the book, extracting the alt and the raw image
# files
for item in book.get_items():
    item_type = item.get_type()
    if item_type == _ITEM_DOCUMENT:  # xhtml file
        soup = BeautifulSoup(item.get_content(), 'lxml')
        for image in soup.find_all('img'):
            src = image.get('src')
            if src is None:
                # An <img> without a src cannot be keyed; skip it rather
                # than crash on slicing None.
                continue
            image_alts[_strip_prefix(src, _SRC_PREFIX)] = image.get('alt')
    elif item_type == _ITEM_IMAGE:  # image
        img = Image.open(io.BytesIO(item.get_content()))
        raw_images[_strip_prefix(item.get_name(), _NAME_PREFIX)] = img
# For frugal.epub only: book-specific reports (chapter titles, FAQs, margins), left disabled.
# # Chapter Titles
# sorted_chapter_titles = {}
# for key in image_alts:
# if key[:2] == 'ch':
# number = int(key[2:][:-4])
# sorted_chapter_titles[number] = image_alts[key]
# sorted_chapter_titles = dict(sorted(sorted_chapter_titles.items()))
# for k in sorted_chapter_titles:
# print(str(k) + ' - ' + sorted_chapter_titles[k])
# print()
# print('Epilogue - ' + image_alts['epi.jpg'])
# # FAQs
# print()
# print('FAQs')
# print()
# sorted_faqs = {}
# for key in image_alts:
# if key[:3] == 'faq':
# number = int(key[3:][:-4])
# sorted_faqs[number] = image_alts[key]
# sorted_faqs = dict(sorted(sorted_faqs.items()))
# for k in sorted_faqs:
# print(str(k) + ' - ' + sorted_faqs[k])
# print()
# # Margins
# print()
# print('Margins')
# print()
# sorted_margins = {}
# for key in image_alts:
# if key[:7] == 'margins':
# number = int(key[7:][:-4])
# sorted_margins[number] = image_alts[key]
# sorted_margins = dict(sorted(sorted_margins.items()))
# for k in sorted_margins:
# print(str(k) + ' - ' + sorted_margins[k])
# print()
# Dump every collected image: its file name, then its alt text, then a blank
# line separating it from the next entry.
for image_name, alt_text in image_alts.items():
    print(image_name, alt_text, '', sep='\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment