Skip to content

Instantly share code, notes, and snippets.

@Red5d
Created January 16, 2020 01:55
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save Red5d/94b022e527b9ddfe198207e2536e21bf to your computer and use it in GitHub Desktop.
Save Red5d/94b022e527b9ddfe198207e2536e21bf to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#
# Author: Red5d
#
# Description: Extract and run OCR on subtitles from a PGS-format .sup file.
#
# Example Usage: python sup2srt.py bd_subtitles.sup bd_subtitles.srt
#
# Dependencies:
# - pytesseract
# - tqdm
# - pysrt
# - pgsreader and imagemaker modules from https://github.com/SavSanta/pgsreader
# - numpy (reported as required in a fresh environment)
#
import sys, pytesseract
from pgsreader import PGSReader
from imagemaker import make_image
from pysrt import SubRipFile, SubRipItem, SubRipTime
from tqdm import tqdm
def collect_cues(display_sets, ocr):
    """Pair image-bearing DisplaySets with the set that ends them.

    Args:
        display_sets: Iterable of DisplaySet-like objects exposing
            ``has_image``, ``ods``, ``pds`` and ``end``. The first entry of
            ``ods`` / ``end`` must carry a ``presentation_timestamp``, which
            is used here as a millisecond value.
        ocr: Callable ``ocr(ods, pds) -> str`` returning the recognised text
            for one Object/Palette segment pair.

    Returns:
        A list of ``(start_ms, end_ms, text)`` tuples in stream order.
        Timestamps are truncated to ``int``; text is whitespace-stripped.

    Notes:
        * A set without an image only closes a cue when one is pending, so
          leading or repeated clearing sets never yield empty or duplicate
          cues.
        * An image set arriving while another cue is still pending closes the
          pending cue at the new image's timestamp instead of dropping it.
        * Cues whose OCR text is empty after stripping are skipped.
        * A trailing image with no later set is dropped because it has no end
          time.
    """
    cues = []
    pending_text = ""
    pending_start = 0
    for ds in display_sets:
        if ds.has_image:
            # Object Display Segment (bitmap) and Palette Display Segment.
            ods = ds.ods[0]
            pds = ds.pds[0]
            start = ods.presentation_timestamp
            if pending_text:
                # The previous image was never cleared explicitly; it ends
                # when this one replaces it.
                cues.append((int(pending_start), int(start), pending_text))
            # strip() removes the trailing newline / form feed OCR engines
            # commonly append to their output.
            pending_text = ocr(ods, pds).strip()
            pending_start = start
        elif pending_text:
            end = ds.end[0].presentation_timestamp
            cues.append((int(pending_start), int(end), pending_text))
            pending_text = ""
    return cues


def main(argv=None):
    """Convert a PGS ``.sup`` file into an OCR'd ``.srt`` file.

    Args:
        argv: Optional ``[sup_path, srt_path]`` list; defaults to
            ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 2 when the arguments are wrong.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 2:
        print("Usage: python sup2srt.py <input.sup> <output.srt>",
              file=sys.stderr)
        return 2
    supFile, srtFile = args

    pgs = PGSReader(supFile)

    # Load every DisplaySet up front so the OCR progress bar has a total.
    print("Loading DisplaySets...")
    allsets = list(tqdm(pgs.iter_displaysets()))

    print(f"Running OCR on {len(allsets)} DisplaySets and building SRT file...")
    cues = collect_cues(
        tqdm(allsets),
        lambda ods, pds: pytesseract.image_to_string(make_image(ods, pds)),
    )

    srt = SubRipFile()
    # SubRip cue numbering starts at 1.
    for subIndex, (start_ms, end_ms, subText) in enumerate(cues, start=1):
        startTime = SubRipTime(milliseconds=start_ms)
        endTime = SubRipTime(milliseconds=end_ms)
        srt.append(SubRipItem(subIndex, startTime, endTime, subText))

    # Report success only after the file has actually been written.
    srt.save(srtFile, encoding='utf-8')
    print(f"Done. SRT file saved as {srtFile}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
@autumnwindbleak
Copy link

For a brand new environment, numpy dependency is needed too

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment