Skip to content

Instantly share code, notes, and snippets.

@syntaxhacker
Forked from Red5d/sup2srt.py
Created February 4, 2021 18:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save syntaxhacker/dae6151cba30cf93a145ab163175a93b to your computer and use it in GitHub Desktop.
Save syntaxhacker/dae6151cba30cf93a145ab163175a93b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
#
# Author: Red5d
#
# Description: Extract and run OCR on subtitles from a PGS-format .sup file.
#
# Example Usage: python sup2srt.py bd_subtitles.sup bd_subtitles.srt
#
# Dependencies:
# - pytesseract
# - tqdm
# - pysrt
# - pgsreader and imagemaker modules from (https://github.com/SavSanta/pgsreader)
#
import sys, pytesseract
from pgsreader import PGSReader
from imagemaker import make_image
from pysrt import SubRipFile, SubRipItem, SubRipTime
from tqdm import tqdm
def _ocr_displayset(ds):
    """Return the OCR text for the first image carried by DisplaySet *ds*."""
    # get Palette Display Segment
    pds = ds.pds[0]
    # get Object Display Segment
    ods = ds.ods[0]
    img = make_image(ods, pds)
    # OCR output may carry surrounding whitespace; keep only the text itself.
    return pytesseract.image_to_string(img).strip()


def _build_srt(displaysets):
    """Build a SubRipFile from an iterable of DisplaySets.

    A DisplaySet with an image opens a subtitle: its text comes from OCR and
    its start time from the first object segment's presentation timestamp.
    The next DisplaySet without an image closes it, using the presentation
    timestamp of its first END segment as the end time.
    """
    srt = SubRipFile()
    # (start timestamp, text) of the subtitle still waiting for its end time.
    pending = None
    # SRT cue numbering conventionally starts at 1.
    index = 1
    for ds in tqdm(displaysets):
        if ds.has_image:
            text = _ocr_displayset(ds)
            pending = (ds.ods[0].presentation_timestamp, text)
        elif pending is not None:
            start, text = pending
            # Clear the pending cue so that a further image-less DisplaySet
            # does not emit the same subtitle a second time.
            pending = None
            start_time = SubRipTime(milliseconds=int(start))
            end_time = SubRipTime(
                milliseconds=int(ds.end[0].presentation_timestamp)
            )
            srt.append(SubRipItem(index, start_time, end_time, text))
            index += 1
    return srt


def main(argv=None):
    """Run OCR on a PGS .sup file and write the result as an SRT file.

    argv is the full argument vector: program name, input .sup path and
    output .srt path. It defaults to ``sys.argv``. Arguments after the
    first three are ignored.

    Raises SystemExit with a usage message when either path is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        prog = argv[0] if argv else "sup2srt.py"
        sys.exit(f"Usage: {prog} <input.sup> <output.srt>")

    sup_path, srt_path = argv[1], argv[2]
    pgs = PGSReader(sup_path)

    # Load every DisplaySet up front so the OCR progress bar knows the total.
    print("Loading DisplaySets...")
    displaysets = list(tqdm(pgs.iter_displaysets()))

    print(f"Running OCR on {len(displaysets)} DisplaySets and building SRT file...")
    srt = _build_srt(displaysets)

    # Save first so the success message is only printed once the file exists.
    srt.save(srt_path, encoding='utf-8')
    print(f"Done. SRT file saved as {srt_path}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment