Skip to content

Instantly share code, notes, and snippets.

@jaredyam
Last active October 18, 2024 11:39
Show Gist options
  • Save jaredyam/c5e55e2e8c089f65798fc9b529798315 to your computer and use it in GitHub Desktop.
Save jaredyam/c5e55e2e8c089f65798fc9b529798315 to your computer and use it in GitHub Desktop.
Remove duplicate subtitles in a .srt file
"""Remove duplicate subtitles in a .srt file.
- What does the standard srt file look like?
- See https://www.3playmedia.com/2017/03/08/create-srt-file/
- What does the duplicate subtitle look like?
- A duplicate subtitle is an entry whose text is the next subtitle's text followed by the previous subtitle's text.
- Which match pattern has been used in this script?
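- An entry is a duplicate when its last line equals the last line of the previous kept entry but its full text differs from that entry (see the two `if` conditions in `replace_duplicationss`).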
- What does the output srt file look like?
- [original_file_name]_changed.srt
Use Case
--------
From
#########################################
22
00:01:47,800 --> 00:01:49,880
Well, you've probably heard the word bit.
23
00:01:49,880 --> 00:01:49,920
# This is a duplicate subtitle.
- We live in the information age
Well, you've probably heard the word bit.
24
00:01:49,920 --> 00:01:51,720
- We live in the information age
#########################################
To
#########################################
22
00:01:47,800 --> 00:01:49,880
Well, you've probably heard the word bit.
23
00:01:49,880 --> 00:01:49,920
24
00:01:49,920 --> 00:01:51,720
- We live in the information age
#########################################
"""
import sys
def entries_generator(srt_file):
    """Yield the entries of a .srt file one at a time.

    Each entry is read as a sequence-number line, a timecode line and the
    subtitle lines that follow, up to the blank line that ends the entry.
    Stray blank lines before an entry are skipped, and a file that ends
    without a trailing blank line is handled.

    input:
        srt_file: The original filename. [*.srt]
    output:
        entries: A generator of (number_in_sequence, timecode, subtitles)
            tuples. The first two items are the raw lines as read from the
            file; subtitles is a list of newline-terminated text lines
            (empty when the entry has no text).
    """
    with open(srt_file, 'r') as srt:
        while True:
            number_in_sequence = srt.readline()
            # Tolerate extra blank lines between entries and at the end of
            # the file instead of misreading them as a sequence number.
            while number_in_sequence == '\n':
                number_in_sequence = srt.readline()
            # readline() returns '' only at the end of the file.
            if not number_in_sequence:
                break
            timecode = srt.readline()

            # Collect the text lines of this entry.
            subtitles = []
            while True:
                subtitle = srt.readline()
                # A blank line ends the entry. '' means the file ended
                # without one; it must stop the loop too, because
                # readline() keeps returning '' forever at EOF.
                if subtitle == '\n' or not subtitle:
                    break
                # Only the very last line of a file can lack its newline;
                # restore it so every yielded line has the same shape.
                if not subtitle.endswith('\n'):
                    subtitle += '\n'
                subtitles.append(subtitle)
            yield number_in_sequence, timecode, subtitles
def replace_duplicationss(entries):
    """Yield the entries with duplicate subtitles replaced by [''].

    An entry counts as a duplicate when its last line equals the last line
    of the previous kept entry while its full text differs from that entry.
    An entry identical to the previous one is treated as a normal repeated
    subtitle and is left alone. Entries without any text lines are passed
    through unchanged.

    Every detected duplicate is printed together with the entry it was
    compared against, and a total is printed once the input is exhausted.

    input:
        entries: The entries generator, yielding
            (number_in_sequence, timecode, subtitles) tuples.
    output:
        entries: A generator of the same tuples, where the subtitles of
            each duplicate entry are replaced by [''].
    """
    count_for_duplications = 0
    # The last non-duplicate subtitles seen, used for the comparison.
    previous_subtitles = ['']
    for number_in_sequence, timecode, subtitles in entries:
        if not subtitles:
            # An entry with no text has no last line to compare; indexing
            # it would raise IndexError. Such entries appear in this
            # script's own output, where a removed duplicate is written as
            # a blank line. Pass them through and keep the reference.
            yield number_in_sequence, timecode, subtitles
            continue

        # The match pattern: the last lines of both entries are the same...
        if subtitles[-1] == previous_subtitles[-1]:
            # ...except for a normal repeated subtitle.
            if subtitles != previous_subtitles:
                count_for_duplications += 1
                # Report which duplicate subtitles have been detected.
                print('######################################################')
                print(f'Current number in sequence: {number_in_sequence}')
                print('Duplicate subtitle:')
                print(''.join(subtitles))
                print('The previous one:')
                print(''.join(previous_subtitles))
                print('######################################################')
                # Rebind rather than mutate, so the caller's list is intact.
                subtitles = ['']
        else:
            previous_subtitles = subtitles
        yield number_in_sequence, timecode, subtitles
    print(f'Total {count_for_duplications} duplications have been fuund out.')
if __name__ == '__main__':
    # Local import: only this script entry point needs it.
    import os

    # Exit with a usage message rather than an IndexError traceback when
    # the filename argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <file.srt>')
    srt_file = sys.argv[1]

    # Output name is [original_file_name]_changed.srt. splitext() drops the
    # real extension, whatever its length, so a name without one is no
    # longer truncated by four characters.
    output_file = os.path.splitext(srt_file)[0] + '_changed.srt'

    entries = entries_generator(srt_file)
    with open(output_file, 'w') as f:
        cleaned_entries = replace_duplicationss(entries)
        for number_in_sequence, timecode, subtitles in cleaned_entries:
            f.write(number_in_sequence)
            f.write(timecode)
            f.writelines(subtitles)
            # Blank line that separates entries in the .srt format.
            f.write('\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment