Last active
October 18, 2024 11:39
-
-
Save jaredyam/c5e55e2e8c089f65798fc9b529798315 to your computer and use it in GitHub Desktop.
Remove duplicate subtitles in a .srt file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Remove duplicate subtitles in a .srt file. | |
- What does the standard srt file look like?
    - See https://www.3playmedia.com/2017/03/08/create-srt-file/
- What does the duplicate subtitle look like?
    - A duplicate subtitle is roughly the concatenation of the next subtitle and the previous one.
- Which match pattern has been used in this script?
    - See the last-line comparison in `replace_duplicationss` (lines 85 and 87 of the original script).
- What does the output srt file look like?
    - [original_file_name]_changed.srt
Use Case
--------
From
#########################################
22
00:01:47,800 --> 00:01:49,880
Well, you've probably heard the word bit.

23
00:01:49,880 --> 00:01:49,920
# This is a duplicate subtitle.
- We live in the information age
Well, you've probably heard the word bit.

24
00:01:49,920 --> 00:01:51,720
- We live in the information age
#########################################
To
#########################################
22
00:01:47,800 --> 00:01:49,880
Well, you've probably heard the word bit.

23
00:01:49,880 --> 00:01:49,920

24
00:01:49,920 --> 00:01:51,720
- We live in the information age
#########################################
""" | |
import sys | |
def entries_generator(srt_file):
    """Yield the entries of a .srt file one at a time.

    An entry is a sequence-number line, a timecode line and zero or more
    subtitle lines, terminated by a blank line or by the end of the file.

    input:
        srt_file: The original filename. [*.srt]
    output:
        entries: A generator of (number_in_sequence, timecode, subtitles)
            tuples. The first two items are the raw lines as read from
            the file; `subtitles` is a list of newline-terminated text
            lines and may be empty.
    """
    with open(srt_file, 'r') as srt:
        while True:
            # Skip stray blank lines before an entry so that extra or
            # trailing separators are not mistaken for a sequence number.
            number_in_sequence = srt.readline()
            while number_in_sequence == '\n':
                number_in_sequence = srt.readline()

            # readline() returns '' only at the end of the file.
            if not number_in_sequence:
                break

            timecode = srt.readline()

            # Collect every text line up to the blank separator.
            subtitles = []
            while True:
                subtitle = srt.readline()
                # '\n' ends the entry; '' means the file ended without a
                # trailing blank line. Without the EOF check this loop
                # would never terminate on such files.
                if subtitle in ('', '\n'):
                    break
                # The very last line of a file may lack its newline; add
                # it so every subtitle line is terminated consistently.
                if not subtitle.endswith('\n'):
                    subtitle += '\n'
                subtitles.append(subtitle)

            yield number_in_sequence, timecode, subtitles
def replace_duplicationss(entries):
    """Blank out duplicate subtitles while streaming the entries through.

    An entry is treated as a duplicate when its last text line equals the
    last text line of the most recent non-duplicate entry while the two
    entries are not identical as a whole. Identical consecutive entries
    (a normal repeated subtitle) are kept. Entries without any text are
    passed through untouched and are not used for later comparisons.

    A report of every detected duplicate, and a final total once the
    input is exhausted, is printed to standard output.

    input:
        entries: An iterable of (number_in_sequence, timecode, subtitles)
            tuples, e.g. the entries generator.
    output:
        entries: A generator of the same tuples, where the subtitles of
            each duplicate entry are replaced with [''].
    """
    count_for_duplications = 0
    # Text of the last entry that was accepted as a reference; the next
    # entries are compared against it.
    previous_subtitles = ['']
    for number_in_sequence, timecode, subtitles in entries:
        if not subtitles:
            # Nothing to compare (and subtitles[-1] would raise
            # IndexError), so an empty entry can never be a duplicate.
            yield number_in_sequence, timecode, subtitles
            continue

        if subtitles[-1] != previous_subtitles[-1]:
            # Last lines differ: a regular entry, which becomes the new
            # reference for the following comparisons.
            previous_subtitles = subtitles
        elif subtitles != previous_subtitles:
            # Same last line but different content overall: a duplicate.
            count_for_duplications += 1
            # Report which duplicate subtitles have been detected.
            print('######################################################')
            print(f'Current number in sequence: {number_in_sequence}')
            print('Duplicate subtitle:')
            print(''.join(subtitles))
            print('The previous one:')
            print(''.join(previous_subtitles))
            print('######################################################')
            subtitles = ['']
        # else: the entry exactly repeats the reference one; keep it.

        yield number_in_sequence, timecode, subtitles
    print(f'Total {count_for_duplications} duplications have been found out.')
if __name__ == '__main__':
    # Command-line entry point: read the .srt file named by the first
    # argument and write the cleaned copy next to it as
    # "<original name without extension>_changed.srt".
    import os

    # Give a usage hint instead of an IndexError traceback when the
    # filename argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f'Usage: {sys.argv[0]} <subtitles.srt>')

    srt_file = sys.argv[1]
    # splitext removes only a real extension, whereas slicing off the last
    # four characters mangles names that do not end in a 4-character suffix.
    output_file = os.path.splitext(srt_file)[0] + '_changed.srt'

    entries = entries_generator(srt_file)
    with open(output_file, 'w') as f:
        for number_in_sequence, timecode, subtitles in replace_duplicationss(entries):
            f.write(number_in_sequence)
            f.write(timecode)
            f.writelines(subtitles)
            # A blank line terminates every entry in the .srt format.
            f.write('\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment