Last active
March 20, 2019 20:46
-
-
Save zengargoyle/e0eff422521ca901bd94fbfefa4f1bed to your computer and use it in GitHub Desktop.
Simple WebVTT subtitle stripping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print only the caption text of the WebVTT file 1aZHmb8P: cue timing lines,
# inline tags and blank lines are dropped.
perl -ne '
  s/\r?\n$//;        # strip the LF or CRLF line ending
  next if /-->/;     # skip cue timing lines
  s/<\/?c.*?>//g;    # remove tags starting with "c" (opening or closing)
  s/<\d.*?>//g;      # remove tags starting with a digit (inline timestamps)
  next if /^\s*$/;   # skip lines that are empty after tag removal
  print "$_\n";      # print, not printf: a "%" in a caption is not a format
' <1aZHmb8P
# Get rid of the WebVTT header, then print only the caption text.
# The BEGIN block runs before the per-line loop and consumes input up to and
# including the first line that contains "##".
# NOTE(review): if no line contains "##" the BEGIN loop reads the whole input
# and nothing is printed -- confirm every input file carries that marker.
perl -ne '
  BEGIN { while (<>) { last if /##/ } }
  s/\r?\n$//;        # strip the LF or CRLF line ending
  next if /-->/;     # skip cue timing lines
  s/<\/?c.*?>//g;    # remove tags starting with "c" (opening or closing)
  s/<\d.*?>//g;      # remove tags starting with a digit (inline timestamps)
  next if /^\s*$/;   # skip lines that are empty after tag removal
  print "$_\n";      # print, not printf: a "%" in a caption is not a format
' <1aZHmb8P
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Download the English auto-generated subtitles of a video with youtube-dl
# and print them on stdout as plain text: cue timing lines, inline tags,
# blank lines and consecutive repeated lines are removed.
#
# Usage: <this-script> VIDEO_URL
set -euo pipefail

if (( $# < 1 )); then
  printf 'Usage: %s VIDEO_URL\n' "${0##*/}" >&2
  exit 2
fi

# Private scratch directory instead of fixed names in /tmp; removed on exit.
workdir=$(mktemp -d)
trap 'rm -rf -- "$workdir"' EXIT

# "--" keeps a URL or video id that starts with "-" from being parsed as an
# option; quoting keeps "?" and "&" in the URL away from the shell.
youtube-dl --write-auto-sub --sub-lang en --skip-download \
  -o "$workdir/test.vtt" -- "$1"

# The original passed "-o test.vtt" and then read "test.en.vtt", i.e. the
# subtitle file is expected to carry the language code before the extension.
subs="$workdir/test.en.vtt"
if [[ ! -f "$subs" ]]; then
  printf 'no English auto-generated subtitles were written for %s\n' "$1" >&2
  exit 1
fi

# -0777 slurps the whole file so the substitutions can span several lines;
# -p prints the result, so no in-place edit (and no .bak file) is needed.
perl -0777 -pe '
  # Replace each cue (timing line plus text up to the next blank line) with
  # just its text.
  s|(?m)^(\d{2}:\d{2}:\d{2}\.\d+) +--> +(\d{2}:\d{2}:\d{2}\.\d+).*[\r\n]+\s*(?s)((?:(?!\r?\n\r?\n).)*)|$3|mg;
  # Drop every <...> tag.
  s|<[^>]*>||mg;
  # Drop blank and whitespace-only lines.
  s|^\s*\n||mg;
  # Collapse runs of identical consecutive lines into one.
  s|^(.*)(\r?\n\1)+$|$1|mg;
  # Drop any timing lines that are still left.
  s|^(\d{2}:\d{2}:\d{2}\.\d+) +--> +(\d{2}:\d{2}:\d{2}\.\d+).*[\r\n]||mg;
' "$subs"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Minimal variant: drop cue timing lines and inline tags from 1aZHmb8P and
# print every remaining line (blank lines included).
# -l chomps each input line and makes print append the newline again.
perl -lne '
  next if /-->/;     # skip cue timing lines
  s/<\/?c.*?>//g;    # remove tags starting with "c" (opening or closing)
  s/<\d.*?>//g;      # remove tags starting with a digit (inline timestamps)
  print;
' <1aZHmb8P
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment