Created
September 29, 2016 22:17
-
-
Save avioli/d72cfe1ee9b310c05ee844ca3e609789 to your computer and use it in GitHub Desktop.
A SubRip subtitle fixer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SRT fixer
# Version: 1.0
# Author: Evo Stamatov <aviolito@gmail.com>
# USAGE:
########
# To re-number the subtitles and trim whitespace:
# cat <filename>.srt | awk -f path/to/srt.awk
# To shift all subtitles 3 seconds earlier (towards the beginning):
# cat <filename>.srt | awk -v constant=-3000 -f path/to/srt.awk
# To scale all times by 10% and then shift them forward by 300ms - 00:01:00,000 becomes 00:01:06,300:
# cat <filename>.srt | awk -v multiplier=1.1 -v constant=300 -f path/to/srt.awk
# To remove Subtitles for the Deaf or Hard-of-hearing (SDH):
# cat <filename>.srt | awk -v remove_sdh=1 -f path/to/srt.awk
# NOTES:
########
# If there is no output, check that your input is UTF-8 using "file -I <filename>";
# if it is not, convert it using "iconv -f <IN-ENC> -t UTF-8 <filename>"
# The multiplier is always applied first and the constant is added second.
# If you need it the other way around - run once with the constant only,
# then again with only the multiplier:
# cat <filename>.srt | awk -v constant=300 -f path/to/srt.awk | awk -v multiplier=1.1 -f path/to/srt.awk
# Tested with BSD awk and GNU awk 4.1.4
# FORMAT SPEC: https://en.wikipedia.org/wiki/SubRip#SubRip_text_file_format
# Set defaults for every option that was not supplied with -v, then set up
# the record and field separators used by the rest of the script.
BEGIN {
    ## private globals
    debug = 0           # debug mode switch (always reset to off here)
    __ln = 0            # number of the subtitle currently being written

    # An empty RS puts awk in paragraph mode (records are separated by blank
    # lines); with FS set to a newline every line of a record is one field.
    RS = ""
    ORS = ""
    FS = "\n"
    OFS = "\n"

    ## public options
    if (!length(multiplier)) multiplier = 1     # scale factor applied to every time
    if (!length(constant))   constant = 0       # offset in ms, added after scaling

    ## advanced options
    if (!length(remove_sdh))     remove_sdh = 0       # remove subtitles for the deaf or hard-of-hearing
    if (!length(fix_whitespace)) fix_whitespace = 1   # consolidate whitespace
    if (!length(fix_trim))       fix_trim = 1         # trim whitespace

    ## not implemented yet
    # fix_short = 0 # fix short subtitles
    # fix_overlap = 0 # fix overlapping subtitle times
    # remove_tags = 1 # remove html tags
    # http://translationjournal.net/journal/04stndrd.htm
    # min_duration = 1500 # minimum duration of a single subtitle
    # min_spacing = 250 # minimum duration between two consecutive subtitles
}
# Main rule: runs once per record. $2 is handed to println() as the timing
# line and $3..$NF as the text lines; $1 is never read, because the output is
# renumbered from the __ln counter.
{
    # Gather the text lines into a 0-based array. $3 is always stored, even
    # when the record has fewer than three fields (it is then empty).
    delete __lines
    n = 0
    i = 3
    do {
        __lines[n++] = $i
    } while (++i <= NF)

    # println() returns -1 when it printed nothing; give the number back so
    # that the output numbering has no gaps.
    if (println(++__ln, $2, __lines) == -1) {
        __ln--
    }
    # if (__ln > 5) { exit } # debug
}
# Clean up one line of subtitle text and return the result.
# Which steps run is controlled by the globals remove_sdh, fix_whitespace and
# fix_trim; the steps always run in that order.
function preprocess(line) {
    if (remove_sdh) {
        ## strip "NAME: text" => "text"
        gsub(/^(-? ?)[A-Z ]+: ?/, "", line)
        ## strip "pretext (ACTION) posttext" => "pretext posttext"
        gsub(/(^-)? ?[\(\[][A-Z ]+[\)\]] ?/, " ", line)
    }

    ## convert multiple spaces to single space
    if (fix_whitespace) gsub(/ +/, " ", line)

    ## trim leading and trailing spaces, tabs and newlines
    if (fix_trim) gsub(/^[ \t\n]+|[ \t\n]+$/, "", line)

    return line
}
# Convert the leading "HH:MM:SS,mmm" of a string to a number of milliseconds.
# The four parts are read from fixed character positions (1-2, 4-5, 7-8 and
# 10-12); anything after the 12th character is ignored.
# Everything after `time` is a local variable.
function timestamp(time, _, _ts, _h, _m, _s, _ms) {
    _h  = substr(time, 1, 2)
    _m  = substr(time, 4, 2)
    _s  = substr(time, 7, 2)
    _ms = substr(time, 10, 3)

    _ts = _h * 3600000 + _m * 60000 + _s * 1000 + _ms
    return _ts
}
# Return the number of elements in array `a`.
# `_key` and `_total` are local variables.
function alen(a, _key, _total) {
    _total = 0
    for (_key in a) {
        _total = _total + 1
    }
    return _total
}
# Format a number of milliseconds as an SRT time, "HH:MM:SS,mmm".
# Negative values are clamped to zero (a shift must not produce fields such
# as ":-2,"), and the value is rounded to a whole millisecond once, before
# it is split, so floating-point error from the multiplier cannot drop a
# millisecond. Hours wrap at 24.
function _srt_time(ms, _, _hh, _mm, _ss) {
    if (ms < 0) {
        ms = 0
    }
    ms = int(ms + 0.5)
    _hh = int(ms / 3600000) % 24
    _mm = int(ms / 60000) % 60
    _ss = int(ms / 1000) % 60
    return sprintf("%02d:%02d:%02d,%03d", _hh, _mm, _ss, ms % 1000)
}

# Output one subtitle: its number, its re-timed timing line, its cleaned-up
# text lines and a terminating blank line.
#
# Parameters:
#   ln    - number to print for this subtitle
#   time  - timing line "HH:MM:SS,mmm --> HH:MM:SS,mmm"; the end time is read
#           from the 18th character onwards, so this fixed layout is required
#   lines - text lines of the subtitle, indexed 0..N-1 without gaps
# Everything after `lines` is a local variable.
#
# Both times are scaled by the global `multiplier` and then offset by the
# global `constant` (milliseconds).
#
# Returns the number of text lines printed, or -1 (and prints nothing) when
# no text is left after preprocessing.
function println(ln, time, lines, _, _start, _end, _line, _lines, _count, _total, _i, _out) {
    _start = timestamp(time) * multiplier + constant
    _end = timestamp(substr(time, 18)) * multiplier + constant

    # Walk the lines by index: the order of "for (key in array)" is
    # unspecified in awk and could scramble a multi-line subtitle.
    _total = alen(lines)
    _count = 0
    delete _lines
    for (_i = 0; _i < _total; _i++) {
        _line = preprocess(lines[_i])
        if (length(_line) > 0) {
            _lines[_count++] = _line
        }
    }

    if (_count == 0) {
        return -1
    }

    if (_count == 1 && remove_sdh) {
        ## remove dialogue dash for single-line subtitle
        sub(/^-[ \t]*/, "", _lines[0])
    }

    _out = "/dev/stdout"
    printf("%s\n", ln) > _out
    printf("%s --> %s\n", _srt_time(_start), _srt_time(_end)) > _out
    for (_i = 0; _i < _count; _i++) {
        printf("%s\n", _lines[_i]) > _out
    }
    printf("\n") > _out

    return _count
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment