Skip to content

Instantly share code, notes, and snippets.

@avioli
Created September 29, 2016 22:17
Show Gist options
  • Save avioli/d72cfe1ee9b310c05ee844ca3e609789 to your computer and use it in GitHub Desktop.
Save avioli/d72cfe1ee9b310c05ee844ca3e609789 to your computer and use it in GitHub Desktop.
A SubRip subtitle fixer
# SRT fixer
# Version: 1.0
# Author: Evo Stamatov <aviolito@gmail.com>
# USAGE:
########
# To re-number the lines and do whitespace trimming:
# cat <filename>.srt | awk -f path/to/srt.awk
# To shift all subtitles 3s earlier (towards the beginning):
# cat <filename>.srt | awk -v constant=-3000 -f path/to/srt.awk
# To stretch the timing by 10% and then shift it 300ms later - 00:01:00,000 becomes 00:01:06,300:
# cat <filename>.srt | awk -v multiplier=1.1 -v constant=300 -f path/to/srt.awk
# To remove Subtitles for the Deaf or Hard-of-hearing (SDH):
# cat <filename>.srt | awk -v remove_sdh=1 -f path/to/srt.awk
# NOTES:
########
# If there is no output, check that your input is UTF-8 with "file -I <filename>"
# (BSD/macOS; use "file -i <filename>" on Linux), then convert it if needed
# using "iconv -f <IN-ENC> -t UTF-8 <filename>"
# The multiplier is always applied first and the constant added second.
# If you need it the other way around - run once with the constant only,
# then again with only the multiplier:
# cat <filename>.srt | awk -v constant=300 -f path/to/srt.awk | awk -v multiplier=1.1 -f path/to/srt.awk
# Tested with BSD awk and GNU awk 4.1.4
# FORMAT SPEC: https://en.wikipedia.org/wiki/SubRip#SubRip_text_file_format
BEGIN {
    ## private globals
    debug = 0   # enable debug mode
    __ln = 0    # number given to the most recently emitted subtitle

    # An empty RS puts awk in paragraph mode: records are separated by blank
    # lines, so every subtitle block arrives as one record. With FS set to a
    # newline each line of the block becomes one field.
    RS = ""
    ORS = ""
    FS = "\n"
    OFS = "\n"

    ## public options (override with -v name=value); a value that was not
    ## supplied is the empty string, in which case the default is used
    if (multiplier == "") {
        multiplier = 1
    }
    if (constant == "") {
        constant = 0
    }

    ## advanced options
    if (remove_sdh == "") {
        remove_sdh = 0      # remove subtitles for the deaf or hard-of-hearing
    }
    if (fix_whitespace == "") {
        fix_whitespace = 1  # consolidate whitespace
    }
    if (fix_trim == "") {
        fix_trim = 1        # trim whitespace
    }

    ## not implemented yet:
    # fix_short = 0 # fix short subtitles
    # fix_overlap = 0 # fix overlapping subtitle times
    # remove_tags = 1 # remove html tags
    # http://translationjournal.net/journal/04stndrd.htm
    # min_duration = 1500 # minimum duration of a single subtitle
    # min_spacing = 250 # minimum duration between two consecutive subtitles
}
# Main rule: runs once per subtitle block (one paragraph-mode record).
# Field 1 is the original subtitle number (ignored, blocks are re-numbered),
# field 2 is the timing line and fields 3..NF are the text lines.
{
    # start from an empty text-line array for this record
    split("", __lines)

    # copy the text lines into a 0-based array; field 3 is always stored,
    # even when the record has fewer than three fields
    n = 0
    i = 3
    do {
        __lines[n++] = $i
        i++
    } while (i <= NF)

    # the running number only advances when the subtitle was actually printed
    # (println returns -1 when no text is left after preprocessing)
    if (println(__ln + 1, $2, __lines) != -1) {
        __ln++
    }
    # if (__ln > 5) { exit } # debug
}
# Clean up a single line of subtitle text according to the global option
# flags remove_sdh, fix_whitespace and fix_trim, and return the result.
#   line - one text line of a subtitle
function preprocess(line, _, _text) {
    _text = line

    # drop hearing-impaired annotations
    if (remove_sdh) {
        gsub(/^(-? ?)[A-Z ]+: ?/, "", _text) ## strip "NAME: text" => "text"
        gsub(/(^-)? ?[\(\[][A-Z ]+[\)\]] ?/, " ", _text) ## strip "pretext (ACTION) posttext" => "pretext posttext"
    }

    # collapse every run of spaces into a single space
    if (fix_whitespace) {
        gsub(/ +/, " ", _text)
    }

    # strip leading and trailing spaces, tabs and newlines
    if (fix_trim) {
        gsub(/^[ \t\n]+|[ \t\n]+$/, "", _text)
    }

    return _text
}
# Convert a time starting with "HH:MM:SS,mmm" into a number of milliseconds.
# Only the first 12 characters are read, at fixed columns, so anything after
# the time (e.g. the rest of a " --> " timing line) is ignored.
function timestamp(time, _, _ms) {
    _ms = substr(time, 10, 3) + 0               # milliseconds
    _ms += substr(time, 7, 2) * 1000            # seconds
    _ms += substr(time, 4, 2) * 60000           # minutes
    _ms += substr(time, 1, 2) * 3600000         # hours
    return _ms
}
# Count the elements of array a by visiting every key once.
function alen(a, _key, _total) {
    _total = 0
    for (_key in a) {
        _total += 1
    }
    return _total
}
# Format a millisecond count as an SRT "HH:MM:SS,mmm" time.
# Negative values are clamped to zero (a shift can move a time before the
# start) and the value is rounded to the nearest millisecond, because a
# fractional multiplier can yield a result just below a whole millisecond.
function _srt_time(ms) {
    if (ms < 0) {
        ms = 0
    }
    ms = int(ms + 0.5)
    return sprintf("%02d:%02d:%02d,%03d", int(ms / 3600000) % 24, int(ms / 60000) % 60, int(ms / 1000) % 60, ms % 1000)
}

# Output one subtitle block: number, adjusted timing line, cleaned text lines
# and a terminating blank line.
#   ln    - number to print for this subtitle
#   time  - timing line "HH:MM:SS,mmm --> HH:MM:SS,mmm" (fixed columns)
#   lines - text lines, indexed 0, 1, 2, ... without gaps
# Both times are transformed as: time * multiplier + constant.
# Returns the number of text lines printed, or -1 (printing nothing) when no
# text is left after preprocessing.
function println(ln, time, lines, _, _start, _end, _i, _line, _lines, _count, _out) {
    _start = timestamp(time) * multiplier + constant
    _end = timestamp(substr(time, 18)) * multiplier + constant   # end time begins at column 18

    # Walk the lines by numeric index: the order of "for (key in array)" is
    # unspecified, so it cannot be used to keep the text lines in order.
    split("", _lines)
    _count = 0
    for (_i = 0; (_i in lines); _i++) {
        _line = preprocess(lines[_i])
        if (length(_line) > 0) {
            _lines[_count++] = _line
        }
    }
    if (_count == 0) {
        return -1
    }

    if (_count == 1 && remove_sdh) {
        sub(/^-[ \t]*/, "", _lines[0]) ## remove dialogue dash for single-line subtitle
    }

    _out = "/dev/stdout"
    printf("%s\n", ln) > _out
    printf("%s --> %s\n", _srt_time(_start), _srt_time(_end)) > _out
    for (_i = 0; _i < _count; _i++) {
        printf("%s\n", _lines[_i]) > _out
    }
    printf("\n") > _out

    return _count
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment