Skip to content

Instantly share code, notes, and snippets.

@nimatrueway
Last active November 23, 2021 07:18
Show Gist options
  • Save nimatrueway/0b25126406b49438f7cbede69691522b to your computer and use it in GitHub Desktop.
Save nimatrueway/0b25126406b49438f7cbede69691522b to your computer and use it in GitHub Desktop.
A small tool to fix overlapping subtitles (especially those extracted from YouTube's English auto-generated subtitles, i.e. VTT files that you would convert to SRT with ffmpeg).
{
"name": "nima-scripts",
"version": "1.0.0",
"@comment dependencies": [
"// argparse: powerful argument parser",
"// https://github.com/nodeca/argparse",
"// subtitle: subtitle parser",
"// https://github.com/gsantiago/subtitle.js"
],
"dependencies": {
"@types/argparse": "^2.0.10",
"argparse": "^2.0.1",
"subtitle": "4.1.1"
},
"devDependencies": {
"@types/node": "^16.11.7",
"ts-node": "^10.4.0",
"typescript": "^4.4.4"
}
}
#!/usr/bin/env ts-node --transpile-only
import { ArgumentParser } from 'argparse';
import { parseSync, stringifySync, NodeList, Node, Cue, NodeCue } from 'subtitle'
import { copyFileSync, readFileSync, writeFileSync } from 'fs'
/** Shape the script expects from `parse_args()` on the parser built by `create_argument_parser`. */
interface ParsedArgs {
  /** Value of the positional `srt-file` argument: the path of the subtitle file to fix. */
  'srt-file': string;
}
/**
 * Builds the command-line parser for this script.
 *
 * The parser is created with help enabled and declares a single positional
 * argument named `srt-file`.
 *
 * @returns the configured `ArgumentParser`, ready for `parse_args()`
 */
function create_argument_parser(): ArgumentParser {
  const description =
    'A tool to fix overlapping subtitles (especially the ones extracted from english auto-subtitles of youtube, vtt files that you would convert to srt with ffmpeg)';
  const srt_file_help = 'srt file to process and fix (a backup will be created as srt-file.bak)';

  const cli = new ArgumentParser({ description, add_help: true });
  cli.add_argument('srt-file', { help: srt_file_help });
  return cli;
}
/**
 * Callback applied to each subtitle cue while the file is traversed. A cue is a
 * single timed piece of subtitle text.
 *
 * Both arguments may be mutated in place; `traverse` reuses the same cue objects
 * in its result, so changes made here end up in the written subtitle file.
 *
 * @param prev the cue visited immediately before `current`, for context-aware
 * modifiers; `null` when there is no such cue
 * @param current the cue being processed
 * @return `true` to keep `current` in the resulting subtitle,
 * `false` to drop it
 */
type IModifier = (prev: Cue | null, current: Cue) => boolean;
/**
 * Runs `modifier` over every cue node of `nodes`, in order, and collects the
 * cues the modifier decides to keep.
 *
 * Non-cue nodes are never passed to the modifier and are not copied into the
 * result; encountering one also resets the "previous cue" context to `null`.
 *
 * The returned nodes share their `data` objects with the input nodes, so a
 * later modifier call that mutates its `prev` argument also updates a cue that
 * was already collected.
 *
 * @param nodes parsed subtitle nodes to walk
 * @param modifier callback that may mutate the cues and returns whether to keep `current`
 * @returns a new array holding one cue node per kept cue, in input order
 */
function traverse(nodes: NodeList, modifier: IModifier) {
  const kept: NodeCue[] = [];

  // Type guard so the cue branch below needs no unchecked casts.
  const isNodeCue = (node: Node): node is NodeCue => node.type === 'cue';

  let prev: NodeCue | null = null;
  for (const current of nodes) {
    if (!isNodeCue(current)) {
      // A non-cue node breaks the chain: the next cue is processed without context.
      prev = null;
      continue;
    }

    if (modifier(prev?.data ?? null, current.data)) {
      // Append in place instead of rebuilding the array for every kept cue.
      kept.push({ type: 'cue', data: current.data });
    }

    // NOTE(review): `prev` becomes the cue just visited even when the modifier
    // dropped it, so the next call may receive a cue that is not in the output.
    // Confirm this is intended before changing it.
    prev = current;
  }
  return kept;
}
/**
 * A cue whose duration (`end - start`) is below this value and whose text
 * already appears in the previous cue is merged into the previous cue.
 * The `subtitle` package reports cue times in milliseconds.
 */
const SHORT_CUE_MAX_DURATION = 150;

/**
 * Modifier that cleans up a cue relative to the cue before it.
 *
 * Steps, in order:
 * 1. Trim surrounding whitespace from `current.text`; drop the cue if nothing is left.
 * 2. With no previous cue, keep `current` as is.
 * 3. Drop a very short cue whose text is already contained in `prev`, moving
 *    `prev.end` to `current.end`.
 * 4. Remove the first line of `current` when it repeats the last line of `prev`.
 *    If that leaves `current` empty, it is dropped and `prev.end` is moved to
 *    `current.end`, so no blank cue is emitted.
 * 5. If `current` starts before `prev` ends, pull `prev.end` back to just before
 *    `current.start` (never earlier than `prev.start`).
 *
 * @param prev the previously visited cue, or `null`; may be mutated (`end`)
 * @param current the cue being processed; may be mutated (`text`)
 * @return `true` to keep `current`, `false` to drop it
 */
const fix: IModifier = function (prev: Cue | null, current: Cue): boolean {
  // remove all beginning/trailing whitespace characters
  current.text = current.text.trim();

  // skip empty subtitles
  if (current.text.length === 0) {
    return false;
  }

  // no further processing needed when there is no previous cue
  if (prev == null) {
    return true;
  }

  // skip over super-short subtitles that only repeat what the previous subtitle
  // contains, and just move the end of the previous subtitle instead
  const isShort = current.end - current.start < SHORT_CUE_MAX_DURATION;
  if (isShort && prev.text.includes(current.text)) {
    prev.end = current.end;
    return false;
  }

  // if the first line of the current subtitle repeats the last line of the
  // previous subtitle, remove it
  const currentLines = current.text.split(/\n/g);
  const prevLines = prev.text.split(/\n/g);
  if (currentLines[0] === prevLines[prevLines.length - 1]) {
    const remaining = currentLines.slice(1).join('\n');
    if (remaining.length === 0) {
      // The whole cue was a repetition: handle it like the short-duplicate case
      // above rather than emitting a cue with empty text.
      prev.end = current.end;
      return false;
    }
    current.text = remaining;
  }

  // if the current subtitle starts before the previous one ends, end the
  // previous one just before the current one starts; clamp so the previous
  // subtitle never ends before it starts
  if (current.start < prev.end) {
    prev.end = Math.max(prev.start, current.start - 1);
  }
  return true;
};
// Script entry point: read the file named on the command line, run every cue
// through `fix`, copy the original to `<srt-file>.bak`, then overwrite the file
// with the fixed subtitles in SRT format.
const parsed_args = create_argument_parser().parse_args() as ParsedArgs
const srt_path = parsed_args['srt-file']
const backup_path = `${srt_path}.bak`

const file_content = readFileSync(srt_path, { encoding: "utf-8" })
const nodes = parseSync(file_content)
const modified_nodes = traverse(nodes, fix)

// The backup is taken only after reading, parsing and fixing, and before the
// result is serialized and written.
copyFileSync(srt_path, backup_path);
writeFileSync(srt_path, stringifySync(modified_nodes, { format: 'SRT' }));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment