empathicqubit/srtmerge.js

## srtmerge.js
const fs = require('fs');
const util = require('util');
const net = require('net');
const child_process = require('child_process');

// ANSI SGR escape sequences for terminal styling, keyed by a readable name.
// Foreground colours use codes 30-37 and background colours 40-47, in the
// same colour order, so both groups are generated from a single name list.
const colors = (() => {
    const sgr = (code) => `\x1b[${code}m`;
    const hues = ['Black', 'Red', 'Green', 'Yellow', 'Blue', 'Magenta', 'Cyan', 'White'];

    const table = {
        Reset: sgr(0),
        Bright: sgr(1),
        Dim: sgr(2),
        Underscore: sgr(4),
        Blink: sgr(5),
        Reverse: sgr(7),
        Hidden: sgr(8),
    };

    hues.forEach((hue, offset) => {
        table[`Fg${hue}`] = sgr(30 + offset);
    });
    hues.forEach((hue, offset) => {
        table[`Bg${hue}`] = sgr(40 + offset);
    });

    return table;
})();

/**
 * Parses SubRip (.srt) text into an array of cue objects.
 *
 * Each cue has integer fields `sequence`, `startHours`, `startMinutes`,
 * `startSeconds`, `startMillis`, `endHours`, `endMinutes`, `endSeconds`,
 * `endMillis`, and a `text` string. The text is cleaned up by removing
 * leading dialogue dashes, parenthesised remarks and upper-case labels
 * followed by a colon at the start of a line.
 *
 * @param {string} text - Contents of an .srt file (LF or CRLF line endings).
 * @returns {Array<object>} Cues in the order they appear in the input.
 */
const parse = (text) => {
    // A cue ends at a blank line OR at the end of the input, so the final cue
    // is kept even when the file does not finish with an empty line.
    const cueRex = /(\d+)(?:\r\n|\n)(\d+)\s*:\s*(\d+)\s*:\s*(\d+)\s*,\s*(\d+)\s*-->\s*(\d+)\s*:\s*(\d+)\s*:\s*(\d+)\s*,\s*(\d+)(?:\r\n|\n)((?:.|\r\n|\n)+?)(?:(?:\r\n|\n){2}|\s*$)/g;

    // Applied in order to every cue's text.
    const replacements = [
        [/^\s*-\s*/gim, ''],
        [/\(.*?\)/gim, ''],
        [/^\s*[A-Z0-9\s]+:/gm, ''],
    ];

    const lines = [];
    let cueMatch;
    while((cueMatch = cueRex.exec(text)) !== null) {
        const [
            sequence,
            startHours,
            startMinutes,
            startSeconds,
            startMillis,
            endHours,
            endMinutes,
            endSeconds,
            endMillis,
        ] = cueMatch.slice(1, 10).map(digits => parseInt(digits, 10));

        let cueText = cueMatch[10];
        for(const [pattern, substitute] of replacements) {
            cueText = cueText.replace(pattern, substitute);
        }

        lines.push({
            sequence,
            startHours,
            startMinutes,
            startSeconds,
            startMillis,
            endHours,
            endMinutes,
            endSeconds,
            endMillis,
            text: cueText,
        });
    }

    return lines;
};


/**
 * Looks a word up on the DICT server listening on 127.0.0.1:2628.
 *
 * Sends `MATCH <dict> lev <word>`, then a `DEFINE` for every match returned
 * (an exact, case-insensitive match first, then longer matches before shorter
 * ones), and collects the text of every definition block received.
 *
 * @param {string} dict - Name of the dictionary database to query.
 * @param {string} word - Word to look up.
 * @returns {Promise<string[]>} Definition texts; empty when the server
 *     reports no match (552).
 * @throws {Error} (as a rejection) on connection errors, on a status line
 *     that does not start with a number, on 4xx/5xx replies other than 552,
 *     or when the server closes the connection before the lookup completes.
 */
const lookup = async (dict, word) => {
    return new Promise((res, rej) => {
        const dictConn = new net.Socket({});
        // Decode at the stream level so a multi-byte character split across
        // two TCP chunks is reassembled instead of being corrupted.
        dictConn.setEncoding('utf8');

        const _write = util.promisify(dictConn.write.bind(dictConn));

        // Status code of the response currently being read;
        // -1 means the next line must be a status line.
        let commandId = -1;
        const defs = [];
        let def = [];
        const matches = [];
        // The MATCH response is itself terminated by a "250"; it must not be
        // counted as the completion of a DEFINE.
        let matchAcked = false;
        let pendingDefines = 0;
        let settled = false;
        // Unterminated tail of the previous chunk.
        let buffered = '';
        // Chunks are processed strictly one after another, even though
        // handling a line may await a socket write.
        let queue = Promise.resolve();

        // Resolves (no error) or rejects exactly once and releases the socket.
        const settle = (err) => {
            if(settled) {
                return;
            }
            settled = true;
            dictConn.off('data', dataHook);
            if(err) {
                dictConn.destroy();
                rej(err);
            }
            else {
                dictConn.end('QUIT\r\n');
                res(defs);
            }
        };

        // Feeds one complete line to the protocol state machine.
        // Returns true once the lookup is finished.
        const handleLine = async (line) => {
            if(line == '') {
                return false;
            }

            if(commandId == 152) {
                if(line != '.') {
                    matches.push(/"(.*)"/.exec(line)[1]);
                    return false;
                }

                // End of the match list: request every definition.
                commandId = -1;
                const wanted = word.toLowerCase();
                matches.sort((a, b) => ((wanted == b) - (wanted == a)) * 0xffffff + (b.length - a.length));
                pendingDefines = matches.length;
                for(const match of matches) {
                    // Quoted so that multi-word matches stay a single argument.
                    await _write(`DEFINE ${dict} "${match}"\r\n`, 'utf8');
                }
                return false;
            }

            if(commandId == -1) {
                commandId = parseInt(line, 10);
                if(Number.isNaN(commandId)) {
                    throw new Error(`Invalid response: ${line}`);
                }
            }

            if(commandId == 152) {
                // Status line announcing the match list; entries follow.
                return false;
            }

            if(commandId == 151) {
                if(line == '.') {
                    commandId = -1;
                    defs.push(def.join('\n'));
                    def = [];
                }
                else {
                    def.push(line.replace(new RegExp('^' + commandId), ''));
                }
                return false;
            }

            if(commandId == 551) {
                throw new Error('Invalid command');
            }

            if(commandId == 552) {
                // No match: finish with whatever has been collected.
                return true;
            }

            if(commandId >= 400) {
                // Any other failure would otherwise leave the promise pending.
                throw new Error(`Server error: ${line}`);
            }

            const commandCompleted = commandId == 250;
            commandId = -1;
            if(!commandCompleted) {
                return false;
            }

            if(!matchAcked) {
                matchAcked = true;
                return pendingDefines == 0;
            }

            pendingDefines--;
            return pendingDefines <= 0;
        };

        const dataHook = (data) => {
            buffered += data;
            const lines = buffered.split(/\r\n|\n/g);
            // The last element is an incomplete line (or ''); keep it for the next chunk.
            buffered = lines.pop();

            queue = queue.then(async () => {
                for(const line of lines) {
                    if(settled) {
                        return;
                    }
                    if(await handleLine(line)) {
                        settle(null);
                        return;
                    }
                }
            }).catch(settle);
        };

        // Listeners are attached before connecting so that a refused
        // connection rejects the promise instead of raising an unhandled
        // 'error' event. The error listener stays attached after settling so
        // late socket errors are absorbed.
        dictConn.on('error', settle);
        dictConn.on('close', () => settle(new Error('Connection closed before the lookup completed')));
        dictConn.on('data', dataHook);

        dictConn.connect({
            host: '127.0.0.1',
            port: 2628,
        }, () => {
            _write(`MATCH ${dict} lev ${word}\r\n`, 'utf8').catch(settle);
        });
    });
};

/**
 * Finds words of a target-language subtitle line whose dictionary gloss
 * occurs in the corresponding native-language line.
 *
 * Every distinct run of letters in `targetLine.text` is looked up in the
 * `<targetLanguage>-<nativeLanguage>` dictionary. The first definition whose
 * headword (optionally preceded by "the", "a", "an" or "to") is found in
 * `nativeLine.text` yields a pair. Each word is echoed to stdout, green when
 * paired and red otherwise.
 *
 * @param {{text: string}} targetLine - Cue in the language being learned.
 * @param {{text: string}} nativeLine - Cue in the learner's language.
 * @param {string} targetLanguage - First half of the dictionary name.
 * @param {string} nativeLanguage - Second half of the dictionary name.
 * @returns {Promise<Array<{native: string, target: string}>>} Matched pairs.
 */
const getWords = async(targetLine, nativeLine, targetLanguage, nativeLanguage) => {
    const targetText = targetLine.text;
    const nativeText = nativeLine.text;
    const dictionary = `${targetLanguage}-${nativeLanguage}`;
    const glossRex = /^\s\s\s(((the|an?|to)\s+)?(\p{L}*))/umi;

    // A Set keeps first-appearance order while dropping repeated words.
    const candidates = new Set();
    const wordRex = /\p{L}+/gum;
    for(let hit = wordRex.exec(targetText); hit; hit = wordRex.exec(targetText)) {
        candidates.add(hit[0]);
    }

    // Returns the gloss of the first definition found in the native text, or null.
    const findGloss = (defs) => {
        for(const defText of defs) {
            const glossMatch = glossRex.exec(defText);
            const keyword = glossMatch && (glossMatch[4] || glossMatch[3]);
            if(keyword && new RegExp(`${keyword}`, 'gium').test(nativeText)) {
                return glossMatch[1];
            }
        }
        return null;
    };

    const pairs = [];
    for(const candidate of candidates) {
        const gloss = findGloss(await lookup(dictionary, candidate));
        if(gloss === null) {
            process.stdout.write(colors.FgRed + candidate + ' ' + colors.Reset);
            continue;
        }

        process.stdout.write(colors.FgGreen + candidate + ' ' + colors.Reset);
        pairs.push({
            native: gloss,
            target: candidate,
        });
    }

    return pairs;
};

/**
 * Command-line entry point.
 *
 * Reads a native-language and a target-language .srt file, pairs cues that
 * start in the same second, optionally extracts word pairs through the local
 * dictionary, and writes a `|`-separated file: word pairs first, then
 * sentence pairs.
 *
 * Arguments are taken from `process.argv`: native srt path, native language
 * name, target srt path, target language name, output path. With fewer
 * arguments a usage message is printed and nothing else happens.
 *
 * @returns {Promise<void>}
 */
const main = async() => {
    const [
        processPath,
        scriptPath,
        nativePath,
        nativeLanguage,
        targetPath,
        targetLanguage,
        csvPath,
    ] = process.argv;

    if(process.argv.length < 7) {
        console.log(`
    Converts subtitles files to an Anki flashcard CSV.

    Syntax:
    "${processPath}" "${scriptPath}" <native language srt> <english name of native language> <target language srt> <english name of target language> <csv filename>
    `);
        return;
    }

    const dictName = `${targetLanguage}-${nativeLanguage}`;

    let dictExists = false;
    try {
        const results = await util.promisify(child_process.execFile)('dict', ['-D', '-f']);
        // The third tab-separated column of each output line is compared with the dictionary name.
        dictExists = results.stdout
            .split(/[\r\n]+/g)
            .map(x => x.split('\t')[2])
            .includes(dictName);
    }
    catch(e) {
        // A missing or failing `dict` client is not fatal: sentences are still exported.
        console.error(e);
    }

    if(!dictExists) {
        console.error(`Dictionary ${dictName} not found. Make sure dictd is correctly installed`);
    }

    const nativeLines = parse(fs.readFileSync(nativePath, 'utf8'));
    const targetLines = parse(fs.readFileSync(targetPath, 'utf8'));

    // Index native cues by start time (to the second); the first cue wins
    // when several start within the same second.
    const startKey = cue => `${cue.startHours}:${cue.startMinutes}:${cue.startSeconds}`;
    const nativeByStart = new Map();
    for(const nativeLine of nativeLines) {
        const key = startKey(nativeLine);
        if(!nativeByStart.has(key)) {
            nativeByStart.set(key, nativeLine);
        }
    }

    const matchLines = [];
    for(const targetLine of targetLines) {
        const nativeLine = nativeByStart.get(startKey(targetLine));
        if(!nativeLine) {
            continue;
        }

        matchLines.push({
            native: nativeLine,
            target: targetLine,
            words: dictExists
                ? await getWords(targetLine, nativeLine, targetLanguage, nativeLanguage)
                : [],
        });
    }

    // Case-insensitive de-duplication that keeps the first occurrence.
    const seenPairs = new Set();
    const uniqWords = [];
    for(const word of matchLines.flatMap(x => x.words)) {
        const key = `${word.target.toLowerCase()}\u0000${word.native.toLowerCase()}`;
        if(seenPairs.has(key)) {
            continue;
        }

        seenPairs.add(key);
        uniqWords.push(word);
    }

    console.log(`${matchLines.length}/${targetLines.length} lines matched.`);
    console.log(`${uniqWords.length} unique words matched.`);

    // The whole file is written in one call so that row order is deterministic.
    const rows = [
        ...uniqWords.map(x => `${x.target}|${x.native}\n`),
        ...matchLines.map(matchLine =>
            `${matchLine.target.text.replace(/[\r\n]+/gim, ' ')}|${matchLine.native.text.replace(/[\r\n]+/gim, ' ')}\n`
        ),
    ];
    await util.promisify(fs.writeFile)(csvPath, rows.join(''));
};

// Run the program and turn its outcome into the process exit status:
// 0 on success, 1 (after printing the error) on failure.
main().then(
    () => 0,
    (failure) => {
        console.error(failure);
        return 1;
    },
).then((exitCode) => process.exit(exitCode));
	const fs = require('fs');
	const util = require('util');
	const net = require('net');
	const child_process = require('child_process');

	// ANSI SGR escape sequences for terminal styling, keyed by a readable name.
	// Foreground colours use codes 30-37 and background colours 40-47, in the
	// same colour order, so both groups are generated from a single name list.
	const colors = (() => {
	    const sgr = (code) => `\x1b[${code}m`;
	    const hues = ['Black', 'Red', 'Green', 'Yellow', 'Blue', 'Magenta', 'Cyan', 'White'];

	    const table = {
	        Reset: sgr(0),
	        Bright: sgr(1),
	        Dim: sgr(2),
	        Underscore: sgr(4),
	        Blink: sgr(5),
	        Reverse: sgr(7),
	        Hidden: sgr(8),
	    };

	    hues.forEach((hue, offset) => {
	        table[`Fg${hue}`] = sgr(30 + offset);
	    });
	    hues.forEach((hue, offset) => {
	        table[`Bg${hue}`] = sgr(40 + offset);
	    });

	    return table;
	})();

	/**
	 * Parses SubRip (.srt) text into an array of cue objects.
	 *
	 * Each cue has integer fields `sequence`, `startHours`, `startMinutes`,
	 * `startSeconds`, `startMillis`, `endHours`, `endMinutes`, `endSeconds`,
	 * `endMillis`, and a `text` string. The text is cleaned up by removing
	 * leading dialogue dashes, parenthesised remarks and upper-case labels
	 * followed by a colon at the start of a line.
	 *
	 * @param {string} text - Contents of an .srt file (LF or CRLF line endings).
	 * @returns {Array<object>} Cues in the order they appear in the input.
	 */
	const parse = (text) => {
	    // A cue ends at a blank line OR at the end of the input, so the final cue
	    // is kept even when the file does not finish with an empty line.
	    const cueRex = /(\d+)(?:\r\n|\n)(\d+)\s*:\s*(\d+)\s*:\s*(\d+)\s*,\s*(\d+)\s*-->\s*(\d+)\s*:\s*(\d+)\s*:\s*(\d+)\s*,\s*(\d+)(?:\r\n|\n)((?:.|\r\n|\n)+?)(?:(?:\r\n|\n){2}|\s*$)/g;

	    // Applied in order to every cue's text.
	    const replacements = [
	        [/^\s*-\s*/gim, ''],
	        [/\(.*?\)/gim, ''],
	        [/^\s*[A-Z0-9\s]+:/gm, ''],
	    ];

	    const lines = [];
	    let cueMatch;
	    while((cueMatch = cueRex.exec(text)) !== null) {
	        const [
	            sequence,
	            startHours,
	            startMinutes,
	            startSeconds,
	            startMillis,
	            endHours,
	            endMinutes,
	            endSeconds,
	            endMillis,
	        ] = cueMatch.slice(1, 10).map(digits => parseInt(digits, 10));

	        let cueText = cueMatch[10];
	        for(const [pattern, substitute] of replacements) {
	            cueText = cueText.replace(pattern, substitute);
	        }

	        lines.push({
	            sequence,
	            startHours,
	            startMinutes,
	            startSeconds,
	            startMillis,
	            endHours,
	            endMinutes,
	            endSeconds,
	            endMillis,
	            text: cueText,
	        });
	    }

	    return lines;
	};


	/**
	 * Looks a word up on the DICT server listening on 127.0.0.1:2628.
	 *
	 * Sends `MATCH <dict> lev <word>`, then a `DEFINE` for every match returned
	 * (an exact, case-insensitive match first, then longer matches before shorter
	 * ones), and collects the text of every definition block received.
	 *
	 * @param {string} dict - Name of the dictionary database to query.
	 * @param {string} word - Word to look up.
	 * @returns {Promise<string[]>} Definition texts; empty when the server
	 *     reports no match (552).
	 * @throws {Error} (as a rejection) on connection errors, on a status line
	 *     that does not start with a number, on 4xx/5xx replies other than 552,
	 *     or when the server closes the connection before the lookup completes.
	 */
	const lookup = async (dict, word) => {
	    return new Promise((res, rej) => {
	        const dictConn = new net.Socket({});
	        // Decode at the stream level so a multi-byte character split across
	        // two TCP chunks is reassembled instead of being corrupted.
	        dictConn.setEncoding('utf8');

	        const _write = util.promisify(dictConn.write.bind(dictConn));

	        // Status code of the response currently being read;
	        // -1 means the next line must be a status line.
	        let commandId = -1;
	        const defs = [];
	        let def = [];
	        const matches = [];
	        // The MATCH response is itself terminated by a "250"; it must not be
	        // counted as the completion of a DEFINE.
	        let matchAcked = false;
	        let pendingDefines = 0;
	        let settled = false;
	        // Unterminated tail of the previous chunk.
	        let buffered = '';
	        // Chunks are processed strictly one after another, even though
	        // handling a line may await a socket write.
	        let queue = Promise.resolve();

	        // Resolves (no error) or rejects exactly once and releases the socket.
	        const settle = (err) => {
	            if(settled) {
	                return;
	            }
	            settled = true;
	            dictConn.off('data', dataHook);
	            if(err) {
	                dictConn.destroy();
	                rej(err);
	            }
	            else {
	                dictConn.end('QUIT\r\n');
	                res(defs);
	            }
	        };

	        // Feeds one complete line to the protocol state machine.
	        // Returns true once the lookup is finished.
	        const handleLine = async (line) => {
	            if(line == '') {
	                return false;
	            }

	            if(commandId == 152) {
	                if(line != '.') {
	                    matches.push(/"(.*)"/.exec(line)[1]);
	                    return false;
	                }

	                // End of the match list: request every definition.
	                commandId = -1;
	                const wanted = word.toLowerCase();
	                matches.sort((a, b) => ((wanted == b) - (wanted == a)) * 0xffffff + (b.length - a.length));
	                pendingDefines = matches.length;
	                for(const match of matches) {
	                    // Quoted so that multi-word matches stay a single argument.
	                    await _write(`DEFINE ${dict} "${match}"\r\n`, 'utf8');
	                }
	                return false;
	            }

	            if(commandId == -1) {
	                commandId = parseInt(line, 10);
	                if(Number.isNaN(commandId)) {
	                    throw new Error(`Invalid response: ${line}`);
	                }
	            }

	            if(commandId == 152) {
	                // Status line announcing the match list; entries follow.
	                return false;
	            }

	            if(commandId == 151) {
	                if(line == '.') {
	                    commandId = -1;
	                    defs.push(def.join('\n'));
	                    def = [];
	                }
	                else {
	                    def.push(line.replace(new RegExp('^' + commandId), ''));
	                }
	                return false;
	            }

	            if(commandId == 551) {
	                throw new Error('Invalid command');
	            }

	            if(commandId == 552) {
	                // No match: finish with whatever has been collected.
	                return true;
	            }

	            if(commandId >= 400) {
	                // Any other failure would otherwise leave the promise pending.
	                throw new Error(`Server error: ${line}`);
	            }

	            const commandCompleted = commandId == 250;
	            commandId = -1;
	            if(!commandCompleted) {
	                return false;
	            }

	            if(!matchAcked) {
	                matchAcked = true;
	                return pendingDefines == 0;
	            }

	            pendingDefines--;
	            return pendingDefines <= 0;
	        };

	        const dataHook = (data) => {
	            buffered += data;
	            const lines = buffered.split(/\r\n|\n/g);
	            // The last element is an incomplete line (or ''); keep it for the next chunk.
	            buffered = lines.pop();

	            queue = queue.then(async () => {
	                for(const line of lines) {
	                    if(settled) {
	                        return;
	                    }
	                    if(await handleLine(line)) {
	                        settle(null);
	                        return;
	                    }
	                }
	            }).catch(settle);
	        };

	        // Listeners are attached before connecting so that a refused
	        // connection rejects the promise instead of raising an unhandled
	        // 'error' event. The error listener stays attached after settling so
	        // late socket errors are absorbed.
	        dictConn.on('error', settle);
	        dictConn.on('close', () => settle(new Error('Connection closed before the lookup completed')));
	        dictConn.on('data', dataHook);

	        dictConn.connect({
	            host: '127.0.0.1',
	            port: 2628,
	        }, () => {
	            _write(`MATCH ${dict} lev ${word}\r\n`, 'utf8').catch(settle);
	        });
	    });
	};

	/**
	 * Finds words of a target-language subtitle line whose dictionary gloss
	 * occurs in the corresponding native-language line.
	 *
	 * Every distinct run of letters in `targetLine.text` is looked up in the
	 * `<targetLanguage>-<nativeLanguage>` dictionary. The first definition whose
	 * headword (optionally preceded by "the", "a", "an" or "to") is found in
	 * `nativeLine.text` yields a pair. Each word is echoed to stdout, green when
	 * paired and red otherwise.
	 *
	 * @param {{text: string}} targetLine - Cue in the language being learned.
	 * @param {{text: string}} nativeLine - Cue in the learner's language.
	 * @param {string} targetLanguage - First half of the dictionary name.
	 * @param {string} nativeLanguage - Second half of the dictionary name.
	 * @returns {Promise<Array<{native: string, target: string}>>} Matched pairs.
	 */
	const getWords = async(targetLine, nativeLine, targetLanguage, nativeLanguage) => {
	    const searchWords = [];
	    const wordRex = /\p{L}+/gum;
	    const targetText = targetLine.text;
	    const nativeText = nativeLine.text;
	    let wordMatch;
	    while((wordMatch = wordRex.exec(targetText)) !== null) {
	        const searchWord = wordMatch[0];
	        if(!searchWords.includes(searchWord)) {
	            searchWords.push(searchWord);
	        }
	    }

	    // Three leading spaces, an optional article or "to", then the headword.
	    const rex = /^\s\s\s(((the|an?|to)\s+)?(\p{L}*))/umi;

	    const results = [];
	    outer: for(const searchWord of searchWords) {
	        const defs = await lookup(`${targetLanguage}-${nativeLanguage}`, searchWord);

	        for(const defText of defs) {
	            const defMatch = rex.exec(defText);
	            if(!defMatch) {
	                continue;
	            }

	            // Prefer the headword; fall back to the article when nothing follows it.
	            const def = defMatch[4] || defMatch[3];
	            if(!def) {
	                continue;
	            }

	            if(!new RegExp(`${def}`, 'gium').test(nativeText)) {
	                continue;
	            }

	            process.stdout.write(colors.FgGreen + searchWord + ' ' + colors.Reset);
	            results.push({
	                native: defMatch[1],
	                target: searchWord,
	            });
	            continue outer;
	        }

	        process.stdout.write(colors.FgRed + searchWord + ' ' + colors.Reset);
	    }

	    return results;
	};

	/**
	 * Command-line entry point.
	 *
	 * Reads a native-language and a target-language .srt file, pairs cues that
	 * start in the same second, optionally extracts word pairs through the local
	 * dictionary, and writes a `|`-separated file: word pairs first, then
	 * sentence pairs.
	 *
	 * Arguments are taken from `process.argv`: native srt path, native language
	 * name, target srt path, target language name, output path. With fewer
	 * arguments a usage message is printed and nothing else happens.
	 *
	 * @returns {Promise<void>}
	 */
	const main = async() => {
	    const [
	        processPath,
	        scriptPath,
	        nativePath,
	        nativeLanguage,
	        targetPath,
	        targetLanguage,
	        csvPath,
	    ] = process.argv;

	    if(process.argv.length < 7) {
	        console.log(`
	Converts subtitles files to an Anki flashcard CSV.

	Syntax:
	"${processPath}" "${scriptPath}" <native language srt> <english name of native language> <target language srt> <english name of target language> <csv filename>
	`);
	        return;
	    }

	    const dictName = `${targetLanguage}-${nativeLanguage}`;

	    let dictExists = false;
	    try {
	        const results = await util.promisify(child_process.execFile)('dict', ['-D', '-f']);
	        // The third tab-separated column of each output line is compared with the dictionary name.
	        dictExists = results.stdout
	            .split(/[\r\n]+/g)
	            .map(x => x.split('\t')[2])
	            .includes(dictName);
	    }
	    catch(e) {
	        // A missing or failing `dict` client is not fatal: sentences are still exported.
	        console.error(e);
	    }

	    if(!dictExists) {
	        console.error(`Dictionary ${dictName} not found. Make sure dictd is correctly installed`);
	    }

	    const nativeLines = parse(fs.readFileSync(nativePath, 'utf8'));
	    const targetLines = parse(fs.readFileSync(targetPath, 'utf8'));

	    // Index native cues by start time (to the second); the first cue wins
	    // when several start within the same second.
	    const startKey = cue => `${cue.startHours}:${cue.startMinutes}:${cue.startSeconds}`;
	    const nativeByStart = new Map();
	    for(const nativeLine of nativeLines) {
	        const key = startKey(nativeLine);
	        if(!nativeByStart.has(key)) {
	            nativeByStart.set(key, nativeLine);
	        }
	    }

	    const matchLines = [];
	    for(const targetLine of targetLines) {
	        const nativeLine = nativeByStart.get(startKey(targetLine));
	        if(!nativeLine) {
	            continue;
	        }

	        matchLines.push({
	            native: nativeLine,
	            target: targetLine,
	            words: dictExists
	                ? await getWords(targetLine, nativeLine, targetLanguage, nativeLanguage)
	                : [],
	        });
	    }

	    // Case-insensitive de-duplication that keeps the first occurrence.
	    const seenPairs = new Set();
	    const uniqWords = [];
	    for(const word of matchLines.flatMap(x => x.words)) {
	        const key = `${word.target.toLowerCase()}\u0000${word.native.toLowerCase()}`;
	        if(seenPairs.has(key)) {
	            continue;
	        }

	        seenPairs.add(key);
	        uniqWords.push(word);
	    }

	    console.log(`${matchLines.length}/${targetLines.length} lines matched.`);
	    console.log(`${uniqWords.length} unique words matched.`);

	    // The whole file is written in one call so that row order is deterministic.
	    const rows = [
	        ...uniqWords.map(x => `${x.target}|${x.native}\n`),
	        ...matchLines.map(matchLine =>
	            `${matchLine.target.text.replace(/[\r\n]+/gim, ' ')}|${matchLine.native.text.replace(/[\r\n]+/gim, ' ')}\n`
	        ),
	    ];
	    await util.promisify(fs.writeFile)(csvPath, rows.join(''));
	};

	// Run the program and turn its outcome into the process exit status:
	// 0 on success, 1 (after printing the error) on failure.
	main().then(
	    () => 0,
	    (failure) => {
	        console.error(failure);
	        return 1;
	    },
	).then((exitCode) => process.exit(exitCode));