Skip to content

Instantly share code, notes, and snippets.

@empathicqubit
Last active October 1, 2021 05:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save empathicqubit/505e53d280c0e94c58882b49e7bcb4bd to your computer and use it in GitHub Desktop.
Save empathicqubit/505e53d280c0e94c58882b49e7bcb4bd to your computer and use it in GitHub Desktop.
Combine two SRT files into CSV for Anki
const fs = require('fs');
const util = require('util');
const net = require('net');
const child_process = require('child_process');
// ANSI SGR escape sequences used to style terminal output.
const colors = {
    // Text attributes
    Reset: "\x1b[0m",
    Bright: "\x1b[1m",
    Dim: "\x1b[2m",
    Underscore: "\x1b[4m",
    Blink: "\x1b[5m",
    Reverse: "\x1b[7m",
    Hidden: "\x1b[8m",

    // Foreground colours
    FgBlack: "\x1b[30m",
    FgRed: "\x1b[31m",
    FgGreen: "\x1b[32m",
    FgYellow: "\x1b[33m",
    FgBlue: "\x1b[34m",
    FgMagenta: "\x1b[35m",
    FgCyan: "\x1b[36m",
    FgWhite: "\x1b[37m",

    // Background colours
    BgBlack: "\x1b[40m",
    BgRed: "\x1b[41m",
    BgGreen: "\x1b[42m",
    BgYellow: "\x1b[43m",
    BgBlue: "\x1b[44m",
    BgMagenta: "\x1b[45m",
    BgCyan: "\x1b[46m",
    BgWhite: "\x1b[47m",
};
/**
 * Parses the contents of an SRT subtitle file into cue objects.
 *
 * Each cue consists of a sequence number, a `HH:MM:SS,mmm --> HH:MM:SS,mmm`
 * timing line and one or more text lines. A cue ends at a blank line or at
 * the end of the input, so the final cue is kept even when the file does not
 * end with a blank line.
 *
 * The cue text is cleaned up before it is returned: leading dashes,
 * parenthesised remarks and upper-case `SPEAKER:` prefixes are removed.
 *
 * @param {string} text - Raw SRT file contents (LF or CRLF line endings).
 * @returns {Array<object>} One object per cue with the integer fields
 *   `sequence`, `startHours`, `startMinutes`, `startSeconds`, `startMillis`,
 *   `endHours`, `endMinutes`, `endSeconds`, `endMillis` and the string `text`.
 */
const parse = (text) => {
    // The last alternative lets the final cue terminate at end of input
    // (optionally after a single line break); `(?![\s\S])` means "no more input".
    const cueRex = /(?<sequence>\d+)(?:\r\n|\n)(?<startHours>\d+)\s*:\s*(?<startMinutes>\d+)\s*:\s*(?<startSeconds>\d+)\s*,\s*(?<startMillis>\d+)\s*-->\s*(?<endHours>\d+)\s*:\s*(?<endMinutes>\d+)\s*:\s*(?<endSeconds>\d+)\s*,\s*(?<endMillis>\d+)(?:\r\n|\n)(?<text>(?:.|\r\n|\n)+?)(?:(?:\r\n|\n){2}|(?:\r\n|\n)?(?![\s\S]))/g;

    const intFields = [
        'sequence',
        'startHours',
        'startMinutes',
        'startSeconds',
        'startMillis',
        'endHours',
        'endMinutes',
        'endSeconds',
        'endMillis',
    ];

    // Applied in order to every cue's text.
    const replacements = [
        [/^\s*-\s*/gim, ''],           // leading dialogue dashes
        [/\(.*?\)/gim, ''],            // parenthesised remarks
        [/^\s*[A-Z0-9\s]+:/gm, ''],    // upper-case speaker labels
    ];

    const lines = [];
    for(const { groups } of text.matchAll(cueRex)) {
        const line = {};
        for(const field of intFields) {
            // Explicit radix: the fields are zero-padded decimal numbers.
            line[field] = Number.parseInt(groups[field], 10);
        }
        line.text = replacements.reduce(
            (cleaned, [pattern, substitute]) => cleaned.replace(pattern, substitute),
            groups.text,
        );
        lines.push(line);
    }
    return lines;
};
/**
 * Looks up `word` on the DICT (RFC 2229) server at 127.0.0.1:2628.
 *
 * Sends `MATCH <dict> lev <word>`, then one `DEFINE` per returned headword
 * (exact match first, then longer headwords first) and collects every
 * definition the server sends back.
 *
 * @param {string} dict - Name of the dictionary database to query.
 * @param {string} word - Word to look up.
 * @returns {Promise<string[]>} One string per definition. Each string starts
 *   with the server's `151` header line (status code stripped) followed by
 *   the definition body; blank lines are dropped. Resolves with an empty
 *   array when the server reports no match.
 * @throws Rejects on connection errors, on malformed or error responses to
 *   the MATCH command, and when the server closes the connection early.
 */
const lookup = (dict, word) => new Promise((res, rej) => {
    const dictConn = new net.Socket({});
    const _write = util.promisify(dictConn.write.bind(dictConn));

    // Parser state: 152 while reading the MATCH list, 151 while reading a
    // definition, -1 when the next line is expected to be a status line.
    let commandId = -1;
    const defs = [];
    let def = [];
    const matches = [];
    // The "250 ok" that ends the MATCH reply must not be counted as a
    // completed DEFINE, otherwise the last definition is never collected.
    let matchAcknowledged = false;
    let pendingDefines = 0;
    // Text received after the last line break; completed by the next chunk.
    let partialLine = '';
    let settled = false;
    // Chunks are processed strictly one after another so that awaiting a
    // write cannot interleave two chunks' lines.
    let queue = Promise.resolve();

    // Settles the promise exactly once and releases the connection.
    const settle = (error) => {
        if(settled) {
            return;
        }
        settled = true;
        dictConn.off('data', onData);
        if(!dictConn.destroyed) {
            if(error) {
                dictConn.destroy();
            }
            else {
                dictConn.end('QUIT\r\n');
            }
        }
        if(error) {
            rej(error);
        }
        else {
            res(defs);
        }
    };

    const handleLine = async (line) => {
        if(line === '') {
            return;
        }

        if(commandId === 152) {
            if(line !== '.') {
                const quoted = /"(.*)"/.exec(line);
                if(!quoted) {
                    throw new Error(`Invalid response: ${line}`);
                }
                matches.push(quoted[1]);
                return;
            }
            // End of the MATCH list: request a definition for every headword.
            commandId = -1;
            const wanted = word.toLowerCase();
            matches.sort((a, b) => ((wanted === b) - (wanted === a)) * 0xffffff + (b.length - a.length));
            pendingDefines = matches.length;
            for(const match of matches) {
                // Quoted so that headwords containing spaces stay one parameter.
                await _write(`DEFINE ${dict} "${match}"\r\n`, 'utf8');
            }
            return;
        }

        if(commandId === 151) {
            if(line === '.') {
                commandId = -1;
                defs.push(def.join('\n'));
                def = [];
            }
            else {
                def.push(line.replace(/^151/, ''));
            }
            return;
        }

        const status = Number.parseInt(line, 10);
        if(Number.isNaN(status)) {
            throw new Error(`Invalid response: ${line}`);
        }

        if(status === 152) {
            commandId = 152;
            return;
        }
        if(status === 151) {
            // The header line is kept as the first line of the definition.
            commandId = 151;
            def.push(line.replace(/^151/, ''));
            return;
        }

        if(!matchAcknowledged) {
            if(status === 250) {
                matchAcknowledged = true;
                if(pendingDefines === 0) {
                    settle();
                }
            }
            else if(status === 552) {
                // No match for the word: nothing to define.
                settle();
            }
            else if(status === 551) {
                throw new Error('Invalid command');
            }
            else if(status >= 400) {
                throw new Error(`DICT server error: ${line}`);
            }
            // Anything else (e.g. the 220 banner) carries no data we need.
            return;
        }

        // DEFINE phase: every final status, success or failure, finishes one
        // outstanding DEFINE. Informational lines (e.g. 150) are ignored.
        if(status === 250 || status >= 400) {
            pendingDefines--;
            if(pendingDefines <= 0) {
                settle();
            }
        }
    };

    const onData = (chunk) => {
        partialLine += chunk;
        const lines = partialLine.split(/\r\n|\n/g);
        // The last element is an incomplete line ('' if the chunk ended on a break).
        partialLine = lines.pop();
        queue = queue
            .then(async () => {
                for(const line of lines) {
                    if(settled) {
                        return;
                    }
                    await handleLine(line);
                }
            })
            .catch(settle);
    };

    // Decode as UTF-8 in the stream so multi-byte characters split across
    // chunks are reassembled.
    dictConn.setEncoding('utf8');
    // Registered before connecting so a refused connection rejects the
    // promise instead of raising an unhandled 'error' event. The listener
    // stays attached; settle() ignores anything after the first call.
    dictConn.on('error', settle);
    dictConn.on('close', () => settle(new Error('Connection to the DICT server closed before the lookup finished')));
    dictConn.on('data', onData);
    dictConn.connect({
        host: '127.0.0.1',
        port: 2628,
    }, () => {
        _write(`MATCH ${dict} lev ${word}\r\n`, 'utf8').catch(settle);
    });
});
/**
 * Finds words of a target-language subtitle line whose dictionary
 * translation occurs in the corresponding native-language line.
 *
 * Every distinct word of `targetLine.text` is looked up, one at a time, in
 * the `<targetLanguage>-<nativeLanguage>` dictionary. The first definition
 * whose leading gloss (an optional "the"/"a"/"an"/"to" plus a word on a line
 * indented by three whitespace characters) is found in `nativeLine.text`
 * yields a result. Each word is echoed to stdout in green when matched and
 * in red otherwise.
 *
 * @param {{text: string}} targetLine - Subtitle line in the language being learnt.
 * @param {{text: string}} nativeLine - Subtitle line in the learner's language.
 * @param {string} targetLanguage - First half of the dictionary name.
 * @param {string} nativeLanguage - Second half of the dictionary name.
 * @returns {Promise<Array<{native: string, target: string}>>} Matched word pairs.
 */
const getWords = async(targetLine, nativeLine, targetLanguage, nativeLanguage) => {
    const targetText = targetLine.text;
    const nativeText = nativeLine.text;

    // Distinct words in order of first appearance.
    const candidates = [
        ...new Set(Array.from(targetText.matchAll(/\p{L}+/gum), (found) => found[0])),
    ];

    const glossRex = /^\s\s\s(((the|an?|to)\s+)?(\p{L}*))/umi;
    const pairs = [];

    for(const candidate of candidates) {
        const defs = await lookup(`${targetLanguage}-${nativeLanguage}`, candidate);

        let translation = null;
        for(const defText of defs) {
            const head = glossRex.exec(defText);
            // Prefer the word itself; fall back to the bare article/particle.
            const gloss = head && (head[4] || head[3]);
            if(gloss && new RegExp(`${gloss}`, 'gium').test(nativeText)) {
                translation = head[1];
                break;
            }
        }

        if(translation === null) {
            process.stdout.write(colors.FgRed + candidate + ' ' + colors.Reset);
            continue;
        }

        process.stdout.write(colors.FgGreen + candidate + ' ' + colors.Reset);
        pairs.push({
            native: translation,
            target: candidate,
        });
    }

    return pairs;
};
/**
 * Command-line entry point.
 *
 * Reads a native-language and a target-language SRT file, pairs subtitle
 * lines that start in the same second, optionally extracts word pairs via
 * the `<target>-<native>` dictd dictionary, and writes a `|`-separated file:
 * first the unique word pairs, then the paired subtitle lines.
 *
 * Arguments are taken from `process.argv`:
 * `<native srt> <native language> <target srt> <target language> <csv filename>`.
 * With fewer arguments the usage text is printed and nothing else happens.
 *
 * @returns {Promise<void>}
 */
const main = async() => {
    const [
        processPath,
        scriptPath,
        nativePath,
        nativeLanguage,
        targetPath,
        targetLanguage,
        csvPath,
    ] = process.argv;
    if(process.argv.length < 7) {
        console.log(`
Converts subtitles files to an Anki flashcard CSV.
Syntax:
"${processPath}" "${scriptPath}" <native language srt> <english name of native language> <target language srt> <english name of target language> <csv filename>
`);
        return;
    }

    // Ask the `dict` client which databases exist; the third tab-separated
    // field of each output line is compared against the dictionary name.
    const dictName = `${targetLanguage}-${nativeLanguage}`;
    let dictExists = false;
    try {
        const { stdout } = await util.promisify(child_process.execFile)('dict', ['-D', '-f']);
        dictExists = stdout
            .split(/[\r\n]+/g)
            .map(x => x.split('\t')[2])
            .includes(dictName);
    }
    catch(e) {
        // A missing or failing `dict` client is not fatal: lines are still
        // paired, only the word extraction is skipped.
        console.error(e);
    }
    if(!dictExists) {
        console.error(`Dictionary ${dictName} not found. Make sure dictd is correctly installed`);
    }

    const nativeLines = parse(fs.readFileSync(nativePath, 'utf8'));
    const targetLines = parse(fs.readFileSync(targetPath, 'utf8'));

    // Index native lines by start second; the first line for a given second wins.
    const startKey = line => `${line.startHours}:${line.startMinutes}:${line.startSeconds}`;
    const nativeByStart = new Map();
    for(const nativeLine of nativeLines) {
        const key = startKey(nativeLine);
        if(!nativeByStart.has(key)) {
            nativeByStart.set(key, nativeLine);
        }
    }

    const matchLines = [];
    for(const targetLine of targetLines) {
        const nativeLine = nativeByStart.get(startKey(targetLine));
        if(!nativeLine) {
            continue;
        }
        matchLines.push({
            native: nativeLine,
            target: targetLine,
            words: dictExists
                ? await getWords(targetLine, nativeLine, targetLanguage, nativeLanguage)
                : [],
        });
    }

    // Case-insensitive de-duplication of word pairs, keeping first occurrences.
    const seenWords = new Set();
    const uniqWords = [];
    for(const word of matchLines.flatMap(x => x.words)) {
        const key = `${word.native.toLowerCase()}\u0000${word.target.toLowerCase()}`;
        if(seenWords.has(key)) {
            continue;
        }
        seenWords.add(key);
        uniqWords.push(word);
    }

    console.log(`${matchLines.length}/${targetLines.length} lines matched.`);
    console.log(`${uniqWords.length} unique words matched.`);

    // Written in a single call so the row order is deterministic; issuing
    // several fs.write() calls on one descriptor without waiting is unsafe.
    const rows = [
        ...uniqWords.map(x => `${x.target}|${x.native}\n`),
        ...matchLines.map(matchLine =>
            `${matchLine.target.text.replace(/[\r\n]+/gim, ' ')}|${matchLine.native.text.replace(/[\r\n]+/gim, ' ')}\n`
        ),
    ];
    await util.promisify(fs.writeFile)(csvPath, rows.join(''));
};
main()
    .then(() => process.exit(0))
    .catch((failure) => {
        // Report the failure and signal it through the exit status.
        console.error(failure);
        process.exit(1);
    });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment