Last active
October 1, 2021 05:51
-
-
Save empathicqubit/505e53d280c0e94c58882b49e7bcb4bd to your computer and use it in GitHub Desktop.
Combine two SRT files into CSV for Anki
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs'); | |
const util = require('util'); | |
const net = require('net'); | |
const child_process = require('child_process'); | |
/**
 * ANSI SGR escape sequences for terminal output, keyed by effect name.
 * Each value has the form ESC "[" <code> "m"; write `colors.Reset` to restore
 * the terminal's default rendering afterwards.
 */
const colors = Object.fromEntries(
    [
        // Text attributes.
        ['Reset', 0],
        ['Bright', 1],
        ['Dim', 2],
        ['Underscore', 4],
        ['Blink', 5],
        ['Reverse', 7],
        ['Hidden', 8],
        // Foreground colours.
        ['FgBlack', 30],
        ['FgRed', 31],
        ['FgGreen', 32],
        ['FgYellow', 33],
        ['FgBlue', 34],
        ['FgMagenta', 35],
        ['FgCyan', 36],
        ['FgWhite', 37],
        // Background colours.
        ['BgBlack', 40],
        ['BgRed', 41],
        ['BgGreen', 42],
        ['BgYellow', 43],
        ['BgBlue', 44],
        ['BgMagenta', 45],
        ['BgCyan', 46],
        ['BgWhite', 47],
    ].map(([effect, code]) => [effect, `\x1b[${code}m`]),
);
/**
 * Parses SubRip (.srt) subtitle text into a list of cues.
 *
 * Each cue is `{ sequence, startHours, startMinutes, startSeconds, startMillis,
 * endHours, endMinutes, endSeconds, endMillis, text }`; every field except
 * `text` is an integer. The cue text is cleaned up for flashcard use: leading
 * dialogue dashes, parenthesised remarks and upper-case `SPEAKER:` prefixes
 * are removed.
 *
 * @param {string} text - Full contents of an .srt file (LF or CRLF line ends).
 * @returns {Array<object>} Cues in file order; empty when nothing matches.
 */
const parse = (text) => {
    // Captures: 1 sequence, 2-5 start h/m/s/ms, 6-9 end h/m/s/ms, 10 cue text.
    // A cue ends at a blank line OR at the end of the input, so the final cue
    // is kept even when the file has no trailing blank line. The pattern must
    // not use the `m` flag: `$` has to mean "end of input" here.
    const cuePattern = /(\d+)(?:\r\n|\n)(\d+)\s*:\s*(\d+)\s*:\s*(\d+)\s*,\s*(\d+)\s*-->\s*(\d+)\s*:\s*(\d+)\s*:\s*(\d+)\s*,\s*(\d+)(?:\r\n|\n)((?:.|\r\n|\n)+?)(?:(?:\r\n|\n){2}|(?:\r\n|\n)?$)/g;

    // Names of the nine numeric captures, in capture order.
    const intFields = [
        'sequence',
        'startHours',
        'startMinutes',
        'startSeconds',
        'startMillis',
        'endHours',
        'endMinutes',
        'endSeconds',
        'endMillis',
    ];

    // Applied to the cue text in this order.
    const replacements = [
        [/^\s*-\s*/gim, ''],          // leading dialogue dash
        [/\(.*?\)/gim, ''],           // parenthesised remarks, e.g. "(laughs)"
        [/^\s*[A-Z0-9\s]+:/gm, ''],   // upper-case speaker label, e.g. "JOHN:"
    ];

    const lines = [];
    let cueMatch;
    while ((cueMatch = cuePattern.exec(text)) !== null) {
        const [, ...captures] = cueMatch;
        const line = {};
        intFields.forEach((field, index) => {
            // Explicit radix: the fields are zero-padded decimal numbers.
            line[field] = Number.parseInt(captures[index], 10);
        });
        line.text = replacements.reduce(
            (cleaned, [pattern, substitute]) => cleaned.replace(pattern, substitute),
            captures[intFields.length],
        );
        lines.push(line);
    }
    return lines;
};
/**
 * Looks a word up on the dictd server at 127.0.0.1:2628 (DICT protocol).
 *
 * Sends a Levenshtein `MATCH` for `word`, then a `DEFINE` for every headword
 * the server returned (exact match first, then longest first), and collects
 * the definitions.
 *
 * Each returned string is one definition: the server's `151` header line with
 * the leading status code removed, followed by the non-empty text lines,
 * joined with "\n".
 *
 * @param {string} dict - Database name, e.g. "deu-eng".
 * @param {string} word - Word to look up.
 * @returns {Promise<string[]>} Definitions; empty when the server reports no
 *   match (552).
 * @throws {Error} (as a rejection) on socket errors, on an unparseable or
 *   4xx/5xx status line, or when the server closes the connection early.
 */
const lookup = async (dict, word) => {
    return await new Promise((resolve, reject) => {
        const dictConn = new net.Socket({});
        const wanted = word.toLowerCase();
        const defs = [];
        const matches = [];
        let def = [];
        // Unterminated tail of the previous chunk: TCP does not preserve line
        // boundaries, so a response line may arrive split across two chunks.
        let pending = '';
        // 'status' expects a status line; 'matches' and 'definition' are
        // inside a dot-terminated text body.
        let state = 'status';
        let matching = true;   // true until the MATCH command's own 250 arrives
        let outstanding = 0;   // DEFINE commands still waiting for their 250
        let settled = false;

        // Settles the promise exactly once and releases the socket.
        const finish = (err) => {
            if (settled) {
                return;
            }
            settled = true;
            if (err) {
                dictConn.destroy();
                reject(err);
                return;
            }
            dictConn.end('QUIT\r\n', 'utf8');
            resolve(defs);
        };

        // Text bodies are dot-stuffed: a line that starts with "." is sent
        // with that period doubled.
        const unstuff = (line) => (line.startsWith('..') ? line.slice(1) : line);

        // Quotes a protocol parameter so that it may contain spaces.
        const quote = (param) => `"${param.replace(/(["\\])/g, '\\$1')}"`;

        const handleStatus = (line) => {
            const code = Number.parseInt(line, 10);
            if (Number.isNaN(code)) {
                finish(new Error(`Invalid response: ${line}`));
                return;
            }
            switch (code) {
                case 152:   // match list follows
                    state = 'matches';
                    break;
                case 151:   // one definition follows; keep the header line
                    def.push(line.replace(/^151/, ''));
                    state = 'definition';
                    break;
                case 250:   // command completed
                    if (matching) {
                        matching = false;
                        if (matches.length === 0) {
                            finish();
                            return;
                        }
                        matches.sort((a, b) =>
                            (Number(wanted === b) - Number(wanted === a)) * 0xffffff
                            + (b.length - a.length));
                        outstanding = matches.length;
                        for (const match of matches) {
                            // Headwords are re-quoted as received so that
                            // multi-word entries stay a single parameter.
                            dictConn.write(`DEFINE ${dict} "${match}"\r\n`, 'utf8');
                        }
                    }
                    else {
                        outstanding -= 1;
                        if (outstanding <= 0) {
                            finish();
                        }
                    }
                    break;
                case 552:   // no match: return whatever has been collected
                    finish();
                    break;
                case 551:
                    finish(new Error('Invalid command'));
                    break;
                default:
                    // Other failures are never followed by a 250, so ignoring
                    // them would leave the promise pending forever.
                    if (code >= 400) {
                        finish(new Error(`Invalid response: ${line}`));
                    }
                    // Banner (220), 150 and similar replies carry nothing
                    // needed here.
                    break;
            }
        };

        const handleLine = (line) => {
            if (state === 'matches') {
                if (line === '.') {
                    state = 'status';
                    return;
                }
                const quoted = /"(.*)"/.exec(unstuff(line));
                if (!quoted) {
                    finish(new Error(`Invalid response: ${line}`));
                    return;
                }
                matches.push(quoted[1]);
                return;
            }
            if (state === 'definition') {
                if (line === '.') {
                    defs.push(def.join('\n'));
                    def = [];
                    state = 'status';
                    return;
                }
                def.push(unstuff(line));
                return;
            }
            handleStatus(line);
        };

        const onData = (chunk) => {
            if (settled) {
                return;
            }
            try {
                pending += chunk;
                const lines = pending.split(/\r\n|\n/);
                pending = lines.pop();
                for (const line of lines) {
                    if (settled) {
                        return;
                    }
                    if (line === '') {
                        continue;
                    }
                    handleLine(line);
                }
            }
            catch (e) {
                finish(e);
            }
        };

        // Decode incrementally so multi-byte characters split across chunks
        // are reassembled correctly.
        dictConn.setEncoding('utf8');
        // Listeners are attached before connecting, so a refused connection
        // rejects the promise instead of raising an unhandled 'error' event.
        // They stay attached after settling; finish() ignores repeat calls.
        dictConn.on('error', finish);
        dictConn.on('close', () => finish(new Error('Connection to dictd closed before the lookup completed')));
        dictConn.on('data', onData);
        dictConn.connect({
            host: '127.0.0.1',
            port: 2628,
        }, () => {
            dictConn.write(`MATCH ${dict} lev ${quote(word)}\r\n`, 'utf8');
        });
    });
};
/**
 * Pairs words of a target-language cue with dictionary glosses that also
 * occur in the matching native-language cue.
 *
 * Every distinct run of letters in `targetLine.text` is looked up in the
 * `<targetLanguage>-<nativeLanguage>` dictionary. The first definition whose
 * leading gloss (a line indented by three whitespace characters, optionally
 * prefixed by "the", "a", "an" or "to") appears anywhere in
 * `nativeLine.text`, case-insensitively, is accepted. Progress is echoed to
 * stdout: accepted words in green, rejected words in red.
 *
 * @param {{text: string}} targetLine - Cue in the language being learned.
 * @param {{text: string}} nativeLine - Cue in the learner's language.
 * @param {string} targetLanguage - Dictionary name prefix.
 * @param {string} nativeLanguage - Dictionary name suffix.
 * @returns {Promise<Array<{native: string, target: string}>>} Accepted pairs,
 *   in order of first appearance of the target word.
 */
const getWords = async(targetLine, nativeLine, targetLanguage, nativeLanguage) => {
    const targetText = targetLine.text;
    const nativeText = nativeLine.text;
    const glossPattern = /^\s\s\s(((the|an?|to)\s+)?(\p{L}*))/umi;

    // Distinct words, kept in order of first appearance.
    const candidates = new Set();
    const wordPattern = /\p{L}+/gum;
    for(let hit = wordPattern.exec(targetText); hit !== null; hit = wordPattern.exec(targetText)) {
        candidates.add(hit[0]);
    }

    const pairs = [];
    for(const candidate of candidates) {
        const definitions = await lookup(`${targetLanguage}-${nativeLanguage}`, candidate);

        // Full gloss (including any article/particle) of the first definition
        // whose keyword shows up in the native text, or null if none does.
        let gloss = null;
        for(const definition of definitions) {
            const parsed = glossPattern.exec(definition);
            const keyword = parsed && (parsed[4] || parsed[3]);
            if(keyword && new RegExp(`${keyword}`, 'gium').test(nativeText)) {
                gloss = parsed[1];
                break;
            }
        }

        const tint = gloss === null ? colors.FgRed : colors.FgGreen;
        process.stdout.write(tint + candidate + ' ' + colors.Reset);
        if(gloss !== null) {
            pairs.push({
                native: gloss,
                target: candidate,
            });
        }
    }
    return pairs;
};
/**
 * Command-line entry point: turns a native-language and a target-language
 * .srt file into a `|`-separated flashcard file.
 *
 * Arguments (after the node binary and script path): native srt path, native
 * language name, target srt path, target language name, output csv path.
 * Prints a usage message and returns when fewer are given.
 *
 * Cues are paired when they start in the same hour, minute and second. If the
 * `<target>-<native>` dictionary is listed by `dict -D -f`, each paired cue is
 * also mined for word pairs via getWords(). The output contains the unique
 * word pairs first, then one row per paired cue.
 *
 * NOTE(review): field values are written unescaped, so a `|` inside a
 * subtitle produces an extra column. Confirm whether that matters for the
 * importer.
 *
 * @returns {Promise<void>}
 */
const main = async() => {
    const [
        processPath,
        scriptPath,
        nativePath,
        nativeLanguage,
        targetPath,
        targetLanguage,
        csvPath,
    ] = process.argv;
    if(process.argv.length < 7) {
        console.log([
            '',
            'Converts subtitles files to an Anki flashcard CSV.',
            'Syntax:',
            `"${processPath}" "${scriptPath}" <native language srt> <english name of native language> <target language srt> <english name of target language> <csv filename>`,
            '',
        ].join('\n'));
        return;
    }

    const dictName = `${targetLanguage}-${nativeLanguage}`;
    let dictExists = false;
    try {
        const results = await util.promisify(child_process.execFile)('dict', ['-D', '-f']);
        const dicts = results.stdout
            .split(/[\r\n]+/g)
            .map(x => x.split('\t')[2]);
        dictExists = dicts.includes(dictName);
    }
    catch(e) {
        // Best effort: without a working `dict` client the run continues and
        // simply produces no word pairs.
        console.error(e);
    }
    if(!dictExists) {
        console.error(`Dictionary ${dictName} not found. Make sure dictd is correctly installed`);
    }

    const readFile = util.promisify(fs.readFile);
    const [nativeSrt, targetSrt] = await Promise.all([
        readFile(nativePath, 'utf8'),
        readFile(targetPath, 'utf8'),
    ]);
    const nativeLines = parse(nativeSrt);
    const targetLines = parse(targetSrt);

    // Index native cues by start time (to the second). Only the first cue per
    // key is kept, which is the one a linear search would have found.
    const startKey = line => `${line.startHours}:${line.startMinutes}:${line.startSeconds}`;
    const nativeByStart = new Map();
    for(const nativeLine of nativeLines) {
        const key = startKey(nativeLine);
        if(!nativeByStart.has(key)) {
            nativeByStart.set(key, nativeLine);
        }
    }

    const matchLines = [];
    for(const targetLine of targetLines) {
        const nativeLine = nativeByStart.get(startKey(targetLine));
        if(!nativeLine) {
            continue;
        }
        matchLines.push({
            native: nativeLine,
            target: targetLine,
            // Lookups run one cue at a time, in cue order.
            words: dictExists
                ? await getWords(targetLine, nativeLine, targetLanguage, nativeLanguage)
                : [],
        });
    }

    // Case-insensitive de-duplication that keeps the first spelling seen.
    const seenPairs = new Set();
    const uniqWords = [];
    for(const word of matchLines.flatMap(x => x.words)) {
        const key = `${word.native.toLowerCase()}\u0000${word.target.toLowerCase()}`;
        if(seenPairs.has(key)) {
            continue;
        }
        seenPairs.add(key);
        uniqWords.push(word);
    }

    console.log(`${matchLines.length}/${targetLines.length} lines matched.`);
    console.log(`${uniqWords.length} unique words matched.`);

    // Build the whole file in memory and write it once: issuing several
    // fs.write() calls on one descriptor without waiting for each to finish
    // is unsafe and does not guarantee row order.
    const rows = [
        ...uniqWords.map(x => `${x.target}|${x.native}\n`),
        ...matchLines.map(matchLine =>
            `${matchLine.target.text.replace(/[\r\n]+/gim, ' ')}|${matchLine.native.text.replace(/[\r\n]+/gim, ' ')}\n`),
    ];
    await util.promisify(fs.writeFile)(csvPath, rows.join(''), 'utf8');
};
// Run the tool and turn its outcome into a process exit status:
// 0 on success, 1 (after printing the error) on any failure.
const exitWithSuccess = () => {
    process.exit(0);
};
const exitWithFailure = (e) => {
    console.error(e);
    process.exit(1);
};
main().then(exitWithSuccess, exitWithFailure);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment