Created
May 4, 2015 05:48
-
-
Save g6123/4a275b6fa7194c5ad2a1 to your computer and use it in GitHub Desktop.
smi2vtt.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var async = require('async'); | |
var fs = require('fs'); | |
var os = require('os'); | |
var detect_encoding = require('detect-encoding'); | |
var Iconv = require('iconv').Iconv; | |
/**
 * Split markup into chunks that each start at an opening `<tagname` tag.
 *
 * The text is cut at every case-insensitive occurrence of `<tagname` that is
 * followed by whitespace, `/` or `>`, so a short name such as `p` no longer
 * cuts longer tag names like `<pre>`. Every piece is trimmed, blank pieces are
 * dropped, and each remaining piece is re-prefixed with `'<' + tagname + ' '`
 * (using `tagname` exactly as passed). Text that precedes the first tag is
 * returned as the first chunk and receives the same prefix.
 *
 * @param {string} string - Markup to split.
 * @param {string} tagname - Tag name without the angle bracket, e.g. 'sync'.
 * @param {function(?Error, string[])} callback - Receives `(null, chunks)`.
 */
var split_by_tag = function(string, tagname, callback){
  var pattern = '<'+tagname;
  // Escape regex metacharacters so the tag name is always matched literally.
  var escaped = pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // The lookahead keeps the delimiter out of the match, so the text of each
  // piece is unchanged compared with a plain split on `<tagname`.
  var splitter = new RegExp(escaped+'(?=[\\s/>])', 'gi');
  var chunks = [];

  string.split(splitter).forEach(function(piece){
    var trimmed = piece.trim();
    if(trimmed){
      chunks.push(pattern+' '+trimmed);
    }
  });

  // The work itself is synchronous; defer the callback so callers still
  // observe asynchronous completion.
  setImmediate(function(){
    callback(null, chunks);
  });
};
/**
 * Left-pad a value with zeros up to a minimum width.
 *
 * Values that are already `digit` characters or longer are returned in full
 * rather than being cut down to their last `digit` characters.
 *
 * @param {number|string} number - Value to pad; converted with `String()`.
 * @param {number} digit - Minimum length of the returned string.
 * @returns {string} The zero-padded text.
 */
var zerofill = function(number, digit){
  var text = String(number);
  while(text.length < digit){
    text = '0'+text;
  }
  return text;
};
/**
 * Format a millisecond offset as an `hh:mm:ss.ttt` timestamp.
 *
 * All arithmetic is done on whole milliseconds so the fractional field is
 * always exactly three digits (1500 -> "00:00:01.500").
 *
 * @param {number|string} ms - Offset in milliseconds; numeric strings are
 *   accepted and converted with `Number()`.
 * @returns {string} Timestamp; the hour field grows beyond two digits when
 *   needed.
 */
var ms2stamp = function(ms){
  var total = Math.floor(Number(ms));
  // Negative or non-numeric input has no timestamp representation here;
  // it is clamped to zero instead of producing malformed output.
  if(!isFinite(total) || total < 0){
    total = 0;
  }

  var millis = total % 1000;
  var seconds = Math.floor(total / 1000);
  var minutes = Math.floor(seconds / 60);
  var hours = Math.floor(minutes / 60);

  // Hours are padded inline so values of 100 or more are kept whole.
  var hour_text = (hours < 10 ? '0' : '')+hours;
  var clock = [hour_text, zerofill(minutes % 60, 2), zerofill(seconds % 60, 2)].join(':');
  return clock+'.'+zerofill(millis, 3);
};
module.exports = function(file, update, log, callback){ | |
update({ status: 'started', detail: null }); | |
async.waterfall([ | |
function(callback){ | |
fs.readFile(file[0], function(error, buffer){ | |
if(error){ | |
callback('원본 자막 파일에 접근할 수 없습니다.', null); | |
} else { | |
callback(null, buffer); | |
} | |
}); | |
}, | |
function(buffer, callback){ | |
detect_encoding(buffer, function(error, encoding){ | |
if(error){ | |
callback('원본 자막 파일의 인코딩을 감지할 수 없습니다.', null); | |
} else { | |
encoding = encoding.toLowerCase(); | |
if(encoding === 'euc-kr'){ | |
encoding = 'cp949'; | |
} | |
callback(null, buffer, encoding); | |
} | |
}); | |
}, | |
function(buffer, encoding, callback){ | |
var iconv = new Iconv(encoding, 'utf-8'); | |
callback(null, iconv.convert(buffer).toString()); | |
}, | |
function(smi, callback){ | |
split_by_tag(smi, 'sync', callback); | |
}, | |
function(sync_list, callback){ | |
sync_list.splice(0, 1); | |
sync_list.push(sync_list.pop().replace(/<( +)?\/( +)?body>/i, '').trim()); | |
callback(null, sync_list); | |
}, | |
function(sync_list, callback){ | |
async.map(sync_list, function(item, callback){ | |
var time = item.match(/<sync(.+)start=([0-9]+)(.+)?>/i)[2]; | |
item = item.replace(/<sync[^>]+>/i, ''); | |
split_by_tag(item, 'p', function(error, result){ | |
if(error){ | |
callback(error, null); | |
} else { | |
callback(null, [time, result]); | |
} | |
}); | |
}, callback); | |
}, | |
function(list, callback){ | |
var parsed_sub = {}; | |
async.each(list, function(item, callback){ | |
var time = item[0]; | |
var p_list = item[1]; | |
async.each(p_list, function(item, callback){ | |
var lang = item.match(/<p(.+)class=([a-z]+)(.+)?>/i)[2]; | |
var content = item.split(/<p[^>]+>/i)[1].trim(); | |
content = content.replace(/(\r\n|\n|\r)/g, ''); | |
content = content.replace(/<br( +)?\/?( +)?>/gi, '\n'); | |
content = content.replace(/<[^>]+>/g, ''); | |
if(!parsed_sub[time]){ | |
parsed_sub[time] = {}; | |
} | |
parsed_sub[time][lang] = content; | |
callback(null); | |
}, callback); | |
}, function(error){ | |
if(error){ | |
callback(error, null); | |
} else { | |
callback(null, parsed_sub); | |
} | |
}); | |
}, | |
function(parsed_sub, callback){ | |
var vtt_sub = 'WEBVTT\n'; | |
var sub_index = 1; | |
var ms_list = Object.keys(parsed_sub); | |
ms_list.sort(function(one, another){ return one-another; }); | |
ms_list.forEach(function(ms, ms_index){ | |
var content = parsed_sub[ms]['KRCC'].trim(); | |
if(content && content !== ' '){ | |
vtt_sub += ('\n'+sub_index); | |
vtt_sub += ('\n'+ms2stamp(ms)+' --> '); | |
if(ms_index === ms_list.length-1){ | |
vtt_sub += ms2stamp(ms+5000); | |
} else { | |
vtt_sub += ms2stamp(ms_list[ms_index+1]); | |
} | |
vtt_sub += ('\n'+content+'\n'); | |
sub_index++; | |
} | |
}); | |
vtt_sub = vtt_sub.trim(); | |
callback(null, vtt_sub); | |
}, | |
function(vtt_sub, callback){ | |
fs.writeFile(file[1], vtt_sub, function(error){ | |
if(error){ | |
callback('변환한 자막 파일을 저장할 수 없습니다.'); | |
} else { | |
callback(null); | |
} | |
}); | |
} | |
], function(error){ | |
if(error){ | |
var msg = '원본 자막 파일의 SAMI 문법을 분석하는 데 실패했습니다.'; | |
msg += ' ('; | |
msg += error.message; | |
msg += ')'; | |
error.message = msg; | |
callback(error); | |
} else { | |
callback(null); | |
} | |
}); | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment