|
import webvtt |
|
from webvtt import WebVTT, Caption |
|
import re |
|
import MeCab |
|
from google.cloud import translate |
|
import argparse |
|
import glob |
|
|
|
# timestamps = [] |
|
# sentence = '' |
|
# Module-level Google Cloud Translate client, created once at import time.
translate_client = translate.Client()
# Module-level MeCab tagger; "-Ochasen" selects the ChaSen-style output format.
mecab = MeCab.Tagger ("-Ochasen")

# Sub-categories of the MeCab part of speech '記号' (symbol) listed as acceptable:
# '句点' (full stop), '読点' (comma), '一般' (general).
KIGOU_OK_LIST = ['句点', '読点', '一般']
# Sub-categories of the MeCab part of speech '助詞' (particle) listed as not
# acceptable: '連体化' (adnominalizing particle).
JOSHI_NG_LIST = ['連体化']
|
|
|
# for caption in webvtt.read('input\Basic Recipe for Machine Learning-en.vtt'): |
|
# timestamps.append((caption.start, caption.end, len(caption.text))) |
|
# sentence += ' ' + caption.text |
|
# # 末尾がピリオドか感嘆符で終わっている場合はそこで一つの文とする |
|
# m = re.match('[.!?]', caption.text[-1:]) |
|
# if m: |
|
# # Google Translate APIで翻訳する |
|
# print(timestamps) |
|
# print(u'Text: {}'.format(sentence.strip())) |
|
# # translation = translate_client.translate(sentence.strip(), target_language='ja') |
|
# # print(u'Translation: {}'.format(translation['translatedText'])) |
|
# # text = translation['translatedText'] |
|
# # 全体の長さを取得する。sentenceのlenだと結合の際に加算したスペース分だけ増えてしまう |
|
# s_len_en = sum(i[2] for i in timestamps) |
|
# print(s_len_en) |
|
# # print(len(sentence.strip()) - len(timestamps) + 1) |
|
# timestamps = [] |
|
# sentence = '' |
|
# break |
|
|
|
def getArgumets():
    """Parse the command-line options of the script.

    Two optional flags are accepted: ``-f`` (a single WebVTT file to
    translate) and ``-d`` (a directory containing WebVTT files to translate).

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``; an option that
        was not supplied is ``None``.
    """
    cli = argparse.ArgumentParser(
        description='WEBVTTファイルを英語から日本語に翻訳します。',
    )
    option_help = (
        ('-f', '翻訳するWEBVTTファイル'),
        ('-d', '翻訳するWEBVTTファイルを配置したディレクトリパス'),
    )
    for flag, help_text in option_help:
        cli.add_argument(flag, help=help_text)
    return cli.parse_args()
|
|
|
def _flushSentence(sentence, captions, number):
    """Translate the sentence gathered so far and append it to ``captions``.

    Prints progress information, resets ``sentence`` so the next run of cues
    starts empty, and returns the character length of the source text that
    was translated.
    """
    print('Sentence: ', number)
    source_chars = len(sentence.text)
    sentence.translate()
    print(sentence.timestamps)
    print(sentence.text)
    print(sentence.translated_text)
    captions.addSentence(sentence.timestamps, sentence.translated_text)
    sentence.clear()
    return source_chars


def translateWebvtt(vttfile):
    """Translate one WebVTT file and save the result next to it.

    Cues are read in order and concatenated until ``Sentence.validate()``
    reports a complete sentence; each sentence is translated as a whole and
    then redistributed over the timestamps of the cues it came from.

    Args:
        vttfile: Path of the source file. The output path is built by
            dropping the last four characters (expected to be ``.vtt``) and
            appending ``-ja.vtt``.
    """
    sentence = Sentence()
    captions = Captions()
    sentence_number = 0
    char_total = 0
    caption_total = 0

    for caption in webvtt.read(vttfile):
        caption_total += 1
        sentence.addCaption(caption.text, caption.start, caption.end)
        if sentence.validate():
            sentence_number += 1
            char_total += _flushSentence(sentence, captions, sentence_number)

    # Cues after the last sentence-ending cue used to be dropped silently.
    # Translate whatever text is still pending so no subtitle is lost.
    if sentence.text:
        sentence_number += 1
        char_total += _flushSentence(sentence, captions, sentence_number)

    print('Caption count:', caption_total)
    # NOTE: despite the label this is the number of source characters
    # (len() of each sentence's text), not a count of words.
    print('Word count:', char_total)

    captions.save(vttfile[:-4] + '-ja.vtt')
|
|
|
class Sentence:
    """Accumulates consecutive caption texts until they form one sentence.

    ``timestamps`` holds one ``(start, end, text_length)`` tuple per caption
    added, ``text`` the captions joined by single spaces, and
    ``translated_text`` the result of the most recent ``translate()`` call.
    """

    def __init__(self):
        self.timestamps = []
        self.text = ''
        self.translate_client = translate.Client()
        self.source = 'en'
        self.target = 'ja'
        self.translated_text = ''

    def addCaption(self, caption_text, start, end):
        """Append one caption: record its timing/length and extend the text."""
        self.timestamps.append((start, end, len(caption_text)))
        self.text = (self.text + ' ' + caption_text).strip()

    def validate(self):
        """Return True when the accumulated text ends a sentence.

        A sentence is considered complete when its last character is ``.``,
        ``!`` or ``?``. Empty text is never complete (the original returned
        the undefined name ``false`` here and raised ``NameError``).
        """
        return self.text.endswith(('.', '!', '?'))

    def translate(self):
        """Translate ``text`` from ``source`` to ``target`` into ``translated_text``."""
        # Use the client owned by this instance rather than a module global.
        translation = self.translate_client.translate(
            self.text,
            source_language=self.source,
            target_language=self.target,
        )
        self.translated_text = translation['translatedText']

    def clear(self):
        """Reset the accumulated state so a new sentence can be collected."""
        # Rebind instead of clearing in place so a list handed out earlier
        # (e.g. to a caller that kept ``timestamps``) is not emptied.
        self.timestamps = []
        self.text = ''
        self.translated_text = ''
|
|
|
class Captions:
    """Collects translated cues and writes them out as a WebVTT file.

    A translated sentence is split across the timestamps of the source cues
    in proportion to the length of each source cue's text. Splits are only
    made after tokens that MeCab tags as particles or punctuation.
    """

    # Sub-categories of '記号' (symbol) after which a split is allowed:
    # '句点' (full stop), '読点' (comma), '一般' (general).
    KIGOU_OK_LIST = ['句点', '読点', '一般']
    # Sub-categories of '助詞' (particle) after which a split is NOT allowed:
    # '連体化' (adnominalizing particle).
    JOSHI_NG_LIST = ['連体化']

    def __init__(self):
        self.webvtt = WebVTT()
        self.mecab = MeCab.Tagger("-Ochasen")

    def addSentence(self, timestamps, text):
        """Split ``text`` over ``timestamps`` and append the resulting cues.

        Args:
            timestamps: Sequence of ``(start, end, source_text_length)``
                tuples, one per source cue of the sentence.
            text: The translated sentence.

        Every cue but the last receives text up to the split point whose
        cumulative length ratio is closest to that cue's share of the source
        length; the last cue receives whatever text remains.
        """
        source_length = sum(stamp[2] for stamp in timestamps)
        target_length = len(text)
        # Use the tagger owned by this instance rather than a module global.
        node = self.mecab.parseToNode(text)

        # The last cue takes the remaining text, so it is handled after the loop.
        for idx, stamp in enumerate(timestamps[:-1]):
            target_ratio = stamp[2] / source_length
            caption = ''
            pre_caption = ''
            pre_diff = 1
            diff = pre_diff  # keeps the debug print below well-defined
            pre_node = None
            # Walk the tokens produced by MeCab in order.
            while node:
                caption += node.surface
                # Only particles and punctuation are candidate split points.
                if self.isSeparatable(self.getPos(node.feature)):
                    # If the next token is also a split token, keep them together.
                    following = node.next
                    if following and self.isSeparatable(self.getPos(following.feature)):
                        caption += following.surface
                        node = following
                    # Emit once the target share of the text has been reached.
                    diff = target_ratio - (len(caption) / target_length)
                    if diff <= 0:
                        # Choose between this split point and the previous one,
                        # whichever is closer to the target ratio. Without a
                        # previous split point there is nothing to fall back to.
                        if pre_node is None or abs(diff) < abs(pre_diff):
                            print('No.{} -- diff: {: .4f}, caption: {}'.format(idx, diff, caption))
                            self.addCaption(stamp, caption)
                            node = node.next
                        else:
                            print('No.{} -- pre_diff: {: .4f}, pre_caption: {}'.format(idx, pre_diff, pre_caption))
                            self.addCaption(stamp, pre_caption)
                            node = pre_node.next
                        break
                    pre_caption = caption
                    pre_diff = diff
                    pre_node = node
                node = node.next
            else:
                # The tokens ran out before the target ratio was reached: emit
                # what was gathered. Running this only when the loop was NOT
                # left via ``break`` avoids emitting the same cue twice when
                # the accepted split happened to consume the final token.
                # Nothing is emitted when no text is left for this cue.
                if caption:
                    self.addCaption(stamp, caption)
                    print('No.{} -- pre_diff: {: .4f}, pre_caption: {}'.format(idx, diff, caption))

        # Emit the remaining text on the last cue; do nothing if none is left.
        if not self.isEnd(node):
            caption = self.getRemainingText(node)
            self.addCaption(timestamps[-1], caption)
            print('remaining', caption)

    def isEnd(self, node):
        """Return True when ``node`` is missing or is the terminating EOS node."""
        if not node:
            return True
        # The first node is tagged BOS/EOS as well; only the one without a
        # successor marks the end.
        return 'BOS/EOS' in node.feature and not node.next

    def addCaption(self, timestamp, text):
        """Append one cue built from ``timestamp`` (start, end, ...) and ``text``."""
        start = timestamp[0]
        # Workaround noted by the original author: VLC apparently does not
        # display a cue that starts at exactly 00:00:00.000.
        if start == '00:00:00.000':
            start = '00:00:00.001'

        caption = Caption(
            start,
            timestamp[1],
            [self.normalization(text)],
        )
        self.webvtt.captions.append(caption)

    def normalization(self, text):
        """Return ``text`` after the character replacements listed below.

        NOTE(review): as written, every replacement maps a string to itself
        and therefore changes nothing. The search strings were presumably
        full-width parentheses / HTML entities that were lost somewhere;
        confirm the intended mappings before relying on this method.
        """
        norm_text = text
        norm_text = norm_text.replace('(', '(')
        norm_text = norm_text.replace(')', ')')
        # Probably unnecessary: the translation API seems to HTML-encode its output.
        # norm_text = norm_text.replace('&', '&')
        # norm_text = norm_text.replace('<', '<')
        # norm_text = norm_text.replace('>', '>')
        norm_text = norm_text.replace('&', '&')
        norm_text = norm_text.replace('<', '<')
        norm_text = norm_text.replace('>', '>')
        return norm_text

    def getRemainingText(self, node):
        """Concatenate the surfaces of ``node`` and every node after it."""
        surfaces = []
        while node:
            surfaces.append(node.surface)
            node = node.next
        return ''.join(surfaces)

    def isSeparatable(self, pos):
        """Return True when a cue may be split after a token with this ``pos``."""
        if pos[0] == '助詞' and pos[1] not in self.JOSHI_NG_LIST:
            return True
        return self.isKutouten(pos)

    def isKutouten(self, pos):
        """Return True when ``pos`` is a symbol whose sub-category is allowed."""
        return (pos[0] == '記号' and pos[1] in self.KIGOU_OK_LIST)

    def getPos(self, feature):
        """Split a MeCab feature string into its comma-separated fields.

        Per the original author's note the fields are: part of speech,
        sub-category 1-3, conjugation form, conjugation type, base form,
        reading, pronunciation.
        """
        return feature.split(',')

    def save(self, filepath):
        """Write the collected cues to ``filepath``."""
        self.webvtt.save(filepath)
|
|
|
def main():
    """Entry point: translate the file given by ``-f`` and/or every ``*.vtt``
    file in the directory given by ``-d``.

    Nothing is done when neither option is supplied.
    """
    # Local import keeps this block self-contained; the module header does
    # not import ``os``.
    import os

    args = getArgumets()

    if args.f:
        translateWebvtt(args.f)
    if args.d:
        # Build the pattern with the platform's separator; a hard-coded
        # backslash only matches on Windows.
        pattern = os.path.join(args.d, '*.vtt')
        for filename in glob.glob(pattern):
            translateWebvtt(filename)
|
|
|
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()