Skip to content

Instantly share code, notes, and snippets.

@nihemak
Created December 5, 2016 12:41
Show Gist options
  • Save nihemak/57e64a8959fa36e35a17cecc77e7f85c to your computer and use it in GitHub Desktop.
Save nihemak/57e64a8959fa36e35a17cecc77e7f85c to your computer and use it in GitHub Desktop.
# en2ja_data_download.shで落としてきたデータから英日翻訳の訓練データを作成するスクリプト
import random
# 和文と英文を取り出す
en_sentences, ja_sentences = {}, {}
lines = open('sentences.csv').read().split('\n')
for i in range(len(lines)):
if lines[i] == '':
continue
no, lang, sentence = lines[i].split('\t')
if lang == 'eng':
en_sentences[no] = sentence
elif lang == 'jpn':
ja_sentences[no] = sentence
import MeCab
mecab = MeCab.Tagger("-Owakati")
# 対訳データを作る
lines = open('jpn_indices.csv').read().split('\n')
# 今回はデータサイズを制限する
random.shuffle(lines)
data_size = 2500
with open('en.txt', 'w') as en_file, open('ja.txt', 'w') as ja_file:
for i in range(len(lines)):
if lines[i] == '':
continue
ja_no, en_no, _ = lines[i].split('\t')
if en_no == -1:
continue
if en_no not in en_sentences:
continue
if ja_no not in ja_sentences:
continue
# 英文は小文字に
en_file.write(en_sentences[en_no].lower())
en_file.write('\n')
# 和文は分かち書きにする
ja_file.write(mecab.parse(ja_sentences[ja_no]))
# 必要なデータサイズに達したので終了
data_size -= 1
if data_size <= 0:
break
@nihemak
Copy link
Author

nihemak commented Dec 5, 2016

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment