Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import sys
from gensim.models import doc2vec
# Lyric-similarity script: train a Doc2Vec model on a lyrics corpus, save it,
# then print the songs whose lyrics are closest to the song named on the
# command line.
#
# Usage:  python <this script> SONG_TITLE
# Inputs: 'lyric.txt' and 'title.txt' in the current working directory.
# NOTE(review): the lookup below assumes line i of title.txt is the title of
# the lyric on line i of lyric.txt -- confirm against how the files are built.

# Load the lyrics. TaggedLineDocument treats each line of the file as one
# document and tags it with its line number.
lyrics = doc2vec.TaggedLineDocument('lyric.txt')

# Load the song titles, one per line.
with codecs.open('title.txt', 'r', encoding='utf-8') as f:
    # Strip the whole line terminator ('\n' or '\r\n'); removing only '\n'
    # would leave a stray '\r' on CRLF files and break the title lookup.
    title_list = [title.rstrip('\r\n') for title in f.readlines()]

# Build the model.
# NOTE(review): `size`, `iter` and `model.docvecs` are the pre-4.0 gensim
# names (4.0 renamed them to `vector_size`, `epochs` and `model.dv`); they are
# kept as written so the script still targets the gensim it was written for.
model = doc2vec.Doc2Vec(lyrics, size=200, window=5, min_count=1, workers=4, dm=0, iter=20)

# Save the model (native gensim format and word2vec text format).
model.save('nana.model')
model.save_word2vec_format('nana.w2vmodel')

# Read the song title from the command line. This is deliberately done after
# training/saving so that the model files are written exactly as before, even
# when the argument is missing or unknown.
if len(sys.argv) < 2:
    sys.exit('usage: %s SONG_TITLE' % sys.argv[0])
raw_title = sys.argv[1]
# argv is a byte string on Python 2 and already text on Python 3.
if isinstance(raw_title, bytes):
    target_song_title = raw_title.decode('utf-8')
else:
    target_song_title = raw_title

try:
    song_index = title_list.index(target_song_title)
except ValueError:
    sys.exit('song title not found in title.txt: %s' % raw_title)

# most_similar returns a list of (document tag, similarity) tuples for the
# most similar lyrics (top 10 by default).
similar_lyrics = model.docvecs.most_similar(song_index)

# Single-argument print(...) produces the same output under the Python 2
# print statement and the Python 3 print function.
print(title_list[song_index] + u' に似ている曲は')
for doc_index, similarity in similar_lyrics:
    print(u'%s %s' % (title_list[doc_index], similarity))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment