find_similar_sentence.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from gensim import corpora
from gensim import models
from scipy import spatial
import numpy as np
import csv
import sys
def get_sentences(file_name):
    # Each CSV row is one "sentence": a list of words, one per column.
    sentences = []
    with open(file_name, 'r') as f:
        reader = csv.reader(f)
        for sentence in reader:
            sentences.append(sentence)
    return sentences
def create_gensim_bow(sentences):
    # Map each word to an integer id, then turn every sentence into a
    # sparse bag-of-words list of (word_id, frequency) pairs.
    dictionary = corpora.Dictionary(sentences)
    return list(map(dictionary.doc2bow, sentences)), len(dictionary)
def apply_tfidf(corpus):
    # Re-weight raw counts by TF-IDF so that words shared by many
    # documents count for less.
    tfidf_model = models.TfidfModel(corpus)
    return tfidf_model[corpus]
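For reference, a minimal sketch of what these two steps produce, using made-up English tokens (the exact ids and weights depend on gensim's defaults):

from gensim import corpora, models

docs = [['map', 'beach', 'map'], ['beach', 'island']]  # hypothetical tokens
dictionary = corpora.Dictionary(docs)
print(dictionary.token2id)          # e.g. {'beach': 0, 'map': 1, 'island': 2}
print(dictionary.doc2bow(docs[0]))  # [(0, 1), (1, 2)] -- 'beach' x1, 'map' x2

bow = [dictionary.doc2bow(d) for d in docs]
tfidf = models.TfidfModel(bow)
print(list(tfidf[bow]))
# e.g. [[(1, 1.0)], [(2, 1.0)]] -- 'beach' appears in every document,
# so its idf weight is 0 and gensim drops it from the output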
def create_word_vectors(corpus, num_of_words):
    # Expand each sparse (word_id, weight) document into a dense vector
    # of length num_of_words; outputs an ndarray of shape (docs, vocab).
    word_vectors = []
    for id_weight_pairs in corpus:
        word_vector = [0] * num_of_words
        for word_id, weight in id_weight_pairs:
            word_vector[word_id] = weight
        word_vectors.append(word_vector)
    return np.array(word_vectors)
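gensim also ships a helper that performs the same densification; a minimal alternative sketch (corpus2dense returns a terms-by-documents matrix, hence the transpose):

from gensim import matutils

def create_word_vectors_gensim(corpus, num_of_words):
    # corpus2dense returns shape (num_terms, num_docs); transpose it to
    # match the one-row-per-document layout used above.
    return matutils.corpus2dense(corpus, num_terms=num_of_words).T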
def calculate_cos_similarity(vector1, vector2):
    # Cosine similarity = dot(v1, v2) / (|v1| * |v2|).  scipy's
    # cosine() returns the *distance*, i.e. 1 - similarity.
    return 1 - spatial.distance.cosine(vector1, vector2)
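A tiny worked check of this helper (both results are exact for these vectors):

print(calculate_cos_similarity([1, 0], [1, 0]))  # 1.0 -- same direction
print(calculate_cos_similarity([1, 0], [0, 1]))  # 0.0 -- orthogonal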
def find_most_similar_id(target_id, word_vectors):
    # Linear scan for the vector most similar to word_vectors[target_id].
    best = {'id': 0, 'similarity': 0}
    for index, word_vector in enumerate(word_vectors):
        if index != target_id:  # '!=', not 'is not': compare values, not identity
            similarity = calculate_cos_similarity(
                word_vector, word_vectors[target_id])
            if similarity > best['similarity']:
                best = {'id': index, 'similarity': similarity}
    return best
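The pairwise loop is fine for small corpora; for larger ones the same search can be done in one shot with numpy. A sketch, assuming no row of word_vectors is all zeros:

import numpy as np

def find_most_similar_id_vectorized(target_id, word_vectors):
    # Normalize every row, dot the matrix with the target row, mask out
    # the target itself, and take the argmax.
    norms = np.linalg.norm(word_vectors, axis=1)
    normalized = word_vectors / norms[:, np.newaxis]
    similarities = normalized @ normalized[target_id]
    similarities[target_id] = -np.inf  # exclude the query itself
    best = int(np.argmax(similarities))
    return {'id': best, 'similarity': float(similarities[best])}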
if __name__ == '__main__':
    # ex)
    # python find_similar_sentence.py input.csv vectors.npy
    import_file = sys.argv[1]
    vector_file = sys.argv[2]

    sentences = get_sentences(import_file)
    print('finish loading sentences')

    # enable these lines when you need to (re)build the word vectors
    corpus, num_of_words = create_gensim_bow(sentences)
    corpus_tfidf = apply_tfidf(corpus)
    word_vectors = create_word_vectors(corpus_tfidf, num_of_words)
    np.save(vector_file, word_vectors)
    print('finish saving vectors')

    # enable these lines to load precomputed word vectors from the .npy file
    # word_vectors = np.load(vector_file)
    # print('finish loading vectors')

    # find the sentence most similar to sentence 0 and print both
    similar = find_most_similar_id(0, word_vectors)
    print(similar)
    print(sentences[0])
    print(sentences[similar['id']])
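A quick way to sanity-check the pipeline without preparing a CSV is to feed the functions toy word lists directly (the tokens below are made up):

sentences = [['beach', 'map', 'island'],
             ['beach', 'map', 'paradise'],
             ['fbi', 'police', 'terrorist']]
corpus, num_of_words = create_gensim_bow(sentences)
word_vectors = create_word_vectors(apply_tfidf(corpus), num_of_words)
print(find_most_similar_id(0, word_vectors))
# expected id: 1 -- sentences 0 and 1 share 'beach' and 'map',
# while sentence 2 has no overlap at all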
input.csv -- sample rows. Each line is the comma-separated list of Japanese nouns extracted from one movie synopsis (row 1 begins, roughly: "Asia, carefree, journey, one day, Bangkok, suicide, man, ..."):
アジア,気まま,旅,ある日,バンコク,自殺,男,一,枚,地図,それ,伝説,楽園,ビーチ,地図,旅の途中,カップル,一緒に,島
見ず知らず,差出人,はず,父,死,酒,勢い,差出人,もと,旅,ビリヤード,場,好き,楽器,気まま,生活,遊び,心
FBI,ニューヨーク,警察,共同,組織,テロリズム,対策,本部長,アンソニー,ハバード,ブルックリン,テロリスト,ハイジャック,事件,CIA,容疑者,逮捕,彼女,裏の顔
make_input_csv.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from natto import MeCab
import csv
import sys
import glob
def make_input_csv(import_directory_name, export_file_name):
    # Tokenize every synopsis file in the directory with MeCab and
    # write one noun list per CSV row.  The -F format prints:
    # surface, POS, POS subtype 1, POS subtype 2.  The tagger is
    # opened once here rather than once per row.
    import_file_names = glob.glob('{}/*'.format(import_directory_name))
    with open(export_file_name, 'w') as export_file, \
            MeCab(r'-F%m,%f[0],%f[1],%f[2]') as nm:
        writer = csv.writer(export_file, lineterminator='\n')
        for import_file_name in import_file_names:
            print(import_file_name)
            with open(import_file_name, 'r') as import_file:
                for sentence in csv.reader(import_file):
                    words = []
                    for n in nm.parse(sentence[0], as_nodes=True):
                        if n.is_eos():
                            continue
                        word_and_type = n.feature.split(',')
                        # keep nouns (名詞), but skip proper nouns
                        # (固有名詞) that are person names (人名)
                        if word_and_type[1] == '名詞' and not (
                                word_and_type[2] == '固有名詞'
                                and word_and_type[3] == '人名'):
                            words.append(word_and_type[0])
                    writer.writerow(words)
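The noun filter is easiest to check in isolation. A small sketch with IPADIC-style feature strings; the sample features are assumptions for illustration, not captured MeCab output:

def keep_noun(feature):
    # feature follows the '%m,%f[0],%f[1],%f[2]' format used above:
    # surface, POS, POS subtype 1, POS subtype 2
    surface, pos, sub1, sub2 = feature.split(',')[:4]
    return pos == '名詞' and not (sub1 == '固有名詞' and sub2 == '人名')

print(keep_noun('地図,名詞,一般,*'))              # True: common noun ('map')
print(keep_noun('アンソニー,名詞,固有名詞,人名'))  # False: proper noun, person name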
if __name__ == '__main__':
    # ex)
    # python make_input_csv.py dir_name input.csv
    import_directory_name = sys.argv[1]
    export_file_name = sys.argv[2]
    make_input_csv(import_directory_name, export_file_name)