Skip to content

Instantly share code, notes, and snippets.

@monisoi
Created July 21, 2018 11:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save monisoi/f937ed9cd3ac06e8b1460160d2d4e74b to your computer and use it in GitHub Desktop.
Save monisoi/f937ed9cd3ac06e8b1460160d2d4e74b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from gensim import corpora
from gensim import models
from scipy import spatial
import numpy as np
import csv
import sys
def get_sentences(file_name):
    """Load a CSV file and return its rows as a list of token lists.

    Each row of the CSV is one tokenized sentence (one token per column).
    """
    with open(file_name, 'r') as f:
        return [row for row in csv.reader(f)]
def create_gensim_bow(sentences):
    """Build a gensim bag-of-words corpus from tokenized sentences.

    :param sentences: list of token lists (one list per sentence).
    :return: tuple of (corpus, vocabulary size) where corpus is a list of
        gensim (token_id, count) sparse bag-of-words vectors.
    """
    dictionary = corpora.Dictionary(sentences)
    # BUG FIX: the original evaluated `dictionary.token2id` here and discarded
    # the result — a no-op expression statement, removed.
    return list(map(dictionary.doc2bow, sentences)), len(dictionary)
def apply_tfidf(corpus):
    """Fit a TF-IDF model on `corpus` and return the reweighted corpus."""
    tfidf = models.TfidfModel(corpus)
    return tfidf[corpus]
def create_word_vectors(corpus, num_of_words):
    """Densify a sparse gensim corpus into a 2-D ndarray.

    :param corpus: iterable of documents, each a list of (token_id, weight).
    :param num_of_words: vocabulary size — the width of each dense row.
    :return: numpy array of shape (len(corpus), num_of_words).
    """
    dense_rows = []
    for document in corpus:
        row = [0] * num_of_words
        for token_id, weight in document:
            row[token_id] = weight
        dense_rows.append(row)
    return np.array(dense_rows)
def calculate_cos_similarity(vector1, vector2):
    """Return the cosine similarity between two vectors.

    scipy's `cosine` is a distance (1 - similarity), so invert it back.
    """
    distance = spatial.distance.cosine(vector1, vector2)
    return 1 - distance
def find_most_similar_id(target_id, word_vectors):
    """Find the vector most cosine-similar to word_vectors[target_id].

    :param target_id: index of the query vector (excluded from candidates).
    :param word_vectors: sequence of dense word vectors.
    :return: dict {'id': index, 'similarity': score}; falls back to
        {'id': 0, 'similarity': 0} when no candidate scores above 0
        (original behavior, kept).
    """
    target_vector = word_vectors[target_id]
    best = {'id': 0, 'similarity': 0}
    for index, word_vector in enumerate(word_vectors):
        # BUG FIX: the original compared `index is not target_id` — an
        # identity check that only works for CPython's small-int cache
        # (roughly -5..256); use value inequality instead.
        if index != target_id:
            similarity = calculate_cos_similarity(word_vector, target_vector)
            if similarity > best['similarity']:
                best = {'id': index, 'similarity': similarity}
    # (also renamed local `max`, which shadowed the builtin)
    return best
if __name__ == '__main__':
    # Usage:
    #   python find_smilar_sentence.py input.csv vectors.npy
    import_file = sys.argv[1]
    vector_file = sys.argv[2]

    sentences = get_sentences(import_file)
    print('finish loading sentences')

    # Build TF-IDF-weighted dense vectors and persist them for later runs.
    corpus, num_of_words = create_gensim_bow(sentences)
    word_vectors = create_word_vectors(apply_tfidf(corpus), num_of_words)
    np.save(vector_file, word_vectors)
    print('finish saving vectors')

    # Alternative path: reload previously saved vectors instead of rebuilding.
    # word_vectors = np.load(vector_file)
    # print('finish loading vectors')

    similar = find_most_similar_id(0, word_vectors)
    print(similar)
    print(sentences[0])
    print(sentences[similar['id']])
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 19 columns, instead of 18. in line 1.
アジア,気まま,旅,ある日,バンコク,自殺,男,一,枚,地図,それ,伝説,楽園,ビーチ,地図,旅の途中,カップル,一緒に,島
見ず知らず,差出人,はず,父,死,酒,勢い,差出人,もと,旅,ビリヤード,場,好き,楽器,気まま,生活,遊び,心
FBI,ニューヨーク,警察,共同,組織,テロリズム,対策,本部長,アンソニー,ハバード,ブルックリン,テロリスト,ハイジャック,事件,CIA,容疑者,逮捕,彼女,裏の顔
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from natto import MeCab
import csv
import sys
import glob
def make_input_csv(import_directory_name, export_file_name):
    """Tokenize every CSV under a directory and write noun lists to one CSV.

    For each row of each input file, the first column is run through MeCab
    and only nouns (名詞) are kept, excluding proper-noun person names
    (固有名詞 / 人名). Each output row is the noun list for one sentence.

    :param import_directory_name: directory whose files are all read as CSV.
    :param export_file_name: path of the CSV written with the noun lists.
    """
    import_file_names = glob.glob('{}/*'.format(import_directory_name))
    with open(export_file_name, 'w') as export_file:
        # PERF FIX: the original re-created the csv writer per input file and,
        # worse, a fresh MeCab tagger per *sentence*; one of each suffices.
        writer = csv.writer(export_file, lineterminator='\n')
        with MeCab(r'-F%m,%f[0],%f[1],%f[2]') as nm:
            for import_file_name in import_file_names:
                print(import_file_name)
                with open(import_file_name, 'r') as import_file:
                    for sentence in csv.reader(import_file):
                        words = []
                        for n in nm.parse(sentence[0], as_nodes=True):
                            if n.is_eos():
                                continue
                            word_and_type = n.feature.split(',')
                            # keep nouns, but skip proper-noun person names
                            if word_and_type[1] == '名詞' and not (
                                    word_and_type[2] == '固有名詞'
                                    and word_and_type[3] == '人名'):
                                words.append(word_and_type[0])
                        writer.writerow(words)
if __name__ == '__main__':
    # Usage:
    #   python make_input_csv.py dir_name input.csv
    source_directory, output_csv = sys.argv[1], sys.argv[2]
    make_input_csv(source_directory, output_csv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment