Last active
July 13, 2018 06:39
-
-
Save nanonanomachine/00b936ca5634c661ef506150d211c2d6 to your computer and use it in GitHub Desktop.
Word2Vec
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib import request | |
import pandas as pd | |
import MeCab | |
import mojimoji | |
import re | |
# Load the support-mail corpus (one mail per row; tokenization reads the
# "body" column further down).
df = pd.read_csv('./csv/support-mail/support-mails-from-2016.csv')

# Build a combined Japanese + English stopword list from the SlothLib project.
# `with` closes each HTTP response explicitly — the original leaked the
# connections by never closing them.
stopwords = []
for url in (
    "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt",
    "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/English.txt",
):
    with request.urlopen(url) as response:
        stopwords.extend(line.decode("utf-8").strip() for line in response)

# MeCab tokenizer in wakati (space-separated surface form) output mode,
# using the NEologd dictionary installed via Homebrew.
tagger = MeCab.Tagger('-Owakati -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/mecab-ipadic-neologd')
def tokenize(text):
    """Tokenize *text* into a stopword-filtered list of surface forms.

    Half-width kana are normalized to full-width (digits and ASCII left
    alone), digit runs are collapsed to a single '0' so numbers don't
    explode the vocabulary, then MeCab splits the text on whitespace.
    """
    normalized = mojimoji.han_to_zen(text, digit=False, ascii=False)
    masked = re.sub(r'\d+', '0', normalized)
    tokens = tagger.parse(masked).split()
    # Drop tokens that appear in the module-level stopword list.
    return [token for token in tokens if token not in stopwords]
def review_to_sentences(review):
    """Split *review* into sentences and tokenize each non-empty one.

    Returns a list of token lists, one per sentence, suitable as
    Word2Vec training input.
    """
    # Normalize half-width '!'/'?' to their full-width forms so the
    # sentence-splitting pattern below only has to match one variant.
    regularized_review = review.translate(str.maketrans('。!?', '。!?'))
    # Raw-string character class instead of '。|\!|\?': the original
    # non-raw string contained invalid escape sequences (a Deprecation/
    # SyntaxWarning on modern CPython). Same full-width delimiters.
    raw_sentences = re.split(r'[。!?]', regularized_review)
    # Splitting leaves empty strings next to trailing delimiters; skip them.
    return [tokenize(s) for s in raw_sentences if len(s) > 0]
# Flatten every mail body into one corpus-wide list of tokenized sentences.
print("Parsing sentences from training set")
sentences = []
for body in df["body"]:
    sentences.extend(review_to_sentences(body))
# Configure the built-in logging module so gensim's Word2Vec emits
# timestamped progress messages during training.
import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)
# Word2Vec hyper-parameters.
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Ignore words with fewer total occurrences
num_workers = 4       # Number of worker threads used during training
context = 10          # Context window size
downsampling = 1e-3   # Downsampling threshold for frequent words

# Initialize and train the model (this will take some time).
from gensim.models import word2vec

print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,        # NOTE(review): renamed `vector_size` in gensim >= 4.0
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment