Skip to content

Instantly share code, notes, and snippets.

@nanonanomachine
Last active July 13, 2018 06:39
Show Gist options
  • Save nanonanomachine/00b936ca5634c661ef506150d211c2d6 to your computer and use it in GitHub Desktop.
Word2Vec
from urllib import request
import pandas as pd
import MeCab
import mojimoji
import re
# Load the support-mail corpus; one mail body per row in the "body" column.
df = pd.read_csv('./csv/support-mail/support-mails-from-2016.csv')

# Stop-word lists (SlothLib Japanese + English).
# Stored in a set so membership tests during tokenization are O(1)
# instead of O(n) per token over a list.
_STOPWORD_URLS = (
    "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt",
    "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/English.txt",
)
stopwords = set()
for url in _STOPWORD_URLS:
    # Context manager closes the HTTP response deterministically.
    with request.urlopen(url) as res:
        stopwords.update(line.decode("utf-8").strip() for line in res)

# MeCab tokenizer in wakati (space-separated) output mode, using the
# NEologd dictionary for better coverage of neologisms/named entities.
tagger = MeCab.Tagger('-Owakati -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/mecab-ipadic-neologd')
def tokenize(text):
    """Tokenize Japanese text into a list of words, dropping stop words.

    Half-width kana are normalized to full-width (digits and ASCII are
    left untouched), every digit run is collapsed to a single '0', and
    MeCab performs the word segmentation.
    """
    normalized = mojimoji.han_to_zen(text, digit=False, ascii=False)
    normalized = re.sub(r'\d+', '0', normalized)
    words = tagger.parse(normalized).split()
    return [word for word in words if word not in stopwords]
def review_to_sentences(review):
    """Split one mail body into sentences and tokenize each of them.

    Full-width '!' and '?' are first mapped to their half-width forms so a
    single split pattern covers both variants; the text is then split on
    the sentence terminators '。', '!' and '?'. Empty fragments produced by
    trailing terminators are dropped.

    Returns a list of token lists, one per non-empty sentence.
    """
    # Map full-width !/? to half-width; '。' maps to itself.
    regularized_review = review.translate(str.maketrans('。!?', '。!?'))
    # Raw string: the original non-raw '\!' was an invalid escape sequence
    # (DeprecationWarning). '\?' is kept because '?' is a regex metacharacter.
    raw_sentences = re.split(r'。|!|\?', regularized_review)
    return [tokenize(s) for s in raw_sentences if len(s) > 0]
# Tokenize every mail body in the corpus into per-sentence word lists.
print("Parsing sentences from training set")
sentences = []
for body in df["body"]:
    sentences.extend(review_to_sentences(body))
# Configure the built-in logging module so that gensim's Word2Vec
# emits nicely formatted progress messages during training.
import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)
# set values for various parameters
num_features = 300 # Word vector dimensionality
min_word_count = 40 #Minimum word count
num_workers = 4 # Number of threads to run in parallel
context = 10 # Context window size
downsampling = 1e-3 #Downsample setting for frequent words
# Initialize and train the model(this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, \
size=num_features, min_count= min_word_count, \
window = context, sample = downsampling)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment