Last active
July 13, 2018 06:39
-
-
Save nanonanomachine/00b936ca5634c661ef506150d211c2d6 to your computer and use it in GitHub Desktop.
Word2Vec
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib import request | |
import pandas as pd | |
import MeCab | |
import mojimoji | |
import re | |
# Load the support-mail corpus (one mail per row; tokenization reads the
# "body" column further down).
df = pd.read_csv('./csv/support-mail/support-mails-from-2016.csv')

# Build a combined Japanese + English stopword list from the SlothLib project.
# `with` closes each HTTP response explicitly — the original leaked the
# connections by never closing them.
stopwords = []
for url in (
    "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt",
    "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/English.txt",
):
    with request.urlopen(url) as response:
        stopwords.extend(line.decode("utf-8").strip() for line in response)

# MeCab tokenizer in wakati (space-separated surface form) output mode,
# using the NEologd dictionary installed via Homebrew.
tagger = MeCab.Tagger('-Owakati -d /usr/local/Cellar/mecab/0.996/lib/mecab/dic/mecab-ipadic-neologd')
def tokenize(text):
    """Tokenize *text* into a stopword-filtered list of surface forms.

    Half-width kana are normalized to full-width (digits and ASCII left
    alone), digit runs are collapsed to a single '0' so numbers don't
    explode the vocabulary, then MeCab splits the text on whitespace.
    """
    normalized = mojimoji.han_to_zen(text, digit=False, ascii=False)
    masked = re.sub(r'\d+', '0', normalized)
    tokens = tagger.parse(masked).split()
    # Drop tokens that appear in the module-level stopword list.
    return [token for token in tokens if token not in stopwords]
def review_to_sentences(review):
    """Split *review* into sentences and tokenize each non-empty one.

    Returns a list of token lists, one per sentence, suitable as
    Word2Vec training input.
    """
    # Normalize half-width '!'/'?' to their full-width forms so the
    # sentence-splitting pattern below only has to match one variant.
    regularized_review = review.translate(str.maketrans('。!?', '。!?'))
    # Raw-string character class instead of '。|\!|\?': the original
    # non-raw string contained invalid escape sequences (a Deprecation/
    # SyntaxWarning on modern CPython). Same full-width delimiters.
    raw_sentences = re.split(r'[。!?]', regularized_review)
    # Splitting leaves empty strings next to trailing delimiters; skip them.
    return [tokenize(s) for s in raw_sentences if len(s) > 0]
# Flatten every mail body into one corpus-wide list of tokenized sentences.
print("Parsing sentences from training set")
sentences = []
for body in df["body"]:
    sentences.extend(review_to_sentences(body))
# Configure the built-in logging module so gensim's Word2Vec emits
# timestamped progress messages during training.
import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)
# Word2Vec hyper-parameters.
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Ignore words with fewer total occurrences
num_workers = 4       # Number of worker threads used during training
context = 10          # Context window size
downsampling = 1e-3   # Downsampling threshold for frequent words

# Initialize and train the model (this will take some time).
from gensim.models import word2vec

print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,        # NOTE(review): renamed `vector_size` in gensim >= 4.0
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment