Created
April 23, 2019 15:46
-
-
Save cordx56/09ea955374ce7cffd50aa30efd714406 to your computer and use it in GitHub Desktop.
お手軽Word2Vec
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import math | |
import numpy as np | |
model = None | |
def calcIDF(filteredDocs, word):
    """Return a smoothed inverse document frequency for ``word``.

    ``filteredDocs`` maps each document to its collection of tokens.  The
    score is ``log(N / df) + 1``, where ``N`` is the number of documents and
    ``df`` is the number of documents whose tokens contain ``word``.  A word
    found in no document gets the fixed fallback weight ``5``.
    """
    containing = sum(1 for tokens in filteredDocs.values() if word in tokens)
    if containing == 0:
        # No document frequency to divide by: use the fixed fallback weight.
        return 5
    return math.log(len(filteredDocs) / containing) + 1
def w2v(word):
    """Return the embedding of ``word`` from the module-level ``model``.

    Words missing from ``model.wv`` map to a 200-element zero vector so that
    they contribute nothing when vectors are summed.
    """
    if word not in model.wv:
        return np.zeros(200)
    return model.wv[word]
# Memo of already computed vectors:
#   id(filteredDocs) -> {tuple(tokens): vector}
calcdVec = {}


def getVec(filteredDocs, filtered):
    """Return the IDF-weighted mean word vector for the token list ``filtered``.

    Each token's vector (from ``w2v``) is scaled by its IDF within
    ``filteredDocs`` (from ``calcIDF``) and divided by the number of tokens;
    the scaled vectors are summed into a 200-element array.  An empty token
    list yields the zero vector.

    Results are memoised per corpus object and per token sequence.  The
    corpus is identified by ``id(filteredDocs)``, so a cached entry is only
    meaningful while that same dict object is alive and unchanged.  The
    returned array is the cached object itself; callers should not modify it
    in place.
    """
    cache = calcdVec.setdefault(id(filteredDocs), {})

    # A tuple key keeps token boundaries intact.  Joining the tokens with ","
    # would make ["1,000"] and ["1", "000"] share one cache slot.
    key = tuple(filtered)
    if key in cache:
        return cache[key]

    vec = np.zeros(200)
    for token in filtered:
        vec += calcIDF(filteredDocs, token) / len(filtered) * w2v(token)

    cache[key] = vec
    return vec
def calcVecCos(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    When the product of the two norms is zero (at least one vector has zero
    length) the similarity is reported as ``0`` instead of dividing by zero.
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0
    return np.dot(v1, v2) / denom
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import usemecab | |
import calc | |
import sys | |
from gensim.models import word2vec | |
# Usage: <script> MODEL_PATH DOCS_PATH QUERY
#   MODEL_PATH  saved gensim Word2Vec model
#   DOCS_PATH   text file with one document per line
#   QUERY       text to rank the documents against

# calc reads the model through its module-level ``model`` variable.
calc.model = word2vec.Word2Vec.load(sys.argv[1])

filteredDocs = {}
result = {}

with open(sys.argv[2]) as f:
    for line in f:
        # Strip only the line terminator.  Slicing off the last character
        # unconditionally would eat a real character from a final line that
        # has no trailing newline.
        doc = line.rstrip("\n")
        filteredDocs[doc] = usemecab.filter(usemecab.splitMeCab(doc))

# Vector of the query, weighted by IDF over the loaded documents.
searchVec = calc.getVec(filteredDocs, usemecab.filter(usemecab.splitMeCab(sys.argv[3])))

for key, val in filteredDocs.items():
    result[key] = calc.calcVecCos(searchVec, calc.getVec(filteredDocs, val))

# Print documents from most to least similar as "<score> <document>".
for key, val in sorted(result.items(), key=lambda x: -x[1]):
    print(str(val) + " " + key)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[[source]] | |
name = "pypi" | |
url = "https://pypi.org/simple" | |
verify_ssl = true | |
[dev-packages] | |
[packages] | |
gensim = "*" | |
paramiko = "*" | |
mecab-python3 = "*" | |
[requires] | |
python_version = "3.7" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from gensim.models import word2vec | |
import logging | |
import sys | |
# Usage: <script> CORPUS_PATH MODEL_PATH
#   CORPUS_PATH  whitespace-tokenised text, one sentence per line
#   MODEL_PATH   where the trained model is saved
import gensim

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

data = word2vec.LineSentence(sys.argv[1])

# gensim 4 renamed the embedding-dimension keyword from ``size`` to
# ``vector_size``; pick the spelling the installed release understands so the
# script runs on either major version.
dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

model = word2vec.Word2Vec(data, window=5, min_count=20, workers=14, **{dim_kwarg: 200})
model.save(sys.argv[2])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import MeCab | |
import sys | |
# Shared tagger; the dictionary path is fixed to the mecab-ipadic-neologd install.
mecab = MeCab.Tagger("-d /usr/lib/mecab/dic/mecab-ipadic-neologd")


def splitMeCab(text):
    """Parse ``text`` with MeCab and return its nodes as nested lists.

    Every output line is split on tabs; the second field is further split on
    commas, giving entries shaped like ``[surface, [feature, ...]]``.
    Collection stops at the first line with fewer than two tab-separated
    fields, and that line is not included.
    """
    nodes = []
    for row in mecab.parse(text).split("\n"):
        fields = row.split("\t")
        if len(fields) < 2:
            # Line without a feature column: nothing more to collect.
            break
        fields[1] = fields[1].split(",")
        nodes.append(fields)
    return nodes
def isIndep(node):
    """Tell whether a parsed node is kept as a content word.

    ``node`` is an entry shaped like ``[surface, [feature, ...]]``; the node
    qualifies when its first feature is one of "名詞", "動詞" or "形容詞".
    """
    # A disabled alternative rule accepted everything except three excluded
    # categories; the allow-list below is the rule in effect.
    first_feature = node[1][0]
    return first_feature in ("名詞", "動詞", "形容詞")
def filter(parse):
    """Return the surface forms of the nodes in ``parse`` accepted by ``isIndep``.

    ``parse`` is a sequence of entries whose first element is the surface
    string; input order is preserved.
    """
    return [node[0] for node in parse if isIndep(node)]
if __name__ == "__main__":
    # Debug helper: dump the parse of the first command-line argument, then
    # show each node with its index and whether isIndep() accepts it.
    # The text is parsed once and the result reused for both outputs.
    nodes = splitMeCab(sys.argv[1])
    print(nodes)
    for i, v in enumerate(nodes):
        print(str(i) + ": " + str(v))
        print(isIndep(v))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment