Skip to content

Instantly share code, notes, and snippets.

@cordx56
Created April 23, 2019 15:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cordx56/09ea955374ce7cffd50aa30efd714406 to your computer and use it in GitHub Desktop.
Save cordx56/09ea955374ce7cffd50aa30efd714406 to your computer and use it in GitHub Desktop.
お手軽Word2Vec
#!/usr/bin/env python3
import math
import numpy as np
model = None
def calcIDF(filteredDocs, word):
    """Return an inverse-document-frequency weight for ``word``.

    ``filteredDocs`` maps each document to its collection of tokens. The
    weight is ``log(N / df) + 1`` where ``N`` is the number of documents and
    ``df`` the number of documents whose tokens contain ``word``. A word
    found in no document gets the fixed weight ``5``.
    """
    containing = sum(1 for tokens in filteredDocs.values() if word in tokens)
    if containing == 0:
        # Fixed fallback weight for words absent from every document
        # (this also avoids dividing by zero).
        return 5
    return math.log(len(filteredDocs) / containing) + 1
def w2v(word):
    """Return the embedding of ``word`` from the module-level ``model``.

    Words that ``model.wv`` does not contain are mapped to a 200-element
    zero vector, so they contribute nothing when vectors are summed.
    """
    vectors = model.wv
    if word not in vectors:
        return np.zeros(200)
    return vectors[word]
# Cache of already computed vectors: {id(corpus dict): {token tuple: vector}}.
calcdVec = {}


def getVec(filteredDocs, filtered):
    """Return the IDF-weighted mean word vector of the tokens in ``filtered``.

    Each token's embedding (``w2v``) is scaled by its IDF weight within
    ``filteredDocs`` (``calcIDF``) and the scaled vectors are averaged over
    ``len(filtered)``. An empty token list yields a 200-element zero vector.

    Results are memoised in ``calcdVec``. The outer key is
    ``id(filteredDocs)``, so the cache is only valid while that corpus
    object stays alive and unmodified. The returned array is the cached
    object itself, not a copy.
    """
    perCorpus = calcdVec.setdefault(id(filteredDocs), {})
    # A tuple keeps token boundaries intact. A comma-joined string would make
    # ["a,b"] and ["a", "b"] share one cache entry and return the wrong vector.
    cacheKey = tuple(filtered)
    if cacheKey in perCorpus:
        return perCorpus[cacheKey]
    vec = np.zeros(200)
    for token in filtered:
        vec += calcIDF(filteredDocs, token) / len(filtered) * w2v(token)
    perCorpus[cacheKey] = vec
    return vec
def calcVecCos(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    When the product of the two norms is zero (at least one zero vector)
    the similarity is defined as ``0`` instead of dividing by zero.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0
    return np.dot(v1, v2) / denominator
#!/usr/bin/env python3
import usemecab
import calc
import sys
from gensim.models import word2vec
# Usage: <script> MODEL_PATH DOCS_PATH QUERY
# Ranks every line of DOCS_PATH by cosine similarity to QUERY.

# Load the trained model and hand it to calc, whose w2v() reads calc.model.
calc.model = word2vec.Word2Vec.load(sys.argv[1])
filteredDocs = {}
result = {}
with open(sys.argv[2]) as f:
    for line in f:
        # Strip only the line terminator. Slicing with [:-1] would drop a real
        # character when the last line of the file has no trailing newline.
        doc = line.rstrip("\n")
        filteredDocs[doc] = usemecab.filter(usemecab.splitMeCab(doc))
# Vector of the query, weighted with IDF values taken from the loaded corpus.
searchVec = calc.getVec(filteredDocs, usemecab.filter(usemecab.splitMeCab(sys.argv[3])))
for key, val in filteredDocs.items():
    result[key] = calc.calcVecCos(searchVec, calc.getVec(filteredDocs, val))
# Print "<similarity> <document>" lines, most similar first.
for key, val in sorted(result.items(), key = lambda x: -x[1]):
    print(str(val) + " " + key)
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
[packages]
gensim = "*"
paramiko = "*"
mecab-python3 = "*"
[requires]
python_version = "3.7"
#!/usr/bin/env python3
from gensim.models import word2vec
import logging
import sys
# Usage: <script> CORPUS_PATH MODEL_OUT_PATH
# Trains a 200-dimensional Word2Vec model on CORPUS_PATH, read through
# LineSentence, and saves it to MODEL_OUT_PATH.
logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.INFO)
data = word2vec.LineSentence(sys.argv[1])
trainParams = dict(window = 5, min_count = 20, workers = 14)
try:
    # gensim >= 4.0 names the embedding dimension ``vector_size``.
    model = word2vec.Word2Vec(data, vector_size = 200, **trainParams)
except TypeError:
    # gensim 3.x only knows the older ``size`` keyword. The unknown keyword is
    # rejected when the arguments are bound, so no training ran in the
    # failed attempt above.
    model = word2vec.Word2Vec(data, size = 200, **trainParams)
model.save(sys.argv[2])
#!/usr/bin/env python3
import MeCab
import sys
# Shared tagger, configured with the dictionary at the path given after -d.
mecab = MeCab.Tagger("-d /usr/lib/mecab/dic/mecab-ipadic-neologd")


def splitMeCab(text):
    """Parse ``text`` with the shared tagger and return its tokens.

    Every output line of the tagger is split on tabs, and the second field
    is further split on commas, so each token has the form
    ``[surface, [feature, ...]]``. Collection stops at the first line that
    has no tab (presumably MeCab's "EOS" end marker -- confirm against the
    tagger's output format).
    """
    tokens = []
    for row in mecab.parse(text).split("\n"):
        fields = row.split("\t")
        if len(fields) < 2:
            # Not a "surface<TAB>features" line: everything before it is the result.
            break
        fields[1] = fields[1].split(",")
        tokens.append(fields)
    return tokens
def isIndep(node):
    """Return True when the token's first feature is noun, verb or adjective.

    ``node`` is one token in the ``[surface, [feature, ...]]`` form; only
    ``node[1][0]`` is inspected.
    """
    # Allow-list of noun / verb / adjective. An earlier variant instead
    # rejected auxiliary verbs, particles and symbols
    # ("助動詞", "助詞", "記号").
    firstFeature = node[1][0]
    return firstFeature in ("名詞", "動詞", "形容詞")
def filter(parse):
    """Return the surface forms of the tokens in ``parse`` accepted by ``isIndep``.

    ``parse`` is a sequence of ``[surface, [feature, ...]]`` tokens; the
    result keeps their original order.
    """
    return [token[0] for token in parse if isIndep(token)]
if __name__ == "__main__":
    # Debug helper: dump the parse of the first command-line argument, then
    # each token with its index and whether isIndep() accepts it.
    # The text is parsed once and the result reused for both outputs.
    tokens = splitMeCab(sys.argv[1])
    print(tokens)
    for i, v in enumerate(tokens):
        print(str(i) + ": " + str(v))
        print(isIndep(v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment