Last active
December 12, 2018 10:50
-
-
Save pizzacat83/e5d251b7a9b308ef3412597d13469b77 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import MeCab
import math

# Build word-frequency tables from a line-per-entry corpus:
#   corpus/wiki_wakachi.txt  - space-separated base forms of every token
#   frequency.txt            - "<word> <-log10(occurrences per line)>"
#   ad.txt                   - "<word> <-log10(raw count)>" for adverb/adjective-like words

# Top-level part-of-speech tags (feature[0]) left out of the general table:
# prefix, particle, auxiliary verb, symbol, other, unknown word.
SKIPPED_POS = ("接頭詞", "助詞", "助動詞", "記号", "その他", "未知語")
# Tags that put a word into the `ad` table: adverb / adjective as the
# top-level tag, or "adjectival-noun stem" as the sub-tag (feature[1]).
AD_POS = ("副詞", "形容詞")
AD_SUBPOS = ("形容動詞語幹",)

tagger = MeCab.Tagger("")

dic = {}    # base form -> occurrence count (converted to per-line frequency below)
ad = {}     # base form -> occurrence count for adverb/adjective-like words
lcount = 0  # number of non-blank corpus lines processed

with open("corpus/wiki.txt", "r", encoding="utf-8") as src, \
        open("corpus/wiki_wakachi.txt", "w", encoding="utf-8") as wakachi:
    for line in src:
        # Lines read from a file keep their trailing newline, so compare the
        # stripped text; otherwise blank lines would be counted in lcount.
        if not line.strip():
            continue
        lcount += 1

        node = tagger.parseToNode(line)
        while node:
            feature = node.feature.split(",")
            # Assumes an IPADIC-style feature layout where index 6 holds the
            # base form -- TODO confirm against the installed dictionary.
            base = feature[6]
            if base != "*":
                wakachi.write(base + " ")
                if feature[0] not in SKIPPED_POS:
                    dic[base] = dic.get(base, 0) + 1
                if feature[0] in AD_POS or feature[1] in AD_SUBPOS:
                    ad[base] = ad.get(base, 0) + 1
            node = node.next

# Turn raw counts into average occurrences per corpus line.
# (dic is empty whenever lcount is 0, so no division by zero can occur.)
for word in dic:
    dic[word] /= lcount

with open("frequency.txt", "w", encoding="utf-8") as out:
    for word, frq in dic.items():
        print(word, -math.log10(frq), file=out)

# NOTE(review): unlike `dic`, the `ad` counts are never divided by lcount, so
# the values written here are -log10(raw count). Kept as-is to preserve the
# existing output; confirm whether normalisation was intended.
with open("ad.txt", "w", encoding="utf-8") as out:
    for word, frq in ad.items():
        print(word, -math.log10(frq), file=out)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment