Skip to content

Instantly share code, notes, and snippets.

@pizzacat83
Last active December 12, 2018 10:50
Show Gist options
  • Save pizzacat83/e5d251b7a9b308ef3412597d13469b77 to your computer and use it in GitHub Desktop.
Save pizzacat83/e5d251b7a9b308ef3412597d13469b77 to your computer and use it in GitHub Desktop.
import math
from collections import Counter

import MeCab
# Top-level part-of-speech tags (feature[0]) that are left out of the general
# frequency table: prefix, particle, auxiliary verb, symbol, other, unknown word.
EXCLUDED_POS = frozenset({"接頭詞", "助詞", "助動詞", "記号", "その他", "未知語"})
# Top-level tags (adverb, adjective) that are counted in the modifier table.
MODIFIER_POS = frozenset({"副詞", "形容詞"})
# Sub-category (feature[1]) that is also counted in the modifier table
# (adjectival-noun stem).
MODIFIER_SUBPOS = "形容動詞語幹"


def count_base_forms(tagger, lines, wakachi_out):
    """Tokenise *lines* and count the base form of every token.

    Assumes an IPADIC-style feature string in which field 0 is the part of
    speech, field 1 its sub-category and field 6 the base form -- TODO confirm
    against the dictionary actually installed.

    Args:
        tagger: Object exposing ``parseToNode(text)`` that returns the head of
            a linked list of nodes with ``feature`` and ``next`` attributes.
        lines: Iterable of text lines (line terminators may be present).
        wakachi_out: Writable text stream. Every base form is written to it
            followed by a single space; no line breaks are emitted, so the
            whole corpus ends up on one line.

    Returns:
        A ``(word_counts, modifier_counts, line_count)`` tuple: a ``Counter``
        of base forms whose part of speech is not in ``EXCLUDED_POS``, a
        ``Counter`` of adverb / adjective / adjectival-noun-stem base forms,
        and the number of non-empty lines processed.
    """
    word_counts = Counter()
    modifier_counts = Counter()
    line_count = 0
    for line in lines:
        # Lines read from a file keep their terminator, so an "empty" line is
        # "\n" rather than ""; strip the terminator before testing.
        if not line.rstrip("\r\n"):
            continue
        line_count += 1
        node = tagger.parseToNode(line)
        while node:
            feature = node.feature.split(",")
            node = node.next
            base = feature[6]
            if base == "*":
                # No base form available (e.g. sentence boundary nodes).
                continue
            wakachi_out.write(base + " ")
            if feature[0] not in EXCLUDED_POS:
                word_counts[base] += 1
            if feature[0] in MODIFIER_POS or feature[1] == MODIFIER_SUBPOS:
                modifier_counts[base] += 1
    return word_counts, modifier_counts, line_count


def write_scores(path, scores):
    """Write ``word -log10(value)`` lines for every item of *scores* to *path*.

    Args:
        path: Destination file; it is overwritten and encoded as UTF-8.
        scores: Mapping from word to a positive number.
    """
    with open(path, "w", encoding="utf-8") as out:
        for word, value in scores.items():
            print(word, -math.log10(value), file=out)


def main():
    """Build the segmented corpus and the two score tables from the wiki dump.

    Reads ``corpus/wiki.txt`` and writes ``corpus/wiki_wakachi.txt``,
    ``frequency.txt`` and ``ad.txt``.
    """
    tagger = MeCab.Tagger("")
    with open("corpus/wiki.txt", "r", encoding="utf-8") as corpus, \
            open("corpus/wiki_wakachi.txt", "w", encoding="utf-8") as wakachi:
        word_counts, modifier_counts, line_count = count_base_forms(
            tagger, corpus, wakachi
        )

    # Occurrences per processed line. line_count is at least 1 whenever
    # word_counts is non-empty, so the division cannot fail.
    frequencies = {
        word: count / line_count for word, count in word_counts.items()
    }
    write_scores("frequency.txt", frequencies)

    # NOTE(review): the modifier table is scored from raw counts, not from
    # counts divided by line_count like the table above. Kept as in the
    # original; confirm whether the missing normalisation is intentional.
    write_scores("ad.txt", modifier_counts)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment