Created
March 30, 2016 10:29
-
-
Save YoshihitoAso/1e98effc961bf6fc3a14ce723d7c49f9 to your computer and use it in GitHub Desktop.
メール文章のポジネガ判定
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
# -*- coding:utf-8 -*- | |
import json | |
import csv | |
import MeCab as mc | |
################################################ | |
# メール文章のポジネガ判定 | |
# | |
# Copyright 2016 YoshihitoAso(@y_asoh) | |
################################################ | |
# Morphological analysis (MeCab)
def mecab_analysis(sentence):
    """Tokenize *sentence* with MeCab and collect selected feature values.

    The sentence is UTF-8 encoded and parsed node by node with a tagger
    configured for the ipadic-utf8 dictionary.  For every node whose
    surface is non-empty and whose first feature field is adjective, verb,
    noun or adverb, the seventh feature field is collected unless it is
    the "*" placeholder.

    NOTE(review): the seventh field is presumably the base (dictionary)
    form in the ipadic feature layout -- confirm against the dictionary.

    Returns:
        list of the collected feature values, in node order.
    """
    tagger = mc.Tagger('-Ochasen -d /var/lib/mecab/dic/ipadic-utf8/')
    encoded = sentence.encode('utf-8')
    node = tagger.parseToNode(encoded)
    wanted_types = ["形容詞", "動詞", "名詞", "副詞"]
    collected = []
    while node:
        if node.surface != "":
            fields = node.feature.split(",")
            # The second test only runs for wanted word types, so fields[6]
            # is never touched for other nodes.
            if fields[0] in wanted_types and fields[6] != "*":
                collected.append(fields[6])
        node = node.next
    return collected
# Load the noun sentiment-polarity dictionary.
# Maps the label found in the second column of the dictionary to a score:
# 'p' (positive) -> 1.0, 'e' (even) -> 0.5, 'n' (negative) -> 0.0.
_POLARITY_SCORES = {'p': 1.0, 'e': 0.5, 'n': 0.0}


def _load_polarity_dict(path):
    """Read a tab-separated polarity dictionary into (word, score) tuples.

    Each usable row holds the word in its first column and one of the
    labels 'p', 'e' or 'n' in its second column.  Rows with fewer than two
    columns or with any other label are skipped, so a malformed row can
    never inherit the score of the row before it.

    Args:
        path: location of the dictionary file.

    Returns:
        list of (word, score) tuples in file order.

    Raises:
        IOError/OSError: if the file cannot be opened.
    """
    entries = []
    # The context manager guarantees the file handle is closed.
    with open(path) as dict_file:
        for row in csv.reader(dict_file, delimiter='\t'):
            if len(row) < 2:
                continue
            score = _POLARITY_SCORES.get(row[1])
            if score is None:
                continue
            entries.append((row[0], score))
    return entries


pne = _load_polarity_dict(r'wago.txt')
# Judge the positive/negative polarity of a token
def judge_pn(token):
    """Return the polarity rate of *token* according to the ``pne`` list.

    The rate is the mean score of every ``pne`` entry whose word equals
    *token*.  When no entry matches, the neutral value 0.5 is returned.
    """
    matched_scores = [entry[1] for entry in pne if entry[0] == token]
    if not matched_scores:
        return 0.5
    return float(sum(matched_scores)) / float(len(matched_scores))
def pn_rates_and_sents(sents):
    """Compute the polarity rate of every token in a list of token lists.

    Args:
        sents: iterable of token lists; each token is passed to
            ``judge_pn``.

    Returns:
        A tuple ``(pn_rates, pn_rates_with_sents)`` where ``pn_rates`` is
        the flat list of rates in iteration order and
        ``pn_rates_with_sents`` is the matching list of ``(token, rate)``
        pairs.
    """
    pn_rates = []
    pn_rates_with_sents = []
    for sent_list in sents:
        for token in sent_list:
            pn_rate = judge_pn(token)
            pn_rates.append(pn_rate)
            # Store the whole token: indexing it with [0] would keep only
            # its first character (or first byte of a UTF-8 byte string).
            pn_rates_with_sents.append((token, pn_rate))
    return pn_rates, pn_rates_with_sents
# Compute and print the P (positive) / E (even) / N (negative) scores
def print_scores(pn_rates):
    """Print score totals, counts and ratios for the three polarity classes.

    A rate above 0.5 is counted as positive, exactly 0.5 as even and below
    0.5 as negative.

    Args:
        pn_rates: iterable of numeric polarity rates.

    Returns:
        None.  The report is written to standard output.
    """
    p, e, n = 0.0, 0.0, 0.0
    p_num, e_num, n_num = 0.0, 0.0, 0.0
    for pn in pn_rates:
        if pn > 0.5:
            p += pn
            p_num += 1
        elif pn == 0.5:
            e += pn
            e_num += 1
        elif pn < 0.5:
            n += pn
            n_num += 1

    # Named ``total`` so the builtin ``sum`` is not shadowed.
    total = p_num + e_num + n_num
    if total:
        p_ratio, e_ratio, n_ratio = p_num / total, e_num / total, n_num / total
    else:
        # No rates at all: report zero ratios instead of dividing by zero.
        p_ratio, e_ratio, n_ratio = 0.0, 0.0, 0.0

    print("")
    print("#### Positive Score, Even Score, Negative Score ####")
    print(p, e, n)
    print("")
    print("#### Postive Num, Even Num, Negative Num ####")
    print(p_num, e_num, n_num)
    print("")
    print("#### Positive %, Even %, Negative % ####")
    print(p_ratio, e_ratio, n_ratio)
    print("")
# Main
if __name__ == "__main__":
    # Load the mail data (JSON format).  Every entry is read through its
    # 'body' key below.
    with open('mail.json', 'r') as f:
        mail_list = json.load(f)

    # Analyse each mail body separately: one token list per mail.
    sents = [mecab_analysis(mail['body']) for mail in mail_list]

    pn_rates, pn_rates_with_sents = pn_rates_and_sents(sents)
    print_scores(pn_rates)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment