@EnsekiTT
Last active October 12, 2017 17:38
http://ensekitt.hatenablog.com/entry/tree2 was done with MeCab, so this is the code for trying the same thing with JUMAN++.
# coding: utf-8
# In[ ]:
# JUMAN++ version
# Collect tweets
# https://github.com/sixohsix/twitter
# API tokens etc. are kept in environment variables
import os
from twitter import *
import time
# In[ ]:
# Morphological analysis with PyKNP (Juman++)
from janome.charfilter import *
from janome.tokenfilter import *
from pyknp import Jumanpp
# Symbols, particles and auxiliary verbs are dropped
# Verbs are converted to their base form
# In[ ]:
# Scikit-learn decision tree and visualization
from sklearn import tree
from sklearn.externals.six import StringIO
import pydotplus
from IPython.display import Image
import numpy as np
# In[37]:
# Filters equivalent to the ones that were usable with Janome, redefined
# separately so they also work with Jumanpp. They inherit Janome's TokenFilter,
# so the Analyzer below is almost the same as Janome's as well.
# (A small usage sketch follows the AnalyzerForJumanpp class.)
class CompoundNounFilterForJumanpp(TokenFilter):
    def apply(self, tokens):
        # Merge runs of consecutive nouns into a single compound-noun token.
        _ret = None
        for token in tokens:
            if _ret:
                if token.hinsi == '名詞' and _ret.hinsi == '名詞':
                    _ret.midasi += token.midasi
                    _ret.bunrui = '複合名詞'
                    _ret.genkei += token.genkei
                    _ret.yomi += token.yomi
                else:
                    ret = _ret
                    _ret = token
                    yield ret
            else:
                _ret = token
        if _ret:
            yield _ret
class POSStopFilterForJumanpp(TokenFilter):
    """
    Drop tokens whose part of speech is in pos_list.
    Note that Juman++ and IPA use different POS tag sets:
    http://www.unixuser.org/~euske/doc/postag/
    """
    def __init__(self, pos_list):
        self.pos_list = pos_list
    def apply(self, tokens):
        for token in tokens:
            if any(token.hinsi == pos for pos in self.pos_list):
                continue
            yield token
class AnalyzerForJumanpp(object):
    def __init__(self, char_filters=[], tokenizer=None, token_filters=[]):
        self.tokenizer = tokenizer
        self.char_filters = char_filters
        self.token_filters = token_filters
    def analyze(self, text):
        for cfilter in self.char_filters:
            text = cfilter.filter(text)
        if text == '':
            # Avoid passing an empty string to Juman++ by substituting a single space.
            text = ' '
        tokens = self.tokenizer.analysis(text)
        for tfilter in self.token_filters:
            tokens = tfilter.filter(tokens)
        return tokens
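# In[ ]:
# A minimal sketch of how the classes above chain together: Jumanpp as the
# tokenizer, with compound-noun merging and POS-based stop filtering layered
# on top, mirroring Janome's Analyzer. The attribute names (midasi, hinsi, ...)
# are the ones exposed by pyknp's morphemes.
#sample_analyzer = AnalyzerForJumanpp(
#    char_filters=[UnicodeNormalizeCharFilter()],
#    tokenizer=Jumanpp(),
#    token_filters=[CompoundNounFilterForJumanpp(),
#                   POSStopFilterForJumanpp(['助詞', '助動詞', '特殊'])])
#for token in sample_analyzer.analyze('自然言語処理の勉強をしている'):
#    print(token.midasi, token.hinsi)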
# In[5]:
# Twitter credentials
TOKEN = os.environ["DTA_TWITTER_TOKEN"]
TOKEN_SECRET = os.environ["DTA_TWITTER_TOKEN_SECRET"]
CONSUMER_KEY = os.environ["DTA_TWITTER_CONSUMER_KEY"]
CONSUMER_SECRET = os.environ["DTA_TWITTER_CONSUMER_SECRET"]
t = Twitter(
    auth=OAuth(TOKEN, TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET))
# In[6]:
# Settings
NUM_TWEET = 200
NUM_USER = 50
# In[29]:
def get_word_count(analyzer, tweets, word_list):
    """
    Count word occurrences in the given tweets.
    New words are appended to word_list in place, and the returned count list
    stays aligned with word_list.
    """
    word_count = [0]*len(word_list)
    for text in tweets:
        try:
            for token in analyzer.analyze(text):
                #print(u"見出し:%s\n 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s" \
                #    % (token.midasi, token.yomi, token.genkei, token.hinsi, token.bunrui, token.katuyou1, token.katuyou2, token.imis, token.repname))
                # Use the base form for verbs and adjectives, the surface form otherwise.
                if '動詞' in token.hinsi:
                    word = token.genkei
                elif '形容詞' in token.hinsi:
                    word = token.genkei
                else:
                    word = token.midasi
                if word in word_list:
                    word_index = word_list.index(word)
                    word_count[word_index] += 1
                else:
                    word_list.append(word)
                    word_count.append(1)
        except ValueError as e:
            print(e)
            print(text)
            continue
    return word_count
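# In[ ]:
# A minimal sketch of get_word_count's contract: word_list grows in place and
# the returned counts stay aligned with it, so repeated calls (one per user
# below) keep extending the same shared vocabulary. Assumes an `analyzer`
# built like the one in the per-user loop further down.
#sample_word_list = []
#sample_counts = get_word_count(analyzer, ['今日はいい天気', '明日も天気がいい'], sample_word_list)
#print(sample_word_list)  # vocabulary discovered so far
#print(sample_counts)     # counts aligned with sample_word_list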
# In[8]:
# Fetch the user lists
engineer = [user['screen_name'] for user in t.lists.members(owner_screen_name="EnsekiTT", slug="engineer", count=NUM_USER)['users']]
not_engineer = [user['screen_name'] for user in t.lists.members(owner_screen_name="EnsekiTT", slug="notengineer", count=NUM_USER)['users']]
users = list(engineer)
users.extend(not_engineer)
print(users)
# In[9]:
word_list = []
user_vectors = {}
user_vectors_raw = {}
last_id = 0
for user in users:
    tweets = t.statuses.user_timeline(screen_name=user, count=200, include_rts=False, exclude_replies=True)
    #print("user :" + user)
    while len(tweets) < NUM_TWEET:
        max_id = tweets[-1]['id']
        if max_id == last_id:
            # Pagination stalled: this user has no older tweets to fetch.
            print("now :" + str(len(tweets)) + ', ' + str(tweets[-1]['id']) + ', ' + tweets[-1]['text'])
            print("Break!!!" + user)
            break
        last_id = max_id
        # max_id is inclusive, so request only tweets older than the oldest one seen so far.
        tweets.extend(t.statuses.user_timeline(screen_name=user, count=200, include_rts=False, exclude_replies=True, max_id=max_id - 1))
        time.sleep(1)  # statuses/user_timeline allows 900 calls per 15 minutes, so wait a bit longer than strictly needed.
    user_vectors_raw[user] = [tweet['text'] for tweet in tweets[:NUM_TWEET]]
# In[10]:
from datetime import datetime
import json
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
path = ts + '_tweets.json'
with open(path, 'w') as f:
    json.dump(user_vectors_raw, f)
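# In[ ]:
# A minimal sketch for reloading the dumped tweets later, so the analysis can
# be rerun without hitting the Twitter API again.
#with open(path) as f:
#    user_vectors_raw = json.load(f)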
# In[11]:
len(user_vectors_raw[user])
# In[40]:
for i, user in enumerate(users):
    # Janome char filter settings: normalize Unicode, strip symbols/emoticons and URLs
    char_filters = [UnicodeNormalizeCharFilter()
                    , RegexReplaceCharFilter(u'[()()*/:゚∀.&;|%д@_○!,?・#@٩( )و]', u'')
                    , RegexReplaceCharFilter(u"http[:\/A-Za-z0-9\n]*", u"")]
    #token_filters = [CompoundNounFilter(),
    #                 POSStopFilter(['動詞','記号', '助詞', '助動詞','接頭詞','数','フィラー']),
    #                 LowerCaseFilter()]
    tokenizer = Jumanpp()
    token_filters = [CompoundNounFilterForJumanpp(),
                     POSStopFilterForJumanpp(['動詞','記号', '助詞', '助動詞','接頭詞','接尾詞','特殊'])]
    analyzer = AnalyzerForJumanpp(char_filters, tokenizer, token_filters)
    print(str(i) + ': ' + user)
    user_vectors[user] = get_word_count(analyzer, user_vectors_raw[user], word_list)
# In[41]:
# Pad every user's count vector to the full vocabulary length.
max_len = max([len(user_vectors[key]) for key in user_vectors.keys()])
for key in user_vectors.keys():
    user_len = len(user_vectors[key])
    user_vectors[key].extend([0]*(max_len - user_len))
user_list = []
vectors = []
labels = []
print(engineer)
print(len(engineer))
print(not_engineer)
print(len(not_engineer))
# 0 for not-engineer, 1 for engineer.
for key in user_vectors.keys():
    user_list.append(key)
    if key in engineer:
        labels.append(1)
    elif key in not_engineer:
        labels.append(0)
    vectors.append(user_vectors[key])
print(labels)
print(len(vectors))
# In[42]:
clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=3, min_samples_leaf=2)
clf = clf.fit(vectors, labels)
predicted = clf.predict(vectors)
print(predicted)
print(sum(predicted == labels) / len(labels))
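# In[ ]:
# The accuracy above is measured on the same data the tree was trained on.
# A minimal cross-validation sketch (using scikit-learn's model_selection)
# would give a less optimistic estimate:
#from sklearn.model_selection import cross_val_score
#scores = cross_val_score(tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=2),
#                         vectors, labels, cv=5)
#print(scores.mean(), scores.std())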
# In[43]:
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=word_list,
                     class_names=['not engineer', 'engineer'],
                     filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
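# In[ ]:
# A minimal sketch for saving the rendered tree to a PNG file next to the
# tweets dump, in addition to displaying it inline.
#graph.write_png(ts + '_tree.png')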