@EnsekiTT
Created October 8, 2017 03:51
Code for trying out http://ensekitt.hatenablog.com/entry/2017/10/06/233619
# Collect tweets
# https://github.com/sixohsix/twitter
# API tokens etc. are stored in environment variables
import os
from twitter import *
import time
# Morphological analysis with Janome
# https://github.com/mocobeta/janome
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *
# Drop symbols, particles, and auxiliary verbs
# Try converting verbs to their base form
# Decision tree and visualization with scikit-learn
from sklearn import tree
from io import StringIO
import pydotplus
from IPython.display import Image
import numpy as np
# Twitter API settings
TOKEN = os.environ["DTA_TWITTER_TOKEN"]
TOKEN_SECRET = os.environ["DTA_TWITTER_TOKEN_SECRET"]
CONSUMER_KEY = os.environ["DTA_TWITTER_CONSUMER_KEY"]
CONSUMER_SECRET = os.environ["DTA_TWITTER_CONSUMER_SECRET"]
t = Twitter(
    auth=OAuth(TOKEN, TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET))
# Settings
NUM_TWEET = 200
NUM_USER = 50
def get_word_count(analyzer, tweets, word_list):
    """
    Count word occurrences in a list of tweets.
    Verbs and adjectives are counted by their base form, everything else by its surface form.
    Words not yet in word_list are appended to it in place.
    """
    word_count = [0] * len(word_list)
    for text in tweets:
        for token in analyzer.analyze(text):
            if '動詞' in token.part_of_speech.split(','):
                word = token.base_form
            elif '形容詞' in token.part_of_speech.split(','):
                word = token.base_form
            else:
                word = token.surface
            if word in word_list:
                word_index = word_list.index(word)
                word_count[word_index] += 1
            else:
                word_list.append(word)
                word_count.append(1)
    return word_count
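# Note: get_word_count mutates the shared word_list, so vectors computed for earlier
# users are shorter than those for later users; they are zero-padded further below.
# Minimal illustrative sketch (the sample tweets are made up, and analyzer is only
# configured further down in this script):
#
#   demo_words = []
#   v1 = get_word_count(analyzer, ['Pythonでコードを書く'], demo_words)
#   v2 = get_word_count(analyzer, ['休日はカフェに行く'], demo_words)
#   # len(v1) <= len(v2) == len(demo_words)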
# Fetch the user lists
engineer = [user['screen_name'] for user in t.lists.members(owner_screen_name="EnsekiTT", slug="engineer", count=NUM_USER)['users']]
not_engineer = [user['screen_name'] for user in t.lists.members(owner_screen_name="EnsekiTT", slug="notengineer", count=NUM_USER)['users']]
users = list(engineer)
users.extend(not_engineer)
print(users)
word_list = []
user_vectors = {}
user_vectors_raw = {}
last_id = 0
for user in users:
    tweets = t.statuses.user_timeline(screen_name=user, count=200, include_rts=False, exclude_replies=True)
    while len(tweets) < NUM_TWEET:
        max_id = tweets[-1]['id']
        if max_id == last_id:
            # No older tweets are coming back; give up on this user.
            print("now :" + str(len(tweets)) + ', ' + str(tweets[-1]['id']) + ', ' + tweets[-1]['text'])
            print("Break!!!" + user)
            break
        last_id = max_id
        # max_id is inclusive, so request tweets strictly older than the last one we already have.
        tweets.extend(t.statuses.user_timeline(screen_name=user, count=200, include_rts=False, exclude_replies=True, max_id=max_id - 1))
        time.sleep(1)  # statuses/user_timeline is limited to 900 calls per 15 minutes, so wait a little to be safe.
    user_vectors_raw[user] = [tweet['text'] for tweet in tweets[:NUM_TWEET]]
from datetime import datetime
import json
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
path = ts + '_tweets.json'
with open(path, 'w') as f:
    json.dump(user_vectors_raw, f)
len(user_vectors_raw[user])
# Janome settings
char_filters = [UnicodeNormalizeCharFilter(),
                RegexReplaceCharFilter(u'[ー()()*/\n:゚∀.&;|%д@_○!,?・]', u''),
                RegexReplaceCharFilter(u"http[:\/A-Za-z0-9\n]*", u"")]
tokenizer = Tokenizer()
token_filters = [CompoundNounFilter(), POSStopFilter(['動詞','記号', '助詞', '助動詞','接頭詞','数','フィラー']), LowerCaseFilter()]
#token_filters = [POSKeepFilter('名詞'), LowerCaseFilter()]
analyzer = Analyzer(char_filters, tokenizer, token_filters)
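# Optional sanity check, illustrative only (the sample sentence below is made up):
# print the tokens that survive the char filters and POS stop filter above.
#
#   for token in analyzer.analyze('週末はPythonでツイートを集めて遊んでいた'):
#       print(token.surface, token.part_of_speech)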
for user in users:
    user_vectors[user] = get_word_count(analyzer, user_vectors_raw[user], word_list)
# word_list grew while the users were being processed, so pad every count vector
# with zeros up to the final vocabulary size.
max_len = max([len(user_vectors[key]) for key in user_vectors.keys()])
for key in user_vectors.keys():
    user_len = len(user_vectors[key])
    user_vectors[key].extend([0]*(max_len - user_len))
user_list=[]
vectors = []
labels = []
print(engineer)
print(len(engineer))
print(not_engineer)
print(len(not_engineer))
# Label non-engineers as 0 and engineers as 1.
for key in user_vectors.keys():
    user_list.append(key)
    if key in engineer:
        labels.append(1)
    elif key in not_engineer:
        labels.append(0)
    vectors.append(user_vectors[key])
print(labels)
print(len(vectors))
clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=3, min_samples_leaf=2)
clf = clf.fit(vectors, labels)
predicted = clf.predict(vectors)
print(predicted)
print(sum(predicted == labels) / len(labels))  # accuracy on the training data itself
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=word_list,
                     class_names=['not engineer', 'engineer'],
                     filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
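# If you want the tree as a file rather than an inline notebook image, pydotplus can
# also write it out directly (the filename here is just an assumed example):
#
#   graph.write_png(ts + '_tree.png')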
# Baseline check: non-engineer is 0 and engineer is 1, so predict 1 whenever a user
# has tweeted this word at least once (its count is greater than 0).
oppai = 'おっぱい'
if oppai in word_list:
    oppai_index = word_list.index(oppai)
    oppai_predicted = np.array([int(vector[oppai_index] > 0) for vector in vectors])
    print(sum(oppai_predicted))
    print(sum(oppai_predicted == labels) / len(labels))
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, vectors, labels, cv=10)
print(scores.mean(), scores)