Skip to content

Instantly share code, notes, and snippets.

@yoshi0309
Last active August 29, 2015 14:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yoshi0309/1dbcd8ddea554f9b35af to your computer and use it in GitHub Desktop.
Save yoshi0309/1dbcd8ddea554f9b35af to your computer and use it in GitHub Desktop.
Connpass Event Classifier using Pocket - Jubatus Hackathon - Team Paper.
#!/usr/bin/env python
# coding: utf-8

# jubatus server info.
host = 'localhost'  # jubatus classifier server host
port = 9199         # jubatus RPC port

import sys
import json
import random
import jubatus
from jubatus.common import Datum
# for connpass json data
def getTitleList(filepath):
    """Return the list of event titles from a connpass JSON dump.

    filepath -- path to a JSON file containing an "events" array whose
                elements each carry a "title" key.
    """
    # 'with' guarantees the handle is closed; the original called
    # `f.close` without parentheses, which never actually closed it.
    with open(filepath) as f:
        data = json.load(f)
    return [event["title"] for event in data["events"]]
# for hatebu feed data
def getTitleFromFeed(path):
    """Parse an RSS/Atom feed at *path* and return every entry title."""
    import feedparser  # local import: only needed by this helper
    return [entry['title'] for entry in feedparser.parse(path)['entries']]
def getTitleFromTxt(path):
    """Return the lines of a plain-text file as a list, one title per line.

    Each entry keeps its trailing newline, preserving the original
    behaviour seen by downstream training code.
    """
    # 'with' guarantees closure; the original's bare `f.close` (missing
    # parentheses) was a no-op, leaking the file handle.
    with open(path) as f:
        return list(f)
def buildTrainData(titleList, classLabel):
    """Pair every title with *classLabel* as (label, Datum) tuples for jubatus."""
    return [(classLabel, Datum({'title': title})) for title in titleList]
def train(client, traindata):
    """Shuffle *traindata* in place and feed it to the jubatus classifier.

    client    -- a jubatus Classifier client
    traindata -- list of (label, Datum) tuples; mutated by the shuffle
    """
    # Shuffle so the online learner does not see all of one label in a row.
    random.shuffle(traindata)
    client.train(traindata)
def traindata_ty():
    """Build the training set for user "ty".

    Hatebu bookmark feeds are the positive class ('興味あり'); hatena
    hotentry feeds are the negative class ('興味なし').
    """
    interesting = []
    for feed in ('data/hatebu-1.xml', 'data/hatebu-2.xml',
                 'data/hatebu-3.xml', 'data/hatebu-4.xml',
                 'data/hatebu-5.xml', 'data/hatebu-6.xml'):
        interesting.extend(getTitleFromFeed(feed))
    data = buildTrainData(interesting, '興味あり')

    # NOTE: data/hot-economics.xml was deliberately left out by the author.
    boring = []
    for feed in ('data/hot-entertainment.xml', 'data/hot-general.xml',
                 'data/hot-lif.xml', 'data/hot-social.xml'):
        boring.extend(getTitleFromFeed(feed))
    data.extend(buildTrainData(boring, '興味なし'))
    return data
def traindata_mo():
    """Build the training set for user "mo".

    Titles saved in data/pocket.txt are the positive class ('興味あり');
    hatena hotentry feeds are the negative class ('興味なし').
    """
    data = buildTrainData(getTitleFromTxt('data/pocket.txt'), '興味あり')

    # NOTE: data/hot-economics.xml was deliberately left out by the author.
    boring = []
    for feed in ('data/hot-entertainment.xml', 'data/hot-general.xml',
                 'data/hot-lif.xml', 'data/hot-social.xml'):
        boring.extend(getTitleFromFeed(feed))
    data.extend(buildTrainData(boring, '興味なし'))
    return data
def traindata(userid):
    """Build per-user training data.

    Positive examples (label '1') are the user's crawled Pocket titles
    from Postgres; negative examples (label '0') are hatena hotentry
    feeds.

    userid -- id of the user in the Crawled_Pockets table
    """
    import psycopg2
    conn = psycopg2.connect("dbname=paper_development host=localhost user=postgres password=password")
    try:
        cur = conn.cursor()
        # Parameterized query: the original concatenated userid straight
        # into the SQL text, which is an injection risk.
        cur.execute("select given_title from Crawled_Pockets where user_id=%s", [userid])
        titleList = [record[0] for record in cur.fetchall()]
        cur.close()
    finally:
        # The original never closed the connection at all.
        conn.close()

    data = buildTrainData(titleList, '1')  # '1' = interesting (興味あり)

    # NOTE: data/hot-economics.xml was deliberately left out by the author.
    negatives = []
    for feed in ('data/hot-entertainment.xml', 'data/hot-general.xml',
                 'data/hot-lif.xml', 'data/hot-social.xml'):
        negatives.extend(getTitleFromFeed(feed))
    data.extend(buildTrainData(negatives, '0'))  # '0' = not interesting (興味なし)
    return data
def predict(client):
#data = [
# Datum({'title': u'ダイエー'}),
# Datum({'title': u'機械学習'}),
# Datum({'title': u'オムニ・チャネル'}),
# Datum({'title': u'AKB48'}), #OMG!
# Datum({'title': u'消費税増税'}),
# Datum({'title': u'Apache'}),
# Datum({'title': u'プロジェクト管理'}),
# Datum({'title': u'Webデザイン'}),
# Datum({'title': u'Ocaml'}),
# Datum({'title': u'Java'}),
# Datum({'title': u'Solr'}),
# Datum({'title': u'Elasticsearch'})
# ]
eventIdList = []
import psycopg2
conn = psycopg2.connect("dbname=paper_development host=localhost user=postgres password=password")
cur = conn.cursor()
cur.execute("select * from connpasses")
records = cur.fetchall()
for record in records:
# print record[1]
# print record[2]
d = Datum({'title': record[2]})
# print record[2]
# print d
res = client.classify([d])
# sys.stdout.write(max(res[0], key=lambda x: x.score).label)
# sys.stdout.write(' ')
# sys.stdout.write(d.string_values[0][1].encode('utf-8'))
# sys.stdout.write('\n')
result = max(res[0], key=lambda x: x.score)
# print res[0]
# print d.string_values[0][1]
# max かつ score が 0.3 よりも大きい物
if result.label == '1' and result.score >= 0.3:
eventIdList.append(record[1])
print record[1], record[2], result.score # for debug
conn.close()
cur.close()
return eventIdList
def getUserIds():
    """Return [(user_id, user_name), ...] for every row of the Users table."""
    import psycopg2
    conn = psycopg2.connect("dbname=paper_development host=localhost user=postgres password=password")
    cur = conn.cursor()
    cur.execute("select * from Users")
    # Columns 0 and 1 are assumed to be id and name -- verify against schema.
    userIds = [(rec[0], rec[1]) for rec in cur.fetchall()]
    cur.close()
    conn.close()
    return userIds
def saveDataToTable(userid, eventIdList):
    """Replace the stored recommendations for *userid* with *eventIdList*.

    Deletes the user's old rows from Recommends, then inserts one row per
    recommended event id.
    """
    import psycopg2
    conn = psycopg2.connect("dbname=paper_development host=localhost user=postgres password=password")
    try:
        cur = conn.cursor()
        # Parameterized delete: the original concatenated userid into the
        # SQL text, which is an injection risk.
        cur.execute("delete from recommends where user_id=%s", [userid])
        conn.commit()
        for eventId in eventIdList:
            cur.execute("INSERT INTO Recommends(user_id,event_id) VALUES (%s,%s)", [userid, eventId])
        conn.commit()
        cur.close()
    finally:
        # Ensure the connection is released even if an insert fails.
        conn.close()
if __name__ == '__main__':
# connect to the jubatus
# client_ty = jubatus.Classifier(host, port, 'yoshida')
# client_ty.clear()
# train(client_ty, traindata_ty())
# print 'predict for ty .............................'
# predict(client_ty)
# client_mo = jubatus.Classifier(host, port, 'morimoto')
# client_mo.clear()
# train(client_mo,traindata_mo())
# print 'predict for mo .............................'
# predict(client_mo)
for user in getUserIds():
client = jubatus.Classifier(host, port, str(user[0]))
client.clear()
train(client, traindata(user[0]))
print 'predict for ' + str(user[0]) + ' ' + str(user[1]) + ' .............................'
eventIdList = predict(client)
saveDataToTable(user[0],eventIdList)
{
"method": "AROW",
"converter": {
"num_filter_types": {},
"num_filter_rules": [],
"string_filter_types": {},
"string_filter_rules": [],
"num_types": {},
"num_rules": [],
"string_types": {
"bigram": { "method": "ngram", "char_num": "3" },
"mecab": {
"method": "dynamic",
"path": "libmecab_splitter.so",
"function": "create",
"arg": "-d /var/lib/mecab/dic/ipadic"
}
},
"string_rules": [
{ "key": "*", "type": "bigram", "sample_weight": "bin", "global_weight": "bin" }
]
},
"parameter": {
"regularization_weight" : 1.0
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment