@rubyu
Created May 2, 2016 01:19
TWCNB
print "each classes -> count[^c] * log((w[^c] + s) / (cw[^c] + s_all))"
cw = {}
cw_all = 0
for c in classes:
cw[c] = 0
denominator = 0
for ec in classes:
if ec != c:
denominator += classWeights[ec]
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
count = 0
for ec in classes:
if ec != c:
count += getClassWordCount(ec, word)
numerator = 0
for ec in classes:
if ec != c:
numerator += getClassWordWeight(ec, word)
cw[c] += float(count) * math.log( (numerator + s) / (denominator + s_all) )
cw_all += cw[c]
for c in classes:
cw[c] /= cw_all
print "\"%s\": %s," % (c, cw[c])
# The variable cw should be assgined to class_weights[smoothing_parameter]["log_completement"] here.
# I would guess do this each time by my hands... :P
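# A minimal sketch of automating that manual step, assuming the surrounding
# sweep provides the smoothing parameter as `s` and that `class_weights`
# (defined in the main script below) is in scope; hypothetical, not part of
# the original flow:
#
#     class_weights.setdefault(s, {})["log_complement"] = dict(cw)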
#!-*- coding:utf-8 -*-
import os
import sys
import math
import random
from datetime import datetime
import time
import urllib
import sqlite3
conn = sqlite3.connect("classifier_for_nripper.db")
conn.execute("PRAGMA synchronous=OFF")
conn.execute("PRAGMA cache_size=20000")
conn.execute("""
CREATE TABLE IF NOT EXISTS word_doc_count (
word TEXT PRIMARY KEY NOT NULL,
doc_count INTEGER NOT NULL
)
""")
conn.execute("""
CREATE TABLE IF NOT EXISTS global (
key TEXT PRIMARY KEY NOT NULL,
value INTEGER NOT NULL
)
""")
conn.execute("""
CREATE TABLE IF NOT EXISTS class_word_weight (
class TEXT NOT NULL,
word TEXT NOT NULL,
weight REAL NOT NULL,
UNIQUE(class, word)
)
""")
conn.execute("""
CREATE TABLE IF NOT EXISTS class_word_count (
class TEXT NOT NULL,
word TEXT NOT NULL,
count INTEGER NOT NULL,
UNIQUE(class, word)
)
""")
conn.commit()
import MeCab
m = MeCab.Tagger("-Owakati")
def segment(s):
    s = m.parse(s.encode("utf-8"))
    s = s.decode("utf-8")
    return s.rstrip(" \n").split(" ")
def getWords(doc):
    words = segment(doc)
    words = [s.strip().lower() for s in words]
    return words
def toDataArray(words):
    h = {}
    for w in words:
        if h.has_key(w):
            h[w] += 1
        else:
            h[w] = 1
    data = []
    for w in h:
        data.append((w, h[w]))
    return data
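# For example, toDataArray([u"a", u"b", u"a"]) yields [(u"a", 2), (u"b", 1)]
# (pair order depends on dict iteration order).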
def getWordDocCount(word):
    res = conn.execute("SELECT doc_count FROM word_doc_count WHERE word = ?", [word]).fetchone()
    if res == None:
        return 0
    else:
        return res[0]
def bulkIncrWordDocCount(data):
    for d in data:
        word, count = d
        #count += getWordDocCount(word)
        count = getWordDocCount(word) + 1
        conn.execute("INSERT OR REPLACE INTO word_doc_count VALUES(?, ?)", [word, count])
def getGlobalValue(key):
    res = conn.execute("SELECT value FROM global WHERE key = ?", [key]).fetchone()
    if res == None:
        return 0
    else:
        return res[0]
def incrGlobalValue(key):
    value = getGlobalValue(key)
    value += 1
    conn.execute("INSERT OR REPLACE INTO global VALUES(?, ?)", [key, value])
def getClassWordWeight(c, word):
    res = conn.execute("SELECT weight FROM class_word_weight WHERE class= ? AND word = ?", [c, word]).fetchone()
    if res == None:
        return 0
    else:
        return res[0]
def bulkAddClassWordWeight(c, weights):
    for d in weights:
        weight, word = d
        weight += getClassWordWeight(c, word)
        conn.execute("INSERT OR REPLACE INTO class_word_weight VALUES(?, ?, ?)", [c, word, weight])
def getClassWordCount(c, word):
    res = conn.execute("SELECT count FROM class_word_count WHERE class= ? AND word = ?", [c, word]).fetchone()
    if res == None:
        return 0
    else:
        return res[0]
def bulkAddClassWordCount(c, data):
    for d in data:
        word, count = d
        count += getClassWordCount(c, word)
        conn.execute("INSERT OR REPLACE INTO class_word_count VALUES(?, ?, ?)", [c, word, count])
"""
def getClassList():
res = conn.execute("SELECT DISTINCT class FROM class_word_weight").fetchone()
if res == None:
return 0
else:
return res[0]
"""
def getClassWeight(c):
res = conn.execute("SELECT SUM(weight) FROM class_word_weight WHERE class = ?", [c]).fetchone()
if res == None:
return 0
else:
return res[0]
def getTF(count):
    tf = math.log( count + 1 )
    return tf
def getIDF(docs, total):
    idf = math.log( float(total) / docs )
    return idf
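# getTF and getIDF implement the TF transform (log(count + 1)) and IDF
# transform (log(total / docs)) described in the docstring before the main
# block. Note that getIDF raises ZeroDivisionError when docs == 0, so callers
# are expected to pass only words seen in at least one document.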
def getSD(weights, target):
    if 0 == len(weights):
        return
    t = 0
    avg = 0
    for d in weights:
        weight, targetc = d
        avg += weight
        if targetc == target:
            t = weight
    avg /= len(weights)
    sd = 0
    for d in weights:
        weight, targetc = d
        sd += (weight - avg) ** 2
    sd /= len(weights)
    sd = math.sqrt(sd)
    if 0 == sd:
        return
    t_sd = (10 * (t - avg)) / sd + 50
    return (avg, sd, t_sd)
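# t_sd is the Japanese-style standard score (hensachi): 50 + 10 * (x - avg) / sd,
# so a value of 50 means the target class scored exactly at the mean of all classes.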
def getOrder(weights, target):
    order = 0
    for d in weights:
        order += 1
        weight, targetc = d
        if targetc == target:
            return order
def printProbs(weights, target):
    print "-" * 30
    order = 0
    for d in weights:
        order += 1
        weight, targetc = d
        if targetc == target:
            print "%2d %s: %s \t!target!" % (order, weight, targetc)
        else:
            print "%2d %s: %s" % (order, weight, targetc)
    print "-" * 30
def getDirs(path):
    list = []
    for i in os.listdir(path):
        if os.path.isdir(path + "\\" + i):
            list.append(i)
    return list
def getFiles(path):
    list = []
    for i in os.listdir(path):
        if os.path.isfile(path + "\\" + i):
            list.append(i)
    return list
if __name__ == "__main__":
corpus = "arcadia"
classes = getDirs(corpus)
use_for_test = 10
if False:
print "add document ..."
for c in classes:
print "class: %s" % c
files = getFiles(corpus + "\\" + c)
print "total: %s" % len(files)
fcount = 0
for f in files[:-10]:
fcount += 1
print "%s / %s" % (fcount, len(files))
doc = open(corpus + "\\" + c + "\\" + f).read()
doc = unicode(doc, "utf-8", errors="replace")
words = getWords(doc)
data = toDataArray(words)
print "%s unique words" % len(data)
bulkIncrWordDocCount(data)
incrGlobalValue("totalDocCount")
conn.commit()
if False:
print "compute weights ..."
for c in classes:
print "class: %s" % c
files = getFiles(corpus + "\\" + c)
print "total: %s" % len(files)
fcount = 0
for f in files[:-10]:
fcount += 1
print "%s / %s" % (fcount, len(files))
doc = open(corpus + "\\" + c + "\\" + f).read()
doc = unicode(doc, "utf-8", errors="replace")
words = getWords(doc)
data = toDataArray(words)
print "%s unique words" % len(data)
weights = []
weightTotal = 0
for d in data:
word, count = d
totalDocCount = getGlobalValue("totalDocCount")
docCount = getWordDocCount(word)
tf = getTF(count)
idf = getIDF(docCount, totalDocCount)
weight = tf * idf
weights.append((weight, word))
weightTotal += weight ** 2
weightTotal = math.sqrt(weightTotal)
nWeights = []
nWeightTotal = 0
for d in weights:
weight, word = d
weight /= weightTotal
nWeightTotal += weight
nWeights.append((weight, word))
if False:
nWeights.sort(reverse=True)
for d in nWeights[:50]:
weight, word = d
try:
print "%s: %s" % (word.encode("shift-jis"), weight)
except:
pass
bulkAddClassWordCount(c, data)
bulkAddClassWordWeight(c, nWeights)
conn.commit()
"""
Number of words per class
"""
if False:
total = 0
count = {}
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
total += 1
word = row[0]
for c in classes:
if 0 != getClassWordWeight(c, word):
if not count.has_key(c):
count[c] = 1
else:
count[c] += 1
for c in classes:
print c
print "count: %s" % count[c]
print "total: %s" % total
sys.exit()
"""
Average weight per class, over all words, with smoothing
"""
if False:
smoothing = 1
for c in classes:
print c
weights = []
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
weights.append( getClassWordWeight(c, word) + smoothing )
avg = 0
for weight in weights:
avg += weight
avg /= len(weights)
sd = 0
for weight in weights:
sd += (weight - avg) ** 2
sd /= len(weights)
sd = math.sqrt(sd)
print "avg: %s" % avg
print "sd: %s" % sd
sys.exit()
"""
Average weight per class, over all words
"""
if False:
for c in classes:
print c
weights = []
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
weights.append( getClassWordWeight(c, word) )
avg = 0
for weight in weights:
avg += weight
avg /= len(weights)
sd = 0
for weight in weights:
sd += (weight - avg) ** 2
sd /= len(weights)
sd = math.sqrt(sd)
print "avg: %s" % avg
print "sd: %s" % sd
sys.exit()
"""
Average weight per class
"""
if False:
for c in classes:
print c
weights = []
cur = conn.execute("SELECT weight FROM class_word_weight WHERE class= ?", [c])
for row in cur:
weight = row[0]
weights.append(weight)
avg = 0
for weight in weights:
avg += weight
avg /= len(weights)
sd = 0
for weight in weights:
sd += (weight - avg) ** 2
sd /= len(weights)
sd = math.sqrt(sd)
print "avg: %s" % avg
print "sd: %s" % sd
sys.exit()
"""
Detailed weight output per document
"""
if False:
print "printing weight details "
for c in classes:
dir = "arcadia_weight\\" + c
try:
os.makedirs(dir)
except:
pass
print "class: %s" % c
files = getFiles(corpus + "\\" + c)
print "total: %s" % len(files)
files = files[-use_for_test:]
print "use for test: %s" % len(files)
fcount = 0
for f in files:
fcount += 1
print "fileNo: %s (%s / %s)" % (f, fcount, len(files))
doc = open(corpus + "\\" + c + "\\" + f).read()
doc = unicode(doc, "utf-8", errors="replace")
words = getWords(doc)
data = toDataArray(words)
print "unique words: %s" % len(data)
allWeights = []
for d in data:
word, count = d
weights = []
for targetc in classes:
res = conn.execute("SELECT weight FROM class_word_weight WHERE class= ? AND word = ?", [targetc, word]).fetchone()
if res == None:
w = "NA"
else:
w = res[0]
weights.append(w)
if targetc == c:
weights.insert(0, count)
weights.insert(0, word)
weights.insert(0, w)
allWeights.append(tuple(weights))
allWeights.sort(reverse=True)
fout = open(dir + "\\" + f, 'w')
line = []
line.append("word")
line.append("count")
for targetc in classes:
line.append(targetc)
fout.write("\t".join([str(w) for w in line]))
fout.write("\n")
for w in allWeights:
try:
line = []
#line.append("\"" + w[1].replace("\"", "__").encode("utf-8") + "\"") #word
line.append("\"" + urllib.quote(w[1].replace("\"", "__").encode("utf-8")) + "\"") #word
line.append(w[2]) #count
for d in w[3:]:
line.append(d) #weight
fout.write("\t".join([str(w) for w in line]))
fout.write("\n")
except:
pass
sys.exit()
"""
Detailed output of word weights per class
"""
if False:
print "printing count and weight details "
fout = open('class_word_count_weight_detail.txt', 'w')
line = []
line.append("classes")
line.append("word")
line.append("weight")
fout.write("\t".join([str(w) for w in line]))
fout.write("\n")
try:
for c in classes:
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
line = []
line.append("\"" + c + "\"")
line.append("\"" + urllib.quote(word.replace("\"", "__").encode("utf-8")) + "\"")
res = conn.execute("SELECT weight FROM class_word_weight WHERE class= ? AND word = ?", [c, word]).fetchone()
if res == None:
line.append("NA")
else:
line.append(res[0])
fout.write("\t".join([str(w) for w in line]))
fout.write("\n")
except:
pass
"""
Detailed output of word weights per class, with each weight divided by its IDF value
"""
if False:
print "printing count and weight details "
fout = open('class_word_count_weight_detail_div_idf.txt', 'w')
line = []
line.append("classes")
line.append("word")
line.append("weight")
fout.write("\t".join([str(w) for w in line]))
fout.write("\n")
try:
totalDocCount = getGlobalValue("totalDocCount")
for c in classes:
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
line = []
line.append("\"" + c + "\"")
line.append("\"" + urllib.quote(word.replace("\"", "__").encode("utf-8")) + "\"")
res = conn.execute("SELECT weight FROM class_word_weight WHERE class= ? AND word = ?", [c, word]).fetchone()
if res == None:
line.append("NA")
else:
docCount = getWordDocCount(word)
idf = getIDF(docCount, totalDocCount)
if idf != 0:
line.append(res[0] / idf)
else:
line.append(0)
fout.write("\t".join([str(w) for w in line]))
fout.write("\n")
except:
pass
"""
Up to this point, class - word - weight values have been computed using tf-idf.
4.1 (TF transform)
Make term frequencies follow a power-law distribution, bringing them closer to the actual distribution of term frequencies.
4.2 (IDF transform)
Reduce the influence of frequent words on classification.
4.3 (length norm)
Normalize by document length so that long documents are not overweighted.
These three transforms are applied.
The result is available via getClassWordWeight(c, word).
"""
if True:
print "testing ..."
classWeights = {
"akamatu": 10338.780247,
"eva": 7652.34294295,
"ff": 4499.4428822,
"gs": 1214.35158326,
"HxH": 1957.9397041,
"muv": 14541.1070473,
"nade": 1122.17844428,
"naruto": 5903.74842047,
"original": 26770.7560395,
"sammon": 979.966347776,
"toraha": 24123.1373033,
"type": 14555.178196,
"zero": 10239.0622349
}
if True:
print "-" * 30
print "*class weights*"
classWeights = {}
classWeightsTotal = 0
for c in classes:
classWeights[c] = getClassWeight(c)
classWeightsTotal += classWeights[c]
print "\"%s\": %s," % (c, classWeights[c])
print "-" * 30
print "-" * 30
print "*class weights normalized*"
for c in classes:
print "\"%s\": %s," % (c, classWeights[c] / classWeightsTotal)
print "-" * 30
class_weights = {
1 : {
"count_log" : {
"akamatu": 0.076803957698,
"eva": 0.0765281913222,
"ff": 0.0776863314586,
"gs": 0.0782604959874,
"HxH": 0.0781194882895,
"muv": 0.0764444270908,
"nade": 0.0781613567559,
"naruto": 0.0770410538314,
"original": 0.0745866402645,
"sammon": 0.0782463671122,
"toraha": 0.0756274083759,
"type": 0.0757030920232,
"zero": 0.0767911897903,
},
"count" : {
"akamatu": 0.0817124441002,
"eva": 0.0372208887495,
"ff": 0.0278040000315,
"gs": 0.00382368938481,
"HxH": 0.00844992107636,
"muv": 0.153335036161,
"nade": 0.0018630685205,
"naruto": 0.0363817681699,
"original": 0.203180054696,
"sammon": 0.00328466417964,
"toraha": 0.271105060781,
"type": 0.0922599155798,
"zero": 0.0795794885704
},
"log" : {
"akamatu": 0.0769167788887,
"eva": 0.0769125068799,
"ff": 0.0768909989191,
"gs": 0.0768846856777,
"HxH": 0.0768855946199,
"muv": 0.0769452721712,
"nade": 0.0768852921445,
"naruto": 0.0769031605635,
"original": 0.0770171973361,
"sammon": 0.0768843171399,
"toraha": 0.0770047945764,
"type": 0.0769507056099,
"zero": 0.0769186954732
},
"log_complement" : {
"akamatu": 0.076868318422,
"eva": 0.0770057713849,
"ff": 0.0767609289076,
"gs": 0.0767150451683,
"HxH": 0.0767237198601,
"muv": 0.076940334663,
"nade": 0.0767342202184,
"naruto": 0.0768963973054,
"original": 0.0775000489223,
"sammon": 0.0767192394693,
"toraha": 0.0770806816713,
"type": 0.0771654693196,
"zero": 0.0768898246878,
}
},
0.1: {
"count_log": {
"akamatu": 0.0762512836488,
"eva": 0.0749601826474,
"ff": 0.0782878349419,
"gs": 0.0805614637159,
"HxH": 0.0798892961017,
"muv": 0.0758655048967,
"nade": 0.0800124210774,
"naruto": 0.076244145346,
"original": 0.072583801544,
"sammon": 0.0804913988864,
"toraha": 0.0748565320527,
"type": 0.073825056199,
"zero": 0.0761710789423,
},
"count": {
"akamatu": 0.0756639558836,
"eva": 0.0464631526662,
"ff": 0.0185410289536,
"gs": 0.00215203163463,
"HxH": 0.00448477587876,
"muv": 0.159492779821,
"nade": 0.00126296539504,
"naruto": 0.0416746791096,
"original": 0.202696157374,
"sammon": 0.00158442528543,
"toraha": 0.260535990882,
"type": 0.106342988214,
"zero": 0.0791050689028
},
"log": {
"akamatu": 0.07699133716,
"eva": 0.076845241008,
"ff": 0.0764467954692,
"gs": 0.0761991627296,
"HxH": 0.0762487503426,
"muv": 0.0774045672112,
"nade": 0.0762109372266,
"naruto": 0.0766707975439,
"original": 0.0782230139347,
"sammon": 0.076185786234,
"toraha": 0.0780864337276,
"type": 0.0774748960333,
"zero": 0.0770122813794
},
"log_complement": {
"akamatu": 0.0768208954766,
"eva": 0.0771082955739,
"ff": 0.0768253492256,
"gs": 0.076840441902,
"HxH": 0.0768340577217,
"muv": 0.0768082872779,
"nade": 0.0768719322054,
"naruto": 0.076991293746,
"original": 0.0773086211446,
"sammon": 0.0768524950237,
"toraha": 0.0767250394207,
"type": 0.0771475661595,
"zero": 0.0768657251224,
}
},
0.01: {
"count_log": {
"akamatu": 0.0769066375194,
"eva": 0.0739037067159,
"ff": 0.0778579980725,
"gs": 0.0797584560021,
"HxH": 0.0791186404913,
"muv": 0.0773352371408,
"nade": 0.0783064605351,
"naruto": 0.0750942542841,
"original": 0.0742536071925,
"sammon": 0.0793357349613,
"toraha": 0.0771055671215,
"type": 0.074318007383,
"zero": 0.0767056925805,
},
"count": {
"akamatu": 0.0866491274899,
"eva": 0.065706559771,
"ff": 0.0270600176155,
"gs": 0.00434317823757,
"HxH": 0.00757625053516,
"muv": 0.161510359574,
"nade": 0.00295720211797,
"naruto": 0.0654134395297,
"original": 0.160667140588,
"sammon": 0.00288996606205,
"toraha": 0.212703142655,
"type": 0.109843093169,
"zero": 0.0926805226562
},
"log": {
"akamatu": 0.0779073235827,
"eva": 0.0775962381183,
"ff": 0.075760836814,
"gs": 0.0735975439802,
"HxH": 0.0742280213893,
"muv": 0.0790295088583,
"nade": 0.0736528865082,
"naruto": 0.0768763460077,
"original": 0.0803984430262,
"sammon": 0.0733801716318,
"toraha": 0.0800780482133,
"type": 0.0794299946095,
"zero": 0.0780646372604
},
"log_complement": {
"akamatu": 0.0768101047625,
"eva": 0.0771377553335,
"ff": 0.0768614819677,
"gs": 0.0768978181705,
"HxH": 0.0768865536412,
"muv": 0.076764183868,
"nade": 0.0769321952581,
"naruto": 0.0770269395279,
"original": 0.077194430214,
"sammon": 0.0769128105813,
"toraha": 0.0765881596931,
"type": 0.0771212791979,
"zero": 0.0768662877843,
}
},
0.001: {
"count_log": {
"akamatu": 0.0780185415292,
"eva": 0.0744963985378,
"ff": 0.0781212304252,
"gs": 0.0774890170339,
"HxH": 0.0780030329864,
"muv": 0.0787748459599,
"nade": 0.0754859561034,
"naruto": 0.0754724012128,
"original": 0.0757668347929,
"sammon": 0.0762996258978,
"toraha": 0.0787218466361,
"type": 0.075559194554,
"zero": 0.0777910743308,
},
"count": {
"akamatu": 0.087973665158,
"eva": 0.0713664055092,
"ff": 0.0329470176543,
"gs": 0.00898003655301,
"HxH": 0.0123538226244,
"muv": 0.15620341616,
"nade": 0.00650070208851,
"naruto": 0.0755201410784,
"original": 0.146020928595,
"sammon": 0.006473791386,
"toraha": 0.194744686646,
"type": 0.106456586688,
"zero": 0.0944587998596
},
"log": {
"akamatu": 0.0782107352173,
"eva": 0.0787720800924,
"ff": 0.0762193791294,
"gs": 0.0726834267532,
"HxH": 0.0740859173818,
"muv": 0.0792287623342,
"nade": 0.0728466156321,
"naruto": 0.0779899472264,
"original": 0.0797516011932,
"sammon": 0.0720145093176,
"toraha": 0.0792177632836,
"type": 0.0802871127608,
"zero": 0.0786921496779
},
"log_complement": {
"akamatu": 0.0768090069784,
"eva": 0.0771374611645,
"ff": 0.0768651848909,
"gs": 0.0769002184338,
"HxH": 0.0768892863743,
"muv": 0.0767625371869,
"nade": 0.076934847317,
"naruto": 0.0770284299144,
"original": 0.0771921511656,
"sammon": 0.0769160441484,
"toraha": 0.0765805790049,
"type": 0.0771147553449,
"zero": 0.0768694980761,
}
},
0.0001: {
"count_log": {
"akamatu": 0.0780923730448,
"eva": 0.0746496463985,
"ff": 0.0782014629913,
"gs": 0.077194459967,
"HxH": 0.0779437865349,
"muv": 0.0788988453706,
"nade": 0.0753137295455,
"naruto": 0.07564764567,
"original": 0.0758632887556,
"sammon": 0.0757972084518,
"toraha": 0.0787955634873,
"type": 0.0757206257892,
"zero": 0.0778813639935,
},
"count": {
"akamatu": 0.0877946516109,
"eva": 0.0718364824067,
"ff": 0.0337676403197,
"gs": 0.0104117347871,
"HxH": 0.0134384001925,
"muv": 0.154946393348,
"nade": 0.00765304224568,
"naruto": 0.0766786954138,
"original": 0.143815282776,
"sammon": 0.00776600792853,
"toraha": 0.191962313085,
"type": 0.105621987204,
"zero": 0.0943073686819
},
"log": {
"akamatu": 0.0774683894707,
"eva": 0.0791320992099,
"ff": 0.0765333112053,
"gs": 0.0741150313085,
"HxH": 0.0752868374142,
"muv": 0.0781419999124,
"nade": 0.0745485141111,
"naruto": 0.0785667689259,
"original": 0.0777005930993,
"sammon": 0.0734073877121,
"toraha": 0.076954612506,
"type": 0.0798603463815,
"zero": 0.078284108743
},
"log_complement": {
"akamatu": 0.076809105821,
"eva": 0.0771335341665,
"ff": 0.0768645996078,
"gs": 0.0768954693754,
"HxH": 0.0768854944305,
"muv": 0.0767662220885,
"nade": 0.0769300425931,
"naruto": 0.0770255558043,
"original": 0.0772051013097,
"sammon": 0.0769118675756,
"toraha": 0.0765895738483,
"type": 0.0771111032019,
"zero": 0.0768723301775,
}
},
0.00001: {
"log_count": {
"akamatu": 0.0779002954313,
"eva": 0.0746198573607,
"ff": 0.0781516623642,
"gs": 0.0774355417012,
"HxH": 0.078062502327,
"muv": 0.0787193133767,
"nade": 0.075844989475,
"naruto": 0.0756769264951,
"original": 0.0756491702872,
"sammon": 0.0760771991106,
"toraha": 0.0785395909979,
"type": 0.0756113949565,
"zero": 0.0777115561166,
},
"count": {
"akamatu": 0.0877666570356,
"eva": 0.0718771209795,
"ff": 0.0338510695395,
"gs": 0.0105852858453,
"HxH": 0.013560346216,
"muv": 0.154801162357,
"nade": 0.00779482272841,
"naruto": 0.076791296446,
"original": 0.143576409062,
"sammon": 0.00792963326056,
"toraha": 0.191659569346,
"type": 0.105525194695,
"zero": 0.09428143249
},
"log": {
"akamatu": 0.0766722033925,
"eva": 0.0792740159204,
"ff": 0.0767142005174,
"gs": 0.0757035173,
"HxH": 0.07644277834,
"muv": 0.0770362397981,
"nade": 0.0764108290148,
"naruto": 0.0789193706818,
"original": 0.0757889916054,
"sammon": 0.0750999310093,
"toraha": 0.0748666230714,
"type": 0.0793042247333,
"zero": 0.0777670746155
},
"log_complement": {
"akamatu": 0.0768093307345,
"eva": 0.0771292494819,
"ff": 0.0768635805306,
"gs": 0.0768900025971,
"HxH": 0.0768810447395,
"muv": 0.0767704444539,
"nade": 0.0769244892152,
"naruto": 0.0770222444065,
"original": 0.0772195844618,
"sammon": 0.0769069458823,
"toraha": 0.0766002382159,
"type": 0.07710775636,
"zero": 0.0768750889207,
}
},
0.000001: {
"log_count": {
"akamatu": 0.0776802445669,
"eva": 0.0745687723841,
"ff": 0.0780863657591,
"gs": 0.0777347837739,
"HxH": 0.0781979014009,
"muv": 0.0785078202681,
"nade": 0.0764504623561,
"naruto": 0.0756880491118,
"original": 0.0754029154941,
"sammon": 0.0764458096232,
"toraha": 0.0782499260423,
"type": 0.0754728302138,
"zero": 0.0775141190057,
},
"count": {
"akamatu": 0.0877637342908,
"eva": 0.0718811026902,
"ff": 0.0338594194431,
"gs": 0.0106030098068,
"HxH": 0.0135726913015,
"muv": 0.154786405451,
"nade": 0.00780932820254,
"naruto": 0.0768025052285,
"original": 0.143552304321,
"sammon": 0.00794643264095,
"toraha": 0.19162900391,
"type": 0.105515355829,
"zero": 0.0942787068858
},
"log": {
"akamatu": 0.0759959218038,
"eva": 0.0793751006252,
"ff": 0.0768547708465,
"gs": 0.0770568151608,
"HxH": 0.0774128113192,
"muv": 0.076101924644,
"nade": 0.0779959724918,
"naruto": 0.0791974414203,
"original": 0.0741883143549,
"sammon": 0.076554810523,
"toraha": 0.0731201540118,
"type": 0.0788243189481,
"zero": 0.0773216438505
},
"log_complement": {
"akamatu": 0.0768095678823,
"eva": 0.0771249373728,
"ff": 0.0768625201368,
"gs": 0.0768844746722,
"HxH": 0.0768765379101,
"muv": 0.0767747124247,
"nade": 0.0769188717703,
"naruto": 0.0770188957282,
"original": 0.0772341929426,
"sammon": 0.0769019592646,
"toraha": 0.0766110487998,
"type": 0.0771044464911,
"zero": 0.0768778346044,
}
},
0.0000001: {
"count_log": {
"akamatu": 0.0774598449828,
"eva": 0.0745161216571,
"ff": 0.0780202342653,
"gs": 0.0780365066339,
"HxH": 0.0783334377885,
"muv": 0.0782954903244,
"nade": 0.0770566248788,
"naruto": 0.0756972247399,
"original": 0.0751561942427,
"sammon": 0.0768192033893,
"toraha": 0.0779601222388,
"type": 0.0753328787885,
"zero": 0.07731611607,
},
"count": {
"akamatu": 0.0877634407559,
"eva": 0.071881500017,
"ff": 0.0338602544945,
"gs": 0.0106047859704,
"HxH": 0.013573927332,
"muv": 0.154784927375,
"nade": 0.00781078210065,
"naruto": 0.0768036255699,
"original": 0.143549891631,
"sammon": 0.0079481170654,
"toraha": 0.191625944399,
"type": 0.105514370313,
"zero": 0.0942784329759
},
"log": {
"akamatu": 0.075427080173,
"eva": 0.0794583624502,
"ff": 0.076971817242,
"gs": 0.0781954798292,
"HxH": 0.078227651215,
"muv": 0.0753164901562,
"nade": 0.0793295879658,
"naruto": 0.0794293909472,
"original": 0.0728440233786,
"sammon": 0.0777801322794,
"toraha": 0.0716535980466,
"type": 0.0784199750961,
"zero": 0.0769464112207
},
"log_complement": {
"akamatu": 0.0768098057981,
"eva": 0.0771206307573,
"ff": 0.0768614576473,
"gs": 0.0768789511982,
"HxH": 0.0768720339827,
"muv": 0.0767789767999,
"nade": 0.076913258656,
"naruto": 0.0770155497224,
"original": 0.0772487860473,
"sammon": 0.0768969756883,
"toraha": 0.0766218533355,
"type": 0.0771011466239,
"zero": 0.076880573743,
}
},
0.00000001: {
"count_log": {
"akamatu": 0.0772418338363,
"eva": 0.0744638956495,
"ff": 0.0779547474875,
"gs": 0.0783351567524,
"HxH": 0.0784674975569,
"muv": 0.0780854121642,
"nade": 0.0776561920649,
"naruto": 0.0757061079394,
"original": 0.0749121392586,
"sammon": 0.0771889633168,
"toraha": 0.0776734905092,
"type": 0.0751943294385,
"zero": 0.0771202340258,
},
"count": {
"akamatu": 0.0877634113898,
"eva": 0.0718815397412,
"ff": 0.0338603380002,
"gs": 0.0106049636246,
"HxH": 0.0135740509502,
"muv": 0.154784779543,
"nade": 0.00781092752405,
"naruto": 0.0768037375987,
"original": 0.14354965034,
"sammon": 0.00794828555283,
"toraha": 0.191625638418,
"type": 0.105514271746,
"zero": 0.0942784055714
},
"log": {
"akamatu": 0.0749431204614,
"eva": 0.0795290371209,
"ff": 0.077071287961,
"gs": 0.0791642677641,
"HxH": 0.0789208012298,
"muv": 0.0746482986623,
"nade": 0.0804642307083,
"naruto": 0.0796265495841,
"original": 0.0717005189699,
"sammon": 0.0788227586966,
"toraha": 0.0704061066145,
"type": 0.0780759041939,
"zero": 0.0766271180331
},
"log_complement": {
"akamatu": 0.0768100433374,
"eva": 0.0771163328925,
"ff": 0.0768603969728,
"gs": 0.0768734386889,
"HxH": 0.0768675389235,
"muv": 0.0767832326941,
"nade": 0.0769076566651,
"naruto": 0.0770122103564,
"original": 0.0772633498228,
"sammon": 0.0768920019077,
"toraha": 0.0766326366881,
"type": 0.0770978540391,
"zero": 0.0768833070117,
}
},
0.000000001: {
"count_log": {
"akamatu": 0.0770264385046,
"eva": 0.0744122817982,
"ff": 0.0778900393121,
"gs": 0.0786302430614,
"HxH": 0.0785999480243,
"muv": 0.0778778497505,
"nade": 0.0782485626974,
"naruto": 0.0757148653935,
"original": 0.0746710118418,
"sammon": 0.0775543276262,
"toraha": 0.0773903010702,
"type": 0.075057431106,
"zero": 0.0769266998139,
},
"count": {
"akamatu": 0.0877634084531,
"eva": 0.0718815437135,
"ff": 0.0338603463508,
"gs": 0.0106049813904,
"HxH": 0.0135740633122,
"muv": 0.15478476476,
"nade": 0.00781094206672,
"naruto": 0.0768037488015,
"original": 0.143549626211,
"sammon": 0.00794830240203,
"toraha": 0.19162560782,
"type": 0.105514261889,
"zero": 0.0942784028308
},
"log": {
"akamatu": 0.0745264717067,
"eva": 0.0795898669339,
"ff": 0.0771569136676,
"gs": 0.0799983161095,
"HxH": 0.079517535966,
"muv": 0.0740730456274,
"nade": 0.0814410656171,
"naruto": 0.079796269937,
"original": 0.0707160754542,
"sammon": 0.0797203862322,
"toraha": 0.0693321421226,
"type": 0.0777796821406,
"zero": 0.0763522284853
},
"log_complement": {
"akamatu": 0.0768102803871,
"eva": 0.0771120440774,
"ff": 0.0768593384972,
"gs": 0.0768679377608,
"HxH": 0.0768630533009,
"muv": 0.0767874796453,
"nade": 0.0769020664415,
"naruto": 0.0770088780059,
"original": 0.077277882965,
"sammon": 0.076887038567,
"toraha": 0.0766433974122,
"type": 0.077094568445,
"zero": 0.0768860344947,
}
},
0.0000000001: {
"count_log": {
"akamatu": 0.0768136384817,
"eva": 0.0743612884056,
"ff": 0.0778261100987,
"gs": 0.0789217758045,
"HxH": 0.078730802512,
"muv": 0.0776727877809,
"nade": 0.0788337955667,
"naruto": 0.0757235154245,
"original": 0.0744327897079,
"sammon": 0.0779152937141,
"toraha": 0.0771105241055,
"type": 0.0749221811379,
"zero": 0.07673549726,
},
"count": {
"akamatu": 0.0877634081594,
"eva": 0.0718815441108,
"ff": 0.0338603471859,
"gs": 0.0106049831669,
"HxH": 0.0135740645484,
"muv": 0.154784763281,
"nade": 0.007810943521,
"naruto": 0.0768037499218,
"original": 0.143549623798,
"sammon": 0.00794830408695,
"toraha": 0.19162560476,
"type": 0.105514260903,
"zero": 0.0942784025568
},
"log": {
"akamatu": 0.0741640144616,
"eva": 0.0796427834868,
"ff": 0.0772314014908,
"gs": 0.080723884071,
"HxH": 0.0800366553981,
"muv": 0.0735726133606,
"nade": 0.0822908485295,
"naruto": 0.0799439140127,
"original": 0.0698596754713,
"sammon": 0.0805012649201,
"toraha": 0.0683978650218,
"type": 0.0775219877257,
"zero": 0.0761130920499
},
"log_complement": {
"akamatu": 0.0768105169376,
"eva": 0.0771077643158,
"ff": 0.0768582822526,
"gs": 0.0768624484425,
"HxH": 0.0768585771443,
"muv": 0.0767917176332,
"nade": 0.0768964880159,
"naruto": 0.0770055526882,
"original": 0.0772923854314,
"sammon": 0.0768820857004,
"toraha": 0.0766541354285,
"type": 0.0770912897924,
"zero": 0.0768887562172,
}
},
0.00000000001: {
"count_log": {
"akamatu": 0.0766033897454,
"eva": 0.0743109062364,
"ff": 0.0777629472703,
"gs": 0.0792098135176,
"HxH": 0.0788600881609,
"muv": 0.0774701842772,
"nade": 0.0794120119783,
"naruto": 0.0757320615598,
"original": 0.0741974236423,
"sammon": 0.0782719325382,
"toraha": 0.0768341014534,
"type": 0.0747885525866,
"zero": 0.0765465870337,
},
"count": {
"akamatu": 0.08776340813,
"eva": 0.0718815441505,
"ff": 0.0338603472694,
"gs": 0.0106049833446,
"HxH": 0.013574064672,
"muv": 0.154784763134,
"nade": 0.00781094366642,
"naruto": 0.0768037500338,
"original": 0.143549623556,
"sammon": 0.00794830425544,
"toraha": 0.191625604454,
"type": 0.105514260804,
"zero": 0.0942784025294
},
"log": {
"akamatu": 0.0738458226359,
"eva": 0.0796892374256,
"ff": 0.0772967923307,
"gs": 0.0813608414149,
"HxH": 0.0804923768176,
"muv": 0.0731332968716,
"nade": 0.0830368509551,
"naruto": 0.0800735267761,
"original": 0.0691078642702,
"sammon": 0.081186778209,
"toraha": 0.0675776875439,
"type": 0.0772957644232,
"zero": 0.0759031603262
},
"log_complement": {
"akamatu": 0.0768107529891,
"eva": 0.0771034935823,
"ff": 0.0768572282358,
"gs": 0.0768569707035,
"HxH": 0.0768541104299,
"muv": 0.0767959466814,
"nade": 0.0768909213577,
"naruto": 0.0770022343852,
"original": 0.0773068573054,
"sammon": 0.0768771432815,
"toraha": 0.0766648507938,
"type": 0.0770880180566,
"zero": 0.0768914721978,
}
},
0.000000000001: {
"count_log": {
"akamatu": 0.0763956469456,
"eva": 0.0742611245535,
"ff": 0.0777005372671,
"gs": 0.0794944181534,
"HxH": 0.0789878328646,
"muv": 0.0772699955822,
"nade": 0.0799833366764,
"naruto": 0.0757405058157,
"original": 0.0739648628837,
"sammon": 0.0786243206554,
"toraha": 0.076560973462,
"type": 0.0746565167312,
"zero": 0.0763599284094,
},
"count": {
"akamatu": 0.0877634081271,
"eva": 0.0718815441545,
"ff": 0.0338603472777,
"gs": 0.0106049833624,
"HxH": 0.0135740646844,
"muv": 0.154784763119,
"nade": 0.00781094368097,
"naruto": 0.076803750045,
"original": 0.143549623532,
"sammon": 0.00794830427229,
"toraha": 0.191625604423,
"type": 0.105514260795,
"zero": 0.0942784025266
},
"log": {
"akamatu": 0.0735642540501,
"eva": 0.0797303445963,
"ff": 0.077354656808,
"gs": 0.081924486251,
"HxH": 0.0808956456087,
"muv": 0.0727445448312,
"nade": 0.0836969900022,
"naruto": 0.0801882213565,
"original": 0.0684425850438,
"sammon": 0.0817933903066,
"toraha": 0.0668519108644,
"type": 0.0770955789636,
"zero": 0.0757173913177
},
"log_complement": {
"akamatu": 0.0768109885432,
"eva": 0.0770992318487,
"ff": 0.0768561764402,
"gs": 0.0768515045081,
"HxH": 0.0768496531284,
"muv": 0.0768001668174,
"nade": 0.0768853664303,
"naruto": 0.0769989230749,
"original": 0.0773212986821,
"sammon": 0.0768722112779,
"toraha": 0.0766755435783,
"type": 0.0770847532156,
"zero": 0.076894182455,
}
},
0.0000000000001: {
"count_log": {
"akamatu": 0.076190365571,
"eva": 0.0742119327039,
"ff": 0.0776388667235,
"gs": 0.079775650674,
"HxH": 0.0791140639945,
"muv": 0.0770721788077,
"nade": 0.0805478920755,
"naruto": 0.0757488500185,
"original": 0.073735057604,
"sammon": 0.078972533532,
"toraha": 0.0762910816077,
"type": 0.0745260452918,
"zero": 0.0761754813959,
},
"count": {
"akamatu": 0.0877634081268,
"eva": 0.0718815441549,
"ff": 0.0338603472786,
"gs": 0.0106049833641,
"HxH": 0.0135740646856,
"muv": 0.154784763117,
"nade": 0.00781094368242,
"naruto": 0.0768037500461,
"original": 0.14354962353,
"sammon": 0.00794830427398,
"toraha": 0.19162560442,
"type": 0.105514260794,
"zero": 0.0942784025264
},
"log": {
"akamatu": 0.0733133298964,
"eva": 0.0797669778797,
"ff": 0.077406223621,
"gs": 0.0824267869713,
"HxH": 0.0812550247629,
"muv": 0.072398102502,
"nade": 0.0842852830169,
"naruto": 0.080290433185,
"original": 0.0678497112818,
"sammon": 0.0823339819609,
"toraha": 0.0662051238713,
"type": 0.076917180628,
"zero": 0.0755518404228
},
"log_complement": {
"akamatu": 0.0768112236014,
"eva": 0.0770949790866,
"ff": 0.0768551268587,
"gs": 0.0768460498197,
"HxH": 0.0768452052101,
"muv": 0.0768043780695,
"nade": 0.0768798231968,
"naruto": 0.0769956187354,
"original": 0.0773357096578,
"sammon": 0.0768672896569,
"toraha": 0.0766862138529,
"type": 0.0770814952475,
"zero": 0.0768968870066,
}
},
0.00000000000001: {
"count_log": {
"akamatu": 0.0759875021358,
"eva": 0.0741633202682,
"ff": 0.0775779225762,
"gs": 0.0800535706528,
"HxH": 0.0792388082909,
"muv": 0.0768766920497,
"nade": 0.0811057977688,
"naruto": 0.0757570959377,
"original": 0.0735079591224,
"sammon": 0.079316644928,
"toraha": 0.0760243687178,
"type": 0.0743971106309,
"zero": 0.0759932069209,
},
"count": {
"akamatu": 0.0877634081268,
"eva": 0.0718815441549,
"ff": 0.0338603472787,
"gs": 0.0106049833643,
"HxH": 0.0135740646858,
"muv": 0.154784763117,
"nade": 0.00781094368256,
"naruto": 0.0768037500463,
"original": 0.14354962353,
"sammon": 0.00794830427415,
"toraha": 0.19162560442,
"type": 0.105514260793,
"zero": 0.0942784025263
},
"log": {
"akamatu": 0.0730883055965,
"eva": 0.0797998299535,
"ff": 0.0774524678187,
"gs": 0.082877241288,
"HxH": 0.0815773095695,
"muv": 0.0720874192083,
"nade": 0.0848128536856,
"naruto": 0.0803820949283,
"original": 0.0673180326798,
"sammon": 0.0828187749028,
"toraha": 0.065625096847,
"type": 0.0767571961857,
"zero": 0.0754033773364
},
"log_complement": {
"akamatu": 0.0768114581653,
"eva": 0.0770907352678,
"ff": 0.0768540794844,
"gs": 0.076840606602,
"HxH": 0.0768407666455,
"muv": 0.0768085804657,
"nade": 0.0768742916203,
"naruto": 0.0769923213447,
"original": 0.0773500903283,
"sammon": 0.0768623783856,
"toraha": 0.0766968616889,
"type": 0.0770782441306,
"zero": 0.0768995858708,
}
}
}
"""
s_params = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001,
0.00000001, 0.000000001, 0.0000000001, 0.00000000001, 0.000000000001, 0.0000000000001, 0.00000000000001]
"""
s_params = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001,
0.00000001, 0.000000001, 0.0000000001, 0.00000000001, 0.000000000001, 0.0000000000001, 0.00000000000001,
0.000000000000001, 0.0000000000000001, 0.00000000000000001, 0.000000000000000001, 0.0000000000000000001, 0.00000000000000000001,
0.000000000000000000001, 0.0000000000000000000001, 0.00000000000000000000001, 0.000000000000000000000001, 0.0000000000000000000000001,
0.000000000000000000000000001, 0.000000000000000000000000001, 0.0000000000000000000000000001, 0.00000000000000000000000000001]
for s in s_params:
s_all = s * conn.execute("SELECT COUNT(DISTINCT word) FROM word_doc_count").fetchone()[0]
print "s: %s" % s
print "s_all: %s" % s_all
if False:
#per-class word weight computation, including the influence of counts
if False:
print "-" * 30
print "each classes -> count[c] * log((w + s) / (cw + s_all))"
cw = {}
cw_all = 0
for c in classes:
cw[c] = 0
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
weight = getClassWordWeight(c, word)
count = getClassWordCount(c, word)
cw[c] += float(count) * math.log((weight + s) / (classWeights[c] + s_all))
cw_all += cw[c]
for c in classes:
cw[c] /= cw_all
print "\"%s\": %s," % (c, cw[c])
print "-" * 30
if True:
print "-" * 30
print "each classes -> count * log((w + s) / (cw + s_all))"
cw = {}
cw_all = 0
for c in classes:
cw[c] = 0
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
count = 0
for ec in classes:
count += getClassWordCount(ec, word)
weight = getClassWordWeight(c, word)
cw[c] += float(count) * math.log((weight + s) / (classWeights[c] + s_all))
cw_all += cw[c]
for c in classes:
cw[c] /= cw_all
print "\"%s\": %s," % (c, cw[c])
print "-" * 30
#per-class word weight computation, including the influence of counts
if False:
print "-" * 30
print "each classes -> count[c] * ((w + s) / (cw + s_all))"
cw = {}
cw_all = 0
for c in classes:
cw[c] = 0
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
weight = getClassWordWeight(c, word)
count = getClassWordCount(c, word)
cw[c] += float(count) * ((weight + s) / (classWeights[c] + s_all))
cw_all += cw[c]
for c in classes:
cw[c] /= cw_all
print "\"%s\": %s," % (c, cw[c])
print "-" * 30
#per-class word weight computation, including the influence of counts
if True:
print "-" * 30
print "each classes -> count * ((w + s) / (cw + s_all))"
cw = {}
cw_all = 0
for c in classes:
cw[c] = 0
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
count = 0
for ec in classes:
count += getClassWordCount(ec, word)
weight = getClassWordWeight(c, word)
cw[c] += float(count) * ((weight + s) / (classWeights[c] + s_all))
cw_all += cw[c]
for c in classes:
cw[c] /= cw_all
print "\"%s\": %s," % (c, cw[c])
print "-" * 30
#per-class weights used for NB
if False:
print "-" * 30
print "each classes -> log((w + s) / (cw + s_all))"
cw = {}
cw_all = 0
for c in classes:
cw[c] = 0
denominator = classWeights[c]
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
numerator = getClassWordWeight(c, word)
cw[c] += math.log( (numerator + s) / (denominator + s_all) )
cw_all += cw[c]
for c in classes:
cw[c] /= cw_all
print "\"%s\": %s," % (c, cw[c])
print "-" * 30
if False:
print "-" * 30
print "each classes -> ((w + s) / (cw + s_all))"
cw = {}
cw_all = 0
for c in classes:
cw[c] = 0
denominator = classWeights[c]
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
numerator = getClassWordWeight(c, word)
cw[c] += (numerator + s) / (denominator + s_all)
cw_all += cw[c]
for c in classes:
cw[c] /= cw_all
print "\"%s\": %s," % (c, cw[c])
print "-" * 30
#per-class weights used in TWCNB 6
if False:
print "-" * 30
print "each classes -> log((w[^c] + s) / (cw[^c] + s_all))"
cw = {}
cw_all = 0
for c in classes:
cw[c] = 0
denominator = 0
for ec in classes:
if ec != c:
denominator += classWeights[ec]
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
numerator = 0
for ec in classes:
if ec != c:
numerator += getClassWordWeight(ec, word)
cw[c] += math.log( (numerator + s) / (denominator + s_all) )
cw_all += cw[c]
for c in classes:
cw[c] /= cw_all
print "\"%s\": %s," % (c, cw[c])
print "-" * 30
#the above, multiplied by count
if True:
print "-" * 30
print "each classes -> count[c] * log((w[^c] + s) / (cw[^c] + s_all))"
cw = {}
cw_all = 0
for c in classes:
cw[c] = 0
denominator = 0
for ec in classes:
if ec != c:
denominator += classWeights[ec]
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
numerator = 0
for ec in classes:
if ec != c:
numerator += getClassWordWeight(ec, word)
count = getClassWordCount(c, word)
cw[c] += float(count) * math.log( (numerator + s) / (denominator + s_all) )
cw_all += cw[c]
for c in classes:
cw[c] /= cw_all
print "\"%s\": %s," % (c, cw[c])
print "-" * 30
if True:
print "-" * 30
print "each classes -> count * log((w[^c] + s) / (cw[^c] + s_all))"
cw = {}
cw_all = 0
for c in classes:
cw[c] = 0
denominator = 0
for ec in classes:
if ec != c:
denominator += classWeights[ec]
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
count = 0
for ec in classes:
count += getClassWordCount(ec, word)
numerator = 0
for ec in classes:
if ec != c:
numerator += getClassWordWeight(ec, word)
cw[c] += float(count) * math.log( (numerator + s) / (denominator + s_all) )
cw_all += cw[c]
for c in classes:
cw[c] /= cw_all
print "\"%s\": %s," % (c, cw[c])
print "-" * 30
if True:
print "-" * 30
print "each classes -> count[^c] * log((w[^c] + s) / (cw[^c] + s_all))"
cw = {}
cw_all = 0
for c in classes:
cw[c] = 0
denominator = 0
for ec in classes:
if ec != c:
denominator += classWeights[ec]
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
count = 0
for ec in classes:
if ec != c:
count += getClassWordCount(ec, word)
numerator = 0
for ec in classes:
if ec != c:
numerator += getClassWordWeight(ec, word)
cw[c] += float(count) * math.log( (numerator + s) / (denominator + s_all) )
cw_all += cw[c]
for c in classes:
cw[c] /= cw_all
print "\"%s\": %s," % (c, cw[c])
print "-" * 30
continue
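# All the variants above differ only in (a) which counts multiply the log
# term (this class's counts, all classes' counts, or complement counts) and
# (b) whether the weight/classWeight pair comes from class c itself or from
# its complement [^c]; the smoothed ratio (x + s) / (X + s_all) is the same
# throughout.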
"""
Data computed with an incorrect smoothing parameter
classComplementWeightsForNorm = {
"akamatu": -3080841.17686,
"eva": -3086012.45238,
"ff": -3092460.60499,
"gs": -3098540.73891,
"HxH": -3097159.98922,
"muv": -3072167.23895,
"nade": -3098715.89665,
"naruto": -3089468.30269,
"original": -3044780.5968,
"sammon": -3098987.98064,
"toraha": -3050861.25974,
"type": -3071706.76705,
"zero": -3081028.99847
}
if False:
#per-class weights used in TWCNB 6
print "computing classComplementWeightsForNorm"
classComplementWeightsForNorm = {}
for c in classes:
denominator = 0
for ec in classes:
if ec != c:
denominator += classWeights[ec]
classWeight = 0
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
#cur = conn.execute("SELECT word FROM class_word_weight WHERE class = ?", [c])
for row in cur:
word = row[0]
numerator = 0
for ec in classes:
if ec != c:
numerator += getClassWordWeight(ec, word)
classWeight += math.log( (numerator + smoothing) / (denominator + smoothingAll) )
classComplementWeightsForNorm[c] = classWeight
print "%s: %s" % (c, classWeight)
"""
"""
Weights computed over the entire vocabulary here, but this has a negative effect
classComplementWeightsForNorm = {
"akamatu": -3408574.74087,
"eva": -3409327.91309,
"ff": -3410757.3117,
"gs": -3411789.25816,
"HxH": -3411535.54758,
"muv": -3407090.61123,
"nade": -3411825.33577,
"naruto": -3409978.75976,
"original": -3402823.41769,
"sammon": -3411883.10451,
"toraha": -3403595.02465,
"type": -3406654.81041,
"zero": -3408596.15052
}
if False:
#per-class weights used in TWCNB 6
print "computing classComplementWeightsForNorm"
classComplementWeightsForNorm = {}
for c in classes:
denominator = 0
for ec in classes:
if ec != c:
denominator += classWeights[ec]
classWeight = 0
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
numerator = 0
for ec in classes:
if ec != c:
numerator += getClassWordWeight(ec, word)
classWeight += math.log( (numerator + smoothing) / (denominator + smoothingAll) )
classComplementWeightsForNorm[c] = classWeight
print "%s: %s" % (c, classWeight)
"""
"""
classComplementWeightsForNorm = {
"akamatu": -1069934.25368,
"eva": -683627.513276,
"ff": -754925.208654,
"gs": -327454.900299,
"HxH": -447114.888567,
"muv": -1157362.67506,
"nade": -219929.813111,
"naruto": -628259.562952,
"original": -1456057.22919,
"sammon": -309198.554574,
"toraha": -1543564.08653,
"type": -905372.088819,
"zero": -943972.752648
}
if False:
#per-class weights used in TWCNB 6
print "computing classComplementWeightsForNorm"
classComplementWeightsForNorm = {}
for c in classes:
denominator = 0
for ec in classes:
if ec != c:
denominator += classWeights[ec]
classWeight = 0
cur = conn.execute("SELECT DISTINCT word FROM class_word_weight WHERE class = ?", [c])
for row in cur:
word = row[0]
numerator = 0
for ec in classes:
if ec != c:
numerator += getClassWordWeight(ec, word)
classWeight += math.log( (numerator + smoothing) / (denominator + smoothingAll) )
classComplementWeightsForNorm[c] = classWeight
print "\"%s\": %s," % (c, classWeight)
"""
"""
Data computed with an incorrect smoothing parameter
classWeightsForNorm = {
"akamatu": -2483221.10996,
"eva": -2404639.80365,
"ff": -2263931.79348,
"gs": -1915803.53723,
"HxH": -2042857.52367,
"muv": -2572298.69609,
"nade": -1894860.99644,
"naruto": -2336134.19345,
"original": -2728500.99277,
"sammon": -1858847.71614,
"toraha": -2702319.70272,
"type": -2572784.03741,
"zero": -2480791.83441
}
if False:
#per-class weights used for NB
print "computing classWeightsForNorm"
classWeightsForNorm = {}
for c in classes:
denominator = classWeights[c]
classWeight = 0
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
#cur = conn.execute("SELECT word FROM class_word_weight WHERE class = ?", [c])
for row in cur:
word = row[0]
numerator = getClassWordWeight(c, word)
classWeight += math.log( (numerator + smoothing) / (denominator + smoothingAll) )
classWeightsForNorm[c] = classWeight
print "%s: %s" % (c, classWeight)
"""
"""
Weights computed over the entire vocabulary here, but this has a negative effect
classWeightsForNorm = {
"akamatu": -3371995.21947,
"eva": -3371807.93663,
"ff": -3370865.0378,
"gs": -3370588.26828,
"HxH": -3370628.11587,
"muv": -3373244.3515,
"nade": -3370614.85549,
"naruto": -3371398.19853,
"original": -3376397.51673,
"sammon": -3370572.11175,
"toraha": -3375853.7856,
"type": -3373482.55089,
"zero": -3372079.24163
}
if False:
#per-class weights used for NB
print "computing classWeightsForNorm"
classWeightsForNorm = {}
for c in classes:
denominator = classWeights[c]
classWeight = 0
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
numerator = getClassWordWeight(c, word)
classWeight += math.log( (numerator + smoothing) / (denominator + smoothingAll) )
classWeightsForNorm[c] = classWeight
print "%s: %s" % (c, classWeight)
"""
"""
classWeightsForNorm = {
"akamatu": -1079456.95336,
"eva": -700571.429639,
"ff": -771821.524118,
"gs": -347791.872949,
"HxH": -468420.143683,
"muv": -1163708.88376,
"nade": -236711.390405,
"naruto": -646282.669202,
"original": -1454417.66756,
"sammon": -328997.667818,
"toraha": -1541410.46535,
"type": -916275.035706,
"zero": -955843.891426
}
if False:
#per-class weights used for NB
print "computing classWeightsForNorm"
classWeightsForNorm = {}
for c in classes:
denominator = classWeights[c]
classWeight = 0
cur = conn.execute("SELECT DISTINCT word FROM class_word_weight WHERE class = ?", [c])
for row in cur:
word = row[0]
numerator = getClassWordWeight(c, word)
classWeight += math.log( (numerator + smoothing) / (denominator + smoothingAll) )
classWeightsForNorm[c] = classWeight
print "\"%s\": %s," % (c, classWeight)
"""
classProbs = {}
for c in classes:
classProbs[c] = float(classWeights[c]) / classWeightsTotal
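# class_words appears to be the per-class distinct word counts produced by
# the "Number of words per class" section near the top of this file.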
class_words = {
"akamatu": 86732,
"eva": 56374,
"ff": 61945,
"gs": 27869,
"HxH": 37560,
"muv": 93559,
"nade": 18980,
"naruto": 51965,
"original": 116971,
"sammon": 26351,
"toraha": 123815,
"type": 73836,
"zero": 76864
}
class_all_sd = {
"akamatu": 0.13820801,
"eva": 0.16284586,
"ff": 0.05680551,
"gs": 0.02823316,
"HxH": 0.03144673,
"muv": 0.20784636,
"nade": 0.03702682,
"naruto": 0.12971901,
"original": 0.30875787,
"sammon": 0.02235867,
"toraha": 0.30065981,
"type": 0.22778956,
"zero": 0.15278344
}
class_all_avg = {
"akamatu": 0.03835642,
"eva": 0.02838985,
"ff": 0.01669273,
"gs": 0.00450519,
"HxH": 0.00726387,
"muv": 0.05394686,
"nade": 0.00416323,
"naruto": 0.02190264,
"original": 0.09931832,
"sammon": 0.00363563,
"toraha": 0.08949577,
"type": 0.05399907,
"zero": 0.03798647
}
if False:
class_all_sd = {}
class_all_avg = {}
for targetc in classes:
weights = []
cur = conn.execute("SELECT DISTINCT word FROM word_doc_count")
for row in cur:
word = row[0]
weights.append( getClassWordWeight(targetc, word) )
avg = 0
for weight in weights:
avg += weight
avg /= len(weights)
sd = 0
for weight in weights:
sd += (weight - avg) ** 2
sd /= len(weights)
sd = math.sqrt(sd)
class_all_sd[targetc] = sd
class_all_avg[targetc] = avg
total_info = {}
total_info["count"] = 0
total_info["avg"] = 0
total_info["sd"] = 0
total_info["t_sd"] = 0
total_info["t_order"] = 0
class_info = {}
for c in classes:
print "-" * 30
print "class: %s" % c
files = getFiles(corpus + "\\" + c)
print "total: %s" % len(files)
files = files[-use_for_test:]
print "use for test: %s" % len(files)
class_info[c] = {}
class_info[c]["count"] = 0
class_info[c]["avg"] = 0
class_info[c]["sd"] = 0
class_info[c]["t_sd"] = 0
class_info[c]["t_order"] = 0
fcount = 0
for f in files:
fcount += 1
print "-" * 30
print "fileNo: %s (%s / %s)" % (f, fcount, len(files))
doc = open(corpus + "\\" + c + "\\" + f).read()
doc = unicode(doc, "utf-8", errors="replace")
words = getWords(doc)
data = toDataArray(words)
print "%s unique words" % len(data)
#caching
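# (one SQL lookup per (class, word) pair of this document up front, so the
# scoring variants below can read weights from memory instead of re-querying)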
classWordWeightCache = {}
for targetc in classes:
classWordWeightCache[targetc] = {}
for d in data:
word, count = d
classWordWeightCache[targetc][word] = getClassWordWeight(targetc, word)
if False:
#WNB
results = []
for targetc in classes:
weights = []
denominator = classWeights[targetc] + smoothingAll
for d in data:
word, count = d
numerator = classWordWeightCache[targetc][word] + smoothing
weight = math.log( numerator / denominator )
weights.append((weight, word, count))
result = 0
for d in weights:
weight, word, count = d
weight /= classWeightsForNorm[targetc]
weight *= count
result += weight
results.append((-result, targetc))
results.sort(reverse=True)
"""
------------------------------
class: original
* avg: -0.137140975925
* sd: 0.00241394309644
* t_sd: 70.8995672172
* t_order: 1.0
------------------------------
class: type
* avg: -0.0855871992036
* sd: 0.00159530415131
* t_sd: 64.9241536277
* t_order: 2.0
------------------------------
class: toraha
* avg: -0.127250349819
* sd: 0.00239577621972
* t_sd: 63.2492613594
* t_order: 2.0
------------------------------
class: eva
* avg: -0.0458923484897
* sd: 0.00087739070701
* t_sd: 59.564346693
* t_order: 3.2
------------------------------
class: muv
* avg: -0.252436650238
* sd: 0.00474098712602
* t_sd: 59.4407303047
* t_order: 3.4
------------------------------
class: naruto
* avg: -0.467594472783
* sd: 0.00828687354933
* t_sd: 55.6357015593
* t_order: 4.2
------------------------------
class: zero
* avg: -0.120085050263
* sd: 0.00213030720249
* t_sd: 54.3892611741
* t_order: 4.4
------------------------------
class: akamatu
* avg: -0.284023821865
* sd: 0.00490075636513
* t_sd: 53.389997845
* t_order: 5.1
------------------------------
class: ff
* avg: -0.23744480302
* sd: 0.00410337005736
* t_sd: 44.1746394597
* t_order: 9.0
------------------------------
class: nade
* avg: -0.0966124102639
* sd: 0.00160440180478
* t_sd: 41.3514391445
* t_order: 9.9
------------------------------
class: HxH
* avg: -0.334581009127
* sd: 0.00586168727486
* t_sd: 40.0990159124
* t_order: 10.0
------------------------------
class: gs
* avg: -0.154576677205
* sd: 0.00276307731375
* t_sd: 39.306732706
* t_order: 10.6
------------------------------
class: sammon
* avg: -0.0872690688239
* sd: 0.00151748481904
* t_sd: 38.7632930885
* t_order: 10.9
------------------------------
"""
"""
------------------------------
total:
* avg: -0.18696114131
* sd: 0.00332241228363
* t_sd: 52.706780007
* t_order: 5.82307692308
------------------------------
"""
if False:
#NB
#correction for word hit rate and class size
#class weight normalization
results = []
for targetc in classes:
is_zero = 0
weight = 0
for d in data:
word, count = d
w = classWordWeightCache[targetc][word]
if w == 0:
is_zero += 1
weight += count * ( ( w - class_all_avg[targetc] ) / class_all_sd[targetc] )
weight *= float(len(data)) / is_zero
weight /= math.log( class_words[targetc] )
results.append((weight, targetc))
results.sort(reverse=True)
if False:
#NB
#correction for word hit rate and class size
#class weight normalization
results = []
for targetc in classes:
is_zero = 0
weight = 0
for d in data:
word, count = d
w = classWordWeightCache[targetc][word]
if w == 0:
is_zero += 1
weight += count * ( ( w - class_all_avg[targetc] ) / class_all_sd[targetc] )
weight *= (float(len(data)) / is_zero) / class_words[targetc]
weight /= math.log( class_words[targetc] )
results.append((weight, targetc))
results.sort(reverse=True)
"""
------------------------------
class: naruto
* avg: 7.85771505554
* sd: 3.56756696384
* t_sd: 79.4111516922
* t_order: 1.1
------------------------------
class: eva
* avg: 1.00478121043
* sd: 0.375988195826
* t_sd: 76.2106404704
* t_order: 1.3
------------------------------
class: muv
* avg: 3.98036105321
* sd: 1.49228261299
* t_sd: 76.0927934718
* t_order: 1.0
------------------------------
class: type
* avg: 2.34400303075
* sd: 1.0359124207
* t_sd: 75.5155838548
* t_order: 1.0
------------------------------
class: nade
* avg: 1.82518128031
* sd: 0.532632300371
* t_sd: 71.3938820751
* t_order: 2.6
------------------------------
class: zero
* avg: 2.65612617297
* sd: 1.36310860824
* t_sd: 70.5030024115
* t_order: 2.0
------------------------------
class: akamatu
* avg: 4.54102133063
* sd: 1.58929579748
* t_sd: 69.0907162482
* t_order: 2.2
------------------------------
class: original
* avg: 2.44676494541
* sd: 0.7840892653
* t_sd: 68.2339784306
* t_order: 2.4
------------------------------
class: ff
* avg: 4.28676540777
* sd: 1.55274003989
* t_sd: 67.8119782081
* t_order: 1.8
------------------------------
class: gs
* avg: 3.01878942588
* sd: 0.928394613408
* t_sd: 66.9629408737
* t_order: 2.4
------------------------------
class: sammon
* avg: 1.9845502653
* sd: 0.550444634306
* t_sd: 66.5805022288
* t_order: 2.6
------------------------------
class: HxH
* avg: 5.92661887139
* sd: 1.85299315149
* t_sd: 63.616400556
* t_order: 2.2
------------------------------
class: toraha
* avg: 2.39737000475
* sd: 0.532713263543
* t_sd: 61.0279889062
* t_order: 4.0
------------------------------
"""
"""
------------------------------
total:
* avg: 3.40538831187
* sd: 1.24293552826
* t_sd: 70.1885814944
* t_order: 2.04615384615
------------------------------
"""
if False:
#NB
#correction for word hit rate and class size
results = []
for targetc in classes:
is_zero = 0
weight = 0
for d in data:
word, count = d
w = classWordWeightCache[targetc][word]
if w == 0:
is_zero += 1
weight += count * ( ( w - class_all_avg[targetc] ) / class_all_sd[targetc] )
weight *= (float(len(data)) / is_zero) / class_words[targetc]
results.append((weight, targetc))
results.sort(reverse=True)
"""
------------------------------
class: naruto
* avg: 83.665214889
* sd: 41.0130984792
* t_sd: 78.0367048027
* t_order: 1.1
------------------------------
class: eva
* avg: 10.6662471736
* sd: 4.31819472949
* t_sd: 76.7328628376
* t_order: 1.0
------------------------------
class: type
* avg: 25.2991514693
* sd: 12.1691816085
* t_sd: 76.3066622729
* t_order: 1.0
------------------------------
class: muv
* avg: 41.9424871572
* sd: 18.9360139496
* t_sd: 75.5579215868
* t_order: 1.0
------------------------------
class: zero
* avg: 28.7064297744
* sd: 15.9524786569
* t_sd: 71.8242593558
* t_order: 1.2
------------------------------
class: original
* avg: 26.1331836316
* sd: 9.83567733139
* t_sd: 69.62326732
* t_order: 1.9
------------------------------
class: akamatu
* avg: 48.4283907255
* sd: 19.5069727213
* t_sd: 68.397178624
* t_order: 2.3
------------------------------
class: ff
* avg: 45.4521759868
* sd: 18.3730915449
* t_sd: 67.3787863052
* t_order: 1.8
------------------------------
class: nade
* avg: 19.1858614064
* sd: 5.78670013123
* t_sd: 66.424895457
* t_order: 3.5
------------------------------
class: toraha
* avg: 25.4939271441
* sd: 7.04939001171
* t_sd: 63.6507343774
* t_order: 3.3
------------------------------
class: gs
* avg: 32.0010532846
* sd: 11.1705940649
* t_sd: 63.6352680535
* t_order: 3.2
------------------------------
class: sammon
* avg: 21.2129327319
* sd: 6.58126896742
* t_sd: 62.9718761652
* t_order: 4.1
------------------------------
class: HxH
* avg: 62.7669063042
* sd: 22.5089669382
* t_sd: 59.5150998471
* t_order: 3.3
------------------------------
total:
* avg: 36.2272278214
* sd: 14.8616637796
* t_sd: 69.2350397696
* t_order: 2.20769230769
------------------------------
"""
"""
------------------------------
total:
* avg: 36.2272278214
* sd: 14.8616637796
* t_sd: 69.2350397696
* t_order: 2.20769230769
------------------------------
"""
if False:
#NB
#word weight normalization, smoothing
#class weight normalization
results = []
for targetc in classes:
weight = 0
for d in data:
word, count = d
weight += count * ( classWordWeightCache[targetc][word] / class_all_sd[targetc] )
weight /= math.log( class_words[targetc] )
results.append((weight, targetc))
results.sort(reverse=True)
"""
------------------------------
class: naruto
* avg: 36736.8493577
* sd: 8471.6271288
* t_sd: 77.0856678675
* t_order: 1.2
------------------------------
class: eva
* avg: 3883.73147289
* sd: 870.795921128
* t_sd: 75.9676328865
* t_order: 1.1
------------------------------
class: original
* avg: 9997.3516104
* sd: 1675.12661203
* t_sd: 75.7489719824
* t_order: 1.0
------------------------------
class: type
* avg: 6545.00720195
* sd: 1482.05210241
* t_sd: 74.5002698981
* t_order: 1.1
------------------------------
class: nade
* avg: 7139.82405704
* sd: 1605.74081556
* t_sd: 68.2814762163
* t_order: 2.6
------------------------------
class: muv
* avg: 20449.2642375
* sd: 3557.76034642
* t_sd: 67.1759970731
* t_order: 1.7
------------------------------
class: gs
* avg: 12510.3978823
* sd: 2500.36131198
* t_sd: 66.336197872
* t_order: 1.9
------------------------------
class: zero
* avg: 9095.0785684
* sd: 1942.55118092
* t_sd: 66.1001569155
* t_order: 1.9
------------------------------
class: akamatu
* avg: 20046.9601612
* sd: 3935.47158031
* t_sd: 64.7523588523
* t_order: 2.6
------------------------------
class: sammon
* avg: 6536.01392528
* sd: 1239.54027135
* t_sd: 63.4389833057
* t_order: 2.5
------------------------------
class: HxH
* avg: 25274.2155252
* sd: 4607.40671408
* t_sd: 62.9553473146
* t_order: 2.1
------------------------------
class: ff
* avg: 17568.4392347
* sd: 3401.16867394
* t_sd: 61.6403605965
* t_order: 2.6
------------------------------
class: toraha
* avg: 10239.2715163
* sd: 1743.16152109
* t_sd: 58.419977765
* t_order: 3.0
------------------------------
"""
"""
------------------------------
total:
* avg: 14309.4157501
* sd: 2848.6741677
* t_sd: 67.8771845035
* t_order: 1.94615384615
------------------------------
"""
if False:
#NB
# Word weight normalization 2
# Class weight normalization factor = log( log(words) / log(SD / AVG) )
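# As above, but the word weight is centered first: (weight - class_all_avg[c]) / class_all_sd[c], i.e. a z-score.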
results = []
for targetc in classes:
weight = 0
for d in data:
word, count = d
weight += count * ( (classWordWeightCache[targetc][word] - class_all_avg[targetc]) / class_all_sd[targetc] )
weight /= math.log( math.log(class_words[targetc]) / math.log(class_all_sd[targetc] / class_all_avg[targetc]) )
results.append((weight, targetc))
results.sort(reverse=True)
"""
------------------------------
class: naruto
* avg: 189265.862807
* sd: 49526.5670983
* t_sd: 80.6490641381
* t_order: 1.0
------------------------------
class: eva
* avg: 20099.6794491
* sd: 4962.69373565
* t_sd: 79.391573231
* t_order: 1.1
------------------------------
class: type
* avg: 33426.4734412
* sd: 7068.30109253
* t_sd: 77.034314645
* t_order: 1.0
------------------------------
class: nade
* avg: 36841.4148841
* sd: 10889.4924887
* t_sd: 75.6910995884
* t_order: 1.7
------------------------------
class: gs
* avg: 64493.514528
* sd: 13764.6643212
* t_sd: 72.5862967107
* t_order: 1.2
------------------------------
class: original
* avg: 50930.2441433
* sd: 7330.19966959
* t_sd: 72.0680309075
* t_order: 1.1
------------------------------
class: sammon
* avg: 33384.9985805
* sd: 6027.70954624
* t_sd: 70.896436528
* t_order: 1.5
------------------------------
class: muv
* avg: 104887.404081
* sd: 14881.1424357
* t_sd: 68.3065954679
* t_order: 1.9
------------------------------
class: zero
* avg: 46335.5360043
* sd: 8496.93168815
* t_sd: 68.3043528458
* t_order: 1.7
------------------------------
class: akamatu
* avg: 101531.931976
* sd: 16207.504749
* t_sd: 63.7851562894
* t_order: 2.7
------------------------------
class: HxH
* avg: 128988.172982
* sd: 19942.0923829
* t_sd: 63.6255964155
* t_order: 2.2
------------------------------
class: ff
* avg: 89365.4850534
* sd: 13877.0466773
* t_sd: 57.0439242543
* t_order: 3.7
------------------------------
class: toraha
* avg: 52458.0530545
* sd: 6785.60393744
* t_sd: 54.9613531469
* t_order: 4.7
------------------------------
"""
"""
------------------------------
total:
* avg: 73231.4439218
* sd: 13827.6884479
* t_sd: 69.5649072437
* t_order: 1.96153846154
------------------------------
"""
if False:
#NB
# Word weight normalization 2
# Class weight normalization by word count
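# Z-scored word weights, with the class score divided by log(class_words[c]) only.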
results = []
for targetc in classes:
weight = 0
for d in data:
word, count = d
weight += count * ( (classWordWeightCache[targetc][word] - class_all_avg[targetc]) / class_all_sd[targetc] )
weight /= math.log( class_words[targetc] )
results.append((weight, targetc))
results.sort(reverse=True)
"""
------------------------------
class: naruto
* avg: 34077.3653344
* sd: 8382.7525182
* t_sd: 77.8673962735
* t_order: 1.1
------------------------------
class: eva
* avg: 3622.16296242
* sd: 858.64236675
* t_sd: 76.7281191421
* t_order: 1.1
------------------------------
class: original
* avg: 9218.06230557
* sd: 1573.40403082
* t_sd: 75.801984534
* t_order: 1.0
------------------------------
class: type
* avg: 6058.1654544
* sd: 1429.37855987
* t_sd: 75.2048839853
* t_order: 1.1
------------------------------
class: nade
* avg: 6591.2918163
* sd: 1613.84016076
* t_sd: 69.6614553039
* t_order: 2.5
------------------------------
class: muv
* avg: 19011.684212
* sd: 3340.3349179
* t_sd: 67.6568118079
* t_order: 1.7
------------------------------
class: gs
* avg: 11630.6584688
* sd: 2451.88951519
* t_sd: 67.5696050299
* t_order: 1.8
------------------------------
class: zero
* avg: 8412.49062239
* sd: 1846.9647025
* t_sd: 66.66432557
* t_order: 1.9
------------------------------
class: sammon
* avg: 6040.06865616
* sd: 1187.23994403
* t_sd: 64.8867094221
* t_order: 2.0
------------------------------
class: akamatu
* avg: 18434.3729229
* sd: 3683.29775318
* t_sd: 64.830745411
* t_order: 2.5
------------------------------
class: HxH
* avg: 23372.185875
* sd: 4350.48754053
* t_sd: 63.3883860154
* t_order: 2.1
------------------------------
class: ff
* avg: 16219.3481427
* sd: 3194.91399609
* t_sd: 61.1189712744
* t_order: 2.6
------------------------------
class: toraha
* avg: 9514.65660175
* sd: 1622.71629987
* t_sd: 57.9614916424
* t_order: 3.2
------------------------------
"""
"""
------------------------------
total:
* avg: 13246.3471827
* sd: 2733.52786967
* t_sd: 68.4108373394
* t_order: 1.89230769231
------------------------------
"""
if False:
#NB
# Word weight normalization 2
# Class weight normalization
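# Z-scored word weights, with the class score divided by the negated classWeightsForNorm[c].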
results = []
for targetc in classes:
weight = 0
for d in data:
word, count = d
weight += count * ( (classWordWeightCache[targetc][word] - class_all_avg[targetc]) / class_all_sd[targetc] )
weight /= -classWeightsForNorm[targetc]
results.append((weight, targetc))
results.sort(reverse=True)
"""
------------------------------
class: naruto
* avg: 0.160942025021
* sd: 0.0370557594227
* t_sd: 78.4960158972
* t_order: 1.0
------------------------------
class: eva
* avg: 0.0170871531807
* sd: 0.00362073220782
* t_sd: 76.4896782051
* t_order: 1.1
------------------------------
class: sammon
* avg: 0.0286161577636
* sd: 0.00578067964044
* t_sd: 73.9101605979
* t_order: 1.3
------------------------------
class: nade
* avg: 0.0313155653088
* sd: 0.00835129249704
* t_sd: 73.7113111562
* t_order: 2.1
------------------------------
class: gs
* avg: 0.0553352603794
* sd: 0.0131028595326
* t_sd: 73.6267484021
* t_order: 1.2
------------------------------
class: type
* avg: 0.0285401959268
* sd: 0.0057798041796
* t_sd: 73.4601258049
* t_order: 1.2
------------------------------
class: original
* avg: 0.0435356146707
* sd: 0.00642413168162
* t_sd: 71.3864022587
* t_order: 1.0
------------------------------
class: HxH
* avg: 0.110601527111
* sd: 0.0199869706161
* t_sd: 69.9501118356
* t_order: 1.3
------------------------------
class: muv
* avg: 0.0895934667559
* sd: 0.0126265828289
* t_sd: 66.7424818325
* t_order: 2.2
------------------------------
class: zero
* avg: 0.0396481296124
* sd: 0.00754967201756
* t_sd: 66.3253622488
* t_order: 2.1
------------------------------
class: akamatu
* avg: 0.0868682239756
* sd: 0.0145991212338
* t_sd: 65.2543720628
* t_order: 2.3
------------------------------
class: ff
* avg: 0.076645317374
* sd: 0.0139070521464
* t_sd: 64.3128200964
* t_order: 2.2
------------------------------
class: toraha
* avg: 0.0448700190444
* sd: 0.00622059147879
* t_sd: 53.4540809836
* t_order: 5.7
------------------------------
"""
"""
------------------------------
total:
* avg: 0.0625845120095
* sd: 0.0119234807295
* t_sd: 69.7784362601
* t_order: 1.9
------------------------------
"""
if False:
#NB
# Word weight normalization 2
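# Plain sum of count * z-scored word weight, with no class-level normalization.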
results = []
for targetc in classes:
weight = 0
for d in data:
word, count = d
weight += count * ( (classWordWeightCache[targetc][word] - class_all_avg[targetc]) / class_all_sd[targetc] )
results.append((weight, targetc))
results.sort(reverse=True)
"""
------------------------------
class: naruto
* avg: 374070.266236
* sd: 97855.7054523
* t_sd: 76.1569057905
* t_order: 1.2
------------------------------
class: original
* avg: 101304.03087
* sd: 20402.8737226
* t_sd: 76.1314332706
* t_order: 1.0
------------------------------
class: eva
* avg: 39783.8487203
* sd: 10212.2924983
* t_sd: 75.3033192364
* t_order: 1.2
------------------------------
class: type
* avg: 66640.1424431
* sd: 17514.9732109
* t_sd: 73.9593859944
* t_order: 1.2
------------------------------
class: muv
* avg: 209285.359505
* sd: 44498.6504037
* t_sd: 67.3233009506
* t_order: 1.7
------------------------------
class: zero
* avg: 92604.0942076
* sd: 23180.7915126
* t_sd: 65.8545925468
* t_order: 1.8
------------------------------
class: nade
* avg: 71977.5259164
* sd: 16783.2715684
* t_sd: 64.7475623292
* t_order: 3.2
------------------------------
class: akamatu
* avg: 202987.189671
* sd: 47461.5896899
* t_sd: 64.6357850464
* t_order: 2.4
------------------------------
class: gs
* avg: 127346.87151
* sd: 27654.9541361
* t_sd: 62.8827447079
* t_order: 2.6
------------------------------
class: toraha
* avg: 104716.473673
* sd: 21762.1283769
* t_sd: 60.0863294481
* t_order: 2.5
------------------------------
class: ff
* avg: 178312.030172
* sd: 40201.771219
* t_sd: 59.8977989823
* t_order: 2.7
------------------------------
class: HxH
* avg: 256655.957158
* sd: 53653.9692937
* t_sd: 59.5694826358
* t_order: 2.4
------------------------------
class: sammon
* avg: 66297.0414346
* sd: 14282.1635122
* t_sd: 59.4733775141
* t_order: 3.0
------------------------------
"""
"""
------------------------------
total:
* avg: 145536.98704
* sd: 33497.3180459
* t_sd: 66.6170783425
* t_order: 2.06923076923
------------------------------
"""
if False:
#NB
# Word weight normalization 1
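# Recomputes each class's mean and population SD over all stored word weights,
# then scores with z-scored weights and divides the class score by class_avg[c].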
class_sd = {}
class_avg = {}
for targetc in classes:
weights = []
cur = conn.execute("SELECT weight FROM class_word_weight WHERE class= ?", [targetc])
for row in cur:
weight = row[0]
weights.append(weight)
avg = 0
for weight in weights:
avg += weight
avg /= len(weights)
sd = 0
for weight in weights:
sd += (weight - avg) ** 2
sd /= len(weights)
sd = math.sqrt(sd)
class_sd[targetc] = sd
class_avg[targetc] = avg
results = []
for targetc in classes:
weight = 0
for d in data:
word, count = d
weight += count * ( (classWordWeightCache[targetc][word] - class_avg[targetc]) / class_sd[targetc] )
weight /= class_avg[targetc]
results.append((weight, targetc))
results.sort(reverse=True)
"""
------------------------------
class: ff
* avg: 585137.901143
* sd: 269168.974919
* t_sd: 74.3095734354
* t_order: 1.1
------------------------------
class: naruto
* avg: 1236820.63119
* sd: 494423.709634
* t_sd: 71.5399838653
* t_order: 1.3
------------------------------
class: gs
* avg: 455179.330769
* sd: 206311.418285
* t_sd: 70.8176027906
* t_order: 1.5
------------------------------
class: akamatu
* avg: 629731.60129
* sd: 284433.382604
* t_sd: 69.4210865101
* t_order: 1.3
------------------------------
class: sammon
* avg: 221456.081932
* sd: 98340.4590167
* t_sd: 69.3733641139
* t_order: 1.5
------------------------------
class: HxH
* avg: 857955.912583
* sd: 366781.261103
* t_sd: 68.014557396
* t_order: 1.6
------------------------------
class: eva
* avg: 135886.709242
* sd: 46455.1012248
* t_sd: 67.3218388071
* t_order: 1.8
------------------------------
class: muv
* avg: 703424.857361
* sd: 250662.714997
* t_sd: 64.2382874655
* t_order: 2.0
------------------------------
class: zero
* avg: 301009.668601
* sd: 127967.687533
* t_sd: 63.5756741035
* t_order: 2.2
------------------------------
class: original
* avg: 323296.062981
* sd: 118501.205049
* t_sd: 59.3690605231
* t_order: 2.6
------------------------------
class: nade
* avg: 228528.555375
* sd: 76917.900427
* t_sd: 57.842596454
* t_order: 5.1
------------------------------
class: type
* avg: 215106.633613
* sd: 81639.2951301
* t_sd: 57.3273133441
* t_order: 3.8
------------------------------
class: toraha
* avg: 353454.258832
* sd: 125493.347475
* t_sd: 55.9504832529
* t_order: 4.4
------------------------------
"""
"""
------------------------------
total:
* avg: 480537.554224
* sd: 195930.496723
* t_sd: 65.3154940048
* t_order: 2.32307692308
------------------------------
"""
if False:
#TWMNB (this is the final version)
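# Score(c) = sum_w count(w) * log((weight_c(w) + s) / (classWeights[c] + s_all)) / class_weights[s]["count_log"][c].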
results = []
for targetc in classes:
weight = 0
for d in data:
word, count = d
#w = (classWordWeightCache[targetc][word] + s ) / (classWeights[targetc] + s_all )
w = math.log( (classWordWeightCache[targetc][word] + s ) / (classWeights[targetc] + s_all ) )
w /= class_weights[s]["count_log"][targetc]
w *= count
weight += w
results.append((weight, targetc))
results.sort(reverse=True)
if False:
#TMNB?
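# Plain multinomial NB: sum_w count(w) * log((weight_c(w) + s) / (classWeights[c] + s_all)), plus log(classProbs[c]).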
results = []
for targetc in classes:
weight = 0
for d in data:
word, count = d
#w = (classWordWeightCache[targetc][word] + s ) / (classWeights[targetc] + s_all )
w = math.log( (classWordWeightCache[targetc][word] + s ) / (classWeights[targetc] + s_all ) )
w *= count
weight += w
weight += math.log( classProbs[targetc] )
results.append((weight, targetc))
results.sort(reverse=True)
if False:
#NB
# Uses count
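# Takes a single log over the count-weighted sum, (weight + smoothing) / (classWeights[c] + smoothingAll),
# instead of summing per-word logs.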
results = []
for targetc in classes:
weight = 0
for d in data:
word, count = d
weight += count * classWordWeightCache[targetc][word]
weight = math.log( (weight + smoothing ) / (classWeights[targetc] + smoothingAll ) )
results.append((weight, targetc))
results.sort(reverse=True)
"""
------------------------------
class: nade
* avg: -0.35321569126
* sd: 0.317693824586
* t_sd: 77.9451057489
* t_order: 1.0
------------------------------
class: naruto
* avg: 1.13571734522
* sd: 0.321846774216
* t_sd: 76.7327089693
* t_order: 1.0
------------------------------
class: eva
* avg: -1.45683742732
* sd: 0.317861947967
* t_sd: 74.0866510457
* t_order: 1.1
------------------------------
class: gs
* avg: -0.020378879127
* sd: 0.261082028984
* t_sd: 70.1074012718
* t_order: 1.1
------------------------------
class: sammon
* avg: -0.276199347805
* sd: 0.227855771152
* t_sd: 69.8730540628
* t_order: 1.2
------------------------------
class: type
* avg: -0.809200536449
* sd: 0.232368260019
* t_sd: 68.2639881204
* t_order: 1.5
------------------------------
class: zero
* avg: -0.715821091564
* sd: 0.190345773408
* t_sd: 60.4115325197
* t_order: 3.3
------------------------------
class: muv
* avg: 0.239033979658
* sd: 0.199338212326
* t_sd: 57.9367844563
* t_order: 3.8
------------------------------
class: HxH
* avg: 1.26110426092
* sd: 0.20027967335
* t_sd: 56.2902263005
* t_order: 4.9
------------------------------
class: akamatu
* avg: 0.288005638889
* sd: 0.199621080142
* t_sd: 53.6432333844
* t_order: 5.4
------------------------------
class: original
* avg: -0.30187523483
* sd: 0.201562382152
* t_sd: 53.1190422145
* t_order: 5.8
------------------------------
class: ff
* avg: 0.779932499854
* sd: 0.196900144832
* t_sd: 46.1756097496
* t_order: 8.6
------------------------------
class: toraha
* avg: 0.20802236361
* sd: 0.180850722226
* t_sd: 44.8129294908
* t_order: 8.7
------------------------------
"""
"""
------------------------------
total:
* avg: -0.00167016309207
* sd: 0.234431276566
* t_sd: 62.2614051796
* t_order: 3.64615384615
------------------------------
"""
if False:
#NB
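# Same as the previous NB variant, but without multiplying by count.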
results = []
for targetc in classes:
weight = 0
for d in data:
word, count = d
weight += classWordWeightCache[targetc][word]
weight = math.log( (weight + smoothing ) / (classWeights[targetc] + smoothingAll ) )
results.append((weight, targetc))
results.sort(reverse=True)
"""
------------------------------
class: naruto
* avg: -1.36474854074
* sd: 0.0990535691217
* t_sd: 79.2813468834
* t_order: 1.0
------------------------------
class: nade
* avg: -2.01728841807
* sd: 0.143152413854
* t_sd: 79.1537377799
* t_order: 1.0
------------------------------
class: eva
* avg: -2.74128547465
* sd: 0.178064823951
* t_sd: 76.2231618872
* t_order: 1.0
------------------------------
class: sammon
* avg: -1.95365327094
* sd: 0.0983952958705
* t_sd: 72.2969196803
* t_order: 1.1
------------------------------
class: gs
* avg: -1.88885688744
* sd: 0.0969894733765
* t_sd: 71.8209223179
* t_order: 1.0
------------------------------
class: type
* avg: -2.33734414044
* sd: 0.119035821633
* t_sd: 71.264145829
* t_order: 1.0
------------------------------
class: HxH
* avg: -1.21203437949
* sd: 0.0607120323132
* t_sd: 70.9494113472
* t_order: 1.0
------------------------------
class: muv
* avg: -1.80312983296
* sd: 0.0861196852463
* t_sd: 68.3250090695
* t_order: 1.7
------------------------------
class: zero
* avg: -2.24971291528
* sd: 0.0929647454724
* t_sd: 63.4397230574
* t_order: 2.7
------------------------------
class: akamatu
* avg: -1.73289032254
* sd: 0.0731294988792
* t_sd: 58.1629916693
* t_order: 4.1
------------------------------
class: toraha
* avg: -1.77935713556
* sd: 0.0718235961978
* t_sd: 56.2762876058
* t_order: 4.8
------------------------------
class: original
* avg: -2.03148244387
* sd: 0.0835857883468
* t_sd: 55.8556104571
* t_order: 4.6
------------------------------
class: ff
* avg: -1.5566886795
* sd: 0.0587450262853
* t_sd: 48.5787467624
* t_order: 7.7
------------------------------
"""
"""
------------------------------
total:
* avg: -1.89757480319
* sd: 0.0970593669652
* t_sd: 67.0483087959
* t_order: 2.51538461538
------------------------------
"""
if False:
#TCNB
# Weight normalization
# Class weight normalization
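# Complement version: each word's contribution sums the z-scored weights of every
# class except the target, each divided by log(class_words[ec]).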
results = []
for targetc in classes:
totalWeight = 0
for d in data:
word, count = d
weight = 0
for ec in classes:
if targetc != ec:
weight += ((classWordWeightCache[ec][word] - class_all_avg[ec]) / class_all_sd[ec]) / math.log(class_words[ec])
weight *= count
totalWeight += weight
results.append((-totalWeight, targetc)) # negated so the downstream processing can be shared
results.sort(reverse=True)
"""
------------------------------
class: naruto
* avg: -408928.384013
* sd: 8382.7525182
* t_sd: 77.8673962735
* t_order: 1.1
------------------------------
class: eva
* avg: -43465.955549
* sd: 858.64236675
* t_sd: 76.7281191421
* t_order: 1.1
------------------------------
class: original
* avg: -110616.747667
* sd: 1573.40403082
* t_sd: 75.801984534
* t_order: 1.0
------------------------------
class: type
* avg: -72697.9854528
* sd: 1429.37855987
* t_sd: 75.2048839853
* t_order: 1.1
------------------------------
class: nade
* avg: -79095.5017956
* sd: 1613.84016076
* t_sd: 69.6614553039
* t_order: 2.5
------------------------------
class: muv
* avg: -228140.210543
* sd: 3340.3349179
* t_sd: 67.6568118079
* t_order: 1.7
------------------------------
class: gs
* avg: -139567.901626
* sd: 2451.88951519
* t_sd: 67.5696050299
* t_order: 1.8
------------------------------
class: zero
* avg: -100949.887469
* sd: 1846.9647025
* t_sd: 66.66432557
* t_order: 1.9
------------------------------
class: sammon
* avg: -72480.823874
* sd: 1187.23994403
* t_sd: 64.8867094221
* t_order: 2.0
------------------------------
class: akamatu
* avg: -221212.475075
* sd: 3683.29775318
* t_sd: 64.830745411
* t_order: 2.5
------------------------------
class: HxH
* avg: -280466.2305
* sd: 4350.48754053
* t_sd: 63.3883860154
* t_order: 2.1
------------------------------
class: ff
* avg: -194632.177712
* sd: 3194.91399609
* t_sd: 61.1189712744
* t_order: 2.6
------------------------------
class: toraha
* avg: -114175.879221
* sd: 1622.71629987
* t_sd: 57.9614916424
* t_order: 3.2
------------------------------
"""
"""
------------------------------
total:
* avg: -158956.166192
* sd: 2733.52786967
* t_sd: 68.4108373394
* t_order: 1.89230769231
------------------------------
"""
if False:
#TWCNB final
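# Per word: log((sum of the other classes' weights + s) / (sum of the other classes'
# classWeights + s_all)), times count, divided by class_weights[s]["log_complement"][c];
# the total is negated so that the class whose complement fits the document worst ranks
# first. Presumably after Rennie et al. (2003), "Tackling the Poor Assumptions of Naive
# Bayes Text Classifiers".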
results = []
for targetc in classes:
denominator = 0
for ec in classes:
if targetc != ec:
denominator += classWeights[ec]
weights = []
for d in data:
word, count = d
numerator = 0
for ec in classes:
if targetc != ec:
numerator += classWordWeightCache[ec][word]
weight = math.log( (numerator + s) / (denominator + s_all) )
weights.append((weight, word, count))
result = 0
for d in weights:
weight, word, count = d
weight *= count
weight /= class_weights[s]["log_complement"][targetc]
result += weight
#result -= math.log(classProbs[targetc])
results.append((-result, targetc)) # negated so the downstream processing can be shared
results.sort(reverse=True)
"""
------------------------------
class: naruto
* avg: 1311695.63587
* sd: 4568.74229051
* t_sd: 70.0650279355
* t_order: 1.0
------------------------------
class: eva
* avg: 128274.546375
* sd: 438.090584609
* t_sd: 69.1028324724
* t_order: 1.0
------------------------------
class: nade
* avg: 272165.286103
* sd: 851.564247431
* t_sd: 64.260053795
* t_order: 1.0
------------------------------
class: type
* avg: 239876.368143
* sd: 783.932784687
* t_sd: 63.0837281902
* t_order: 1.1
------------------------------
class: gs
* avg: 433578.305835
* sd: 1332.75961534
* t_sd: 61.2718654195
* t_order: 1.0
------------------------------
class: sammon
* avg: 245704.418527
* sd: 751.569559483
* t_sd: 60.2438011563
* t_order: 1.2
------------------------------
class: HxH
* avg: 941604.543454
* sd: 2817.82654017
* t_sd: 58.8588912753
* t_order: 2.1
------------------------------
class: zero
* avg: 337397.073691
* sd: 1066.67336837
* t_sd: 57.1218074469
* t_order: 4.2
------------------------------
class: muv
* avg: 706452.265907
* sd: 1932.89368071
* t_sd: 56.0904083166
* t_order: 5.2
------------------------------
class: ff
* avg: 668932.715533
* sd: 2043.17281962
* t_sd: 55.5907307836
* t_order: 6.3
------------------------------
class: akamatu
* avg: 799218.461441
* sd: 2374.55913981
* t_sd: 52.866347537
* t_order: 7.3
------------------------------
class: original
* avg: 385981.180659
* sd: 1136.53176629
* t_sd: 41.6690512266
* t_order: 11.1
------------------------------
class: toraha
* avg: 356330.850268
* sd: 842.690874277
* t_sd: 30.9558353575
* t_order: 12.6
------------------------------
"""
"""
------------------------------
total:
* avg: 525170.127062
* sd: 1610.84671318
* t_sd: 57.0138754548
* t_order: 4.23846153846
------------------------------
"""
if False:
#TCNB
# No weight normalization
# Is the sign of p(class) wrong?
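# Note that negating the total also flips the prior term, so log(classProbs[c]) does
# enter with the opposite sign to the usual CNB formulation.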
results = []
for targetc in classes:
denominator = 0
for ec in classes:
if targetc != ec:
denominator += classWeights[ec]
weights = []
for d in data:
word, count = d
numerator = 0
for ec in classes:
if targetc != ec:
numerator += classWordWeightCache[ec][word]
weight = math.log( (numerator + smoothing) / (denominator + smoothingAll) )
weights.append((weight, word, count))
result = 0
for d in weights:
weight, word, count = d
weight *= count
result += weight
result += math.log(classProbs[targetc])
results.append((-result, targetc)) # negated so the downstream processing can be shared
results.sort(reverse=True)
"""
------------------------------
class: naruto
* avg: 1311701.77275
* sd: 4570.14264271
* t_sd: 70.039306356
* t_order: 1.0
------------------------------
class: eva
* avg: 128280.683259
* sd: 439.51732589
* t_sd: 68.6234471934
* t_order: 1.0
------------------------------
class: nade
* avg: 272171.422988
* sd: 853.347967684
* t_sd: 64.307921588
* t_order: 1.0
------------------------------
class: type
* avg: 239882.505028
* sd: 785.329783374
* t_sd: 62.8375718977
* t_order: 1.3
------------------------------
class: gs
* avg: 433584.44272
* sd: 1334.47600307
* t_sd: 61.3190306866
* t_order: 1.0
------------------------------
class: sammon
* avg: 245710.555412
* sd: 753.288574189
* t_sd: 60.3069055735
* t_order: 1.2
------------------------------
class: HxH
* avg: 941610.680338
* sd: 2819.53528976
* t_sd: 58.8642982022
* t_order: 2.1
------------------------------
class: zero
* avg: 337403.210576
* sd: 1068.26404341
* t_sd: 56.9676393264
* t_order: 4.1
------------------------------
class: muv
* avg: 706458.402791
* sd: 1934.43150062
* t_sd: 55.9748821475
* t_order: 5.2
------------------------------
class: ff
* avg: 668938.852418
* sd: 2044.84485651
* t_sd: 55.5855891945
* t_order: 6.4
------------------------------
class: akamatu
* avg: 799224.598326
* sd: 2376.16498253
* t_sd: 52.8101602153
* t_order: 7.3
------------------------------
class: original
* avg: 385987.317543
* sd: 1138.20185836
* t_sd: 41.6069569223
* t_order: 11.1
------------------------------
class: toraha
* avg: 356336.987152
* sd: 844.509998667
* t_sd: 30.9725229619
* t_order: 12.6
------------------------------
"""
"""
------------------------------
total:
* avg: 525176.263947
* sd: 1612.46575591
* t_sd: 56.9397101743
* t_order: 4.25384615385
------------------------------
"""
if False:
#TWCNB
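# As in the final TWCNB block, but each word weight is divided by
# classComplementWeightsForNorm[c] and the total is not negated before sorting.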
results = []
for targetc in classes:
denominator = 0
for ec in classes:
if targetc != ec:
denominator += classWeights[ec]
weights = []
for d in data:
word, count = d
numerator = 0
for ec in classes:
if targetc != ec:
numerator += classWordWeightCache[ec][word]
weight = math.log( (numerator + s) / (denominator + s_all) )
weights.append((weight, word, count))
result = 0
for d in weights:
weight, word, count = d
weight /= classComplementWeightsForNorm[targetc]
weight *= count
result += weight
results.append((result, targetc))
results.sort(reverse=True)
"""
------------------------------
class: original
* avg: 0.12525061358
* sd: 0.000438189289814
* t_sd: 77.3870036459
* t_order: 1.0
------------------------------
class: type
* avg: 0.0778401173711
* sd: 0.000302713438858
* t_sd: 69.6060377116
* t_order: 1.8
------------------------------
class: eva
* avg: 0.0416256310952
* sd: 0.000154656723194
* t_sd: 65.1709758438
* t_order: 2.2
------------------------------
class: toraha
* avg: 0.115629437295
* sd: 0.000426885832668
* t_sd: 64.7519737869
* t_order: 2.1
------------------------------
class: muv
* avg: 0.229242629576
* sd: 0.00082232383802
* t_sd: 63.5943890608
* t_order: 2.2
------------------------------
class: naruto
* avg: 0.425641252201
* sd: 0.00151855309519
* t_sd: 63.5884999764
* t_order: 2.1
------------------------------
class: zero
* avg: 0.109485181458
* sd: 0.000389838954393
* t_sd: 57.0235890994
* t_order: 3.4
------------------------------
class: akamatu
* avg: 0.259344670378
* sd: 0.000891560474804
* t_sd: 53.3260681909
* t_order: 4.4
------------------------------
class: nade
* avg: 0.0883176079352
* sd: 0.000286864991421
* t_sd: 46.8204370575
* t_order: 7.5
------------------------------
class: ff
* avg: 0.21706730743
* sd: 0.000738857892353
* t_sd: 44.8621334799
* t_order: 8.6
------------------------------
class: gs
* avg: 0.140695615643
* sd: 0.000477107046101
* t_sd: 44.3074991755
* t_order: 9.0
------------------------------
class: HxH
* avg: 0.305548421088
* sd: 0.00104171011893
* t_sd: 43.1230073263
* t_order: 9.6
------------------------------
class: sammon
* avg: 0.0797312092984
* sd: 0.000270106662254
* t_sd: 42.7610087446
* t_order: 9.8
------------------------------
"""
"""
------------------------------
total:
* avg: 0.170416899565
* sd: 0.000596874489076
* t_sd: 56.6402017769
* t_order: 4.9
------------------------------
"""
if False:
#TWCNB
# I think this version is the correct reading, though...
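# Here each word's complement log weight is divided by the absolute value of the sum
# of those weights over the document's words (weightTotal), then multiplied by count.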
results = []
for targetc in classes:
denominator = 0
for ec in classes:
if targetc != ec:
denominator += classWeights[ec]
weights = []
weightTotal = 0
for d in data:
word, count = d
numerator = 0
for ec in classes:
if targetc != ec:
numerator += classWordWeightCache[ec][word]
weight = math.log( (numerator + smoothing) / (denominator + smoothingAll) )
weightTotal += weight
weights.append((weight, word, count))
weightTotal = math.fabs(weightTotal)
result = 0
for d in weights:
weight, word, count = d
weight /= weightTotal
weight *= count
result += weight
results.append((-result, targetc)) # negated so the downstream processing can be shared
results.sort(reverse=True)
"""
The correction for the overall difference between class weights is too strong
------------------------------
class: naruto
* avg: 13.6381259419
* sd: 0.027352385018
* t_sd: 72.261804576
* t_order: 1.0
------------------------------
class: eva
* avg: 4.79923405287
* sd: 0.00721792019273
* t_sd: 64.9145016627
* t_order: 1.6
------------------------------
class: nade
* avg: 6.73628791178
* sd: 0.0112160933504
* t_sd: 60.8821347702
* t_order: 2.1
------------------------------
class: gs
* avg: 7.68968496961
* sd: 0.0119009382662
* t_sd: 58.0733576324
* t_order: 3.4
------------------------------
class: type
* avg: 6.53928309049
* sd: 0.0101006274386
* t_sd: 58.0067771116
* t_order: 4.5
------------------------------
class: sammon
* avg: 6.99148441714
* sd: 0.0107147999552
* t_sd: 57.5027194924
* t_order: 3.1
------------------------------
class: HxH
* avg: 11.7329272032
* sd: 0.0181516989659
* t_sd: 53.0191543658
* t_order: 6.6
------------------------------
class: zero
* avg: 6.66400255582
* sd: 0.00956235635069
* t_sd: 51.6261243177
* t_order: 6.2
------------------------------
class: ff
* avg: 11.1111756914
* sd: 0.0164073247014
* t_sd: 51.4868003902
* t_order: 8.1
------------------------------
class: original
* avg: 7.29357490509
* sd: 0.0107120088055
* t_sd: 50.7473680855
* t_order: 7.8
------------------------------
class: akamatu
* avg: 9.98884550201
* sd: 0.0154106658725
* t_sd: 48.7774841378
* t_order: 8.6
------------------------------
class: muv
* avg: 9.7249427688
* sd: 0.0135902418477
* t_sd: 41.607986123
* t_order: 10.7
------------------------------
class: toraha
* avg: 8.61086969115
* sd: 0.0128939908039
* t_sd: 22.7685179454
* t_order: 13.0
------------------------------
"""
"""
------------------------------
total:
* avg: 8.57849528471
* sd: 0.0134793116591
* t_sd: 53.2057485085
* t_order: 5.9
------------------------------
"""
"""
# It could also be read this way, but that does not seem right
results = []
for targetc in classes:
denominator = 0
for ec in classes:
if targetc != ec:
denominator += classWeights[ec]
weights = []
weightTotal = 0
for d in data:
word, count = d
numerator = 0
for ec in classes:
if targetc != ec:
numerator += classWordWeightCache[ec][word]
weight = math.log( (numerator + smoothing) / (denominator + smoothingAll) )
weightTotal += weight
weights.append((weight, word, count))
weightTotal = math.fabs(weightTotal)
result = 0
for d in weights:
weight, word, count = d
weight /= weightTotal
weight *= count
result += weight
results.append((result, targetc))
print "-" * 20
results.sort()
for d in results:
weight, targetc = d
try:
print "%s: %s" % (targetc, weight)
except:
pass
continue
"""
if False:
#TCNB?
# Weight normalization
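# Takes a single log over the count-weighted complement sum, then divides by
# classComplementWeightsForNorm[c].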
results = []
for targetc in classes:
denominator = 0
for ec in classes:
if targetc != ec:
denominator += classWeights[ec]
numerator = 0
for d in data:
word, count = d
for ec in classes:
if targetc != ec:
numerator += float(count) * classWordWeightCache[ec][word]
weight = math.log( (numerator + smoothing) / (denominator + smoothingAll) )
weight /= classComplementWeightsForNorm[targetc]
results.append((weight, targetc))
results.sort(reverse=True)
"""
------------------------------
class: eva
* avg: 4.87099177835e-007
* sd: 1.06851775933e-008
* t_sd: 76.4226615668
* t_order: 1.1
------------------------------
class: type
* avg: 2.69077514492e-007
* sd: 1.00413501149e-008
* t_sd: 75.921013835
* t_order: 1.0
------------------------------
class: naruto
* avg: -3.55364232009e-007
* sd: 1.13495785547e-008
* t_sd: 73.8726364136
* t_order: 1.0
------------------------------
class: original
* avg: 1.12023754034e-007
* sd: 7.19700948595e-009
* t_sd: 66.5402984012
* t_order: 2.1
------------------------------
class: muv
* avg: -6.89989489719e-008
* sd: 7.15316890934e-009
* t_sd: 62.9137973967
* t_order: 2.8
------------------------------
class: zero
* avg: 2.44409939089e-007
* sd: 7.67566960156e-009
* t_sd: 61.0624614695
* t_order: 2.3
------------------------------
class: nade
* avg: 1.39660902629e-007
* sd: 6.44298117977e-009
* t_sd: 57.7200122252
* t_order: 3.2
------------------------------
class: akamatu
* avg: -8.06696463057e-008
* sd: 7.45562880648e-009
* t_sd: 55.1561437055
* t_order: 4.8
------------------------------
class: HxH
* avg: -3.89382459583e-007
* sd: 7.37873101148e-009
* t_sd: 54.1396830006
* t_order: 5.9
------------------------------
class: gs
* avg: 3.45436590018e-008
* sd: 7.51327949295e-009
* t_sd: 53.6397384794
* t_order: 4.2
------------------------------
class: sammon
* avg: 1.09837926853e-007
* sd: 6.9175453382e-009
* t_sd: 51.725170962
* t_order: 5.2
------------------------------
class: ff
* avg: -2.35942154699e-007
* sd: 7.37570574225e-009
* t_sd: 50.7426791356
* t_order: 8.1
------------------------------
class: toraha
* avg: -5.81099984164e-008
* sd: 4.91005895679e-009
* t_sd: 40.1049564352
* t_order: 10.1
------------------------------
"""
"""
------------------------------
total:
* avg: 1.60142641498e-008
* sd: 7.85352959906e-009
* t_sd: 59.9970194636
* t_order: 3.98461538462
------------------------------
"""
if False:
#TCNB?
# Weight normalization
# count is not used
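# Same as above, except the per-word counts are not multiplied in when accumulating
# the numerator.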
results = []
for targetc in classes:
denominator = 0
for ec in classes:
if targetc != ec:
denominator += classWeights[ec]
numerator = 0
for d in data:
word, count = d
for ec in classes:
if targetc != ec:
numerator += classWordWeightCache[ec][word]
weight = math.log( (numerator + smoothing) / (denominator + smoothingAll) )
weight /= classComplementWeightsForNorm[targetc]
results.append((weight, targetc))
results.sort(reverse=True)
"""
------------------------------
class: original
* avg: 6.62625214255e-007
* sd: 4.6525737139e-009
* t_sd: 77.5025301841
* t_order: 1.0
------------------------------
class: type
* avg: 7.61930966997e-007
* sd: 6.059993792e-009
* t_sd: 76.0942398112
* t_order: 1.1
------------------------------
class: muv
* avg: 5.85507315441e-007
* sd: 4.06532772127e-009
* t_sd: 73.1164726683
* t_order: 1.3
------------------------------
class: toraha
* avg: 5.77663868144e-007
* sd: 4.03817609203e-009
* t_sd: 72.9741398349
* t_order: 1.2
------------------------------
class: eva
* avg: 8.9897859696e-007
* sd: 6.27732496249e-009
* t_sd: 71.1492435196
* t_order: 1.4
------------------------------
class: naruto
* avg: 4.4697418711e-007
* sd: 3.03207874445e-009
* t_sd: 66.051791715
* t_order: 1.9
------------------------------
class: zero
* avg: 7.35161031788e-007
* sd: 4.93589286369e-009
* t_sd: 59.8616699465
* t_order: 2.9
------------------------------
class: akamatu
* avg: 5.64897463998e-007
* sd: 3.68844549795e-009
* t_sd: 56.4644369085
* t_order: 3.4
------------------------------
class: HxH
* avg: 3.96867750141e-007
* sd: 2.60172307858e-009
* t_sd: 45.5582059752
* t_order: 7.8
------------------------------
class: nade
* avg: 6.6466937989e-007
* sd: 3.88504712872e-009
* t_sd: 45.4412369728
* t_order: 7.7
------------------------------
class: ff
* avg: 5.08462211128e-007
* sd: 3.4411400955e-009
* t_sd: 45.0967878964
* t_order: 8.5
------------------------------
class: gs
* avg: 6.22407865462e-007
* sd: 3.9970898373e-009
* t_sd: 43.8867624539
* t_order: 8.5
------------------------------
class: sammon
* avg: 6.42764861928e-007
* sd: 3.98313719041e-009
* t_sd: 42.9237257041
* t_order: 8.8
------------------------------
"""
"""
------------------------------
total:
* avg: 6.2068543948e-007
* sd: 4.20445774756e-009
* t_sd: 59.7016341223
* t_order: 4.26923076923
------------------------------
"""
if False:
#TCNB?
# No weight normalization
# Accuracy is good enough when the data set is small and the class imbalance is small
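# Single log over the complement sum; see the note on count below.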
results = []
for targetc in classes:
denominator = 0
for ec in classes:
if targetc != ec:
denominator += classWeights[ec]
numerator = 0
for d in data:
word, count = d
for ec in classes:
if targetc != ec:
numerator += classWordWeightCache[ec][word]
weight = math.log( (numerator + smoothing) / (denominator + smoothingAll) )
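# count here is simply left over from the last iteration of the loop above, so the
# log weight gets scaled by an arbitrary word's count; this looks like a bug.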
weight *= count
results.append((-weight, targetc)) # negated so the downstream processing can be shared
results.sort(reverse=True)
"""
------------------------------
class: eva
* avg: 7.31008067079
* sd: 0.0477056649335
* t_sd: 77.13911659
* t_order: 1.0
------------------------------
class: type
* avg: 3.41829310554
* sd: 0.0221013252225
* t_sd: 76.9002779576
* t_order: 1.0
------------------------------
class: muv
* avg: 8.01272630654
* sd: 0.0519458005303
* t_sd: 74.7011672721
* t_order: 1.0
------------------------------
class: naruto
* avg: 2.36184923361
* sd: 0.0135671588159
* t_sd: 73.271893772
* t_order: 1.0
------------------------------
class: toraha
* avg: 8.45739691564
* sd: 0.0261536550873
* t_sd: 66.9009896355
* t_order: 2.2
------------------------------
class: original
* avg: 5.25567675686
* sd: 0.0225737024698
* t_sd: 66.5506223112
* t_order: 1.7
------------------------------
class: zero
* avg: 3.83547109643
* sd: 0.0167425389659
* t_sd: 62.6982672155
* t_order: 2.1
------------------------------
class: nade
* avg: 2.32819454263
* sd: 0.00769474520835
* t_sd: 58.8801720694
* t_order: 3.4
------------------------------
class: akamatu
* avg: 2.59794472306
* sd: 0.0118857727134
* t_sd: 58.7072128044
* t_order: 3.4
------------------------------
class: HxH
* avg: 2.15826330926
* sd: 0.00905616346116
* t_sd: 55.1769348426
* t_order: 4.7
------------------------------
class: gs
* avg: 7.37470112088
* sd: 0.0307546830308
* t_sd: 53.7358576848
* t_order: 4.7
------------------------------
class: sammon
* avg: 3.55448591211
* sd: 0.0144491717436
* t_sd: 53.0977454714
* t_order: 4.9
------------------------------
class: ff
* avg: 8.67338981059
* sd: 0.0406567143574
* t_sd: 50.2106939107
* t_order: 7.2
------------------------------
"""
"""
------------------------------
total:
* avg: 5.02603642338
* sd: 0.02425285358
* t_sd: 63.6900731952
* t_order: 2.94615384615
------------------------------
"""
"""
# This works well
results = []
for targetc in classes:
complementAll = 0
for ec in classes:
if targetc != ec:
complementAll += classWeights[ec]
weight = 0
for d in data:
word, count = d
for ec in classes:
if targetc != ec:
weight += classWordWeightCache[ec][word]
weight = math.log( (weight + smoothing ) / (complementAll + smoothingAll ) )
results.append((weight, targetc))
print "-" * 20
results.sort()
for d in results:
weight, targetc = d
try:
print "%s: %s" % (targetc, weight)
except:
pass
continue
"""
"""#精度が微妙?
results = []
for targetc in classes:
print targetc
#compute denominator of complement
complementAll = smoothingAll
for ec in classes:
if targetc != ec:
complementAll += classWeights[ec]
print "complementAll: %s" % complementAll
weights = []
for d in data:
word, count = d
#compute numerator of complement
complement = smoothing
for ec in classes:
if targetc != ec:
complement += classWordWeightCache[ec][word]
#print "complement: %s" % complement
#compute weight
weight = math.log( float(complement) / complementAll )
#print "weight: %s" % weight
weights.append((weight, word, count))
weights.sort(reverse=True)
for d in weights[:50]:
weight, word, count = d
try:
print "%s: %s, %s" % (word.encode("shift-jis"), count, weight)
except:
pass
#normalize
nWeights = []
nWeightTotal = 0
for d in weights:
weight, word, count = d
#weight /= classWeights[targetc]
#weight *= count #
nWeights.append((weight, word))
nWeightTotal += weight
#print "weight: %s" % weight
print "nWeightTotal: %s" % nWeightTotal
results.append((nWeightTotal, targetc))
print "-" * 20
results.sort(reverse=True)
for result in results:
nWeightTotal, targetc = result
try:
print "%s: %s" % (targetc.encode("shift-jis"), nWeightTotal)
except:
pass
"""
"""#とりあえず動くがバグあり版
results = []
for targetc in classes:
#compute denominator of complement
print "computing denominator of complement"
complementAll = smoothingAll
for ec in classes:
if targetc != ec:
complementAll += classWeights[ec]
print "complementAll: %s" % complementAll
weights = []
for d in data:
word, count = d
#compute numerator of complement
complement = smoothing
for ec in classes:
if targetc != ec:
complement += getClassWordWeight(ec, word)
#compute weight
weight = count
weight *= math.log( float(complement) / complementAll )
weights.append((weight, word))
#normalize
nWeights = []
nWeightTotal = 0
for d in weights:
weight, word = d
weight /= classWeights[targetc]
nWeights.append((weight, word))
nWeightTotal += weight
print "nWeightTotal: %s" % nWeightTotal
results.append((nWeightTotal, targetc))
results.sort(reverse=True)
for result in results:
nWeightTotal, targetc = result
try:
print "%s: %s" % (targetc.encode("shift-jis"), nWeightTotal)
except:
pass
"""
print "-" * 20
(avg, sd, t_sd) = getSD(results, c)
print "* avg: %s" % avg
print "* sd: %s" % sd
print "* t_sd %s" % t_sd
printProbs(results, c)
t_order = getOrder(results, c)
class_info[c]["count"] += 1
class_info[c]["avg"] += avg
class_info[c]["sd"] += sd
class_info[c]["t_sd"] += t_sd
class_info[c]["t_order"] += float(t_order)
total_info["count"] += class_info[c]["count"]
total_info["avg"] += class_info[c]["avg"]
total_info["sd"] += class_info[c]["sd"]
total_info["t_sd"] += class_info[c]["t_sd"]
total_info["t_order"] += class_info[c]["t_order"]
class_info[c]["avg"] /= class_info[c]["count"]
class_info[c]["sd"] /= class_info[c]["count"]
class_info[c]["t_sd"] /= class_info[c]["count"]
class_info[c]["t_order"] /= class_info[c]["count"]
total_info["avg"] /= total_info["count"]
total_info["sd"] /= total_info["count"]
total_info["t_sd"] /= total_info["count"]
total_info["t_order"] /= total_info["count"]
ci = []
for c in class_info:
ci.append((class_info[c]["t_sd"], c, class_info[c]["avg"], class_info[c]["sd"], class_info[c]["t_order"]))
ci.sort(reverse=True)
print "-" * 30
for d in ci:
t_sd, c, avg, sd, t_order = d
print "class: %s" % c
print "* avg: %s" % avg
print "* sd: %s" % sd
print "* t_sd: %s" % t_sd
print "* t_order: %s" % t_order
print "-" * 30
print "total:"
print "* avg: %s" % total_info["avg"]
print "* sd: %s" % total_info["sd"]
print "* t_sd: %s" % total_info["t_sd"]
print "* t_order: %s" % total_info["t_order"]
print "-" * 30
results = []
for targetc in classes:
denominator = 0
for ec in classes:
if targetc != ec:
denominator += classWeights[ec]
weights = []
for d in data:
word, count = d
numerator = 0
for ec in classes:
if targetc != ec:
numerator += classWordWeightCache[ec][word]
weight = math.log( (numerator + s) / (denominator + s_all) )
weights.append((weight, word, count))
result = 0
for d in weights:
weight, word, count = d
weight *= count
weight /= class_weights[s]["log_complement"][targetc]
result += weight
#result -= math.log(classProbs[targetc])
results.append((-result, targetc)) # negated so the downstream processing can be shared
results.sort(reverse=True)