Skip to content

Instantly share code, notes, and snippets.

@johnsmith17th
Created May 27, 2013 05:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save johnsmith17th/5655323 to your computer and use it in GitHub Desktop.
Save johnsmith17th/5655323 to your computer and use it in GitHub Desktop.
Text Classifiers
'''
@author: John Smith
'''
class Classifier(object):
''' base classifier '''
def __init__(self, **kwargs):
pass
def loadm(self, data):
''' load model from dictionary object '''
pass
def savem(self):
''' get dictionary object of model '''
return { }
def load(self, filename):
''' load model from file '''
import json
finp = open(filename, 'r')
text = finp.read()
finp.close()
data = json.loads(text)
self.loadm(data)
pass
def save(self, filename):
''' save model to file '''
import json
data = self.savem()
text = json.dumps(data, ensure_ascii = False, indent = 4)
fout = open(filename, 'w')
fout.write(text)
fout.close()
pass
def model(self, X, Y):
''' build classifier model '''
''' X: the feature weight array of samples '''
''' Y: the label array of samples '''
pass
def predict(self, x):
''' to predict the label of sample '''
pass
'''
@author: John Smith
'''
from base import Classifier
class NaiveBayes(Classifier):
''' base naive bayes classifier '''
@staticmethod
def sampling(terms, docs):
X = [] # features
Y = [] # labels
for d in docs:
d.prepare()
X.append([d.seglist.count(t) for t in terms])
Y.append(1 if d.cat == '-' else -1)
return (X, Y)
def savem(self):
data = {'probc': self.probc,
'probf': self.probf,
'cats': self.cats,
'numx': self.numx,
'numf': self.numf,
'numc': self.numc,
}
return data
def loadm(self, data):
self.probc = data['probc']
self.probf = data['probf']
self.cats = data['cats']
self.numx = data['numx']
self.numf = data['numf']
self.numc = data['numc']
def model(self, X, Y):
Classifier.model(self, X, Y)
self.probc = []
self.probf = []
self.cats = list(set(Y))
self.numx = len(X) # number of sample
self.numf = len(X[0]) # number of feature
self.numc = len(self.cats) # number of category
class BernoulliBayes(NaiveBayes):
''' bernoulli naive bayes classifier '''
def model(self, X, Y):
NaiveBayes.model(self, X, Y)
sit = range(self.numx)
cit = range(self.numc)
fit = range(self.numf)
sta1 = lambda c: sum(1 for y in Y if y == c) # N(cat)
sta2 = lambda f, c: sum(1 for i in sit if Y[i] == c and X[i][f] > 0) # N(feature, cat)
for c in cit:
ci = self.cats[c]
nc = sta1(ci) # N(c)
pc = float(nc) / float(self.numx) # P(c)
pf = [0] * self.numf
for f in fit:
fn = sta2(f, ci) # N(f, c)
pf[f] = float(1 + fn) / float(self.numc + nc)
self.probc.append(pc)
self.probf.append(pf)
def predict(self, x):
p = [0] * self.numc
cit = range(self.numc)
fit = range(self.numf)
for c in cit:
pi = 1
for f in fit:
if x[f] > 0: pi *= self.probf[c][f]
else: pi *= (1.0 - self.probf[c][f])
p[c] = pi * self.probc[c]
return self.cats[p.index(max(p))]
class MultinomialBayes(NaiveBayes):
''' multinomial naive bayes classifier '''
def model(self, X, Y):
NaiveBayes.model(self, X, Y)
sit = range(self.numx)
cit = range(self.numc)
fit = range(self.numf)
sta1 = lambda c: sum(1 for y in Y if y == c) # N(cat)
sta2 = lambda f, c: sum(X[i][f] for i in sit if Y[i] == c) # F(feature, cat)
for c in cit:
ci = self.cats[c]
nc = sta1(ci) # N(c)
pc = float(nc) / float(self.numx) # P(c)
pf = [0] * self.numf
for f in fit:
pf[f] = sta2(f, ci)
ss = sum(pf) + self.numx
for f in fit:
pf[f] = float(1 + pf[f]) / float(ss)
self.probc.append(pc)
self.probf.append(pf)
def predict(self, x):
p = [0] * self.numc
cit = range(self.numc)
fit = range(self.numf)
fact = lambda x: x and x * fact(x - 1) or 1
for c in cit:
pi = 1
for f in fit:
nf = x[f]
pi *= float(self.probf[c][f] ** nf) / float(fact(nf))
p[c] = pi * self.probc[c]
return self.cats[p.index(max(p))]
'''
@author: Smith
'''
from bayes import NaiveBayes
from nnetwork import NeuralNetwork
from utils import udoc
import random
class BayesNetwork(object):
def __init__(self, features):
tar = {+1: [+1], -1:[-1]}
self.features = features
self.bayes = {}
for k in self.features:
self.bayes[k] = NaiveBayes(len(self.features[k]), [+1, -1])
self.network = NeuralNetwork(tar, len(self.bayes), len(self.bayes), 1, 0.01)
pass
def model(self, docs):
sf = {}
for k in self.features:
s = udoc.sampling(self.features[k], docs, {'+': +1, '-': -1})
sf[k] = s
self.bayes[k].model(s)
for i in range(len(docs)):
d = docs[i]
c = +1 if d.cat == '+' else -1
sx = []
fx = []
for k in self.features:
s = sf[k][i]
fx.append(self.bayes[k].predict(s))
sx.append((fx, c))
self.network.model(sx)
def predict(self, doc):
sx = []
c = +1 if doc.cat == '+' else -1
for k in self.features:
s = udoc.sampling_one(self.features[k], doc, {'+': +1, '-': -1})
sx.append(self.bayes[k].predict(s))
return self.network.predict((sx, c))
def validate(self, docs, fold = 10):
p, r, a, f = [0] * 4
s = list(docs)
random.shuffle(s)
for k in range(fold):
trainset = [x for i, x in enumerate(s) if i % fold != k]
testset = [x for i, x in enumerate(s) if i % fold == k]
(pi, ri, ai, fi) = self.test(trainset, testset)
print (pi, ri, ai, fi)
p += pi; r += ri; a += ai; f += fi;
m = float(fold)
print (p / m, r / m, a / m, f / m)
return (p / m, r / m, a / m, f / m)
def test(self, trainset, testset):
self.model(trainset)
pc = [(self.predict(x), +1 if x.cat == '+' else -1) for x in testset]
mp, np, nr, ma = [0] * 4
for x in pc:
if x[0] == -1 and x[1] == -1: mp += 1
if x[0] == -1: np += 1
if x[1] == -1: nr += 1
if x[0] == x[1]: ma += 1
p = (float(mp) / float(np)) if np > 0 else 0 # precision
r = (float(mp) / float(nr)) if nr > 0 else 0 # recall
a = float(ma) / float(len(pc)) # accuracy
f = self.f_mesure(p, r, 1.0) # f-mesure
return (p, r, a, f)
def f_mesure(self, p, r, beta = 1):
b = beta ** 2
try: f = float((b + 1) * p * r) / float(b * p + r)
except: f = 0
return f
'''
@author: John Smith
'''
from base import Classifier
class AdaBoost(Classifier):
def __init__(self, classifier, T):
self.classifier = classifier
self.T = T
pass
def model(self, X, Y):
import copy, math
self.alpha = []
self.classifiers = []
N = len(X)
nit = range(N)
w = [1.0 / N] * N
for t in range(self.T):
cls = copy.deepcopy(self.classifier)
cls.model(X, Y)
Yp = [cls.predict(x) for x in X]
e = sum(w[i] for i in nit if Yp[i] != Y[i]) / float(N)
a = 0.5 * math.log((1 - e) / e)
for i in nit:
if Yp[i] == Y[i]: w[i] *= math.exp(-a)
else: w[i] *= math.exp(a)
z = sum(w) * 1.0
for i in nit: w[i] /= z
self.alpha.append(a)
self.classifiers.append(cls)
def predict(self, x):
p = sum(self.alpha[t] * self.classifiers[t].predict(x) for t in range(self.T))
return 1 if p > 0 else -1
'''
@author: John Smith
'''
import math, random
from base import Classifier
class NeuralNetwork(Classifier):
''' neural network classifier '''
@staticmethod
def sampling(terms, docs):
X = [] # features
Y = [] # labels
for d in docs:
d.prepare()
X.append([d.seglist.count(t) for t in terms])
Y.append(1 if d.cat == '-' else -1)
return (X, Y)
def __init__(self, target, i, h, o, r = 0.01):
self.target = target
# network output for each category
# e.g. {+1: [+1, +1, +1], [-1: [-1, -1, -1], ...}
self.ni = i # number of input nodes
self.nh = h # number of hidden nodes
self.no = o # number of output nodes
self.rate = r # learning rate
def savem(self):
data = {'target': self.target, 'rate': self.rate,
'ni': self.ni, 'nh': self.nh, 'no': self.no,
'vi': self.vi, 'vh': self.vh, 'vo': self.vo,
'wi': self.wi, 'wo': self.wo
}
return data
def loadm(self, data):
self.target = data['target']
self.rate = data['rate']
self.ni = data['ni']
self.nh = data['nh']
self.no = data['no']
self.vi = data['vi']
self.vh = data['vh']
self.vo = data['vo']
self.wi = data['wi']
self.wo = data['wo']
def model(self, X, Y):
ri = range(self.ni)
rh = range(self.nh)
ro = range(self.no)
rx = range(len(X))
self.vi = [1.0] * self.ni
# values of input nodes
self.vh = [1.0] * self.nh
# values of hidden nodes
self.vo = [1.0] * self.no
# values of output nodes
self.wi = [[random.uniform(-self.rate, self.rate) for j in ri] for x in rh]
# weight matrix for input nodes to hidden nodes
self.wo = [[random.uniform(-self.rate, self.rate) for j in rh] for x in ro]
# weight matrix for hidden nodes to output nodes
for i in rx:
self.forword(X[i])
self.backpropagate(self.target[Y[i]])
# train the network
def predict(self, x):
output = self.forword(x)
d = {t: self.euclidean(output, self.target[t]) for t in self.target}
return min(d, key = d.get)
def forword(self, inputs):
for i in range(self.ni):
self.vi[i] = inputs[i]
# activate inputs nodes
for i in range(self.nh):
self.vh[i] = self.sigmod(self.dotproduct(self.vi, self.wi[i]))
# activate hidden nodes
for i in range(self.no):
self.vo[i] = self.sigmod(self.dotproduct(self.vh, self.wo[i]))
# activate output nodes
return self.vo[:]
def backpropagate(self, outputs):
ri = range(self.ni)
rh = range(self.nh)
ro = range(self.no)
o_deltas = [0.0] * self.no
for i in ro:
e = outputs[i] - self.vo[i]
o_deltas[i] = self.dsigmoid(self.vo[i]) * e
# calculate error for output
h_deltas = [0.0] * self.nh
for i in rh:
e = 0.0
for j in ro:
e += o_deltas[j] * self.wo[j][i]
h_deltas[i] = self.dsigmoid(self.vh[i]) * e
# calculate error for hidden
for i in ro:
for j in rh:
self.wo[i][j] += self.rate * o_deltas[i] * self.vh[j]
# update output weights
for i in rh:
for j in ri:
self.wi[i][j] += self.rate * h_deltas[i] * self.vi[j]
# update input weights
e = sum([0.5 * (outputs[i] - self.vo[i]) ** 2 for i in ro])
#print e
return e
def sigmod(self, x):
#return 1.0 / (1 + math.exp(-x))
return math.tanh(x)
def dsigmoid(self, x):
#return x * (1.0 - x)
return 1.0 - x ** 2
def dotproduct(self, x, y):
# calculate dot product of two vectors
return math.fsum(x[i] * y[i] for i in range(len(x)))
def euclidean(self, x, y):
# calculate euclidean distance of two vector
return math.sqrt(math.fsum([math.pow(x[i] - y[i], 2) for i in range(len(x))]))
'''
@author: John Smith
'''
class CrossValidation(object):
def shuffle(self, samples):
import random
(X, Y) = samples
array = range(len(X))
random.shuffle(array)
samples = ([X[i] for i in array], [Y[i] for i in array])
return samples
def validate(self, classifier, samples, cat, fold = 10):
p, r, a, f = [0] * 4
X, Y = self.shuffle(samples)
sit = range(len(X))
for k in range(fold):
trainx = [X[i] for i in sit if i % fold != k]
trainy = [Y[i] for i in sit if i % fold != k]
testx = [X[i] for i in sit if i % fold == k]
testy = [Y[i] for i in sit if i % fold == k]
(pi, ri, ai, fi) = self.test(classifier, trainx, trainy, testx, testy, cat)
self.printf(pi, ri, ai, fi)
p += pi; r += ri; a += ai; f += fi;
m = float(fold)
self.printf(p / m, r / m, a / m, f / m)
return (p / m, r / m, a / m, f / m)
def test(self, classifier, trainx, trainy, testx, testy, cat):
classifier.model(trainx, trainy)
pc = [(classifier.predict(testx[i]), testy[i]) for i in range(len(testx))]
mp, np, nr, ma = [0] * 4
for x in pc:
if x[0] == cat and x[1] == cat: mp += 1
if x[0] == cat: np += 1
if x[1] == cat: nr += 1
if x[0] == x[1]: ma += 1
p = (float(mp) / float(np)) if np > 0 else 0 # precision
r = (float(mp) / float(nr)) if nr > 0 else 0 # recall
a = float(ma) / float(len(pc)) # accuracy
f = self.f_mesure(p, r, 1.0) # f-mesure
return (p, r, a, f)
def f_mesure(self, p, r, beta = 1):
b = beta ** 2
try: f = float((b + 1) * p * r) / float(b * p + r)
except: f = 0
return f
def printf(self, p, r, a, f):
t = (p * 100.0, r * 100.0, a * 100.0, f * 100.0)
print 'P: %0.4f%% R: %0.4f%% A: %0.4f%% F: %0.4f%%' % t
'''
@author: Smith
'''
from base import Classifier
class BalanceWinnow(Classifier):
@staticmethod
def sampling(terms, docs):
X = []
Y = []
for d in docs:
d.prepare()
X.append([1 if t in d.segset else 0 for t in terms])
Y.append(1 if d.cat == '-' else 0)
return (X, Y)
@staticmethod
def theta(terms, docs):
a = [sum(d.seglist.count(t) for t in terms) for d in docs]
return sum(a) / len(a)
def __init__(self, alpha, beta, theta):
self.alpha = alpha
self.beta = beta
self.theta = theta
def savem(self):
data = {'numf': self.numf,
'alpha': self.alpha,
'beta': self.beta,
'theta': self.theta,
'w_pos': self.w_pos,
'w_neg': self.w_neg,
}
return data
def loadm(self, data):
self.features = data['numf']
self.alpha = data['alpha']
self.beta = data['beta']
self.theta = data['theta']
self.w_pos = data['w_pos']
self.w_neg = data['w_neg']
def model(self, X, Y):
self.numf = len(X[0])
self.w_pos = [2] * self.numf
self.w_neg = [1] * self.numf
sit = range(len(X))
fit = range(self.numf)
for i in sit:
p = self.predict(X[i])
if p == 1 and p != Y[i]:
# decrease the weights
for f in fit:
if X[i][f] != 0:
self.w_pos[f] *= self.beta
self.w_neg[f] *= self.alpha
elif p == 0 and p != Y[i]:
# increase the weights
for f in fit:
if X[i][f] != 0:
self.w_pos[f] *= self.alpha
self.w_neg[f] *= self.beta
def predict(self, x):
r = range(self.numf)
s = sum((self.w_pos[i] - self.w_neg[i]) * x[i] for i in r)
return 1 if s > self.theta else 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment