Skip to content

Instantly share code, notes, and snippets.

@ningyuwhut
Created September 8, 2014 12:22
Show Gist options
  • Save ningyuwhut/46c260e99539c263ead8 to your computer and use it in GitHub Desktop.
Save ningyuwhut/46c260e99539c263ead8 to your computer and use it in GitHub Desktop.
#encoding=gbk
import random
import math
# Load the dataset
def load_data(filename):
    """Read a comma-separated dataset into feature rows and labels.

    Every non-blank line is split on ','. The last field is the label and
    the preceding fields are the features; a constant 1 is appended to each
    feature row so that the matching weight acts as the intercept term.

    Fields are returned as the strings read from the file (only the appended
    1 is an int); consumers convert them with float().

    Args:
        filename: path of the CSV file to read.

    Returns:
        A tuple (x_input, y_output) where x_input is the list of feature
        rows and y_output is the list of labels, both in file order.
    """
    x_input = []
    y_output = []
    # The with-statement closes the file even if parsing raises.
    with open(filename) as dataset:
        for line in dataset:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line carries no sample; keeping it
                # would add a row with no features and an empty label.
                continue
            fields = stripped.split(',')
            features = fields[:-1]
            features.append(1)  # bias input
            x_input.append(features)
            y_output.append(fields[-1])
    return (x_input, y_output)
def sigmoid(x):
    """Return the logistic function 1 / (1 + e**-x) of x.

    The value is computed in a form whose exponential argument is never
    positive, so inputs of large magnitude saturate to 0.0 or 1.0 instead of
    raising OverflowError.

    Args:
        x: a real number.

    Returns:
        A float in the closed interval [0.0, 1.0].
    """
    if x >= 0:
        return 1.0 / (1.0 + math.exp(-x))
    # For negative x use the equivalent e**x / (1 + e**x) so that the
    # exponential underflows to 0.0 rather than overflowing.
    exp_x = math.exp(x)
    return exp_x / (1.0 + exp_x)
def compute_log_likelihood(trainset_x, trainset_y, weights):
    """Return the logistic-regression log-likelihood of a labelled set.

    For every sample the term y * wx - log(1 + e**wx) is accumulated, where
    wx is the dot product of the weights with the sample's features. The
    total is also printed, prefixed with "log_likelihood".

    Args:
        trainset_x: list of feature rows; each value must be accepted by
            float().
        trainset_y: list of labels, one per row; each must be accepted by
            float().
        weights: list of numeric weights, one per feature.

    Returns:
        The log-likelihood as a float (0.0 for an empty set).
    """
    log_likelihood = 0.0
    for row, label in zip(trainset_x, trainset_y):
        wx = sum(w * float(value) for w, value in zip(weights, row))
        # log(1 + e**wx) rewritten as max(wx, 0) + log1p(e**-|wx|): the
        # exponential argument is never positive, so it cannot overflow.
        softplus = max(wx, 0.0) + math.log1p(math.exp(-abs(wx)))
        log_likelihood += float(label) * wx - softplus
    print("log_likelihood %s" % log_likelihood)
    return log_likelihood
# Gradient ascent
def train(trainset_x, trainset_y, max_iter, alpha=0.001, tolerance=0.001,
          early_stop=False):
    """Fit logistic-regression weights by batch gradient ascent.

    All weights start at 1. Each iteration computes, with the current
    weights, the error (label - sigmoid(w . x)) of every sample and then
    moves every weight by alpha times the sum of error * feature over all
    samples. The log-likelihood is evaluated after each update and
    diagnostics are printed when it decreases or when its relative change
    drops below `tolerance`.

    Args:
        trainset_x: non-empty list of feature rows; values must be accepted
            by float().
        trainset_y: list of labels, one per row; values must be accepted by
            float().
        max_iter: maximum number of gradient-ascent iterations.
        alpha: step size of each update.
        tolerance: threshold on the absolute relative change of the
            log-likelihood below which the fit is reported as converged.
        early_stop: when False (the default) all `max_iter` iterations run
            and the diagnostics are informational only. When True, training
            stops as soon as the relative change is below `tolerance`, or as
            soon as the log-likelihood decreases, in which case the weights
            from before that update are returned.

    Returns:
        The list of fitted weights, one per feature.

    Raises:
        IndexError: if `trainset_x` is empty.
    """
    # Convert once up front instead of calling float() inside every loop.
    rows = [[float(value) for value in row] for row in trainset_x]
    labels = [float(label) for label in trainset_y]
    sample_num = len(rows)
    feature_num = len(rows[0])
    weights = [1.0] * feature_num
    print("sample_num, feature_num %s %s" % (sample_num, feature_num))

    # The likelihood after one update is the starting likelihood of the next
    # iteration, so it is evaluated once per iteration and carried over.
    old_log_likelihood = compute_log_likelihood(rows, labels, weights)
    for _ in range(max_iter):
        old_weights = weights
        # Per-sample prediction error under the weights of this iteration.
        errors = [
            label - sigmoid(sum(value * w for value, w in zip(row, old_weights)))
            for row, label in zip(rows, labels)
        ]
        # Simultaneous update: every gradient uses the same `errors`.
        weights = [
            w + alpha * sum(err * row[k] for err, row in zip(errors, rows))
            for k, w in enumerate(old_weights)
        ]
        new_log_likelihood = compute_log_likelihood(rows, labels, weights)

        # Guard the division: a previous likelihood of exactly 0.0 would
        # otherwise raise ZeroDivisionError.
        if old_log_likelihood != 0.0:
            relative_change = ((new_log_likelihood - old_log_likelihood)
                               / old_log_likelihood)
        elif new_log_likelihood == old_log_likelihood:
            relative_change = 0.0
        else:
            relative_change = float("inf")

        if new_log_likelihood < old_log_likelihood:
            # The update made the fit worse.
            print("new %s" % new_log_likelihood)
            print("old %s" % old_log_likelihood)
            print("error")
            print(relative_change)
            if early_stop:
                weights = old_weights
                break
        if abs(relative_change) < tolerance:
            print(relative_change)
            if early_stop:
                break
        old_log_likelihood = new_log_likelihood
    return weights
if __name__ == "__main__":
    # Script entry point: load the dataset, split it at random into a
    # training set and a test set, then fit the weights on the training set.
    dataset_file = "ds1.10.csv"
    x_input, y_output = load_data(dataset_file)
    print(len(x_input))

    max_iter = 500
    trainset_x, trainset_y = [], []
    testset_x, testset_y = [], []
    # Build the training and test sets: a draw of 3..10 out of 1..10 sends
    # the sample to the training set, a draw of 1 or 2 to the test set.
    for features, label in zip(x_input, y_output):
        if random.randint(1, 10) > 2:
            target_x, target_y = trainset_x, trainset_y
        else:
            target_x, target_y = testset_x, testset_y
        target_x.append(features)
        target_y.append(label)

    print("%d %d" % (len(trainset_x), len(trainset_y)))
    print("%d %d" % (len(testset_x), len(testset_y)))
    weights = train(trainset_x, trainset_y, max_iter)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment