# sunskyhsh/algorithm.py

## algorithm.py
def qlearning(env, policy, num_iter1, alpha, gamma):
    """Train ``policy`` with Q-learning and return it.

    Every entry of ``policy.theta`` is first reset to 0.1.  Then
    ``num_iter1`` episodes are run.  Each episode starts from
    ``env.reset()`` and ends when ``env.step`` reports the episode as
    finished or after 10000 steps, whichever comes first.  After every step
    the module-level ``update`` helper (expected to be defined elsewhere in
    this module) is called with the one-step target for the state/action
    pair that was just taken.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.  ``step``
            must return a 4-tuple whose first three items are the next
            state, the reward and the finished flag; the fourth is ignored.
        policy: Object exposing ``theta`` (indexable, overwritten in place),
            ``actions`` (iterable of actions), ``epsilon_greedy(state)`` and
            ``qfunc(state, action)``.
        num_iter1: Number of episodes to run.
        alpha: Passed through unchanged to ``update``.
        gamma: Discount factor applied to the best next-state value.

    Returns:
        The same ``policy`` object that was passed in.
    """
    max_steps = 10000  # hard cap so an episode that never finishes cannot hang

    for idx in range(len(policy.theta)):
        policy.theta[idx] = 0.1

    for episode in range(num_iter1):
        state = env.reset()
        action = policy.epsilon_greedy(state)
        done = False
        steps = 0

        while not done and steps < max_steps:
            next_state, reward, done, _ = env.step(action)

            if done:
                # No successor to bootstrap from once the episode is over.
                # NOTE(review): this treats the flag as true termination;
                # confirm the environment does not also use it for time-outs.
                target = reward
            else:
                best_next = max(
                    policy.qfunc(next_state, candidate)
                    for candidate in policy.actions
                )
                target = reward + gamma * best_next
            update(policy, state, action, target, alpha)

            state = next_state
            action = policy.epsilon_greedy(state)
            steps += 1

        if episode % 100 == 0:
            # Single-argument call form works as both the Python 2 print
            # statement and the Python 3 print function.
            print("complete the %d epoches" % (episode))

    return policy