tomykaira/gist:58e1271c0118462e71e17ca26c679f80

## gistfile1.txt
# import gym
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import gym
import random
init_notebook_mode(connected=True)

# Environment the agent is trained on.
env = gym.make('CartPole-v1')
# Unused in the visible code; presumably meant to collect values for
# plotting -- confirm before removing.
data = list()

# Weights of the linear action-value estimate: theta[0..3] multiply the four
# observation components and theta[4] multiplies the action (see q()).
# Each weight starts as a uniform random draw between -1 and 1.
theta = np.random.uniform(-1, 1, 5)
# alpha scales the temporal-difference error in update_qtable();
# gamma discounts the value of the next state-action pair there.
alpha, gamma = 0.1, 0.98

def q(state, action):
    """Return the linear action-value estimate for a state-action pair.

    The estimate is the dot product of the global weight vector ``theta``
    with the features ``(state[0], state[1], state[2], state[3], action)``.

    Args:
        state: Indexable holding at least four numeric components.
        action: Numeric action encoding; the training loop passes -1 or +1.

    Returns:
        The scalar value estimate.

    NOTE(review): the action enters only through the additive term
    ``theta[4] * action``, so ``q(s, 1) - q(s, -1)`` is ``2 * theta[4]``
    (up to rounding) for every state. A greedy choice between the two
    actions therefore does not depend on the state.
    """
    value = theta[0] * state[0]
    # Terms are added strictly left to right: state terms first, action last.
    for idx in range(1, 4):
        value = value + theta[idx] * state[idx]
    return value + theta[4] * action

def update_qtable(state, action, reward, next_state, next_action):
    """Apply one SARSA-style sign update to the global weight vector ``theta``.

    The scaled temporal-difference term
    ``alpha * (reward + gamma * q(next_state, next_action) - q(state, action))``
    is computed first. Each weight is then moved by a fixed step of +1, -1
    or 0: the sign of that term multiplied by the sign of the weight's
    feature (``state[0..3]`` for the first four weights, ``action`` for the
    fifth).

    The signs come from ``np.sign`` rather than ``x / abs(x)``, so a
    temporal-difference term or feature that is exactly zero yields no step
    instead of a 0/0 NaN that would permanently poison ``theta``.

    Args:
        state: Observation before the step (at least four components).
        action: Action taken in ``state``.
        reward: Reward received for the step.
        next_state: Observation after the step.
        next_action: Action selected for ``next_state``.

    Returns:
        None. ``theta`` is modified in place.

    NOTE(review): only the sign of the temporal-difference term is used, so
    the magnitude of ``alpha`` has no influence on the step size.
    """
    delta_q = alpha * (reward + gamma * q(next_state, next_action) - q(state, action))
    # Very rough heuristic: step every weight by the sign of the gradient
    # term only, ignoring its magnitude.
    direction = np.sign(delta_q)
    features = (state[0], state[1], state[2], state[3], action)
    for i, feature in enumerate(features):
        theta[i] += direction * np.sign(feature)

def _greedy_action(state):
    """Return +1 if q() rates it strictly higher than -1, otherwise -1."""
    return 1 if q(state, -1) < q(state, 1) else -1


# Train for 10000 episodes, always acting greedily with respect to q()
# (there is no exploration step) and updating theta after every step.
for i in range(10000):
    obs = env.reset()
    action = _greedy_action(obs)
    turn = 0
    done = False
    while not done:
        # The learner encodes actions as -1/+1; env.step() is given 0/1.
        bin_action = 0 if action == -1 else 1
        # Four values are unpacked from step(); the fourth is ignored.
        next_obs, reward, done, _ = env.step(bin_action)
        turn += 1
        # An episode that ends before its 500th step has its final reward
        # replaced by -500 (presumably 500 is the CartPole-v1 step cap, so
        # only failures are penalised -- confirm).
        if done and turn < 500:
            reward = -500
        next_action = _greedy_action(next_obs)
        update_qtable(obs, action, reward, next_obs, next_action)
        obs, action = next_obs, next_action

# Report the final weights and the length of the last episode.
print(theta, turn)
	# import gym
	import numpy as np
	import plotly.plotly as py
	import plotly.graph_objs as go
	from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
	import gym
	import random
	init_notebook_mode(connected=True)

	# Environment the agent is trained on.
	env = gym.make('CartPole-v1')
	# Unused in the visible code; presumably meant to collect values for
	# plotting -- confirm before removing.
	data = list()

	# Weights of the linear action-value estimate: theta[0..3] multiply the
	# four observation components and theta[4] multiplies the action.
	# Each weight starts as a uniform random draw between -1 and 1.
	theta = np.random.uniform(-1, 1, 5)
	# alpha scales the temporal-difference error used for the weight update;
	# gamma discounts the value of the next state-action pair.
	alpha, gamma = 0.1, 0.98

	def q(state, action):
	    """Return the linear action-value estimate for a state-action pair.

	    The estimate is the dot product of the global weight vector ``theta``
	    with the features ``(state[0], state[1], state[2], state[3], action)``.

	    Args:
	        state: Indexable holding at least four numeric components.
	        action: Numeric action encoding; the training loop passes -1 or +1.

	    Returns:
	        The scalar value estimate.

	    NOTE(review): the action enters only through the additive term
	    ``theta[4] * action``, so ``q(s, 1) - q(s, -1)`` is ``2 * theta[4]``
	    (up to rounding) for every state. A greedy choice between the two
	    actions therefore does not depend on the state.
	    """
	    # The function body had lost its indentation; it is restored here.
	    value = theta[0] * state[0]
	    # Terms are added strictly left to right: state terms first, action last.
	    for idx in range(1, 4):
	        value = value + theta[idx] * state[idx]
	    return value + theta[4] * action

	def update_qtable(state, action, reward, next_state, next_action):
	    """Apply one SARSA-style sign update to the global weight vector ``theta``.

	    The scaled temporal-difference term
	    ``alpha * (reward + gamma * q(next_state, next_action) - q(state, action))``
	    is computed first. Each weight is then moved by a fixed step of +1, -1
	    or 0: the sign of that term multiplied by the sign of the weight's
	    feature (``state[0..3]`` for the first four weights, ``action`` for
	    the fifth).

	    The signs come from ``np.sign`` rather than ``x / abs(x)``, so a
	    temporal-difference term or feature that is exactly zero yields no
	    step instead of a 0/0 NaN that would permanently poison ``theta``.

	    Args:
	        state: Observation before the step (at least four components).
	        action: Action taken in ``state``.
	        reward: Reward received for the step.
	        next_state: Observation after the step.
	        next_action: Action selected for ``next_state``.

	    Returns:
	        None. ``theta`` is modified in place.

	    NOTE(review): only the sign of the temporal-difference term is used,
	    so the magnitude of ``alpha`` has no influence on the step size.
	    """
	    # The nesting of this function had been flattened; it is restored here.
	    delta_q = alpha * (reward + gamma * q(next_state, next_action) - q(state, action))
	    # Very rough heuristic: step every weight by the sign of the gradient
	    # term only, ignoring its magnitude.
	    direction = np.sign(delta_q)
	    features = (state[0], state[1], state[2], state[3], action)
	    for i, feature in enumerate(features):
	        theta[i] += direction * np.sign(feature)

	def _greedy_action(state):
	    """Return +1 if q() rates it strictly higher than -1, otherwise -1."""
	    return 1 if q(state, -1) < q(state, 1) else -1


	# The loop nesting had been flattened in this copy of the script; it is
	# reconstructed to match the intact copy earlier in the file.
	# Train for 10000 episodes, always acting greedily with respect to q()
	# (there is no exploration step) and updating theta after every step.
	for i in range(10000):
	    obs = env.reset()
	    action = _greedy_action(obs)
	    turn = 0
	    done = False
	    while not done:
	        # The learner encodes actions as -1/+1; env.step() is given 0/1.
	        bin_action = 0 if action == -1 else 1
	        # Four values are unpacked from step(); the fourth is ignored.
	        next_obs, reward, done, _ = env.step(bin_action)
	        turn += 1
	        # An episode that ends before its 500th step has its final reward
	        # replaced by -500 (presumably 500 is the CartPole-v1 step cap,
	        # so only failures are penalised -- confirm).
	        if done and turn < 500:
	            reward = -500
	        next_action = _greedy_action(next_obs)
	        update_qtable(obs, action, reward, next_obs, next_action)
	        obs, action = next_obs, next_action

	# Report the final weights and the length of the last episode.
	print(theta, turn)