# prakhar21/cartpole-randomsearch.py

## cartpole-randomsearch.py
'''
@uthor: Prakhar Mishra
'''

import gym
import numpy as np
env = gym.make('CartPole-v0')

class LinearCombinationPolicy():
    '''
    Binary policy driven by a linear score of the observation.

    The score is the matrix product of a weight vector and the observation;
    a negative score selects action 0, anything else selects action 1.
    The instance also carries the search budget (``episodes`` and ``steps``)
    and remembers the most recently selected action in ``self.action``.
    '''

    def __init__(self, episodes=1000, steps=200):
        '''
        Store the search budget.

        episodes -- value kept in ``self.episodes`` (default 1000)
        steps    -- value kept in ``self.steps`` (default 200)
        '''
        # -1 marks "no action selected yet"; valid actions are 0 and 1.
        self.action = -1
        self.episodes = episodes
        self.steps = steps

    def combine(self, parameters, observation):
        '''Return ``np.matmul(parameters, observation)``.'''
        return np.matmul(parameters, observation)

    def action_selection(self, observation, parameters):
        '''
        Score the observation and record the chosen action.

        Sets ``self.action`` to 0 when the score is below zero and to 1
        otherwise. Returns nothing; read ``self.action`` for the result.
        '''
        score = self.combine(parameters, observation)
        self.action = 0 if score < 0 else 1

    def get_action(self, observation, parameters):
        '''Select an action for ``observation`` and return it (0 or 1).'''
        self.action_selection(observation, parameters)
        return self.action


# Random search over linear policies: each episode draws a fresh weight
# vector, plays it for at most `linearLearn.steps` steps, and the search
# stops at the first vector whose episode reaches the target total reward.
linearLearn = LinearCombinationPolicy()
# `range` (not `xrange`) and single-argument `print(...)` behave the same on
# Python 2 and also run on Python 3.
for _ in range(linearLearn.episodes):
    observation = env.reset()  # reset the environment state
    # New weights every episode, drawn uniformly from [-1, 1).
    # The length 4 presumably matches the CartPole observation size -- confirm.
    parameters = np.random.rand(4) * 2 - 1
    TOTALREWARD = 0
    for step in range(linearLearn.steps):
        env.render()
        action = linearLearn.get_action(observation, parameters)
        observation, reward, done, _ = env.step(action)
        TOTALREWARD += reward
        if done:
            break
    # 200 is hard-coded; it equals the default `steps` budget.
    if TOTALREWARD == 200:
        print('total reward is {}'.format(TOTALREWARD))
        print('parameters are {}'.format(parameters))
        break

# Release the render window and other environment resources once the
# search is over.
env.close()