Jeremi Kaczmarczyk (jknthn)

def monte_carlo_e_soft(env, episodes=100, policy=None, epsilon=0.01):
    if not policy:
        policy = create_random_policy(env)  # 1. Start from an equiprobable random policy
    Q = create_state_action_dictionary(env, policy)  # 2. Empty dictionary of state-action values
    returns = {}  # 3. Lists of observed returns for each state-action pair
    for _ in range(episodes):  # 4. Main loop over episodes
        G = 0  # 5. Cumulative return collected along the episode
        episode = run_game(env=env, policy=policy, display=False)  # 6. Play one episode under the current policy
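        # Sketch (assumption, not the original gist): walk the episode backwards,
        # accumulate the return G and update Q with the average of first-visit returns.
        # run_game is assumed to return a list of [state, action, reward] steps.
        for i in reversed(range(len(episode))):
            s_t, a_t, r_t = episode[i]
            state_action = (s_t, a_t)
            G += r_t
            if state_action not in [(x[0], x[1]) for x in episode[:i]]:  # first visit only
                returns.setdefault(state_action, []).append(G)
                Q[s_t][a_t] = sum(returns[state_action]) / len(returns[state_action])
                # epsilon-soft improvement: mostly greedy w.r.t. Q, epsilon spread over all actions
                best_a = max(Q[s_t], key=Q[s_t].get)
                n_actions = len(policy[s_t])
                for a in policy[s_t]:
                    policy[s_t][a] = 1 - epsilon + epsilon / n_actions if a == best_a else epsilon / n_actions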
def value_iteration(V_s, theta=0.01, discount_rate=0.5):
    value_for_state_map = create_value_for_state_map()  # 1. Transition and reward information for every state-action pair
    delta = 100  # 2. Start with a large delta so the loop runs at least once
    while not delta < theta:  # 3. Sweep until the largest update is smaller than theta
        delta = 0  # 4. Reset the largest observed change for this sweep
        for state in range(1, 15):  # 5. Skip the terminal corner states 0 and 15
            v = V_s[state]  # 6. Remember the old value of the state
            totals = {}  # 7. Value of each action from this state
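            # Sketch (assumption, not the original gist): Bellman optimality backup.
            # value_for_state_map is assumed to map (state_prime, reward, state, action)
            # to a transition probability, matching the columns of p.csv below.
            for (s_prime, reward, s, a), p in value_for_state_map.items():
                if s == state:
                    totals[a] = totals.get(a, 0) + p * (reward + discount_rate * V_s[s_prime])
            best_value = max(totals.values())
            delta = max(delta, abs(v - best_value))
            V_s[state] = best_value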
policy = create_random_policy()
V_s = iterative_policy_evaluation(policy) # {0: 0.0, 1: -1.7, 2: -1.9, 3: -1.9, 4: -1.7, 5: -1.9, 6: -1.9, 7: -1.9, 8: -1.9, 9: -1.9, 10: -1.9, 11: -1.7, 12: -1.9, 13: -1.9, 14: -1.7, 15: 0.0}
policy = create_greedy_policy(V_s)
V_s = iterative_policy_evaluation(policy) # {0: 0.0, 1: -1.0, 2: -1.5, 3: -1.8, 4: -1.0, 5: -1.5, 6: -1.8, 7: -1.5, 8: -1.5, 9: -1.8, 10: -1.5, 11: -1.0, 12: -1.8, 13: -1.5, 14: -1.0, 15: 0.0}
policy = create_greedy_policy(V_s)
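The create_greedy_policy helper is not shown in the preview; a minimal sketch of what it could look like, assuming a create_state_to_state_prime_verbose_map() helper that maps each state and action to its successor, a reward of -1 per move, and the same discount_rate of 0.5 used by iterative_policy_evaluation below:

def create_greedy_policy(V_s, discount_rate=0.5):
    # Sketch (assumption): act greedily with respect to the current value function.
    state_to_state_prime = create_state_to_state_prime_verbose_map()
    policy = {}
    for state in range(16):
        if state in (0, 15):  # terminal states keep zero probability for every action
            policy[state] = {a: 0.0 for a in 'NESW'}
            continue
        action_values = {a: -1 + discount_rate * V_s[state_to_state_prime[state][a]]
                         for a in 'NESW'}
        best_action = max(action_values, key=action_values.get)
        policy[state] = {a: 1.0 if a == best_action else 0.0 for a in 'NESW'}
    return policy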
def iterative_policy_evaluation(policy, theta=0.01, discount_rate=0.5):
    V_s = {i: 0 for i in range(16)}  # 1. Initialize the value of every state to 0
    probability_map = create_probability_map()  # 2. Transition probabilities and rewards for every state-action pair
    delta = 100  # 3. Start with a large delta so the loop runs at least once
    while not delta < theta:  # 4. Sweep until the largest update is smaller than theta
        delta = 0  # 5. Reset the largest observed change for this sweep
        for state in range(16):  # 6. Back up every state
            v = V_s[state]  # 7. Remember the old value of the state
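            # Sketch (assumption, not the original gist): Bellman expectation backup.
            # probability_map is assumed to map (state_prime, reward, state, action)
            # to a transition probability, matching the columns of p.csv below.
            total = 0
            for action, action_prob in policy[state].items():
                for (s_prime, reward, s, a), p in probability_map.items():
                    if s == state and a == action:
                        total += action_prob * p * (reward + discount_rate * V_s[s_prime])
            delta = max(delta, abs(v - total))
            V_s[state] = total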
jknthn / p.csv (Created February 26, 2018 06:30)
state_prime  reward  state  action
0            -1      2      'N'
jknthn / data.txt (Last active February 26, 2018 06:06)
// Random Policy
{0: {'E': 0.0, 'N': 0.0, 'S': 0.0, 'W': 0.0},
1: {'E': 0.25, 'N': 0.25, 'S': 0.25, 'W': 0.25},
2: {'E': 0.25, 'N': 0.25, 'S': 0.25, 'W': 0.25},
...
13: {'E': 0.25, 'N': 0.25, 'S': 0.25, 'W': 0.25},
14: {'E': 0.25, 'N': 0.25, 'S': 0.25, 'W': 0.25},
15: {'E': 0.0, 'N': 0.0, 'S': 0.0, 'W': 0.0}}
// State to State prime
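The state-to-state-prime data itself is cut off in the preview; for a 4x4 grid numbered 0 through 15 row by row, the mapping that agent.py below relies on could be built roughly like this (a sketch under that assumption, not the author's exact helper):

def create_state_to_state_prime_verbose_map():
    # Sketch (assumption): {state: {action: next_state}}, staying in place at the edges.
    mapping = {}
    for state in range(16):
        row, col = divmod(state, 4)
        mapping[state] = {
            'N': state - 4 if row > 0 else state,
            'S': state + 4 if row < 3 else state,
            'W': state - 1 if col > 0 else state,
            'E': state + 1 if col < 3 else state,
        }
    return mapping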
jknthn / agent.py (Last active February 26, 2018 05:30)
from random import randint, random

def agent(policy, starting_position=None):
    l = list(range(16))  # Grid states 0-15
    state_to_state_prime = create_state_to_state_prime_verbose_map()  # {state: {action: next_state}}
    agent_position = randint(1, 14) if starting_position is None else starting_position  # Random non-terminal start
    step_number = 1
    while not (agent_position == 0 or agent_position == 15):  # States 0 and 15 are terminal
        current_policy = policy[agent_position]
        next_move = random()  # Uniform draw used to sample an action from the policy
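        # Sketch (assumption, not the original gist): map the uniform draw onto the
        # policy's action probabilities and move to the corresponding next state.
        lower_bound = 0
        for action, prob in current_policy.items():
            if prob > 0 and lower_bound <= next_move < lower_bound + prob:
                agent_position = state_to_state_prime[agent_position][action]
                break
            lower_bound += prob
        step_number += 1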
-----------------
| X | | | |
-----------------
| | | | |
-----------------
| A | | | |
-----------------
| | | | X |
-----------------
import sys
from math import log, sqrt

class UCB(KBanditSolution):  # KBanditSolution and k (number of arms) are defined elsewhere in the gist
    def count_ucb(self, q, c, step, n):
        if n == 0:
            return sys.maxsize  # Force untried arms to be selected first
        return q + c * sqrt(log(step) / n)  # Value estimate plus upper confidence bound

    def solve(self, c):
        Q = {i: 0 for i in range(k)}  # 1. Value function
        N = {i: 0 for i in range(k)}  # 2. Number of actions, for update rule
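        # Sketch (assumption, not the original gist): main loop, picking the arm with
        # the highest UCB score. self.steps and self.take_action(action) are assumed
        # to come from the KBanditSolution base class.
        for step in range(1, self.steps + 1):
            scores = {a: self.count_ucb(Q[a], c, step, N[a]) for a in range(k)}
            action = max(scores, key=scores.get)
            reward = self.take_action(action)
            N[action] += 1
            Q[action] += (1 / N[action]) * (reward - Q[action])  # Incremental sample average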
import random

class WeightedAverage(KBanditSolution):
    def solve(self, exploration_rate, step_size, initial_value):
        Q = {i: initial_value for i in range(k)}  # 1. Value function, optionally optimistic
        N = {i: 0 for i in range(k)}  # 2. Number of actions, for update rule
        for i in range(self.steps):  # 3. Main loop
            explore = random.uniform(0, 1) < exploration_rate  # 4. Exploration decision
            if explore:
                action = random.randint(0, k - 1)  # 5. Exploration: choosing a random action
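            # Sketch (assumption, not the original gist): exploit otherwise, then update
            # with a constant step size so recent rewards are weighted more heavily.
            # self.take_action(action) is assumed from the KBanditSolution base class.
            else:
                action = max(Q, key=Q.get)  # 6. Exploitation: greedy action
            reward = self.take_action(action)
            N[action] += 1
            Q[action] += step_size * (reward - Q[action])  # 7. Constant step-size update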