A bot that plays blackjack. Three action-selection policies are available: greedy, epsilon-greedy, and softmax. Q-values are updated via temporal difference learning, SARSA, or Q-learning.
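For a quick sense of the three policies, here is a minimal usage sketch (the q-values below are made up for illustration; the functions are defined in the file that follows):

q = {'hit': 1.0, 'hold': 0.0}  # hypothetical q-values for one score
greedy_policy(q)               # -> 'hit'
eps_greedy_policy(q, eps=0.1)  # -> 'hit' ~95% of the time
softmax_policy(q)              # -> 'hit' ~73% of the time, since e / (e + 1) ≈ 0.73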
""" | |
bot that plays blackjack | |
Uros Ogrizovic | |
""" | |
import numpy as np | |
from numpy.random import rand, randint | |
def greedy_policy(q):
    """
    choose the best action
    :param q: object {'hit': val1, 'hold': val2}
    :return: best action
    """
    return 'hit' if q['hit'] >= q['hold'] else 'hold'
def eps_greedy_policy(q, eps=0.1):
    """
    choose a random action with a probability of eps,
    otherwise choose the best action, i.e. greedy
    :param q: object {'hit': val1, 'hold': val2}
    :param eps: exploration/exploitation threshold
    :return: random/best action
    """
    if rand() < eps:
        # choose a random action uniformly
        if randint(0, len(q)) == 0:
            return 'hit'
        return 'hold'
    else:
        return greedy_policy(q)
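# e.g. with eps=0.1 the greedy action is chosen with probability
# 0.9 + 0.1 / 2 = 0.95, because the random branch can also pick it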
def softmax_policy(q):
    """
    the probability of choosing an action is proportional to
    the exponential of its q-value
    :param q: object {'hit': val1, 'hold': val2}
    :return: chosen action
    """
    s = np.exp(q['hit']) + np.exp(q['hold'])
    # e.g. {'hit': 0.8, 'hold': 0.2}
    probs_actions = {'hit': np.exp(q['hit']) / s, 'hold': np.exp(q['hold']) / s}
    rand_prob = rand()
    best_action = max(probs_actions, key=probs_actions.get)
    worst_action = min(probs_actions, key=probs_actions.get)
    return best_action if probs_actions[best_action] >= rand_prob else worst_action
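# e.g. q = {'hit': 1.0, 'hold': -1.0} gives probs_actions ≈ {'hit': 0.88, 'hold': 0.12};
# with only two actions, comparing a single uniform draw against the larger
# probability samples this distribution exactly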
def draw_card():
    """
    Cards with a value of 10 are 4 times as likely to be drawn,
    because 10, J, Q, K all have a value of 10.
    An 11 will be converted to a 1 in the hit() function, if
    necessary.
    :return: a card value from the [2, 11] interval
    """
    probabilities = [4 / 52] * 8 + [16 / 52] + [4 / 52]  # [2-9], [10, J, Q, K], ace
    return np.random.choice(np.arange(2, 12), p=probabilities)
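# sanity check: the probabilities sum to 8 * 4/52 + 16/52 + 4/52 = 52/52 = 1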
def hit(cards):
    """
    draw a card and add it to cards if possible; otherwise,
    append a terminal marker (-1 for a bust, 100 for a win)
    :param cards: cards of the current player (either agent or dealer)
    :return: modified cards
    """
    value = draw_card()
    score = sum(cards)
    if score + value == 21:  # current player wins
        cards.append(100)
    elif score + value > 21:
        if value == 11:
            # treat the drawn ace as 1 instead of as 11
            cards.append(1)
            if sum(cards) == 21:  # current player wins
                cards.append(100)
        elif 11 in cards:
            # demote a previously drawn ace from 11 to 1
            cards[cards.index(11)] = 1
            cards.append(value)
            if sum(cards) == 21:  # current player wins
                cards.append(100)
        else:  # no ace to soften the hand - current player busts and loses
            cards.append(-1)
    else:
        cards.append(value)
    return cards
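# e.g. hit([10, 9]) that draws a 2 yields [10, 9, 100]: the total hits 21,
# and only the terminal marker is appended, not the winning card itself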
def dealer(dealer_cards, dealer_score):
    """
    hit while score < 17, otherwise hold
    :param dealer_cards: dealer's cards so far
    :param dealer_score: dealer's current score
    :return: (modified cards, 0 (i.e. hold, once score >= 17) or a
              terminal marker (-1 or 100) if the game ended on a hit)
    """
    value = 0
    while dealer_score < 17:
        dealer_cards = hit(dealer_cards)
        value = dealer_cards[-1]
        if value == -1 or value == 100:  # game is over, exit while
            break
        dealer_score = sum(dealer_cards)  # recompute, so the dealer holds at 17+
        value = 0  # no terminal card drawn; keep hitting or hold
    return dealer_cards, value
def blackjack_bot(player_cards, q_values, player_score, policy_name):
    """
    employ a policy to decide between hit and hold
    :param player_cards: player's cards
    :param q_values: hit and hold values for each score (e.g. {12: {'hit': val1, 'hold': val2}, ...})
    :param player_score: player's current score
    :param policy_name: which policy to adopt
    :return: (modified cards, 0 for hold, or the last appended card/terminal marker for hit)
    """
    value = 0  # hold value by default
    action = get_action_by_policy_name(q_values, player_score, policy_name)
    if action == 'hit':
        player_cards = hit(player_cards)
        value = player_cards[-1]
    return player_cards, value
def tdl(q, lmbda, r, states_actions):
    """
    temporal difference learning, applied once per game:
    q(s, a) = q(s, a) + lmbda * (r - q(s, a))
    :param q: all q-values
    :param lmbda: lambda, the step size (learning rate)
    :param r: reward
    :param states_actions: state-action pairs from the last game
    :return: updated q-values
    """
    for state, action in states_actions.items():
        q[state][action] = q[state][action] + lmbda * (r - q[state][action])
    return q
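# e.g. q[s][a] = 0.5, lmbda = 0.1, r = 1  ->  0.5 + 0.1 * (1 - 0.5) = 0.55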
def sarsa_q_learning(q, s, a, alpha, r, gamma, s_prime, is_sarsa=True):
    """
    SARSA - on-policy temporal difference algorithm, run after each move:
    q(s, a) = q(s, a) + alpha * (r + gamma * q(s', a') - q(s, a))
    Q-learning is similar; the only difference is that SARSA employs the
    given policy to pick the new action a', whereas Q-learning always
    picks a' greedily:
    q(s, a) = q(s, a) + alpha * (r + gamma * max_a'(q(s', a')) - q(s, a))
    :param q: all q-values
    :param s: old state
    :param a: old action
    :param alpha: learning rate
    :param r: reward
    :param gamma: discount factor
    :param s_prime: new state
    :param is_sarsa: True (SARSA) or False (Q-learning)
    :return: updated q-values
    """
    new_q_value = 0  # all terminal states are worth zero
    if r == 0 and s_prime in q:  # non-terminal transition
        if is_sarsa:
            # policy_name is read from the script's module-level scope
            a_prime = get_action_by_policy_name(q, s_prime, policy_name)
        else:
            a_prime = greedy_policy(q[s_prime])  # Q-learning: greedy w.r.t. s'
        new_q_value = q[s_prime][a_prime]
    q[s][a] = q[s][a] + alpha * (r + gamma * new_q_value - q[s][a])
    return q
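# e.g. q[s][a] = 0.2, alpha = 0.1, r = 0, gamma = 0.1, q[s'][a'] = 0.5
#   ->  0.2 + 0.1 * (0 + 0.1 * 0.5 - 0.2) = 0.185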
def get_action_by_policy_name(q_values, player_score, policy_name):
    """
    get an action for the given score under a specific policy
    :param q_values: all q-values
    :param player_score: current player score
    :param policy_name: the name of the policy to be adopted
    :return: chosen action ('hit' or 'hold')
    """
    if player_score >= 21:
        return 'hold'
    if policy_name == 'greedy':
        action = greedy_policy(q_values[player_score])
    elif policy_name == 'eps_greedy':
        action = eps_greedy_policy(q_values[player_score])
    elif policy_name == 'softmax':
        action = softmax_policy(q_values[player_score])
    else:
        raise ValueError("Invalid policy name")
    return action
if __name__ == "__main__":
    policy_name = 'greedy'  # 'greedy', 'eps_greedy' or 'softmax'
    num_games = 0
    lmbda = 0.1  # step size for tdl, value from (0, 1)
    alpha = 0.1  # learning rate for sarsa/q-learning, value from (0, 1)
    gamma = 0.1  # discount factor for sarsa/q-learning, value from (0, 1)
    # hit and hold values for each score (e.g. {12: {'hit': val1, 'hold': val2}, ...})
    q_values = {i: {'hit': 0, 'hold': 0} for i in range(4, 21)}
    is_tdl_update = False  # True (update via tdl) or False (update via sarsa_q_learning)
    is_sarsa = True  # True (SARSA) or False (Q-learning)
    # 1 - agent's wins, -1 - dealer's wins
    wins_losses = {1: 0, -1: 0}
    if is_tdl_update:
        print('Method for updating q-values: TDL')
    elif is_sarsa:
        print('Method for updating q-values: SARSA')
    else:
        print('Method for updating q-values: Q-learning')
    # outer loop over all games
    while num_games < 10000:
        num_games += 1
        player_cards = [draw_card(), draw_card()]
        dealer_cards = [draw_card(), draw_card()]
        player_turn = True
        # {12: 'hit', 7: 'hit', 18: 'hold', ...} - used to update q_values
        # via TDL after each game
        states_actions = {}
        reward = 0
        if sum(player_cards) >= 21:  # dealt 21 (or two aces), move on to the next game
            continue
        # inner loop for one game
        while reward == 0:
            if player_turn:
                while True:  # player can keep hitting
                    player_score = sum(player_cards)
                    player_cards, val = blackjack_bot(player_cards, q_values, player_score, policy_name)
                    if val == 0:  # hold
                        states_actions[player_score] = 'hold'
                        if not is_tdl_update:  # if not tdl, update after every move
                            q_values = sarsa_q_learning(q_values, player_score, 'hold', alpha, reward,
                                                        gamma, sum(player_cards), is_sarsa)
                        break
                    else:  # hit
                        states_actions[player_score] = 'hit'
                        if val == -1 or val == 100:  # player loses or player wins
                            reward = 1 if val == 100 else -1
                            if not is_tdl_update:  # if not tdl, update after every move
                                q_values = sarsa_q_learning(q_values, player_score, 'hit', alpha, reward,
                                                            gamma, sum(player_cards), is_sarsa)
                            break
                        if not is_tdl_update:  # if not tdl, update after every move
                            q_values = sarsa_q_learning(q_values, player_score, 'hit', alpha, reward,
                                                        gamma, sum(player_cards), is_sarsa)
                player_turn = not player_turn
            else:
                dealer_score = sum(dealer_cards)
                dealer_cards, val = dealer(dealer_cards, dealer_score)
                if val == 0:  # hold - the game is over
                    player_score = sum(player_cards)
                    dealer_score = sum(dealer_cards)
                    if player_score != dealer_score:  # either player or dealer wins
                        reward = 1 if player_score > dealer_score else -1
                    else:  # tie, don't update q-values
                        states_actions = {}
                    break  # the game is over either way, so break
                else:  # hit ended the game
                    if val == -1 or val == 100:  # dealer busts or dealer wins
                        reward = -1 if val == 100 else 1
                        break
        if len(states_actions) != 0:  # if not a tie
            if reward != 0:
                wins_losses[reward] += 1
            if num_games == 1000:
                lmbda = 0.001  # reduce lmbda, i.e. slow down the learning process
            if is_tdl_update:
                q_values = tdl(q_values, lmbda, reward, states_actions)  # update q_values
    print("Wins-losses:", wins_losses)
    print("Q-values:", q_values)