A bot that plays blackjack. Three policies are available: greedy, epsilon greedy and softmax.
"""
bot that plays blackjack
Uros Ogrizovic
"""
import numpy as np
from numpy.random import rand, randint


def greedy_policy(q):
    """
    choose the best action
    :param q: object {'hit': val1, 'hold': val2}
    :return: best action
    """
    return 'hit' if q['hit'] >= q['hold'] else 'hold'


def eps_greedy_policy(q, eps=0.1):
    """
    choose a random action with a probability of eps,
    otherwise choose the best action, i.e. act greedily
    :param q: object {'hit': val1, 'hold': val2}
    :param eps: exploration probability
    :return: random/best action
    """
    if rand() < eps:
        # choose a random action uniformly
        if randint(0, len(q)) == 0:
            return 'hit'
        return 'hold'
    else:
        return greedy_policy(q)
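# Quick sanity check on the exploration rate (assumed numbers, not from the
# original gist): with two actions and eps=0.1, the greedy action is chosen with
# probability 1 - eps + eps/2 = 0.95 and the non-greedy one with eps/2 = 0.05.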


def softmax_policy(q):
    """
    the probability of choosing an action is proportional to the exponential of its q-value
    :param q: object {'hit': val1, 'hold': val2}
    :return: chosen action
    """
    s = np.exp(q['hit']) + np.exp(q['hold'])
    # e.g. {'hit': 0.8, 'hold': 0.2}
    probs_actions = {'hit': np.exp(q['hit']) / s, 'hold': np.exp(q['hold']) / s}
    rand_prob = rand()
    best_action = max(probs_actions, key=probs_actions.get)
    worst_action = min(probs_actions, key=probs_actions.get)
    return best_action if probs_actions[best_action] >= rand_prob else worst_action
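# Worked example with assumed q-values (not from the original gist): for
# q = {'hit': 1.0, 'hold': 0.0}, s = e^1 + e^0 ≈ 3.718, so
# P(hit) = e^1 / s ≈ 0.731 and P(hold) = e^0 / s ≈ 0.269; 'hit' is then
# returned whenever rand() <= 0.731, i.e. roughly 73% of the time.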


def draw_card():
    """
    Cards with a value of 10 are 4 times as likely to be drawn,
    because 10, J, Q, K all have a value of 10.
    An 11 will be converted to a 1 in the hit() function, if
    necessary.
    :return: a card value from the [2, 11] interval
    """
    probabilities = [4 / 52] * 8 + [16 / 52] + [4 / 52]  # [2 - 9], [10, J, Q, K], ace
    return np.random.choice(np.arange(2, 12), p=probabilities)
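# Sanity check (derived from the probabilities above): the ten entries cover the
# values 2..11 and sum to 8 * 4/52 + 16/52 + 4/52 = 52/52 = 1, as
# np.random.choice requires.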


def hit(cards):
    """
    draw a card and add it to cards if necessary; otherwise,
    add a terminal value (either -1 or 100) to cards
    :param cards: cards of current player (either agent or dealer)
    :return: modified cards
    """
    value = draw_card()
    score = sum(cards)
    if score + value == 21:  # current player wins
        cards.append(100)
    elif score + value > 21:
        if value == 11:
            # treat ace as 1 instead of as 11
            cards.append(1)
            if sum(cards) == 21:  # current player wins
                cards.append(100)
        elif 11 in cards:
            cards[cards.index(11)] = 1
            cards.append(value)
            if sum(cards) == 21:  # current player wins
                cards.append(100)
        else:  # current player loses
            cards.append(-1)
    else:
        cards.append(value)
    return cards
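# Illustration of the ace handling (assumed hand, not from the original gist):
# with cards = [11, 8] (score 19) and a drawn 7, 19 + 7 > 21, so the ace already
# in the hand is demoted to 1 and the 7 is appended, giving [1, 8, 7] = 16;
# the hand stays live instead of busting.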


def dealer(dealer_cards, dealer_score):
    """
    hit while score < 17, otherwise hold
    :param dealer_cards: dealer's cards
    :param dealer_score: dealer's current score
    :return: modified cards and the last value: 0 (i.e. hold, if score >= 17),
             a terminal marker (-1 or 100), or the last drawn card's value
    """
    value = 0
    while dealer_score < 17:
        dealer_cards = hit(dealer_cards)
        value = dealer_cards[-1]
        if value == -1 or value == 100:  # game is over, exit while
            break
        dealer_score = sum(dealer_cards)  # update running score so the dealer holds at 17+
    return dealer_cards, value


def blackjack_bot(player_cards, q_values, player_score, policy_name):
    """
    employ a policy to decide between hit and hold
    :param player_cards: player's cards
    :param q_values: hit and hold values for each score (e.g. {12: {'hit': val1, 'hold': val2}, ...})
    :param player_score: player's current score
    :param policy_name: which policy to adopt
    :return: modified cards and the last value (0 means hold)
    """
    value = 0  # hold value by default
    action = get_action_by_policy_name(q_values, sum(player_cards), policy_name)
    if action == 'hit':
        player_cards = hit(player_cards)
        value = player_cards[-1]
    return player_cards, value


def tdl(q, lmbda, r, states_actions):
    """
    temporal difference learning
    q(s, a) = q(s, a) + lmbda * (r - q(s, a))
    :param q: all q-values
    :param lmbda: lambda, step size (learning rate)
    :param r: reward
    :param states_actions: state-action pairs from the last game
    :return: updated q-values
    """
    for state, action in states_actions.items():
        q[state][action] = q[state][action] + lmbda * (r - q[state][action])
    return q
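# Worked update with assumed numbers (not from the original gist): if
# q[18]['hit'] = 0.5, lmbda = 0.1 and the game ended with r = 1, the new value is
# 0.5 + 0.1 * (1 - 0.5) = 0.55, i.e. the estimate moves a fraction lmbda of the
# way towards the observed return.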


def sarsa_q_learning(q, s, a, alpha, r, gamma, s_prime, is_sarsa=True):
    """
    SARSA - on-policy temporal difference algorithm
    q(s, a) = q(s, a) + alpha * (r + gamma * q(s', a') - q(s, a))
    Q-learning - similar to SARSA; the only difference is that
    SARSA employs the given policy to pick the next action (a_prime),
    whereas Q-learning always uses the greedy policy for that
    same task.
    q(s, a) = q(s, a) + alpha * (r + gamma * max over a' of q(s', a') - q(s, a))
    it runs after each move
    :param q: all q-values
    :param s: old state
    :param a: old action
    :param alpha: learning rate
    :param r: reward
    :param gamma: discount factor
    :param s_prime: new state
    :param is_sarsa: True (SARSA) or False (Q-learning)
    :return: updated q-values
    """
    new_q_value = 0  # all terminal states are worth zero
    if s_prime < 21:
        if is_sarsa:
            # SARSA: pick a' with the behaviour policy (policy_name is read from the module level)
            a_prime = get_action_by_policy_name(q, s_prime, policy_name)
        else:
            # Q-learning: always evaluate the new state greedily
            a_prime = greedy_policy(q[s_prime])
        new_q_value = q[s_prime][a_prime]
    q[s][a] = q[s][a] + alpha * (r + gamma * new_q_value - q[s][a])
    return q
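# Worked update with assumed numbers (not from the original gist): with
# q[15]['hit'] = 0.2, q[19] = {'hit': -0.4, 'hold': 0.3}, alpha = 0.1, gamma = 0.1
# and r = 0, Q-learning bootstraps from max(q[19]) = 0.3 and gives
# 0.2 + 0.1 * (0 + 0.1 * 0.3 - 0.2) = 0.183; SARSA would instead use q[19][a']
# for whichever a' the behaviour policy actually picks.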


def get_action_by_policy_name(q_values, player_score, policy_name):
    """
    get an action for the current score under a specific policy
    :param q_values: all q-values
    :param player_score: current player score
    :param policy_name: the name of the policy to be adopted
    :return: chosen action ('hit' or 'hold')
    """
    if player_score >= 21:
        return 'hold'
    if policy_name == 'greedy':
        action = greedy_policy(q_values[player_score])
    elif policy_name == 'eps_greedy':
        action = eps_greedy_policy(q_values[player_score])
    elif policy_name == 'softmax':
        action = softmax_policy(q_values[player_score])
    else:
        raise ValueError("Invalid policy name")
    return action


if __name__ == "__main__":
    policy_name = 'greedy'  # 'greedy', 'eps_greedy' or 'softmax'
    num_games = 0
    lmbda = 0.1  # for tdl, value from (0, 1)
    alpha = 0.1  # learning rate for sarsa/q-learning, value from (0, 1)
    gamma = 0.1  # discount factor for sarsa/q-learning, value from (0, 1)
    # hit and hold values for each score (e.g. {12: {'hit': val1, 'hold': val2}, ...})
    q_values = {i: {'hit': 0, 'hold': 0} for i in range(4, 21)}
    is_tdl_update = False  # True (will update via tdl) or False (will update via sarsa_q_learning)
    is_sarsa = True  # True (SARSA) or False (Q-learning)
    # 1 - agent's wins, -1 - dealer's wins
    wins_losses = {1: 0, -1: 0}
    if is_tdl_update:
        print('Method for updating q-values: TDL')
    elif is_sarsa:
        print('Method for updating q-values: SARSA')
    else:
        print('Method for updating q-values: Q-learning')
    # outer loop for all games
    while num_games < 10000:
        num_games += 1
        player_cards = []
        dealer_cards = []
        player_cards.append(draw_card())
        player_cards.append(draw_card())
        dealer_cards.append(draw_card())
        dealer_cards.append(draw_card())
        player_turn = True
        # {12: 'hit', 7: 'hit', 18: 'hold', ...} - used to update q_values
        # via TDL after each game
        states_actions = {}
        reward = 0
        if sum(player_cards) >= 21:  # got lucky, just move on to the next game
            continue
        # inner loop for one game
        while reward == 0:
            if player_turn:
                while True:  # player can keep hitting
                    player_score = sum(player_cards)
                    player_cards, val = blackjack_bot(player_cards, q_values, player_score, policy_name)
                    if val == 0:  # hold
                        states_actions[player_score] = 'hold'
                        if not is_tdl_update:  # if not tdl, update after every move
                            q_values = sarsa_q_learning(q_values, player_score, 'hold', alpha, reward, gamma,
                                                        sum(player_cards), is_sarsa)
                        break
                    else:  # hit
                        states_actions[player_score] = 'hit'
                        if val == -1 or val == 100:  # player loses or player wins
                            reward = 1 if val == 100 else -1
                            if not is_tdl_update:  # if not tdl, update after every move
                                q_values = sarsa_q_learning(q_values, player_score, 'hit', alpha, reward, gamma,
                                                            sum(player_cards), is_sarsa)
                            break
                        if not is_tdl_update:  # if not tdl, update after every move
                            q_values = sarsa_q_learning(q_values, player_score, 'hit', alpha, reward, gamma,
                                                        sum(player_cards), is_sarsa)
                player_turn = not player_turn
            else:
                dealer_score = sum(dealer_cards)
                dealer_cards, val = dealer(dealer_cards, dealer_score)
                if val == 0:  # hold - the game is over
                    player_score = sum(player_cards)
                    dealer_score = sum(dealer_cards)
                    if player_score != dealer_score:  # if player wins or dealer wins
                        reward = 1 if player_score > dealer_score else -1
                    else:  # tie, don't update q-values
                        states_actions = {}
                    break  # the game is over, so break
                else:  # hit
                    if val == -1 or val == 100:  # if dealer wins or player wins
                        reward = -1 if val == 100 else 1
                        break
        if len(states_actions) != 0:  # if not tie
            if reward != 0:
                wins_losses[reward] += 1
            if num_games == 1000:
                lmbda = 0.001  # reduce value of lmbda, i.e. slow down the learning process
            if is_tdl_update:
                q_values = tdl(q_values, lmbda, reward, states_actions)  # update q_values
    print("Wins-losses:", wins_losses)
    print("Q-values:", q_values)