A bot that plays blackjack. Three action-selection policies are available: greedy, epsilon-greedy, and softmax. Q-values are updated via temporal difference learning, SARSA, or Q-learning.
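For a quick sense of the three policies, here is a minimal usage sketch (the q-values below are made up for illustration; the functions are defined in the file that follows):

q = {'hit': 1.0, 'hold': 0.0}  # hypothetical q-values for one score
greedy_policy(q)               # -> 'hit'
eps_greedy_policy(q, eps=0.1)  # -> 'hit' ~95% of the time
softmax_policy(q)              # -> 'hit' ~73% of the time, since e / (e + 1) ≈ 0.73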
""" | |
bot that plays blackjack | |
Uros Ogrizovic | |
""" | |
import numpy as np | |
from numpy.random import rand, randint | |
def greedy_policy(q):
    """
    choose the best action
    :param q: object {'hit': val1, 'hold': val2}
    :return: best action
    """
    return 'hit' if q['hit'] >= q['hold'] else 'hold'
def eps_greedy_policy(q, eps=0.1):
    """
    choose a random action with a probability of eps,
    otherwise choose the best action, i.e. greedy
    :param q: object {'hit': val1, 'hold': val2}
    :param eps: exploration/exploitation threshold
    :return: random/best action
    """
    if rand() < eps:
        # choose a random action uniformly
        if randint(0, len(q)) == 0:
            return 'hit'
        return 'hold'
    else:
        return greedy_policy(q)
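# e.g. with eps=0.1 the greedy action is chosen with probability
# 0.9 + 0.1 / 2 = 0.95, because the random branch can also pick it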
def softmax_policy(q):
    """
    the probability of choosing an action is proportional to
    the exponential of its q-value
    :param q: object {'hit': val1, 'hold': val2}
    :return: chosen action
    """
    s = np.exp(q['hit']) + np.exp(q['hold'])
    # e.g. {'hit': 0.8, 'hold': 0.2}
    probs_actions = {'hit': np.exp(q['hit']) / s, 'hold': np.exp(q['hold']) / s}
    rand_prob = rand()
    best_action = max(probs_actions, key=probs_actions.get)
    worst_action = min(probs_actions, key=probs_actions.get)
    return best_action if probs_actions[best_action] >= rand_prob else worst_action
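# e.g. q = {'hit': 1.0, 'hold': -1.0} gives probs_actions ≈ {'hit': 0.88, 'hold': 0.12};
# with only two actions, comparing a single uniform draw against the larger
# probability samples this distribution exactly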
def draw_card():
    """
    Cards with a value of 10 are 4 times as likely to be drawn,
    because 10, J, Q, K all have a value of 10.
    An 11 will be converted to a 1 in the hit() function, if
    necessary.
    :return: a card value from the [2, 11] interval
    """
    probabilities = [4 / 52] * 8 + [16 / 52] + [4 / 52]  # [2-9], [10, J, Q, K], ace
    return np.random.choice(np.arange(2, 12), p=probabilities)
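# sanity check: the probabilities sum to 8 * 4/52 + 16/52 + 4/52 = 52/52 = 1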
def hit(cards):
    """
    draw a card and add it to cards if possible; otherwise,
    append a terminal marker (-1 for a bust, 100 for a win)
    :param cards: cards of the current player (either agent or dealer)
    :return: modified cards
    """
    value = draw_card()
    score = sum(cards)
    if score + value == 21:  # current player wins
        cards.append(100)
    elif score + value > 21:
        if value == 11:
            # treat the drawn ace as 1 instead of as 11
            cards.append(1)
            if sum(cards) == 21:  # current player wins
                cards.append(100)
        elif 11 in cards:
            # demote a previously drawn ace from 11 to 1
            cards[cards.index(11)] = 1
            cards.append(value)
            if sum(cards) == 21:  # current player wins
                cards.append(100)
        else:  # no ace to soften the hand - current player busts and loses
            cards.append(-1)
    else:
        cards.append(value)
    return cards
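# e.g. hit([10, 9]) that draws a 2 yields [10, 9, 100]: the total hits 21,
# and only the terminal marker is appended, not the winning card itself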
def dealer(dealer_cards, dealer_score):
    """
    hit while score < 17, otherwise hold
    :param dealer_cards: dealer's cards so far
    :param dealer_score: dealer's current score
    :return: (modified cards, 0 (i.e. hold, once score >= 17) or a
              terminal marker (-1 or 100) if the game ended on a hit)
    """
    value = 0
    while dealer_score < 17:
        dealer_cards = hit(dealer_cards)
        value = dealer_cards[-1]
        if value == -1 or value == 100:  # game is over, exit while
            break
        dealer_score = sum(dealer_cards)  # recompute, so the dealer holds at 17+
        value = 0  # no terminal card drawn; keep hitting or hold
    return dealer_cards, value
def blackjack_bot(player_cards, q_values, player_score, policy_name):
    """
    employ a policy to decide between hit and hold
    :param player_cards: player's cards
    :param q_values: hit and hold values for each score (e.g. {12: {'hit': val1, 'hold': val2}, ...})
    :param player_score: player's current score
    :param policy_name: which policy to adopt
    :return: (modified cards, 0 for hold, or the last appended card/terminal marker for hit)
    """
    value = 0  # hold value by default
    action = get_action_by_policy_name(q_values, player_score, policy_name)
    if action == 'hit':
        player_cards = hit(player_cards)
        value = player_cards[-1]
    return player_cards, value
def tdl(q, lmbda, r, states_actions):
    """
    temporal difference learning, applied once per game:
    q(s, a) = q(s, a) + lmbda * (r - q(s, a))
    :param q: all q-values
    :param lmbda: lambda, the step size (learning rate)
    :param r: reward
    :param states_actions: state-action pairs from the last game
    :return: updated q-values
    """
    for state, action in states_actions.items():
        q[state][action] = q[state][action] + lmbda * (r - q[state][action])
    return q
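# e.g. q[s][a] = 0.5, lmbda = 0.1, r = 1  ->  0.5 + 0.1 * (1 - 0.5) = 0.55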
def sarsa_q_learning(q, s, a, alpha, r, gamma, s_prime, is_sarsa=True):
    """
    SARSA - on-policy temporal difference algorithm, run after each move:
    q(s, a) = q(s, a) + alpha * (r + gamma * q(s', a') - q(s, a))
    Q-learning is similar; the only difference is that SARSA employs the
    given policy to pick the new action a', whereas Q-learning always
    picks a' greedily:
    q(s, a) = q(s, a) + alpha * (r + gamma * max_a'(q(s', a')) - q(s, a))
    :param q: all q-values
    :param s: old state
    :param a: old action
    :param alpha: learning rate
    :param r: reward
    :param gamma: discount factor
    :param s_prime: new state
    :param is_sarsa: True (SARSA) or False (Q-learning)
    :return: updated q-values
    """
    new_q_value = 0  # all terminal states are worth zero
    if r == 0 and s_prime in q:  # non-terminal transition
        if is_sarsa:
            # policy_name is read from the script's module-level scope
            a_prime = get_action_by_policy_name(q, s_prime, policy_name)
        else:
            a_prime = greedy_policy(q[s_prime])  # Q-learning: greedy w.r.t. s'
        new_q_value = q[s_prime][a_prime]
    q[s][a] = q[s][a] + alpha * (r + gamma * new_q_value - q[s][a])
    return q
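# e.g. q[s][a] = 0.2, alpha = 0.1, r = 0, gamma = 0.1, q[s'][a'] = 0.5
#   ->  0.2 + 0.1 * (0 + 0.1 * 0.5 - 0.2) = 0.185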
def get_action_by_policy_name(q_values, player_score, policy_name):
    """
    get an action for the given score under a specific policy
    :param q_values: all q-values
    :param player_score: current player score
    :param policy_name: the name of the policy to be adopted
    :return: chosen action ('hit' or 'hold')
    """
    if player_score >= 21:
        return 'hold'
    if policy_name == 'greedy':
        action = greedy_policy(q_values[player_score])
    elif policy_name == 'eps_greedy':
        action = eps_greedy_policy(q_values[player_score])
    elif policy_name == 'softmax':
        action = softmax_policy(q_values[player_score])
    else:
        raise ValueError("Invalid policy name")
    return action
if __name__ == "__main__":
    policy_name = 'greedy'  # 'greedy', 'eps_greedy' or 'softmax'
    num_games = 0
    lmbda = 0.1  # step size for tdl, value from (0, 1)
    alpha = 0.1  # learning rate for sarsa/q-learning, value from (0, 1)
    gamma = 0.1  # discount factor for sarsa/q-learning, value from (0, 1)
    # hit and hold values for each score (e.g. {12: {'hit': val1, 'hold': val2}, ...})
    q_values = {i: {'hit': 0, 'hold': 0} for i in range(4, 21)}
    is_tdl_update = False  # True (update via tdl) or False (update via sarsa_q_learning)
    is_sarsa = True  # True (SARSA) or False (Q-learning)
    # 1 - agent's wins, -1 - dealer's wins
    wins_losses = {1: 0, -1: 0}
    if is_tdl_update:
        print('Method for updating q-values: TDL')
    elif is_sarsa:
        print('Method for updating q-values: SARSA')
    else:
        print('Method for updating q-values: Q-learning')
    # outer loop over all games
    while num_games < 10000:
        num_games += 1
        player_cards = [draw_card(), draw_card()]
        dealer_cards = [draw_card(), draw_card()]
        player_turn = True
        # {12: 'hit', 7: 'hit', 18: 'hold', ...} - used to update q_values
        # via TDL after each game
        states_actions = {}
        reward = 0
        if sum(player_cards) >= 21:  # dealt 21 (or two aces), move on to the next game
            continue
        # inner loop for one game
        while reward == 0:
            if player_turn:
                while True:  # player can keep hitting
                    player_score = sum(player_cards)
                    player_cards, val = blackjack_bot(player_cards, q_values, player_score, policy_name)
                    if val == 0:  # hold
                        states_actions[player_score] = 'hold'
                        if not is_tdl_update:  # if not tdl, update after every move
                            q_values = sarsa_q_learning(q_values, player_score, 'hold', alpha, reward,
                                                        gamma, sum(player_cards), is_sarsa)
                        break
                    else:  # hit
                        states_actions[player_score] = 'hit'
                        if val == -1 or val == 100:  # player loses or player wins
                            reward = 1 if val == 100 else -1
                            if not is_tdl_update:  # if not tdl, update after every move
                                q_values = sarsa_q_learning(q_values, player_score, 'hit', alpha, reward,
                                                            gamma, sum(player_cards), is_sarsa)
                            break
                        if not is_tdl_update:  # if not tdl, update after every move
                            q_values = sarsa_q_learning(q_values, player_score, 'hit', alpha, reward,
                                                        gamma, sum(player_cards), is_sarsa)
                player_turn = not player_turn
            else:
                dealer_score = sum(dealer_cards)
                dealer_cards, val = dealer(dealer_cards, dealer_score)
                if val == 0:  # hold - the game is over
                    player_score = sum(player_cards)
                    dealer_score = sum(dealer_cards)
                    if player_score != dealer_score:  # either player or dealer wins
                        reward = 1 if player_score > dealer_score else -1
                    else:  # tie, don't update q-values
                        states_actions = {}
                    break  # the game is over either way, so break
                else:  # hit ended the game
                    if val == -1 or val == 100:  # dealer busts or dealer wins
                        reward = -1 if val == 100 else 1
                        break
        if len(states_actions) != 0:  # if not a tie
            if reward != 0:
                wins_losses[reward] += 1
            if num_games == 1000:
                lmbda = 0.001  # reduce lmbda, i.e. slow down the learning process
            if is_tdl_update:
                q_values = tdl(q_values, lmbda, reward, states_actions)  # update q_values
    print("Wins-losses:", wins_losses)
    print("Q-values:", q_values)