DS-Leduc tree cfr
--- Runs Counterfactual Regret Minimization (CFR) to approximately
-- solve a game represented by a complete game tree.
--
-- As this class does full solving from the root of the game with no
-- limited lookahead, it is not used in continual re-solving. It is provided
-- simply for convenience.
-- @classmod tree_cfr
local arguments = require 'Settings.arguments'
local constants = require 'Settings.constants'
local game_settings = require 'Settings.game_settings'
local card_tools = require 'Game.card_tools'
require 'TerminalEquity.terminal_equity'
local TreeCFR = torch.class('TreeCFR')
--- Constructor
function TreeCFR:__init()
--for ease of implementation, we use a small epsilon rather than zero when working with regrets
self.regret_epsilon = 1/1000000000
self._cached_terminal_equities = {}
end
--- Gets an evaluator for player equities at a terminal node.
--
-- Caches the result to minimize creation of @{terminal_equity|TerminalEquity}
-- objects.
-- @param node the terminal node to evaluate
-- @return a @{terminal_equity|TerminalEquity} evaluator for the node
-- @local
function TreeCFR:_get_terminal_equity(node)
local cached = self._cached_terminal_equities[node.board]
if cached == nil then
cached = TerminalEquity()
cached:set_board(node.board)
self._cached_terminal_equities[node.board] = cached
end
return cached
end
--- Recursively walks the tree, applying the CFR algorithm.
-- @param node the current node in the tree
-- @param iter the current iteration number
-- @local
function TreeCFR:cfrs_iter_dfs( node, iter )
assert(node.current_player == constants.players.P1 or node.current_player == constants.players.P2 or node.current_player == constants.players.chance)
local opponent_index = 3 - node.current_player
--dimensions in tensor
local action_dimension = 1
local card_dimension = 2
--compute values using terminal_equity in terminal nodes
if(node.terminal) then
local terminal_equity = self:_get_terminal_equity(node)
local values = node.ranges_absolute:clone():fill(0)
if(node.type == constants.node_types.terminal_fold) then
terminal_equity:tree_node_fold_value(node.ranges_absolute, values, opponent_index)
else
terminal_equity:tree_node_call_value(node.ranges_absolute, values)
end
--multiply by the pot
values = values * node.pot
node.cf_values = values:viewAs(node.ranges_absolute)
else
local actions_count = #node.children
local current_strategy = nil
if node.current_player == constants.players.chance then
current_strategy = node.strategy
else
--we have to compute the current strategy at the beginning of each iteration
--initialize regrets in the first iteration
node.regrets = node.regrets or arguments.Tensor(actions_count, game_settings.card_count):fill(self.regret_epsilon) --[[actions_count x card_count]]
node.possitive_regrets = node.possitive_regrets or arguments.Tensor(actions_count, game_settings.card_count):fill(self.regret_epsilon)
--compute positive regrets so that we can compute the current strategy from them
node.possitive_regrets:copy(node.regrets)
node.possitive_regrets[torch.le(node.possitive_regrets, self.regret_epsilon)] = self.regret_epsilon
--compute the current strategy
local regrets_sum = node.possitive_regrets:sum(action_dimension)
current_strategy = node.possitive_regrets:clone()
current_strategy:cdiv(regrets_sum:expandAs(current_strategy))
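--e.g. for a single private hand with per-action regrets {2, -1, 3}, the clamped positive
--regrets are {2, epsilon, 3}, so the current strategy is approximately {0.4, 0, 0.6}:
--each action is played in proportion to its positive regret (regret matching)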
end
--current cfv [[actions, players, ranges]]
local cf_values_allactions = arguments.Tensor(actions_count, constants.players_count, game_settings.card_count):fill(0)
local children_ranges_absolute = {}
if node.current_player == constants.players.chance then
local ranges_mul_matrix = node.ranges_absolute[1]:repeatTensor(actions_count, 1)
children_ranges_absolute[1] = torch.cmul(current_strategy, ranges_mul_matrix)
ranges_mul_matrix = node.ranges_absolute[2]:repeatTensor(actions_count, 1)
children_ranges_absolute[2] = torch.cmul(current_strategy, ranges_mul_matrix)
else
local ranges_mul_matrix = node.ranges_absolute[node.current_player]:repeatTensor(actions_count, 1)
children_ranges_absolute[node.current_player] = torch.cmul(current_strategy, ranges_mul_matrix)
children_ranges_absolute[opponent_index] = node.ranges_absolute[opponent_index]:repeatTensor(actions_count, 1):clone()
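--the acting player's reach probabilities are scaled by the current strategy, while the
--opponent's reach probabilities are passed to every child unchanged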
end
for i = 1,#node.children do
local child_node = node.children[i]
--set new absolute ranges (after the action) for the child
child_node.ranges_absolute = node.ranges_absolute:clone()
child_node.ranges_absolute[1]:copy(children_ranges_absolute[1][{i}])
child_node.ranges_absolute[2]:copy(children_ranges_absolute[2][{i}])
self:cfrs_iter_dfs(child_node, iter)
cf_values_allactions[i] = child_node.cf_values
end
-- TODO: What is the exact role of cf_values?
-- -- Is this cfvalue (in the on-going iteration) per each possible state in the range?
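-- -- Yes: cf_values is a (players_count x card_count) tensor holding, for each player,
-- -- one counterfactual value per private hand in that player's range.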
node.cf_values = arguments.Tensor(constants.players_count, game_settings.card_count):fill(0)
if node.current_player ~= constants.players.chance then
local strategy_mul_matrix = current_strategy:viewAs(arguments.Tensor(actions_count, game_settings.card_count))
node.cf_values[node.current_player] = torch.cmul(strategy_mul_matrix, cf_values_allactions[{{}, node.current_player, {}}]):sum(1)
-- TODO: Why sum? Why not take the max cfvalue (worst-case opponent)?
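-- -- CFR evaluates the expected value of the current strategy profile, so the acting player's
-- -- value is the strategy-weighted sum over actions; taking the max would correspond to a
-- -- best-response value, which a single CFR iteration does not compute.
-- -- (The plain sum for the opponent works because the acting player's strategy is already
-- -- folded into the children's reach probabilities.)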
node.cf_values[opponent_index] = (cf_values_allactions[{{}, opponent_index, {}}]):sum(1)
else
node.cf_values[1] = (cf_values_allactions[{{}, 1, {}}]):sum(1)
node.cf_values[2] = (cf_values_allactions[{{}, 2, {}}]):sum(1)
end
if node.current_player ~= constants.players.chance then
--computing regrets
-- NOTE: Why reshape?
-- -- To squeeze() the dimension of players.
local current_regrets = cf_values_allactions[{{}, {node.current_player}, {}}]:reshape(actions_count, game_settings.card_count):clone()
-- TODO: Why subtract?
-- -- Because the regret of an action (current_regrets) is the difference between that pure action's
-- -- cfvalue (cf_values_allactions) and the cfvalue of current_player's current mixed strategy (node.cf_values).
-- NOTE: Why view()?
-- -- To squeeze() the dimension of players.
-- NOTE: Why expandAs(), why replicate the cfvalues actions_count times?
-- -- Because node.cf_values has to be replicated actions_count times so it can be subtracted
-- -- elementwise from the per-action cfvalues (see "Why subtract?" above).
current_regrets:csub(node.cf_values[node.current_player]:view(1, game_settings.card_count):expandAs(current_regrets))
self:update_regrets(node, current_regrets)
--accumulating average strategy
self:update_average_strategy(node, current_strategy, iter)
end
end
end
--- Update a node's total regrets with the current iteration regrets.
-- @param node the node to update
-- @param current_regrets the regrets from the current iteration of CFR
-- @local
function TreeCFR:update_regrets(node, current_regrets)
node.regrets:add(current_regrets)
node.regrets[torch.le(node.regrets, self.regret_epsilon)] = self.regret_epsilon
end
--- Update a node's average strategy with the current iteration strategy.
-- @param node the node to update
-- @param current_strategy the CFR strategy for the current iteration
-- @param iter the iteration number of the current CFR iteration
function TreeCFR:update_average_strategy(node, current_strategy, iter)
if iter > arguments.cfr_skip_iters then
local actions_count = #node.children
node.strategy = node.strategy or arguments.Tensor(actions_count, game_settings.card_count):fill(0)
-- TODO: What is the exact role of iter_weight_contribution?
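-- -- It is the acting player's reach probability for each private hand on this iteration
-- -- (clamped away from zero below); CFR weights each iteration's strategy by this reach
-- -- probability when accumulating the average strategy.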
local iter_weight_contribution = node.ranges_absolute[node.current_player]:clone()
iter_weight_contribution[torch.le(iter_weight_contribution, 0)] = self.regret_epsilon
node.iter_weight_sum = node.iter_weight_sum or arguments.Tensor(game_settings.card_count):fill(0)
-- TODO: Does iter_weight_sum keep track of the sum of all weights accumulated so far?
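-- -- Yes: it accumulates the per-hand reach weights over all averaged iterations, so iter_weight
-- -- below is this iteration's share of the total weight and node.strategy remains a normalized
-- -- running average.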
node.iter_weight_sum:add(iter_weight_contribution)
local iter_weight = torch.cdiv(iter_weight_contribution, node.iter_weight_sum)
-- TODO:
-- -- new_strategy <- ( old_strategy_scale .* old_strategy) + strategy_addition
-- == ( old_strategy_scale .* old_strategy) + (expanded_weight .* current_strategy)
-- == ((-expanded_weight + 1) .* old_strategy) + (expanded_weight .* current_strategy)
-- == old_strategy + (expanded_weight .* (current_strategy - old_strategy))
local expanded_weight = iter_weight:view(1, game_settings.card_count):expandAs(node.strategy)
local old_strategy_scale = expanded_weight * (-1) + 1 --same as 1 - expanded weight
node.strategy:cmul(old_strategy_scale)
local strategy_addition = current_strategy:cmul(expanded_weight)
node.strategy:add(strategy_addition)
end
end
--- Run CFR to solve the given game tree.
-- @param root the root node of the tree to solve.
-- @param starting_ranges probability vectors over player private hands
-- at the root node
-- @param[opt] iter_count the number of iterations to run CFR for
-- (default @{arguments.cfr_iters})
function TreeCFR:run_cfr( root, starting_ranges, iter_count )
assert(starting_ranges)
local iter_count = iter_count or arguments.cfr_iters
root.ranges_absolute = starting_ranges
for iter = 1,iter_count do
self:cfrs_iter_dfs(root, iter)
end
end
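--- Example usage: a rough sketch modelled on the DeepStack-Leduc test script; it assumes
-- PokerTreeBuilder from 'Tree.tree_builder', card_to_string from 'Game.card_to_string_conversion',
-- and card_tools:get_uniform_range are available as in that repository.
--
--   local card_to_string = require 'Game.card_to_string_conversion'
--   require 'Tree.tree_builder'
--   local builder = PokerTreeBuilder()
--   local params = {root_node = {}}
--   params.root_node.board = card_to_string:string_to_board('')
--   params.root_node.street = 1
--   params.root_node.current_player = constants.players.P1
--   params.root_node.bets = arguments.Tensor{100, 100}
--   local tree = builder:build_tree(params)
--   local starting_ranges = arguments.Tensor(constants.players_count, game_settings.card_count)
--   starting_ranges[1]:copy(card_tools:get_uniform_range(params.root_node.board))
--   starting_ranges[2]:copy(card_tools:get_uniform_range(params.root_node.board))
--   local tree_cfr = TreeCFR()
--   tree_cfr:run_cfr(tree, starting_ranges)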