luxbock/RPSTrainer.java Secret

## rps.clj
(ns cfr.rps
  (:require [clojure.core.matrix :as m]))

(set! *warn-on-reflection* true)

(defn create-utility-fn
  "Given a payoff-matrix creates a utility function for the game. The utility
  function accepts two strategy vectors as its arguments and returns the utility
  for the first player in question."
  [m]
  (fn [sa sb]
    (let [prob-m
          (m/compute-matrix
           (map count [sa sb])
           #(* (nth sa %1) (nth sb %2)))]
      (m/ereduce + (m/mul prob-m m)))))

(defn regret
  "Given a utility function and three strategy vectors, returns the regret for
  player having played his strategy `sa' instead of `sb' against his opponents `so'"
  [uf sa sb so]
  (- (uf sb so) (uf sa so)))

(defn action-profile [ n]
  "An action profile is the list of pure strategies available to a player."
  (map #(m/mset (repeat n 0) % 1) (range n)))

(defn regret-profile
  "Given a utility function and strategies for both players, this function
  returns the regret for all the pure-strategies the first player could have
  played, including the strategy he did play."
  [uf sa so]
  (map #(regret uf sa % so) (action-profile (count sa))))

(defn normalise-strategies
  [nsum strat]
  (if (pos? nsum)
    (map #(/ % nsum) strat)
    (repeat (count strat) (/ (count strat)))))

(defn new-strategy
  "Creates a new strategy based on the regrets experienced by the player."
  [rgr]
  (let [n     (count rgr)
        strat (map #(if (pos? (nth rgr %)) (nth rgr %) 0) (range n))
        nsum  (apply + strat)]
    (normalise-strategies nsum strat)))

(defn cumulative-probabilities
  "Takes a collection of probabilities (that sum up to one) and turns it into a
  sequence of cumulative probabilities."
  [coll]
  (reduce #(conj %1 (+ %2 (last %1))) [0] coll))

(defn choose-action
  "Given a strategy vector, chooses an action to play based on its probability."
  [strat]
  (let [cp    (cumulative-probabilities strat)
        r     (rand)
        index (dec (count (take-while #(> r %) cp)))]
    (m/mset (repeat (count strat) 0) index 1)))

(defn avarage-strategy
  "Given a vector where each index maps to how often a certain strategy has been
  played, returns the frequency of each strategy as a part of the total."
  [ssum]
  (let [nsum (reduce + ssum)]
    (normalise-strategies nsum ssum)))

(defn cfrm-br
  "Given a utility function, number of iterations and a strategy for the
  opponent, performs the Counterfactual Regret Minimization algorithm to find
  the best response to the strategy in question."
  [uf n sb]
  (loop [i     0
         reg-a [0 0 0]
         ssum  [0 0 0]]
    (if (= i n)
      (avarage-strategy ssum)
      (let [strat-a (choose-action (new-strategy reg-a))
            strat-b sb]
        (recur (inc i)
               (m/add reg-a (regret-profile uf strat-a strat-b))
               (m/add ssum strat-a))))))

(defn cfrm-ne
  "Given a utility function and a number of iterations to perform, performs the
  Counterfactual Regret Minimization algorithm to find an approximation of the
  Nash Equilibrium for the game."
  [uf n]
  (loop [i      (int 0)
         reg-a  [0 0 0]
         reg-b  [0 0 0]
         ssum   [0 0 0]]
    (if (= i n)
      (avarage-strategy ssum)
      (let [strat-a (choose-action (new-strategy reg-a))
            strat-b (choose-action (new-strategy reg-b))]
        (recur (inc i)
               (m/add reg-a (regret-profile uf strat-a strat-b))
               (m/add reg-b (regret-profile uf strat-b strat-a))
               (m/add ssum strat-a))))))

(comment

  (def rps
    (create-utility-fn [[0, -1, 1]
                        [1, 0, -1]
                        [-1, 1, 0]]))

  (cfrm-ne rps 100000)

)

## rps_tweak.clj
(ns cfr.rps-tweak
  (:require [clojure.core.matrix :as m]
            [primitive-math :as pm]))

(set! *warn-on-reflection* true)
(m/set-current-implementation :vectorz)

(defn create-utility-fn
  "Given a payoff-matrix creates a utility function for the game. The utility
  function accepts two strategy vectors as its arguments and returns the utility
  for the first player in question."
  [m]
  (fn ^double [sa sb]
    (let [prob-m
          (m/compute-matrix
           (map m/ecount [sa sb])
           #(pm/* ^double (m/mget sa %1) ^double (m/mget sb %2)))]
      (m/ereduce + (m/mul prob-m m)))))

(defn regret
  "Given a utility function and three strategy vectors, returns the regret for
  player having played his strategy `sa' instead of `sb' against his opponents `so'"
  [uf sa sb so]
  (pm/- ^double (uf sb so) ^double (uf sa so)))

(defn action-profile [n]
  "An action profile is the list of pure strategies available to a player."
  (map #(m/mset (repeat n 0) % 1) (range n)))

(defn regret-profile
  "Given a utility function and strategies for both players, this function
  returns the regret for all the pure-strategies the first player could have
  played, including the strategy he did play."
  [uf sa so]
  (map #(regret uf sa % so) (action-profile (m/ecount sa))))

(defn normalise-strategies
  [nsum strat]
  (if (pm/> ^double nsum 0.0)
    (map #(pm/div ^double % ^double nsum) strat)
    (repeat (m/ecount strat) (pm/div (m/ecount strat)))))

(defn new-strategy
  "Creates a new strategy based on the regrets experienced by the player."
  [rgr]
  (let [n     (m/ecount rgr)
        strat (map #(if (pos? (m/mget rgr %)) (m/mget rgr %) 0) (range n))
        nsum  (reduce + strat)]
    (normalise-strategies nsum strat)))

(defn cumulative-probabilities
  "Takes a collection of probabilities (that sum up to one) and turns it into a
  sequence of cumulative probabilities."
  [coll]
  (reduce #(conj %1 (+ %2 (last %1))) [0] coll))

(defn choose-action
  "Given a strategy vector, chooses an action to play based on its probability."
  [^doubles strat]
  (let [cp    (cumulative-probabilities strat)
        r     (rand)
        index (pm/dec ^long (m/ecount (take-while #(pm/> ^double r ^double %) cp)))]
    (m/mset (repeat (m/ecount strat) 0) index 1)))

(defn avarage-strategy
  "Given a vector where each index maps to how often a certain strategy has been
  played, returns the frequency of each strategy as a part of the total."
  [ssum]
  (let [nsum (reduce + ssum)]
    (normalise-strategies nsum ssum)))

(defn cfrm-be
  "Given a utility function, number of iterations and a strategy for the
  opponent, performs the Counterfactual Regret Minimization algorithm to find
  the best response to the strategy in question."
  [uf n sb]
  (let [n (int n)]
    (loop [i     (int 0)
           reg-a (m/array [0 0 0])
           ssum  (m/array [0 0 0])]
      (if (pm/== i n)
        (avarage-strategy ssum)
        (let [strat-a (choose-action (new-strategy reg-a))
              strat-b sb]
          (recur (pm/inc i)
                 (m/add reg-a (regret-profile uf strat-a strat-b))
                 (m/add ssum strat-a)))))))

(defn cfrm-ne
  "Given a utility function and a number of iterations to perform, performs the
  Counterfactual Regret Minimization algorithm to find an approximation of the
  Nash Equilibrium for the game."
  [uf n]
  (let [n (int n)]
    (loop [i      (int 0)
           reg-a  (m/array [0 0 0])
           reg-b  (m/array [0 0 0])
           ssum   (m/array [0 0 0])]
      (if (pm/== i n)
        (avarage-strategy ssum)
        (let [strat-a (choose-action (new-strategy reg-a))
              strat-b (choose-action (new-strategy reg-b))]
          (recur (pm/inc i)
                 (m/add reg-a (regret-profile uf strat-a strat-b))
                 (m/add reg-b (regret-profile uf strat-b strat-a))
                 (m/add ssum strat-a)))))))

(comment

  (def rps
    (create-utility-fn [[0, -1, 1]
                        [1, 0, -1]
                        [-1, 1, 0]]))

  (cfrm-ne rps 100000)

)

## RPSTrainer.java
import java.util.Arrays;
import java.util.Random;

public class RPSTrainer {

    public static final int ROCK = 0, PAPER = 1, SCISSORS = 2, NUM_ACTIONS = 3;
    public static final Random random = new Random();
    double[] regretSum = new double[NUM_ACTIONS],
             strategy = new double[NUM_ACTIONS],
             strategySum = new double[NUM_ACTIONS],
             oppStrategy = { 0.4, 0.3, 0.3 };
    private double[] getStrategy() {
        double normalizingSum = 0;
        for (int a = 0; a < NUM_ACTIONS; a++) {
            strategy[a] = regretSum[a] > 0 ? regretSum[a] : 0;
            normalizingSum += strategy[a];
        }
        for (int a = 0; a < NUM_ACTIONS; a++) {
            if (normalizingSum > 0)
              strategy[a] /= normalizingSum;
            else
              strategy[a] = 1.0 / NUM_ACTIONS;
            strategySum[a] += strategy[a];
        }
        return strategy;
    }

    public int getAction(double[] strategy) {
        double r = random.nextDouble();
        int a = 0;
        double cumulativeProbability =  0;
        while (a < NUM_ACTIONS - 1) {
            cumulativeProbability += strategy[a];
            if (r < cumulativeProbability)
                break;
            a++;
        }
        return a;
    }

    public void train(int iterations) {
        double[] actionUtility = new double[NUM_ACTIONS];
        for (int i = 0; i < iterations; i++) {
            double[] strategy = getStrategy();
            int myAction = getAction(strategy);
            int otherAction = getAction(oppStrategy);
            actionUtility[otherAction] = 0;
            actionUtility[otherAction == NUM_ACTIONS - 1 ? 0 : otherAction + 1] = 1;
            actionUtility[otherAction == 0 ? NUM_ACTIONS - 1 : otherAction - 1] = -1;
            for (int a = 0; a < NUM_ACTIONS; a++)
                regretSum[a] += actionUtility[a] - actionUtility[myAction];
        }
    }

    public double[] getAverageStrategy() {
        double[] avgStrategy = new double[NUM_ACTIONS];
        double normalizingSum = 0;
        for (int a = 0; a < NUM_ACTIONS; a++)
            normalizingSum += strategySum[a];
        for (int a = 0; a < NUM_ACTIONS; a++)
            if (normalizingSum > 0)
                avgStrategy[a] = strategySum[a] / normalizingSum;
            else
                avgStrategy[a] = 1.0 / NUM_ACTIONS;
        return avgStrategy;
    }
    public static void main(String[] args) {
        RPSTrainer trainer = new RPSTrainer();
        trainer.train(1000000);
        System.out.println(Arrays.toString(trainer.getAverageStrategy()));
    }

}
	(ns cfr.rps
	(:require [clojure.core.matrix :as m]))

	(set! warn-on-reflection true)

	(defn create-utility-fn
	"Given a payoff-matrix creates a utility function for the game. The utility
	function accepts two strategy vectors as its arguments and returns the utility
	for the first player in question."
	[m]
	(fn [sa sb]
	(let [prob-m
	(m/compute-matrix
	(map count [sa sb])
	#(* (nth sa %1) (nth sb %2)))]
	(m/ereduce + (m/mul prob-m m)))))

	(defn regret
	"Given a utility function and three strategy vectors, returns the regret for
	player having played his strategy `sa' instead of `sb' against his opponents `so'"
	[uf sa sb so]
	(- (uf sb so) (uf sa so)))

	(defn action-profile [ n]
	"An action profile is the list of pure strategies available to a player."
	(map #(m/mset (repeat n 0) % 1) (range n)))

	(defn regret-profile
	"Given a utility function and strategies for both players, this function
	returns the regret for all the pure-strategies the first player could have
	played, including the strategy he did play."
	[uf sa so]
	(map #(regret uf sa % so) (action-profile (count sa))))

	(defn normalise-strategies
	[nsum strat]
	(if (pos? nsum)
	(map #(/ % nsum) strat)
	(repeat (count strat) (/ (count strat)))))

	(defn new-strategy
	"Creates a new strategy based on the regrets experienced by the player."
	[rgr]
	(let [n (count rgr)
	strat (map #(if (pos? (nth rgr %)) (nth rgr %) 0) (range n))
	nsum (apply + strat)]
	(normalise-strategies nsum strat)))

	(defn cumulative-probabilities
	"Takes a collection of probabilities (that sum up to one) and turns it into a
	sequence of cumulative probabilities."
	[coll]
	(reduce #(conj %1 (+ %2 (last %1))) [0] coll))

	(defn choose-action
	"Given a strategy vector, chooses an action to play based on its probability."
	[strat]
	(let [cp (cumulative-probabilities strat)
	r (rand)
	index (dec (count (take-while #(> r %) cp)))]
	(m/mset (repeat (count strat) 0) index 1)))

	(defn avarage-strategy
	"Given a vector where each index maps to how often a certain strategy has been
	played, returns the frequency of each strategy as a part of the total."
	[ssum]
	(let [nsum (reduce + ssum)]
	(normalise-strategies nsum ssum)))

	(defn cfrm-br
	"Given a utility function, number of iterations and a strategy for the
	opponent, performs the Counterfactual Regret Minimization algorithm to find
	the best response to the strategy in question."
	[uf n sb]
	(loop [i 0
	reg-a [0 0 0]
	ssum [0 0 0]]
	(if (= i n)
	(avarage-strategy ssum)
	(let [strat-a (choose-action (new-strategy reg-a))
	strat-b sb]
	(recur (inc i)
	(m/add reg-a (regret-profile uf strat-a strat-b))
	(m/add ssum strat-a))))))

	(defn cfrm-ne
	"Given a utility function and a number of iterations to perform, performs the
	Counterfactual Regret Minimization algorithm to find an approximation of the
	Nash Equilibrium for the game."
	[uf n]
	(loop [i (int 0)
	reg-a [0 0 0]
	reg-b [0 0 0]
	ssum [0 0 0]]
	(if (= i n)
	(avarage-strategy ssum)
	(let [strat-a (choose-action (new-strategy reg-a))
	strat-b (choose-action (new-strategy reg-b))]
	(recur (inc i)
	(m/add reg-a (regret-profile uf strat-a strat-b))
	(m/add reg-b (regret-profile uf strat-b strat-a))
	(m/add ssum strat-a))))))

	(comment

	(def rps
	(create-utility-fn [[0, -1, 1]
	[1, 0, -1]
	[-1, 1, 0]]))

	(cfrm-ne rps 100000)

	)
	(ns cfr.rps-tweak
	(:require [clojure.core.matrix :as m]
	[primitive-math :as pm]))

	(set! warn-on-reflection true)
	(m/set-current-implementation :vectorz)

	(defn create-utility-fn
	"Given a payoff-matrix creates a utility function for the game. The utility
	function accepts two strategy vectors as its arguments and returns the utility
	for the first player in question."
	[m]
	(fn ^double [sa sb]
	(let [prob-m
	(m/compute-matrix
	(map m/ecount [sa sb])
	#(pm/* ^double (m/mget sa %1) ^double (m/mget sb %2)))]
	(m/ereduce + (m/mul prob-m m)))))

	(defn regret
	"Given a utility function and three strategy vectors, returns the regret for
	player having played his strategy `sa' instead of `sb' against his opponents `so'"
	[uf sa sb so]
	(pm/- ^double (uf sb so) ^double (uf sa so)))

	(defn action-profile [n]
	"An action profile is the list of pure strategies available to a player."
	(map #(m/mset (repeat n 0) % 1) (range n)))

	(defn regret-profile
	"Given a utility function and strategies for both players, this function
	returns the regret for all the pure-strategies the first player could have
	played, including the strategy he did play."
	[uf sa so]
	(map #(regret uf sa % so) (action-profile (m/ecount sa))))

	(defn normalise-strategies
	[nsum strat]
	(if (pm/> ^double nsum 0.0)
	(map #(pm/div ^double % ^double nsum) strat)
	(repeat (m/ecount strat) (pm/div (m/ecount strat)))))

	(defn new-strategy
	"Creates a new strategy based on the regrets experienced by the player."
	[rgr]
	(let [n (m/ecount rgr)
	strat (map #(if (pos? (m/mget rgr %)) (m/mget rgr %) 0) (range n))
	nsum (reduce + strat)]
	(normalise-strategies nsum strat)))

	(defn cumulative-probabilities
	"Takes a collection of probabilities (that sum up to one) and turns it into a
	sequence of cumulative probabilities."
	[coll]
	(reduce #(conj %1 (+ %2 (last %1))) [0] coll))

	(defn choose-action
	"Given a strategy vector, chooses an action to play based on its probability."
	[^doubles strat]
	(let [cp (cumulative-probabilities strat)
	r (rand)
	index (pm/dec ^long (m/ecount (take-while #(pm/> ^double r ^double %) cp)))]
	(m/mset (repeat (m/ecount strat) 0) index 1)))

	(defn avarage-strategy
	"Given a vector where each index maps to how often a certain strategy has been
	played, returns the frequency of each strategy as a part of the total."
	[ssum]
	(let [nsum (reduce + ssum)]
	(normalise-strategies nsum ssum)))

	(defn cfrm-be
	"Given a utility function, number of iterations and a strategy for the
	opponent, performs the Counterfactual Regret Minimization algorithm to find
	the best response to the strategy in question."
	[uf n sb]
	(let [n (int n)]
	(loop [i (int 0)
	reg-a (m/array [0 0 0])
	ssum (m/array [0 0 0])]
	(if (pm/== i n)
	(avarage-strategy ssum)
	(let [strat-a (choose-action (new-strategy reg-a))
	strat-b sb]
	(recur (pm/inc i)
	(m/add reg-a (regret-profile uf strat-a strat-b))
	(m/add ssum strat-a)))))))

	(defn cfrm-ne
	"Given a utility function and a number of iterations to perform, performs the
	Counterfactual Regret Minimization algorithm to find an approximation of the
	Nash Equilibrium for the game."
	[uf n]
	(let [n (int n)]
	(loop [i (int 0)
	reg-a (m/array [0 0 0])
	reg-b (m/array [0 0 0])
	ssum (m/array [0 0 0])]
	(if (pm/== i n)
	(avarage-strategy ssum)
	(let [strat-a (choose-action (new-strategy reg-a))
	strat-b (choose-action (new-strategy reg-b))]
	(recur (pm/inc i)
	(m/add reg-a (regret-profile uf strat-a strat-b))
	(m/add reg-b (regret-profile uf strat-b strat-a))
	(m/add ssum strat-a)))))))

	(comment

	(def rps
	(create-utility-fn [[0, -1, 1]
	[1, 0, -1]
	[-1, 1, 0]]))

	(cfrm-ne rps 100000)

	)
	import java.util.Arrays;
	import java.util.Random;

	public class RPSTrainer {

	public static final int ROCK = 0, PAPER = 1, SCISSORS = 2, NUM_ACTIONS = 3;
	public static final Random random = new Random();
	double[] regretSum = new double[NUM_ACTIONS],
	strategy = new double[NUM_ACTIONS],
	strategySum = new double[NUM_ACTIONS],
	oppStrategy = { 0.4, 0.3, 0.3 };
	private double[] getStrategy() {
	double normalizingSum = 0;
	for (int a = 0; a < NUM_ACTIONS; a++) {
	strategy[a] = regretSum[a] > 0 ? regretSum[a] : 0;
	normalizingSum += strategy[a];
	}
	for (int a = 0; a < NUM_ACTIONS; a++) {
	if (normalizingSum > 0)
	strategy[a] /= normalizingSum;
	else
	strategy[a] = 1.0 / NUM_ACTIONS;
	strategySum[a] += strategy[a];
	}
	return strategy;
	}

	public int getAction(double[] strategy) {
	double r = random.nextDouble();
	int a = 0;
	double cumulativeProbability = 0;
	while (a < NUM_ACTIONS - 1) {
	cumulativeProbability += strategy[a];
	if (r < cumulativeProbability)
	break;
	a++;
	}
	return a;
	}

	public void train(int iterations) {
	double[] actionUtility = new double[NUM_ACTIONS];
	for (int i = 0; i < iterations; i++) {
	double[] strategy = getStrategy();
	int myAction = getAction(strategy);
	int otherAction = getAction(oppStrategy);
	actionUtility[otherAction] = 0;
	actionUtility[otherAction == NUM_ACTIONS - 1 ? 0 : otherAction + 1] = 1;
	actionUtility[otherAction == 0 ? NUM_ACTIONS - 1 : otherAction - 1] = -1;
	for (int a = 0; a < NUM_ACTIONS; a++)
	regretSum[a] += actionUtility[a] - actionUtility[myAction];
	}
	}

	public double[] getAverageStrategy() {
	double[] avgStrategy = new double[NUM_ACTIONS];
	double normalizingSum = 0;
	for (int a = 0; a < NUM_ACTIONS; a++)
	normalizingSum += strategySum[a];
	for (int a = 0; a < NUM_ACTIONS; a++)
	if (normalizingSum > 0)
	avgStrategy[a] = strategySum[a] / normalizingSum;
	else
	avgStrategy[a] = 1.0 / NUM_ACTIONS;
	return avgStrategy;
	}
	public static void main(String[] args) {
	RPSTrainer trainer = new RPSTrainer();
	trainer.train(1000000);
	System.out.println(Arrays.toString(trainer.getAverageStrategy()));
	}

	}