Skip to content

Instantly share code, notes, and snippets.

@luxbock

luxbock/RPSTrainer.java Secret

Last active Jul 10, 2017
Embed
What would you like to do?
(ns cfr.rps
(:require [clojure.core.matrix :as m]))
(set! *warn-on-reflection* true)
(defn create-utility-fn
"Given a payoff-matrix creates a utility function for the game. The utility
function accepts two strategy vectors as its arguments and returns the utility
for the first player in question."
[m]
(fn [sa sb]
(let [prob-m
(m/compute-matrix
(map count [sa sb])
#(* (nth sa %1) (nth sb %2)))]
(m/ereduce + (m/mul prob-m m)))))
(defn regret
"Given a utility function and three strategy vectors, returns the regret for
player having played his strategy `sa' instead of `sb' against his opponents `so'"
[uf sa sb so]
(- (uf sb so) (uf sa so)))
(defn action-profile [ n]
"An action profile is the list of pure strategies available to a player."
(map #(m/mset (repeat n 0) % 1) (range n)))
(defn regret-profile
"Given a utility function and strategies for both players, this function
returns the regret for all the pure-strategies the first player could have
played, including the strategy he did play."
[uf sa so]
(map #(regret uf sa % so) (action-profile (count sa))))
(defn normalise-strategies
[nsum strat]
(if (pos? nsum)
(map #(/ % nsum) strat)
(repeat (count strat) (/ (count strat)))))
(defn new-strategy
"Creates a new strategy based on the regrets experienced by the player."
[rgr]
(let [n (count rgr)
strat (map #(if (pos? (nth rgr %)) (nth rgr %) 0) (range n))
nsum (apply + strat)]
(normalise-strategies nsum strat)))
(defn cumulative-probabilities
"Takes a collection of probabilities (that sum up to one) and turns it into a
sequence of cumulative probabilities."
[coll]
(reduce #(conj %1 (+ %2 (last %1))) [0] coll))
(defn choose-action
"Given a strategy vector, chooses an action to play based on its probability."
[strat]
(let [cp (cumulative-probabilities strat)
r (rand)
index (dec (count (take-while #(> r %) cp)))]
(m/mset (repeat (count strat) 0) index 1)))
(defn avarage-strategy
"Given a vector where each index maps to how often a certain strategy has been
played, returns the frequency of each strategy as a part of the total."
[ssum]
(let [nsum (reduce + ssum)]
(normalise-strategies nsum ssum)))
(defn cfrm-br
"Given a utility function, number of iterations and a strategy for the
opponent, performs the Counterfactual Regret Minimization algorithm to find
the best response to the strategy in question."
[uf n sb]
(loop [i 0
reg-a [0 0 0]
ssum [0 0 0]]
(if (= i n)
(avarage-strategy ssum)
(let [strat-a (choose-action (new-strategy reg-a))
strat-b sb]
(recur (inc i)
(m/add reg-a (regret-profile uf strat-a strat-b))
(m/add ssum strat-a))))))
(defn cfrm-ne
"Given a utility function and a number of iterations to perform, performs the
Counterfactual Regret Minimization algorithm to find an approximation of the
Nash Equilibrium for the game."
[uf n]
(loop [i (int 0)
reg-a [0 0 0]
reg-b [0 0 0]
ssum [0 0 0]]
(if (= i n)
(avarage-strategy ssum)
(let [strat-a (choose-action (new-strategy reg-a))
strat-b (choose-action (new-strategy reg-b))]
(recur (inc i)
(m/add reg-a (regret-profile uf strat-a strat-b))
(m/add reg-b (regret-profile uf strat-b strat-a))
(m/add ssum strat-a))))))
(comment
(def rps
(create-utility-fn [[0, -1, 1]
[1, 0, -1]
[-1, 1, 0]]))
(cfrm-ne rps 100000)
)
(ns cfr.rps-tweak
(:require [clojure.core.matrix :as m]
[primitive-math :as pm]))
(set! *warn-on-reflection* true)
(m/set-current-implementation :vectorz)
(defn create-utility-fn
"Given a payoff-matrix creates a utility function for the game. The utility
function accepts two strategy vectors as its arguments and returns the utility
for the first player in question."
[m]
(fn ^double [sa sb]
(let [prob-m
(m/compute-matrix
(map m/ecount [sa sb])
#(pm/* ^double (m/mget sa %1) ^double (m/mget sb %2)))]
(m/ereduce + (m/mul prob-m m)))))
(defn regret
"Given a utility function and three strategy vectors, returns the regret for
player having played his strategy `sa' instead of `sb' against his opponents `so'"
[uf sa sb so]
(pm/- ^double (uf sb so) ^double (uf sa so)))
(defn action-profile [n]
"An action profile is the list of pure strategies available to a player."
(map #(m/mset (repeat n 0) % 1) (range n)))
(defn regret-profile
"Given a utility function and strategies for both players, this function
returns the regret for all the pure-strategies the first player could have
played, including the strategy he did play."
[uf sa so]
(map #(regret uf sa % so) (action-profile (m/ecount sa))))
(defn normalise-strategies
[nsum strat]
(if (pm/> ^double nsum 0.0)
(map #(pm/div ^double % ^double nsum) strat)
(repeat (m/ecount strat) (pm/div (m/ecount strat)))))
(defn new-strategy
"Creates a new strategy based on the regrets experienced by the player."
[rgr]
(let [n (m/ecount rgr)
strat (map #(if (pos? (m/mget rgr %)) (m/mget rgr %) 0) (range n))
nsum (reduce + strat)]
(normalise-strategies nsum strat)))
(defn cumulative-probabilities
"Takes a collection of probabilities (that sum up to one) and turns it into a
sequence of cumulative probabilities."
[coll]
(reduce #(conj %1 (+ %2 (last %1))) [0] coll))
(defn choose-action
"Given a strategy vector, chooses an action to play based on its probability."
[^doubles strat]
(let [cp (cumulative-probabilities strat)
r (rand)
index (pm/dec ^long (m/ecount (take-while #(pm/> ^double r ^double %) cp)))]
(m/mset (repeat (m/ecount strat) 0) index 1)))
(defn avarage-strategy
"Given a vector where each index maps to how often a certain strategy has been
played, returns the frequency of each strategy as a part of the total."
[ssum]
(let [nsum (reduce + ssum)]
(normalise-strategies nsum ssum)))
(defn cfrm-be
"Given a utility function, number of iterations and a strategy for the
opponent, performs the Counterfactual Regret Minimization algorithm to find
the best response to the strategy in question."
[uf n sb]
(let [n (int n)]
(loop [i (int 0)
reg-a (m/array [0 0 0])
ssum (m/array [0 0 0])]
(if (pm/== i n)
(avarage-strategy ssum)
(let [strat-a (choose-action (new-strategy reg-a))
strat-b sb]
(recur (pm/inc i)
(m/add reg-a (regret-profile uf strat-a strat-b))
(m/add ssum strat-a)))))))
(defn cfrm-ne
"Given a utility function and a number of iterations to perform, performs the
Counterfactual Regret Minimization algorithm to find an approximation of the
Nash Equilibrium for the game."
[uf n]
(let [n (int n)]
(loop [i (int 0)
reg-a (m/array [0 0 0])
reg-b (m/array [0 0 0])
ssum (m/array [0 0 0])]
(if (pm/== i n)
(avarage-strategy ssum)
(let [strat-a (choose-action (new-strategy reg-a))
strat-b (choose-action (new-strategy reg-b))]
(recur (pm/inc i)
(m/add reg-a (regret-profile uf strat-a strat-b))
(m/add reg-b (regret-profile uf strat-b strat-a))
(m/add ssum strat-a)))))))
(comment
(def rps
(create-utility-fn [[0, -1, 1]
[1, 0, -1]
[-1, 1, 0]]))
(cfrm-ne rps 100000)
)
import java.util.Arrays;
import java.util.Random;
public class RPSTrainer {
public static final int ROCK = 0, PAPER = 1, SCISSORS = 2, NUM_ACTIONS = 3;
public static final Random random = new Random();
double[] regretSum = new double[NUM_ACTIONS],
strategy = new double[NUM_ACTIONS],
strategySum = new double[NUM_ACTIONS],
oppStrategy = { 0.4, 0.3, 0.3 };
private double[] getStrategy() {
double normalizingSum = 0;
for (int a = 0; a < NUM_ACTIONS; a++) {
strategy[a] = regretSum[a] > 0 ? regretSum[a] : 0;
normalizingSum += strategy[a];
}
for (int a = 0; a < NUM_ACTIONS; a++) {
if (normalizingSum > 0)
strategy[a] /= normalizingSum;
else
strategy[a] = 1.0 / NUM_ACTIONS;
strategySum[a] += strategy[a];
}
return strategy;
}
public int getAction(double[] strategy) {
double r = random.nextDouble();
int a = 0;
double cumulativeProbability = 0;
while (a < NUM_ACTIONS - 1) {
cumulativeProbability += strategy[a];
if (r < cumulativeProbability)
break;
a++;
}
return a;
}
public void train(int iterations) {
double[] actionUtility = new double[NUM_ACTIONS];
for (int i = 0; i < iterations; i++) {
double[] strategy = getStrategy();
int myAction = getAction(strategy);
int otherAction = getAction(oppStrategy);
actionUtility[otherAction] = 0;
actionUtility[otherAction == NUM_ACTIONS - 1 ? 0 : otherAction + 1] = 1;
actionUtility[otherAction == 0 ? NUM_ACTIONS - 1 : otherAction - 1] = -1;
for (int a = 0; a < NUM_ACTIONS; a++)
regretSum[a] += actionUtility[a] - actionUtility[myAction];
}
}
public double[] getAverageStrategy() {
double[] avgStrategy = new double[NUM_ACTIONS];
double normalizingSum = 0;
for (int a = 0; a < NUM_ACTIONS; a++)
normalizingSum += strategySum[a];
for (int a = 0; a < NUM_ACTIONS; a++)
if (normalizingSum > 0)
avgStrategy[a] = strategySum[a] / normalizingSum;
else
avgStrategy[a] = 1.0 / NUM_ACTIONS;
return avgStrategy;
}
public static void main(String[] args) {
RPSTrainer trainer = new RPSTrainer();
trainer.train(1000000);
System.out.println(Arrays.toString(trainer.getAverageStrategy()));
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment