(ns cfr.rps
  (:require [clojure.core.matrix :as m]))

(set! *warn-on-reflection* true)

(defn create-utility-fn
  "Given a payoff matrix, creates a utility function for the game. The utility
  function accepts two strategy vectors as its arguments and returns the
  utility for the first player."
  [payoffs]
  (fn [sa sb]
    ;; prob-m holds the probability of each pure-strategy pairing occurring.
    (let [prob-m (m/compute-matrix
                  (map count [sa sb])
                  #(* (nth sa %1) (nth sb %2)))]
      ;; Expected utility: the probability-weighted sum of the payoffs.
      (m/ereduce + (m/mul prob-m payoffs)))))

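;; A quick sanity check of the utility function, using the rock-paper-scissors
;; payoff matrix from the comment block at the bottom of this file (the `rps`
;; name here is illustrative only):
(comment
  (let [rps (create-utility-fn [[0 -1 1]
                                [1 0 -1]
                                [-1 1 0]])]
    ;; Pure rock against pure paper loses one unit of utility for player one.
    (rps [1 0 0] [0 1 0])         ;=> -1
    ;; The uniform strategy has zero expected utility against any opponent.
    (rps [1/3 1/3 1/3] [1 0 0]))) ;=> 0
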
(defn regret
  "Given a utility function and three strategy vectors, returns the regret a
  player has for playing strategy `sa` instead of `sb` against the opponent's
  strategy `so`."
  [uf sa sb so]
  (- (uf sb so) (uf sa so)))

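;; For example (again assuming the illustrative `rps` utility function from
;; the comment block below): having played rock instead of paper against an
;; opponent who plays rock carries a regret of one.
(comment
  (let [rps (create-utility-fn [[0 -1 1]
                                [1 0 -1]
                                [-1 1 0]])]
    (regret rps [1 0 0] [0 1 0] [1 0 0]))) ;=> 1
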
(defn action-profile
  "An action profile is the list of pure strategies available to a player."
  [n]
  (map #(m/mset (vec (repeat n 0)) % 1) (range n)))

(defn regret-profile
  "Given a utility function and strategies for both players, returns the
  regret for every pure strategy the first player could have played,
  including the strategy he did play."
  [uf sa so]
  (map #(regret uf sa % so) (action-profile (count sa))))

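;; A regret profile for having played rock against scissors (a winning move,
;; so no pure strategy is regretted; `rps` as above):
(comment
  (let [rps (create-utility-fn [[0 -1 1]
                                [1 0 -1]
                                [-1 1 0]])]
    (regret-profile rps [1 0 0] [0 0 1]))) ;=> (0 -2 -1)
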
(defn normalise-strategies
  "Normalises `strat` so that it sums to one. Falls back to the uniform
  strategy when the normalising sum is not positive."
  [nsum strat]
  (if (pos? nsum)
    (map #(/ % nsum) strat)
    (repeat (count strat) (/ (count strat)))))

(defn new-strategy
  "Creates a new strategy based on the regrets experienced by the player.
  Negative regrets are clamped to zero (regret matching)."
  [rgr]
  (let [strat (map #(max % 0) rgr)
        nsum (reduce + strat)]
    (normalise-strategies nsum strat)))

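;; Only positive regrets contribute to the next strategy; with no positive
;; regret the player mixes uniformly:
(comment
  (new-strategy [0 -2 3])   ;=> (0 0 1)
  (new-strategy [-1 -1 0])) ;=> (1/3 1/3 1/3)
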
(defn cumulative-probabilities
  "Takes a collection of probabilities (that sum to one) and turns it into a
  sequence of cumulative probabilities, starting at zero."
  [coll]
  (reductions + 0 coll))

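;; For example:
(comment
  (cumulative-probabilities [0.2 0.3 0.5])) ;=> (0 0.2 0.5 1.0)
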
(defn choose-action
  "Given a strategy vector, chooses an action to play based on its probability.
  Returns the chosen action as a pure-strategy vector."
  [strat]
  (let [cp (cumulative-probabilities strat)
        r (rand)
        ;; `max` guards against the (rare) case where `r` is exactly 0.
        index (max 0 (dec (count (take-while #(> r %) cp))))]
    (m/mset (vec (repeat (count strat) 0)) index 1)))

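;; Sampling repeatedly should reproduce the strategy's probabilities, give or
;; take sampling noise:
(comment
  ;; roughly {[1 0 0] 500, [0 1 0] 500}, varying from run to run
  (frequencies (repeatedly 1000 #(choose-action [0.5 0.5 0]))))
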
(defn average-strategy
  "Given a vector where each index maps to how often a certain strategy has
  been played, returns the frequency of each strategy as a part of the total."
  [ssum]
  (let [nsum (reduce + ssum)]
    (normalise-strategies nsum ssum)))

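;; For example, after 100 iterations split 25/50/25 across the actions:
(comment
  (average-strategy [25 50 25])) ;=> (1/4 1/2 1/4)
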
(defn cfrm-br
  "Given a utility function, a number of iterations and a strategy for the
  opponent, performs the Counterfactual Regret Minimization algorithm to find
  the best response to the strategy in question."
  [uf n sb]
  (loop [i 0
         reg-a [0 0 0]
         ssum [0 0 0]]
    (if (= i n)
      (average-strategy ssum)
      (let [strat-a (choose-action (new-strategy reg-a))
            strat-b sb]
        (recur (inc i)
               (m/add reg-a (regret-profile uf strat-a strat-b))
               (m/add ssum strat-a))))))

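;; Against an opponent who overplays rock, the best response should converge
;; towards always playing paper, i.e. something close to (0 1 0) (`rps` as in
;; the comment block below):
(comment
  (let [rps (create-utility-fn [[0 -1 1]
                                [1 0 -1]
                                [-1 1 0]])]
    (cfrm-br rps 10000 [0.4 0.3 0.3])))
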
(defn cfrm-ne
  "Given a utility function and a number of iterations to perform, performs
  the Counterfactual Regret Minimization algorithm to find an approximation
  of the Nash Equilibrium for the game."
  [uf n]
  (loop [i 0
         reg-a [0 0 0]
         reg-b [0 0 0]
         ssum [0 0 0]]
    (if (= i n)
      (average-strategy ssum)
      (let [strat-a (choose-action (new-strategy reg-a))
            strat-b (choose-action (new-strategy reg-b))]
        (recur (inc i)
               (m/add reg-a (regret-profile uf strat-a strat-b))
               (m/add reg-b (regret-profile uf strat-b strat-a))
               (m/add ssum strat-a))))))

(comment
  (def rps
    (create-utility-fn [[0, -1, 1]
                        [1, 0, -1]
                        [-1, 1, 0]]))
  ;; The average strategy converges towards the uniform equilibrium,
  ;; i.e. roughly (1/3 1/3 1/3):
  (cfrm-ne rps 100000)
  )

(ns cfr.rps-tweak
  (:require [clojure.core.matrix :as m]
            [primitive-math :as pm]))

(set! *warn-on-reflection* true)

(m/set-current-implementation :vectorz)

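;; With the :vectorz implementation selected, core.matrix constructors return
;; primitive-double-backed vectorz types rather than persistent vectors. A
;; minimal sketch (the exact class may vary, e.g. a specialised Vector3 for
;; length three):
(comment
  (class (m/array [0 0 0]))         ;=> some mikera.vectorz vector type
  (m/mget (m/array [0 0.5 0.5]) 2)) ;=> 0.5
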
(defn create-utility-fn
  "Given a payoff matrix, creates a utility function for the game. The utility
  function accepts two strategy vectors as its arguments and returns the
  utility for the first player."
  [payoffs]
  (fn ^double [sa sb]
    ;; prob-m holds the probability of each pure-strategy pairing occurring.
    (let [prob-m (m/compute-matrix
                  (map m/ecount [sa sb])
                  #(pm/* ^double (m/mget sa %1) ^double (m/mget sb %2)))]
      ;; Expected utility: the probability-weighted sum of the payoffs.
      (m/ereduce + (m/mul prob-m payoffs)))))

(defn regret
  "Given a utility function and three strategy vectors, returns the regret a
  player has for playing strategy `sa` instead of `sb` against the opponent's
  strategy `so`."
  [uf sa sb so]
  (pm/- ^double (uf sb so) ^double (uf sa so)))

(defn action-profile
  "An action profile is the list of pure strategies available to a player."
  [n]
  (map #(m/mset (vec (repeat n 0)) % 1) (range n)))

(defn regret-profile
  "Given a utility function and strategies for both players, returns the
  regret for every pure strategy the first player could have played,
  including the strategy he did play."
  [uf sa so]
  (map #(regret uf sa % so) (action-profile (m/ecount sa))))

(defn normalise-strategies
  "Normalises `strat` so that it sums to one. Falls back to the uniform
  strategy when the normalising sum is not positive."
  [nsum strat]
  (if (pm/> ^double nsum 0.0)
    (map #(pm/div ^double % ^double nsum) strat)
    ;; pm/div has no single-argument (reciprocal) arity, so spell out 1.0/n.
    (repeat (m/ecount strat) (pm/div 1.0 (double (m/ecount strat))))))

(defn new-strategy
  "Creates a new strategy based on the regrets experienced by the player.
  Negative regrets are clamped to zero (regret matching)."
  [rgr]
  (let [n (m/ecount rgr)
        ;; Use 0.0, not 0, so that strat holds doubles throughout and the
        ;; ^double hints in normalise-strategies stay honest.
        strat (map #(let [r (m/mget rgr %)] (if (pos? r) r 0.0)) (range n))
        nsum (reduce + strat)]
    (normalise-strategies nsum strat)))

(defn cumulative-probabilities
  "Takes a collection of probabilities (that sum to one) and turns it into a
  sequence of cumulative probabilities, starting at zero."
  [coll]
  ;; Seed with 0.0 so every element is a double for the pm/> comparison in
  ;; choose-action.
  (reductions + 0.0 coll))

(defn choose-action
  "Given a strategy vector, chooses an action to play based on its probability.
  Returns the chosen action as a pure-strategy vector."
  [strat]
  (let [cp (cumulative-probabilities strat)
        r (rand)
        ;; `max` guards against the (rare) case where `r` is exactly 0.
        index (max 0 (pm/dec ^long (m/ecount (take-while #(pm/> ^double r ^double %) cp))))]
    (m/mset (vec (repeat (m/ecount strat) 0)) index 1)))

(defn average-strategy
  "Given a vector where each index maps to how often a certain strategy has
  been played, returns the frequency of each strategy as a part of the total."
  [ssum]
  (let [nsum (reduce + ssum)]
    (normalise-strategies nsum ssum)))

(defn cfrm-br
  "Given a utility function, a number of iterations and a strategy for the
  opponent, performs the Counterfactual Regret Minimization algorithm to find
  the best response to the strategy in question."
  [uf n sb]
  ;; Loop primitives must be long or double, so coerce n to long, not int.
  (let [n (long n)]
    (loop [i 0
           reg-a (m/array [0 0 0])
           ssum (m/array [0 0 0])]
      (if (pm/== i n)
        (average-strategy ssum)
        (let [strat-a (choose-action (new-strategy reg-a))
              strat-b sb]
          (recur (pm/inc i)
                 (m/add reg-a (regret-profile uf strat-a strat-b))
                 (m/add ssum strat-a)))))))

(defn cfrm-ne
  "Given a utility function and a number of iterations to perform, performs
  the Counterfactual Regret Minimization algorithm to find an approximation
  of the Nash Equilibrium for the game."
  [uf n]
  (let [n (long n)]
    (loop [i 0
           reg-a (m/array [0 0 0])
           reg-b (m/array [0 0 0])
           ssum (m/array [0 0 0])]
      (if (pm/== i n)
        (average-strategy ssum)
        (let [strat-a (choose-action (new-strategy reg-a))
              strat-b (choose-action (new-strategy reg-b))]
          (recur (pm/inc i)
                 (m/add reg-a (regret-profile uf strat-a strat-b))
                 (m/add reg-b (regret-profile uf strat-b strat-a))
                 (m/add ssum strat-a)))))))

(comment
  (def rps
    (create-utility-fn [[0, -1, 1]
                        [1, 0, -1]
                        [-1, 1, 0]]))
  ;; As in cfr.rps, this converges towards roughly (1/3 1/3 1/3):
  (cfrm-ne rps 100000)
  )

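;; To gauge whether the primitive-math and vectorz tweaks actually pay off,
;; time the equivalent call in each namespace (each defines its own `rps` in
;; its closing comment block). `time` is a crude benchmark, so treat the
;; numbers as indicative only:
(comment
  ;; in cfr.rps:       (time (cfrm-ne rps 100000))
  ;; in cfr.rps-tweak: (time (cfrm-ne rps 100000))
  (time (cfrm-ne rps 100000)))
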
import java.util.Arrays;
import java.util.Random;

public class RPSTrainer {

    public static final int ROCK = 0, PAPER = 1, SCISSORS = 2, NUM_ACTIONS = 3;
    public static final Random random = new Random();

    double[] regretSum = new double[NUM_ACTIONS],
             strategy = new double[NUM_ACTIONS],
             strategySum = new double[NUM_ACTIONS],
             oppStrategy = { 0.4, 0.3, 0.3 };

    // Turn the accumulated regrets into a mixed strategy via regret matching:
    // play each action in proportion to its positive regret.
    private double[] getStrategy() {
        double normalizingSum = 0;
        for (int a = 0; a < NUM_ACTIONS; a++) {
            strategy[a] = regretSum[a] > 0 ? regretSum[a] : 0;
            normalizingSum += strategy[a];
        }
        for (int a = 0; a < NUM_ACTIONS; a++) {
            if (normalizingSum > 0)
                strategy[a] /= normalizingSum;
            else
                strategy[a] = 1.0 / NUM_ACTIONS;
            strategySum[a] += strategy[a];
        }
        return strategy;
    }

    // Sample an action from the given mixed strategy.
    public int getAction(double[] strategy) {
        double r = random.nextDouble();
        int a = 0;
        double cumulativeProbability = 0;
        while (a < NUM_ACTIONS - 1) {
            cumulativeProbability += strategy[a];
            if (r < cumulativeProbability)
                break;
            a++;
        }
        return a;
    }

    public void train(int iterations) {
        double[] actionUtility = new double[NUM_ACTIONS];
        for (int i = 0; i < iterations; i++) {
            double[] strategy = getStrategy();
            int myAction = getAction(strategy);
            int otherAction = getAction(oppStrategy);
            // Utility of each of our actions against the opponent's action:
            // matching it ties (0), the next action beats it (+1), the
            // previous one loses to it (-1).
            actionUtility[otherAction] = 0;
            actionUtility[otherAction == NUM_ACTIONS - 1 ? 0 : otherAction + 1] = 1;
            actionUtility[otherAction == 0 ? NUM_ACTIONS - 1 : otherAction - 1] = -1;
            // Accumulate the regret for not having played each alternative.
            for (int a = 0; a < NUM_ACTIONS; a++)
                regretSum[a] += actionUtility[a] - actionUtility[myAction];
        }
    }

    public double[] getAverageStrategy() {
        double[] avgStrategy = new double[NUM_ACTIONS];
        double normalizingSum = 0;
        for (int a = 0; a < NUM_ACTIONS; a++)
            normalizingSum += strategySum[a];
        for (int a = 0; a < NUM_ACTIONS; a++)
            if (normalizingSum > 0)
                avgStrategy[a] = strategySum[a] / normalizingSum;
            else
                avgStrategy[a] = 1.0 / NUM_ACTIONS;
        return avgStrategy;
    }

    public static void main(String[] args) {
        RPSTrainer trainer = new RPSTrainer();
        trainer.train(1000000);
        System.out.println(Arrays.toString(trainer.getAverageStrategy()));
    }
}