Skip to content

Instantly share code, notes, and snippets.

@lynaghk
Created September 4, 2019 06:11
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lynaghk/03c6e519cd051a2e252262946a0a80f6 to your computer and use it in GitHub Desktop.
Save lynaghk/03c6e519cd051a2e252262946a0a80f6 to your computer and use it in GitHub Desktop.
Reinforcement learning sketch
(ns reinforcement-learning
(:require [clojure.set :refer [difference union]]
[clojure.string :as str]
[lonocloud.synthread :as ->]))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Tic Tac Toe bits
;; based on https://github.com/paraseba/tictactoe/blob/master/src/tictactoe/core.clj
(def empty-board
  "Starting position: a map from player keyword to the set of cell
  indices that player has marked. Neither player has marked anything yet."
  {:x #{}, :o #{}})
(def all-cells
  "Set of every cell index on the board: 0 through 8."
  (into #{} (range 9)))
(def win-cells
  "Vector of the eight winning lines, each a set of three cell indices,
  ordered rows, then columns, then the two diagonals."
  (let [grid      (partition 3 (range 9))
        rows      (map set grid)
        ;; transposing the row-major grid yields the columns
        cols      (map set (apply map vector grid))
        diagonals [#{0 4 8} #{2 4 6}]]
    (vec (concat rows cols diagonals))))
(defn empty-cells
  "Returns the set of cell indices that neither :x nor :o has marked on `board`."
  [board]
  (reduce difference
          all-cells
          [(:x board) (:o board)]))
(defn won?
  "Returns true when the set `cells` fully contains at least one of the
  lines in `win-cells`, otherwise nil."
  [cells]
  (some (fn [line] (every? cells line))
        win-cells))
(defn winner
  "Returns :x or :o if that player's cells on `board` contain a winning
  line (checking :x first), or nil when neither has won."
  [board]
  (some (fn [player]
          (when (won? (player board))
            player))
        [:x :o]))
(defn draw?
  "Returns true when `board` is full and neither player has won.

  A full board on which the final mark completed a line is a win, not a
  draw, so the winner is checked as well as the absence of empty cells."
  [board]
  (and (empty? (empty-cells board))
       (nil? (winner board))))
(defn mark
  "Returns `board` with `cell` claimed by whichever player is to move.

  The mover is inferred from the mark counts: :o moves when :x has more
  marks than :o, otherwise :x moves. Asserts that `cell` is currently empty."
  [board cell]
  (assert (contains? (empty-cells board) cell))
  (let [x-count (count (:x board))
        o-count (count (:o board))
        mover   (if (<= x-count o-count) :x :o)]
    (update board mover conj cell)))
(defn print-board
  "Prints `board` to *out* as three lines of cells separated by \"|\"
  (X, O, or a space for an empty cell), followed by a blank line."
  [board]
  (let [glyph (fn [idx]
                (cond
                  (contains? (:x board) idx) "X"
                  (contains? (:o board) idx) "O"
                  :else " "))]
    ;; cells are numbered row-major, so each run of three indices is one row
    (doseq [row-cells (partition 3 (range 9))]
      (println (str/join "|" (map glyph row-cells))))
    (println)))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Reinforcement learning bits
(def initial-values
  "Starting value table: an empty map of board state -> value, where
  values are expressed from the perspective of player x."
  (hash-map))
(def default-value
  "Value assumed for any board state that has no entry in the value table."
  0.5)
(defn get-value
  "Returns the value of `state` from player x's perspective.

  Terminal states have fixed values: 1 when x has won, 0 when o has won,
  and 0 for a draw. Any other state is looked up in `values`, falling
  back to `default-value` when it has no entry."
  [values state]
  (case (winner state)
    :x 1
    :o 0
    ;; no winner: either a drawn board or a game still in progress
    (if (draw? state)
      0
      (get values state default-value))))
(def α
  "Step size used by `update-values`: the fraction of the gap between a
  state's value and its successor's value that is applied per update."
  0.2)
(defn update-values
  "Returns `values` with the entry for `state` moved toward the value of
  `next-state`: new = v + α * (v' - v), where v and v' are the current
  values of `state` and `next-state` as given by `get-value`."
  [values state next-state]
  (let [current (get-value values state)
        target  (get-value values next-state)
        step    (* α (- target current))]
    (assoc values state (+ current step))))
(defn random-next-state
  "Returns `state` after marking one empty cell chosen uniformly at
  random, or `state` unchanged when there is no empty cell left."
  [state]
  ;; (seq #{}) is nil and (rand-nth nil) is nil, so a full board takes the else branch
  (if-let [cell (rand-nth (seq (empty-cells state)))]
    (mark state cell)
    state))
(defn best-next-state
  "Returns the successor of `state` with the highest value according to
  `get-value`, breaking ties uniformly at random. Returns nil when
  `state` has no empty cell.

  Successors are scored with `get-value` rather than by looking them up
  in `values` directly: a raw lookup yields nil for states that were
  never stored, which includes every terminal state, so an immediately
  winning move would rank below any previously seen state."
  [state values]
  (let [successors (map (partial mark state) (empty-cells state))
        ;; coerce to double so that a terminal 1 and a learned 1.0 share a group
        by-value   (group-by (comp double (partial get-value values))
                             successors)]
    (when (seq by-value)
      (rand-nth (val (apply max-key key by-value))))))
(defn run
  "Plays tic-tac-toe against itself for `num-iterations` loop steps and
  returns the resulting value table.

  Every move counts as one step, and so does each reset to an empty
  board after a finished game. Player x picks the best-valued successor
  most of the time and a random one otherwise; player o always moves at
  random. After every move the value of the previous state is updated
  toward the value of the new state via `update-values`.

  Terminates immediately when `num-iterations` is zero or negative."
  [num-iterations]
  (loop [idx 0
         player :x
         values initial-values
         state empty-board]
    (cond
      ;; done; return values so we can inspect at repl.
      ;; >= rather than = so a negative or non-integer count (e.g. 10.0)
      ;; cannot make the loop run forever.
      (>= idx num-iterations)
      values

      ;; game is over, begin again
      (or (draw? state) (winner state))
      (recur (inc idx) :x values empty-board)

      :else
      (let [next-state (case player
                         :x
                         (if (> (rand) 0.2)
                           ;; greedy move
                           (best-next-state state values)
                           ;; exploratory move
                           (random-next-state state))

                         :o
                         (random-next-state state))]
        (recur (inc idx)
               (case player :x :o :o :x)
               (update-values values state next-state)
               next-state)))))
(comment
;; REPL scratchpad: nothing in this form is evaluated when the file loads.
;; Train for 200000 loop steps and keep the resulting value table.
(def values
(run 200000))
;; Print the ten stored board states with the highest values.
(->> values
(sort-by second)
reverse
(take 10)
(map first)
(map print-board)
(doall))
;; For each of the nine possible opening moves, print the stored value and
;; the board, in ascending order of value. The lookup is a raw map lookup,
;; so an opening that was never stored shows nil.
(doseq [[board v] (->> (range 9)
(map #(mark empty-board %))
(map (juxt identity values))
(sort-by second))]
(println v)
(print-board board)
(println ""))
;; Play one game by hand: the learned table picks x's moves via
;; best-next-state, and o's moves are fixed at cells 4, 2 and 7.
;; Presumably ->/aside runs its body for side effects and passes the threaded
;; value through unchanged -- confirm against the synthread docs.
;; mark asserts that the cell is empty, so a step fails if x already took it.
(-> empty-board
(best-next-state values) (->/aside state (print-board state))
(mark 4) (->/aside state (print-board state))
(best-next-state values) (->/aside state (print-board state))
;; (next-states-by-value values)
;; (->>
;; (map (juxt identity (partial get-value values)) ))
(mark 2) (->/aside state (print-board state))
(best-next-state values) (->/aside state (print-board state))
(mark 7) (->/aside state (print-board state))
(best-next-state values) (->/aside state (print-board state))
)
;; Render a specific position for inspection.
(print-board {:x #{7 4 8}, :o #{0 3 2}})
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment