lynaghk/reinforcement_learning.clj

## reinforcement_learning.clj
(ns reinforcement-learning
  (:require [clojure.set :refer [difference union]]
            [clojure.string :as str]
            [lonocloud.synthread :as ->]))


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Tic Tac Toe bits
;; based on https://github.com/paraseba/tictactoe/blob/master/src/tictactoe/core.clj

(def empty-board
  "Starting position: each player keyword maps to the (still empty) set of
  cell indices that player has marked."
  {:x #{}, :o #{}})


(def all-cells
  "Set of every cell index on the board, 0 through 8."
  (into #{} (range 9)))


(def win-cells
  "Vector of the eight winning lines, each a set of three cell indices:
  the three rows, the three columns, then the two diagonals."
  [#{0 1 2} #{3 4 5} #{6 7 8}   ; rows
   #{0 3 6} #{1 4 7} #{2 5 8}   ; columns
   #{0 4 8} #{2 4 6}])          ; diagonals


(defn empty-cells
  "Returns the set of cell indices on `board` that neither :x nor :o has
  marked."
  [board]
  (let [{xs :x, os :o} board]
    (difference all-cells (union xs os))))


(defn won?
  "Returns true when `cells`, used as a membership predicate, contains every
  index of at least one line in `win-cells`; otherwise nil."
  [cells]
  (some (partial every? cells) win-cells))


(defn winner
  "Returns :x or :o for the player whose marks complete a winning line on
  `board`, or nil when neither does. :x is checked before :o."
  [board]
  (some (fn [player]
          (when (won? (get board player))
            player))
        [:x :o]))


(defn draw?
  "Returns true when `board` is full and neither player has a winning line,
  false otherwise.

  A full board whose final move completed a line is a win, not a draw, so the
  winner check is part of this predicate instead of being left to callers."
  [board]
  (and (empty? (empty-cells board))
       (nil? (winner board))))


(defn mark
  "Returns `board` with `cell` added to the marks of the player to move.

  The player to move is derived from the mark counts: :o moves only when :x
  has strictly more marks, otherwise :x moves. Asserts that `cell` is
  currently empty."
  [board cell]
  (assert (contains? (empty-cells board) cell))
  (let [x-count (count (:x board))
        o-count (count (:o board))
        turn    (if (<= x-count o-count) :x :o)]
    (update board turn conj cell)))


(defn print-board
  "Prints `board` as three lines of cells separated by \"|\", followed by a
  blank line. Cells show \"X\", \"O\" or a space. Returns nil."
  [board]
  (let [glyph (fn [idx]
                (cond
                  (contains? (:x board) idx) "X"
                  (contains? (:o board) idx) "O"
                  :else " "))]
    ;; Cell indices run left to right, top to bottom, three per row.
    (doseq [row-cells (partition 3 (range 9))]
      (println (str/join "|" (map glyph row-cells))))
    (println)))


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Reinforcement learning bits

(def initial-values
  "Empty value table used to seed learning. Keys are board states; values are
  estimates from the point of view of player x."
  {})


(def default-value
  "Value reported for a state that has no entry in the value table."
  0.5)


(defn get-value
  "Returns the value of `state` from player x's point of view.

  Finished games are scored directly: 1 when :x has won, 0 when :o has won or
  the game is drawn. Any other state is looked up in `values`, falling back
  to `default-value` when it has no entry."
  [values state]
  (case (winner state)
    :x 1
    :o 0
    ;; no winner
    (if (draw? state)
      0
      (get values state default-value))))


(def α
  "Step size: the fraction of the gap between a state's value and its
  successor's value that is applied on each update."
  0.2)


(defn update-values
  "Returns `values` with the entry for `state` moved a fraction α of the way
  from its current value toward the value of `next-state`."
  [values state next-state]
  (let [current (get-value values state)
        target  (get-value values next-state)
        step    (* α (- target current))]
    (assoc values state (+ current step))))


(defn random-next-state
  "Returns `state` after marking one empty cell chosen uniformly at random.
  When no cell is picked (the board is full) `state` comes back unchanged."
  [state]
  (let [candidates (seq (empty-cells state))]
    (-> state
        (->/when-let [cell (rand-nth candidates)]
          (mark cell)))))


(defn best-next-state
  "Returns one of the highest-valued states reachable from `state` in a single
  move, choosing at random among ties; nil when `state` has no empty cell.

  Candidates are scored with `get-value` rather than by looking them up in
  `values` directly. A direct lookup yields nil for every state without an
  entry, and nil sorts below any number, so unexplored moves and finished
  games (which `get-value` scores without consulting the table) could never
  be selected while some alternative had a stored value."
  [state values]
  (let [scored (for [cell (empty-cells state)
                     :let [candidate (mark state cell)]]
                 [(get-value values candidate) candidate])]
    (when (seq scored)
      (let [top-value (apply max (map first scored))]
        ;; == compares numerically, so the integer 1 of a won board ties with
        ;; a learned 1.0 instead of forming a separate group.
        (rand-nth (vec (for [[v candidate] scored
                             :when (== v top-value)]
                         candidate)))))))


(defn run
  "Plays tic-tac-toe games back to back and returns the learned value table.

  `num-iterations` caps the number of loop steps, not games: each move and
  each reset to the empty board uses one step. Player x takes the greedy move
  when (rand) exceeds 0.2 and a random move otherwise; player o always moves
  randomly. After every move the table is updated via `update-values`."
  [num-iterations]
  (loop [idx 0
         player :x
         values initial-values
         state empty-board]
    (cond
      ;; Step budget used up: hand the table back for inspection at the REPL.
      ;; `>=` instead of `=` so a negative or non-integer budget terminates;
      ;; (= 2e5 200000) is false, so (run 2e5) used to loop forever.
      (>= idx num-iterations)
      values

      ;; Game is over: start a fresh one with x to move.
      (or (draw? state) (winner state))
      (recur (inc idx) :x values empty-board)

      :else
      (let [next-state (case player
                         :x
                         (if (> (rand) 0.2)
                           ;; greedy move
                           (best-next-state state values)
                           ;; exploratory move
                           (random-next-state state))

                         :o
                         (random-next-state state))]

        (recur (inc idx)
               (case player :x :o :o :x)
               (update-values values state next-state)
               next-state)))))

;; REPL scratchpad: nothing in this form is evaluated when the file loads.
(comment

  ;; Train for 200000 loop steps and keep the resulting value table.
  (def values
    (run 200000))

  ;; Print the ten boards with the highest stored values, best first.
  (->> values
       (sort-by second)
       reverse
       (take 10)
       (map first)
       (map print-board)
       (doall))


  ;; For each of the nine opening moves, print the value stored in the table
  ;; (nil when the board has no entry) and the board, lowest value first.
  (doseq [[board v] (->> (range 9)
                         (map #(mark empty-board %))
                         (map (juxt identity values))
                         (sort-by second))]
    (println v)
    (print-board board)
    (println ""))


  ;; Walk through one game: learned moves alternate with hand-picked replies.
  ;; `mark` works out whose turn it is from the mark counts, and asserts if
  ;; the requested cell was already taken by an earlier move.
  ;; NOTE(review): `->/aside` presumably runs its body for the side effect and
  ;; passes the threaded board on unchanged -- confirm against synthread.
  (-> empty-board
      (best-next-state values) (->/aside state (print-board state))
      (mark 4)                 (->/aside state (print-board state))
      (best-next-state values) (->/aside state (print-board state))

      ;; (next-states-by-value values)
      ;; (->>
      ;;  (map (juxt identity (partial get-value values)) ))

      (mark 2)                 (->/aside state (print-board state))
      (best-next-state values) (->/aside state (print-board state))
      (mark 7)                 (->/aside state (print-board state))
      (best-next-state values) (->/aside state (print-board state))
      )


  ;; Render one specific position.
  (print-board {:x #{7 4 8}, :o #{0 3 2}})
  )
	(ns reinforcement-learning
	(:require [clojure.set :refer [difference union]]
	[clojure.string :as str]
	[lonocloud.synthread :as ->]))


	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	;; Tic Tac Toe bits
	;; based on https://github.com/paraseba/tictactoe/blob/master/src/tictactoe/core.clj

	(def empty-board
	  "Starting position: each player keyword maps to the (still empty) set of
	  cell indices that player has marked."
	  {:x #{}, :o #{}})


	(def all-cells
	  "Set of every cell index on the board, 0 through 8."
	  (into #{} (range 9)))


	(def win-cells
	  "Vector of the eight winning lines, each a set of three cell indices:
	  the three rows, the three columns, then the two diagonals."
	  [#{0 1 2} #{3 4 5} #{6 7 8}   ; rows
	   #{0 3 6} #{1 4 7} #{2 5 8}   ; columns
	   #{0 4 8} #{2 4 6}])          ; diagonals


	(defn empty-cells
	  "Returns the set of cell indices on `board` that neither :x nor :o has
	  marked."
	  [board]
	  (let [{xs :x, os :o} board]
	    (difference all-cells (union xs os))))


	(defn won?
	  "Returns true when `cells`, used as a membership predicate, contains every
	  index of at least one line in `win-cells`; otherwise nil."
	  [cells]
	  (some (partial every? cells) win-cells))


	(defn winner
	  "Returns :x or :o for the player whose marks complete a winning line on
	  `board`, or nil when neither does. :x is checked before :o."
	  [board]
	  (some (fn [player]
	          (when (won? (get board player))
	            player))
	        [:x :o]))


	(defn draw?
	  "Returns true when `board` is full and neither player has a winning line,
	  false otherwise.

	  A full board whose final move completed a line is a win, not a draw, so the
	  winner check is part of this predicate instead of being left to callers."
	  [board]
	  (and (empty? (empty-cells board))
	       (nil? (winner board))))


	(defn mark
	  "Returns `board` with `cell` added to the marks of the player to move.

	  The player to move is derived from the mark counts: :o moves only when :x
	  has strictly more marks, otherwise :x moves. Asserts that `cell` is
	  currently empty."
	  [board cell]
	  (assert (contains? (empty-cells board) cell))
	  (let [x-count (count (:x board))
	        o-count (count (:o board))
	        turn    (if (<= x-count o-count) :x :o)]
	    (update board turn conj cell)))


	(defn print-board
	  "Prints `board` as three lines of cells separated by \"|\", followed by a
	  blank line. Cells show \"X\", \"O\" or a space. Returns nil."
	  [board]
	  (doseq [row (range 3)]
	    ;; The separator is a plain \"|\"; a backslash before the bar is not an
	    ;; escape the Clojure reader accepts, so the form would not load.
	    (println (str/join "|" (for [col (range 3)]
	                             (let [idx (+ col (* row 3))]
	                               (cond
	                                 (contains? (:x board) idx) "X"
	                                 (contains? (:o board) idx) "O"
	                                 :else " "))))))
	  (println))


	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	;; Reinforcement learning bits

	(def initial-values
	  "Empty value table used to seed learning. Keys are board states; values are
	  estimates from the point of view of player x."
	  {})


	(def default-value
	  "Value reported for a state that has no entry in the value table."
	  0.5)


	(defn get-value
	  "Returns the value of `state` from player x's point of view.

	  Finished games are scored directly: 1 when :x has won, 0 when :o has won or
	  the game is drawn. Any other state is looked up in `values`, falling back
	  to `default-value` when it has no entry."
	  [values state]
	  (case (winner state)
	    :x 1
	    :o 0
	    ;; no winner
	    (if (draw? state)
	      0
	      (get values state default-value))))


	(def α
	  "Step size: the fraction of the gap between a state's value and its
	  successor's value that is applied on each update."
	  0.2)


	(defn update-values
	  "Returns `values` with the entry for `state` moved a fraction α of the way
	  from its current value toward the value of `next-state`."
	  [values state next-state]
	  (let [current (get-value values state)
	        target  (get-value values next-state)
	        step    (* α (- target current))]
	    (assoc values state (+ current step))))


	(defn random-next-state
	  "Returns `state` after marking one empty cell chosen uniformly at random.
	  When no cell is picked (the board is full) `state` comes back unchanged."
	  [state]
	  (let [candidates (seq (empty-cells state))]
	    (-> state
	        (->/when-let [cell (rand-nth candidates)]
	          (mark cell)))))


	(defn best-next-state
	  "Returns one of the highest-valued states reachable from `state` in a single
	  move, choosing at random among ties; nil when `state` has no empty cell.

	  Candidates are scored with `get-value` rather than by looking them up in
	  `values` directly. A direct lookup yields nil for every state without an
	  entry, and nil sorts below any number, so unexplored moves and finished
	  games (which `get-value` scores without consulting the table) could never
	  be selected while some alternative had a stored value."
	  [state values]
	  (let [scored (for [cell (empty-cells state)
	                     :let [candidate (mark state cell)]]
	                 [(get-value values candidate) candidate])]
	    (when (seq scored)
	      (let [top-value (apply max (map first scored))]
	        ;; == compares numerically, so the integer 1 of a won board ties with
	        ;; a learned 1.0 instead of forming a separate group.
	        (rand-nth (vec (for [[v candidate] scored
	                             :when (== v top-value)]
	                         candidate)))))))


	(defn run
	  "Plays tic-tac-toe games back to back and returns the learned value table.

	  `num-iterations` caps the number of loop steps, not games: each move and
	  each reset to the empty board uses one step. Player x takes the greedy move
	  when (rand) exceeds 0.2 and a random move otherwise; player o always moves
	  randomly. After every move the table is updated via `update-values`."
	  [num-iterations]
	  (loop [idx 0
	         player :x
	         values initial-values
	         state empty-board]
	    (cond
	      ;; Step budget used up: hand the table back for inspection at the REPL.
	      ;; `>=` instead of `=` so a negative or non-integer budget terminates;
	      ;; (= 2e5 200000) is false, so (run 2e5) used to loop forever.
	      (>= idx num-iterations)
	      values

	      ;; Game is over: start a fresh one with x to move.
	      (or (draw? state) (winner state))
	      (recur (inc idx) :x values empty-board)

	      :else
	      (let [next-state (case player
	                         :x
	                         (if (> (rand) 0.2)
	                           ;; greedy move
	                           (best-next-state state values)
	                           ;; exploratory move
	                           (random-next-state state))

	                         :o
	                         (random-next-state state))]

	        (recur (inc idx)
	               (case player :x :o :o :x)
	               (update-values values state next-state)
	               next-state)))))

	;; REPL scratchpad: nothing in this form is evaluated when the file loads.
	(comment

	;; Train for 200000 loop steps and keep the resulting value table.
	(def values
	(run 200000))

	;; Print the ten boards with the highest stored values, best first.
	(->> values
	(sort-by second)
	reverse
	(take 10)
	(map first)
	(map print-board)
	(doall))


	;; For each of the nine opening moves, print the value stored in the table
	;; (nil when the board has no entry) and the board, lowest value first.
	(doseq [[board v] (->> (range 9)
	(map #(mark empty-board %))
	(map (juxt identity values))
	(sort-by second))]
	(println v)
	(print-board board)
	(println ""))


	;; Walk through one game: learned moves alternate with hand-picked replies.
	;; `mark` works out whose turn it is from the mark counts, and asserts if
	;; the requested cell was already taken by an earlier move.
	;; NOTE(review): `->/aside` presumably runs its body for the side effect and
	;; passes the threaded board on unchanged -- confirm against synthread.
	(-> empty-board
	(best-next-state values) (->/aside state (print-board state))
	(mark 4) (->/aside state (print-board state))
	(best-next-state values) (->/aside state (print-board state))

	;; (next-states-by-value values)
	;; (->>
	;; (map (juxt identity (partial get-value values)) ))

	(mark 2) (->/aside state (print-board state))
	(best-next-state values) (->/aside state (print-board state))
	(mark 7) (->/aside state (print-board state))
	(best-next-state values) (->/aside state (print-board state))
	)



	;; Render one specific position.
	(print-board {:x #{7 4 8}, :o #{0 3 2}})
	)