Skip to content

Instantly share code, notes, and snippets.

@andrewvc
Created August 5, 2012 21:02
Show Gist options
  • Save andrewvc/3267117 to your computer and use it in GitHub Desktop.
Save andrewvc/3267117 to your computer and use it in GitHub Desktop.
Test for markov generation in engulf
(ns engulf.formulas.markov-requests
"Support for generating markov chains of requests"
(:use clojure.pprint
[clojure.walk :only [keywordize-keys]])
(:require [clojure.string :as string]
[clojure.tools.logging :as log]
[cheshire.core :as json])
(:import java.util.TreeMap))
(defn compile-transitions
"Compile a system's transitions into a format suitable for efficient markov chain generation.
Internally this creates a lookup table of URLs with treemaps used to map probabilities
onto numerical ranges."
[transitions]
(doall
(reduce
(fn [out [url-id edges]]
(assoc out
url-id
(let [m (TreeMap.)]
(reduce
(fn [offset [edge-url-id weight]]
(let [off-ceil (+ offset weight)]
(.put m off-ceil edge-url-id)
off-ceil))
0.0
edges)
m)))
{}
transitions)))
(defn chain
"Returns a lazy sequence of requests. Not guaranteed to terminate."
([compiled]
(chain compiled :rand))
([{:keys [transitions request-keys requests] :as compiled} request]
(let [r (request-keys (rand-int (count request-keys)))]
(if-let [edges (transitions r)]
(let [e (.ceilingEntry edges (rand))]
(if-let [e-req (.getValue e)]
(lazy-seq (cons (requests e-req) (chain compiled e-req)))
(throw (Exception. "This should never happen. The morkov chain is compromised!"))))
(throw (Exception. (str "Could not find edge for: " request)))))))
(defn parse-corpus
[corpus]
(map
#(let [[method url raw-opts] (string/split (string/trim %) #"\s+" 3)
opts (if (not raw-opts)
{}
(try (json/parse-string raw-opts)
(catch com.fasterxml.jackson.core.JsonParseException e
(log/warn e (str "Could not parse line: " %))
(throw e)
)))]
(merge (keywordize-keys opts) {:method method :url url }))
(filter (comp not string/blank?) (string/split-lines corpus))))
(defn build-requests
[parsed]
(reduce
(fn [m request]
(-> m
(update-in [(.hashCode request)] #(if % (update-in % [1] inc) [request 1]))
(update-in [:total] inc)))
{:total 0}
parsed))
(defn incr-or-one
[v]
(if v (inc v) 1))
(defn counted-tuples
[parsed requests]
(let [tuples (partition 2 1 parsed)
first-edge (first (first tuples))
last-edge (last (last tuples))
counted (reduce
(fn [m [a b]]
(let [ac (.hashCode a)
bc (.hashCode b)]
(-> m
(update-in [ac bc] incr-or-one)
(update-in [ac :total] incr-or-one))))
{}
tuples)]
;; loop the end to the front to finish it
(-> counted
(update-in [(.hashCode last-edge) (.hashCode first-edge)] incr-or-one)
(update-in [(.hashCode last-edge) :total] incr-or-one))))
(defn counted-probabilities
[counted]
(map
(fn [[request edges]]
[request
(reduce (fn [m [ereq ecount]]
(if (= ereq :total)
m
(assoc m ereq (float (/ ecount (:total edges))))))
{}
edges)])
counted))
(defn compile-preprocessed
"Compiles a preprocessed corpus into a format suitable for the (chain) method"
[{:keys [requests counted]}]
(let [probabilities (counted-probabilities counted)]
{:requests requests
:request-keys (vec (filter #(not= :total %) (keys requests)))
:transitions (compile-transitions probabilities)}))
(defn preprocess-corpus
"Compiles the corpus into something that's a processed as possible but can still
be serialized into JSON. Call 'compile-preprocessed' to finish it."
[corpus]
(let [parsed (parse-corpus corpus)
requests (build-requests parsed)
counted (counted-tuples parsed requests)]
{:requests requests
:counted counted}))
(def test-corpus
"POST http://localhost/foo {\"body\": \"a=b&c=d\"}
GET http://localhost/bar
GET http://localhost/bar
GET http://localhost/foo
GET http://localhost/
GET http://localhost/bar
GET http://localhost/foo
GET http://localhost/foo
GET http://localhost/bar
GET http://localhost/
POST http://localhost/foo {\"body\": \"a=b&c=d\"}
GET http://localhost/bar
GET http://localhost/foo
GET http://localhost/bzx
GET http://localhost/another
GET http://localhost/bzx
POST http://localhost/foo {\"body\": \"a=b&c=d\"}
GET http://localhost/foo
POST http://localhost/foo {\"body\": \"a=b&c=d\"}
GET http://localhost/bar
GET http://localhost/bar
GET http://localhost/THEEND")
(pprint (last (let [compiled (compile-preprocessed (preprocess-corpus test-corpus))]
(take 5000000 (chain compiled)))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment