Created
October 14, 2015 06:17
-
-
Save jdevoo/71007a5e4ecf35e5d3f7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns cascalab.webgraph | |
(:use [cascalog api]) | |
(:require [cascalog.cascading.tap :as tap] | |
[cascalog.logic.ops :as c]) | |
(:gen-class)) | |
(defn- parse-str [^String s] | |
(seq (clojure.string/split s #"\s+"))) | |
(defmapcatfn mk-node [from to] | |
[[from] [to]]) | |
(defn nodes [edges] | |
(<- [?node] | |
(edges ?from ?to) | |
(mk-node ?from ?to :> ?node) | |
(:distinct true))) | |
(defn node-cnt [nodes] | |
(first (first (??<- [?cnt] | |
(nodes ?node) | |
(c/distinct-count ?node :> ?cnt))))) | |
(defn out-degree [edges] | |
(let [vertices (nodes edges)] | |
(<- [?node ?out_d] | |
(vertices ?node) | |
(edges ?node !!to) | |
(c/!count !!to :> ?out_d)))) | |
(defn edges-tap [path] | |
(let [src (tap/hfs-textline path)] | |
(<- [?from ?to] | |
(src ?line) | |
((filterfn [^String line] (not (.startsWith line "#"))) ?line) | |
(parse-str ?line :> ?from ?to)))) | |
(defn pr-init [nodes n] | |
(<- [?node ?pr] | |
(nodes ?node) | |
(div 1 n :> ?pr))) | |
(defmapfn out-factor [r d teleport] | |
(if (not (zero? d)) (* (- 1 teleport) (div r d)) 0)) | |
(defmapfn safe-add [a b] | |
(if a (+ a b) b)) | |
(defn calc-pr [edges vertices n pageranks teleport] | |
(let [degrees (out-degree edges) | |
out-factors (<- [?node ?out] | |
(pageranks ?node ?pr) | |
(degrees ?node ?d) | |
(out-factor ?pr ?d teleport :> ?out)) | |
outlink-terms (<- [?node ?pr-outlink] | |
(edges ?from ?node) | |
(out-factors ?from ?out) | |
(c/sum ?out :> ?pr-outlink)) | |
leakage-term (/ (- 1 (first (first (??<- [?s] | |
(outlink-terms _ ?out) | |
(c/sum ?out :> ?s))))) n)] | |
(<- [?node ?pr_new] | |
(vertices ?node) | |
(outlink-terms ?node !!outlink-term) | |
(safe-add !!outlink-term leakage-term :> ?pr_new)))) | |
(defn pr-tap [path] | |
(let [src (tap/hfs-textline path)] | |
(<- [?node ?pr] | |
(src ?line) | |
(parse-str ?line :> ?node ?pr-str) | |
((mapfn [d] (Double/parseDouble d)) ?pr-str :> ?pr)))) | |
(defmapfn abs-delta [^Double a ^Double b] | |
(Math/abs ^Double (- a b))) | |
(defn delta-pr-taps [path iter] | |
(let [pr-old (pr-tap (str path (dec iter))) | |
pr-new (pr-tap (str path iter))] | |
(first (first (??<- [?delta] | |
(pr-old ?node ?old) | |
(pr-new ?node ?new) | |
(abs-delta ?old ?new :> ?tmp) | |
(c/sum ?tmp :> ?delta)))))) | |
(defn pagerank [in-path out-path epsilon teleport] | |
(let [e (edges-tap in-path) | |
v (nodes e) | |
n (node-cnt v) | |
pr-path (str out-path "/pagerank/iter-") | |
pr0 (pr-init v n)] | |
(?- (hfs-textline (str pr-path 0)) pr0) | |
(loop [pr pr0 | |
iter 1] | |
(let [pr-new (calc-pr e v n pr teleport)] | |
(?- (hfs-textline (str pr-path iter)) pr-new) | |
(if (> (delta-pr-taps pr-path iter) epsilon) | |
(recur | |
(pr-tap (str pr-path iter)) | |
(inc iter))))))) | |
(defn -main [in-path out-path epsilon teleport] | |
(pagerank in-path out-path epsilon teleport)) | |
;;(def e (edges-tap "resources/mmds.txt")) | |
;;(def v (nodes e)) | |
;;(time (def n (node-cnt v))) | |
;;(def pr0 (pr-init v n)) | |
;;(?- (stdout) pr0) | |
;;(?- (stdout) (out-degree e)) | |
;;(?- (stdout) (calc-pr e v n pr0 0.8)) | |
;;(pagerank "resources/mmds.txt" "resources" 1e-5 0.2) | |
;;(pagerank "resources/web-Google.txt" "resources" 1e-5 0.2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment