Skip to content

Instantly share code, notes, and snippets.

@afiore
Forked from anonymous/gist:5400914
Last active December 16, 2015 07:48
Show Gist options
  • Save afiore/5400916 to your computer and use it in GitHub Desktop.
Save afiore/5400916 to your computer and use it in GitHub Desktop.
(ns cascalog-tutorial.core
(:use cascalog.api
cascalog.playground
[clojure.string :only [split]]
[cheshire.core :only [parse-string generate-string]]))
;; Source tap for the job: the file "all.json" located in the JVM's working
;; directory (the "user.dir" system property), read through Cascading's
;; TextLine scheme.
(def input-tap
  (let [path (str (System/getProperty "user.dir") "/all.json")]
    (hfs-tap (new cascading.scheme.hadoop.TextLine) path)))
;; Sink tap for the job: "results.json" under the JVM's working directory
;; (the "user.dir" system property), written through Cascading's TextLine
;; scheme. The tap is created with :sinkmode :replace.
(def output-tap
  (let [path (str (System/getProperty "user.dir") "/results.json")]
    (hfs-tap (new cascading.scheme.hadoop.TextLine)
             path
             :sinkmode :replace)))
;; Builds (but does not execute) a Cascalog query that emits one ?doc per
;; line read from input-tap.
;; Each ?line is handed to Cheshire's parse-string with `true` as the second
;; argument, so the resulting maps are expected to have keyword keys; the rest
;; of this file looks documents up with :subjects, :doi and :path.
(defn documents []
(<- [?doc]
;; The first field produced by the tap is ignored (_); only ?line is used.
(input-tap _ ?line)
(parse-string ?line true :> ?doc)))
(defn subjects
  "Returns the value stored under :subjects in `doc`, or an empty vector when
  `doc` has no :subjects key."
  [doc]
  (get doc :subjects []))
(defn subject-codes
  "Returns a lazy sequence of the subject codes found in `doc`.

  Each entry of (subjects doc) contributes the pieces of its :path string,
  split on a slash surrounded by single whitespace characters
  (e.g. \"A / B\" -> \"A\" \"B\").

  Entries whose :path is missing or nil contribute nothing, and empty pieces
  are discarded, so a document without any real code yields an empty
  sequence."
  [doc]
  (->> (subjects doc)
       ;; `or` (rather than a lookup default) also covers a key that is
       ;; present with a nil value, which would make `split` throw.
       (mapcat (fn [s] (split (or (:path s) "") #"\s/\s")))
       ;; Splitting "" yields [""]; drop such empty codes so they never
       ;; become an empty keyword downstream.
       (remove empty?)))
(defn has-subject-codes
  "Predicate: true when (subject-codes doc) contains at least one element,
  false otherwise."
  [doc]
  (boolean (seq (subject-codes doc))))
;; Cascalog mapcat operation: for every subject code of `doc`, emits a
;; single-entry map {<code as keyword> [<doc's :doi>]}.
;; A document with no subject codes emits nothing.
(defmapcatop subject-doi [doc]
  (let [doi-vec [(:doi doc)]]
    (for [code (subject-codes doc)]
      {(keyword code) doi-vec})))
(defn accumulate-into-map
  "Combining function intended for `merge-with`: appends every element of
  `newv` to `oldv` and returns the resulting collection.

  oldv - the collection accumulated so far, or nil when there is none yet.
  newv - a collection of values to add (a one-element vector in this file).

  A nil `oldv` is treated as an empty vector, so the result is always a flat
  collection of values rather than a nested one."
  [oldv newv]
  (into (or oldv []) newv))
;; Cascalog aggregate operation that folds the single-entry maps into one map,
;; combining values of identical keys with accumulate-into-map.
(defaggregateop accumulate-dois-by-subject
  ;; Zero arguments: the initial accumulator, an empty map.
  ([] {})
  ;; Accumulator plus one incoming map: merge the incoming map in.
  ([so-far entry]
   (merge-with accumulate-into-map so-far entry))
  ;; Accumulator only: final step, returns the map wrapped in a vector.
  ([result]
   (vector result)))
(defn -main
  "Entry point: runs the Cascalog query and writes its ?json output to
  output-tap.

  The predicates are listed in dataflow order: take each ?doc from
  (documents), keep only those for which has-subject-codes holds, expand them
  with subject-doi, fold the results with accumulate-dois-by-subject, and
  serialise the aggregate with generate-string."
  []
  (?<- output-tap
       [?json]
       ((documents) ?doc)
       (has-subject-codes ?doc)
       (subject-doi ?doc :> ?subject-doi)
       (accumulate-dois-by-subject ?subject-doi :> ?x)
       (generate-string ?x :> ?json)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment