Skip to content

Instantly share code, notes, and snippets.

@pchalasani
Created December 19, 2012 16:17
Show Gist options
  • Save pchalasani/4337940 to your computer and use it in GitHub Desktop.
Save pchalasani/4337940 to your computer and use it in GitHub Desktop.
Cascalog Example
;; Create a map without quoting: (an interesting defmacro exercise!)
;; (qmap city nyc population 14000000)
;; => {"city" "nyc", "population" "14000000"}
;; (some special chars still need to be quoted though!)
(defmacro qmap
  "Builds a string->string hash map from unquoted key/value forms.

  The argument forms are never evaluated; each one is converted with `str`,
  so symbols, numbers, booleans and string literals all end up as strings:

    (qmap city nyc population 14000000)
    ;; => {\"city\" \"nyc\", \"population\" \"14000000\"}

  Text the reader cannot read as a single form (e.g. anything containing
  spaces) must still be written as a string literal.

  Throws IllegalArgumentException at macro-expansion time when given an odd
  number of forms, since the last key would have no value."
  [& args]
  (when (odd? (count args))
    (throw (IllegalArgumentException.
            (str "qmap expects an even number of forms (key/value pairs), got "
                 (count args)))))
  ;; `str` alone is enough here: it turns every quoted form into a string.
  `(apply hash-map (map str '~args)))
;; conf settings:
(def my-conf
  "Job configuration map (string keys and string values, built with `qmap`)
  that the queries in this file pass to `with-job-conf`."
  (qmap
   ;; --- general job settings ---
   hadoop.tmp.dir                           "/tmp/prasadch"
   mapreduce.job.acl-view-job               "*"
   mapred.job.queue.name                    adhoc

   ;; --- map side ---
   mapred.map.tasks.speculative.execution   true
   mapred.min.split.size                    10737418240

   ;; --- reduce side ---
   ;; These were added experimentally to see whether they speed the job up.
   ;; Both the `mapred.*` and the `mapreduce.*` spelling of each key are set.
   mapred.reduce.tasks                      1000
   ;; mapred.reduce.tasks 100
   mapred.reduce.child.java.opts            -Xmx3000m
   mapreduce.reduce.child.java.opts         -Xmx3000m
   mapred.job.reduce.memory.mb              3072
   mapreduce.job.reduce.memory.mb           3072))
;; save property_id -> property_name map
(defn prop-map
  "Executes a query over the text lines of `prop-file` and returns the
  resulting [?site-id ?site] tuples.

  The query runs immediately (`??<-`) with `my-conf` as the job configuration.
  Each line is handed to `re-parse-with` together with [\",\"]; that helper is
  defined outside this file -- presumably it splits the line on the comma into
  the two output fields (TODO confirm)."
  [prop-file]
  (with-job-conf my-conf
    (let [lines (hfs-textline prop-file)]
      (??<- [?site-id ?site]
            (lines :> ?line)
            (re-parse-with [","] ?line :> ?site-id ?site)))))
;; generate audience data fields from raw feed
(defn abf-gen
  "Returns a query producing [!week !bcookie !site !sess-id !time-spent] rows
  from the raw feed at `path`.

  Only feed rows whose \"type\" field equals \"p\" are matched. Each row is
  joined on !site-id against the tuples returned by `(prop-map prop-file)`
  to obtain !site. Note that `prop-map` executes its own query as soon as
  this function is called.

  NOTE(review): `hfs-seqproj` and the `errors` trap tap are defined outside
  this file; their exact behaviour should be confirmed there."
  [prop-file path]
  (let [feed-fields   ["timestamp:Long" "cookie_id" "src_pty" "type" "nw_sess_id" "time_spent:Long"]
        feed          (hfs-seqproj path :outfields feed-fields)
        sites         (prop-map prop-file)
        ;; 7 days * 24 h * 60 min * 60 s -- assumes the timestamp is in
        ;; seconds (TODO confirm against the feed format).
        week-length   (* 7 24 60 60)]
    (<- [!week !bcookie !site !sess-id !time-spent]
        (feed !time !bcookie !site-id "p" !sess-id !time-spent)
        (:distinct false) ;; allow identical output rows
        (:trap errors)
        (sites !site-id !site)
        (quot !time week-length :> !week))))
;; calc various site metrics
(defmain SiteMetricsSketch [prop-file data-path out-path]
  ;; Entry point: aggregates the rows produced by `abf-gen` per (site, week)
  ;; and writes one text line per group to `out-path`, using `my-conf` as the
  ;; job configuration.
  ;; NOTE(review): `errors` (the trap tap) is defined outside this file.
  (with-job-conf my-conf
    ;; (cio/with-log-level :fatal ;; suppress all log msgs except fatal
    (let [sink   (hfs-textline out-path)
          events (abf-gen prop-file data-path)]
      (?<- sink
           ;; agg by (site,week)
           [!site !week !users !views !visits !len
            !views-visit !len-visit !views-user !len-user]
           (events !week !bcookie !site !sess-id !time-spent)
           (:trap errors)

           ;; raw aggregates
           (c/count :> !views)
           (ops/agg-hyperloglog !bcookie :> !u-hll)
           (hll/cardinality !u-hll :> !users)
           (ops/agg-hyperloglog !sess-id :> !s-hll)
           (hll/cardinality !s-hll :> !visits)
           (c/sum !time-spent :> !len)

           ;; derived ratios; `quot` keeps only the integer part
           (quot !views !users :> !views-user)
           (quot !len !users :> !len-user)
           (quot !views !visits :> !views-visit)
           (quot !len !visits :> !len-visit)))))
@belablotski
Copy link

Thanks for my-conf example

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment