Skip to content

Instantly share code, notes, and snippets.

package forma.tap;
import backtype.hadoop.pail.PailStructure;
import java.util.Collections;
import java.util.List;
import org.apache.thrift.TBase;
import org.apache.thrift.TDeserializer;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
(def sample-variance
"Predicate macro that calculates the sample variance of the supplied input
var."
(<- [!val :> !var]
(* !val !val :> !squared)
(c/sum !squared :> !squared-sum)
(c/count !count)
(c/sum !val :> !sum)
(c/avg !val :> !mean)
(* !sum !mean :> !i)
(ns ybot.kwphrase-tests
(:use [ybot.analytics.ga kwphrase]
[ybot datastores]
[midje sweet cascalog]))
(let [ga-data [["20121025"
"http://dopeness.org"
"United States"
"Oregon"
"Portland"
;; Declares the generated class ybot.hadoop.formats.SimpleRecordStreamFactory.
;; Its interface methods are dispatched to the functions in this namespace
;; whose names start with the "recordfactory-" prefix.
(gen-class :name ybot.hadoop.formats.SimpleRecordStreamFactory
           ;; :implements takes a vector of interfaces, even for a single one.
           :implements [backtype.hadoop.formats.RecordStreamFactory]
           :prefix "recordfactory-")
(defn recordfactory-getInputStream
  "Opens path `p` on filesystem `fs` and wraps the resulting stream in a
  SimpleInputStream.

  The three-argument arity is the one a gen-class generated instance method
  invokes: gen-class passes the receiving object (`this`) ahead of the
  declared method arguments. The two-argument arity is kept so existing
  direct callers keep working."
  ([^FileSystem fs ^Path p]
     (SimpleInputStream. (.open fs p)))
  ([this ^FileSystem fs ^Path p]
     ;; `this` carries no state that is needed here; delegate to the 2-arity.
     (recordfactory-getInputStream fs p)))
(defn recordfactory-getOutputStream
  "Creates path `p` on filesystem `fs` and wraps the resulting stream in a
  SimpleOutputStream.

  The three-argument arity is the one a gen-class generated instance method
  invokes: gen-class passes the receiving object (`this`) ahead of the
  declared method arguments. The two-argument arity is kept so existing
  direct callers keep working."
  ([^FileSystem fs ^Path p]
     (SimpleOutputStream. (.create fs p)))
  ([this ^FileSystem fs ^Path p]
     ;; `this` carries no state that is needed here; delegate to the 2-arity.
     (recordfactory-getOutputStream fs p)))
;; Declares the generated class ybot.transfer.GlobPathLister, which implements
;; backtype.hadoop.PathLister. Method calls are routed to the functions in
;; this namespace whose names start with "lister-".
(gen-class
 :name       ybot.transfer.GlobPathLister
 :prefix     "lister-"
 :implements [backtype.hadoop.PathLister])
(defn lister-getFiles
  "gen-class implementation of `getFiles` (first argument is the receiver).

  Calls `.globStatus` on `fs` with `p` and returns a lazy sequence holding
  one single-element vector `[path]` per returned status."
  [this ^FileSystem fs ^Path p]
  ;; NOTE(review): every path is wrapped in its own vector; confirm this is
  ;; the shape callers of backtype.hadoop.PathLister expect.
  (map (fn [file-status] [(.getPath file-status)])
       (.globStatus fs p)))
(defn generate-others
  "Returns a Cascalog query producing [?ngram ?other ?kw] tuples.

  ?kw comes from the \"!kw\" field of `(gadata-tap root)`; empty-string
  keywords are dropped. `other-ngrams` is applied to each ?kw (with the
  constant 4) to produce ?ngram / ?other pairs, and ?ngram must also occur
  in the \"?ngram\" field of `(kw-stats-tap root)`."
  [root]
  (let [keyword-src (-> root gadata-tap (select-fields ["!kw"]))
        ngram-src   (-> root kw-stats-tap (select-fields "?ngram"))]
    (<- [?ngram ?other ?kw]
        (keyword-src ?kw)
        ;; joining against the kw-stats ngrams acts as a filter on ?ngram
        (ngram-src ?ngram)
        (not= ?kw "")
        ;; NOTE(review): the meaning of the literal 4 is defined by
        ;; other-ngrams, which is not visible here.
        (other-ngrams ?kw 4 :> ?ngram ?other))))
(defn generate-other-words [output-tap root date]
(defn generate-other-words [output-tap root date]
(let [others (generate-others root)
landings (landings-by-kw root)
sq (<- [?ngram ?other ?total-l ?total-pv ?total-b ?total-nv ?total-rv
?b-rate ?nv-rate ?rv-rate ?avg-depth]
(others ?ngram ?other ?kw)
(landings ?kw ?l ?pv ?b ?nv ?rv)
(ybot-stats ?l ?pv ?b ?nv ?rv :>
?total-l ?total-pv ?total-b ?total-nv ?total-rv
?b-rate ?nv-rate ?rv-rate ?avg-depth))]
(ns ybot.hadoop.pail
(:use cascalog.api
[cascalog.io :only (with-fs-tmp)])
(:import [backtype.cascading.tap PailTap PailTap$PailTapOptions]
[backtype.hadoop.pail Pail]))
(defn- pail-tap
[path colls structure]
(let [seqs (into-array java.util.List colls)
spec (PailTap/makeSpec nil structure)
@sorenmacbeth
sorenmacbeth / gist:1529424
Created December 28, 2011 19:57 — forked from cmiles74/gist:1529376
Load data from a file (line by line) into HBase
(defn hfs-report
[path]
"Loads the log data from an HDFS path into Hbase."
(?<- (hbase-tap "urls" "?url-hash" "urls" "?url" "?crawl-date"
"?crawl-time" "?response-code" "?status" "?host")
[?url-hash ?url ?crawl-date ?crawl-time ?response-code ?status ?host]
((hfs-textline path) ?text)
(fetch-value-hash ?text :url :> ?url-hash)
(fetch-value ?text :url :> ?url)
(fetch-value ?text :crawl-date :> ?crawl-date)
;; Cascalog map operation: parses one JSON string and emits a fixed-order
;; vector of 23 values, one for each of the keys listed below.
;; Keys that are absent from the parsed map come out as nil, since map
;; destructuring binds missing keys to nil.
(defmapop decode-ybtag [json]
;; The `true` argument presumably asks the parser to keywordize keys, which
;; the :keys destructuring relies on -- confirm against the json library
;; aliased as `json` in this namespace.
(let [{:keys [psn co vi pvi si sts ts ln ce sd lo r ua la na np nc
c_st g_C g_r g_c dma nv]} (json/parse-string json true)]
;; Output order matches the destructuring order above.
[psn co vi pvi si sts ts ln ce sd lo r ua la na np nc
c_st g_C g_r g_c dma nv]))
(defn glob-ybtag-json [pattern]
(let [tap (globhfs-textline pattern)]
(<- [!psn !co !vi !pvi !si !sts !ts !ln !ce !sd !lo !r !ua !la !na !np !nc
!c_st !g_C !g_r !g_c !dma !nv]