This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package forma.tap; | |
import backtype.hadoop.pail.PailStructure; | |
import java.util.Collections; | |
import java.util.List; | |
import org.apache.thrift.TBase; | |
import org.apache.thrift.TDeserializer; | |
import org.apache.thrift.TException; | |
import org.apache.thrift.TSerializer; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(def sample-variance | |
"Predicate macro that calculates the sample variance of the supplied input | |
var." | |
(<- [!val :> !var] | |
(* !val !val :> !squared) | |
(c/sum !squared :> !squared-sum) | |
(c/count !count) | |
(c/sum !val :> !sum) | |
(c/avg !val :> !mean) | |
(* !sum !mean :> !i) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns ybot.kwphrase-tests | |
(:use [ybot.analytics.ga kwphrase] | |
[ybot datastores] | |
[midje sweet cascalog])) | |
(let [ga-data [["20121025" | |
"http://dopeness.org" | |
"United States" | |
"Oregon" | |
"Portland" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Generates the class ybot.hadoop.formats.SimpleRecordStreamFactory.
;; Its methods are backed by the functions in this namespace whose names
;; start with "recordfactory-".
;; :implements takes a vector of interfaces, so the single interface is
;; wrapped in one rather than passed as a bare symbol.
(gen-class :name ybot.hadoop.formats.SimpleRecordStreamFactory
           :implements [backtype.hadoop.formats.RecordStreamFactory]
           :prefix "recordfactory-")
(defn recordfactory-getInputStream
  "gen-class implementation of getInputStream: opens path `p` on file
  system `fs` and wraps the opened stream in a SimpleInputStream.

  `this` is the generated instance; gen-class passes it as the first
  argument to every instance-method implementation, so it must be
  accepted even though it is unused here."
  [this ^FileSystem fs ^Path p]
  (SimpleInputStream. (.open fs p)))
(defn recordfactory-getOutputStream
  "gen-class implementation of getOutputStream: creates path `p` on file
  system `fs` and wraps the created stream in a SimpleOutputStream.

  `this` is the generated instance; gen-class passes it as the first
  argument to every instance-method implementation, so it must be
  accepted even though it is unused here."
  [this ^FileSystem fs ^Path p]
  (SimpleOutputStream. (.create fs p)))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Generates the class ybot.transfer.GlobPathLister, an implementation of
;; backtype.hadoop.PathLister. Its methods are backed by the functions in
;; this namespace whose names start with "lister-".
(gen-class
  :name ybot.transfer.GlobPathLister
  :prefix "lister-"
  :implements [backtype.hadoop.PathLister])
(defn lister-getFiles
  "gen-class implementation of getFiles: expands `p` with
  FileSystem.globStatus on `fs` and returns a lazy sequence with one
  entry per matching status, each entry being a one-element vector
  holding that status's path. `this` (the generated instance) is unused."
  [this ^FileSystem fs ^Path p]
  ;; NOTE(review): each path is wrapped in its own vector; confirm the
  ;; consumer of PathLister.getFiles expects that rather than bare paths.
  (map (fn [file-status] [(.getPath file-status)])
       (.globStatus fs p)))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn generate-others
  "Builds a query that emits [?ngram ?other ?kw] tuples.

  Keywords come from the \"!kw\" field of (gadata-tap root); empty-string
  keywords are dropped. Each keyword is passed to `other-ngrams` together
  with the constant 4, which yields ?ngram / ?other pairs. ?ngram is also
  bound by the \"?ngram\" field of (kw-stats-tap root), so that source acts
  as a join filter on the generated ngrams."
  [root]
  (let [known-ngrams (select-fields (kw-stats-tap root) "?ngram")
        keywords     (select-fields (gadata-tap root) ["!kw"])]
    (<- [?ngram ?other ?kw]
        (keywords ?kw)
        ;; join on kw-stats to filter
        (known-ngrams ?ngram)
        (not= ?kw "")
        ;; NOTE(review): 4 is presumably a maximum ngram size; confirm
        ;; against the definition of other-ngrams.
        (other-ngrams ?kw 4 :> ?ngram ?other))))
(defn generate-other-words [output-tap root date] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn generate-other-words [output-tap root date] | |
(let [others (generate-others root) | |
landings (landings-by-kw root) | |
sq (<- [?ngram ?other ?total-l ?total-pv ?total-b ?total-nv ?total-rv | |
?b-rate ?nv-rate ?rv-rate ?avg-depth] | |
(others ?ngram ?other ?kw) | |
(landings ?kw ?l ?pv ?b ?nv ?rv) | |
(ybot-stats ?l ?pv ?b ?nv ?rv :> | |
?total-l ?total-pv ?total-b ?total-nv ?total-rv | |
?b-rate ?nv-rate ?rv-rate ?avg-depth))] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns ybot.hadoop.pail | |
(:use cascalog.api | |
[cascalog.io :only (with-fs-tmp)]) | |
(:import [backtype.cascading.tap PailTap PailTap$PailTapOptions] | |
[backtype.hadoop.pail Pail])) | |
(defn- pail-tap | |
[path colls structure] | |
(let [seqs (into-array java.util.List colls) | |
spec (PailTap/makeSpec nil structure) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defn hfs-report | |
[path] | |
"Loads the log data from an HDFS path into Hbase." | |
(?<- (hbase-tap "urls" "?url-hash" "urls" "?url" "?crawl-date" | |
"?crawl-time" "?response-code" "?status" "?host") | |
[?url-hash ?url ?crawl-date ?crawl-time ?response-code ?status ?host] | |
((hfs-textline path) ?text) | |
(fetch-value-hash ?text :url :> ?url-hash) | |
(fetch-value ?text :url :> ?url) | |
(fetch-value ?text :crawl-date :> ?crawl-date) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Map operation: parses one JSON string (keys converted to keywords) and
;; emits a 23-field tuple holding the values of the listed keys, in the
;; fixed order shown below. A key absent from the parsed map yields nil in
;; its position.
(defmapop decode-ybtag
  [json]
  (let [record (json/parse-string json true)
        {:keys [psn co vi pvi si sts ts ln
                ce sd lo r ua la na np nc
                c_st g_C g_r g_c dma nv]} record]
    [psn co vi pvi si sts ts ln
     ce sd lo r ua la na np nc
     c_st g_C g_r g_c dma nv]))
(defn glob-ybtag-json [pattern] | |
(let [tap (globhfs-textline pattern)] | |
(<- [!psn !co !vi !pvi !si !sts !ts !ln !ce !sd !lo !r !ua !la !na !np !nc | |
!c_st !g_C !g_r !g_c !dma !nv] |