Skip to content

Instantly share code, notes, and snippets.

Compiling ybot.analytics.ga.aggregate
Exception in thread "main" java.lang.NoSuchMethodError: clojure.lang.RT.keyword(Ljava/lang/String;Ljava/lang/String;)Lclojure/lang/Keyword; (util.clj:5)
at clojure.lang.Compiler$InvokeExpr.eval(Compiler.java:2911)
at clojure.lang.Compiler.compile1(Compiler.java:5933)
at clojure.lang.Compiler.compile1(Compiler.java:5923)
at clojure.lang.Compiler.compile(Compiler.java:5992)
at clojure.lang.RT.compile(RT.java:368)
at clojure.lang.RT.load(RT.java:407)
at clojure.lang.RT.load(RT.java:381)
at clojure.core$load$fn__4519.invoke(core.clj:4915)
(defn lemmatize-text
"Apply a lucene tokenizer to cleaned text content as a lazy-seq"
[page-text]
(let [reader (java.io.StringReader. page-text)
analyzer (->
(resource-to-temp-file
"stanford_nlp_models/bidirectional-distsim-wsj-0-18.tagger"
".tagger")
(.getAbsolutePath)
(MaxentTagger.)
@sorenmacbeth
sorenmacbeth / bash prompt
Created October 14, 2011 02:56 — forked from luikore/bash prompt
lambda-like bash prompt with git / rvm hints
# Shell prompt setup: load bash completion when it is installed, turn on the
# git prompt state markers, and define a two-line lambda-style PS1.

# mac port installs bash_completion in /opt/local
if [ -f /opt/local/etc/bash_completion ]; then
    . /opt/local/etc/bash_completion
fi

# Marker characters rendered by __git_ps1 for each enabled setting.
# *
export GIT_PS1_SHOWDIRTYSTATE=1
# $
export GIT_PS1_SHOWSTASHSTATE=1
# %
# export GIT_PS1_SHOWUNTRACKEDFILES=1

# Line 1: green "λ <cwd>", then in cyan the __git_ps1 output and the output of
# `rvm-prompt i v` in brackets. Line 2: green "→". The string is single-quoted
# so the $(...) substitutions run each time the prompt is drawn, not here.
export PS1='\[\e[32m\]λ \w\[\e[36m\]$(__git_ps1 " (%s)") [$(~/.rvm/bin/rvm-prompt i v)]\[\e[0m\]\n\[\e[32m\]→\[\e[0m\] '
@sorenmacbeth
sorenmacbeth / gist:1340591
Created November 4, 2011 21:54
our query
(defn tokenize-strings
  "Executes a Cascalog query that reads text lines from `in-path`, applies
  `tokenize-string` to every line, and writes each [line token] pair to a
  textline sink at `out-path`. The sink is opened with :sinkmode :replace and
  the query runs with (:distinct false)."
  [in-path out-path]
  (let [lines (hfs-textline in-path)
        sink  (hfs-textline out-path :sinkmode :replace)]
    (?<- sink
         [!line ?token]
         (lines !line)
         (tokenize-string !line :> ?token)
         (:distinct false))))
@sorenmacbeth
sorenmacbeth / gist:1340642
Created November 4, 2011 22:23
a stateful defmapcatop
;; Stateful Cascalog mapcat operation that turns a text value into tokens.
;; The three arities follow the shape Cascalog expects for {:stateful true}
;; operations: build state, process one input with that state, clean up.
(defmapcatop tokenize-string {:stateful true}
  ;; No arguments: create the analyzer that is handed to the other arities.
  ([]
     (load-analyzer StandardAnalyzer/STOP_WORDS_SET))
  ;; State plus input: tokenize the text and emit the resulting tokens.
  ([lucene-analyzer text]
     (-> (tokenize-text lucene-analyzer text)
         emit-tokens))
  ;; State only: nothing is released here; returns nil.
  ([_] nil))
@sorenmacbeth
sorenmacbeth / gist:1340665
Created November 4, 2011 22:32
The rest of our utility functions
(defn tokenizer-seq
  "Returns a lazy sequence of the terms produced by `tokenizer`.

  Each realization step calls `.incrementToken` on the stream; while that
  returns true the current `.term` of `term-att` is captured and consed onto
  the (still unrealized) rest of the sequence. The sequence ends with nil as
  soon as `.incrementToken` returns false."
  [^TokenStream tokenizer ^TermAttribute term-att]
  (lazy-seq
    (if (.incrementToken tokenizer)
      ;; Read the term before the tail is realized: the attribute is advanced
      ;; again by the next .incrementToken call.
      (let [current-term (.term term-att)]
        (cons current-term (tokenizer-seq tokenizer term-att)))
      nil)))
(defn load-analyzer
  "Constructs a Lucene StandardAnalyzer for Version/LUCENE_CURRENT, passing
  `stopwords` (a java.util.Set) as its stop-word set."
  [^java.util.Set stopwords]
  (new StandardAnalyzer Version/LUCENE_CURRENT stopwords))
(ns ybot.analytics.edb
(:use [ybot bootstrap datastores])
(:use elephantdb.cascalog.core)
(:import [elephantdb.persistence JavaBerkDB]
[org.apache.hadoop.io BytesWritable]))
;; Runs bootstrap-ybot when this namespace is loaded.
;; NOTE(review): presumably supplied by the ybot bootstrap namespace pulled in
;; through :use in the ns form — confirm what it brings into scope.
(bootstrap-ybot)
(defn ser-long
  "Wraps the bytes of `(str val)` in a Hadoop BytesWritable.

  The value is serialized as its string form (decimal text for a long), not
  as a fixed-width binary number. The string is always encoded as UTF-8, so
  the resulting bytes do not depend on the JVM's platform default charset."
  [val]
  (BytesWritable. (.getBytes ^String (str val) "UTF-8")))
;; Cascalog map operation: parses a JSON string with keywordized keys and
;; returns a 23-element vector holding the values of the listed fields, in the
;; order given below. A field that is absent from the JSON yields nil.
(defmapop decode-ybtag [json]
  (let [field-keys [:psn :co :vi :pvi :si :sts :ts :ln :ce :sd :lo :r :ua :la
                    :na :np :nc :c_st :g_C :g_r :g_c :dma :nv]
        parsed     (json/parse-string json true)]
    ;; `get` rather than calling the parsed value as a function, so a nil or
    ;; non-map parse result produces nils instead of throwing.
    (vec (map (fn [k] (get parsed k)) field-keys))))
(defn glob-ybtag-json [pattern]
(let [tap (globhfs-textline pattern)]
(<- [!psn !co !vi !pvi !si !sts !ts !ln !ce !sd !lo !r !ua !la !na !np !nc
!c_st !g_C !g_r !g_c !dma !nv]
@sorenmacbeth
sorenmacbeth / gist:1529424
Created December 28, 2011 19:57 — forked from cmiles74/gist:1529376
Load data from a file (line by line) into HBase
(defn hfs-report
[path]
"Loads the log data from an HDFS path into Hbase."
(?<- (hbase-tap "urls" "?url-hash" "urls" "?url" "?crawl-date"
"?crawl-time" "?response-code" "?status" "?host")
[?url-hash ?url ?crawl-date ?crawl-time ?response-code ?status ?host]
((hfs-textline path) ?text)
(fetch-value-hash ?text :url :> ?url-hash)
(fetch-value ?text :url :> ?url)
(fetch-value ?text :crawl-date :> ?crawl-date)
(ns ybot.hadoop.pail
(:use cascalog.api
[cascalog.io :only (with-fs-tmp)])
(:import [backtype.cascading.tap PailTap PailTap$PailTapOptions]
[backtype.hadoop.pail Pail]))
(defn- pail-tap
[path colls structure]
(let [seqs (into-array java.util.List colls)
spec (PailTap/makeSpec nil structure)