Skip to content

Instantly share code, notes, and snippets.

(ns ybot.analytics.edb
(:use [ybot bootstrap datastores])
(:use elephantdb.cascalog.core)
(:import [elephantdb.persistence JavaBerkDB]
[org.apache.hadoop.io BytesWritable]))
(bootstrap-ybot)
(defn ser-long
  "Serialize `val` as a BytesWritable holding the bytes of its string form
  (the result of `(str val)`).

  The string is encoded explicitly as UTF-8 so the produced bytes do not
  depend on the JVM's platform-default charset."
  [val]
  (BytesWritable. (.getBytes (str val) "UTF-8")))
@sorenmacbeth
sorenmacbeth / gist:1340665
Created November 4, 2011 22:32
The rest of our utility functions
(defn tokenizer-seq
  "Lazily walk `tokenizer`: each successful `.incrementToken` contributes the
  current value of `(.term term-att)` to the resulting seq, and the seq ends
  as soon as `.incrementToken` returns false."
  [^TokenStream tokenizer ^TermAttribute term-att]
  (letfn [(remaining []
            (lazy-seq
              (if-not (.incrementToken tokenizer)
                nil
                ;; Read the term right after advancing, before the rest of
                ;; the (still unrealized) seq is touched.
                (cons (.term term-att) (remaining)))))]
    (remaining)))
(defn load-analyzer
  "Construct a StandardAnalyzer for Version/LUCENE_CURRENT that uses the
  given `stopwords` set."
  [^java.util.Set stopwords]
  (new StandardAnalyzer Version/LUCENE_CURRENT stopwords))
@sorenmacbeth
sorenmacbeth / gist:1340642
Created November 4, 2011 22:23
a stateful defmapcatop
;; Stateful mapcat operation that turns a piece of text into tokens.
;; The three arities share one analyzer value:
;;   []              builds the analyzer from StandardAnalyzer's stop-word set
;;   [analyzer text] tokenizes `text` with it and hands the result to emit-tokens
;;   [analyzer]      returns nil (nothing is released here)
(defmapcatop tokenize-string {:stateful true}
  ([]
     (load-analyzer StandardAnalyzer/STOP_WORDS_SET))
  ([analyzer text]
     (->> text
          (tokenize-text analyzer)
          (emit-tokens)))
  ([analyzer]
     nil))
@sorenmacbeth
sorenmacbeth / gist:1340591
Created November 4, 2011 21:54
our query
(defn tokenize-strings
  "Run a query that reads text lines from `in-path`, applies `tokenize-string`
  to every line, and writes the resulting [line token] tuples as text lines to
  `out-path` (sink mode :replace). The query is declared with
  `(:distinct false)`."
  [in-path out-path]
  (let [lines (hfs-textline in-path)
        sink  (hfs-textline out-path :sinkmode :replace)]
    (?<- sink
         [!line ?token]
         (lines !line)
         (tokenize-string !line :> ?token)
         (:distinct false))))
@sorenmacbeth
sorenmacbeth / bash prompt
Created October 14, 2011 02:56 — forked from luikore/bash prompt
lambda-like bash prompt with git / rvm hints
# mac port installs bash_completion in /opt/local
# Everything below is only configured when that completion file exists.
# NOTE(review): __git_ps1 (used in PS1) is presumably defined by the sourced
# completion scripts -- confirm.
if [ -f /opt/local/etc/bash_completion ]; then
    . /opt/local/etc/bash_completion
    # Show "*" (unstaged) / "+" (staged) markers for a dirty working tree.
    export GIT_PS1_SHOWDIRTYSTATE=1
    # Show "$" when something is stashed.
    export GIT_PS1_SHOWSTASHSTATE=1
    # Show "%" when there are untracked files (currently disabled).
    # export GIT_PS1_SHOWUNTRACKEDFILES=1
    # Two-line prompt: "λ <cwd> (<git state>) [<rvm interpreter version>]", then "→ ".
    export PS1='\[\e[32m\]λ \w\[\e[36m\]$(__git_ps1 " (%s)") [$(~/.rvm/bin/rvm-prompt i v)]\[\e[0m\]\n\[\e[32m\]→\[\e[0m\] '
fi
;; NOTE(review): this excerpt ends in the middle of the `let` bindings; the
;; remainder of the definition is not visible here.
(defn lemmatize-text
"Apply a lucene tokenizer to cleaned text content as a lazy-seq"
[page-text]
;; `reader` wraps the incoming text in a java.io.StringReader.
(let [reader (java.io.StringReader. page-text)
;; Despite its name, `analyzer` is bound to a MaxentTagger: the result of
;; resource-to-temp-file is turned into an absolute path and passed to the
;; MaxentTagger constructor.
;; NOTE(review): presumably resource-to-temp-file copies the named resource
;; to a temp file with the given suffix -- confirm against its definition.
analyzer (->
(resource-to-temp-file
"stanford_nlp_models/bidirectional-distsim-wsj-0-18.tagger"
".tagger")
(.getAbsolutePath)
(MaxentTagger.)
Compiling ybot.analytics.ga.aggregate
Exception in thread "main" java.lang.NoSuchMethodError: clojure.lang.RT.keyword(Ljava/lang/String;Ljava/lang/String;)Lclojure/lang/Keyword; (util.clj:5)
at clojure.lang.Compiler$InvokeExpr.eval(Compiler.java:2911)
at clojure.lang.Compiler.compile1(Compiler.java:5933)
at clojure.lang.Compiler.compile1(Compiler.java:5923)
at clojure.lang.Compiler.compile(Compiler.java:5992)
at clojure.lang.RT.compile(RT.java:368)
at clojure.lang.RT.load(RT.java:407)
at clojure.lang.RT.load(RT.java:381)
at clojure.core$load$fn__4519.invoke(core.clj:4915)
Caused by: java.lang.NoSuchMethodError: clojure.lang.RT.keyword(Ljava/lang/String;Ljava/lang/String;)Lclojure/lang/Keyword;
at cascalog.ops__init.__init0(Unknown Source)
at cascalog.ops__init.<clinit>(Unknown Source)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:247)
at clojure.lang.RT.loadClassForName(RT.java:1578)
at clojure.lang.RT.load(RT.java:399)
at clojure.lang.RT.load(RT.java:381)
at clojure.core$load$fn__4519.invoke(core.clj:4915)
at clojure.core$load.doInvoke(core.clj:4914)
(ns hbase.cascalog.core
(:require [cascalog.workflow :as w])
(:import [cascading.hbase HBaseTap HBaseScheme ByteHolder]
[cascading.tuple Fields]
org.apache.hadoop.hbase.util.Bytes))
(defn hbase-tap
  "Build an HBaseTap for `table-name`.

  The tap's HBaseScheme is created from `key-field`, `column-family` and any
  remaining `value-fields`; the key and value field names are converted with
  `w/fields` first."
  [table-name key-field column-family & value-fields]
  (let [key-fields (w/fields key-field)
        val-fields (w/fields value-fields)]
    (HBaseTap. table-name
               (HBaseScheme. key-fields column-family val-fields))))
;; Checks expand-fields against four one-column input rows: a URL whose host
;; contains a literal "...", a google.com search URL with a q parameter, a
;; URL without a query string, and an empty string.
;; NOTE(review): presumably expand-fields derives a referrer host and a search
;; keyword from the input string -- confirm against its definition.
(let [src [["http://www.google.co.in.../search?hl=en&source=hp&q=farewell quotes&meta=&oq=farewell &aq=0&aqi=g10&aql=&gs_sm=c&gs_upl=1235l2985l0l6500l9l9l0l2l2l0l234l1451l0.2.5l7l0"]
["http://www.google.com/search?q=farewell%20quotes"]
["http://www.dopeness.org/foo"]
[""]]]
(fact?<-
;; Expected tuples, written in the order of the output vars below ([!ref !kw]).
[[nil nil] ["www.google.com" "farewell quotes"] ["www.dopeness.org" ""] ["(direct)" ""]]
[!ref !kw]
(src ?r)
;; expand-fields emits (!kw !ref); the output vector above swaps them.
(expand-fields ?r :> !kw !ref)))