package forma.tap;
import backtype.hadoop.pail.PailStructure;
import java.util.Collections;
import java.util.List;
import org.apache.thrift.TBase;
import org.apache.thrift.TDeserializer;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
sorenmacbeth / gist:1529424
Created December 28, 2011 19:57 — forked from cmiles74/gist:1529376
Load data from a file (line by line) into HBase
(defn hfs-report
  "Loads the log data from an HDFS path into HBase."
  [path]
  (?<- (hbase-tap "urls" "?url-hash" "urls" "?url" "?crawl-date"
                  "?crawl-time" "?response-code" "?status" "?host")
       [?url-hash ?url ?crawl-date ?crawl-time ?response-code ?status ?host]
       ((hfs-textline path) ?text)
       (fetch-value-hash ?text :url :> ?url-hash)
       (fetch-value ?text :url :> ?url)
       (fetch-value ?text :crawl-date :> ?crawl-date)
       (fetch-value ?text :crawl-time :> ?crawl-time)
       (fetch-value ?text :response-code :> ?response-code)
       (fetch-value ?text :status :> ?status)
       (fetch-value ?text :host :> ?host)))
sorenmacbeth / bash prompt
Created October 14, 2011 02:56 — forked from luikore/bash prompt
lambda-like bash prompt with git / rvm hints
# MacPorts installs bash_completion in /opt/local
if [ -f /opt/local/etc/bash_completion ]; then
  . /opt/local/etc/bash_completion
fi

# show a '*' after the branch name when the working tree is dirty
export GIT_PS1_SHOWDIRTYSTATE=1
# show a '$' when something is stashed
export GIT_PS1_SHOWSTASHSTATE=1
# show a '%' when there are untracked files
# export GIT_PS1_SHOWUNTRACKEDFILES=1
export PS1='\[\e[32m\]λ \w\[\e[36m\]$(__git_ps1 " (%s)") [$(~/.rvm/bin/rvm-prompt i v)]\[\e[0m\]\n\[\e[32m\]→\[\e[0m\] '
(ns gist.globhfs
(:import [cascading.tap GlobHfs]))
;; ### Bucket to Cluster
;;
;; To get tuples back out of our directory structure on S3, we employ
;; Cascading's [GlobHfs](http://goo.gl/1Vwdo) tap, along with an
;; interface tailored for datasets stored in the MODIS sinusoidal
;; projection. For details on the globbing syntax, see
;; [here](http://goo.gl/uIEzu).
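;; Below is a minimal sketch (not part of the original gist) of how such a
;; tap might be constructed: GlobHfs wraps a scheme plus a Hadoop glob
;; pattern. The helper name, the TextLine scheme, and the s3n path layout
;; are assumptions for illustration.
(defn modis-glob-tap
  "Returns a GlobHfs tap over every MODIS tile directory (h??v??) under `basepath`."
  [basepath]
  (GlobHfs. (cascading.scheme.TextLine.) (str basepath "/h??v??/*")))

;; e.g. (modis-glob-tap "s3n://my-bucket/modisdata/1000-32")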
sorenmacbeth / gist:827971
Created February 15, 2011 18:37 — forked from michaelmontano/gist:535794
updated to whirr-0.3.0
diff -Naur whirr-0.3.0-incubating/contrib/python/src/py/hadoop/cloud/cli.py whirr-0.3.0-incubating-backtype/contrib/python/src/py/hadoop/cloud/cli.py
--- whirr-0.3.0-incubating/contrib/python/src/py/hadoop/cloud/cli.py 2011-01-15 23:03:44.000000000 -0800
+++ whirr-0.3.0-incubating-backtype/contrib/python/src/py/hadoop/cloud/cli.py 2011-02-15 11:51:49.000000000 -0800
@@ -296,7 +296,7 @@
opt.get('user_data_file'),
opt.get('availability_zone'), opt.get('user_packages'),
opt.get('auto_shutdown'), opt.get('env'),
- opt.get('security_group'))
+ opt.get('security_group'), opt.get('spot_price'))
service.launch_master(template, config_dir, opt.get('client_cidr'))
class TwitterUser
  def calculate_tunkrank(p=0.05)
    self.followers.inject(0.0) do |sum, follower|
      sum + ((1.0 + (p * follower.tunkrank_score)) / (1.0 + follower.num_friends))
    end
  end
end
SELECT 1.0 + SUM((1.0 + #{p} * tunkrank_score) / (1.0 + num_friends)) AS tunkrank_score
FROM twitter_users
INNER JOIN twitter_id_follows ON (twitter_users.twitter_id = twitter_id_follows.follower_twitter_id)
WHERE twitter_id_follows.user_twitter_id = #{twitter_id};
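For reference, both snippets above compute a TunkRank-style influence score. The recurrence they implement (reconstructed from the Ruby, with p the propagation parameter, 0.05 by default) is roughly:

\[ \mathrm{TunkRank}(X) = \sum_{Y \in \mathrm{Followers}(X)} \frac{1 + p \cdot \mathrm{TunkRank}(Y)}{1 + |\mathrm{Friends}(Y)|} \]

The Ruby reduces over a user's followers in exactly this way; the SQL computes the same sum over the follower join but adds a constant 1.0 outside the SUM.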
from redis import Redis
import simplejson
class Resque(object):
    """Dirt simple Resque client in Python. Can be used to create jobs."""

    redis_server = 'localhost:6379'

    def __init__(self):
        host, port = self.redis_server.split(':')
        self.redis = Redis(host=host, port=int(port))
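    # The gist preview ends here. As a hedged sketch (not from the original
    # gist), a job-creation method following the usual Resque convention
    # might look like the following: the payload is a JSON object with
    # "class" and "args", pushed onto the list resque:queue:<name>, and the
    # queue name is registered in the resque:queues set. The name `enqueue`
    # and the exact redis-py calls are assumptions.
    def enqueue(self, queue, klass, *args):
        """Create a job on `queue` invoking worker class `klass` with `args`."""
        payload = simplejson.dumps({'class': klass, 'args': args})
        self.redis.sadd('resque:queues', queue)                # register the queue
        self.redis.rpush('resque:queue:%s' % queue, payload)   # append the job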