Skip to content

Instantly share code, notes, and snippets.

@samklr
samklr / 0.setup.sh
Last active August 29, 2015 14:08 — forked from ceteri/0.setup.sh
# using four part files to construct "minitweet"
cat rawtweets/part-0000[1-3] > minitweets
# change log4j properties to WARN to reduce noise during demo
mv conf/log4j.properties.template conf/log4j.properties
vim conf/log4j.properties # Change to WARN
# launch Spark shell REPL
./bin/spark-shell
#! /bin/bash
sudo apt-get update
sudo apt-get install -yq git-core
git clone https://github.com/sstephenson/rbenv.git ~/.rbenv
echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bashrc
echo 'eval "$(rbenv init -)"' >> ~/.bashrc
# type rbenv ===> should print rbenv is a function
import com.twitter.algebird._
import HyperLogLog._
import com.twitter.algebird.Monoid
import com.twitter.algebird.DecayedValue
import com.twitter.algebird.Operators._
val hll = new HyperLogLogMonoid(4)
#! /bin/bash
sudo echo "deb http://ppa.launchpad.net/webupd8team/java/ubuntu precise main" | tee -a /etc/apt/sources.list
sudo echo "deb-src http://ppa.launchpad.net/webupd8team/java/ubuntu precise main" | tee -a /etc/apt/sources.list
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys EEA14886
sudo apt-get -y update
## auto accept oracle jdk license
sudo echo oracle-java8-installer shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections
sudo apt-get install -y oracle-java8-installer ca-certificates
#! /bin/bash
sudo apt-get -y update
sudo apt-get -y install git-core curl
sudo apt-key adv --keyserver keyserver.ubuntu.com --recv E56151BF
DISTRO=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
CODENAME=$(lsb_release -cs)
echo "deb http://repos.mesosphere.io/${DISTRO} ${CODENAME} main" | sudo tee /etc/apt/sources.list.d/mesosphere.list
@samklr
samklr / Main.scala
Last active August 29, 2015 14:09 — forked from guenter/Main.scala
import mesosphere.mesos.util.FrameworkInfo
import org.apache.mesos.MesosSchedulerDriver
/**
* @author Tobi Knaup
*/
object Main extends App {
import com.twitter.algebird._
import com.twitter.algebird.Operators._
// generate 2 lists
val A = (1 to 300).toList
val B = (201 to 400).toList
// Generate a Bloomfilter
val NUM_HASHES = 6
val WIDTH = 6000 // bits
// transcribed from an Apache Spark 1.0 spark-shell session
// using data from http://chriswhong.com/open-data/foil_nyc_taxi/
// and the QTree algorithm for approximate quantiles over large datasets
// each of the distanceRange and minutesRange calculations below takes about 15 minutes on my four-core SSD-based Macbook Pro
import com.twitter.algebird._
import com.twitter.algebird.Operators._
implicit val qtSemigroupD = new QTreeSemigroup[Double](6)
val in = sc.textFile("trip_data") // a directory containing all the trip_data*.csv files downloaded from the above link
import com.twitter.scalding._
import com.twitter.algebird.{ MinHasher, MinHasher32, MinHashSignature }
/**
* Computes similar items (with a string itemId), based on approximate
* Jaccard similarity, using LSH.
*
* Assumes an input data TSV file of the following format:
*
* itemId userId
#!/bin/bash
sudo add-apt-repository ppa:webupd8team/java -y && apt-get update && \
echo oracle-java7-installer shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections && \
apt-get install -y oracle-java8-installer
wget https://dl.bintray.com/sbt/debian/sbt-0.13.8.deb
sudo dpkg -i sbt-0.13.8.deb
rm sbt-0.13.8.deb