samklr / Kmeans.scala
Created May 4, 2013 18:17
Kmeans forked
class Point(dx: Double, dy: Double) {
  val x: Double = dx
  val y: Double = dy

  override def toString(): String = "(" + x + ", " + y + ")"

  // Euclidean distance to another point (the preview's version returned a dot product).
  def dist(p: Point): Double =
    math.sqrt((x - p.x) * (x - p.x) + (y - p.y) * (y - p.y))
}
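The gist preview stops at the Point class. A K-means iteration built on Point.dist might look roughly like this (a sketch, not the gist's actual code; the helper names are illustrative):

// Assign each point to its nearest centroid, then recompute each centroid
// as the mean of the points assigned to it (one K-means iteration).
// Note: centroids that attract no points are dropped in this sketch.
def nearest(p: Point, centroids: Seq[Point]): Int =
  centroids.indices.minBy(i => p.dist(centroids(i)))

def step(points: Seq[Point], centroids: Seq[Point]): Seq[Point] =
  points.groupBy(p => nearest(p, centroids)).toSeq.sortBy(_._1).map {
    case (_, cluster) =>
      new Point(cluster.map(_.x).sum / cluster.size, cluster.map(_.y).sum / cluster.size)
  }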
samklr / PortfolioSelection.scala
Created May 25, 2013 11:43
PortfolioSelection
import com.twitter.scalding._
import cern.colt.matrix.{DoubleFactory2D, DoubleFactory1D }
import cern.colt.matrix.linalg.Algebra
import java.util.StringTokenizer
class Portfolios(args: Args) extends Job(args) {
  val cash = 1000.0   // money at hand
  val error = 1       // it's OK if we cannot invest the last dollar
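  // The preview cuts off here. The rest of the job presumably searches for share
  // allocations whose total cost stays within `error` of `cash`. As a plain-Scala
  // illustration of that constraint (made-up prices and a brute-force search,
  // not the gist's Scalding pipeline):
  val prices = Map("AAPL" -> 450.0, "IBM" -> 200.0)   // hypothetical prices
  val feasible = for {
    a <- 0 to (cash / prices("AAPL")).toInt
    b <- 0 to (cash / prices("IBM")).toInt
    cost = a * prices("AAPL") + b * prices("IBM")
    if cash - cost >= 0 && cash - cost <= error
  } yield (a, b, cost)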
samklr / MRCountWord.java
Last active December 17, 2015 18:19
Hadoop MR Count Word
public class CountWordsMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
  private final static IntWritable one = new IntWritable(1);
  private final Text word = new Text();
  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    StringTokenizer itr = new StringTokenizer(value.toString());
    // Emit each token with a count of 1; a reducer sums the counts per word.
    while (itr.hasMoreTokens()) {
      word.set(itr.nextToken());
      context.write(word, one);
    }
  }
}
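The preview ends before any reducer. A standard companion reducer for this mapper (kept in Java to match it; a sketch of the usual word-count pattern, not necessarily the gist's code):

public class CountWordsReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
  @Override
  public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    // Sum the per-occurrence counts emitted by the mapper for this word.
    int sum = 0;
    for (IntWritable v : values) {
      sum += v.get();
    }
    context.write(key, new IntWritable(sum));
  }
}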
class WordCountJob(args: Args) extends Job(args) {
  TextLine(args("input"))
    .flatMap('line -> 'word) { line: String => tokenize(line) }
    .groupBy('word) { _.size }
    .write(Tsv(args("output")))

  // Helper referenced above but missing from the preview: lowercase and split into words.
  def tokenize(text: String): Array[String] =
    text.toLowerCase.replaceAll("[^a-zA-Z0-9\\s]", "").split("\\s+")
}
With Scalding's typed API this becomes:

class WordCountJob(args: Args) extends Job(args) {
  val lines: TypedPipe[String] = TextLine(args("input"))
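  // The gist preview stops at the line above. A possible continuation (a sketch,
  // not the gist's actual code): tokenize, count occurrences per word, write as TSV.
  lines
    .flatMap(_.toLowerCase.split("\\s+"))
    .filter(_.nonEmpty)
    .groupBy(identity)
    .size
    .toTypedPipe
    .write(TypedTsv[(String, Long)](args("output")))
}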
class StocksDividendsJoin4(args: Args) extends Job(args) {
  val stockSchema =
    ('symd, 'price_open, 'price_high, 'price_low, 'price_close, 'volume, 'price_adj_close)
  val dividendsSchema = ('dymd, 'dividend)

  val stocksPipe = new Csv(args("stocks"), stockSchema).read.project('symd, 'price_close)
  val dividendsPipe = new Csv(args("dividends"), dividendsSchema).read
  /*
   * Inner join! Use the "tiny" variant that attempts to replicate the dividend table
   * to all nodes for faster joining.
   * Then suppress the extra ymd and specify the remaining fields in the desired order.
   */
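  // The preview ends with the comment above; given it, the join presumably looks
  // roughly like this (a sketch using Scalding's joinWithTiny, not the gist's exact code):
  stocksPipe
    .joinWithTiny('symd -> 'dymd, dividendsPipe)
    .project('symd, 'price_close, 'dividend)
    .write(Tsv(args("output")))
}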
class StocksDividendsRevisited8(args: Args) extends Job(args) {
  val stocksSchema =
    ('symd, 'price_open, 'price_high, 'price_low, 'price_close, 'volume, 'price_adj_close)
  val dividendsSchema = ('dymd, 'dividend)

  def startPipe(rootPath: String, symbol: String, beforeSchema: Fields, afterSchema: Fields) =
    new Csv(rootPath + "/" + symbol + ".csv", beforeSchema)
      .read
      .write(Tsv("output/dump1"))
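  // Hypothetical usage, not shown in the preview (the argument names are illustrative):
  // build one pipe per input file, e.g. stocks and dividends for a given symbol.
  val stocksPipe    = startPipe(args("stocks-root-path"), args("symbol"), stocksSchema, ('symd, 'price_close))
  val dividendsPipe = startPipe(args("dividends-root-path"), args("symbol"), dividendsSchema, dividendsSchema)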
object WordCount extends PipelineApp {
  def ScrunchWordCount(file: String) = {
    read(from.textFile(file))
      .flatMap(_.split("\\W+").filter(_.nonEmpty))   // drop empty tokens
      .count
  }
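  // The preview ends here. A plausible continuation using PipelineApp's helpers
  // (a sketch; the argument indices are illustrative):
  val counts = ScrunchWordCount(args(0))
  write(counts, to.textFile(args(1)))
}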
// This F# dojo is directly inspired by the Digit Recognizer
// competition from Kaggle.com:
// http://www.kaggle.com/c/digit-recognizer
// The datasets below are simply shorter versions of
// the training dataset from Kaggle.
// 0. Load data files from the following location:
// http://brandewinder.blob.core.windows.net/public/digitssample.csv
// http://brandewinder.blob.core.windows.net/public/digitscheck.csv
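The dojo itself is meant to be solved in F#. Purely to illustrate the data layout (assuming the Kaggle digit-recognizer format: a header row, then one label followed by the pixel values on each line), a rough parsing sketch in Scala:

// Sketch: read the CSV, drop the header, split each line into a label and its pixels.
case class Digit(label: Int, pixels: Array[Int])

def loadDigits(path: String): Seq[Digit] =
  scala.io.Source.fromFile(path).getLines().drop(1).map { line =>
    val cols = line.split(",").map(_.trim.toInt)
    Digit(cols.head, cols.tail)
  }.toSeq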
This is how we create a range of Ints with `until` (the lower bound is included, the upper bound is excluded):
val ie = 0 until 500
//1. Fill in the missing item to create a range of Ints from 1 to 100 inclusive
val ints = 1 ??? 100
//2. Find the sum of the integers in this range
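For reference, the two exercises resolve to `to` (inclusive of both ends) and `sum`:

val ints = 1 to 100    // Range from 1 to 100, inclusive
val total = ints.sum   // 5050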
object TradingDays extends App {
  import scalaz._
  import Scalaz._

  case class Trade(sym: String, trader: String, qty: Int)
  case class TradingDay(symbols: Map[String, SymDay] = Map.empty)

  object TradingDay {
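    // The preview stops here. In the usual version of this example the companion
    // supplies a Monoid so trading days can be merged; roughly (a sketch that
    // assumes a Monoid[SymDay] defined elsewhere in the gist, not the author's exact code):
    implicit def tradingDayMonoid(implicit m: Monoid[SymDay]): Monoid[TradingDay] =
      new Monoid[TradingDay] {
        val zero = TradingDay()
        def append(a: TradingDay, b: => TradingDay): TradingDay =
          TradingDay(a.symbols |+| b.symbols)
      }
  }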