Nick Pentreath (MLnick)

  • Automattic
  • Cape Town, South Africa
  • X @MLnick
MLnick / StreamingHLL.scala
Last active January 24, 2024 19:39
Spark Streaming meets Algebird's HyperLogLog Monoid
import spark.streaming.StreamingContext._
import spark.streaming.{Seconds, StreamingContext}
import spark.SparkContext._
import spark.storage.StorageLevel
import spark.streaming.examples.twitter.TwitterInputDStream
import com.twitter.algebird.HyperLogLog._
import com.twitter.algebird._
/**
 * Example of using HyperLogLog monoid from Twitter's Algebird together with Spark Streaming's
 * TwitterInputDStream
 */
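The preview ends at the gist boundary, but the core idea stands alone: Algebird's HLL values form a monoid, so per-element sketches merge associatively and can be reduced across batches. A minimal standalone sketch (bit width and inputs are illustrative, not from the gist):

import com.twitter.algebird.{HLL, HyperLogLogMonoid}

object HLLExample extends App {
  // 12 bits => 2^12 registers, roughly 1.6% standard error
  val hllMonoid = new HyperLogLogMonoid(12)
  val userIds = Seq("alice", "bob", "alice", "carol")
  // build one sketch per element, then merge them all with the monoid
  val merged: HLL = hllMonoid.sum(userIds.map(id => hllMonoid.create(id.getBytes("UTF-8"))))
  println(s"approx distinct users: ${merged.estimatedSize}")
}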
MLnick / StreamingCMS.scala
Created February 13, 2013 15:00
Spark Streaming with CountMinSketch from Twitter Algebird
import spark.streaming.{Seconds, StreamingContext}
import spark.storage.StorageLevel
import spark.streaming.examples.twitter.TwitterInputDStream
import com.twitter.algebird._
import spark.streaming.StreamingContext._
import spark.SparkContext._
/**
 * Example of using CountMinSketch monoid from Twitter's Algebird together with Spark Streaming's
 * TwitterInputDStream
 */
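The same monoid pattern applies here: Count-Min sketches merge with plus/sum, so per-batch frequency sketches reduce cleanly. A minimal standalone sketch using the generic CMS API of more recent Algebird releases (parameters are illustrative):

import com.twitter.algebird.CMS

object CMSExample extends App {
  // eps and delta control the sketch's width and depth; the seed fixes the hash functions
  val cmsMonoid = CMS.monoid[Long](0.001, 1e-8, 1)
  val itemIds = Seq(1L, 2L, 2L, 3L, 2L)
  // one sketch per observation, merged via the monoid
  val cms = cmsMonoid.sum(itemIds.map(cmsMonoid.create))
  println(s"approx count of item 2: ${cms.frequency(2L).estimate}")
}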
MLnick / MovieSimilarities.scala
Created April 1, 2013 17:49
Movie Similarities with Spark
import spark.SparkContext
import SparkContext._
/**
* A port of [[http://blog.echen.me/2012/02/09/movie-recommendations-and-more-via-mapreduce-and-scalding/]]
* to Spark.
* Uses movie ratings data from MovieLens 100k dataset found at [[http://www.grouplens.org/node/73]]
*/
object MovieSimilarities {
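The port's core computation is a pairwise similarity over movies' co-ratings. As a hypothetical helper (not lifted from the gist), cosine similarity over two movies' user-to-rating maps looks like this:

// rating vectors keyed by user id
def cosineSimilarity(a: Map[Int, Double], b: Map[Int, Double]): Double = {
  // only users who rated both movies contribute to the dot product
  val common = a.keySet intersect b.keySet
  val dot = common.iterator.map(u => a(u) * b(u)).sum
  val normA = math.sqrt(a.values.map(r => r * r).sum)
  val normB = math.sqrt(b.values.map(r => r * r).sum)
  if (normA == 0.0 || normB == 0.0) 0.0 else dot / (normA * normB)
}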
MLnick / HyperLogLogStoreUDAF.scala
Last active March 16, 2022 05:31
Experimenting with Spark SQL UDAF: a HyperLogLog UDAF for distinct counts that stores the actual HLL for each row to allow further aggregation
class HyperLogLogStoreUDAF extends UserDefinedAggregateFunction {

  override def inputSchema = new StructType()
    .add("stringInput", BinaryType)

  override def update(buffer: MutableAggregationBuffer, input: Row) = {
    // This input Row only has a single column storing the input value in String (or other Binary data).
    // We only update the buffer when the input value is not null.
    if (!input.isNullAt(0)) {
      if (buffer.isNullAt(0)) {
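The preview stops mid-method. A sketch of how the remaining UDAF methods could fit together, assuming the buffer holds the HLL serialized with Algebird's HyperLogLog.toBytes/fromBytes (an assumption, not necessarily the gist's exact approach):

import com.twitter.algebird.{HLL, HyperLogLog, HyperLogLogMonoid}
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

class HyperLogLogStoreUDAF extends UserDefinedAggregateFunction {
  private val hll = new HyperLogLogMonoid(12)

  override def inputSchema = new StructType().add("stringInput", BinaryType)
  override def bufferSchema = new StructType().add("hll", BinaryType)
  override def dataType: DataType = BinaryType
  override def deterministic = true

  override def initialize(buffer: MutableAggregationBuffer) = buffer.update(0, null)

  override def update(buffer: MutableAggregationBuffer, input: Row) = {
    if (!input.isNullAt(0)) {
      val incoming = hll.create(input.getAs[Array[Byte]](0))
      val merged =
        if (buffer.isNullAt(0)) incoming
        else hll.plus(HyperLogLog.fromBytes(buffer.getAs[Array[Byte]](0)), incoming)
      buffer.update(0, HyperLogLog.toBytes(merged))
    }
  }

  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row) = {
    if (!buffer2.isNullAt(0)) {
      if (buffer1.isNullAt(0)) {
        buffer1.update(0, buffer2.getAs[Array[Byte]](0))
      } else {
        val merged = hll.plus(
          HyperLogLog.fromBytes(buffer1.getAs[Array[Byte]](0)),
          HyperLogLog.fromBytes(buffer2.getAs[Array[Byte]](0)))
        buffer1.update(0, HyperLogLog.toBytes(merged))
      }
    }
  }

  // return the serialized HLL itself, so downstream queries can keep merging
  override def evaluate(buffer: Row) = buffer.get(0)
}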
$ sbt/sbt assembly/assembly
$ sbt/sbt examples/assembly
$ SPARK_CLASSPATH=examples/target/scala-2.10/spark-examples-1.1.0-SNAPSHOT-hadoop1.0.4.jar IPYTHON=1 ./bin/pyspark
...
14/06/03 15:34:11 INFO SparkUI: Started SparkUI at http://10.0.0.4:4040
Welcome to
      ____              __
     / __/__  ___ _____/ /__
MLnick / SQLTransformerWithJoin.scala
Created August 18, 2016 07:55
Using SQLTransformer to join DataFrames in ML Pipeline
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.0.0
      /_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_77)
Type in expressions to have them evaluated.
Type :help for more information.
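The preview captures only the shell banner. The technique rests on SQLTransformer's __THIS__ placeholder, which stands in for the DataFrame passed to transform(), so the statement can join it against a registered temp view. A minimal sketch in a spark-shell session (table and column names are illustrative):

import org.apache.spark.ml.feature.SQLTransformer

// a lookup table registered as a temp view so the transformer's SQL can see it
val lookup = spark.createDataFrame(Seq((1, "a"), (2, "b"))).toDF("id", "category")
lookup.createOrReplaceTempView("lookup")

// __THIS__ is replaced by whatever DataFrame is passed to transform()
val sqlTrans = new SQLTransformer().setStatement(
  "SELECT __THIS__.id, __THIS__.value, lookup.category " +
  "FROM __THIS__ JOIN lookup ON __THIS__.id = lookup.id")

val input = spark.createDataFrame(Seq((1, 10.0), (2, 20.0))).toDF("id", "value")
sqlTrans.transform(input).show()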
MLnick / onnx.pb
Last active October 17, 2019 11:39
graph {
  node {
    input: "X"
    input: "W"
    output: "Y"
    name: "matmult"
    op_type: "Mul"
  }
  input {
    name: "X"
MLnick / sklearn-lr-spark.py
Created February 4, 2013 14:29
SGD in Spark using Scikit-learn
import sys
from pyspark.context import SparkContext
from numpy import array, random as np_random
from sklearn import linear_model as lm
from sklearn.base import copy
N = 10000 # Number of data points
D = 10 # Number of dimensions
ITERATIONS = 5
MLnick / dask-ps.py
Created May 31, 2017 08:57
Dask Parameter Server - Initial WIP
# ==== dask-ps
import dask
import dask.array as da
from dask import delayed
from dask_glm import families
from dask_glm.algorithms import lbfgs
from distributed import LocalCluster, Client, worker_client
import numpy as np
import time
1. Error: gapply() and gapplyCollect() on a DataFrame (@test_sparkSQL.R#2569) --
org.apache.spark.SparkException: Job aborted due to stage failure: Task 114 in stage 957.0 failed 1 times, most recent failure: Lost task 114.0 in stage 957.0 (TID 13209, localhost, executor driver): org.apache.spark.SparkException: R computation failed with
[1] 1
[1] 3
[1] 2
[1][1] 1 2
[1] 3
[1] 2
[1] 2