Nick Pentreath MLnick

  • Automattic
  • Cape Town, South Africa
  • X @MLnick
@MLnick
MLnick / config
Created May 16, 2014 16:59
Elasticsearch / Spark / Shark setup
Elasticsearch : "1.1.1"
Spark : "0.9.1-hadoop1"
Shark : "0.9.1-hadoop1"
elasticsearch-hadoop-hive : "elasticsearch-hadoop-hive-2.0.0.RC1.jar"
elasticsearch-hadoop : "2.0.0.RC1"
- Spark using ESInputFormat works fine. However, the type returned for the "date" field ("_ts") is Text. I convert it via toString then toLong to get the timestamp, which I can then use as I wish within Spark.
- Shark returns NULL for the timestamp field. There is nothing unusual about the timestamps themselves: I can access them in Spark (as Text), and I can do date math on that field in Elasticsearch queries. The same queries also return NULLs on EC2, so it's not just my machine.
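A minimal sketch (not from the gist) of the Text-to-Long conversion described above, assuming esRDD is the (Text, MapWritable) pair RDD obtained from ESInputFormat:

import org.apache.hadoop.io.{MapWritable, Text}

val timestamps = esRDD.map { case (_, doc: MapWritable) =>
  doc.get(new Text("_ts")).toString.toLong   // Text value -> String -> epoch timestamp
}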
@MLnick
MLnick / fromAvro.scala
Created September 18, 2014 07:24
Spark 1.1: Bringing Hadoop Input/Output Formats to PySpark
def fromAvro(obj: Any, schema: Schema): Any = {
  if (obj == null) {
    return null
  }
  schema.getType match {
    case UNION => unpackUnion(obj, schema)
    case ARRAY => unpackArray(obj, schema)
    case FIXED => unpackFixed(obj, schema)
    case MAP => unpackMap(obj, schema)
    case BYTES => unpackBytes(obj)
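The preview cuts off before the unpack helpers. As one illustration (a sketch, not the gist's actual code), a UNION value can be unpacked by resolving which branch the value belongs to and recursing with that branch's schema:

// Hypothetical helper, not from the gist: Avro's GenericData picks the matching
// union branch, and the value is unpacked against that branch's schema.
import org.apache.avro.generic.GenericData

def unpackUnion(obj: Any, schema: Schema): Any = {
  val branch = schema.getTypes.get(GenericData.get().resolveUnion(schema, obj))
  fromAvro(obj, branch)
}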
@MLnick
MLnick / logstash.txt
Created October 16, 2014 10:07
Logstash crash
Failed to flush outgoing items {:outgoing_count=>1, :exception=>#<SocketError: initialize: java.net.SocketException: Too many open files>, :backtrace=>["org/jruby/ext/socket/RubySocket.java:190:in `initialize'", "org/jruby/RubyIO.java:852:in `new'", "/home/ec2-user/logstash-1.4.2/vendor/bundle/jruby/1.9/gems/ftw-0.0.39/lib/ftw/connection.rb:146:in `connect'", "org/jruby/RubyArray.java:1613:in `each'", "/home/ec2-user/logstash-1.4.2/vendor/bundle/jruby/1.9/gems/ftw-0.0.39/lib/ftw/connection.rb:139:in `connect'", "/home/ec2-user/logstash-1.4.2/vendor/bundle/jruby/1.9/gems/ftw-0.0.39/lib/ftw/agent.rb:406:in `connect'", "org/jruby/RubyProc.java:271:in `call'", "/home/ec2-user/logstash-1.4.2/vendor/bundle/jruby/1.9/gems/ftw-0.0.39/lib/ftw/pool.rb:48:in `fetch'", "/home/ec2-user/logstash-1.4.2/vendor/bundle/jruby/1.9/gems/ftw-0.0.39/lib/ftw/agent.rb:403:in `connect'", "/home/ec2-user/logstash-1.4.2/vendor/bundle/jruby/1.9/gems/ftw-0.0.39/lib/ftw/agent.rb:319:in `execute'", "/home/ec2-user/logstash-1.4.2/vendor/bund
@MLnick
MLnick / call_graphflow_shortcode_in_snippet.php
Last active August 29, 2015 14:16
Custom Graphflow Shortcode using Code Snippets Plugin
add_shortcode('graphflow_shortcode_demo', 'graphflow_shortcode_demo_display');
function graphflow_shortcode_demo_display($attr) {
    ob_start();
    if ( isset( $_REQUEST['cat'] ) ) {
        $cat_id = $_REQUEST['cat'];
        $term = get_term_by( 'id', $cat_id, 'product_cat', 'ARRAY_A' );
        $cat_name = $term['name'];
        echo do_shortcode( '[graphflow_recommendations title="Recommended for you in ' . $cat_name . '" columns=4 per_page=4 product_cat=' . $cat_id . ']' );
    } else {
        echo do_shortcode( '[graphflow_recommendations columns=4 per_page=4]' );
    }
    return ob_get_clean();
}
@MLnick
MLnick / SparkML.scala
Last active December 20, 2015 04:09
Spark Machine Learning API Design Notes
// An Example is an observation with an optional target value and features in the form of a vector of Doubles
case class Example(target: Option[Double] = None, features: Vector[Double])

// Base model API looks something like:
abstract class BaseModel(val modelSettings: Settings)
  extends Serializable
  with Logging {

  def fit(data: RDD[Example])
}

trait Configured {
  val config = ConfigFactory.load().getConfig("spark")
}

object ConfigUtils {
  def asOption[T](t: => T): Option[T] = {
    try {
      Option(t)
    } catch {
      case e: ConfigException.Missing => None
    }
  }
}
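A small usage sketch of the pieces above (the config key "model.maxIterations" is made up for illustration): mixing in Configured gives a handle on the "spark" config block, and asOption turns a missing key into None instead of an exception.

object TrainerSettings extends Configured {
  // None if "spark.model.maxIterations" is absent, Some(n) if present
  val maxIterations: Option[Int] = ConfigUtils.asOption(config.getInt("model.maxIterations"))
}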
@MLnick
MLnick / JavaSparkContext.scala
Last active December 26, 2015 12:19
PySpark read arbitrary Hadoop InputFormats
def newHadoopFileAsText[K, V, F <: NewInputFormat[K, V]](
    path: String,
    inputFormatClazz: String,
    keyClazz: String,
    valueClazz: String,
    delimiter: String): JavaRDD[String] = {
  implicit val kcm = ClassManifest.fromClass(Class.forName(keyClazz))
  implicit val vcm = ClassManifest.fromClass(Class.forName(valueClazz))
  implicit val fcm = ClassManifest.fromClass(Class.forName(inputFormatClazz))
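The method body is cut off above; a hypothetical call might look like the following (the path, delimiter, and choice of TextInputFormat are illustrative only), supplying the Hadoop classes both as type parameters and as class-name strings:

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat

val lines: JavaRDD[String] = newHadoopFileAsText[LongWritable, Text, TextInputFormat](
  "hdfs:///data/events/part-*",
  "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
  "org.apache.hadoop.io.LongWritable",
  "org.apache.hadoop.io.Text",
  "\t")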
@MLnick
MLnick / output
Last active December 30, 2015 20:29
PySpark: svmlight Hadoop text files -> RDD[sparse vectors]
13/12/09 23:01:02 INFO spark.SparkContext: Job finished: runJob at PythonRDD.scala:288, took 0.175237 s
Count raw: 683; Count svml: 683
Raw sample: [u'2.000000 1:1000025.000000 2:5.000000 3:1.000000 4:1.000000 5:1.000000 6:2.000000 7:1.000000 8:3.000000 9:1.000000 10:1.000000', u'2.000000 1:1002945.000000 2:5.000000 3:4.000000 4:4.000000 5:5.000000 6:7.000000 7:10.000000 8:3.000000 9:2.000000 10:1.000000', u'2.000000 1:1015425.000000 2:3.000000 3:1.000000 4:1.000000 5:1.000000 6:2.000000 7:2.000000 8:3.000000 9:1.000000 10:1.000000', u'2.000000 1:1016277.000000 2:6.000000 3:8.000000 4:8.000000 5:1.000000 6:3.000000 7:4.000000 8:3.000000 9:7.000000 10:1.000000', u'2.000000 1:1017023.000000 2:4.000000 3:1.000000 4:1.000000 5:3.000000 6:2.000000 7:1.000000 8:3.000000 9:1.000000 10:1.000000', u'4.000000 1:1017122.000000 2:8.000000 3:10.000000 4:10.000000 5:8.000000 6:7.000000 7:10.000000 8:9.000000 9:7.000000 10:1.000000', u'2.000000 1:1018099.000000 2:1.000000 3:1.000000 4:1.000000 5:1.000000 6:2.000000 7
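The raw sample above is svmlight-formatted text: a label followed by index:value pairs. A minimal sketch, not the gist's PySpark code, of parsing one such line into a label and a sparse index-to-value map:

// Parse e.g. "2.000000 1:1000025.000000 2:5.000000 ..." into (label, sparse features)
def parseSvmlightLine(line: String): (Double, Map[Int, Double]) = {
  val tokens = line.trim.split("\\s+")
  val label = tokens.head.toDouble
  val features = tokens.tail.map { t =>
    val Array(index, value) = t.split(":")
    index.toInt -> value.toDouble
  }.toMap
  (label, features)
}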
package example
import org.apache.avro.Schema.Parser
import java.io.{DataInput, DataOutput, File}
import org.apache.avro.generic.GenericData.Record
import org.apache.avro.generic.{GenericRecord, GenericDatumWriter}
import org.apache.avro.file.DataFileWriter
import org.apache.spark.SparkContext
import org.apache.avro.mapreduce.AvroKeyInputFormat
import org.apache.avro.mapred.AvroKey
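The gist body is cut off after the imports; they suggest the standard Avro write pattern, sketched below with a made-up schema, field name, and output path (not the gist's actual code):

// Hypothetical sketch: write one GenericRecord to an Avro data file using the classes imported above.
val schema = new Parser().parse(
  """{"type": "record", "name": "User", "fields": [{"name": "name", "type": "string"}]}""")
val record = new Record(schema)
record.put("name", "alice")

val writer = new DataFileWriter[GenericRecord](new GenericDatumWriter[GenericRecord](schema))
writer.create(schema, new File("/tmp/users.avro"))
writer.append(record)
writer.close()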
@MLnick
MLnick / JavaSparkContext.scala
Last active November 29, 2016 06:28
PySpark / Hadoop InputFormat interop
// Python RDD creation functions //

// SequenceFile converted to Text and then to String
def sequenceFileAsText(path: String) = {
  implicit val kcm = ClassManifest.fromClass(classOf[Text])
  implicit val fcm = ClassManifest.fromClass(classOf[SequenceFileAsTextInputFormat])
  new JavaPairRDD(sc
    .newAPIHadoopFile[Text, Text, SequenceFileAsTextInputFormat](path)
    .map { case (k, v) => (k.toString, v.toString) })
}
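A hypothetical call (the path is made up for illustration) showing the shape of the result: a pair RDD of plain Strings rather than Writables.

val pairs: JavaPairRDD[String, String] = sequenceFileAsText("hdfs:///data/events.seq")
pairs.take(5)   // first five (key, value) string pairs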