
[root@sandbox conf.empty]# hadoop fs -ls wasb://andypcontainer@maprazuretest.blob.core.windows.net/
15/03/23 21:23:25 DEBUG lib.MutableMetricsFactory: field org.apache.hadoop.metrics2.lib.MutableRate org.apache.hadoop.security.UserGroupInformation$UgiMetrics.loginSuccess with annotation @org.apache.hadoop.metrics2.annotation.Metric(value=[Rate of successful kerberos logins and latency (milliseconds)], about=, valueName=Time, type=DEFAULT, always=false, sampleName=Ops)
15/03/23 21:23:25 DEBUG lib.MutableMetricsFactory: field org.apache.hadoop.metrics2.lib.MutableRate org.apache.hadoop.security.UserGroupInformation$UgiMetrics.loginFailure with annotation @org.apache.hadoop.metrics2.annotation.Metric(value=[Rate of failed kerberos logins and latency (milliseconds)], about=, valueName=Time, type=DEFAULT, always=false, sampleName=Ops)
15/03/23 21:23:25 DEBUG lib.MutableMetricsFactory: field org.apache.hadoop.metrics2.lib.MutableRate org.apache.hadoop.security.UserGroupInformation$UgiMetrics.getGroups with annotati
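Listing a `wasb://` path like the one above only works once the Azure storage account key is configured for the `hadoop-azure` filesystem in `core-site.xml`. A minimal sketch, using the account name from the URI above (the key value is a placeholder):

```xml
<property>
  <name>fs.azure.account.key.maprazuretest.blob.core.windows.net</name>
  <value>YOUR_STORAGE_ACCOUNT_KEY</value>
</property>
```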
#Twitter Drill demo
Install Flume:
yum install -y mapr-flume
cd /opt/mapr/flume/flume-1.5.0/conf
cp flume-env.sh.template flume-env.sh
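After copying the env template, the agent still needs a configuration file. A minimal sketch of a Twitter agent, assuming the experimental `TwitterSource` that ships with Flume 1.5, a simple `file_roll` sink, and placeholder names/credentials throughout:

```
# twitter.conf -- hypothetical agent name, paths, and credentials
agent1.sources = twitter1
agent1.channels = mem1
agent1.sinks = roll1

agent1.sources.twitter1.type = org.apache.flume.source.twitter.TwitterSource
agent1.sources.twitter1.consumerKey = YOUR_CONSUMER_KEY
agent1.sources.twitter1.consumerSecret = YOUR_CONSUMER_SECRET
agent1.sources.twitter1.accessToken = YOUR_ACCESS_TOKEN
agent1.sources.twitter1.accessTokenSecret = YOUR_ACCESS_TOKEN_SECRET
agent1.sources.twitter1.channels = mem1

agent1.channels.mem1.type = memory

agent1.sinks.roll1.type = file_roll
agent1.sinks.roll1.sink.directory = /tmp/tweets
agent1.sinks.roll1.channel = mem1
```

The agent can then be started with `flume-ng agent -n agent1 -f twitter.conf -c /opt/mapr/flume/flume-1.5.0/conf`.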
// package drilljdbctest;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
public class DrillJDBCTest {
    // minimal sketch: connect to an embedded Drillbit (point zk= at your cluster's ZooKeeper instead)
    public static void main(String[] args) throws SQLException {
        Connection conn = DriverManager.getConnection("jdbc:drill:zk=local");
        Statement stmt = conn.createStatement();
        ResultSet rs = stmt.executeQuery("SELECT * FROM cp.`employee.json` LIMIT 5");
        while (rs.next()) System.out.println(rs.getString(1));
        conn.close();
    }
}
andypern / gist:3447376c27cd0cf98ef1
Created May 29, 2014 17:31
drill null pointer exception error
10:29:41.443 [2634939c-333e-4d3f-8caf-31389859a336:frag:0:0] DEBUG o.a.d.e.w.fragment.FragmentExecutor - Caught exception while running fragment
java.lang.NullPointerException: null
at org.apache.hadoop.hive.ql.metadata.Table.getFields(Table.java:403) ~[storage-hive-1.0.0-m2-incubating-SNAPSHOT.jar:1.0.0-m2-incubating-SNAPSHOT]
at org.apache.drill.exec.store.hive.schema.DrillHiveTable.getRowType(DrillHiveTable.java:52) ~[drill-java-exec-1.0.0-m2-incubating-SNAPSHOT-rebuffed.jar:1.0.0-m2-incubating-SNAPSHOT]
at org.apache.drill.exec.store.ischema.OptiqProvider$OptiqScanner.scanSchema(OptiqProvider.java:233) ~[drill-java-exec-1.0.0-m2-incubating-SNAPSHOT-rebuffed.jar:1.0.0-m2-incubating-SNAPSHOT]
at org.apache.drill.exec.store.ischema.OptiqProvider$OptiqScanner.scanSchema(OptiqProvider.java:218) ~[drill-java-exec-1.0.0-m2-incubating-SNAPSHOT-rebuffed.jar:1.0.0-m2-incubating-SNAPSHOT]
at org.apache.drill.exec.store.ischema.OptiqProvider$OptiqScanner.scanSchema(OptiqProvider.java:218) ~[drill-java-exec-1.0.0
##Running the demo
###Introduction
- Show a slide with a diagram of the components
- Show a slide detailing the clickstream use case
- Talk about how the data arrives in different formats, and how we want to query it as-is, without any up-front transformation.
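To make that last point concrete, Drill can query the raw files in place. A hedged sketch, with hypothetical file paths:

```sql
-- query a raw JSON file directly, with no schema definition or ETL step (hypothetical path)
SELECT * FROM dfs.`/demo/clickstream/clicks.json` LIMIT 10;

-- the same engine reads delimited text side by side; text files expose a `columns` array
SELECT columns[0] AS user_id FROM dfs.`/demo/clickstream/clicks.csv` LIMIT 10;
```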
###Dataset intro
val records = ssc.socketTextStream(host, port.toInt, StorageLevel.MEMORY_ONLY_SER)
// For each RDD in the DStream, perform a 'collect' on the RDD, which creates an array,
// and run a foreach on the elements within the array. Maybe there's a more 'sparky' way of doing this...
records.foreachRDD(rdd => {
  val rddarray = rdd.collect
  if (rddarray.length > 0) {
    var linecount = 0
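On the "more 'sparky' way" question above: the mutable `linecount` counter can be dropped, since Scala collections (and RDDs) support `zipWithIndex`. A sketch on a plain array standing in for the collected RDD (names here are illustrative, not from the original snippet):

```scala
object LineCountSketch {
  // stand-in for rdd.collect: one micro-batch of lines
  val batch: Array[String] = Array("alpha", "beta", "gamma")

  // zipWithIndex pairs each line with its position, replacing the manual counter
  def numbered(lines: Array[String]): Array[String] =
    lines.zipWithIndex.map { case (line, i) => s"$i: $line" }

  def main(args: Array[String]): Unit =
    numbered(batch).foreach(println)
}
```

On an actual RDD, `rdd.zipWithIndex` does the same pairing without collecting everything to the driver first.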
#!/usr/bin/bash
export SHARK_HOME=/opt/mapr/shark/shark-0.9.0
export SPARK_HOME=/opt/mapr/spark/spark-0.9.1
export SCALA_HOME=/usr/share/java
export CLASSPATH