matt winkler mwinkle

## foo.py
from pyspark.sql.functions import udf
import httplib, urllib, base64, json
def whichLanguage(text):
    headers = {
        # Request headers
        'Content-Type': 'application/json',
        'Ocp-Apim-Subscription-Key': '{your subscription key here}',
    }

    params = urllib.urlencode({

## gist:f11407dfebbba952adc3
# Prepare HDFS home directories and download a sample text file.
# Note: option flags must use a plain ASCII hyphen (-); a typographic dash
# is not recognised as an option by `hadoop fs`.

# Switch to the hdfs user for the file system commands below.
su hdfs
hadoop fs -mkdir /user/root

# Mode 777: read/write/execute for owner, group and others.
hadoop fs -chmod 777 /user/root
hadoop fs -chmod 777 /user/guest
# Leave the hdfs user's shell.
exit


# Download a plain-text book from Project Gutenberg into the current directory.
wget http://www.gutenberg.org/files/50831/50831-0.txt

## pyspark.py
# Word count over a text file in HDFS, written back to HDFS.
# NOTE(review): `sc` is not defined here; presumably the SparkContext that the
# pyspark shell provides -- confirm.
text_file = sc.textFile("hdfs://sandbox.hortonworks.com/user/guest/install.log")

# Split every line on single spaces to get one record per word.
words = text_file.flatMap(lambda line: line.split(" "))
# Pair each word with an initial count of 1.
word_ones = words.map(lambda word: (word, 1))
# Sum the counts of identical words.
counts = word_ones.reduceByKey(lambda a, b: a + b)

counts.saveAsTextFile("hdfs://sandbox.hortonworks.com/user/guest/output1.txt")

## vbProcessor.vb
' sample U-SQL UDO (Processor) written in VB.NET

Imports Microsoft.Analytics.Interfaces

Public Class vbProcessor
    Inherits IProcessor

    Private CountryTranslation As New Dictionary(Of String, String) From
      {{"Deutschland", "Germany"},
            {"Schwiiz", "Switzerland"},

## fSharpProcessor.fs
// sample U-SQL UDO (Processor) written in F#
// Note, currently (11/2015) requires deployment of F#.Core

namespace fSharpProcessor

open Microsoft.Analytics.Interfaces

type myProcessor() =
    inherit IProcessor()


## python_processing.sql
-- Assumes a table named my_json with one column (json_body) containing all of the JSON body.
-- (HiveQL comments start with "--"; a leading "#" is not comment syntax.)

-- Register the streaming script as a session resource so TRANSFORM can run it.
add file wasb:///example/apps/process_json.py;

-- Stream every json_body value through the Python script and name the
-- five columns it emits.
SELECT transform(json_body)
   USING 'd:\python27\python.exe process_json.py'
   AS id, lessonbranch, elapsedseconds, activity, the_date
FROM my_json;

## process_json.py
# this is a python streaming program designed to be called from a Hive query
# this will process a complex json document, and will return the right set of columns and rows
# a second GIST will contain the hive query that can be used to process this

import sys
import json


# this returns five columns
# id, lessonbranch, elapsedseconds, activity, datetime

## manytosinglepython.py
import sys


# Buffer every line from standard input before doing any processing.
lines = []
for line in sys.stdin:
   lines.append(line)

# Only build the joined result when at least one line was read.
if len(lines) > 0:
  # Strip leading/trailing whitespace (including the newline) from each line.
  cleaned_lines = [line.strip() for line in lines]
  # Collapse all input lines into a single space-separated line.
  single_line = ' '.join(cleaned_lines)

## Giraph on HDinsight on Linux
Special thanks to http://giraph.apache.org/quick_start.html, and thanks to http://stackoverflow.com/a/27003213/500945 for the last tip.

sudo apt-get install openjdk-7-jdk
sudo apt-get install git
sudo apt-get install maven

git clone https://github.com/apache/giraph.git
mvn -Phadoop_2 -fae -DskipTests -Dhadoop=non_secure clean package

# need to put the sample file in storage

## transact-hive.hql
-- Session settings used for the transactional table below:
-- concurrency support, non-strict dynamic partitioning, the DbTxnManager
-- transaction manager, and the compactor (initiator plus 2 worker threads).
set hive.support.concurrency=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.compactor.initiator.on=true;
set hive.compactor.worker.threads=2 ;

-- Transactional table: bucketed on num into 2 buckets and stored as ORC.
CREATE TABLE AcidTest (
    name STRING,
    num  INT
)
CLUSTERED BY (num) INTO 2 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional'='true');

-- Seed the table with four rows.
INSERT INTO TABLE AcidTest
VALUES
    ('one', 1),
    ('two', 2),
    ('three', 3),
    ('four', 4);
	from pyspark.sql.functions import udf
	import httplib, urllib, base64, json
	def whichLanguage(text):
	headers = {
	# Request headers
	'Content-Type': 'application/json',
	'Ocp-Apim-Subscription-Key': '{your subscription key here}',
	}

	params = urllib.urlencode({
	# Prepare HDFS home directories and download a sample text file.
	# Option flags use a plain ASCII hyphen (-); a typographic dash is not
	# recognised as an option by `hadoop fs`.
	su hdfs
	hadoop fs -mkdir /user/root

	# Mode 777: read/write/execute for owner, group and others.
	hadoop fs -chmod 777 /user/root
	hadoop fs -chmod 777 /user/guest
	exit


	# Download a plain-text book from Project Gutenberg into the current directory.
	wget http://www.gutenberg.org/files/50831/50831-0.txt
	text_file = sc.textFile("hdfs://sandbox.hortonworks.com/user/guest/install.log")
	counts = text_file.flatMap( lambda line: line.split(" ")) \
	.map(lambda word: (word, 1) ) \
	.reduceByKey(lambda a, b : a + b)
	counts.saveAsTextFile("hdfs://sandbox.hortonworks.com/user/guest/output1.txt")
	' sample U-SQL UDO (Processor) written in VB.NET

	Imports Microsoft.Analytics.Interfaces

	Public Class vbProcessor
	Inherits IProcessor

	Private CountryTranslation As New Dictionary(Of String, String) From
	{{"Deutschland", "Germany"},
	{"Schwiiz", "Switzerland"},
	// sample U-SQL UDO (Processor) written in F#
	// Note, currently (11/2015) requires deployment of F#.Core

	namespace fSharpProcessor

	open Microsoft.Analytics.Interfaces

	type myProcessor() =
	inherit IProcessor()
	-- Assumes a table named my_json with one column (json_body) containing all of the JSON body.
	-- (HiveQL comments start with "--"; a leading "#" is not comment syntax.)

	-- Register the streaming script as a session resource so TRANSFORM can run it.
	add file wasb:///example/apps/process_json.py;

	-- Stream every json_body value through the Python script and name the
	-- five columns it emits.
	SELECT transform(json_body)
	USING 'd:\python27\python.exe process_json.py'
	AS id, lessonbranch, elapsedseconds, activity, the_date
	FROM my_json;
	# this is a python streaming program designed to be called from a Hive query
	# this will process a complex json document, and will return the right set of columns and rows
	# a second GIST will contain the hive query that can be used to process this

	import sys
	import json


	# this returns five columns
	# id, lessonbranch, elapsedseconds, activity, datetime
	import sys


	lines = []
	for line in sys.stdin:
	lines.append(line)

	if len(lines) > 0:
	cleaned_lines = [line.strip() for line in lines]
	single_line = ' '.join(cleaned_lines)
	Special thanks to http://giraph.apache.org/quick_start.html, and thanks to http://stackoverflow.com/a/27003213/500945 for the last tip.

	sudo apt-get install openjdk-7-jdk
	sudo apt-get install git
	sudo apt-get install maven

	git clone https://github.com/apache/giraph.git
	mvn -Phadoop_2 -fae -DskipTests -Dhadoop=non_secure clean package

	# need to put the sample file in storage
	set hive.support.concurrency=true;
	set hive.exec.dynamic.partition.mode=nonstrict;
	set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
	set hive.compactor.initiator.on=true;
	set hive.compactor.worker.threads=2 ;

	CREATE TABLE AcidTest (name string, num int) clustered by (num) into 2 buckets STORED AS orc TBLPROPERTIES('transactional'='true');

	INSERT INTO TABLE AcidTest VALUES ('one',1), ('two',2),('three',3),('four',4);