Vedant Ari Jain (ari-vedant-jain)

ari-vedant-jain / s3-unzip.py
Created September 5, 2021 21:09
Uncompress Zip files in S3 using Python
import json
import zipfile
from datetime import datetime
from io import BytesIO

import boto3
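The gist body appears truncated to its imports. A minimal sketch of the flow they suggest: read the zip object into memory with BytesIO, then write each member back to S3. The bucket name, key, and output prefix are hypothetical placeholders.
s3 = boto3.resource('s3')
bucket = s3.Bucket('my-bucket')                   # hypothetical bucket
zip_obj = s3.Object('my-bucket', 'archive.zip')   # hypothetical key
buffer = BytesIO(zip_obj.get()['Body'].read())    # pull the whole zip into memory
with zipfile.ZipFile(buffer) as z:
    for name in z.namelist():
        # Re-upload each member under a hypothetical 'unzipped/' prefix.
        bucket.put_object(Key='unzipped/' + name, Body=z.read(name))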
import pip

# pip.get_installed_distributions() was removed in pip 10+; this works on older pip only.
installed_packages = pip.get_installed_distributions()
installed_packages_list = sorted(["%s==%s" % (i.key, i.version)
                                  for i in installed_packages])
print(installed_packages_list)
# Using Python on the Databricks driver to uncompress a local zip archive
import zipfile

# extractall() creates the directory tree and writes the files; looping over
# namelist() and calling os.makedirs() only recreates the empty folders.
with zipfile.ZipFile('/databricks/driver/D-Dfiles.zip') as z:
    z.extractall('/databricks/driver/')
# Reading zipped folder data in PySpark: once the archive is extracted on the
# driver (above), the files can be read with spark.read as usual.

# Converting MLlib vectors to ML vectors when migrating from Spark 1.x to 2.x
from pyspark.ml.linalg import VectorUDT
from pyspark.sql.functions import udf

# asML() converts a pyspark.mllib.linalg vector into its pyspark.ml equivalent.
as_ml = udf(lambda v: v.asML() if v is not None else None, VectorUDT())
result = labeledData.withColumn("features", as_ml("features"))
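labeledData is assumed above; a minimal sketch of what it could look like, given an existing SparkSession named spark and old-style mllib vectors in the "features" column:
from pyspark.mllib.linalg import Vectors  # old-style (Spark 1.x) vectors

# Hypothetical two-row DataFrame standing in for the real labeled data.
labeledData = spark.createDataFrame(
    [(1.0, Vectors.dense([0.5, 1.5])),
     (0.0, Vectors.dense([2.0, 0.1]))],
    ["label", "features"])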
from pymongo import MongoClient

connection = MongoClient('localhost', 27017)
db = connection.omh        # the 'omh' database
data = db.results          # the 'results' collection
resultList = data.find()   # cursor over every document in the collection

# Print the collections in the database
# (collection_names() is deprecated; newer PyMongo uses db.list_collection_names())
print(db.collection_names())
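find() returns a lazy cursor; iterating it is what actually fetches the documents:
for doc in resultList:
    print(doc)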
Building Python 2.7.1 from source
mkdir ~/src
mkdir ~/.localpython
cd ~/src
wget http://www.python.org/ftp/python/2.7.1/Python-2.7.1.tgz
tar -zxvf Python-2.7.1.tgz
cd Python-2.7.1
# The gist stops here; presumably the build continues with the usual prefix
# install into the ~/.localpython directory created above:
./configure --prefix=$HOME/.localpython
make
make install
Building Zeppelin with Phoenix/HBase support
1. Check out the source code from https://github.com/apache/incubator-zeppelin
2. Build it against Spark 1.3 and the matching Hadoop version:
mvn clean package -Pspark-1.3 -Dhadoop.version=2.6.0 -Phadoop-2.6 -DskipTests
3. Put the following jars on the Spark classpath by placing them in $ZEPPELIN_HOME/interpreter/spark:
a. hbase-client.jar
b. hbase-protocol.jar
c. hbase-common.jar
d. phoenix-4.4.x-client-without-hbase.jar
4. Start Zeppelin.
ari-vedant-jain / install-mvn.sh
Created September 20, 2015 04:15
Install Maven RHEL
#!/bin/bash
# Install Apache Maven 3.2.5 under /usr/share/maven and wire it into ~/.bashrc.
mkdir -p /usr/share/maven
cd /usr/share/maven
wget http://mirrors.koehn.com/apache/maven/maven-3/3.2.5/binaries/apache-maven-3.2.5-bin.tar.gz
tar xvzf apache-maven-3.2.5-bin.tar.gz
ln -s /usr/share/maven/apache-maven-3.2.5/ /usr/share/maven/latest
echo 'export M2_HOME=/usr/share/maven/latest' >> ~/.bashrc
echo 'export M2=$M2_HOME/bin' >> ~/.bashrc
echo 'export PATH=$PATH:$M2' >> ~/.bashrc
Building Spark for PySpark use on top of YARN
Build Spark on the local machine (only needed when using PySpark; otherwise building on a remote machine works): http://spark.apache.org/docs/latest/building-with-maven.html
export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m"
mvn -Pyarn -Phadoop-2.4 -Dhadoop.version=2.4.0 -DskipTests clean package
Copy the assembly/target/scala-2.10/...jar to the corresponding directory on the cluster node and also into a location in HDFS.
pyspark --master yarn-cluster --num-executors 3 --driver-memory 512m --executor-memory 512m --executor-cores 1
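Once the shell is up, a quick sanity check that work is really distributed across the YARN executors (a sketch; sc is predefined inside the pyspark shell):
# Spread a trivial computation over 3 partitions and confirm it completes.
rdd = sc.parallelize(range(1000), 3)
print(rdd.map(lambda x: x * x).sum())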
import jaydebeapi

# Connect to Phoenix over JDBC. This matches the pre-1.0 jaydebeapi signature
# connect(driver_class, [url, user, password], jars, libs); newer releases take
# the URL as the second argument instead.
conn = jaydebeapi.connect('org.apache.phoenix.jdbc.PhoenixDriver',
                          ['jdbc:phoenix:localhost:2181:/hbase-unsecure', '', ''],
                          '/usr/hdp/current/phoenix-client/phoenix-client.jar',
                          ['phoenix.functions.allowUserDefinedFunctions', 'true'])
curs = conn.cursor()

curs.execute('select * from "driver_dangerous_events"')
print(curs.fetchall())

curs.execute('select "driverId", "driverName", count("eventType") from "driver_dangerous_events" GROUP BY "driverId", "driverName"')
print(curs.fetchall())

curs.close()
conn.close()