Please try the following steps to test HWC (Hive Warehouse Connector) read and write from Oozie.
Step 1: Create the test database and tables in Hive
Log in to Hive as the hive user and run:
-----------------
create database db_hwc_test;
use db_hwc_test;
CREATE TABLE demo_input_table (
id int,
name varchar(10) )
PARTITIONED BY (
dept int)
STORED AS ORC TBLPROPERTIES ('orc.compress'='ZLIB','transactional'='true');
INSERT INTO demo_input_table PARTITION(dept) SELECT * FROM( SELECT 1 as id, 'bcd' as name, 1 as dept ) dual;
INSERT INTO demo_input_table PARTITION(dept) SELECT * FROM( SELECT 2 as id, 'qwe' as name, 2 as dept ) dual;
INSERT INTO demo_input_table PARTITION(dept) SELECT * FROM( SELECT 3 as id, 'asd' as name, 1 as dept ) dual;
INSERT INTO demo_input_table PARTITION(dept) SELECT * FROM( SELECT 4 as id, 'zxc' as name, 2 as dept ) dual;
select * from demo_input_table;
CREATE TABLE demo_output_table (
id int,
name varchar(10) )
PARTITIONED BY (
dept int)
STORED AS ORC TBLPROPERTIES ('orc.compress'='ZLIB','transactional'='true');
From Ranger: grant SELECT permission to hr-user1 on db_hwc_test.demo_input_table.
From Ranger: grant write (INSERT) permission to hr-user1 on db_hwc_test.demo_output_table.
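The two grants above are created through the Ranger web UI. If your Ranger Hive plugin is configured to accept GRANT/REVOKE statements through HiveServer2 (not every cluster allows this), a rough command-line equivalent is sketched below; <hs2-host> is a placeholder and the statements must be run by an admin user:
beeline -u "jdbc:hive2://<hs2-host>:10000/;principal=hive/_HOST@EXAMPLE.COM" \
  -e 'GRANT SELECT ON TABLE db_hwc_test.demo_input_table TO USER `hr-user1`; GRANT INSERT ON TABLE db_hwc_test.demo_output_table TO USER `hr-user1`;'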
Step 2: Test table read and write using spark-submit in cluster mode, to confirm that HWC read and write works before involving Oozie.
cat pyspark_HWCDemo.py
-----------------------
import sys
from pyspark.sql import SparkSession
from pyspark_llap.sql.session import HiveWarehouseSession


def main(argv):
    # Initiate the Spark session and build an HWC session on top of it
    spark = SparkSession.builder.appName("pyspark_HWCDemo.py").enableHiveSupport().getOrCreate()
    hive = HiveWarehouseSession.session(spark).build()

    inputTable = sys.argv[1]
    outputTable = sys.argv[2]

    # Read a sample of the input table through HWC
    selectQuery = "select * from {} limit 10".format(inputTable)
    df = hive.executeQuery(selectQuery)
    df.show()

    # Append the same rows to the output table through HWC
    print("Writing data from table {} to {}".format(inputTable, outputTable))
    df.write.format(HiveWarehouseSession.HIVE_WAREHOUSE_CONNECTOR).mode("append").option("table", outputTable).save()


if __name__ == '__main__':
    main(sys.argv[1:])
----------------------------------
spark-submit --master yarn --deploy-mode cluster \
  --conf spark.sql.hive.hiveserver2.jdbc.url.principal=hive/_HOST@EXAMPLE.COM \
  --conf spark.datasource.hive.warehouse.load.staging.dir=/tmp \
  --conf spark.datasource.hive.warehouse.metastoreUri=thrift://c220-node3.example.com:9083 \
  --conf spark.hadoop.hive.llap.daemon.service.hosts=@llap0 \
  --conf spark.jars=/usr/hdp/current/hive_warehouse_connector/hive-warehouse-connector-assembly-1.0.0.3.1.5.0-152_dev.jar \
  --conf spark.submit.pyFiles=/usr/hdp/current/hive_warehouse_connector/pyspark_hwc-1.0.0.3.1.5.0-152.zip \
  --conf spark.security.credentials.hiveserver2.enabled=true \
  --conf spark.sql.hive.hiveserver2.jdbc.url="jdbc:hive2://c220-node2.example.com:2181,c220-node3.example.com:2181,c220-node4.example.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2-interactive" \
  --conf spark.sql.hive.zookeeper.quorum="c220-node2.example.com:2181,c220-node3.example.com:2181,c220-node4.example.com:2181" \
  pyspark_HWCDemo.py db_hwc_test.demo_input_table db_hwc_test.demo_output_table
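(Optional) If the run fails, the driver and executor logs can be pulled from YARN once the application finishes; <application-id> below is a placeholder for the id printed by spark-submit:
yarn logs -applicationId <application-id> | less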
Step 3: Check the result table from Hive beeline
From Hive beeline, run:
select * from db_hwc_test.demo_output_table;
Step 4.1: Test table read and write using Oozie. Prepare the local workflow directory (the keytab in lib/ is shipped to the Spark containers by the <file> element in workflow.xml under the alias hr-user101.keytab, which --keytab in spark-opts refers to):
[hr-user1@c220-node3 ooziehwc_write_demo]$ ll -R
.:
total 8
-rw-r--r-- 1 hr-user1 hadoop 409 May 28 06:02 job.properties
drwxr-xr-x 2 hr-user1 hadoop 183 May 28 06:27 lib
-rw-r--r-- 1 hr-user1 hadoop 2317 May 28 07:20 workflow.xml
./lib:
total 55040
-rw-r--r-- 1 hr-user1 hadoop 56340621 May 28 06:17 hive-warehouse-connector-assembly-1.0.0.3.1.5.0-152_dev.jar
-rw------- 1 hr-user1 hadoop 158 May 28 06:27 hr-user1.keytab
-rw-r--r-- 1 hr-user1 hadoop 743 May 28 05:47 pyspark_HWCDemo.py
-rw-r--r-- 1 hr-user1 hadoop 6044 May 28 06:18 pyspark_hwc-1.0.0.3.1.5.0-152.zip
cat workflow.xml
-----------------------
<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4" name="HWC-Spark-Demo-Workflow">
    <credentials>
        <credential name="hs2-creds" type="hive2">
            <property>
                <name>hive2.server.principal</name>
                <value>hive/_HOST@EXAMPLE.COM</value>
            </property>
            <property>
                <name>hive2.jdbc.url</name>
                <value>jdbc:hive2://c220-node2.example.com:2181,c220-node3.example.com:2181,c220-node4.example.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2</value>
            </property>
        </credential>
    </credentials>
    <start to="spark-hwc" />
    <action name="spark-hwc" cred="hs2-creds">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <master>${master}</master>
            <name>${appname}</name>
            <jar>pyspark_HWCDemo.py</jar>
            <spark-opts>--conf spark.sql.hive.hiveserver2.jdbc.url.principal=hive/_HOST@EXAMPLE.COM --conf spark.submit.pyFiles=/usr/hdp/current/hive_warehouse_connector/pyspark_hwc-1.0.0.3.1.5.0-152.zip --keytab hr-user101.keytab --principal hr-user1@EXAMPLE.COM --conf spark.datasource.hive.warehouse.load.staging.dir=/tmp --conf spark.datasource.hive.warehouse.metastoreUri=thrift://c220-node3.example.com:9083 --conf spark.hadoop.hive.llap.daemon.service.hosts=@llap0 --conf spark.security.credentials.hiveserver2.enabled=true --conf spark.sql.hive.hiveserver2.jdbc.url="jdbc:hive2://c220-node2.example.com:2181,c220-node3.example.com:2181,c220-node4.example.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2-interactive" --conf spark.sql.hive.zookeeper.quorum="c220-node2.example.com:2181,c220-node3.example.com:2181,c220-node4.example.com:2181"</spark-opts>
            <arg>${arg1}</arg>
            <arg>${arg2}</arg>
            <file>${nameNode}${jarPath}/hr-user1.keytab#hr-user101.keytab</file>
        </spark>
        <ok to="end" />
        <error to="fail" />
    </action>
    <kill name="fail">
        <message>Workflow is Failed! message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <end name="end" />
</workflow-app>
----------------------
cat job.properties
-------------------------
nameNode=hdfs://c220-node2.example.com:8020
jobTracker=c220-node2.example.com:8050
queueName=default
appname=HWC-Spark-Write_Demo
oozie.use.system.libpath=true
oozie.wf.application.path=/tmp/sparkhwc_Write_demoOozieAction
jarPath=/tmp/sparkhwc_Write_demoOozieAction/lib
master=yarn-cluster
oozie.action.sharelib.for.spark=spark
arg1=db_hwc_test.demo_input_table
arg2=db_hwc_test.demo_output_table
Step 4.2: Upload the files to HDFS; the result should match the listing below (a sketch of the upload commands follows the listing).
hdfs dfs -ls -R /tmp/sparkhwc_Write_demoOozieAction
drwxr-xr-x - hr-user1 hdfs 0 2021-05-28 06:28 /tmp/sparkhwc_Write_demoOozieAction/lib
-rw-r--r-- 3 hr-user1 hdfs 56340621 2021-05-28 06:19 /tmp/sparkhwc_Write_demoOozieAction/lib/hive-warehouse-connector-assembly-1.0.0.3.1.5.0-152_dev.jar
-rw-r--r-- 3 hr-user1 hdfs 158 2021-05-28 06:28 /tmp/sparkhwc_Write_demoOozieAction/lib/hr-user1.keytab
-rw-r--r-- 3 hr-user1 hdfs 743 2021-05-28 06:19 /tmp/sparkhwc_Write_demoOozieAction/lib/pyspark_HWCDemo.py
-rw-r--r-- 3 hr-user1 hdfs 6044 2021-05-28 06:19 /tmp/sparkhwc_Write_demoOozieAction/lib/pyspark_hwc-1.0.0.3.1.5.0-152.zip
-rw-r--r-- 3 hr-user1 hdfs 2317 2021-05-28 07:20 /tmp/sparkhwc_Write_demoOozieAction/workflow.xml
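A minimal sketch of the upload commands that produce the layout above, assuming they are run from the local ooziehwc_write_demo directory of Step 4.1:
hdfs dfs -mkdir -p /tmp/sparkhwc_Write_demoOozieAction/lib
hdfs dfs -put -f workflow.xml /tmp/sparkhwc_Write_demoOozieAction/
hdfs dfs -put -f lib/* /tmp/sparkhwc_Write_demoOozieAction/lib/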
Step 4.3: Run the Oozie job
oozie job -oozie http://c220-node4.example.com:11000/oozie/ -config job.properties -run
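The run command prints a job id; its progress can be followed with the standard Oozie CLI (replace <job-id> with the printed id):
oozie job -oozie http://c220-node4.example.com:11000/oozie/ -info <job-id>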
Step 4.4: After the job completes, check the table result from Hive beeline:
select * from db_hwc_test.demo_output_table;