Please try the following steps to test HWC (Hive Warehouse Connector) read and write from Oozie.
Step 1: Create the test database and tables in Hive
Log in to Hive as the hive user and run:
-----------------
create database db_hwc_test;
use db_hwc_test;
CREATE TABLE demo_input_table (
id int,
name varchar(10) )
PARTITIONED BY (
dept int)
STORED AS ORC TBLPROPERTIES ('orc.compress'='ZLIB','transactional'='true');
INSERT INTO demo_input_table PARTITION(dept) SELECT * FROM( SELECT 1 as id, 'bcd' as name, 1 as dept ) dual;
INSERT INTO demo_input_table PARTITION(dept) SELECT * FROM( SELECT 2 as id, 'qwe' as name, 2 as dept ) dual;
INSERT INTO demo_input_table PARTITION(dept) SELECT * FROM( SELECT 3 as id, 'asd' as name, 1 as dept ) dual;
INSERT INTO demo_input_table PARTITION(dept) SELECT * FROM( SELECT 4 as id, 'zxc' as name, 2 as dept ) dual;
select * from demo_input_table;
CREATE TABLE demo_output_table (
id int,
name varchar(10) )
PARTITIONED BY (
dept int)
STORED AS ORC TBLPROPERTIES ('orc.compress'='ZLIB','transactional'='true');
From Ranger: grant SELECT permission to hr-user1 on db_hwc_test.demo_input_table.
From Ranger: grant write (INSERT) permission to hr-user1 on db_hwc_test.demo_output_table.
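The two grants above are created through the Ranger web UI. If your Ranger Hive plugin is configured to accept GRANT/REVOKE statements through HiveServer2 (not every cluster allows this), a rough command-line equivalent is sketched below; <hs2-host> is a placeholder and the statements must be run by an admin user:
beeline -u "jdbc:hive2://<hs2-host>:10000/;principal=hive/_HOST@EXAMPLE.COM" \
  -e 'GRANT SELECT ON TABLE db_hwc_test.demo_input_table TO USER `hr-user1`; GRANT INSERT ON TABLE db_hwc_test.demo_output_table TO USER `hr-user1`;'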
Step 2: Test table read and write using spark-submit in cluster mode, to confirm that HWC read and write works before involving Oozie.
cat pyspark_HWCDemo.py
-----------------------
import sys
from pyspark.sql import SparkSession
from pyspark_llap.sql.session import HiveWarehouseSession


def main(argv):
    # Initiate the Spark session and build an HWC session on top of it
    spark = SparkSession.builder.appName("pyspark_HWCDemo.py").enableHiveSupport().getOrCreate()
    hive = HiveWarehouseSession.session(spark).build()

    inputTable = sys.argv[1]
    outputTable = sys.argv[2]

    # Read a sample of the input table through HWC
    selectQuery = "select * from {} limit 10".format(inputTable)
    df = hive.executeQuery(selectQuery)
    df.show()

    # Append the same rows to the output table through HWC
    print("Writing data from table {} to {}".format(inputTable, outputTable))
    df.write.format(HiveWarehouseSession.HIVE_WAREHOUSE_CONNECTOR).mode("append").option("table", outputTable).save()


if __name__ == '__main__':
    main(sys.argv[1:])
----------------------------------
spark-submit --master yarn --deploy-mode cluster \
  --conf spark.sql.hive.hiveserver2.jdbc.url.principal=hive/_HOST@EXAMPLE.COM \
  --conf spark.datasource.hive.warehouse.load.staging.dir=/tmp \
  --conf spark.datasource.hive.warehouse.metastoreUri=thrift://c220-node3.example.com:9083 \
  --conf spark.hadoop.hive.llap.daemon.service.hosts=@llap0 \
  --conf spark.jars=/usr/hdp/current/hive_warehouse_connector/hive-warehouse-connector-assembly-1.0.0.3.1.5.0-152_dev.jar \
  --conf spark.submit.pyFiles=/usr/hdp/current/hive_warehouse_connector/pyspark_hwc-1.0.0.3.1.5.0-152.zip \
  --conf spark.security.credentials.hiveserver2.enabled=true \
  --conf spark.sql.hive.hiveserver2.jdbc.url="jdbc:hive2://c220-node2.example.com:2181,c220-node3.example.com:2181,c220-node4.example.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2-interactive" \
  --conf spark.sql.hive.zookeeper.quorum="c220-node2.example.com:2181,c220-node3.example.com:2181,c220-node4.example.com:2181" \
  pyspark_HWCDemo.py db_hwc_test.demo_input_table db_hwc_test.demo_output_table
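(Optional) If the run fails, the driver and executor logs can be pulled from YARN once the application finishes; <application-id> below is a placeholder for the id printed by spark-submit:
yarn logs -applicationId <application-id> | less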
Step 3: Check the result table from Hive beeline
From Hive beeline, run:
select * from db_hwc_test.demo_output_table;
Step 4.1: Test table read and write using Oozie. Prepare the local workflow directory (the keytab in lib/ is shipped to the Spark containers by the <file> element in workflow.xml under the alias hr-user101.keytab, which --keytab in spark-opts refers to):
[hr-user1@c220-node3 ooziehwc_write_demo]$ ll -R
.:
total 8
-rw-r--r-- 1 hr-user1 hadoop 409 May 28 06:02 job.properties
drwxr-xr-x 2 hr-user1 hadoop 183 May 28 06:27 lib
-rw-r--r-- 1 hr-user1 hadoop 2317 May 28 07:20 workflow.xml
./lib:
total 55040
-rw-r--r-- 1 hr-user1 hadoop 56340621 May 28 06:17 hive-warehouse-connector-assembly-1.0.0.3.1.5.0-152_dev.jar
-rw------- 1 hr-user1 hadoop 158 May 28 06:27 hr-user1.keytab
-rw-r--r-- 1 hr-user1 hadoop 743 May 28 05:47 pyspark_HWCDemo.py
-rw-r--r-- 1 hr-user1 hadoop 6044 May 28 06:18 pyspark_hwc-1.0.0.3.1.5.0-152.zip
cat workflow.xml
-----------------------
<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4" name="HWC-Spark-Demo-Workflow">
    <credentials>
        <credential name="hs2-creds" type="hive2">
            <property>
                <name>hive2.server.principal</name>
                <value>hive/_HOST@EXAMPLE.COM</value>
            </property>
            <property>
                <name>hive2.jdbc.url</name>
                <value>jdbc:hive2://c220-node2.example.com:2181,c220-node3.example.com:2181,c220-node4.example.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2</value>
            </property>
        </credential>
    </credentials>
    <start to="spark-hwc" />
    <action name="spark-hwc" cred="hs2-creds">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <master>${master}</master>
            <name>${appname}</name>
            <jar>pyspark_HWCDemo.py</jar>
            <spark-opts>--conf spark.sql.hive.hiveserver2.jdbc.url.principal=hive/_HOST@EXAMPLE.COM --conf spark.submit.pyFiles=/usr/hdp/current/hive_warehouse_connector/pyspark_hwc-1.0.0.3.1.5.0-152.zip --keytab hr-user101.keytab --principal hr-user1@EXAMPLE.COM --conf spark.datasource.hive.warehouse.load.staging.dir=/tmp --conf spark.datasource.hive.warehouse.metastoreUri=thrift://c220-node3.example.com:9083 --conf spark.hadoop.hive.llap.daemon.service.hosts=@llap0 --conf spark.security.credentials.hiveserver2.enabled=true --conf spark.sql.hive.hiveserver2.jdbc.url="jdbc:hive2://c220-node2.example.com:2181,c220-node3.example.com:2181,c220-node4.example.com:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2-interactive" --conf spark.sql.hive.zookeeper.quorum="c220-node2.example.com:2181,c220-node3.example.com:2181,c220-node4.example.com:2181"</spark-opts>
            <arg>${arg1}</arg>
            <arg>${arg2}</arg>
            <file>${nameNode}${jarPath}/hr-user1.keytab#hr-user101.keytab</file>
        </spark>
        <ok to="end" />
        <error to="fail" />
    </action>
    <kill name="fail">
        <message>Workflow is Failed! message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
    <end name="end" />
</workflow-app>
----------------------
cat job.properties
-------------------------
nameNode=hdfs://c220-node2.example.com:8020
jobTracker=c220-node2.example.com:8050
queueName=default
appname=HWC-Spark-Write_Demo
oozie.use.system.libpath=true
oozie.wf.application.path=/tmp/sparkhwc_Write_demoOozieAction
jarPath=/tmp/sparkhwc_Write_demoOozieAction/lib
master=yarn-cluster
oozie.action.sharelib.for.spark=spark
arg1=db_hwc_test.demo_input_table
arg2=db_hwc_test.demo_output_table
Step 4.2: Upload the files to HDFS; the result should match the listing below (a sketch of the upload commands follows the listing).
hdfs dfs -ls -R /tmp/sparkhwc_Write_demoOozieAction
drwxr-xr-x - hr-user1 hdfs 0 2021-05-28 06:28 /tmp/sparkhwc_Write_demoOozieAction/lib
-rw-r--r-- 3 hr-user1 hdfs 56340621 2021-05-28 06:19 /tmp/sparkhwc_Write_demoOozieAction/lib/hive-warehouse-connector-assembly-1.0.0.3.1.5.0-152_dev.jar
-rw-r--r-- 3 hr-user1 hdfs 158 2021-05-28 06:28 /tmp/sparkhwc_Write_demoOozieAction/lib/hr-user1.keytab
-rw-r--r-- 3 hr-user1 hdfs 743 2021-05-28 06:19 /tmp/sparkhwc_Write_demoOozieAction/lib/pyspark_HWCDemo.py
-rw-r--r-- 3 hr-user1 hdfs 6044 2021-05-28 06:19 /tmp/sparkhwc_Write_demoOozieAction/lib/pyspark_hwc-1.0.0.3.1.5.0-152.zip
-rw-r--r-- 3 hr-user1 hdfs 2317 2021-05-28 07:20 /tmp/sparkhwc_Write_demoOozieAction/workflow.xml
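A minimal sketch of the upload commands that produce the layout above, assuming they are run from the local ooziehwc_write_demo directory of Step 4.1:
hdfs dfs -mkdir -p /tmp/sparkhwc_Write_demoOozieAction/lib
hdfs dfs -put -f workflow.xml /tmp/sparkhwc_Write_demoOozieAction/
hdfs dfs -put -f lib/* /tmp/sparkhwc_Write_demoOozieAction/lib/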
Step 4.3: Run the Oozie job
oozie job -oozie http://c220-node4.example.com:11000/oozie/ -config job.properties -run
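The run command prints a job id; its progress can be followed with the standard Oozie CLI (replace <job-id> with the printed id):
oozie job -oozie http://c220-node4.example.com:11000/oozie/ -info <job-id>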
Step 4.4: After the job completes, check the table result from Hive beeline:
select * from db_hwc_test.demo_output_table;