Skip to content

Instantly share code, notes, and snippets.

@NitinKumar94
Last active August 22, 2016 10:34
Show Gist options
  • Save NitinKumar94/df1dd0e2058816b7ed2872ac7564af11 to your computer and use it in GitHub Desktop.
-- Verification script: run AFTER the Spark job has inserted its records.
USE test_inserts;

-- NOTE(review): Hive may answer COUNT(*) from table statistics without
-- spawning a map-reduce job, which can return a stale (wrong) count for
-- rows written outside Hive (e.g. by Spark). Disable stats-based answers
-- so the count reflects the actual data files.
SET hive.compute.query.using.stats=false;
SELECT COUNT(*) FROM test_hive_inserts;

-- Should display all inserted records, from both Hive and Spark.
-- ORDER BY makes the verification output deterministic.
SELECT id, name, dept
FROM test_hive_inserts
ORDER BY id;
## Run: spark-submit insert_records.py
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def insert_records():
    """Insert three sample rows into test_inserts.test_hive_inserts via Spark.

    Builds a local SparkContext, creates a 3-row DataFrame, registers it as a
    temp table, and appends it to the Hive table with INSERT ... SELECT.
    """
    conf = SparkConf().setAppName("Testing Inserts").setMaster("local[*]")
    sc = SparkContext(conf=conf)
    try:
        sqlContext = HiveContext(sc)
        data_points = [(3, "ghi", "xyz"), (4, "jkl", "xyz"), (5, "mno", "xyz")]
        # DataFrame column names need not match the Hive table's columns;
        # the INSERT ... SELECT below maps columns positionally.
        column_names = ["identity_id", "emp_name", "department_name"]
        df = sqlContext.createDataFrame(data_points, column_names)
        df.show()
        df.registerTempTable("temp_table")
        sqlContext.sql("insert into table test_inserts.test_hive_inserts select * from temp_table")
    finally:
        # Always release the SparkContext, even if the insert fails.
        sc.stop()


if __name__ == "__main__":
    insert_records()
-- Setup script: create the target schema and seed two rows from Hive itself.
CREATE DATABASE IF NOT EXISTS test_inserts;
USE test_inserts;

-- ORC-backed table that both Hive and the Spark job will insert into.
CREATE TABLE IF NOT EXISTS test_hive_inserts (id INT, name STRING, dept STRING) STORED AS ORC;

-- Seed rows inserted directly through Hive.
-- Hive's INSERT ... VALUES takes no column list; values map positionally
-- to (id, name, dept).
INSERT INTO TABLE test_hive_inserts VALUES (1, 'abc', 'xyz');
INSERT INTO TABLE test_hive_inserts VALUES (2, 'def', 'xyz');

-- Sanity check: expect 2 rows before the Spark job runs.
SELECT COUNT(*) FROM test_hive_inserts;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment