Skip to content

Instantly share code, notes, and snippets.

@gwenshap
Created March 22, 2014 00:03
Show Gist options
  • Save gwenshap/9699072 to your computer and use it in GitHub Desktop.
Save gwenshap/9699072 to your computer and use it in GitHub Desktop.
Using Hive to Efficiently Merge Data
-- create temporary table for merging
CREATE external TABLE staging.ItemFulfillment_merged
(ItemFulfillment_b13aFilingOptionFedEx STRING,
ItemFulfillment_b13aStatementDataFedEx STRING,
ItemFulfillment_backupEmailAddressFedEx STRING,
ItemFulfillment_backupEmailAddressUps STRING,
ItemFulfillment_internalId STRING,
ItemFulfillment_itemList_item_item_internalId STRING,
ItemFulfillment_itemList_item_item_name STRING,
ItemFulfillment_itemList_item_item_type STRING,
ItemFulfillment_itemList_item_orderLine STRING)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION 'hdfs://nameservice1/data/netsuite/ItemFulfillment/2014-03-20'
TBLPROPERTIES (
'avro.schema.url'='hdfs://nameservice1/etl/NetSuite/metadata/ItemFulfillment.avsc');
-- populate it with content from existing and new data
-- we take all the new rows, and only the existing rows that don't exist in the new data set.
insert overwrite table staging.ItemFulfillment_merged
select * from (
select /*+ MAPJOIN(st) */ gt.ItemFulfillment_b13aFilingOptionFedEx,
gt.ItemFulfillment_b13aStatementDataFedEx,
gt.ItemFulfillment_backupEmailAddressFedEx,
gt.ItemFulfillment_backupEmailAddressUps,
gt.ItemFulfillment_internalId,
gt.ItemFulfillment_itemList_item_item_internalId,
gt.ItemFulfillment_itemList_item_item_name,
gt.ItemFulfillment_itemList_item_item_type,
gt.ItemFulfillment_itemList_item_orderLine
from netsuite.ItemFulfillment gt
left outer join staging.ItemFulfillment_csv st
on st.ItemFulfillment_itemList_item_orderLine = gt.ItemFulfillment_itemList_item_orderLine and st.itemfulfillment_internalid = gt.itemfulfillment_internalid
where st.ItemFulfillment_itemList_item_orderLine is null and st.itemfulfillment_internalid is null
union all
select ItemFulfillment_b13aFilingOptionFedEx,
ItemFulfillment_b13aStatementDataFedEx,
ItemFulfillment_backupEmailAddressFedEx,
ItemFulfillment_backupEmailAddressUps,
ItemFulfillment_internalId,
ItemFulfillment_itemList_item_item_internalId,
ItemFulfillment_itemList_item_item_name,
ItemFulfillment_itemList_item_item_type,
ItemFulfillment_itemList_item_orderLine
from staging.ItemFulfillment_csv) T;
-- then we point the existing "master" table and point it to the data in the new, merged table.
-- after this quick, atomic operation that changes only metadata - users will now see updated data
-- as a last step, we drop the definition of the temporary, merged table
-- the data stays because its an external table
use netsuite;
alter table ItemFulfillment set location 'hdfs://nameservice1/data/netsuite/ItemFulfillment/2014-03-20';
drop table staging.ItemFulfillment_merged;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment