--Template for different Hive SQL statements
--Hive queries template:
--1) DDL
--CSV
CREATE TABLE IF NOT EXISTS <schemaname>.CSV_TABLE
(
uuid VARCHAR(50),
flag CHAR(1),
bigid BIGINT,
record_dt DATE,
amount DECIMAL(20,2),
record_tmstp TIMESTAMP
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '~'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION 's3://<bucket>/<prefix>/';
--ORC
CREATE EXTERNAL TABLE `<schemaname>.ORC_TABLE`(
`uuid` VARCHAR(50),
`flag` CHAR(1),
`bigid` BIGINT,
`record_dt` DATE,
`amount` DECIMAL(20,2),
`record_tmstp` TIMESTAMP)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION
's3://bucket/prefix'
TBLPROPERTIES (
'orc.compress'='SNAPPY');
--AVRO
--The schema is read from the Avro schema file referenced by 'avro.schema.url';
--it takes precedence over the inline column list below.
CREATE EXTERNAL TABLE `<schemaname>.AVRO_TABLE`(
`uuid` VARCHAR(50),
`flag` CHAR(1),
`bigid` BIGINT,
`record_dt` DATE,
`amount` DECIMAL(20,2),
`record_tmstp` TIMESTAMP)
PARTITIONED BY (
`partition` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION
's3://bucket/prefix'
TBLPROPERTIES (
'avro.schema.url'='s3://bucket/prefix2/file.avsc');
--PARQUET
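--A minimal Parquet sketch in the same style as the ORC template above
--(table name and location are placeholders):
CREATE EXTERNAL TABLE `<schemaname>.PARQUET_TABLE`(
`uuid` VARCHAR(50),
`flag` CHAR(1),
`bigid` BIGINT,
`record_dt` DATE,
`amount` DECIMAL(20,2),
`record_tmstp` TIMESTAMP)
STORED AS PARQUET
LOCATION
's3://bucket/prefix'
TBLPROPERTIES (
'parquet.compression'='SNAPPY');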
--JSON
CREATE EXTERNAL TABLE `<schemaname>.JSON_TABLE`(
`uuid` string COMMENT 'from deserializer',
`firstname` string COMMENT 'from deserializer',
`lastname` string COMMENT 'from deserializer')
ROW FORMAT SERDE
'org.apache.hive.hcatalog.data.JsonSerDe'
WITH SERDEPROPERTIES (
'paths'='uuid, firstname, lastname')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://bucket/prefix';
--CSV TABLE WITH BUCKETS
CREATE TABLE `<schemaname>.CSV_TABLE_WITH_BUCKETS`(
`id` string,
`id2` string,
`primarykey` string,
`noid` string,
`type` string,
`source` string)
COMMENT 'Data from source'
--The bucketing column must be one of the table's own columns
CLUSTERED BY (
primarykey)
INTO 100 BUCKETS
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'=',',
'serialization.format'=',')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://bucket/prefix';
--WITH PARTITION
--Note: PARTITIONED BY must come before CLUSTERED BY in Hive DDL.
CREATE TABLE `<schemaname>.CSV_TABLE_WITH_PARTITION`(
`id` string,
`id2` string,
`primarykey` string,
`noid` string,
`type` string,
`source` string)
COMMENT 'Data from source'
PARTITIONED BY (
`partition` int)
CLUSTERED BY (
primarykey)
INTO 100 BUCKETS
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'=',',
'serialization.format'=',')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
's3://bucket/prefix';
--COMPRESSION TYPES
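--Commonly used codecs: GZIP, BZIP2, SNAPPY, LZO; ORC defaults to ZLIB via 'orc.compress'.
--Sketch of the standard session flags for compressing query output:
SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress=true;
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;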
--CTAS
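--CTAS sketch: derives the schema from the query and populates the table in one step
--(table name is a placeholder; stock Hive restricts CTAS targets, e.g. they cannot be external):
CREATE TABLE <schemaname>.ORC_TABLE_CTAS
STORED AS ORC
AS
SELECT uuid, flag, bigid, record_dt, amount, record_tmstp
FROM <schemaname>.CSV_TABLE;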
--
--2) DML
--INSERT TO BUCKET TABLE
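--Sketch; SOURCE_TABLE is a placeholder. On Hive < 2.0 bucketing must be enforced:
SET hive.enforce.bucketing=true;
INSERT INTO TABLE <schemaname>.CSV_TABLE_WITH_BUCKETS
SELECT id, id2, primarykey, noid, type, source
FROM <schemaname>.SOURCE_TABLE;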
--INSERT TO PARTITION TABLE - DYNAMIC PARTITION FLAGS
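--Sketch with the standard dynamic-partition flags; the partition column
--must come last in the SELECT list (SOURCE_TABLE is a placeholder):
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO TABLE <schemaname>.CSV_TABLE_WITH_PARTITION PARTITION (`partition`)
SELECT id, id2, primarykey, noid, type, source, `partition`
FROM <schemaname>.SOURCE_TABLE;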
--CSV GZIP - MENTION COMPRESSION FLAGS
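--Sketch: write GZIP-compressed files into a text/CSV table (SOURCE_TABLE is a placeholder):
SET hive.exec.compress.output=true;
SET mapreduce.output.fileoutputformat.compress=true;
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
INSERT OVERWRITE TABLE <schemaname>.CSV_TABLE
SELECT uuid, flag, bigid, record_dt, amount, record_tmstp
FROM <schemaname>.SOURCE_TABLE;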
--INSERT OVERWRITE TO TABLE
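--INSERT OVERWRITE replaces the existing data in the table (or partition):
INSERT OVERWRITE TABLE <schemaname>.ORC_TABLE
SELECT uuid, flag, bigid, record_dt, amount, record_tmstp
FROM <schemaname>.CSV_TABLE;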
--
--3) EXTERNAL TABLE
--MSCK REPAIR TABLE
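--Registers partitions present on S3/HDFS but missing from the metastore:
MSCK REPAIR TABLE <schemaname>.CSV_TABLE_WITH_PARTITION;
--Or add a single partition explicitly (partition value and path are placeholders):
ALTER TABLE <schemaname>.CSV_TABLE_WITH_PARTITION
ADD IF NOT EXISTS PARTITION (`partition`=1)
LOCATION 's3://bucket/prefix/partition=1';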
--
--4) STATS collection for partitioned and non-partitioned columns.
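--Sketches for table- and column-level statistics:
ANALYZE TABLE <schemaname>.CSV_TABLE COMPUTE STATISTICS;
ANALYZE TABLE <schemaname>.CSV_TABLE COMPUTE STATISTICS FOR COLUMNS;
--For a partitioned table, stats are gathered per partition:
ANALYZE TABLE <schemaname>.CSV_TABLE_WITH_PARTITION PARTITION (`partition`)
COMPUTE STATISTICS FOR COLUMNS;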