Skip to content

Instantly share code, notes, and snippets.

@irajhedayati
Last active March 10, 2021 04:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save irajhedayati/d82772d59e48bb783832329465afdb97 to your computer and use it in GitHub Desktop.
Save irajhedayati/d82772d59e48bb783832329465afdb97 to your computer and use it in GitHub Desktop.
A set of tutorials for Hive encodings
CREATE DATABASE fall2019_iraj;
-- Encoding; CSV table
CREATE TABLE test (
name STRING,
age INT
);
/* Check the structure */
-- Note the location
DESCRIBE test;
DESCRIBE EXTENDED test;
DESCRIBE FORMATTED test;
-- Insert one record
INSERT into test values ('John', 30);
-- Check the file on HDFS
CREATE TABLE test_csv (
name STRING,
age INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
INSERT into test_csv values ('John', 30);
-- Check the file on HDFS
-- Most of the time, we work with already stored data on HDFS
-- and we'd like to run analysis and query data using Hive
/* Create an extrernal table */
CREATE EXTERNAL TABLE ext_movie (
id INT,
title STRING,
year INT,
director STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/user/bdss2019/iraj/movie';
SELECT * FROM ext_movie;
-- What is the problem? header is the first row
CREATE EXTERNAL TABLE ext_movie_wo_header (
id int,
title string,
year int,
director string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/user/bdsf2001/iraj/movie'
TBLPROPERTIES ('skip.header.line.count' = '1');
SELECT * FROM ext_movie;
-- What is the problem? a null value is represented with empty string
CREATE EXTERNAL TABLE ext_movie_wo_header_w_null (
id int,
title string,
year int,
director string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/user/bdsf2001/iraj/movie'
TBLPROPERTIES (
'skip.header.line.count' = '1',
'serialization.null.format' = ''
);
/* We could have less columns compared with data */
CREATE EXTERNAL TABLE ext_movie_wo_header_w_null_wo_director (
id int,
title string,
year int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/user/bdsf2001/iraj/movie'
TBLPROPERTIES (
'skip.header.line.count' = '1',
'serialization.null.format' = ''
);
SELECT * FROM ext_movie;
/* If schema doesn't match, returns null */
DROP TABLE ext_movie;
CREATE EXTERNAL TABLE ext_movie (
mID INT,
title STRING,
director STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/user/bdss2019/iraj/movie'
TBLPROPERTIES (
"skip.header.line.count" = "1",
"serialization.null.format" =""
);
/* If schema doesn't match, returns null */
CREATE EXTERNAL TABLE ext_movie_wo_header_w_null_wo_director_type_mismatch (
id int,
title string,
director string,
year int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/user/bdsf2001/iraj/movie'
TBLPROPERTIES (
'skip.header.line.count' = '1',
'serialization.null.format' = ''
);
CREATE TABLE movie_sequence
STORED AS SEQUENCEFILE
AS
SELECT * FROM ext_movie_wo_header_w_null;
CREATE TABLE movie_parquet
STORED AS parquet
AS
SELECT * FROM ext_movie_wo_header_w_null;
-- parquet-tools head hdfs://quickstart.cloudera:8020/user/hive/warehouse/bdss2001_iraj.db/movie_parquet/000000_0
-- parquet-tools schema hdfs://quickstart.cloudera:8020/user/hive/warehouse/bdss2001_iraj.db/movie_parquet/000000_0
create external table ext_movie_parquet (
mid int,
title string,
year int,
director string
)
stored as parquet
location '/user/hive/warehouse/bdss2001_iraj.db/movie_parquet/';
select * from ext_movie_parquet;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment