jamesrajendran/Hive Commands

## Hive Commands
hive> set mapreduce.framework.name=local
display hive database name: set hive.cli.print.current.db=true;

DESCRIBE EXTENDED husn_small; --to get statistics
Analyze table husn_small compute statistics;

create table snpn(sn String, pn String)
LOAD DATA  INPATH 'hdfs://127200813master.eap.g4ihos.itcs.hpecorp.net:8020/user/centos7/test_data/snpn' append INTO TABLE snpn


----------------------ORC in hive performance tuning--------------------------------------
TEZ

CREATE TABLE A_ORC (customerID int, name string, age int, address string ) STORED AS ORC tblproperties (“orc.compress" = “SNAPPY”);
INSERT INTO TABLE A_ORC SELECT * FROM A;

set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;
data should be stored in ORC format

--enbale CBO---------------
set hive.cbo.enbale=true;
set hive.compute.query.using.stats=true;
set hive.stats.fetch.column.stats=true;
set hive.stats.fetch.partition.stats=true;

analyze table tweets compute statistics for columns;

----window function------------
SELECT * FROM
(SELECT *, RANK() over (partition by sessionID,order by timestamp desc) as rank FROM clicks) ranked_clicks
WHERE ranked_clicks.rank=1;
-----avoid global sort------------------------
SELECT id, name, salary, dept FROM employee
DISTRIBUTE BY dept --custom partition like
SORT BY id ASC, name DESC;
--------------------parallel -------------
   <name>hive.exec.parallel</name>

   <name>hive.exec.parallel.thread.number</name>
   <value>8</value>
---------------- multi group by single reducer------------------

    <name>hive.multigroupby.singlereducer</name>
    <value>true</value>
    <description>
      Whether to optimize multi group by query to generate single M/R  job plan. If the multi group by query has
      common group by keys, it will be optimized to generate single M/R job.

--------------------bucketing----------------------------
uses hash partitioning
useful when cardinality of data is high
result in fixed number of files as opposed to Partitioning
it's similar to clustering.
recommended not to use

-----------------functions-----------------
show functions
show functions  ".*date.*"
describe function <function name>
describe function extended <function name>
	hive> set mapreduce.framework.name=local
	display hive database name: set hive.cli.print.current.db=true;

	DESCRIBE EXTENDED husn_small; --to get statistics
	Analyze table husn_small compute statistics;

	create table snpn(sn String, pn String)
	LOAD DATA INPATH 'hdfs://127200813master.eap.g4ihos.itcs.hpecorp.net:8020/user/centos7/test_data/snpn' append INTO TABLE snpn


	----------------------ORC in hive performance tuning--------------------------------------
	TEZ

	CREATE TABLE A_ORC (customerID int, name string, age int, address string ) STORED AS ORC tblproperties (“orc.compress" = “SNAPPY”);
	INSERT INTO TABLE A_ORC SELECT * FROM A;

	set hive.vectorized.execution.enabled = true;
	set hive.vectorized.execution.reduce.enabled = true;
	data should be stored in ORC format

	--enbale CBO---------------
	set hive.cbo.enbale=true;
	set hive.compute.query.using.stats=true;
	set hive.stats.fetch.column.stats=true;
	set hive.stats.fetch.partition.stats=true;

	analyze table tweets compute statistics for columns;

	----window function------------
	SELECT * FROM
	(SELECT *, RANK() over (partition by sessionID,order by timestamp desc) as rank FROM clicks) ranked_clicks
	WHERE ranked_clicks.rank=1;
	-----avoid global sort------------------------
	SELECT id, name, salary, dept FROM employee
	DISTRIBUTE BY dept --custom partition like
	SORT BY id ASC, name DESC;
	--------------------parallel -------------
	<name>hive.exec.parallel</name>

	<name>hive.exec.parallel.thread.number</name>
	<value>8</value>
	---------------- multi group by single reducer------------------

	<name>hive.multigroupby.singlereducer</name>
	<value>true</value>
	<description>
	Whether to optimize multi group by query to generate single M/R job plan. If the multi group by query has
	common group by keys, it will be optimized to generate single M/R job.

	--------------------bucketing----------------------------
	uses hash partitioning
	useful when cardinality of data is high
	result in fixed number of files as opposed to Partitioning
	it's similar to clustering.
	recommended not to use

	-----------------functions-----------------
	show functions
	show functions ".date."
	describe function <function name>
	describe function extended <function name>