Skip to content

Instantly share code, notes, and snippets.

@jamesrajendran
Created May 7, 2017 04:44
Show Gist options
  • Save jamesrajendran/ee21a7329450cfed0a1a78e961216fcc to your computer and use it in GitHub Desktop.
Save jamesrajendran/ee21a7329450cfed0a1a78e961216fcc to your computer and use it in GitHub Desktop.
hive> set mapreduce.framework.name=local
display hive database name: set hive.cli.print.current.db=true;
DESCRIBE EXTENDED husn_small; --to get statistics
Analyze table husn_small compute statistics;
create table snpn(sn String, pn String)
LOAD DATA INPATH 'hdfs://127200813master.eap.g4ihos.itcs.hpecorp.net:8020/user/centos7/test_data/snpn' append INTO TABLE snpn
----------------------ORC in hive performance tuning--------------------------------------
TEZ
CREATE TABLE A_ORC (customerID int, name string, age int, address string ) STORED AS ORC tblproperties (“orc.compress" = “SNAPPY”);
INSERT INTO TABLE A_ORC SELECT * FROM A;
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;
data should be stored in ORC format
--enbale CBO---------------
set hive.cbo.enbale=true;
set hive.compute.query.using.stats=true;
set hive.stats.fetch.column.stats=true;
set hive.stats.fetch.partition.stats=true;
analyze table tweets compute statistics for columns;
----window function------------
SELECT * FROM
(SELECT *, RANK() over (partition by sessionID,order by timestamp desc) as rank FROM clicks) ranked_clicks
WHERE ranked_clicks.rank=1;
-----avoid global sort------------------------
SELECT id, name, salary, dept FROM employee
DISTRIBUTE BY dept --custom partition like
SORT BY id ASC, name DESC;
--------------------parallel -------------
<name>hive.exec.parallel</name>
<name>hive.exec.parallel.thread.number</name>
<value>8</value>
---------------- multi group by single reducer------------------
<name>hive.multigroupby.singlereducer</name>
<value>true</value>
<description>
Whether to optimize multi group by query to generate single M/R job plan. If the multi group by query has
common group by keys, it will be optimized to generate single M/R job.
--------------------bucketing----------------------------
uses hash partitioning
useful when cardinality of data is high
result in fixed number of files as opposed to Partitioning
it's similar to clustering.
recommended not to use
-----------------functions-----------------
show functions
show functions ".*date.*"
describe function <function name>
describe function extended <function name>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment