set hive.execution.engine=tez;
CREATE TABLE ORC_TABLE (
customerID int,
name string,
age int,
address string
) STORED AS ORC tblproperties (“orc.compress" = “SNAPPY”);
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;
set hive.cbo.enable=true;
set hive.compute.query.using.stats=true;
set hive.stats.fetch.column.stats=true;
set hive.stats.fetch.partition.stats=true;
Then collect table statistics:
analyze table tweets compute statistics;
analyze table tweets compute statistics for columns;
Example. A query like this:
SELECT clicks.*
FROM clicks inner join (
select sessionID, max(timestamp) as max_ts
from clicks
group by sessionID
) latest ON clicks.sessionID = latest.sessionID and
clicks.timestamp = latest.max_ts;
could be re-written like this:
SELECT *
FROM (
SELECT *,
RANK() over (partition by sessionID,
order by timestamp desc) as rank
FROM clicks
) ranked_clicks
WHERE ranked_clicks.rank=1;