-- Compute column-level statistics in Hive (the COLUMNS keyword is required before the column list)
ANALYZE TABLE my_database.my_table COMPUTE STATISTICS FOR COLUMNS column1, column2, column3; -- column stats for a non-partitioned table
ANALYZE TABLE my_database.my_table PARTITION (YEAR=2017, MONTH=11, DAY=30, HOUR=0) COMPUTE STATISTICS FOR COLUMNS column1, column2, column3; -- column stats for a single hour of a partitioned table
ANALYZE TABLE my_database.my_table PARTITION (YEAR=2017, MONTH=11, DAY=30, HOUR) COMPUTE STATISTICS FOR COLUMNS column1, column2, column3; -- column stats for all hours of a single day of a partitioned table
-- Let the optimizer answer eligible queries (e.g. count(*), min, max) straight from stored stats
set hive.compute.query.using.stats=true;
-- Use column statistics when building query plans
set hive.stats.fetch.column.stats=true;
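To confirm the statistics were actually recorded, DESCRIBE FORMATTED can print per-column stats; column1 here stands in for any of the analyzed columns:

DESCRIBE FORMATTED my_database.my_table column1; -- shows min/max, null count, distinct count, etc.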
-- CTAS into an ORC table with Snappy compression (the documented property key is lowercase: orc.compress)
CREATE TABLE my_database.my_table
STORED AS ORC TBLPROPERTIES ('orc.compress'='SNAPPY') AS
SELECT * FROM my_database.my_other_table WHERE YEAR=2017 AND MONTH=11 AND DAY=30;
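A quick way to verify the new table picked up the compression property:

SHOW CREATE TABLE my_database.my_table; -- TBLPROPERTIES should include 'orc.compress'='SNAPPY'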
-- not a random sample: LIMIT alone just returns the first 10,000 rows Hive happens to read
select * from my_table
limit 10000;

-- a true random sample, but ORDER BY imposes a total order through a single reducer (slow at scale)
select * from my_table
order by rand()
limit 10000;

-- faster: SORT BY orders rows randomly within each reducer, but rows stay on the reducer they were assigned to
select * from my_table
sort by rand()
limit 10000;

-- better: DISTRIBUTE BY rand() also assigns rows to reducers at random before the per-reducer random sort
select * from my_table
distribute by rand()
sort by rand()
limit 10000;

-- best for large tables: pre-filter with rand() so only ~0.01% of rows enter the shuffle at all
select * from my_table
where rand() <= 0.0001
distribute by rand()
sort by rand()
limit 10000;
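If an approximate sample is acceptable, Hive also ships a built-in block-level sampling clause. A minimal sketch (the 1 percent figure is arbitrary; sampling granularity is the HDFS block, so small tables may return more than expected):

select * from my_table tablesample(1 percent) s;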

Export Hive Table to CSV: Method 1 (hiveToCsv_1.sh)
#!/bin/bash

# Write the table as comma-delimited files to a local directory
# (Hive emits one or more part files named 000000_0, 000001_0, ...).
# Note: this output format does not quote fields, so embedded commas will break the CSV.
hive -e "insert overwrite local directory '/path/in/local/'
row format delimited fields terminated by ','
select * from my_database.my_table"

# Concatenate the part files into a single CSV
cat /path/in/local/* > /another/path/in/local/my_table.csv
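The filename suggests more than one method exists; as a sketch (not necessarily this gist's Method 2), beeline can emit properly quoted CSV directly, assuming a reachable HiveServer2 at the placeholder JDBC URL:

# jdbc:hive2://localhost:10000 is a placeholder endpoint
beeline -u jdbc:hive2://localhost:10000 --outputformat=csv2 --silent=true \
  -e "select * from my_database.my_table" > /path/in/local/my_table.csv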

How to Change Log Level for Spark Streaming (streamingLogLevel.scala)
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf().setAppName(appName) // appName: your application's name; no master is set here, so supply it at submit time (run on cluster)
val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches
val sc = ssc.sparkContext
sc.setLogLevel("ERROR") // suppress INFO/WARN noise; valid levels include ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
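For a cluster-wide default instead of a per-app call, the same effect can come from Spark's log4j configuration; a sketch for Spark 2.x (which ships log4j 1.x), after copying conf/log4j.properties.template to conf/log4j.properties:

# conf/log4j.properties
log4j.rootCategory=ERROR, console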