hadoopsters / streamingLogLevel.scala
How to Change Log Level for Spark Streaming
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf().setAppName(appName) // run on cluster
val ssc = new StreamingContext(conf, Seconds(5))
val sc = ssc.sparkContext
sc.setLogLevel("ERROR") // only ERROR and above will be logged
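Note: setLogLevel() accepts the standard log4j levels (ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN) and overrides whatever log4j.properties configures, so ERROR silences the INFO chatter Spark Streaming prints every batch interval.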
hadoopsters / hiveToCsv_1.sh
Export Hive Table to CSV: Method 1
#!/bin/bash

# Dump the table as comma-delimited text files, then stitch them into one CSV
hive -e "insert overwrite local directory '/path/in/local/'
row format delimited fields terminated by ','
select * from my_database.my_table"

cat /path/in/local/* > /another/path/in/local/my_table.csv
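One caveat: the export is raw delimited text, so there is no header row and no quoting; column values that themselves contain commas or newlines will corrupt the resulting CSV.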
hadoopsters / hiveToCsv_2.sh
Export Hive Table to CSV: Method 2
#!/bin/bash

# Materialize the table as comma-delimited files in HDFS, then merge them locally
hive -e "drop table if exists csv_dump;
create table csv_dump ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'
LOCATION '/temp/storage/path' as
select * from my_data_table;"

hadoop fs -getmerge /temp/storage/path/ /local/path/my.csv
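Here getmerge does the concatenation for you: it pulls every part file Hive wrote under /temp/storage/path and stitches them into a single local file, replacing the cat step from Method 1.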
import org.apache.spark.sql.SaveMode

val mydataframe = ... // put some data in your dataframe, friend

// append the DataFrame to a directory of Snappy-compressed ORC files
mydataframe
  .write
  .option("orc.compress", "snappy")
  .mode(SaveMode.Append)
  .orc("/this/is/an/hdfs/directory/")
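To sanity-check the write, the directory can be read straight back; a minimal sketch, assuming a sqlContext (or HiveContext) is in scope to match the snippets below:

// read the Snappy-compressed ORC directory back into a DataFrame
val readBack = sqlContext.read.orc("/this/is/an/hdfs/directory/")
readBack.show(10) // peek at the first few rows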
import org.apache.spark.sql.SaveMode

val mydataframe = ... // put some data in your dataframe, friend

// same write, but physically partitioned by year/month/day/hour
mydataframe
  .write
  .partitionBy("year", "month", "day", "hour")
  .option("orc.compress", "snappy")
  .mode(SaveMode.Append)
  .orc("/this/is/another/hdfs/directory")
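partitionBy() lays the data out as one subdirectory per partition value, e.g. .../year=2017/month=11/day=30/hour=5/, which lets downstream queries that filter on those columns skip whole directories.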
// imports needed to make this standalone
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SaveMode
// import this guy
import org.apache.spark.sql.hive.HiveContext

// this should look familiar
val conf = new SparkConf()
val sc = new SparkContext(conf)

// setup this fella
val hiveContext = new HiveContext(sc)

val mydstream = ... // these usually come from Spark Streaming apps
// they basically contain a chain of RDDs that you can convert to DFs
mydstream.foreachRDD(rdd => {
  hiveContext.createDataFrame(rdd)
    .write
    .option("orc.compress", "snappy")
    .mode(SaveMode.Append)
    .orc("/this/is/an/hdfs/directory/too")
})
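Note that createDataFrame(rdd) can only infer a schema by reflection when the RDD holds case class instances; for an RDD of plain rows you would have to pass a schema explicitly.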
hadoopsters / joining_streaming_and_static_datasets.scala
A simple way to join static and streaming datasets using the transform() function of a DStream.
// create a case class to represent a Transaction (from streaming)
case class Transaction(
  ts: Int,
  customer_id: Int,
  transaction_id: String,
  amount: Double
)

// create a case class to represent a TransactionDetail (from static)
// (the gist is truncated here; the fields below are assumed for illustration)
case class TransactionDetail(
  customer_id: Int,
  tier: String
)
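The gist is cut off above, so here is a minimal sketch of the transform() join the description refers to, not the author's original code. It assumes the hypothetical TransactionDetail fields defined above, plus an existing StreamingContext ssc and a DStream[Transaction] named transactionStream:

// static lookup data, keyed by customer_id (values made up for illustration)
val detailRDD = ssc.sparkContext
  .parallelize(Seq(TransactionDetail(1, "gold"), TransactionDetail(2, "standard")))
  .map(d => (d.customer_id, d))

// transform() hands you each micro-batch as a plain RDD,
// so an ordinary pair-RDD join against the static RDD works
val enriched = transactionStream.transform { rdd =>
  rdd.map(t => (t.customer_id, t)).join(detailRDD)
}

// each element is (customer_id, (Transaction, TransactionDetail))
enriched.print()

Because the static RDD is built once on the driver, it is reused for every batch; for a large lookup table, a broadcast join inside transform() is the usual optimization.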
CREATE TABLE my_database.my_table
(
  column_1 string,
  column_2 int,
  column_3 double
)
STORED AS ORC
TBLPROPERTIES('ORC.COMPRESS'='SNAPPY'); -- ensure SNAPPY is uppercase; lowercase triggers a nasty bug in Hive (fixed in later versions)
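A CTAS variant produces the same Snappy-compressed ORC table without spelling out the columns, since the schema is inherited from the SELECT: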
CREATE TABLE my_database.my_table
STORED AS ORC TBLPROPERTIES('ORC.COMPRESS'='SNAPPY') AS
SELECT * FROM my_database.my_other_table WHERE YEAR=2017 AND MONTH=11 AND DAY=30;