@anuragkh
Forked from dennyglee/Spark 1.4,Java7
Created October 21, 2015 21:53
Spark 1.4 PermGenSize Error (ssimeonov)

Three spark-shell runs against the same gzipped JSON file: on Java 7 with the default PermGen cap, the session dies with java.lang.OutOfMemoryError: PermGen space as soon as a HiveContext reads the file; on Java 7 with -XX:MaxPermSize=256m it succeeds; on Java 8 it succeeds with no extra flags.
/* Spark Shell Executed */
./bin/spark-shell --master spark://servername:7077 --driver-class-path $CLASSPATH
/* Output */
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.4.0
      /_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_79)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
15/07/06 18:39:40 WARN Connection: BoneCP specified but not present in CLASSPATH (or one of dependencies)
15/07/06 18:39:40 WARN Connection: BoneCP specified but not present in CLASSPATH (or one of dependencies)
15/07/06 18:39:42 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 0.13.1aa
SQL context available as sqlContext.
scala> val df = sqlContext.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...
scala> df.registerTempTable("training")
scala> sqlContext.sql("select count(*) as cnt from training").collect().take(10).foreach(println)
[88175]
scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext
scala> val ctx = new HiveContext(sc)
ctx: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@21165c0e
scala> import ctx.implicits._
import ctx.implicits._
scala> val df = ctx.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
java.lang.OutOfMemoryError: PermGen space
Stopping spark context.
Exception in thread "main"
Exception: java.lang.OutOfMemoryError thrown from the UncaughtExceptionHandler in thread "main"
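/* Why this run fails (note, not part of the original gist): instantiating a
   HiveContext loads a large number of extra classes (Hive, Datanucleus, and
   generated code) into the permanent generation, and on Java 7 HotSpot the
   default PermGen cap is small -- on the order of 64-82 MB depending on the
   JVM. A rough way to watch the pool fill up, run from the same REPL before
   the failing jsonFile call, is the standard JMX memory-pool beans: */

// Diagnostic sketch, not in the original session. Pool names vary by
// collector ("PS Perm Gen", "CMS Perm Gen", ...), so match on "Perm".
import java.lang.management.ManagementFactory
import scala.collection.JavaConverters._

ManagementFactory.getMemoryPoolMXBeans.asScala
  .filter(_.getName.contains("Perm"))
  .foreach { pool =>
    val u = pool.getUsage
    println(s"${pool.getName}: ${u.getUsed / 1024 / 1024} MB used of ${u.getMax / 1024 / 1024} MB max")
  }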
/* Spark Shell Executed */
./bin/spark-shell --master spark://servername:7077 --driver-class-path $CLASSPATH --conf "spark.driver.extraJavaOptions=-XX:MaxPermSize=256m"
/* Output */
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.4.0
      /_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_79)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
SQL context available as sqlContext.
scala> val df = sqlContext.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...
scala> df.registerTempTable("training")
scala> sqlContext.sql("select count(*) as cnt from training").collect().take(10).foreach(println)
[88175]
scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext
scala> val ctx = new HiveContext(sc)
ctx: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@1f5588c1
scala> import ctx.implicits._
import ctx.implicits._
scala> val df = ctx.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...
scala> df.registerTempTable("training")
scala> val dfCount = ctx.sql("select count(*) as cnt from training")
dfCount: org.apache.spark.sql.DataFrame = [cnt: bigint]
scala> println(dfCount.first.getLong(0))
88175
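/* Note (not part of the original gist): the --conf flag above raises PermGen
   only for the driver. Executors that load a comparable number of classes can
   hit the same limit, and both settings can be made persistent in
   conf/spark-defaults.conf instead of being passed on every invocation.
   A sketch; the executor-side line is an assumption, needed only if the
   executors also run out of PermGen: */

# conf/spark-defaults.conf
spark.driver.extraJavaOptions    -XX:MaxPermSize=256m
spark.executor.extraJavaOptions  -XX:MaxPermSize=256m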
/* Spark Shell Executed */
./bin/spark-shell --master spark://servername:7077 --driver-class-path $CLASSPATH
/* Output */
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.4.0
      /_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_45)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
SQL context available as sqlContext.
scala> val df = sqlContext.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...
scala> df.registerTempTable("training")
scala> sqlContext.sql("select count(*) as cnt from training").collect().take(10).foreach(println)
[88175]
scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext
scala> val ctx = new HiveContext(sc)
ctx: org.apache.spark.sql.hive.HiveContext = org.apache.spark.sql.hive.HiveContext@38c408c7
scala> import ctx.implicits._
scala> val df = ctx.jsonFile("/data/ssimeonov/gz/apache-spark-failure-data-part-00000.gz")
df: org.apache.spark.sql.DataFrame = [aac_brand: string, aag__id: bigint, aag_weight: bigint, aca_brand: string, aca_conversion_integration: boolean, aca_daily_budget: bigint, aca_hide_brand_from_publishers: boolean, aca_is_remnant: boolean, aca_short_name: string, accid: string, acr__id: bigint, acr_choices: array<struct<cta:string,headline:string,img:string,target:string>>, acr_cta: string, acr_description1: string, acr_description2: string, acr_destination: string, acr_displayUrl: string, acr_headline: string, acr_img: string, acr_isiUrl: string, acr_paramCTA: string, acr_paramName: string, acr_paramPlaceholder: string, acr_target: string, acr_type: string, acr_weight: bigint, agid: string, akw__id: bigint, akw_canonical_id: bigint, akw_criterion_type: string, akw_destination_url: st...
scala> df.registerTempTable("training")
scala> val dfCount = ctx.sql("select count(*) as cnt from training")
dfCount: org.apache.spark.sql.DataFrame = [cnt: bigint]
scala> println(dfCount.first.getLong(0))
88175
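/* Note (not part of the original gist): the third run needs no PermGen flag
   because Java 8 removed the permanent generation entirely (JEP 122); class
   metadata now lives in native-memory Metaspace, which grows on demand unless
   capped with -XX:MaxMetaspaceSize. A quick sketch to confirm which pools a
   given JVM exposes, runnable from either shell: */

import java.lang.management.ManagementFactory
import scala.collection.JavaConverters._
// Java 7 lists a "... Perm Gen" pool; Java 8 lists "Metaspace" instead.
ManagementFactory.getMemoryPoolMXBeans.asScala.foreach(p => println(p.getName))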