DataFrame and Dataset
scala> val rdd1=sc.parallelize(Seq((1,3.6)))
rdd1: org.apache.spark.rdd.RDD[(Int, Double)] = ParallelCollectionRDD[0] at parallelize at <console>:24
scala> val rdd2=sc.parallelize(Seq((1,1.1)))
rdd2: org.apache.spark.rdd.RDD[(Int, Double)] = ParallelCollectionRDD[1] at parallelize at <console>:24
scala> val jo=rdd1 join rdd2
jo: org.apache.spark.rdd.RDD[(Int, (Double, Double))] = MapPartitionsRDD[4] at join at <console>:27
scala> jo.map(r=>(r._1,(r._2._1-r._2._2)))
res0: org.apache.spark.rdd.RDD[(Int, Double)] = MapPartitionsRDD[5] at map at <console>:26
scala> res0.collect
res1: Array[(Int, Double)] = Array((1,2.5))
scala>
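The same subtraction can be written more directly with mapValues on the joined pair RDD; a minimal sketch, reusing the rdd1 and rdd2 definitions above:

val diff = (rdd1 join rdd2).mapValues { case (a, b) => a - b }
diff.collect    // should give Array((1,2.5)), matching res1 above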
scala> val df =spark.range(100)
2018-09-12 12:12:53 WARN ObjectStore:6666 - Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0
2018-09-12 12:12:53 WARN ObjectStore:568 - Failed to get database default, returning NoSuchObjectException
2018-09-12 12:12:57 WARN ObjectStore:568 - Failed to get database global_temp, returning NoSuchObjectException
df: org.apache.spark.sql.Dataset[Long] = [id: bigint]
scala> val df =df.toDF
<console>:25: error: recursive value df needs type
val df =df.toDF
^
scala> val df1 =df.toDF
df1: org.apache.spark.sql.DataFrame = [id: bigint]
scala> df.show
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+
only showing top 20 rows
scala> val df1 =df.toDF("mycol")
df1: org.apache.spark.sql.DataFrame = [mycol: bigint]
scala> df.show
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+
only showing top 20 rows
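Note that toDF("mycol") returns a new DataFrame and leaves df untouched, which is why df.show above still prints the id column. A minimal sketch of looking at the renamed frame df1 instead:

df1.show(5)    // should print the same values under the mycol header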
scala> df1.filter(mycol % 2==0)
<console>:26: error: not found: value mycol
df1.filter(mycol % 2==0)
^
scala> df1.filter(id % 2==0)
<console>:26: error: not found: value id
df1.filter(id % 2==0)
^
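The two errors above occur because mycol and id are not Scala values in scope; a column has to be referenced either as a SQL expression string (as in the next commands) or as a Column object. A minimal sketch of the Column-based form, assuming a standard spark-shell session:

import spark.implicits._                   // $"..." column syntax (auto-imported in spark-shell)
import org.apache.spark.sql.functions.col

df1.filter($"mycol" % 2 === 0).show()      // Column expression instead of a string
df1.filter(col("mycol") % 2 === 0).show()  // equivalent, using col()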
scala> df1.filter("id % 2==0")
res6: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [mycol: bigint]
scala> df1.filter("mycol % 2==0")
res7: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [mycol: bigint]
scala> df1.filter("id % 2==0").show
+-----+
|mycol|
+-----+
|    0|
|    2|
|    4|
|    6|
|    8|
|   10|
|   12|
|   14|
|   16|
|   18|
|   20|
|   22|
|   24|
|   26|
|   28|
|   30|
|   32|
|   34|
|   36|
|   38|
+-----+
only showing top 20 rows
scala> df1.filter("mycol % 2==0").show
+-----+
|mycol|
+-----+
|    0|
|    2|
|    4|
|    6|
|    8|
|   10|
|   12|
|   14|
|   16|
|   18|
|   20|
|   22|
|   24|
|   26|
|   28|
|   30|
|   32|
|   34|
|   36|
|   38|
+-----+
only showing top 20 rows
scala> df1.printSchema
root
|-- mycol: long (nullable = false)
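The schema can also be inspected programmatically; a minimal sketch whose output should mirror printSchema above:

df1.schema.fields.foreach { f =>
  println(s"${f.name}: ${f.dataType.typeName} (nullable = ${f.nullable})")
}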
scala> df1.create
createGlobalTempView createOrReplaceGlobalTempView createOrReplaceTempView createTempView
scala> df1.createOrReplace
createOrReplaceGlobalTempView createOrReplaceTempView
scala> df1.createOrReplaceTempView
<console>:26: error: missing argument list for method createOrReplaceTempView in class Dataset
Unapplied methods are only converted to functions when a function type is expected.
You can make this conversion explicit by writing `createOrReplaceTempView _` or `createOrReplaceTempView(_)` instead of `createOrReplaceTempView`.
df1.createOrReplaceTempView
^
scala> df1.createOrReplaceTempView("Numbers")
scala> spark.sql("select * from numbers where mycol>10").show
+-----+
|mycol|
+-----+
|   11|
|   12|
|   13|
|   14|
|   15|
|   16|
|   17|
|   18|
|   19|
|   20|
|   21|
|   22|
|   23|
|   24|
|   25|
|   26|
|   27|
|   28|
|   29|
|   30|
+-----+
only showing top 20 rows
scala>
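The query above works even though the view was registered as "Numbers" and queried as "numbers": Spark SQL identifiers are case-insensitive by default (spark.sql.caseSensitive=false). The same result can also be obtained without SQL; a minimal sketch:

spark.table("numbers").where($"mycol" > 10).show()
// or straight from the DataFrame, bypassing the view:
df1.where($"mycol" > 10).show()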