Created
September 13, 2018 07:10
-
-
Save Abhiknoldur/1cf594d5aacd90735e88f7ea5027943d to your computer and use it in GitHub Desktop.
DataFrame and Dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
scala> val rdd1=sc.parallelize(Seq((1,3.6))) | |
rdd1: org.apache.spark.rdd.RDD[(Int, Double)] = ParallelCollectionRDD[0] at parallelize at <console>:24 | |
scala> val rdd2=sc.parallelize(Seq((1,1.1))) | |
rdd2: org.apache.spark.rdd.RDD[(Int, Double)] = ParallelCollectionRDD[1] at parallelize at <console>:24 | |
scala> val jo=rdd1 join rdd2 | |
jo: org.apache.spark.rdd.RDD[(Int, (Double, Double))] = MapPartitionsRDD[4] at join at <console>:27 | |
scala> jo.map(r=>(r._1,(r._2._1-r._2._2))) | |
res0: org.apache.spark.rdd.RDD[(Int, Double)] = MapPartitionsRDD[5] at map at <console>:26 | |
scala> res0.collect | |
res1: Array[(Int, Double)] = Array((1,2.5)) | |
scala> | |
scala> val df =spark.range(100) | |
2018-09-12 12:12:53 WARN ObjectStore:6666 - Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0 | |
2018-09-12 12:12:53 WARN ObjectStore:568 - Failed to get database default, returning NoSuchObjectException | |
2018-09-12 12:12:57 WARN ObjectStore:568 - Failed to get database global_temp, returning NoSuchObjectException | |
df: org.apache.spark.sql.Dataset[Long] = [id: bigint] | |
scala> val df =df.toDF | |
<console>:25: error: recursive value df needs type | |
val df =df.toDF | |
^ | |
scala> val df1 =df.toDF | |
df1: org.apache.spark.sql.DataFrame = [id: bigint] | |
scala> df.show | |
+---+ | |
| id| | |
+---+ | |
| 0| | |
| 1| | |
| 2| | |
| 3| | |
| 4| | |
| 5| | |
| 6| | |
| 7| | |
| 8| | |
| 9| | |
| 10| | |
| 11| | |
| 12| | |
| 13| | |
| 14| | |
| 15| | |
| 16| | |
| 17| | |
| 18| | |
| 19| | |
+---+ | |
only showing top 20 rows | |
scala> val df1 =df.toDF("mycol") | |
df1: org.apache.spark.sql.DataFrame = [mycol: bigint] | |
scala> df.show | |
+---+ | |
| id| | |
+---+ | |
| 0| | |
| 1| | |
| 2| | |
| 3| | |
| 4| | |
| 5| | |
| 6| | |
| 7| | |
| 8| | |
| 9| | |
| 10| | |
| 11| | |
| 12| | |
| 13| | |
| 14| | |
| 15| | |
| 16| | |
| 17| | |
| 18| | |
| 19| | |
+---+ | |
only showing top 20 rows | |
scala> df1.filter(mycol % 2==0) | |
<console>:26: error: not found: value mycol | |
df1.filter(mycol % 2==0) | |
^ | |
scala> df1.filter(id % 2==0) | |
<console>:26: error: not found: value id | |
df1.filter(id % 2==0) | |
^ | |
scala> df1.filter("id % 2==0") | |
res6: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [mycol: bigint] | |
scala> df1.filter("mycol % 2==0") | |
res7: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [mycol: bigint] | |
scala> df1.filter("id % 2==0").show | |
+-----+ | |
|mycol| | |
+-----+ | |
| 0| | |
| 2| | |
| 4| | |
| 6| | |
| 8| | |
| 10| | |
| 12| | |
| 14| | |
| 16| | |
| 18| | |
| 20| | |
| 22| | |
| 24| | |
| 26| | |
| 28| | |
| 30| | |
| 32| | |
| 34| | |
| 36| | |
| 38| | |
+-----+ | |
only showing top 20 rows | |
scala> df1.filter("mycol % 2==0").show | |
+-----+ | |
|mycol| | |
+-----+ | |
| 0| | |
| 2| | |
| 4| | |
| 6| | |
| 8| | |
| 10| | |
| 12| | |
| 14| | |
| 16| | |
| 18| | |
| 20| | |
| 22| | |
| 24| | |
| 26| | |
| 28| | |
| 30| | |
| 32| | |
| 34| | |
| 36| | |
| 38| | |
+-----+ | |
only showing top 20 rows | |
scala> df1.printSchema | |
root | |
|-- mycol: long (nullable = false) | |
scala> df1.create | |
createGlobalTempView createOrReplaceGlobalTempView createOrReplaceTempView createTempView | |
scala> df1.createOrReplace | |
createOrReplaceGlobalTempView createOrReplaceTempView | |
scala> df1.createOrReplaceTempView | |
<console>:26: error: missing argument list for method createOrReplaceTempView in class Dataset | |
Unapplied methods are only converted to functions when a function type is expected. | |
You can make this conversion explicit by writing `createOrReplaceTempView _` or `createOrReplaceTempView(_)` instead of `createOrReplaceTempView`. | |
df1.createOrReplaceTempView | |
^ | |
scala> df1.createOrReplaceTempView("Numbers") | |
scala> spark.sql("select * from numbers where mycol>10").show | |
+-----+ | |
|mycol| | |
+-----+ | |
| 11| | |
| 12| | |
| 13| | |
| 14| | |
| 15| | |
| 16| | |
| 17| | |
| 18| | |
| 19| | |
| 20| | |
| 21| | |
| 22| | |
| 23| | |
| 24| | |
| 25| | |
| 26| | |
| 27| | |
| 28| | |
| 29| | |
| 30| | |
+-----+ | |
only showing top 20 rows | |
scala> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Spark SQL Hands-on