Skip to content

Instantly share code, notes, and snippets.

@tmcgrath
Created December 10, 2015 21:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save tmcgrath/ec88ada09b37126713a5 to your computer and use it in GitHub Desktop.
Save tmcgrath/ec88ada09b37126713a5 to your computer and use it in GitHub Desktop.
Spark Console Action functions in Scala
// Build an RDD[String] from a local List of three names.
// (sc is presumably the SparkContext supplied by the Spark console.)
scala> val names1 = sc.parallelize(List("abe", "abby", "apple"))
names1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0] at parallelize at <console>:12
// reduce concatenates all names into a single String. Note that the recorded
// result below is NOT in list order ("abe", "abby", "apple"): Spark's reduce
// expects a commutative and associative function, and string concatenation is
// not commutative, so the ordering of the output should not be relied upon.
scala> names1.reduce((t1,t2) => t1 + t2)
res0: String = abbyappleabe
// Turn each name into a one-element List holding its length, then sum the
// lengths: 3 + 4 + 5 = 12.
scala> names1.flatMap(k => List(k.size) ).reduce((t1,t2) => t1 + t2)
res1: Int = 12
// Pair each name with its length, producing an RDD of (String, Int) tuples.
scala> val names2 = sc.parallelize(List("apple", "beatty", "beatrice")).map(a => (a, a.size))
names2: org.apache.spark.rdd.RDD[(String, Int)] = MappedRDD[3] at map at <console>:12
// Extract the length (t._2) from every tuple and add them up: 5 + 6 + 8 = 19.
scala> names2.flatMap(t => Array(t._2)).reduce(_ + _)
res2: Int = 19
// flatMap flattens the List returned for each element, so every input value
// appears three times in the collected Array.
scala> sc.parallelize(List(1,2,3)).flatMap(x=>List(x,x,x)).collect
res3: Array[Int] = Array(1, 1, 1, 2, 2, 2, 3, 3, 3)
// count returns the number of elements in the RDD as a Long (3 tuples here).
scala> names2.count
res4: Long = 3
// (empty prompt: nothing was entered on this line)
scala>
// first returns the first element of the RDD.
scala> names2.first
res5: (String, Int) = (apple,5)
// take(2) returns the first two elements as an Array.
// The same call is repeated eleven times below (res6 through res16); every
// invocation in this session returned the identical two tuples.
scala> names2.take(2)
res6: Array[(String, Int)] = Array((apple,5), (beatty,6))
scala> names2.take(2)
res7: Array[(String, Int)] = Array((apple,5), (beatty,6))
scala> names2.take(2)
res8: Array[(String, Int)] = Array((apple,5), (beatty,6))
scala> names2.take(2)
res9: Array[(String, Int)] = Array((apple,5), (beatty,6))
scala> names2.take(2)
res10: Array[(String, Int)] = Array((apple,5), (beatty,6))
scala> names2.take(2)
res11: Array[(String, Int)] = Array((apple,5), (beatty,6))
scala> names2.take(2)
res12: Array[(String, Int)] = Array((apple,5), (beatty,6))
scala> names2.take(2)
res13: Array[(String, Int)] = Array((apple,5), (beatty,6))
scala> names2.take(2)
res14: Array[(String, Int)] = Array((apple,5), (beatty,6))
scala> names2.take(2)
res15: Array[(String, Int)] = Array((apple,5), (beatty,6))
scala> names2.take(2)
res16: Array[(String, Int)] = Array((apple,5), (beatty,6))
// Build an RDD[String] of six team names to sample from.
scala> val teams = sc.parallelize(List("twins", "brewers", "cubs", "white sox", "indians", "bad news bears"))
teams: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[7] at parallelize at <console>:12
// takeSample(true, 3) returns an Array of 3 randomly chosen elements. The first
// argument (true) enables sampling with replacement, which is why the same team
// can appear more than once in a single result (see res21 and res22 below).
// No seed is passed, and each of the six calls below returned a different sample.
scala> teams.takeSample(true, 3)
res17: Array[String] = Array(white sox, twins, brewers)
scala> teams.takeSample(true, 3)
res18: Array[String] = Array(white sox, bad news bears, brewers)
scala> teams.takeSample(true, 3)
res19: Array[String] = Array(indians, cubs, white sox)
scala> teams.takeSample(true, 3)
res20: Array[String] = Array(bad news bears, brewers, cubs)
scala> teams.takeSample(true, 3)
res21: Array[String] = Array(white sox, white sox, indians)
scala> teams.takeSample(true, 3)
res22: Array[String] = Array(bad news bears, twins, bad news bears)
// Build an RDD[String] of team names in which "wild" occurs three times.
scala> val hockeyTeams = sc.parallelize(List("wild", "blackhawks", "red wings", "wild", "oilers", "whalers", "jets", "wild"))
hockeyTeams: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[14] at parallelize at <console>:12
// Turn every name into a (name, 1) pair so that countByKey can be applied;
// countByKey returns a local Map from each distinct key to its number of
// occurrences (wild -> 3, every other team -> 1).
scala> hockeyTeams.map(k => (k,1)).countByKey
res23: scala.collection.Map[String,Long] = Map(jets -> 1, blackhawks -> 1, red wings -> 1, oilers -> 1, whalers -> 1, wild -> 3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment