Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.1.0
      /_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.6.0_65)
Type in expressions to have them evaluated.
Type :help for more information.
2015-12-10 11:06:08.048 java[89392:1203] Unable to load realm info from SCDynamicStore
15/12/10 11:06:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark context available as sc.
scala> val babyNames = sc.textFile("baby_names.csv")
babyNames: org.apache.spark.rdd.RDD[String] = baby_names.csv MappedRDD[1] at textFile at <console>:12
scala> val rows = babyNames.map(line => line.split(","))
rows: org.apache.spark.rdd.RDD[Array[String]] = MappedRDD[2] at map at <console>:14
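textFile reads the CSV as an RDD of lines, and map(line => line.split(",")) turns each line into an Array[String] of fields. Note that a plain split keeps the header line and does not handle quoted fields; a minimal sketch for dropping the header (assuming the first line of baby_names.csv is a header row):

val header = babyNames.first                                      // grab the (assumed) header line
val dataRows = babyNames.filter(_ != header).map(_.split(","))    // same rows, header removed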
scala> sc.parallelize(List(1,2,3)).flatMap(x=>List(x,x,x)).collect
res0: Array[Int] = Array(1, 1, 1, 2, 2, 2, 3, 3, 3)
scala> sc.parallelize(List(1,2,3)).map(x=>List(x,x,x)).collect
res1: Array[List[Int]] = Array(List(1, 1, 1), List(2, 2, 2), List(3, 3, 3))
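The two collects above show the core difference: map emits exactly one output per input (here a List[Int] per element), while flatMap flattens those Lists into a single RDD[Int]. Applied to the same babyNames RDD, a sketch that yields one element per field instead of one array per line:

val allFields = babyNames.flatMap(line => line.split(","))   // RDD[String], one field per element
// versus rows above, which is RDD[Array[String]], one array per line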
scala>
scala> sc.parallelize(List(1,2,3)).flatMap(x=>List(x,x,x))
res2: org.apache.spark.rdd.RDD[Int] = FlatMappedRDD[8] at flatMap at <console>:13
scala> sc.parallelize(List(1,2,3)).map(x=>List(x,x,x))
res3: org.apache.spark.rdd.RDD[List[Int]] = MappedRDD[10] at map at <console>:13
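Without collect, the same calls only return RDD handles: map and flatMap are lazy transformations, so Spark records the lineage but computes nothing until an action runs. Adding an action such as count forces evaluation:

sc.parallelize(List(1, 2, 3)).flatMap(x => List(x, x, x)).count   // triggers the job; yields 9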
scala> val davidRows = rows.filter(row => row(1).contains("DAVID"))
davidRows: org.apache.spark.rdd.RDD[Array[String]] = FilteredRDD[11] at filter at <console>:16
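filter keeps only the rows whose second field (row(1)) contains "DAVID"; like map, it is lazy until an action runs. A follow-on sketch, where the column position is an assumption about the file layout:

// Hypothetical: assumes field index 2 holds the county in baby_names.csv
davidRows.map(row => row(2)).distinct.count   // distinct counties with a DAVID row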
scala> val parallel = sc.parallelize(1 to 9, 3)
parallel: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[12] at parallelize at <console>:12
scala> parallel.mapPartitions( x => List(x.next).iterator).collect
res4: Array[Int] = Array(1, 4, 7)
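mapPartitions invokes the function once per partition rather than once per element, so x.next pulls the first element of each of the three partitions: 1, 4, and 7. Another per-partition sketch, summing each partition's elements:

parallel.mapPartitions(it => Iterator(it.sum)).collect   // expect Array(6, 15, 24) for partitions (1,2,3), (4,5,6), (7,8,9)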
scala> val parallel = sc.parallelize(1 to 9)
parallel: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[14] at parallelize at <console>:12
scala> parallel.mapPartitions( x => List(x.next).iterator).collect
res5: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8)
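With no partition count given, parallelize falls back to spark.default.parallelism (eight on this machine), so the nine elements land in eight partitions and x.next returns the first element of each. The partition count can be checked directly:

parallel.partitions.size   // 8 here; depends on cores / spark.default.parallelism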
scala> parallel.mapPartitionsWithIndex( (index: Int, it: Iterator[Int]) => it.toList.map(x => index + ", "+x).iterator).collect
res6: Array[String] = Array(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 7, 9)
scala> val parallel = sc.parallelize(1 to 9, 3)
parallel: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[17] at parallelize at <console>:12
scala> parallel.mapPartitionsWithIndex( (index: Int, it: Iterator[Int]) => it.toList.map(x => index + ", "+x).iterator).collect
res7: Array[String] = Array(0, 1, 0, 2, 0, 3, 1, 4, 1, 5, 1, 6, 2, 7, 2, 8, 2, 9)
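mapPartitionsWithIndex passes the partition index alongside each partition's iterator. Each output element above is really the string "index, value", but Spark's Array printing runs the embedded commas together with the element separators. Emitting tuples instead keeps the output unambiguous, as in this sketch:

parallel.mapPartitionsWithIndex((index, it) => it.map(x => (index, x))).collect
// expect Array((0,1), (0,2), (0,3), (1,4), (1,5), (1,6), (2,7), (2,8), (2,9))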
scala> val parallel = sc.parallelize(1 to 9)
parallel: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[19] at parallelize at <console>:12
scala> parallel.sample(true,.2).count
res8: Long = 2
scala> parallel.sample(true,.2).count
res9: Long = 2
scala> parallel.sample(true,.2).count
res10: Long = 2
scala> parallel.sample(true,.2).count
res11: Long = 3
scala> parallel.sample(true,.2).count
res12: Long = 2
scala> parallel.sample(true,.2).count
res13: Long = 0
scala> parallel.sample(true,.1)
res14: org.apache.spark.rdd.RDD[Int] = PartitionwiseSampledRDD[26] at sample at <console>:15
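sample(withReplacement, fraction) treats fraction as the expected probability of picking each element, not an exact count, which is why the counts above bounce between 0 and 3 for the same 0.2 fraction. Passing the optional seed makes a draw reproducible:

parallel.sample(false, 0.2, 7L).count   // same seed, same sample every run; 7L is an arbitrary seed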