Skip to content

Instantly share code, notes, and snippets.

@ueshin
Created September 7, 2014 05:59
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ueshin/9a2eec6a6034e170d94c to your computer and use it in GitHub Desktop.
Save ueshin/9a2eec6a6034e170d94c to your computer and use it in GitHub Desktop.

Getting Started with Spark by Aaron

RDD

// Sample data: a plain local Scala range covering 0 through 23.
val numbers = 0 until 24
// An ordinary local-collection filter, for comparison with the RDD operations below.
numbers.filter(n => n > 10)
// Distribute the local range as an RDD through the shell's SparkContext.
val rdd = sc.makeRDD(numbers)

// First element of the RDD.
rdd.first()

// Bring every element back to the driver as an array.
rdd.collect()

// glom() groups the elements of each partition into one array,
// so this prints one List per partition.
rdd.glom().collect().foreach(partition => println(partition.toList))

// Print each element on its own line.
rdd.collect().foreach(println)


// Load the text file as an RDD of lines.
val f = sc.textFile("CHANGES.txt")
f.first()

// Print every line of the file on the driver.
f.collect().foreach(println)

// Split each line on single spaces and drop the empty tokens.
val words = f.flatMap(line => line.split(" ")).filter(word => word.nonEmpty)
words.take(10)

// Pair every word with an initial count of 1.
val wordsPairs = words.map(word => (word, 1))
wordsPairs.take(10)

// Sum the counts per word.
val wordcount = wordsPairs.reduceByKey((left, right) => left + right)
wordcount.take(10)

wordcount.collect().foreach(println)

// Orders (word, count) pairs by count, ascending; takeOrdered picks it up implicitly.
// NOTE(review): because the ordering is ascending, this prints the 10 pairs with the
// LOWEST counts. If the most frequent words are wanted, reverse the ordering — confirm intent.
implicit val ordering: Ordering[(String, Int)] = Ordering.by[(String, Int), Int](pair => pair._2)
wordcount.takeOrdered(10).foreach(println)

Spark SQL

// Read the people file as an RDD of text lines.
val peopleFile = sc.textFile("examples/src/main/resources/people.txt")
peopleFile.first()

// Record type for one parsed line of the people file.
case class Person(name: String, age: Int)

// Split each line on commas: field 0 is the name, field 1 (whitespace-trimmed) is the age.
// Kept on a single line so it can be pasted into the shell as-is.
val p = peopleFile.map(line => line.split(",")).map(fields => Person(fields(0), fields(1).trim.toInt))
p.collect()


import org.apache.spark.sql._

// SQL entry point built on the shell's existing SparkContext.
val sqlContext = new SQLContext(sc)
// Brings sql(...) and the symbol-based query DSL used below into scope.
import sqlContext._

// Turn the RDD of Person objects into a SchemaRDD.
val people = sqlContext.createSchemaRDD(p)

people.printSchema()
// Register the SchemaRDD under the table name "people" so SQL text can refer to it.
people.registerAsTable("people")

sql("select * from people").collect()


sql("select name from people where age > 20").collect()

// The same query as the SQL string above, expressed with the language-integrated DSL.
people.where('age > 20).select('name).collect()

Wikipedia

// Load a Parquet data set through the SQL context and inspect its schema.
val wiki = sqlContext.parquetFile("wiki_parquet")
wiki.printSchema()
// Register it under the table name "wiki" for the SQL queries below.
wiki.registerAsTable("wiki")

// Titles of rows whose text contains " Scala " (with a space on each side).
sql("SELECT title FROM wiki WHERE text LIKE '% Scala %'").collect()

// Titles of rows whose text contains "Matsuri".
sql("SELECT title FROM wiki WHERE text LIKE '%Matsuri%'").collect()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment