Skip to content

Instantly share code, notes, and snippets.

Last active Apr 24, 2019
What would you like to do?
Spark Streaming demo
# Concatenate part files into a single "minitweets" input file.
# NOTE(review): [1-3] matches three parts, not four as the comment said — confirm range.
cat rawtweets/part-0000[1-3] > minitweets

# Change log4j level to WARN to reduce console noise during the demo.
# Spark ships a template; copy it into place before editing.
# (Original scrape lost the filenames — these are the standard Spark 1.x paths.)
mv conf/log4j.properties.template conf/log4j.properties
vim conf/log4j.properties   # change log4j.rootCategory from INFO to WARN

# Launch the Spark shell REPL.
./bin/spark-shell
// Load the raw tweets (one JSON document per line).
val tweets = sc.textFile("minitweets")

// Pretty-print data that's in JSON format, for manual inspection.
import com.fasterxml.jackson.databind.ObjectMapper

/**
 * Parses a JSON string and returns it re-serialized with indentation.
 * The original snippet was truncated: the parsed value was never used.
 *
 * @param str a single JSON document (e.g. one raw tweet)
 * @return the same document, pretty-printed
 */
def prettyPrint(str: String): String = {
  val mapper = new ObjectMapper()
  val prettyPrinter = mapper.writerWithDefaultPrettyPrinter()
  val obj = mapper.readValue(str, classOf[java.util.Map[String, Object]])
  prettyPrinter.writeValueAsString(obj)
}
// sc is an existing SparkContext.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext._

// Create a SchemaRDD from the JSON file, inferring the schema from the data.
val tweetTable = sqlContext.jsonFile("minitweets")

// Let's take a look at the inferred schema.
tweetTable.printSchema()

// Register the SchemaRDD so the SQL queries below can reference it by name.
tweetTable.registerTempTable("tweetTable")

// Use SQL queries to explore the data.
sql("SELECT text FROM tweetTable LIMIT 10").collect.foreach(println)
// fixed: original had a stray comma after SELECT ("SELECT, text, lang"), a syntax error
sql("SELECT text, lang FROM tweetTable LIMIT 10").collect.foreach(println)

// Which are the top ten languages represented?
sql("SELECT lang, COUNT(*) AS cnt FROM tweetTable GROUP BY lang ORDER BY cnt DESC LIMIT 10").collect.foreach(println)
// Feature engineering, based on a hashed character-bigram (n-gram) approach.
val texts = sql("SELECT text FROM tweetTable").map(_.head.toString)

import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}

/**
 * Hashes a string's character bigrams into a fixed-size sparse feature vector.
 * Each bigram adds 1/numBigrams of weight to the bucket its hash code maps to,
 * so the vector is normalized by the number of bigrams.
 *
 * @param s input text (e.g. a tweet)
 * @return a sparse vector of dimension 1000
 */
def featurize(s: String): Vector = {
  val n = 1000
  val result = new Array[Double](n)
  val bigrams = s.sliding(2).toArray
  // math.abs guards against negative hash codes, which would yield a
  // negative bucket index and throw ArrayIndexOutOfBoundsException.
  for (h <- bigrams.map(b => math.abs(b.hashCode % n))) {
    result(h) += 1.0 / bigrams.length
  }
  // Keep only the non-zero buckets as (index, value) pairs.
  Vectors.sparse(n, result.zipWithIndex.filter(_._1 != 0).map(_.swap))
}

// Test this function.
featurize("Hello World!")
// Train a KMeans model: 10 clusters, 20 iterations.
// Cache the feature vectors — KMeans makes multiple passes over them.
// (Original scrape truncated the right-hand side of `val vectors =`.)
val vectors = texts.map(featurize).cache()
val model = KMeans.train(vectors, 10, 20)

// Monitor the stages of this unit of work in the Spark UI:
// http://localhost:4040

// Take a look into those clusters: print each sample tweet under the
// cluster its feature vector is assigned to.
val some_tweets = texts.take(100)
for (i <- 0 until 10) {
  println(s"\nCLUSTER $i:")
  some_tweets.foreach { t =>
    if (model.predict(featurize(t)) == i) {
      println(t)
    }
  }
}

// Persist the cluster centers to disk, so the streaming job can reuse them.
sc.makeRDD(model.clusterCenters, 10).saveAsObjectFile("model")
# Streaming code example: fetch the demo branch of the Spark fork.
# NOTE(review): the clone URL was lost in the scrape — presumed from the
# directory name below; verify against the demo author's GitHub.
git clone https://github.com/aarondav/spark.git
mv spark ~/opt/aarondav-spark
# fixed: must cd into the moved location, not the (now nonexistent) local dir
cd ~/opt/aarondav-spark
git fetch
git checkout demo-latest
git pull origin demo-latest

# Fix the Twitter API keys in the code.
vim examples/src/main/scala/org/apache/spark/examples/streaming/ClusteringDemo.scala

# Change log4j level to WARN to reduce noise during the demo.
mv conf/log4j.properties.template conf/log4j.properties
vim conf/log4j.properties   # change log4j.rootCategory from INFO to WARN

sbt/sbt clean assembly

# Move the trained model to a local path.
mv ~/opt/model .

# Run the streaming demo.
# fixed: spark-submit options such as --class must come BEFORE the application
# jar; everything after the jar is passed to the application as arguments.
./bin/spark-submit \
  --class org.apache.spark.examples.streaming.ClusteringDemo \
  ./examples/target/scala-2.10/spark-examples-1.1.0-SNAPSHOT-hadoop1.0.4.jar \
  model 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment