Skip to content

Instantly share code, notes, and snippets.

@dhesse
Created April 10, 2016 14:26
Show Gist options
  • Save dhesse/ef40c6d0f4c10188ce057860d40eb5e4 to your computer and use it in GitHub Desktop.
Save dhesse/ef40c6d0f4c10188ce057860d40eb5e4 to your computer and use it in GitHub Desktop.
import collection.JavaConversions._
import org.apache.hadoop.conf.Configuration
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD
import org.bson.BSONObject
import org.bson.types.BasicBSONList
import java.io._
import com.mongodb.hadoop.{
MongoInputFormat, MongoOutputFormat,
BSONFileInputFormat, BSONFileOutputFormat}
import com.mongodb.hadoop.io.MongoUpdateWritable
// Hadoop configuration handed to the MongoDB input format; the only setting
// applied here is the input URI (database `twitter`, collection `tweets`
// on a local mongod, per the URI below).
val mongoConfig = {
  val conf = new Configuration()
  conf.set("mongo.input.uri", "mongodb://localhost:27017/twitter.tweets")
  conf
}
// Expose the collection configured in `mongoConfig` as an RDD of
// (key, BSON document) pairs, read through the new-API Hadoop input format.
val documents: RDD[(Object, BSONObject)] =
  sc.newAPIHadoopRDD(
    conf = mongoConfig,
    fClass = classOf[MongoInputFormat],
    kClass = classOf[Object],
    vClass = classOf[BSONObject])

// Fetch up to five documents for a quick look at their structure.
// `head` is not referenced by the rest of this script.
val head = documents.take(5)
// The 15 most frequent lower-cased hashtags across all documents, excluding
// "datascience", as a List of (hashtag, use count) sorted by descending count.
//
// Every extraction step uses `collect` with a partial function (or a fallback
// case) instead of `map` with a partial function: a document whose `entities`,
// `hashtags` or `text` field is missing, null or of an unexpected type is
// skipped rather than aborting the whole job with a scala.MatchError.
// The hashtag list is iterated directly through `toArray()`, so this block no
// longer relies on the deprecated implicit JavaConversions.
// `countByValue()` brings the per-tag counts back to the driver, so the
// sorting and truncation below run on a local collection.
val topTags = documents.
  map { case (_, tweet) => tweet.get("entities") }.
  collect { case entities: BSONObject => entities.get("hashtags") }.
  flatMap {
    case tags: BasicBSONList => tags.toArray().toSeq
    case _ => Seq.empty[AnyRef]
  }.
  collect { case tag: BSONObject => tag.get("text") }.
  collect { case text: String => text.toLowerCase() }.
  filter(_ != "datascience").
  countByValue().
  toList.
  sortBy { case (_, uses) => -uses }.
  take(15)
// Write the ranking to topTags.csv (relative to the working directory) as a
// two-column CSV: a header row, then one "hashtag,count" row per entry.
val file = new File("topTags.csv")
// Encode explicitly as UTF-8: FileWriter would use the platform default
// charset, which can mangle hashtags containing non-ASCII characters.
val writer = new BufferedWriter(
  new OutputStreamWriter(new FileOutputStream(file), "UTF-8"))
try {
  writer.write("Hashtag,Use Count\n")
  for ((tag, uses) <- topTags) writer.write(s"$tag,$uses\n")
} finally {
  // Always release the file handle, even if one of the writes above fails.
  writer.close()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment