dhesse/readMongo.scala

## readMongo.scala
import collection.JavaConversions._

import org.apache.hadoop.conf.Configuration
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD

import org.bson.BSONObject
import org.bson.types.BasicBSONList
import java.io._
import com.mongodb.hadoop.{
  MongoInputFormat, MongoOutputFormat,
  BSONFileInputFormat, BSONFileOutputFormat}
import com.mongodb.hadoop.io.MongoUpdateWritable

// Hadoop configuration handed to the mongo-hadoop connector. The input URI
// names a MongoDB instance on localhost:27017 and the source
// `twitter.tweets` (presumably database `twitter`, collection `tweets`).
val mongoConfig = new Configuration()
mongoConfig.set("mongo.input.uri", "mongodb://localhost:27017/twitter.tweets")

// RDD of (key, document) pairs produced by MongoInputFormat; the values are
// the BSON documents that the rest of the script works on.
val documents = sc.newAPIHadoopRDD(
  conf = mongoConfig,
  fClass = classOf[MongoInputFormat],
  kClass = classOf[Object],
  vClass = classOf[BSONObject])

// First five (key, document) pairs, fetched to the driver. Nothing later in
// this script reads `head`; it looks like an interactive sanity check.
val head = documents.take(5)

// The 15 most frequent hashtags across all documents, as (hashtag, count)
// pairs sorted by descending count. Hashtags are lower-cased before counting
// and the tag "datascience" is excluded.
//
// Each step uses `collect` with a type pattern instead of `map` with a
// partial-function literal: a document whose `entities`, `hashtags` or `text`
// field is missing, null, or of an unexpected type is skipped rather than
// failing the whole job with a scala.MatchError.
//
// `countByValue()` returns the counts to the driver as a local Map, so the
// final sort and `take` run locally. The trailing-dot chaining is kept so
// the expression can still be entered line by line in the Spark shell.
val topTags = documents.map(_._2).
  map(_.get("entities")).
  collect { case entities: BSONObject => entities.get("hashtags") }.
  collect { case hashtags: BasicBSONList => hashtags }.
  flatMap(_.toArray().toList).
  collect { case hashtag: BSONObject => hashtag.get("text") }.
  collect { case text: String => text.toLowerCase() }.
  filter(_ != "datascience").
  countByValue().
  toList.
  sortBy(- _._2).
  take(15)

// Write the hashtag counts to topTags.csv in the working directory: a header
// row followed by one "hashtag,count" line per entry of `topTags`.
val file = new File("topTags.csv")
val writer = new BufferedWriter(new FileWriter(file))
try {
  writer.write("Hashtag,Use Count\n")
  for ((tag, count) <- topTags) writer.write(s"$tag,$count\n")
} finally {
  // Always release the file handle (and flush buffered rows), even when one
  // of the writes above throws.
  writer.close()
}
	import collection.JavaConversions._

	import org.apache.hadoop.conf.Configuration
	import org.apache.spark.{SparkContext, SparkConf}
	import org.apache.spark.rdd.RDD

	import org.bson.BSONObject
	import org.bson.types.BasicBSONList
	import java.io._
	import com.mongodb.hadoop.{
	MongoInputFormat, MongoOutputFormat,
	BSONFileInputFormat, BSONFileOutputFormat}
	import com.mongodb.hadoop.io.MongoUpdateWritable

	// Hadoop configuration handed to the mongo-hadoop connector. The input URI
	// names a MongoDB instance on localhost:27017 and the source
	// `twitter.tweets` (presumably database `twitter`, collection `tweets`).
	val mongoConfig = new Configuration()
	mongoConfig.set("mongo.input.uri", "mongodb://localhost:27017/twitter.tweets")

	// RDD of (key, document) pairs produced by MongoInputFormat; the values are
	// the BSON documents that the rest of the script works on.
	val documents = sc.newAPIHadoopRDD(
		conf = mongoConfig,
		fClass = classOf[MongoInputFormat],
		kClass = classOf[Object],
		vClass = classOf[BSONObject])

	// First five (key, document) pairs, fetched to the driver. Nothing later in
	// this script reads `head`; it looks like an interactive sanity check.
	val head = documents.take(5)

	// The 15 most frequent hashtags across all documents, as (hashtag, count)
	// pairs sorted by descending count. Hashtags are lower-cased before counting
	// and the tag "datascience" is excluded.
	//
	// Each step uses `collect` with a type pattern instead of `map` with a
	// partial-function literal: a document whose `entities`, `hashtags` or `text`
	// field is missing, null, or of an unexpected type is skipped rather than
	// failing the whole job with a scala.MatchError.
	//
	// `countByValue()` returns the counts to the driver as a local Map, so the
	// final sort and `take` run locally. The trailing-dot chaining is kept so
	// the expression can still be entered line by line in the Spark shell.
	val topTags = documents.map(_._2).
		map(_.get("entities")).
		collect { case entities: BSONObject => entities.get("hashtags") }.
		collect { case hashtags: BasicBSONList => hashtags }.
		flatMap(_.toArray().toList).
		collect { case hashtag: BSONObject => hashtag.get("text") }.
		collect { case text: String => text.toLowerCase() }.
		filter(_ != "datascience").
		countByValue().
		toList.
		sortBy(- _._2).
		take(15)

	// Write the hashtag counts to topTags.csv in the working directory: a header
	// row followed by one "hashtag,count" line per entry of `topTags`.
	val file = new File("topTags.csv")
	val writer = new BufferedWriter(new FileWriter(file))
	try {
		writer.write("Hashtag,Use Count\n")
		for ((tag, count) <- topTags) writer.write(s"$tag,$count\n")
	} finally {
		// Always release the file handle (and flush buffered rows), even when one
		// of the writes above throws.
		writer.close()
	}