Skip to content

Instantly share code, notes, and snippets.

@maccam912
Created November 10, 2015 13:53
Show Gist options
  • Save maccam912/d35e5a15dac30fd306ba to your computer and use it in GitHub Desktop.
Save maccam912/d35e5a15dac30fd306ba to your computer and use it in GitHub Desktop.
/**
* Created by maccam912 on 11/9/15.
*/
import org.apache.spark
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.
/**
 * Spark driver that prints the bigrams (pairs of adjacent words) found in a
 * text file.
 *
 * Approach: every word is tagged with its position via `zipWithIndex`. A second
 * RDD is built that holds the same words rotated left by one position (the
 * first word is moved to the end) and is re-indexed, so that position `i` of
 * the rotated RDD holds the word that follows position `i` of the original.
 * Joining both RDDs on the position therefore yields `(word, nextWord)` pairs.
 */
object BigramatizerApp {

  /**
   * Reads `src/main/resources/text1`, builds the bigrams and prints them to
   * standard output as space-separated tuples, in text order.
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Simple Application").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      val textFile = sc.textFile("src/main/resources/text1")

      // Splitting on a single space produces "" for blank lines and for runs of
      // spaces; drop those so they do not show up as words inside bigrams.
      val words = textFile.flatMap(line => line.split(" ")).filter(_.nonEmpty)
      val wordsWithIndex = words.zipWithIndex()

      // Word frequencies. Only consumed by the commented-out save at the end of
      // this method; as the RDD is lazy, nothing is computed while that stays disabled.
      val counts = words.map(word => (word, 1)).reduceByKey(_ + _)

      // Rotate the word sequence left by one: everything except the first word,
      // followed by the first word, then re-index from zero.
      // NOTE(review): because of the rotation the last word is paired with the
      // first one, i.e. the output ends with a wrap-around (last, first) pair.
      // Kept as in the original; confirm whether that pair is actually wanted.
      val wordsFirst = wordsWithIndex.filter { case (_, idx) => idx == 0 }.keys
      val wordsRestIndex = wordsWithIndex
        .filter { case (_, idx) => idx != 0 }
        .keys
        .++(wordsFirst)
        .zipWithIndex()

      // Key both RDDs by position so they can be joined on it.
      val reverseWords = wordsWithIndex.map(_.swap)
      val reverseWordsRest = wordsRestIndex.map(_.swap)

      // The join loses ordering, so sort by position before dropping the key.
      // The joined values are already (word, nextWord) tuples.
      val bigrams = reverseWords
        .join(reverseWordsRest)
        .sortByKey(ascending = true, numPartitions = reverseWords.partitions.length)
        .values

      println(bigrams.collect().mkString(" "))
      // counts.saveAsTextFile("hdfs://...")
    } finally {
      // Always release the context, even if the job fails.
      sc.stop()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment