The Java word-count example from https://www.cloudera.com/documentation/enterprise/5-5-x/topics/spark_develop_run.html, rewritten with lambdas and streams, which makes it much shorter. It takes the input file path as args[0] and the minimum occurrence threshold as args[1].
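For a sense of what "much shorter" means, this is roughly how the tokenizing step alone reads in the anonymous-class style the Cloudera page uses. It is only a sketch against the Spark 1.x Java API (where FlatMapFunction returns an Iterable) and additionally needs an import of org.apache.spark.api.java.function.FlatMapFunction; the full lambda version follows.

// Pre-lambda sketch of the tokenizing step (Spark 1.x API); sc and filePath as in the class below.
JavaRDD<String> tokenized = sc.textFile(filePath).flatMap(
        new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String s) {
                // Split each line on spaces and hand the words back to Spark.
                return Arrays.asList(s.split(" "));
            }
        });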
package com.mycompany;

import java.util.Arrays;
import java.util.stream.Collectors;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.*;

import scala.Tuple2;

/**
 * Created by juanmf on 15/07/17.
 */
public class WordCount {

    public static void main(String[] args) {
        // create Spark context with Spark configuration
        try (JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("Spark Count"))) {
            int threshold = Integer.parseInt(args[1]);
            String filePath = args[0];
            JavaPairRDD<String, Integer> filtered = filterWordsUnderThreshold(sc, threshold, filePath);

            // count characters
            JavaPairRDD<Character, Integer> charCounts = filtered
                    .flatMap(s -> s._1.chars()
                            .mapToObj(i -> (char) i)
                            .collect(Collectors.toList()))
                    .mapToPair(c -> new Tuple2<>(c, 1))
                    .reduceByKey((i1, i2) -> i1 + i2);

            System.out.println(charCounts.collect());
        }
    }

    private static JavaPairRDD<String, Integer> filterWordsUnderThreshold(
            JavaSparkContext sc, int threshold, String filePath) {
        // read in text file and split each document into words
        JavaRDD<String> tokenized = sc.textFile(filePath)
                .flatMap(s -> Arrays.asList(s.split(" ")));

        // count the occurrence of each word
        JavaPairRDD<String, Integer> counts = tokenized
                .mapToPair(s -> new Tuple2<>(s, 1))
                .reduceByKey((i1, i2) -> i1 + i2);

        // filter out words with fewer than threshold occurrences
        return counts.filter(tup -> tup._2 >= threshold);
    }
}
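Note that this targets the Spark 1.x Java API the Cloudera page uses, where flatMap lambdas return an Iterable. On Spark 2.x the FlatMapFunction contract changed to return an Iterator, so only the two flatMap calls need adjusting; a minimal sketch, assuming a Spark 2.x dependency:

// Spark 2.x: flatMap lambdas must return an Iterator instead of an Iterable.
JavaRDD<String> tokenized = sc.textFile(filePath)
        .flatMap(s -> Arrays.asList(s.split(" ")).iterator());

JavaPairRDD<Character, Integer> charCounts = filtered
        .flatMap(s -> s._1.chars().mapToObj(i -> (char) i).iterator())
        .mapToPair(c -> new Tuple2<>(c, 1))
        .reduceByKey((i1, i2) -> i1 + i2);

Since the SparkConf only sets an app name, the class is meant to be launched through spark-submit (e.g. with --class com.mycompany.WordCount); for a quick local run you would either pass --master local[*] to spark-submit or call setMaster("local[*]") on the SparkConf.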