package com.ourOffice;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import com.ourOffice.utils.Utils;
import com.ourOffice.model.*;
/* Specs are as follows:
 * There are around 5K text files, divided into 5 folders of roughly 1K files each.
 * The program takes each text file, considers every word in the file and calculates
 * the count/frequency of each word.
 * The output is a .csv file with the columns: word, count (frequency),
 * count / total number of words in the file.
 * Sample input files (G0001Text, G0002Text) and sample output files are part of the project. */
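/*
 * For illustration only (hypothetical numbers, not taken from the sample files):
 * for a file containing 2,000 words in which the word "network" occurs 35 times,
 * the corresponding CSV row would be:
 *
 *     network,35,0.0175
 */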
public class WordCounter {

    static class ProcessDoc implements Callable<DocFrequencyHashMap> {

        private File path;
        private DocFrequencyHashMap dhm;
        private ArrayList<String> arr;
        private String fileName;

        ProcessDoc(File path, DocFrequencyHashMap dhm, ArrayList<String> arr, String fileName) {
            this.path = path;
            this.dhm = dhm;
            this.arr = arr;
            this.fileName = fileName;
        }
        /*
         * Each folder is processed in its own thread. The code below reads
         * every data file in the given folder, tokenizes it, and intersects
         * the file's words with the search terms. For every term that appears
         * in the file, its count in the hashmap is incremented before the
         * next file is considered. (non-Javadoc)
         *
         * @see java.util.concurrent.Callable#call()
         */
        @Override
        public DocFrequencyHashMap call() {
            HashMap<String, Integer> hm = new HashMap<String, Integer>();
            for (File fileEntry : path.listFiles()) {
                if (fileEntry.isFile()) {
                    try (Scanner scanner = new Scanner(fileEntry)) {
                        // read the data file line by line and tokenize on whitespace
                        String strLine;
                        ArrayList<String> arr1 = new ArrayList<String>();
                        while (scanner.hasNextLine()) {
                            strLine = scanner.nextLine();
                            String[] stringArray = strLine.split("\\s+");
                            Collections.addAll(arr1, stringArray);
                        }
                        // intersect the search terms with the file's words;
                        // set intersection is cheaper than a nested loop comparison
                        Set<String> termsSet = new HashSet<String>(arr);
                        Set<String> documentTermsSet = new HashSet<String>(arr1);
                        Set<String> intersectionSet = new HashSet<String>(termsSet);
                        intersectionSet.retainAll(documentTermsSet);
                        // increment the count of every term found in this file
                        for (String str : intersectionSet) {
                            if (hm.containsKey(str)) {
                                hm.put(str, hm.get(str) + 1);
                            } else {
                                hm.put(str, 1);
                            }
                        }
                    } catch (FileNotFoundException e) {
                        e.printStackTrace();
                    }
                }
            }
            dhm.setHm(hm);
            dhm.setFileName(fileName);
            return dhm;
        }
    }
    static ExecutorService executor;

    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.out.println(
                    "Usage::: java com.ourOffice.WordCounter <input files folder> <data files root folder> <output folder>");
            System.exit(0);
        }
        File inputFolder = new File(args[0]);
        File folder = new File(args[1]);
        File outputFolder = new File(args[2]);

        // build the search-term pools from the input files
        ArrayList<WordPool> wordPoolArrayList = new ArrayList<WordPool>();
        createTermsArray(inputFolder, wordPoolArrayList);

        Collection<ProcessDoc> collection = new ArrayList<ProcessDoc>();
        File[] folders = folder.listFiles();
        long startTime = System.currentTimeMillis();
        executor = Executors.newFixedThreadPool(folders.length);
        int i = 0;
        for (File fileEntry1 : folders) {
            if (fileEntry1.isDirectory()) {
                WordPool wordPool = wordPoolArrayList.get(i++);
                ProcessDoc sum = new ProcessDoc(fileEntry1, new DocFrequencyHashMap(),
                        wordPool.getWordList(), wordPool.getFileName());
                collection.add(sum);
            }
        }
        // invokeAll blocks until every folder task has completed
        List<Future<DocFrequencyHashMap>> list = executor.invokeAll(collection);
        long endTime = System.currentTimeMillis();
        System.out.println("Total number of threads executed :: " + folders.length);
        System.out.println("Timing " + (endTime - startTime) + " milliseconds");
        for (Future<DocFrequencyHashMap> fut : list) {
            // output file name can be changed here
            String fileName = fut.get().getFileName();
            String t = outputFolder + File.separator + "output_" + fileName + ".csv";
            Utils.generateCsvFile(t, fut.get().getHm(), 1000);
            System.out.println("Output CSV file generated :: " + t);
        }
        executor.shutdown();
    }
    /*
     * Builds the terms array by parsing the input files in the input folder.
     * Each entry corresponds to one input file and holds the file's name plus
     * the list of whitespace-separated words read from it.
     *
     * @param folder folder to be parsed for creating the terms array
     *
     * @param wordPoolArrayList arraylist of WordPool objects to be filled
     */
    public static void createTermsArray(File folder, ArrayList<WordPool> wordPoolArrayList) {
        for (File fileEntry : folder.listFiles()) {
            if (fileEntry.isDirectory()) {
                createTermsArray(fileEntry, wordPoolArrayList);
            } else if (fileEntry.isFile()) {
                String temp = fileEntry.getName();
                System.out.println("Reading file :: " + temp);
                String extension = Utils.getFileExtension(temp);
                String fileName = Utils.getFileNameWithoutExtension(temp);
                if (extension != null && extension.equalsIgnoreCase("txt")) {
                    WordPool wordPool = new WordPool();
                    try (Scanner scanner = new Scanner(fileEntry)) {
                        // tokenize the input file on whitespace
                        ArrayList<String> tokenizedTerms = new ArrayList<String>();
                        String strLine = null;
                        while (scanner.hasNextLine()) {
                            strLine = scanner.nextLine();
                            String[] stringArray = strLine.split("\\s+");
                            Collections.addAll(tokenizedTerms, stringArray);
                        }
                        wordPool.setFileName(fileName);
                        wordPool.setWordList(tokenizedTerms);
                        wordPoolArrayList.add(wordPool);
                    } catch (Exception e) {
                        System.out.println("File is corrupted, unable to read :: " + temp);
                    }
                } else {
                    // skip non-text files instead of aborting the rest of the folder scan
                    System.out.println("File is not a text file :: " + temp);
                }
            }
        }
    }
}
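The classes DocFrequencyHashMap and WordPool (from com.ourOffice.model) and the Utils helper (from com.ourOffice.utils) are not part of this gist. A minimal sketch of the two model classes, inferred only from how they are used above (the actual project classes may differ), is:

```java
// com/ourOffice/model/DocFrequencyHashMap.java
package com.ourOffice.model;

import java.util.HashMap;

// Holds the per-folder term counts plus the name used to build the output CSV file.
public class DocFrequencyHashMap {
    private HashMap<String, Integer> hm;
    private String fileName;

    public HashMap<String, Integer> getHm() { return hm; }
    public void setHm(HashMap<String, Integer> hm) { this.hm = hm; }
    public String getFileName() { return fileName; }
    public void setFileName(String fileName) { this.fileName = fileName; }
}

// com/ourOffice/model/WordPool.java (separate file, same package)
package com.ourOffice.model;

import java.util.ArrayList;

// Holds the search terms tokenized from one input file.
public class WordPool {
    private ArrayList<String> wordList;
    private String fileName;

    public ArrayList<String> getWordList() { return wordList; }
    public void setWordList(ArrayList<String> wordList) { this.wordList = wordList; }
    public String getFileName() { return fileName; }
    public void setFileName(String fileName) { this.fileName = fileName; }
}
```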
Running the WordCounter
Usage::: `java com.ourOffice.WordCounter <input files folder> <data files root folder> <output folder>`
Step 1 - A sample folder structure is attached.
Step 2 - Sample input/data files
A sample folder containing input, output and data files is attached:
https://drive.google.com/file/d/0B5eVbwLsgneUbXI3RGpjMFZCVDQ/view?usp=sharing
Extract the above zip file into C:\ (or any other folder).
If you extract the contents into C:\, you will get the folder structure shown in Step 1, with input, output and data folders (a sketch of the assumed layout is shown below).
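A rough sketch of the assumed layout (only C:\MyTestingRoot\output is confirmed by Step 4; the other folder names are placeholders for this example):

```
C:\MyTestingRoot
├── input     <- input files folder (first program argument)
├── data      <- data files root folder, one sub-folder per worker thread (second argument)
│   ├── folder1
│   ├── ...
│   └── folder5
└── output    <- generated CSV files (third program argument)
```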
Step 3 - Running the program
Extract the project zip from https://drive.google.com/file/d/0B5eVbwLsgneUUk9WNkh6cUFoRjA/view?usp=sharing
![image](https://cloud.githubusercontent.com/assets/9601293/20147688/251d52ec-a677-11e6-84b3-c6a013668d39.png)
Import the project in Eclipse.
Ensure you have Java 7 installed.
Run the program as a Java application, passing the three folder locations as program arguments (an example run command is shown below).
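In Eclipse, supply the three folder locations as Program arguments in the Run Configuration. From the command line, an equivalent invocation would look roughly like this (a sketch that assumes the compiled classes are in the project's `bin` folder and the placeholder folder names from Step 2):

```
java -cp bin com.ourOffice.WordCounter C:\MyTestingRoot\input C:\MyTestingRoot\data C:\MyTestingRoot\output
```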
Step 4 - Output of the program
You can check the program's output in C:\MyTestingRoot\output.
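The output file names follow the pattern used in the code, `output_<input file name>.csv`; for example, an input file named G0001.txt would (hypothetically) produce C:\MyTestingRoot\output\output_G0001.csv.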