package com.ourOffice;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import com.ourOffice.utils.Utils;
import com.ourOffice.model.*;
/* Specs are as follows:
 * There are around 5K text files, divided into 5 folders of roughly 1K files each.
 * The program takes each text file, considers every word in the file and calculates
 * the count/frequency of each word.
 * The output is a .csv file with the columns: word, count (frequency),
 * count / total number of words in the file.
 * Sample input files (G0001Text, G0002Text) and sample output files are part of the project. */
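/*
 * For illustration only (hypothetical numbers, not taken from the sample files):
 * for a file containing 2,000 words in which the word "network" occurs 35 times,
 * the corresponding CSV row would be:
 *
 *     network,35,0.0175
 */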
public class WordCounter {

    static class ProcessDoc implements Callable<DocFrequencyHashMap> {

        private File path;
        private DocFrequencyHashMap dhm;
        private ArrayList<String> arr;
        private String fileName;

        ProcessDoc(File path, DocFrequencyHashMap dhm, ArrayList<String> arr, String fileName) {
            this.path = path;
            this.dhm = dhm;
            this.arr = arr;
            this.fileName = fileName;
        }
        /*
         * Each folder is processed in its own thread. The code below reads
         * every data file in the given folder, tokenizes it, and intersects
         * the file's words with the search terms. For every term that appears
         * in the file, its count in the hashmap is incremented before the
         * next file is considered. (non-Javadoc)
         *
         * @see java.util.concurrent.Callable#call()
         */
        @Override
        public DocFrequencyHashMap call() {
            HashMap<String, Integer> hm = new HashMap<String, Integer>();
            for (File fileEntry : path.listFiles()) {
                if (fileEntry.isFile()) {
                    try (Scanner scanner = new Scanner(fileEntry)) {
                        // read the data file line by line and tokenize on whitespace
                        String strLine;
                        ArrayList<String> arr1 = new ArrayList<String>();
                        while (scanner.hasNextLine()) {
                            strLine = scanner.nextLine();
                            String[] stringArray = strLine.split("\\s+");
                            Collections.addAll(arr1, stringArray);
                        }
                        // intersect the search terms with the file's words;
                        // set intersection is cheaper than a nested loop comparison
                        Set<String> termsSet = new HashSet<String>(arr);
                        Set<String> documentTermsSet = new HashSet<String>(arr1);
                        Set<String> intersectionSet = new HashSet<String>(termsSet);
                        intersectionSet.retainAll(documentTermsSet);
                        // increment the count of every term found in this file
                        for (String str : intersectionSet) {
                            if (hm.containsKey(str)) {
                                hm.put(str, hm.get(str) + 1);
                            } else {
                                hm.put(str, 1);
                            }
                        }
                    } catch (FileNotFoundException e) {
                        e.printStackTrace();
                    }
                }
            }
            dhm.setHm(hm);
            dhm.setFileName(fileName);
            return dhm;
        }
    }
    static ExecutorService executor;

    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.out.println(
                    "Usage::: java com.ourOffice.WordCounter <input files folder> <data files root folder> <output folder>");
            System.exit(0);
        }
        File inputFolder = new File(args[0]);
        File folder = new File(args[1]);
        File outputFolder = new File(args[2]);

        // build the search-term pools from the input files
        ArrayList<WordPool> wordPoolArrayList = new ArrayList<WordPool>();
        createTermsArray(inputFolder, wordPoolArrayList);

        Collection<ProcessDoc> collection = new ArrayList<ProcessDoc>();
        File[] folders = folder.listFiles();
        long startTime = System.currentTimeMillis();
        executor = Executors.newFixedThreadPool(folders.length);
        int i = 0;
        for (File fileEntry1 : folders) {
            if (fileEntry1.isDirectory()) {
                WordPool wordPool = wordPoolArrayList.get(i++);
                ProcessDoc sum = new ProcessDoc(fileEntry1, new DocFrequencyHashMap(),
                        wordPool.getWordList(), wordPool.getFileName());
                collection.add(sum);
            }
        }
        // invokeAll blocks until every folder task has completed
        List<Future<DocFrequencyHashMap>> list = executor.invokeAll(collection);
        long endTime = System.currentTimeMillis();
        System.out.println("Total number of threads executed :: " + folders.length);
        System.out.println("Timing " + (endTime - startTime) + " milliseconds");
        for (Future<DocFrequencyHashMap> fut : list) {
            // output file name can be changed here
            String fileName = fut.get().getFileName();
            String t = outputFolder + File.separator + "output_" + fileName + ".csv";
            Utils.generateCsvFile(t, fut.get().getHm(), 1000);
            System.out.println("Output CSV file generated :: " + t);
        }
        executor.shutdown();
    }
    /*
     * Builds the terms array by parsing the input files in the input folder.
     * Each entry corresponds to one input file and holds the file's name plus
     * the list of whitespace-separated words read from it.
     *
     * @param folder folder to be parsed for creating the terms array
     *
     * @param wordPoolArrayList arraylist of WordPool objects to be filled
     */
    public static void createTermsArray(File folder, ArrayList<WordPool> wordPoolArrayList) {
        for (File fileEntry : folder.listFiles()) {
            if (fileEntry.isDirectory()) {
                createTermsArray(fileEntry, wordPoolArrayList);
            } else if (fileEntry.isFile()) {
                String temp = fileEntry.getName();
                System.out.println("Reading file :: " + temp);
                String extension = Utils.getFileExtension(temp);
                String fileName = Utils.getFileNameWithoutExtension(temp);
                if (extension != null && extension.equalsIgnoreCase("txt")) {
                    WordPool wordPool = new WordPool();
                    try (Scanner scanner = new Scanner(fileEntry)) {
                        // tokenize the input file on whitespace
                        ArrayList<String> tokenizedTerms = new ArrayList<String>();
                        String strLine = null;
                        while (scanner.hasNextLine()) {
                            strLine = scanner.nextLine();
                            String[] stringArray = strLine.split("\\s+");
                            Collections.addAll(tokenizedTerms, stringArray);
                        }
                        wordPool.setFileName(fileName);
                        wordPool.setWordList(tokenizedTerms);
                        wordPoolArrayList.add(wordPool);
                    } catch (Exception e) {
                        System.out.println("File is corrupted, unable to read :: " + temp);
                    }
                } else {
                    // skip non-text files instead of aborting the rest of the folder scan
                    System.out.println("File is not a text file :: " + temp);
                }
            }
        }
    }
}
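The classes DocFrequencyHashMap and WordPool (from com.ourOffice.model) and the Utils helper (from com.ourOffice.utils) are not part of this gist. A minimal sketch of the two model classes, inferred only from how they are used above (the actual project classes may differ), is:

```java
// com/ourOffice/model/DocFrequencyHashMap.java
package com.ourOffice.model;

import java.util.HashMap;

// Holds the per-folder term counts plus the name used to build the output CSV file.
public class DocFrequencyHashMap {
    private HashMap<String, Integer> hm;
    private String fileName;

    public HashMap<String, Integer> getHm() { return hm; }
    public void setHm(HashMap<String, Integer> hm) { this.hm = hm; }
    public String getFileName() { return fileName; }
    public void setFileName(String fileName) { this.fileName = fileName; }
}

// com/ourOffice/model/WordPool.java (separate file, same package)
package com.ourOffice.model;

import java.util.ArrayList;

// Holds the search terms tokenized from one input file.
public class WordPool {
    private ArrayList<String> wordList;
    private String fileName;

    public ArrayList<String> getWordList() { return wordList; }
    public void setWordList(ArrayList<String> wordList) { this.wordList = wordList; }
    public String getFileName() { return fileName; }
    public void setFileName(String fileName) { this.fileName = fileName; }
}
```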
Running the WordCounter
Usage::: `java com.ourOffice.WordCounter <input files folder> <data files root folder> <output folder>`
Step 1 - A sample folder structure is attached.
Step 2 - Sample input/data files
A sample folder containing input, output and data files is attached:
https://drive.google.com/file/d/0B5eVbwLsgneUbXI3RGpjMFZCVDQ/view?usp=sharing
Extract the above zip file into C:\ (or any other folder).
If you extract the contents into C:\, you will get the folder structure shown in Step 1, with input, output and data folders (a sketch of the assumed layout is shown below).
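A rough sketch of the assumed layout (only C:\MyTestingRoot\output is confirmed by Step 4; the other folder names are placeholders for this example):

```
C:\MyTestingRoot
├── input     <- input files folder (first program argument)
├── data      <- data files root folder, one sub-folder per worker thread (second argument)
│   ├── folder1
│   ├── ...
│   └── folder5
└── output    <- generated CSV files (third program argument)
```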
Step 3 - Running the program
Extract the project zip from https://drive.google.com/file/d/0B5eVbwLsgneUUk9WNkh6cUFoRjA/view?usp=sharing
![image](https://cloud.githubusercontent.com/assets/9601293/20147688/251d52ec-a677-11e6-84b3-c6a013668d39.png)
Import the project in Eclipse.
Ensure you have Java 7 installed.
Run the program as a Java application, passing the three folder locations as program arguments (an example run command is shown below).
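In Eclipse, supply the three folder locations as Program arguments in the Run Configuration. From the command line, an equivalent invocation would look roughly like this (a sketch that assumes the compiled classes are in the project's `bin` folder and the placeholder folder names from Step 2):

```
java -cp bin com.ourOffice.WordCounter C:\MyTestingRoot\input C:\MyTestingRoot\data C:\MyTestingRoot\output
```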
Step 4 - Output of the program
You can check the program's output in C:\MyTestingRoot\output.
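The output file names follow the pattern used in the code, `output_<input file name>.csv`; for example, an input file named G0001.txt would (hypothetically) produce C:\MyTestingRoot\output\output_G0001.csv.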