Skip to content

Instantly share code, notes, and snippets.

@sujathakvr
Created November 9, 2016 16:51
Show Gist options
  • Save sujathakvr/9d90e7aa9d11c588a0f5b6cdd1bdcad9 to your computer and use it in GitHub Desktop.
Save sujathakvr/9d90e7aa9d11c588a0f5b6cdd1bdcad9 to your computer and use it in GitHub Desktop.
package com.ourOffice;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Scanner;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import com.ourOffice.utils.Utils;
import com.ourOffice.model.*;
/* Specs are as follows:
There are around 5K text files. These are divided into 5 folders - each folder containing 1k files.
The program needs to take each text file, consider each word in the file, and calculate the count/frequency of every word.
The output needs to be a .csv file with the columns: word, frequency (count), and count divided by the total number of words in the file.
sample input files( G0001Text , G0002Text) and sample output files are part of the project. */
/**
 * Counts, for each search term, the number of data files that contain the term.
 * One worker task is created per data sub-folder so the folders are processed
 * concurrently; main() then writes one CSV per folder via Utils.generateCsvFile.
 */
public class WordCounter {

    /**
     * Worker task for a single data folder: scans every regular file in the
     * folder and records, per search term, how many files contain that term
     * at least once (document frequency).
     */
    static class ProcessDoc implements Callable<DocFrequencyHashMap> {
        private final File path;               // data folder to scan
        private final DocFrequencyHashMap dhm; // result holder filled by call()
        private final ArrayList<String> arr;   // search terms for this folder
        private final String fileName;         // base name used for the output CSV

        ProcessDoc(File path, DocFrequencyHashMap dhm, ArrayList<String> arr, String fileName) {
            this.path = path;
            this.dhm = dhm;
            this.arr = arr;
            this.fileName = fileName;
        }

        /**
         * Reads each regular file in {@code path}, tokenizes it on whitespace,
         * and increments the counter of every search term present in the file.
         *
         * @return the populated DocFrequencyHashMap (term -> number of files)
         */
        @Override
        public DocFrequencyHashMap call() {
            HashMap<String, Integer> hm = new HashMap<String, Integer>();
            // The search-term set never changes; build it once, not once per file.
            Set<String> termsSet = new HashSet<String>(arr);
            File[] files = path.listFiles();
            if (files != null) { // listFiles() returns null on I/O error or non-directory
                for (File fileEntry : files) {
                    if (!fileEntry.isFile()) {
                        continue;
                    }
                    try (Scanner scanner = new Scanner(fileEntry)) {
                        // Distinct words of this document; a Set makes the
                        // intersection cheap and ignores repeated words.
                        Set<String> documentTermsSet = new HashSet<String>();
                        while (scanner.hasNextLine()) {
                            Collections.addAll(documentTermsSet, scanner.nextLine().split("\\s+"));
                        }
                        // Terms that occur in this document.
                        Set<String> intersectionSet = new HashSet<String>(termsSet);
                        intersectionSet.retainAll(documentTermsSet);
                        for (String str : intersectionSet) {
                            Integer count = hm.get(str);
                            hm.put(str, count == null ? 1 : count + 1);
                        }
                    } catch (FileNotFoundException e) {
                        // Skip unreadable files but keep processing the folder.
                        e.printStackTrace();
                    }
                }
            }
            // Publish the result once, after the whole folder has been scanned
            // (previously these setters ran redundantly on every file iteration).
            dhm.setHm(hm);
            dhm.setFileName(fileName);
            return dhm;
        }
    }

    static ExecutorService executor;

    /**
     * Entry point.
     *
     * @param args [0] folder containing the search-term input files,
     *             [1] root folder whose sub-folders hold the data files,
     *             [2] folder where the output CSV files are written
     * @throws Exception if task execution or result retrieval fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.out.println(
                    "Usage::: java <package name>WordCounter <input files folder location> <data files root folder location> <output file to be stored>");
            // Non-zero status so callers/scripts can detect the usage error.
            System.exit(1);
        }
        File inputFolder = new File(args[0]);
        File folder = new File(args[1]);
        File outputFolder = new File(args[2]);

        // One WordPool (file name + tokenized search terms) per input file.
        ArrayList<WordPool> wordPoolArrayList = new ArrayList<WordPool>();
        createTermsArray(inputFolder, wordPoolArrayList);

        // Build one task per data sub-folder. NOTE(review): this assumes the
        // i-th input file corresponds to the i-th sub-folder in listFiles()
        // order — confirm that holds for the expected folder layout.
        Collection<ProcessDoc> collection = new ArrayList<ProcessDoc>();
        File[] folders = folder.listFiles();
        if (folders == null) { // listFiles() returns null on I/O error or non-directory
            System.out.println("Data folder does not exist or cannot be read: " + folder);
            System.exit(1);
        }
        int i = 0;
        for (File fileEntry1 : folders) {
            // Guard against having fewer input files than data sub-folders.
            if (fileEntry1.isDirectory() && i < wordPoolArrayList.size()) {
                ProcessDoc sum = new ProcessDoc(fileEntry1, new DocFrequencyHashMap(),
                        wordPoolArrayList.get(i).getWordList(), wordPoolArrayList.get(i++).getFileName());
                collection.add(sum);
            }
        }

        long startTime = System.currentTimeMillis();
        // Size the pool by the number of tasks — folders.length also counted
        // plain files, and a size of 0 would make newFixedThreadPool throw.
        executor = Executors.newFixedThreadPool(Math.max(1, collection.size()));
        try {
            List<Future<DocFrequencyHashMap>> list = executor.invokeAll(collection);
            long endTime = System.currentTimeMillis();
            System.out.println("Total number of threads executed::" + collection.size());
            System.out.println("Timing " + (endTime - startTime) + " milliseconds");
            for (Future<DocFrequencyHashMap> fut : list) {
                // Fetch the result once instead of calling get() repeatedly.
                DocFrequencyHashMap result = fut.get();
                // File(parent, child) keeps the path portable (was a hard-coded "\\").
                String t = new File(outputFolder, "output_" + result.getFileName() + ".csv").getPath();
                Utils.generateCsvFile(t, result.getHm(), 1000);
                System.out.println("Output CSV files are generated in :: " + t);
            }
        } finally {
            // Always release the worker threads, even if a task failed,
            // otherwise the non-daemon pool threads keep the JVM alive.
            executor.shutdown();
        }
    }

    /**
     * Recursively parses every .txt file under {@code folder} and appends one
     * WordPool per file (file name without extension plus its
     * whitespace-separated tokens) to {@code wordPoolArrayList}.
     *
     * @param folder folder to be parsed for creating the terms array
     * @param wordPoolArrayList accumulator of WordPool objects, one per file
     */
    public static void createTermsArray(File folder, ArrayList<WordPool> wordPoolArrayList) {
        File[] entries = folder.listFiles();
        if (entries == null) { // non-directory or I/O error
            return;
        }
        for (File fileEntry : entries) {
            if (fileEntry.isDirectory()) {
                createTermsArray(fileEntry, wordPoolArrayList);
            } else if (fileEntry.isFile()) {
                String temp = fileEntry.getName();
                System.out.println("Reading file :: " + temp);
                String extension = Utils.getFileExtension(temp);
                String fileName = Utils.getFileNameWithoutExtension(temp);
                if (extension != null && extension.equalsIgnoreCase("txt")) {
                    WordPool wordPool = new WordPool();
                    try (Scanner scanner = new Scanner(fileEntry)) {
                        ArrayList<String> tokenizedTerms = new ArrayList<String>();
                        while (scanner.hasNextLine()) {
                            Collections.addAll(tokenizedTerms, scanner.nextLine().split("\\s+"));
                        }
                        wordPool.setFileName(fileName);
                        wordPool.setWordList(tokenizedTerms);
                        wordPoolArrayList.add(wordPool);
                    } catch (Exception e) {
                        System.out.println("File is corrupted, unable to read " + temp);
                    }
                } else {
                    // Skip non-text files and keep scanning; the original
                    // 'break' aborted the whole folder on the first non-txt entry.
                    System.out.println("File is not a text file " + temp);
                }
            }
        }
    }
}
@sujathakvr
Copy link
Author

sujathakvr commented Nov 9, 2016

Running the WordCounter
Usage: java WordCounter <input files folder location> <data files root folder location> <output folder location>

Step -1 Attached a sample folder structure

image

Step -2 Sample input/data files

Attached a sample folder for input, output and data files
https://drive.google.com/file/d/0B5eVbwLsgneUbXI3RGpjMFZCVDQ/view?usp=sharing
Extract the above zip file into say C: or any other folder.
Let us say if we extract the contents in C:. You may have the above mentioned image (as in step-1) type of folder structure with input, output and data files.

Step-3 Running the program

Extract the project zip from https://drive.google.com/file/d/0B5eVbwLsgneUUk9WNkh6cUFoRjA/view?usp=sharing
Import the project in Eclipse.
Ensure you have java 7 installed.
Run the program as a java application.
image

Step-4 Output for the program

You can check the output for the program in C:\MyTestingRoot\output

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment