Last active
August 29, 2015 14:27
-
-
Save sujathakvr/3314036a32916d5cb420 to your computer and use it in GitHub Desktop.
Counting the Frequency of words in large number of text files using Threads in java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Create the following folders | |
C:\input | |
C:\MyTestingRoot | |
C:\output | |
The folder names can vary; they can be passed as arguments accordingly when running the program. | |
The input folder contains files like (for e.g) | |
1.txt which contains Search Home New Image Ask Question Fact | |
2.txt which contains Fact | |
3.txt which contains asteroid sourc signific perturb planet among brightest main belt Fact | |
The data folder which needs to be scanned based on the above txt files will be available in C:\MyTestingRoot | |
The output generated will be in C:\output. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.ourOffice; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.util.HashMap; | |
import java.util.Iterator; | |
import java.util.Map; | |
public class CSVGenerator {

    /**
     * Writes {@code hMap} to {@code fileName} as a CSV with columns: word,
     * count, and count divided by {@code totalWordsCounter}. Any existing
     * file at that path is overwritten.
     *
     * @param fileName          path of the CSV file to create
     * @param hMap              word -&gt; occurrence-count map
     * @param totalWordsCounter divisor for the third column
     */
    static void generateCsvFile(String fileName, HashMap<String, Integer> hMap, int totalWordsCounter) {
        // try-with-resources: the original leaked the FileWriter when an
        // exception was thrown between open and close().
        try (FileWriter writer = new FileWriter(fileName, false)) {
            writer.append("Word");
            writer.append(',');
            writer.append("Count");
            writer.append(',');
            writer.append("Count/Total Number of words");
            writer.append('\n');
            // Typed entrySet iteration replaces the raw Iterator + casts.
            for (Map.Entry<String, Integer> entry : hMap.entrySet()) {
                double ratio = entry.getValue() / (double) totalWordsCounter;
                writer.append(entry.getKey());
                writer.append(',');
                writer.append(entry.getValue().toString());
                writer.append(',');
                // If "count/total" literal output is needed instead, use:
                // writer.append(entry.getValue() + "/" + totalWordsCounter);
                writer.append(Double.toString(ratio));
                writer.append('\n');
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.ourOffice; | |
import java.io.BufferedReader; | |
import java.io.DataInputStream; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.FileNotFoundException; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.util.*; | |
import java.util.concurrent.*; | |
// Holds the name of one input word-pool file together with the list of
// search words read from it.
class WordPool {

    private ArrayList<String> wordList; // search terms read from the file
    private String fileName;            // base name of the source file

    /** Returns the source file's name. */
    public String getFileName() {
        return fileName;
    }

    /** Records the source file's name. */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /** Returns the search-word list. */
    public ArrayList<String> getWordList() {
        return wordList;
    }

    /** Records the search-word list. */
    public void setWordList(ArrayList<String> wordList) {
        this.wordList = wordList;
    }
}
// Associates an input file's name with the map of words to their document
// frequency (how many files in the scanned folder contain each word).
class DocFrequencyHashMap {

    private HashMap<String, Integer> hm; // word -> document frequency
    private String fileName;             // input file this result belongs to

    /** Returns the name of the input file this result belongs to. */
    public String getFileName() {
        return fileName;
    }

    /** Records the name of the input file this result belongs to. */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    /** Returns the word -> document-frequency map. */
    public HashMap<String, Integer> getHm() {
        return hm;
    }

    /** Records the word -> document-frequency map. */
    public void setHm(HashMap<String, Integer> hm) {
        this.hm = hm;
    }
}
public class ProcessDocuments { | |
static class ProcessDoc implements Callable<DocFrequencyHashMap> { | |
private File path; | |
private DocFrequencyHashMap dhm; | |
private ArrayList<String> arr; | |
private String fileName; | |
ProcessDoc(File path, DocFrequencyHashMap dhm, ArrayList<String> arr, | |
String fileName) { | |
this.path = path; | |
this.dhm = dhm; | |
this.arr = arr; | |
this.fileName = fileName; | |
} | |
/* | |
* Each folder is executed in a different thread. The code below reads | |
* each data file in the specified folder. As long as there is a match, | |
* the search string is searched inside the data file set. Once | |
* a match is found, the hashmap is updated with the value and next | |
* search string is used for consideration. (non-Javadoc) | |
* | |
* @see java.util.concurrent.Callable#call() | |
*/ | |
@Override | |
public DocFrequencyHashMap call() { | |
HashMap<String, Integer> hm = new HashMap<String, Integer>(); | |
for (File fileEntry : path.listFiles()) { | |
// if it is a file | |
if (fileEntry.isFile()) { | |
try (FileInputStream fstream = new FileInputStream( | |
fileEntry); | |
DataInputStream in = new DataInputStream(fstream); | |
BufferedReader br = new BufferedReader( | |
new InputStreamReader(in));) { | |
// reads a file | |
String strLine; | |
// tokenizes based on spaces | |
ArrayList<String> arr1 = new ArrayList<String>(); | |
while ((strLine = br.readLine()) != null) { | |
StringTokenizer st = new StringTokenizer(strLine, | |
" "); | |
while (st.hasMoreTokens()) { | |
arr1.add(st.nextToken().trim().toLowerCase()); | |
} | |
} | |
// Earlier way of doing the comparison | |
/* for (String searchWord : arr) { | |
for (String testingWord : arr1) { | |
if (searchWord.equalsIgnoreCase(testingWord)) { | |
if (hm.containsKey(searchWord.toLowerCase())) { | |
hm.put(searchWord.toLowerCase(), | |
hm.get(searchWord.toLowerCase()) + 1); | |
} else { | |
hm.put(searchWord.toLowerCase(), 1); | |
} | |
break; | |
} | |
} | |
}*/ | |
// More efficient way of doing comparison | |
Set<String> termsSet = new HashSet<String>(arr); | |
Set<String> documentTermsSet = new HashSet<String>(arr1); // | |
Set<String> intersectionSet = new HashSet<String>( | |
termsSet); | |
intersectionSet.retainAll(documentTermsSet); // | |
if (!intersectionSet.isEmpty()) { | |
for (String str : intersectionSet) { | |
if (hm.containsKey(str)) { | |
hm.put(str,hm.get(str) + 1); | |
} else { | |
hm.put(str, 1); | |
} | |
} | |
} | |
dhm.setHm(hm); | |
dhm.setFileName(fileName); | |
} catch (FileNotFoundException e) | |
{ | |
e.printStackTrace(); | |
} | |
catch (IOException e) | |
{ | |
e.printStackTrace(); | |
} | |
} | |
} | |
return dhm; | |
} | |
} | |
static ExecutorService executor; | |
public static void main(String[] args) throws Exception { | |
if (args.length < 3) { | |
System.out | |
.println("Usage::: java <package name>ProcessDocuments <input files folder location> <data files root folder location> <output file to be stored>"); | |
System.exit(0); | |
} | |
File inputFolder = new File(args[0]); | |
File folder = new File(args[1]); | |
File outputFolder = new File(args[2]); | |
// get input | |
ArrayList<WordPool> wordPoolArrayList = createTermsArray(inputFolder); | |
Collection<ProcessDoc> collection = new ArrayList<ProcessDoc>(); | |
File[] folders = folder.listFiles(); | |
long startTime = System.currentTimeMillis(); | |
executor = Executors.newFixedThreadPool(folders.length); | |
int i = 0; | |
for (File fileEntry1 : folders) { | |
if (fileEntry1.isDirectory()) { | |
for (File fileEntry : fileEntry1.listFiles()) { | |
if (fileEntry.isDirectory() | |
&& fileEntry.getName().equalsIgnoreCase("text")) { | |
ProcessDoc sum = new ProcessDoc(fileEntry, | |
new DocFrequencyHashMap(), wordPoolArrayList.get(i) | |
.getWordList(), wordPoolArrayList.get(i++) | |
.getFileName()); | |
collection.add(sum); | |
} | |
} | |
} | |
} | |
List<Future<DocFrequencyHashMap>> list = executor.invokeAll(collection); | |
i = 0; | |
long endTime = System.currentTimeMillis(); | |
System.out.println("Total number of threads executed::" | |
+ folders.length); | |
System.out.println("Timing " + (endTime - startTime) + " milliseconds"); | |
for (Future<DocFrequencyHashMap> fut : list) { | |
// System.out.println(fut.isDone()); | |
// System.out.println(fut.get()); | |
// output file name can be changed here. | |
String fileName = fut.get().getFileName(); | |
String t = outputFolder + "\\output_" + fileName + ".csv"; | |
generateCsvFile(t, fut.get().getHm(), 1000); | |
System.out.println("Output CSV files are generated in :: " + t); | |
} | |
executor.shutdown(); | |
} | |
/* | |
* Terms Array is created by parsing the input files in the input folder. | |
* Each index corresponds to each file and the value is a String array of | |
* the data in the file. | |
*/ | |
public static ArrayList<WordPool> createTermsArray(File folder) { | |
ArrayList<WordPool> wordPoolArrayList = new ArrayList<WordPool>(); | |
for (File fileEntry : folder.listFiles()) { | |
if (fileEntry.isDirectory()) { | |
createTermsArray(fileEntry); | |
} else { | |
// if it is a file | |
if (fileEntry.isFile()) { | |
WordPool wordPool = new WordPool(); | |
// checks if it is a txt file | |
try (FileInputStream fstream = new FileInputStream( | |
fileEntry); | |
DataInputStream in = new DataInputStream(fstream); | |
BufferedReader br = new BufferedReader( | |
new InputStreamReader(in));) { | |
ArrayList<String> tokenizedTerms = new ArrayList<String>(); | |
String strLine = null; | |
while ((strLine = br.readLine()) != null) | |
tokenizedTerms.add(strLine.trim().toLowerCase()); | |
String fileName = fileEntry.getName(); | |
wordPool.setFileName(fileEntry.getName().trim() | |
.substring(0, fileName.length() - 4)); | |
wordPool.setWordList(tokenizedTerms); | |
wordPoolArrayList.add(wordPool); | |
} catch (Exception e) { | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
} | |
} | |
} | |
return wordPoolArrayList; | |
} | |
/** | |
* @param fileName | |
* @param hMap | |
* @param totalWordsCounter | |
*/ | |
static void generateCsvFile(String fileName, HashMap<String, Integer> hMap, | |
int totalWordsCounter) { | |
try (FileWriter writer = new FileWriter(fileName, false);) { | |
writer.append("Word"); | |
writer.append(','); | |
writer.append("Doc Frequency"); | |
writer.append(','); | |
writer.append("Doc Frequency/1000"); | |
writer.append('\n'); | |
for (String key : hMap.keySet()) { | |
String value = hMap.get(key).toString(); | |
double valueDouble = Double.parseDouble(value); | |
double totalWordsCounterDouble = totalWordsCounter; | |
writer.append(key); | |
writer.append(','); | |
writer.append(value); | |
writer.append(','); | |
// If number of occurrences / Total Words is needed, uncomment | |
// the next line and comment the one after next line. | |
// writer.append(value + "/" + totalWordsCounter); | |
writer.append(Double.valueOf( | |
valueDouble / totalWordsCounterDouble).toString()); | |
writer.append('\n'); | |
} | |
writer.flush(); | |
writer.close(); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* | |
*/ | |
package com.ourOffice; | |
import java.io.BufferedReader; | |
import java.io.DataInputStream; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.FileNotFoundException; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.util.HashMap; | |
import java.util.StringTokenizer; | |
/** | |
* @author | |
* | |
*/ | |
public class ReadFilesFromFolder { | |
private static String temp = ""; | |
public static void listFilesForFolder(File folder) { | |
HashMap<String, Integer> map = null; | |
for (File fileEntry : folder.listFiles()) { | |
if (fileEntry.isDirectory()) { | |
// System.out.println("Reading files under the folder "+folder.getAbsolutePath()); | |
listFilesForFolder(fileEntry); | |
} else { | |
//if it is a file | |
if (fileEntry.isFile()) { | |
map = new HashMap<>(); | |
temp = fileEntry.getName(); | |
//checks if it is a txt file | |
String fileName = temp.substring(0, temp.length()-4); | |
if ((temp.substring(temp.lastIndexOf('.') + 1, | |
temp.length()).toLowerCase()).equals("txt")) { | |
System.out.println("File= " + folder.getAbsolutePath() | |
+ "\\" + fileEntry.getName()); | |
try(FileInputStream fstream = new FileInputStream( | |
fileEntry); | |
DataInputStream in = new DataInputStream(fstream); | |
BufferedReader br = new BufferedReader( | |
new InputStreamReader(in));) { | |
//reads a file | |
String strLine; | |
int totalNumberOfWords = 0; | |
while ((strLine = br.readLine()) != null) { | |
//tokenizes based on spaces | |
StringTokenizer st = new StringTokenizer(strLine, " "); | |
while (st.hasMoreTokens()) { | |
String word = st.nextToken(); | |
//adds the word to hashmap and updates the counter | |
totalNumberOfWords++; | |
if (map.containsKey(word.toLowerCase())) { | |
map.put(word.toLowerCase(), map.get(word.toLowerCase()) + 1); | |
} else { | |
map.put(word.toLowerCase(), 1); | |
} | |
} | |
} | |
//file to be generated in this path. | |
//System.out.println("ABS::"+folder.getAbsolutePath()); | |
String t = folder.getAbsolutePath()+"\\"+fileName+".csv"; | |
CSVGenerator.generateCsvFile(t,map,totalNumberOfWords); | |
} catch (FileNotFoundException e) | |
{ | |
e.printStackTrace(); | |
} | |
catch (IOException e) | |
{ | |
e.printStackTrace(); | |
} | |
} | |
} | |
} | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
The specs for the problem are as follows : | |
1. There are 11 .txt input files which will contain list of words. | |
2. Output will be 11 .csv files with columns word , document frequency , doc frequency/1000 | |
3. We will need to search for each word in the 1000 text files (documents) and calculate the document frequency, i.e., in how many documents the word occurs. | |
eg - input file may look like | |
planet | |
moon | |
sun | |
eg - output file may look like | |
word doc freq doc frequency/1000 | |
================================ | |
planet 10 10/1000 | |
moon 5 5/1000 | |
sun 1 1/1000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* | |
*/ | |
package com.ourOffice; | |
import java.io.File; | |
/** | |
* @author | |
* | |
*/ | |
public class WordCounter { | |
public static File folder = new File("C:\\MyTestingRoot"); | |
/** | |
* @param args | |
*/ | |
public static void main(String[] args) { | |
// TODO Auto-generated method stub | |
System.out.println("Reading files under the folder "+ folder.getAbsolutePath()); | |
ReadFilesFromFolder.listFilesForFolder(folder); | |
//ReadFilesFromFolder.listFilesForFolder(new File(args[0])); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment